String Matching
Jump to navigation
Jump to search
#include<stdio.h>
#include<algorithm>
#include<string.h>
#include<math.h>
#include<time.h>
using namespace std;
char text[50000005],word[1000005];
int f[1000005];
int suff[1000005];
int tran[10005][300];
int BC[256],GS[1000005],HS[256];
int n,m;
void BF()
{
int a,b,out=0;
printf("Brute Force\n");
clock_t t=clock();
for(a=0;a<=n-m;a++)
{
for(b=0;b<m;b++)
if(text[a+b]!=word[b]) break;
if(b==m)
out++;
}printf("%d words\n",out);
t=clock()-t;
printf("Time : %d ms\n",(int)t);
}
void RK()
{
int a,b,c=0,p,q,out=0;
int sum=0,k=2173913;
printf("Rabin Karp\n");
clock_t t=clock();
for(a=0,b=0;a<m;a++)
{
sum*=256; sum+=word[a]; sum%=k;
b*=256; b+=text[a]; b%=k;
c*=256; c%=k;
if(!c) c=1;
}
for(a=1;a<=n-m;a++)
{
if(b==sum)
{
for(p=a-1,q=0;q<m;q++)
if(text[p+q]!=word[q]) break;
if(q==m)
out++;
}
if(a==n-m) break;
b-=text[a-1]*c; b%=k;
if(b<0) b+=k;
b*=256; b+=text[a+m-1]; b%=k;
}printf("%d words\n",out);
t=clock()-t;
printf("Time : %d ms\n",(int)t);
}
void FA()
{
int a,b,c,d,out=0;
printf("Finite Automaton\n");
clock_t t=clock();
for(a=0;a<=m;a++)
{
for(b=0;b<256;b++)
{
c=min(m+1,a+2);
for(;c>0;c--)
{
for(d=0;d<c-1;d++)
if(word[d]!=word[a-c+1+d]) break;
if(d==c-1 && word[d]==b)
break;
}
tran[a][b]=c;
}
}
for(a=0,b=0;a<n;a++)
{
b=tran[b][text[a]];
if(b==m) out++;
}printf("%d words\n",out);
t=clock()-t;
printf("Time : %d ms\n",(int)t);
}
void KMP()
{
int a,b,out=0;
printf("Knute Morris Pratt\n");
clock_t t=clock();
f[0]=0;
for(a=1,b=0;a<m;a++)
if(word[a]==word[b])
{
f[a]=f[a-1]+1;
b++;
}
else
{
f[a]=0;
b=0;
}
for(a=0,b=0;a<n;a++)
{
while(b>0 && word[b]!=text[a]) b=f[b-1];
if(word[b]==text[a]) b++;
if(b==m)
{
out++;
b=f[b-1];
}
}printf("%d words\n",out);
t=clock()-t;
printf("Time : %d ms\n",(int)t);
}
void BM()
{
int a,b,c,out=0,p,q;
printf("Boyer Moore\n");
clock_t t=clock();
//bad char
for(a=0;a<256;a++) BC[a]=-1;
for(a=m-1;a>=0;a--)
if(BC[word[a]]==-1)
BC[word[a]]=a;
//good suffix
suff[m-1]=m;
for(a=m-2,b=m-1;a>=0;a--)
if(a>b && suff[a+m-1-c]<a-b)
suff[a]=suff[a+m-1-c];
else
{
if(a<b) b=a;
for(c=a;b>=0 && word[b]==word[b+m-1-c];b--);
suff[a]=c-b;
}
for(a=0;a<m;a++)
GS[a]=m;
for(a=m-1,b=0;a>=0;a--)
if(suff[a]==a+1)
for(;b<m-1-a;b++)
if(GS[b]==m)
GS[b]=m-1-a;
for(a=0;a<m-1;a++)
GS[m-1-suff[a]]=m-1-a;
//calculate
for(a=0;a<=n-m;)
{
for(b=m-1;text[a+b]==word[b];b--);
if(b==-1)
{
out++;
a++;
}
else
{
p=b-BC[text[a+b]];
q=GS[b];
a+=max(p,q);
}
}printf("%d words\n",out);
t=clock()-t;
printf("Time : %d ms\n",(int)t);
}
void BMHS()
{
int a,b,c,out=0,p,q,r;
printf("Boyer Moore + Horspool\n");
clock_t t=clock();
//bad char
for(a=0;a<256;a++) BC[a]=-1;
for(a=m-1;a>=0;a--)
if(BC[word[a]]==-1)
BC[word[a]]=a;
//good suffix
suff[m-1]=m;
for(a=m-2,b=m-1;a>=0;a--)
if(a>b && suff[a+m-1-c]<a-b)
suff[a]=suff[a+m-1-c];
else
{
if(a<b) b=a;
for(c=a;b>=0 && word[b]==word[b+m-1-c];b--);
suff[a]=c-b;
}
for(a=0;a<m;a++)
GS[a]=m;
for(a=m-1,b=0;a>=0;a--)
if(suff[a]==a+1)
for(;b<m-1-a;b++)
if(GS[b]==m)
GS[b]=m-1-a;
for(a=0;a<m-1;a++)
GS[m-1-suff[a]]=m-1-a;
//horspool
for(a=0;a<256;a++) HS[a]=-1;
for(a=m-2;a>=0;a--)
if(HS[word[a]]==-1)
HS[word[a]]=a;
for(a=0;a<=n-m;)
{
for(b=m-1;text[a+b]==word[b];b--);
if(b==-1)
{
out++;
a++;
}
else
{
p=b-BC[text[a+b]]; //bad char BM
q=GS[b]; //good suffix BM
r=m-1-HS[text[a+m-1]]; //HS
a+=max(max(p,q),r);
}
}printf("%d words\n",out);
t=clock()-t;
printf("Time : %d ms\n",(int)t);
}
int main()
{
FILE *fp;
fp=fopen("big_edited.txt","r");
fgets(text,50000000,fp);
freopen("test_texts.txt","r",stdin);
for(int a=0;a<10;a++)
{
gets(word);
printf("==========case %d==========\n",a+1);
n=strlen(text); m=strlen(word);
BF(); //Brute Force
RK(); //Rabin Karp
if(a<5) FA(); //Finite Automaton
KMP(); //Knute Morris Pratt
BM(); //Boyer Moore
BMHS(); //Boyer Moore + Horspool
printf("\n");
}
return 0;
}