String Matching
Jump to navigation
Jump to search
#include<stdio.h> #include<algorithm> #include<string.h> #include<math.h> #include<time.h> using namespace std; char text[50000005],word[1000005]; int f[1000005]; int suff[1000005]; int tran[10005][300]; int BC[256],GS[1000005],HS[256]; int n,m; void BF() { int a,b,out=0; printf("Brute Force\n"); clock_t t=clock(); for(a=0;a<=n-m;a++) { for(b=0;b<m;b++) if(text[a+b]!=word[b]) break; if(b==m) out++; }printf("%d words\n",out); t=clock()-t; printf("Time : %d ms\n",(int)t); } void RK() { int a,b,c=0,p,q,out=0; int sum=0,k=2173913; printf("Rabin Karp\n"); clock_t t=clock(); for(a=0,b=0;a<m;a++) { sum*=256; sum+=word[a]; sum%=k; b*=256; b+=text[a]; b%=k; c*=256; c%=k; if(!c) c=1; } for(a=1;a<=n-m;a++) { if(b==sum) { for(p=a-1,q=0;q<m;q++) if(text[p+q]!=word[q]) break; if(q==m) out++; } if(a==n-m) break; b-=text[a-1]*c; b%=k; if(b<0) b+=k; b*=256; b+=text[a+m-1]; b%=k; }printf("%d words\n",out); t=clock()-t; printf("Time : %d ms\n",(int)t); } void FA() { int a,b,c,d,out=0; printf("Finite Automaton\n"); clock_t t=clock(); for(a=0;a<=m;a++) { for(b=0;b<256;b++) { c=min(m+1,a+2); for(;c>0;c--) { for(d=0;d<c-1;d++) if(word[d]!=word[a-c+1+d]) break; if(d==c-1 && word[d]==b) break; } tran[a][b]=c; } } for(a=0,b=0;a<n;a++) { b=tran[b][text[a]]; if(b==m) out++; }printf("%d words\n",out); t=clock()-t; printf("Time : %d ms\n",(int)t); } void KMP() { int a,b,out=0; printf("Knute Morris Pratt\n"); clock_t t=clock(); f[0]=0; for(a=1,b=0;a<m;a++) if(word[a]==word[b]) { f[a]=f[a-1]+1; b++; } else { f[a]=0; b=0; } for(a=0,b=0;a<n;a++) { while(b>0 && word[b]!=text[a]) b=f[b-1]; if(word[b]==text[a]) b++; if(b==m) { out++; b=f[b-1]; } }printf("%d words\n",out); t=clock()-t; printf("Time : %d ms\n",(int)t); } void BM() { int a,b,c,out=0,p,q; printf("Boyer Moore\n"); clock_t t=clock(); //bad char for(a=0;a<256;a++) BC[a]=-1; for(a=m-1;a>=0;a--) if(BC[word[a]]==-1) BC[word[a]]=a; //good suffix suff[m-1]=m; for(a=m-2,b=m-1;a>=0;a--) if(a>b && suff[a+m-1-c]<a-b) suff[a]=suff[a+m-1-c]; else { if(a<b) b=a; for(c=a;b>=0 && word[b]==word[b+m-1-c];b--); suff[a]=c-b; } for(a=0;a<m;a++) GS[a]=m; for(a=m-1,b=0;a>=0;a--) if(suff[a]==a+1) for(;b<m-1-a;b++) if(GS[b]==m) GS[b]=m-1-a; for(a=0;a<m-1;a++) GS[m-1-suff[a]]=m-1-a; //calculate for(a=0;a<=n-m;) { for(b=m-1;text[a+b]==word[b];b--); if(b==-1) { out++; a++; } else { p=b-BC[text[a+b]]; q=GS[b]; a+=max(p,q); } }printf("%d words\n",out); t=clock()-t; printf("Time : %d ms\n",(int)t); } void BMHS() { int a,b,c,out=0,p,q,r; printf("Boyer Moore + Horspool\n"); clock_t t=clock(); //bad char for(a=0;a<256;a++) BC[a]=-1; for(a=m-1;a>=0;a--) if(BC[word[a]]==-1) BC[word[a]]=a; //good suffix suff[m-1]=m; for(a=m-2,b=m-1;a>=0;a--) if(a>b && suff[a+m-1-c]<a-b) suff[a]=suff[a+m-1-c]; else { if(a<b) b=a; for(c=a;b>=0 && word[b]==word[b+m-1-c];b--); suff[a]=c-b; } for(a=0;a<m;a++) GS[a]=m; for(a=m-1,b=0;a>=0;a--) if(suff[a]==a+1) for(;b<m-1-a;b++) if(GS[b]==m) GS[b]=m-1-a; for(a=0;a<m-1;a++) GS[m-1-suff[a]]=m-1-a; //horspool for(a=0;a<256;a++) HS[a]=-1; for(a=m-2;a>=0;a--) if(HS[word[a]]==-1) HS[word[a]]=a; for(a=0;a<=n-m;) { for(b=m-1;text[a+b]==word[b];b--); if(b==-1) { out++; a++; } else { p=b-BC[text[a+b]]; //bad char BM q=GS[b]; //good suffix BM r=m-1-HS[text[a+m-1]]; //HS a+=max(max(p,q),r); } }printf("%d words\n",out); t=clock()-t; printf("Time : %d ms\n",(int)t); } int main() { FILE *fp; fp=fopen("big_edited.txt","r"); fgets(text,50000000,fp); freopen("test_texts.txt","r",stdin); for(int a=0;a<10;a++) { gets(word); printf("==========case %d==========\n",a+1); n=strlen(text); m=strlen(word); BF(); //Brute Force RK(); //Rabin Karp if(a<5) FA(); //Finite Automaton KMP(); //Knute Morris Pratt BM(); //Boyer Moore BMHS(); //Boyer Moore + Horspool printf("\n"); } return 0; }