String Matching

From Ta Wiki
Revision as of 00:14, 8 March 2019 by Tata (talk | contribs) (Created page with " #include<stdio.h> #include<algorithm> #include<string.h> #include<math.h> #include<time.h> using namespace std; char text[50000005],word[1000005]; int f[1000005]; in...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search
#include<stdio.h> 
#include<algorithm>
#include<string.h>
#include<math.h>
#include<time.h>
using namespace std;
char text[50000005],word[1000005];
int f[1000005];
int suff[1000005];
int tran[10005][300];
int BC[256],GS[1000005],HS[256];
int n,m;
void BF()
{
	int a,b,out=0;
	printf("Brute Force\n");
	clock_t t=clock();
	for(a=0;a<=n-m;a++)
	{
		for(b=0;b<m;b++)
			if(text[a+b]!=word[b])	break;
		if(b==m)
			out++;
	}printf("%d words\n",out);
	t=clock()-t;
	printf("Time : %d ms\n",(int)t);
}
void RK()
{
	int a,b,c=0,p,q,out=0;
	int sum=0,k=2173913;
	printf("Rabin Karp\n");
	clock_t t=clock();
	for(a=0,b=0;a<m;a++)
	{
		sum*=256;	sum+=word[a];	sum%=k;
		b*=256;		b+=text[a];		b%=k;
		c*=256;		c%=k;
		if(!c)	c=1;
	}
	for(a=1;a<=n-m;a++)
	{
		if(b==sum)
		{
			for(p=a-1,q=0;q<m;q++)
				if(text[p+q]!=word[q])	break;
			if(q==m)
				out++;
		}
		if(a==n-m)	break;
		b-=text[a-1]*c;	b%=k;
		if(b<0)	b+=k;
		b*=256;	b+=text[a+m-1];	b%=k;
	}printf("%d words\n",out);
	t=clock()-t;
	printf("Time : %d ms\n",(int)t);
}
void FA()
{
	int a,b,c,d,out=0;
	printf("Finite Automaton\n");
	clock_t t=clock();
	for(a=0;a<=m;a++)
	{
		for(b=0;b<256;b++)
	    {
	    	c=min(m+1,a+2);
	        for(;c>0;c--)
	        {
	        	for(d=0;d<c-1;d++)
	            	if(word[d]!=word[a-c+1+d]) break;
	         	if(d==c-1 && word[d]==b)
	                break;
	        }
	     	tran[a][b]=c;
	    }
	}
	for(a=0,b=0;a<n;a++)
	{
		b=tran[b][text[a]];
		if(b==m) out++;
	}printf("%d words\n",out);
	t=clock()-t;
	printf("Time : %d ms\n",(int)t);
}
void KMP()
{
	int a,b,out=0;
	printf("Knute Morris Pratt\n");
	clock_t t=clock();
	f[0]=0;
	for(a=1,b=0;a<m;a++)
		if(word[a]==word[b])
		{
			f[a]=f[a-1]+1;
			b++;
		}
		else
		{
			f[a]=0;
			b=0;
		}
	for(a=0,b=0;a<n;a++)
	{
		while(b>0 && word[b]!=text[a])	b=f[b-1];
		if(word[b]==text[a])	b++;
		if(b==m)
		{
			out++;
			b=f[b-1];
		}
	}printf("%d words\n",out);
	t=clock()-t;
	printf("Time : %d ms\n",(int)t);
}
void BM()
{
	int a,b,c,out=0,p,q;
	printf("Boyer Moore\n");
	clock_t t=clock();
	//bad char
	for(a=0;a<256;a++)	BC[a]=-1;
	for(a=m-1;a>=0;a--)
		if(BC[word[a]]==-1)
			BC[word[a]]=a;
	//good suffix
	suff[m-1]=m;
	for(a=m-2,b=m-1;a>=0;a--)
		if(a>b && suff[a+m-1-c]<a-b)
			suff[a]=suff[a+m-1-c];
		else
		{
			if(a<b)	b=a;
			for(c=a;b>=0 && word[b]==word[b+m-1-c];b--);
			suff[a]=c-b;
		}
	for(a=0;a<m;a++)
		GS[a]=m;
	for(a=m-1,b=0;a>=0;a--)
		if(suff[a]==a+1)
			for(;b<m-1-a;b++)
				if(GS[b]==m)
					GS[b]=m-1-a;
	for(a=0;a<m-1;a++)
		GS[m-1-suff[a]]=m-1-a;
	//calculate
	for(a=0;a<=n-m;)
	{
		for(b=m-1;text[a+b]==word[b];b--);
		if(b==-1)
		{
			out++;
			a++;
		}
		else
		{
			p=b-BC[text[a+b]];
			q=GS[b];
			a+=max(p,q);
		}
	}printf("%d words\n",out);
	t=clock()-t;
	printf("Time : %d ms\n",(int)t);
}
void BMHS()
{
	int a,b,c,out=0,p,q,r;
	printf("Boyer Moore + Horspool\n");
	clock_t t=clock();
	//bad char
	for(a=0;a<256;a++)	BC[a]=-1;
	for(a=m-1;a>=0;a--)
		if(BC[word[a]]==-1)
			BC[word[a]]=a;
	//good suffix
	suff[m-1]=m;
	for(a=m-2,b=m-1;a>=0;a--)
		if(a>b && suff[a+m-1-c]<a-b)
			suff[a]=suff[a+m-1-c];
		else
		{
			if(a<b)	b=a;
			for(c=a;b>=0 && word[b]==word[b+m-1-c];b--);
			suff[a]=c-b;
		}
	for(a=0;a<m;a++)
		GS[a]=m;
	for(a=m-1,b=0;a>=0;a--)
		if(suff[a]==a+1)
			for(;b<m-1-a;b++)
				if(GS[b]==m)
					GS[b]=m-1-a;
	for(a=0;a<m-1;a++)
		GS[m-1-suff[a]]=m-1-a;
	//horspool
	for(a=0;a<256;a++)	HS[a]=-1;
	for(a=m-2;a>=0;a--)
		if(HS[word[a]]==-1)
			HS[word[a]]=a;
	for(a=0;a<=n-m;)
	{
		for(b=m-1;text[a+b]==word[b];b--);
		if(b==-1)
		{
			out++;
			a++;
		}
		else
		{
			p=b-BC[text[a+b]];	//bad char BM
			q=GS[b];	//good suffix BM
			r=m-1-HS[text[a+m-1]];	//HS
			a+=max(max(p,q),r);
		}
	}printf("%d words\n",out);
	t=clock()-t;
	printf("Time : %d ms\n",(int)t);
}
int main()
{
	
	FILE *fp;
	fp=fopen("big_edited.txt","r");
	fgets(text,50000000,fp);
	freopen("test_texts.txt","r",stdin);
	for(int a=0;a<10;a++)
	{
		gets(word);
		printf("==========case %d==========\n",a+1);
		n=strlen(text);	m=strlen(word);
		BF();	//Brute Force
		RK();	//Rabin Karp
		if(a<5)	FA();	//Finite Automaton
		KMP();	//Knute Morris Pratt
		BM();	//Boyer Moore
		BMHS();	//Boyer Moore + Horspool
		printf("\n");
	}
	return 0;
}