/* KMP.c: Implements strstr library function using the Knuth-Morris-Pratt substring search algorithm. By Terry R. McConnell The entry point my_strstr is an implementation of the standard C library function strstr using the KMP algorithm. Extensive documentation is provided in the form of comments. There is also a main program invoked as "kmp " (or with -h option for usage information) which searches for the first occurence of the target in the stdin stream and prints information about where it was found. Reference: Knuth, D.E., J.H. Morris, and V.R. Pratt, "Fast pattern matching in strings", SIAM J. Computing, 6:2, 323-350. Compile: cc -o kmp kmp.c Include -DMAX_SOURCE=... to change size of largest string that can be searched. */ #include #include #include #ifndef MAX_SOURCE #define MAX_SOURCE 16384 #endif #define VERSION "1.1" #define PROGNAME "kmp" #define USAGE "kmp [-flnvhr] target" #define HELP "-h: print this helpful message\n\ -v: print version number and exit\n\ -l: use library version of strstr instead. (Useful for benchmarking.)\n\ -f: list definition of the Knuth-Morris-Pratt failure function for target\n\ -r: print the shortest repeating prefix of target \n\ -n: count occurrences of target in source and exit\n\ Read stdin and find first occurence of target string in it. \n\ Print information about where it was found.\n\n" int *f; /* Pointer to array containing values of failure function */ /* Create the failure function for target string. The failure function is defined as follows, for i = 1 to m, where m is the length of the target string: Let the target string be b_1...b_m. Then f(i) is the length of the longest proper suffix of b_1...b_i which is a prefix of the target string. If you are trying to find the first occurence of a target string in a stream of input characters and the i+1st character is wrong, you need not start over looking for the beginning of the target from the next received character on, because the last few characters received might still be a valid head start on forming the target. The failure function tells exactly how many characters head start you have. In more colorful language, the idea behind this algorithm is: Don't throw the baby out with the bathwater. The code below was translated into C from psuedo code given on page 151 of Aho, Sethi, and Ullman. Unfortunately, the C-convention of starting array indices from 0 makes the code harder to follow than it should be. To make it easier, we never use f[0]. */ int make_f(const char *target){ int t = 0; /* think of t+1 as the length of the current valid prefix. */ int s,m; m = strlen(target); f = malloc((m+1)*sizeof(int)); f[1] = 0; /* When first character is received there is no proper suffix. */ for(s=1;s 0) && (target[s] != target[t])) t = f[t]; /* This is the subtle part of the algorithm. The target does not begin t_0...t_t t_s. We could examine t_1...t_s, t_2...t_s,... successively to see which (if any) gives a valid prefix. But this is not necessary. Say the longest valid prefix is t_k...t_t t_s. Then t_k...t_t is also a valid prefix. But f(t) gave the length of the longest valid prefix of the form t_k...t_t. Thus no k such that t_k..t_t is longer than f(t) need be considered. Now iterate this reasoning to conclude that only candidates t_k...t_t of lengths f(t), f(f(t)),... need be considered. */ if(target[t] == target[s]){ /* The next character can be added to current prefix to get a longer valid prefix. Good! */ t++; f[s+1] = t; } else f[s+1] = 0; /* Clearly the while loop above must have exited due to t=0. */ } } /* Theoretical note: the assignment t = f[t] can be done at most m-1 times, where m is the length of the target string (although the while loop can iterate more than once on given pass. For example, if the target is abababb, then f(6)=4, f(f(6))=2, and f(f(f(6)))= 0. The assignment is done 3 times when s = 6.) It is then easy to see that the running time of the routine make_f is O(m). Lemma. Let f be a non-negative integer valued function such that f(1) = 0, and so that f is "continuous upwards", i.e., f(t+1) <= f(t) + 1. Then __ A = \ [ f(s) - f(s+1) ]+ <= (l-1), /__ where a+ denotes the maximum of a and 0, and a- denotes the minimum of a and 0. Here and below, all sums run from 1 to l-1. Proof: Since a = (a+) + (a-) for any number a, we have __ -f(l) = f(1) - f(l) = \ [ f(s) - f(s+1) ] = /__ __ A + \ [ f(s) - f(s+1) ]- >= A - (l-1). /__ Using f(l) >= 0 and rearranging, we obtain the desired inequality. QED. Consider now a given iteration of the 'for' loop with a given value of s. Let n be the number of times the assignment t = f(t) is done. (n) (n) (n) Note that f(s+1) is either f (s) + 1, or 0 = f(s), where f denotes the n-1 fold iterate of f. Since f(t) < t (only proper suffixes are considered,) we have (n) f (s) <= f(s) - n + 1. Thus, in either case, f(s) - f(s+1) >= n. Now apply the lemma to obtain the desired bound on the sum of n. */ /* Return a pointer to the first occurence of target string in source * string, or null if the target does not occur in the source. */ char *my_strstr(const char *src, const char *target){ int s = 0; /* Trie state */ int i; /* Current index of source char */ int m; make_f(target); /* Create failure function for this target */ m = strlen(target); for(i=0;i0) && (src[i] != target[s])) s = f[s]; if(src[i] == target[s]) s++; if(s==m) return ((char *)src+i-m+1); } return NULL; } /* Return a pointer to the shortest repeating prefix of s. Note that s is destroyed in the process. The shortest repeating prefix a string s is defined to be the shortest string t such that s = t^k, where the exponentiation denotes string concatentation. The following algorithm works because the length of t clearly belongs to m = strlen(s), f(m), f(f(m)), ... */ char *srepp(char *s){ int m,u,k; char *buf; m = strlen(s); buf = malloc(m+1); buf[0]='\0'; make_f(s); /* u is a trial repeating prefix */ u = m; while((u=f[u])>0) /* If u is a repeating prefix, set m = u: m records the length of the shortest repeating prefix found so far */ if(m%u==0){ /* u must divide m */ k = m/u; while(k--) strncat(buf,s,u); if(strcmp(s,buf)==0){ m=u; s[m]='\0'; } buf[0]='\0'; } return s; } /* Print out the failure function */ void list_f(char *target){ int i; printf("Failure function for %s:\n", target); for(i=1;i<=strlen(target);i++) printf("f[%d] = %d\n",i,f[i]); } int main(int argc, char **argv){ char source[MAX_SOURCE],*target,*p; int c,i=0; int f_flag = 0,n_flag = 0,r_flag=0; int lines = 0; char *ll; char *(*use_strstr)(const char *,const char*) = &my_strstr; /* Process command line */ if(argc <= 1){ fprintf(stderr,"%s:%s\n",PROGNAME,USAGE); return 1; } i=1; while(argv[i][0]=='-'){ switch(argv[i][1]){ case 'f': f_flag=1; break; case 'l': use_strstr = &strstr; break; case 'n': n_flag=1; break; case 'v': printf("%s\n",VERSION); return 0; case 'h': printf("%s\n",USAGE); printf("%s\n",HELP); return 0; case 'r': r_flag = 1; break; default: fprintf(stderr,"%s:%s\n",PROGNAME,USAGE); return 1; } i++; } if(i != argc-1){ fprintf(stderr,"%s:%s\n",PROGNAME,USAGE); return 1; } target = argv[i]; if(r_flag){ printf("%s\n",srepp(target)); return 0; } if(f_flag){ make_f(target); list_f(target); return 0; } /* Get the source string from stdin */ i=0; while((c=getchar())!=EOF){ source[i++]=(char)c; if(i >= MAX_SOURCE){ fprintf(stderr,"Source too long. Must be < %d\n", MAX_SOURCE); return 1; } } source[i] = '\0'; /* properly terminate */ if(n_flag){ make_f(target); c=0; p=source; while(p=use_strstr(p,target))c++,p++; printf("Target '%s' found %d times in source.\n",target,c); return 0; } /* Search for target in source */ p = use_strstr(source,target); printf("target = %s\n",target); if(p){ /* don't print out lines before the one containing target */ ll = source; /* Position of start of most recent line */ for(i=0;i