/* A demonstration of the Boyer-Moore string matching algorithm.

I've used the letter conventions used in the explanation of Boyre-Moore
algorithm given in the book by Levitin */


#include <string.h>
#include <stdio.h>
#include <stdlib.h>

#define ALPHABIT_LENGTH 128
#define TEXT_LENGTH 500
#define PATTERN_LENGTH 20
#define NOT_FOUND -1
#define FOUND 0

/* this function searches for the rightmost occurance of the suffix 
in the pattern,  in the first m-1 characters of pattern. m is the 
pattern length and k is the length of the suffix and returns. It returns
the number of right shift required. (it takes into account if the prefix
is a part of the matched suffix */

// k is assumed to be greater than or equal to 1
// value returned is 'd2' corresponding to textbook

int find_suffix (char *pattern, int k)
{
	int m, result;
	int i, j, h;
	char *suffix = (char *) malloc ((k*sizeof (char))+1);
	if (suffix == NULL)
	{
		printf ("Error allocating memory....\n");
		return EXIT_FAILURE;
	}
	
	m = (int) strlen (pattern);
	strcpy (suffix, (char *) ((int) pattern + (m-k))); // potential bug

// copy the 'good suffix part' to suffix (already successfully matched characters)
// comparison is done from right.
	
	for (i = m-2; i >= 0; i--)
	{
		h = k-1, j = i; result = FOUND;
		while (j >= 0 && (i-j) < k)
			if (suffix[h] == pattern[j])
				j--, h--;
			else
			{
				result = NOT_FOUND;
				break;
			}
			
		if (result == FOUND)
		{
// if first condition is satisfied, second is not checked.
			if (j == -1 || pattern[j] != pattern[m-1-k])
			{
				free (suffix);
				return m-1-i;
			}
// j == -1 implies that beginnning part of the pattern (called prefix)
// is matched. This need not cover the full good suffix. It can only match 
// last few characters of suffix (comparison from right remember?).
		}
	}
			
	free (suffix);
	return m;
}

/* this functions builds the good suffix table. table[k] should give the
appropriate shifts, where k is the number of successful matches. m here is the
length of the pattern string. this would contain the information of shifts for
upto m-1 matches (m matches is a successful search) */

void build_good_suffix_table (char pattern[], int good_suffix_table[])
{
	
	int k, m;
	m = (int) strlen (pattern);
	good_suffix_table[0] = 1;
	
	for (k = 1; k <= m-1; k++)
		good_suffix_table[k] = find_suffix (pattern, k);
}

void build_bad_symbol_table (char pattern[], int bad_symbol_table[])
{
	int m, j;
	
	m = strlen (pattern);
	
	for (j = 0; j < ALPHABIT_LENGTH; j++)
		bad_symbol_table[j] = m;
	
	for (j = 0; j <= m-2; j++)
		bad_symbol_table[(int)pattern[j]] = m-j-1;
	
}

// here, 'k' is the number of successful matches and 'c' is the character mismatched

int calculate_shift (int bad_symbol_table[], int good_suffix_table[], int k, char c)
{
	int bad_symbol_shift, good_suffix_shift;
	
	good_suffix_shift = good_suffix_table[k];
	bad_symbol_shift = bad_symbol_table[(int) c] - k;
	
	if (bad_symbol_shift <= 0)
		bad_symbol_shift = 1;
	
	return (bad_symbol_shift > good_suffix_shift ? bad_symbol_shift : good_suffix_shift);
}

/*

Considering only the "text[i-k] == pattern[m-k-1]" comparison as the basic operation,
that is, "k < m" comparison in the loop not being counted, the variable 'number_of_comparisons'
will give the total number of comparisons that this algorithm makes. 

Note that, as expected, the number of basic operations decrease as the pattern size increase.
So it can be concluded that the algorithm works better for longer search pattern.

(have a look at the wikipedia article)

"Generally the algorithm gets faster as the key being searched for becomes longer"

*/

int main()
{
	int i, m, k, n, position, shift, result;
	
	int number_of_comparisons = 0;
	
	char pattern[PATTERN_LENGTH];
	char text[TEXT_LENGTH];
	
	printf ("Enter the text (maximum of %d characters): \n", TEXT_LENGTH);
	fgets (text, TEXT_LENGTH, stdin);
	n = (int) strlen (text);
	text[--n] = '\0';

//	don't use gets() to read strings. it will not check for buffer overrun.
// 	fgets() includes the newline character also in the string. this operation
// 	is to remove the newline character.
	
	printf ("Enter the pattern (maximum of %d characters): \n", PATTERN_LENGTH);
	fgets (pattern, PATTERN_LENGTH, stdin);
	m = (int) strlen (pattern);
	pattern[--m] = '\0';

	int *good_suffix_table = (int *) malloc (sizeof (int) * m);
	if (good_suffix_table == NULL)
	{
		printf ("Error allocating memory....\n");
		return EXIT_FAILURE;
	}
	int *bad_symbol_table = (int *) malloc (sizeof (int) * ALPHABIT_LENGTH);
	if (bad_symbol_table == NULL)
	{
		printf ("Error allocating memory....\n");
		return EXIT_FAILURE;
	}
	
	build_good_suffix_table (pattern, good_suffix_table);
	build_bad_symbol_table (pattern, bad_symbol_table);
	
/*	
	//the following two loops are just to verify the tables
	printf ("k   :  d2\n");
	for (i = 0; i < m; i++)
		printf ("%d   :   %d\n", i, good_suffix_table[i]);
	
	printf ("\nc   :   d1\n");
	for (i = 0; i < ALPHABIT_LENGTH; i++)
		printf ("%c   :   %d\n", (char) i, bad_symbol_table[i]);
*/
	
	
	position = NOT_FOUND;
	i = m-1;
	while (i <= n-1)
	{
		k = 0;
		
		number_of_comparisons++;
		while (k < m && text[i-k] == pattern[m-k-1])
			k++, number_of_comparisons++;
		if (k == m)
		{
			number_of_comparisons--;
			position =  i-m+1;
			break;
		}
		shift = calculate_shift (bad_symbol_table, good_suffix_table, k, text[i-k]);
		i = i+shift;
	}
	
	if (position == NOT_FOUND)
	{
		printf ("Pattern not found....\n");
		result = EXIT_FAILURE;
	}
	else
	{
		printf ("Pattern found at position %d\n", position);
		result = EXIT_SUCCESS;
	}
	
	printf ("Number of comparisons: %d\n", number_of_comparisons);
	
	free (good_suffix_table);
	free (bad_symbol_table);
	
	return result;
}
www.000webhost.com