/* binfind - find byte sequences in binary files */
/* Copyright (C) 2005 Edwin Steiner <edwin.steiner@gmx.net> */

/* This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#if HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdint.h>

#if HAVE_GETOPT_H
#	include <getopt.h>
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/**********************************************************************/
/* TYPES                                                              */
/**********************************************************************/

/*
 * binfind_off_t is the integer type used for the file offsets that are
 * reported for each match.  The first available candidate wins, in this
 * order: __off64_t, unsigned long long, off_t, unsigned long.
 * SIZEOF_BINFIND_OFF_T is set to the size macro of the chosen type and is
 * used further down to select the printf formats for offsets.
 *
 * NOTE(review): the HAVE_* and SIZEOF_* macros are presumably supplied by
 * config.h; that file is not visible from here -- confirm.
 */
#if HAVE___OFF64_T
	typedef __off64_t binfind_off_t;
#	define SIZEOF_BINFIND_OFF_T  SIZEOF___OFF64_T
#elif HAVE_LONG_LONG
	typedef unsigned long long binfind_off_t;
#	define SIZEOF_BINFIND_OFF_T  SIZEOF_LONG_LONG
#elif HAVE_OFF_T
	typedef off_t binfind_off_t;
#	define SIZEOF_BINFIND_OFF_T  SIZEOF_OFF_T
#else
	typedef unsigned long binfind_off_t;
#	define SIZEOF_BINFIND_OFF_T  SIZEOF_LONG
#endif

/**********************************************************************/
/* CONSTANTS                                                          */
/**********************************************************************/

/* version string printed by --version; VERSION is not defined in this file */
#define BINFIND_VERSION  VERSION

                            /* BLOCKSIZE must be a power of 2 */
                            /* (BLOCKMASK relies on it to extract the */
                            /* low bits of an address)                */
#define BLOCKSIZE           512
#define BLOCKMASK           (BLOCKSIZE - 1)

/* lower bound for the search buffer size, in bytes */
#define MIN_BUFFER_SIZE    (BLOCKSIZE * 256)

/* initial allocation for the pattern, in bytes; doubled whenever it fills */
#define INITIAL_PATTERN_CAPACITY  1024

/* zero-padded field width for hexadecimal offsets: two digits per byte */
#if SIZEOF_BINFIND_OFF_T == 4
#	define HEX_PREFIX "%08"
#elif SIZEOF_BINFIND_OFF_T == 8
#	define HEX_PREFIX "%016"
#else
#	define HEX_PREFIX "%"
#endif

/* pick the printf length modifier whose integer type has the same size as
 * binfind_off_t; all formats print the offset as an unsigned value */
#if SIZEOF_BINFIND_OFF_T == SIZEOF_LONG
#	define OFFSET_DEC_FORMAT  "%lu"
#	define OFFSET_HEX_FORMAT  HEX_PREFIX "lx"
#elif SIZEOF_BINFIND_OFF_T == SIZEOF_LONG_LONG
#	define OFFSET_DEC_FORMAT  "%llu"
#	define OFFSET_HEX_FORMAT  HEX_PREFIX "llx"
#elif SIZEOF_BINFIND_OFF_T == SIZEOF_INT
#	define OFFSET_DEC_FORMAT  "%u"
#	define OFFSET_HEX_FORMAT  HEX_PREFIX "x"
#else
#	error "Could not determine printf format for offset type"
#endif

/**********************************************************************/
/* GLOBAL VARIABLES                                                   */
/**********************************************************************/

/* name used as prefix in diagnostics; set from argv[0] in main() */
static const char *progname;

/* options and option-like parameters: */

/* verbosity level; incremented once per --verbose */
static int o_verbose = 0;
/* 0 = do not print file names, 1 = default (main() turns it off when at
 * most one file is given), 2 = always print (-H) */
static int o_printfilename = 1;
/* characters that separate tokens in the argument of -x */
static char *o_hex_delimiters = " \t,";
/* non-zero: the bytes of each -x token are appended in reverse order */
static int o_little_endian = 0;
/* radix for printed offsets: 10 or 16 */
static int o_offset_radix = 10;

/* option definitions: */

/* return codes for long options that have no short equivalent */
enum {
	OPT_HELP = 1000,
	OPT_VERBOSE
};

/* Note that 'V' (returned for --version) is not listed here, so the
 * version can only be requested through the long option.
 * NOTE(review): in GNU getopt a "W;" entry makes "-W foo" act like
 * "--foo" -- confirm against the getopt implementation in use. */
static char short_options[] = "a:x:f:o:LBhHW;";
static struct option long_options[] = {
	{"big-endian",0,NULL,'B'},
	{"help",0,NULL,OPT_HELP},
	{"little-endian",0,NULL,'L'},
	{"no-filename",0,NULL,'h'},
	{"offset-radix",1,NULL,'o'},
	{"verbose",0,NULL,OPT_VERBOSE},
	{"version",0,NULL,'V'},
	{"with-filename",0,NULL,'H'},
	{NULL,0,NULL,0}
};

/* work variables: */

/* pointer returned by malloc(); this is the one passed to free() */
static unsigned char *g_unaligned_buffer;
/* BLOCKSIZE-aligned start of the usable buffer inside g_unaligned_buffer */
static unsigned char *g_buffer;
/* usable buffer size in bytes (a multiple of BLOCKSIZE) */
static int g_bufsize;
/* number of valid bytes currently held in g_buffer */
static int g_bytesinbuf;
/* file offset that corresponds to g_buffer[0] */
static binfind_off_t g_bufferoffset; /* XXX handle 64 bit offsets */
/* set once read() has returned 0 for the current file */
static int g_reachedeof;

/* the pattern to search for, its length and its allocated size in bytes */
static unsigned char *g_pattern;
static int g_patternlength;
static int g_patterncapacity;

/* Boyer-Moore tables: one entry per byte value / per pattern position */
static int *g_bad_char_shift;
static int *g_good_suffix_shift;

/* set as soon as any match has been printed; determines the exit status */
static int g_found_it;

/**********************************************************************/
/* CODE                                                               */
/**********************************************************************/

/*--------------------------------------------------------------------*/
/* error reporting                                                    */
/*--------------------------------------------------------------------*/

/* Report an allocation failure on stderr and terminate with status 2. */
static void
error_nomem(void)
{
	const int exit_status = 2;

	fprintf(stderr,"%s: out of memory\n",progname);
	exit(exit_status);
}

/*
 * Report a failed system or library call and terminate with status 2.
 *
 * msg    what was being attempted
 * fname  file the operation concerned, or NULL if there is none
 *
 * The text for the current value of errno is appended to the message.
 */
static void
error(const char *msg,const char *fname)
{
	const char *reason = strerror(errno);

	if (fname == NULL)
		fprintf(stderr,"%s: %s: %s\n",progname,msg,reason);
	else
		fprintf(stderr,"%s: %s: %s: %s\n",progname,msg,fname,reason);

	exit(2);
}

/*
 * Report an error that is described by a plain string (errno is not
 * consulted) and terminate.
 *
 * msg      short description of the problem
 * str      detail to print after the description; must not be NULL
 * exitval  exit status for the process
 */
static void
errorstr(const char *msg,const char *str,int exitval)
{
	assert(str != NULL);

	fprintf(stderr,"%s: %s: %s\n",progname,msg,str);
	exit(exitval);
}

/* Print a warning on stderr that an offset computation wrapped around.
 * Execution continues afterwards. */
static void
warn_offset(void)
{
	static const char what[] = "offset overflow";

	fprintf(stderr,"%s: warning: %s\n",progname,what);
}

/*--------------------------------------------------------------------*/
/* safe output functions                                              */
/*--------------------------------------------------------------------*/

/* Write one character to stdout; a write failure is fatal (see error()). */
static void
outc(int ch)
{
	if (fputc(ch,stdout) == EOF)
		error("error writing to stdout",NULL);
}

/* Write a NUL-terminated string to stdout; a write failure is fatal
 * (see error()). */
static void
outs(const char *str)
{
	if (fputs(str,stdout) == EOF)
		error("error writing to stdout",NULL);
}

/* outint: print an int in decimal on stdout, treating a write failure as
 * fatal.  The function is compiled out by the #if 0 below. */
#if 0
static void
outint(int value)
{
	int ret;

	ret = printf("%d",value);
	if (ret < 0)
		error("error writing to stdout",NULL);
}
#endif

/*
 * Print a file offset on stdout, in hexadecimal when o_offset_radix is 16
 * and in decimal otherwise.  A write failure is fatal (see error()).
 */
static void
outoffset(binfind_off_t value)
{
	int written;

	if (o_offset_radix == 16)
		written = printf(OFFSET_HEX_FORMAT,value);
	else
		written = printf(OFFSET_DEC_FORMAT,value);

	if (written < 0)
		error("error writing to stdout",NULL);
}

/*--------------------------------------------------------------------*/
/* buffer handling                                                    */
/*--------------------------------------------------------------------*/

static void
buffer_init(int patternlength)
{
	unsigned int misalign;
	
	assert(patternlength > 0);
	
	g_bufsize = 2 * patternlength;
	if (g_bufsize < MIN_BUFFER_SIZE)
		g_bufsize = MIN_BUFFER_SIZE;

	g_bufsize = (g_bufsize + BLOCKSIZE - 1) / BLOCKSIZE;
	g_bufsize *= BLOCKSIZE;

	/* be paranoid about overflows */
	assert(g_bufsize > 0);
        assert(g_bufsize % BLOCKSIZE == 0);

	if (o_verbose) {
		fprintf(stderr,"buffer size: %d\n",g_bufsize);
		fprintf(stderr,"block size: %d\n",BLOCKSIZE);
		fflush(stderr);
	}

	g_unaligned_buffer = (unsigned char *) malloc(g_bufsize + BLOCKSIZE - 1);
	if (!g_unaligned_buffer)
		error_nomem();
	g_buffer = g_unaligned_buffer;
	
	/* the & should be safer than % here */
	misalign = (unsigned int)g_unaligned_buffer & BLOCKMASK;
	assert(misalign >= 0 && misalign < BLOCKSIZE);
	if (misalign) {
		g_buffer += (BLOCKSIZE - misalign);
	}

	g_reachedeof = 0;
	g_bytesinbuf = 0;
	g_bufferoffset = 0;
}

/*
 * Refill the search buffer from fd.
 *
 * fd        descriptor to read from
 * filename  name used in error messages
 * preserve  number of bytes at the end of the buffer that must be kept
 *           because a match may still start there
 * shift     out: number of bytes by which the buffer contents moved
 *           towards the start (always a multiple of BLOCKSIZE); the caller
 *           has to subtract it from its positions in the buffer
 *
 * Only whole blocks are discarded, so the data that remains keeps its
 * position relative to block boundaries.  Reading continues until the
 * buffer is full or read() returns 0, in which case g_reachedeof is set.
 * Does nothing except clearing *shift once EOF has been reached.
 * A read error terminates the program.
 */
static void
buffer_fill(int fd,const char *filename,int preserve,int *shift)
{
	int obsolete;
	int discard;
	int saved_errno;
	ssize_t res;
	binfind_off_t oldofs;

	*shift = 0;
	if (g_reachedeof)
		return;

	assert(preserve >= 0);
	assert(preserve <= g_bytesinbuf);
	assert(g_bytesinbuf == g_bufsize || g_bytesinbuf == 0);

	/* determine how many bytes to discard */
	obsolete = g_bytesinbuf - preserve;
	discard = (obsolete / BLOCKSIZE) * BLOCKSIZE;
	*shift = discard;

	if (g_bytesinbuf) {
		/* if we cannot discard any blocks we are stuck */
		assert(discard > 0);

		if (o_verbose > 1)
			fprintf(stderr,"discarding %d\n",discard);

		/* copy the preserved bytes; the (obsolete % BLOCKSIZE) bytes in
		 * front of them are accounted for but not copied */
		memmove(g_buffer + (obsolete % BLOCKSIZE),
				g_buffer + obsolete,
				preserve);
		g_bytesinbuf -= discard;
		oldofs = g_bufferoffset;
		g_bufferoffset += discard;
		if (g_bufferoffset < oldofs)
			warn_offset();
	}

	do {
#ifdef BINFIND_DEBUG_READ
		fprintf(stderr,"read(%d,%p,%d)\n",fd,(void *)(g_buffer + g_bytesinbuf),g_bufsize - g_bytesinbuf);
		fflush(stderr);
#endif
		res = read(fd,g_buffer + g_bytesinbuf,g_bufsize - g_bytesinbuf);
		/* remember errno now: the debug output below may change it */
		saved_errno = errno;
#ifdef BINFIND_DEBUG_READ
		fprintf(stderr,"    => %ld\n",(long)res);
		fflush(stderr);
#endif
		if (res < 0) {
			/* POSIX: read() fails with EINTR only if it was interrupted
			 * before any data was read, so retrying loses nothing */
			if (saved_errno == EINTR)
				continue;
			errno = saved_errno;
			error("error reading file",filename);
		}

		if (res == 0) {
			g_reachedeof = 1;
			break;
		}

		g_bytesinbuf += res;

	} while (g_bytesinbuf < g_bufsize);
}

/* Release the search buffer and clear both pointers into it. */
static void
buffer_free(void)
{
	/* g_buffer points into the same allocation, so only the pointer
	 * obtained from malloc() is freed */
	free(g_unaligned_buffer);
	g_unaligned_buffer = NULL;
	g_buffer = NULL;
}

/*--------------------------------------------------------------------*/
/* pattern handling                                                   */
/*--------------------------------------------------------------------*/

/* Start with an empty pattern and room for INITIAL_PATTERN_CAPACITY bytes.
 * Terminates the program if the allocation fails. */
static void
pattern_init(void)
{
	g_patternlength = 0;
	g_patterncapacity = INITIAL_PATTERN_CAPACITY;

	g_pattern = (unsigned char *) malloc(g_patterncapacity);
	if (g_pattern == NULL)
		error_nomem();
}

static void
pattern_add(int ch)
{
	if (g_patternlength == g_patterncapacity) {
		g_patterncapacity *= 2;
		assert(g_patterncapacity > 0);
		g_pattern = (unsigned char *) realloc(g_pattern,g_patterncapacity);
		if (!g_pattern)
			error_nomem();
	}

	g_pattern[g_patternlength++] = ch;
}

/* Release the pattern storage and clear the pointer to it. */
static void
pattern_free(void)
{
	free(g_pattern);
	g_pattern = NULL;
}

/* Append every character of str, up to but not including the terminating
 * NUL, to the pattern. */
static void
pattern_add_ascii(const char *str)
{
	const char *cursor;

	for (cursor = str; *cursor != '\0'; ++cursor)
		pattern_add(*cursor);
}

/*
 * Convert two hexadecimal digits to a byte and append it to the pattern.
 *
 * hexbyte  points at the two digits (upper or lower case)
 * hex      the complete hex string, used only for the error message
 *
 * Terminates the program with status 2 if a character is not a hex digit.
 */
static void
pattern_add_hexbyte(const char *hexbyte,const char *hex)
{
	int i;
	int value = 0;
	int ch;
	for (i=0; i<2; ++i) {
		value <<= 4;
		/* <ctype.h> functions require a value representable as
		 * unsigned char (or EOF); a plain char may be negative */
		ch = tolower((unsigned char) hexbyte[i]);
		if (ch >= '0' && ch <= '9')
			value += (ch - '0');
		else if (ch >= 'a' && ch <= 'f')
			value += 10 + (ch - 'a');
		else
			errorstr("invalid hex character",hex,2);
	}
	pattern_add(value);
}

/*
 * Parse a string of hexadecimal tokens and append the bytes to the pattern.
 *
 * Tokens are separated by the characters in o_hex_delimiters.  A token may
 * start with "0x" and must consist of an even number of hex digits.  The
 * bytes of a token are appended left to right, or right to left when
 * o_little_endian is set.
 *
 * hex  the string to parse
 *
 * Terminates the program with status 2 on malformed input.
 */
static void
pattern_add_hex(const char *hex)
{
	const char *p = hex;
	size_t len;
	size_t pos;

	while (*p) {
		/* skip delimiters */
		p += strspn(p,o_hex_delimiters);

		/* skip 0x */
		if (p[0] == '0' && p[1] == 'x')
			p += 2;

		/* get next token */
		len = strcspn(p,o_hex_delimiters);
		if (len == 0) {
			/* a delimiter here means a "0x" without any digits */
			if (*p)
				errorstr("invalid hex string",hex,2);
			return;
		}

		if (len & 1)
			errorstr("odd number of hex digits",hex,2);

		/* Index into the token instead of stepping a pointer, so that no
		 * pointer in front of the token (and possibly in front of the
		 * string) is ever formed in little-endian mode. */
		if (o_little_endian) {
			for (pos = len; pos > 0; pos -= 2)
				pattern_add_hexbyte(p + pos - 2,hex);
		}
		else {
			for (pos = 0; pos < len; pos += 2)
				pattern_add_hexbyte(p + pos,hex);
		}

		p += len;
	}
}

/*
 * Append the complete contents of a file to the pattern.
 *
 * filename  file to read; "-" means standard input
 *
 * Terminates the program if the file cannot be opened, read or closed.
 */
static void
pattern_add_file(const char *filename)
{
	FILE *file;
	int ch;

	if (strcmp(filename,"-") == 0) {
		file = stdin;
		filename = "(standard input)";
	}
	else {
		/* the pattern is binary data: no text-mode translation */
		file = fopen(filename,"rb");
		if (!file)
			error("could not open file",filename);
	}

	while ((ch = fgetc(file)) != EOF)
		pattern_add(ch);

	/* Distinguish a read error from end of file with ferror(); errno is
	 * not reliable for this because library functions are allowed to set
	 * it even when they succeed. */
	if (ferror(file))
		error("error reading file",filename);

	if (file != stdin && fclose(file) != 0)
		error("could not close file",filename);
}

/*--------------------------------------------------------------------*/
/* Boyer-Moore search                                                 */
/*--------------------------------------------------------------------*/

/*
 * Fill g_bad_char_shift (256 entries, one per byte value).
 *
 * Each entry holds the usual bad character distance with the constant
 * (1 - len) already added:
 *   - a byte that does not occur in pattern[0 .. len-2] gets
 *     len + (1 - len) == 1
 *   - a byte whose last occurrence in pattern[0 .. len-2] is at index k
 *     gets (len - k - 1) + (1 - len) == -k
 * The caller adds the index of the mismatching pattern position.
 */
static void
boyer_moore_bad_char_shifts(const unsigned char *pattern, int len)
{
	int byte;
	int pos;

	for (byte = 0; byte < 0x100; ++byte)
		g_bad_char_shift[byte] = 1;

	/* ascending order, so the rightmost occurrence wins */
	for (pos = 0; pos < len - 1; ++pos)
		g_bad_char_shift[pattern[pos]] = -pos;
}

/*
 * Compute the suffix table used to build the good suffix shifts.
 *
 * On return suff[i] is the length of the longest string that ends at
 * pattern[i] and is also a suffix of the whole pattern; suff[len - 1] is
 * len.
 *
 * pattern  the pattern bytes
 * len      number of bytes in pattern; must be > 0
 * suff     out: array with room for len ints
 */
static void
boyer_moore_suffixes(const unsigned char *pattern, int len, int *suff)
{
	const int last = len - 1;
	int anchor = last;  /* position where the current matched window ends */
	int edge = last;    /* first position left of the current matched window */
	int pos;

	suff[last] = len;

	for (pos = last - 1; pos >= 0; --pos) {
		if (pos > edge) {
			/* pos lies inside the window: try to reuse the value of the
			 * corresponding position near the end of the pattern */
			int mirrored = suff[pos + last - anchor];

			if (mirrored < pos - edge) {
				suff[pos] = mirrored;
				continue;
			}
		}

		/* compare explicitly, extending the window to the left */
		if (pos < edge)
			edge = pos;
		anchor = pos;
		while (edge >= 0 && pattern[edge] == pattern[edge + last - anchor])
			--edge;
		suff[pos] = anchor - edge;
	}
}

/*
 * Fill g_good_suffix_shift (len entries) from the suffix table of the
 * pattern.  The temporary suffix table is allocated and released here.
 *
 * pattern  the pattern bytes
 * len      number of bytes in pattern
 *
 * Terminates the program if memory cannot be allocated.
 */
static void
boyer_moore_good_suffix_shifts(const unsigned char *pattern, int len)
{
	const int last = len - 1;
	int *suff;
	int i;
	int fill;

	suff = (int *) malloc(sizeof *suff * len);
	if (suff == NULL)
		error_nomem();

	boyer_moore_suffixes(pattern, len, suff);

	/* default: shift by the whole pattern length */
	for (i = 0; i < len; ++i)
		g_good_suffix_shift[i] = len;

	/* Walk i from the end of the pattern down to -1.  Whenever
	 * pattern[0..i] is also a suffix of the pattern (or i == -1), the
	 * entries below (last - i) that still hold the default receive the
	 * shift (last - i). */
	fill = 0;
	for (i = last; i >= -1; --i) {
		const int border_shift = last - i;

		if (i != -1 && suff[i] != i + 1)
			continue;

		while (fill < border_shift) {
			if (g_good_suffix_shift[fill] == len)
				g_good_suffix_shift[fill] = border_shift;
			++fill;
		}
	}

	/* entries derived directly from the suffix table; ascending i lets
	 * later (smaller) shifts overwrite earlier ones */
	for (i = 0; i < last; ++i)
		g_good_suffix_shift[last - suff[i]] = last - i;

	free(suff);
}

/*
 * Allocate and compute both Boyer-Moore shift tables for the current
 * pattern (g_pattern, g_patternlength).  The pattern must not be empty.
 * The tables are released by boyer_moore_free().
 *
 * Terminates the program if memory cannot be allocated.
 */
static void
boyer_moore_init(void)
{
	assert(g_patternlength > 0);

	/* one entry per possible byte value */
	g_bad_char_shift = (int *) malloc(sizeof(int) * 0x100);
	if (!g_bad_char_shift)
		error_nomem();

	/* one entry per pattern position */
	g_good_suffix_shift = (int *) malloc(sizeof(int) * g_patternlength);
	if (!g_good_suffix_shift)
		error_nomem();

	boyer_moore_bad_char_shifts(g_pattern,g_patternlength);
	boyer_moore_good_suffix_shifts(g_pattern,g_patternlength);
}

/* Release both Boyer-Moore shift tables and clear the pointers to them. */
static void
boyer_moore_free(void)
{
	free(g_good_suffix_shift);
	free(g_bad_char_shift);

	g_good_suffix_shift = NULL;
	g_bad_char_shift = NULL;
}

/*
 * Print one match as a line on stdout: "FILENAME:OFFSET" when file names
 * are enabled (o_printfilename non-zero), otherwise just "OFFSET".
 */
static void
output_match(const char *filename,binfind_off_t offset)
{
	const int with_name = (o_printfilename != 0);

	if (with_name) {
		outs(filename);
		outc(':');
	}

	outoffset(offset);
	outc('\n');
}

/*
 * Search one open file for the pattern and print every match.
 *
 * fd        descriptor to read from
 * filename  name used for output and for error messages
 *
 * The file is read through the buffer managed by buffer_init(),
 * buffer_fill() and buffer_free().  The shift tables must have been set
 * up by boyer_moore_init().  g_found_it is set when a match is printed.
 *
 * Returns 0 in all cases; errors terminate the program inside the helpers.
 */
static int
dofile(int fd,const char *filename)
{
	int i;
	int shift;
	int ofs;
	int preserve;

	buffer_init(g_patternlength);
	buffer_fill(fd,filename,0,&shift);

	/* ofs is the position in g_buffer at which the start of the pattern
	 * is currently aligned */
	ofs = 0;
	while (1) {
		assert(ofs >= 0);

		/* try all alignments for which the whole pattern lies within the
		 * bytes currently in the buffer */
		while (ofs <= (g_bytesinbuf - g_patternlength)) {
			/* from right to left find the first non-matching byte */

			for (i = g_patternlength - 1; i >= 0 && g_pattern[i] == g_buffer[i + ofs]; --i)
				/* loop */;

			if (i < 0) {
				/* we have a match */

				/* the sum wrapped around if it is below its first operand */
				if (g_bufferoffset + ofs < g_bufferoffset)
					warn_offset();
				output_match(filename,g_bufferoffset + ofs);
				g_found_it = 1;
				ofs += g_good_suffix_shift[0];
			}
			else {
				/* we found a bad character not matching g_pattern[i] */

				/* advance by the larger of the two shifts; the bad
				 * character table already includes the (1 - length)
				 * term, so only i has to be added here */
				shift = g_bad_char_shift[g_buffer[i + ofs]] + i;
				if (g_good_suffix_shift[i] > shift)
					shift = g_good_suffix_shift[i];
				ofs += shift;
			}
		}

		if (g_reachedeof)
			break;

		/* calculate how many bytes to preserve at the end of the buffer */
		/* (ofs may already be beyond the buffered data, hence the clamp) */
		preserve = g_bytesinbuf - ofs;
		if (preserve < 0) preserve = 0;
		if (o_verbose > 1)
			fprintf(stderr,"preserving %d\n",preserve);
		assert(preserve < g_patternlength);

		/* buffer_fill reports in shift how far the remaining data moved
		 * towards the start of the buffer; follow it with ofs */
		buffer_fill(fd,filename,preserve,&shift);
		ofs -= shift;
	}

	buffer_free();	
	return 0;
}

/*--------------------------------------------------------------------*/
/* driver                                                             */
/*--------------------------------------------------------------------*/

/*
 * Search one file given by name.
 *
 * filename  path of the file to search, or NULL to search standard input
 *
 * Returns 0; failure to open or close the file terminates the program.
 */
static int
dofilename(const char *filename)
{
	int fd = 0;  /* descriptor 0 is used when no name is given */

	if (filename == NULL) {
		filename = "(standard input)";
	}
	else {
		fd = open(filename,O_RDONLY);
		if (fd == -1)
			error("could not open file",filename);
	}

	dofile(fd,filename);

	/* descriptor 0 is left open */
	if (fd != 0) {
		if (close(fd) != 0)
			error("could not close file",filename);
	}

	return 0;
}

/* Print the version and copyright notice on stdout and exit with
 * status 0. */
static void
print_version(void)
{
	static const char notice[] =
		"Copyright (C) 2005 Edwin Steiner\n"
		"This is free software; see the source for copying conditions. There is NO\n"
		"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\n";

	printf("binfind %s\n\n",BINFIND_VERSION);
	fputs(notice,stdout);
	exit(0);
}

/*
 * Print the usage message and exit.
 *
 * err  non-zero: print to stderr and exit with status 2;
 *      zero: print to stdout and exit with status 0
 */
static void
print_help(int err)
{
	static const char usage_text[] =
		"Usage: binfind OPTION... [FILE]...\n"
		"Find a byte sequence in binary files.\n"
		"\n"
		"Options:\n"
		"  -a ASCII                   append ASCII string to pattern\n"
		"  -x HEX                     append hexadecimal data to pattern\n"
		"  -f FILE                    append FILE to pattern (- means stdin)\n"
		"  -B, --big-endian           interpret -x options as big-endian (default)\n"
		"  -L, --little-endian        interpret -x options as little-endian\n"
		"  -o, --offset-radix RADIX   set offset radix ('d' or 'x') for output\n"
		"  -H, --with-filename        always print filenames\n"
		"  -h, --no-filename          do not print filenames\n"
		"  --version                  print version and exit\n"
		"  --help                     print this help message\n"
		"\n";
	FILE *out;
	int status;

	if (err) {
		out = stderr;
		status = 2;
	}
	else {
		out = stdout;
		status = 0;
	}

	fputs(usage_text,out);
	exit(status);
}

int 
main(int argc,char **argv)
{
	int nfiles;

	progname = argv[0];
	if (!progname)
		progname = "binfind (argv[0] == NULL)";

	pattern_init();
	g_found_it = 0;
	
	while (1) {
		int opt;
		int opt_index;

		opt = getopt_long(argc,argv,short_options,long_options,&opt_index);

		if (opt == -1)
			break;

		switch (opt) {
			case 0:
				/* long option has been processed */
				break;

			case '?':
			case ':':
				print_help(1);
				assert(0);

			case 'a':
				pattern_add_ascii(optarg);
				break;

			case 'x':
				pattern_add_hex(optarg);
				break;

			case 'f':
				pattern_add_file(optarg);
				break;

			case 'o':
				switch (tolower(optarg[0])) {
					case 'd':
						o_offset_radix = 10;
						break;
					case 'x':
						o_offset_radix = 16;
						break;
					default:
						print_help(1);
						assert(0);
				}
				break;

			case 'B':
				o_little_endian = 0;
				break;

			case 'L':
				o_little_endian = 1;
				break;

			case 'h':
				o_printfilename = 0;
				break;

			case 'H':
				o_printfilename = 2;
				break;

			case 'V':
				print_version();
				assert(0);

			case OPT_HELP:
				print_help(0);
				assert(0);

			case OPT_VERBOSE:
				o_verbose++;
				break;

			default:
				assert(0);
		}
	}

	nfiles = argc - optind;
	if (nfiles <= 1 && o_printfilename != 2)
		o_printfilename = 0;

	if (g_patternlength < 1)
		errorstr("error","no pattern specified",2);

	if (o_verbose)
		fprintf(stderr,"pattern length: %d\n",g_patternlength);

	boyer_moore_init();

	if (nfiles) {
		do {
			dofilename((strcmp(argv[optind],"-") == 0) ?
					(const char *)NULL : argv[optind]);
		} while (++optind < argc);
	}
	else {
		dofilename((const char *)NULL);
	}

	boyer_moore_free();
	pattern_free();

	return (g_found_it) ? 0 : 1;
}

/* vim:sw=8:noexpandtab 
 */

