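/*
  Program that builds a suffix array file for a text file.

  NOTE(review): the original Japanese header comment was damaged by a
  character-encoding error and could not be fully recovered.  The legible
  part cites:
    NLPRS '95  Invited Lecture  Kenneth W. Church
*/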
#include <fcntl.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>

#define FNLEN 1000 /* size of the file-name buffers */
#define MODE_ON 1
#define MODE_OFF 0

void open_array_file(char *ary_fname);
size_t open_text_file(char *fname);
void sort_array_file(char *ary_fname, long pointer_cnt);
long make_first_array_file(long);
void usage(void);
void merge_sort(long *a, long *b, long l, long r); /* 990217 */

/*=============
   Global variables
=============*/
char *text; /* the input text file, mapped into memory by open_text_file() */
long *suf; /* the array file (text offsets), mapped by sort_array_file() */

int option_byline = MODE_OFF; /* ON (-l, -w): index only the first byte after a run of delimiters */
char delimitter[20]; /* delimiter characters used when option_byline is ON */
char *progname; /* program name (argv[0]) */
int quiet_mode = MODE_OFF; /* ON (-q): suppress progress messages on stderr */
int comment_out_mode = MODE_OFF; /* ON (-#): in delimiter mode, do not index units that start with '#' */
int no_sort_mode = MODE_OFF; /* ON (-ns): write the array file but do not sort it */
int sort_only_mode = MODE_OFF; /* ON (-so): do not write the array file, only sort the existing one */
int bit_8_mode = MODE_ON; /* ON (default; -8 turns it off): do not index the second of two consecutive high-bit bytes */
int dict_mode = MODE_OFF; /* ON (-D): index only the part of each line up to and including the first tab */
int j_mode = MODE_OFF; /* ON (-J): in character mode, index only high-bit bytes and '<'  981115 */
int merge_sort_mode = MODE_OFF; /* ON (-m): sort with merge_sort() instead of qsort()  990217 */

FILE *ofd = NULL; /* output stream for the array file while it is being written */



/*
 * Ordering function for suffix pointers.
 *
 * a, b : pointers to offsets into the global `text`.
 * Returns the strcmp() result for the two suffixes that start at those
 * offsets (negative, zero or positive).
 *
 * NOTE(review): strcmp() stops only at a NUL byte; this relies on the
 * mapped text being followed by one -- confirm for inputs whose size is an
 * exact multiple of the page size.
 */
int suffix_compare(long *a, long *b)
{
  const char *lhs = text + *a;
  const char *rhs = text + *b;

  return strcmp(lhs, rhs);
}


main(int argc, char **argv)
{
  int fd = -1; /* ϥƥȥե */
  char in_fname[FNLEN]; /* ϥե̾ */
  char ary_fname[FNLEN]; /* ե̾ */
  size_t N;
  long i;
  long pointer_cnt;

  in_fname[0] = '\0';
  ary_fname[0] = '\0';

  progname = argv[0]; /* ץ̾ */

  /*================
    ץ
  ================*/
  if(argc <= 1){
    usage();
    exit(1);
  }
  while (argc > 1){
    if (argv[1][0] == '-')
      switch (argv[1][1]){
      case 'o': /* ϥե̾λ */
        if (argc == 2){ /* ʤȼդʤ */
          fprintf(stderr,"-o <filename> --- ϥե̾\n");
	  exit(1);
	}
	strcpy(ary_fname,argv[2]);
        argc--; argv++;
        break;
      case 'l': /* ˥ǥå */
        option_byline = MODE_ON;
        strcpy(delimitter,"\n");
        break;
      case 'w': /* ˥ǥå */
        option_byline = MODE_ON;
        strcpy(delimitter," \t\n\r\f{}.()~-`'");
        break;
      case 'c': /* ʸ˥ǥåʥǥեȡ */
        option_byline = MODE_OFF;
        break;
      case 'D': /*  */
	dict_mode = MODE_ON;
        break;
      case 'J': /* ܸ⡼ */
	j_mode = MODE_ON;
        break;
      case '8': /* 2ХȰʸԤʤʤ */
        bit_8_mode = MODE_OFF;
        break;
      case 'q': /* åϤʤ */
        quiet_mode = MODE_ON;
        break;
      case 'n': /* -ns Ȥʤ⡼ */
	if(argv[1][2] == 's') no_sort_mode = MODE_ON;
        break;
      case 's': /* -so Ȥʤ⡼ */
	if(argv[1][2] == 'o') sort_only_mode = MODE_ON;
        break;
      case '#': /* #ǻϤޤԤϥȥ */
        comment_out_mode = MODE_ON;
        break;
      case 'm': /* ޡ 990217 */
	merge_sort_mode = MODE_ON;
	break;
      default : /* 顼 */
        fprintf(stderr, "%c: ̵ʥץǤ\n", argv[1][1]);
	usage();
        exit(1);
      }
    else{
      strcpy(in_fname, argv[1]); /* ƥȥե̾ */
    }
    argc--; argv++;
  }


  /*** ƥȥե򳫤 (mmap) ***/
  N = open_text_file(in_fname);

  /*** arrayե򳫤 ***/
  if (ary_fname[0] == '\0') sprintf(ary_fname,"%s.ary",in_fname);
  if(sort_only_mode != MODE_ON) open_array_file(ary_fname);

  /*** ݥ(arrayե) ***/
  if(sort_only_mode == MODE_ON){
    /* arrayեϴ¸ߤΤǥȤ */
    if(quiet_mode == MODE_OFF) fprintf(stderr,"Array file exists...\n");
  } else {
    if(quiet_mode == MODE_OFF) fprintf(stderr,"Reading text file \"%s\"\n",in_fname);
    pointer_cnt = make_first_array_file((long)N);
    (void)fclose(ofd);
  }

  /*** arrayեΥݥ󥿤򥽡 ***/
  if(no_sort_mode == MODE_ON){ /* Ȥʤ⡼ */
    if(quiet_mode == MODE_OFF) fprintf(stderr,"No sort.\n");
  } else sort_array_file(ary_fname, pointer_cnt);

  if(quiet_mode == MODE_OFF) fprintf (stderr,"Done.\n");
  exit(0); /* ｪλ */
}


/******************************************************************************
  Open the text file and map it read-only into memory.

  fname : path of the text file.

  Stores the address of the mapping in the global `text` and returns the
  file size in bytes.  Prints a message and exits with status 1 if the file
  cannot be opened, examined or mapped.

  NOTE(review): exactly N bytes are mapped, so the text is not guaranteed to
  be followed by a NUL byte; suffix_compare() uses strcmp() on it -- confirm
  this is safe when N is an exact multiple of the page size.
 *****************************************************************************/
size_t open_text_file(char *fname)
{
  struct stat stat_buf;
  void *map;
  int fd;
  size_t N;

  if ((fd = open(fname, O_RDONLY)) < 0){ /* input file */
    fprintf(stderr,"ե \"%s\" ץޤ\n", fname);
    exit(1);
  }

  if(fstat(fd, &stat_buf) < 0){
    fprintf(stderr,"ERROR: cannot stat text file \"%s\".\n", fname);
    exit(1);
  }
  N = (size_t)stat_buf.st_size;

  map = mmap(NULL, N, PROT_READ, MAP_SHARED, fd, 0);
  if(map == MAP_FAILED){
    fprintf(stderr,"ERROR: text file mapping error.\n");
    exit(1);
  }
  text = map;

  /* the mapping stays valid after the descriptor is closed */
  (void)close(fd);
  return N;
}


/******************************************************************************
  Open the array file for writing.

  fname : path of the array file; it is created or truncated.

  Stores the stream in the global `ofd`.  Prints a message and exits with
  status 1 if the file cannot be opened.  Unless quiet_mode is on, reports
  the destination on stderr.
 *****************************************************************************/
void open_array_file(char *fname)
{
  ofd = fopen(fname,"w");
  if(ofd == NULL){
    fprintf(stderr,"ե \"%s\" ץޤ\n", fname);
    exit(1);
  }

  if(quiet_mode != MODE_OFF) return;
  fprintf(stderr,"Save to \"%s\"\n",fname);
}


/******************************************************************************
 *   long make_first_array_file(long N);
 *
 * purpose
 *   Scan the text and write the offset of every position to be indexed to
 *   the array file (ofd), one raw long per offset, in increasing order.
 *
 * parameters
 *   N : number of bytes of `text` to scan
 *
 * return value
 *   number of offsets written to the array file
 *
 * description
 *   Which positions are indexed is decided by the global mode flags:
 *     dict_mode     : only the part of each line up to and including the
 *                     first tab
 *     option_byline : only the first byte that follows a run of characters
 *                     from `delimitter` (or the start of the text)
 *     otherwise     : every character
 *   bit_8_mode, j_mode and comment_out_mode refine these as noted below.
 *   Exits with status 1 if an offset cannot be written.
 *****************************************************************************/

/* Append one text offset to the array file; abort on a write error. */
static void put_pointer(long pos)
{
  if(fwrite(&pos, sizeof pos, 1, ofd) != 1){
    fprintf(stderr,"ERROR: array file write error.\n");
    exit(1);
  }
}

/* Progress meter on stderr: '+' every 50000 bytes, a count every 1000000. */
static void show_progress(long pos)
{
  if(quiet_mode == MODE_ON || pos == 0) return;
  if(!(pos % 50000)) fprintf(stderr,"+");
  if(!(pos % 1000000)) fprintf(stderr," %ldM\n",pos/1000000);
}

long make_first_array_file(long N)
{
  long i, jj = 0;                  /* jj counts the offsets written */
  int last_char_is_delimitter = 1; /* start of text counts as "after a delimiter" */
  int last_char_is_kanji = 0;      /* previous indexed byte had its high bit set */
  int dic_ent = 1;                 /* dict_mode: still inside the entry part of the line */

  if (dict_mode == MODE_ON) {
    for(i = 0; i < N; i++){
      if (!dic_ent) {
        /* past the tab: skip everything; a newline starts the next entry */
        if (!last_char_is_kanji && text[i] == '\n') dic_ent = 1;
        continue;
      }
      /* the tab ends the entry part but is itself still processed below */
      if (!last_char_is_kanji && text[i] == '\t') dic_ent = 0;

      /* two-byte character handling (the original comment names EUC):
         the second of two high-bit bytes is not indexed */
      if(bit_8_mode == MODE_ON
         && last_char_is_kanji == 1
         && (0x80 & text[i])) {
        last_char_is_kanji = 0;
      } else {
        put_pointer(i);
        jj++;
        if((0x80 & text[i])) last_char_is_kanji = 1;
      }
      show_progress(i);
    }
  } else if(option_byline == MODE_ON){ /* index by line or by word */
    for(i = 0; i < N; i++){
      if(strchr(delimitter, text[i]) != NULL && text[i] != '\0'){
        last_char_is_delimitter = 1;
      } else if(last_char_is_delimitter == 1){
        /* first non-delimiter byte after a delimiter */
        if(comment_out_mode && text[i] == '#'){
          /* units starting with '#' are not indexed  980319 */
        } else {
          put_pointer(i);
          jj++;
        }
        last_char_is_delimitter = 0;
      }
      show_progress(i);
    }
  } else {                             /* index by character */
    for(i = 0; i < N; i++){
      /* two-byte character handling (the original comment names EUC) */
      if(bit_8_mode == MODE_ON
         && (0x80 & text[i]) != 0x00
         && last_char_is_kanji == 1){
        last_char_is_kanji = 0;
      } else if(j_mode == MODE_ON && text[i] != '<' && ((0x80 & text[i]) == 0x00)){
        /* j_mode: bytes without the high bit, other than '<', are not indexed */
      } else {
        put_pointer(i);
        jj++;
        if((0x80 & text[i]) != 0x00) last_char_is_kanji = 1;
      }
      show_progress(i);
    }
  }

  if(quiet_mode == MODE_OFF) fprintf(stderr,"\n");
  return(jj);
}


/******************************************************************************
  ݥ󥿤򥽡
 *****************************************************************************/
void sort_array_file(char *ary_fname, long pointer_cnt)
{
  struct stat om_stat_buf;
  char *outmap;
  int omfd;
  size_t omsize;

  if ((omfd = open(ary_fname, O_RDWR)) < 0){  /* ϥե */
    fprintf(stderr,"ե \"%s\" ץޤ\n", ary_fname);
    exit(1);
  }

  (void)fstat(omfd, &om_stat_buf);
  omsize = (size_t)om_stat_buf.st_size;
  if(sort_only_mode == MODE_ON) pointer_cnt = omsize / sizeof(long);
  /* omsize = pointer_cnt * sizeof(int); ǤOK */
  /* printf("%ld %ld\n",pointer_cnt*sizeof(long),omsize);*/
  if((outmap = mmap((caddr_t)0, omsize, PROT_READ | PROT_WRITE, MAP_SHARED, omfd, 0)) == (caddr_t)-1){
    fprintf(stderr,"ERROR: array file mapping error.\n");
    exit(1);
  }
  suf = (long *)outmap;

  if(quiet_mode == MODE_OFF) fprintf (stderr,"Sorting...\n");
  if(merge_sort_mode == MODE_OFF)
    qsort(suf, (size_t)pointer_cnt, sizeof(long),
	  (int (*)(const void *,const void *))suffix_compare);
  else { /* 990217 ޡ */
    char fname[1000];

    struct stat tom_stat_buf;
    char *toutmap;
    int tomfd;
    size_t tomsize;

    sprintf(fname, "%s.tmp", ary_fname);
    if( (tomfd = open(fname, O_RDWR | O_CREAT )) < 0){
      fprintf(stderr,"ե \"%s\" ץޤ\n", fname);
      exit(1);
    }
    write(tomfd, outmap, pointer_cnt * sizeof(long));
    (void)fstat(tomfd, &tom_stat_buf);
    tomsize = (size_t)tom_stat_buf.st_size;
    if((toutmap = mmap((caddr_t)0, tomsize, PROT_READ | PROT_WRITE, MAP_SHARED, tomfd, 0)) == (caddr_t)-1){
      fprintf(stderr,"ERROR: tmp array file mapping error.\n");
      exit(1);
    }

    merge_sort(suf, (long*)toutmap, 0, pointer_cnt-1);

    close(tomfd);

  }


  if(quiet_mode == MODE_OFF) fprintf (stderr,"Saving...\n");

  close(omfd);
}


/*
   usage --- print the command-line help text to stderr
*/
void usage(void){
  static const char help_text[] =
    "\n"
    "mkary --- array ե\n\n"
    "Version 1.6 990217\n\n"
    "USAGE\n"
    "  mkary [ -l [-#] ] [ -w ] [ -c ] [ -q ] [ -ns ] [ -so ] [ -8 ]\n"
    "        [ -J ] [ -m ] [ -o FILE_NAME ] FILE_NAME\n"
    "\n"
    "OPTION\n"
    "  -o FILE_NAME  : ϥե( default  FILE_NAME.ary )\n"
    "  -l            : ñ̤Ǻ ( \"\\n\" Ƕڤ )\n"
    "  -w            : ññ̤Ǻ ( \" \",\"\\t\",\"\\n\" Ƕڤ )\n"
    "  -c            : ʸñ̤Ǻ ( default )\n"
    "  -q            : åʤ\n"
    "  -ns           : Ȥʤ(No Sort)\n"
    "  -so           : Ȥ(Sort Only)\n"
    "  -8            : 2ХȰʸԤʤʤ\n"
    "  -J            : ܸʸ '<' ʳ̵뤹(ʸñ̤ΤȤ)\n"
    "  -#            : #ǻϤޤԤϥȥ(ñ̤ΤȤ)\n"
    "  -m            : ޡȤˤ\n"
    "\n";

  /* the text contains no conversion specifications, so it is written as is */
  fputs(help_text, stderr);
}


/******************************************************************************
  Merge sort  990217

  Sorts a[l..r] (both bounds inclusive) into the order defined by
  suffix_compare().

  a    : array of text offsets to sort; holds the result
  b    : scratch array; elements l..r are overwritten
  l, r : first and last index of the range to sort

  Each half is sorted recursively, then the left half is copied to b as is
  and the right half in reverse, so the merge can consume b from both ends
  without bounds checks.
 *****************************************************************************/
void merge_sort(long *a, long *b, long l, long r)
{
  /* indices are long, like l and r, so large arrays are not truncated */
  long i, j, k, m;

  if(r <= l) return; /* zero or one element */

  m = l + (r - l) / 2; /* midpoint without overflowing l + r */
  merge_sort(a, b, l, m);
  merge_sort(a, b, m+1, r);

  for(i = m; i >= l; i--) b[i] = a[i];
  for(j = m+1; j <= r; j++) b[r+m+1-j] = a[j]; /* right half, reversed */

  i = l; j = r;
  for(k = l; k <= r; k++){
    if(suffix_compare(b+i, b+j) < 0){
      a[k] = b[i]; i++;
    }else{
      a[k] = b[j]; j--;
    }
  }
}

