/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
 
  $Id: japanese_tokenizer.h,v 1.10 2004/03/08 07:40:52 taku-ku Exp $;

  Copyright (C) 2001-2004 Taku Kudo <taku-ku@is.aist-nara.ac.jp>
  This is free software with ABSOLUTELY NO WARRANTY.
  
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/  
#ifndef _MECAB_JAPANESE_TOKENIZER_H
#define _MECAB_JAPANESE_TOKENIZER_H

#include "tokenizer.h"
#include "codeconv.h"

namespace MeCab {

  // Tokenizer specialisation that classifies the characters of Japanese text
  // encoded as EUC-JP, Shift_JIS or UTF-8.
  //
  // The per-character classification is done by getCharClass() using two
  // lookup tables (ascii_table / ja_table).  The tables and `charset` are not
  // filled in this header; presumably open() does that -- confirm in the
  // implementation file.
  class JapaneseTokenizer: public Tokenizer 
  {
  protected:
    // Character classes returned by getCharClass() and stored in the tables.
    enum 
    {
      OTHER,
      KANJI,
      SYMBOL,
      ALPHANUMERIC,
      ALPHA,
      HIRAGANA,
      KATAKANA,
      HALFKATAKANA,
      GREEK,
      CYRILLIC,
      SPACE
    };

    // Input encodings understood by getCharClass(); the active one is `charset`.
    enum { EUC_JP, SHIFT_JIS, UTF8 };

    // Class of a single-byte character, indexed by the byte value.
    unsigned char ascii_table [256];
    // Class of a double-byte character, indexed by [first byte][second byte].
    // The UTF-8 path indexes it with the two bytes of the value obtained from
    // _unicode_to_jisx0208_map.
    unsigned char ja_table    [256][256];
    // One of EUC_JP / SHIFT_JIS / UTF8; the constructor starts with EUC_JP.
    int charset;

  public:
    // Opens the tokenizer with `param`; throws std::runtime_error carrying
    // `_what` (presumably the error text kept by the base class) when open()
    // reports failure.
    JapaneseTokenizer (Param &param): charset(EUC_JP) 
    {
      if (! this->open (param)) throw std::runtime_error (_what);
    }

    ~JapaneseTokenizer() { this->close (); }

    // Hooks that this tokenizer deliberately leaves empty.
    void preprocess  (const char *, const char *) { /* Do nothing */ }
    void postprocess (const char *, const char *) { /* Do nothing */ }

    bool open (Param &); 
    Node *lookup (const char *, const char *);

    // Classifies the character that starts at `str`.
    //
    //   str  - first byte of the character; str[0] is always read, so the
    //          caller must guarantee str < end.
    //   end  - one past the last readable byte of the buffer.
    //   next - out: number of bytes occupied by the character, i.e. how far
    //          the caller has to advance `str`.  Always assigned.
    //
    // Returns one of the character-class enumerators above.  A multi-byte
    // character that is cut off by `end`, or a byte that cannot start a
    // character in the active encoding, is reported as a single byte of
    // class OTHER so that the caller can resynchronise on the next byte.
    inline int getCharClass (const char *str, const char *end, unsigned int &next) const
    {
      const unsigned int  len  = end - str;
      const unsigned char lead = static_cast<unsigned char>(str[0]);

      switch (charset) {

      case EUC_JP:
        if (lead & 0x80) {
          // Do not read the trail byte past the end of the buffer.
          if (len < 2) {
            next = 1;
            return OTHER;
          }
          next = 2;
          return ja_table[lead][static_cast<unsigned char>(str[1])];
        }
        next = 1;
        return ascii_table[lead];

      case SHIFT_JIS:
        // 0x00-0x80 and 0xa0-0xdf (half-width katakana range) are single bytes.
        if (lead <= 0x80 || (lead >= 0xa0 && lead <= 0xdf)) {
          next = 1;
          return ascii_table[lead];
        }
        // Every remaining lead byte has the high bit set: double-byte character.
        if (len >= 2) {
          next = 2;
          return ja_table[lead][static_cast<unsigned char>(str[1])];
        }
        // Lead byte truncated by the end of the buffer.
        next = 1;
        return OTHER;

      case UTF8:
        {
          unsigned int ucs2 = 0; // code point decoded from the UTF-8 sequence

          if (lead < 0x80) {                                  // 1 byte
            next = 1;
            return ascii_table[lead]; // same as ASCII
          } else if (lead >= 0xC0 && lead < 0xE0 && len >= 2) { // 2 bytes
            ucs2 = ((lead & 0x1f) << 6) | (str[1] & 0x3f);
            next = 2;
          } else if (lead >= 0xE0 && lead < 0xF0 && len >= 3) { // 3 bytes
            ucs2 = ((lead & 0x0f) << 12) | ((str[1] & 0x3f) << 6) | (str[2] & 0x3f);
            next = 3;
          } else if (lead >= 0xF0 && lead < 0xF8 && len >= 4) { // 4 bytes
            // Code points above U+FFFF cannot be looked up in the 256x256
            // conversion table; skip the whole sequence so that the following
            // characters stay aligned.
            next = 4;
            return OTHER;
          } else {
            // Stray continuation byte, invalid lead byte, or a sequence
            // truncated by the end of the buffer.
            next = 1;
            return OTHER;
          }

          // Convert the code point to JIS X 0208 and classify it via ja_table.
          unsigned short *_table = _unicode_to_jisx0208_map [(ucs2 & 0xff00) >> 8];
          if (_table) {
            unsigned short h = _table[(ucs2 & 0x00ff)];
            if (h == 0) return OTHER; // no JIS X 0208 equivalent
            return ja_table[((h & 0xff00) >> 8)][(h & 0x00ff)]; // ucs2 -> jisx0208
          } 

          return OTHER;
        }
      }

      // `charset` holds none of the known encodings.  Consume one byte so the
      // out-parameter is never left unassigned.
      // NOTE(review): the literal 1 (numerically equal to KANJI) is kept from
      // the original code -- confirm whether OTHER was intended.
      next = 1;
      return 1;
    }
  };
}
#endif
