/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer

  $Id: char_property.h,v 1.14 2006/07/09 13:34:22 taku-ku Exp $;

  Copyright (C) 2001-2006 Taku Kudo <taku@chasen.org>
  Copyright (C) 2004-2006 Nippon Telegraph and Telephone Corporation

*/

#ifndef MECAB_CHARACTER_CATEGORY_H
#define MECAB_CHARACTER_CATEGORY_H

#include "mmap.h"
#include "ucs.h"

namespace MeCab
{
  class Param;

  // Per-character record stored in the character-property table.
  //
  // The five bit-fields add up to exactly 32 bits (18 + 8 + 4 + 1 + 1).
  // NOTE(review): CharProperty reads these records through a raw CharInfo*
  // table, so the field order/widths presumably mirror a compiled binary
  // format -- do not reorder or resize without confirming against the
  // table writer.
  struct CharInfo {
    unsigned int type:         18;  // bitmask tested by isKindOf()
    unsigned int default_type: 8;   // NOTE(review): presumably the primary category id -- confirm
    unsigned int length:       4;   // NOTE(review): presumably a max grouping length -- confirm
    unsigned int group:        1;   // NOTE(review): presumably a "group same-type chars" flag -- confirm
    unsigned int invoke:       1;   // NOTE(review): presumably an "always invoke" flag -- confirm

    // Returns true when this record and `c` share at least one bit in `type`.
    // Only `type` takes part in the comparison; the other fields are ignored.
    // Const-qualified so it can be used on const records and const references.
    bool isKindOf (CharInfo c) const { return (type & c.type) != 0; }
  };

  class CharProperty {
  private:
    enum { EUC_JP, CP932, UTF8, ASCII };
    MeCab::Mmap<char> *cmmap_;
    std::vector<char *> clist_;
    CharInfo *map_;
    int charset_;
    whatlog what_;

  public:
    bool open(Param &);
    bool open(const char*);
    void close();
    size_t size();
    void set_charset(const char *);
    int id(const char *);
    const char *name(size_t i);
    const char *what() { return what_.str(); }

    inline char *seekToOtherType(const char *begin, const char *end,
                                 CharInfo c, CharInfo &fail, size_t& mblen) const {
      register char *p = const_cast<char *>(begin);
      while (p != end && c.isKindOf (fail = getCharInfo (p, end, mblen))) {
        p += mblen;
        c = fail;
      }
      return p;
    }

    inline char *seekToOtherType(const char *begin, const char *end, CharInfo c) const {
      register size_t mblen;
      register char *p = const_cast<char *>(begin);
      register CharInfo c2;
      while (p != end && c.isKindOf (c2 = getCharInfo (p, end, mblen))) {
        p += mblen;
        c = c2;
      }
      return p;
    }

    inline CharInfo getCharInfo(const char *begin, const char *end, size_t& mblen) const {
#ifndef MECAB_USE_UTF8_ONLY
      unsigned short int t = 0;
      switch (charset_) {
      case EUC_JP: t = euc_to_ucs2(begin, end, mblen); break;
      case CP932:  t = cp932_to_ucs2(begin, end, mblen); break;
      case UTF8:   t = utf8_to_ucs2(begin, end, mblen); break;
      case ASCII:  t = ascii_to_ucs2(begin, end, mblen); break;
      default:     t = utf8_to_ucs2(begin, end, mblen); break; // default charcode is UTF-8
      }
#else
      unsigned short int t = utf8_to_ucs2(begin, end, mblen);
#endif
      return map_[t];
    }

    inline CharInfo getCharInfo(size_t id) { return map_[id]; }

    static bool compile(const char *, const char *, const char*);

    explicit CharProperty():  cmmap_(0), map_(0), charset_(0) {};
    virtual ~CharProperty() { this->close (); }
  };
}
#endif
