/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
 
  $Id: libmecabdic.cpp,v 1.10 2004/03/08 07:40:52 taku-ku Exp $;

  Copyright (C) 2001-2004 Taku Kudo <taku-ku@is.aist-nara.ac.jp>
  This is free software with ABSOLUTELY NO WARRANTY.
  
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/  

#include "dictionary_maker.h"
#include "common.h"
#include "mecab.h"
#include "param.h"
#include "darts.h"
#include <cstring>
#include <stdexcept>
#include <algorithm>
#include <fstream>

// Open mode used for every file this tool writes.  Native Windows builds
// (Cygwin excluded) add std::ios::binary; all other platforms use plain
// std::ios::out.  Presumably this keeps the binary dictionary files free of
// newline translation on Windows -- TODO confirm.
#if defined (_WIN32) && !defined (__CYGWIN__)
# define MKMECABDIC_OUTPUT_MODE std::ios::binary|std::ios::out
#else
# define MKMECABDIC_OUTPUT_MODE std::ios::out
#endif

// Opens FILE inside DICDIR for reading and returns a newly allocated stream.
//
// Any stream passed in through IS is destroyed first (NULL is accepted), so a
// caller can reuse one pointer for several files:
//     p = ifstream_open (p, dir, name);
// The caller owns the returned stream and must delete it.
//
// Throws std::runtime_error when the file cannot be opened.  The freshly
// allocated stream is released before throwing so it does not leak.  IS has
// already been deleted at that point and must not be used again by the caller.
static std::ifstream* ifstream_open (std::ifstream *is, const char *dicdir, const char *file)
{
  delete is;  // deleting a null pointer is a no-op, no guard needed

  const std::string ifile = MeCab::createFileName (std::string (dicdir), std::string (file));
  std::ifstream *opened = new std::ifstream (ifile.c_str());
  if (!*opened) {
    delete opened;
    throw std::runtime_error (ifile + ": no such file or directory");
  }
  return opened;
}

// Opens FILE inside OUTDIR for writing (mode MKMECABDIC_OUTPUT_MODE) and
// returns a newly allocated stream.
//
// Any stream passed in through OS is destroyed first (NULL is accepted), which
// also flushes and closes the previous output file.  The caller owns the
// returned stream and must delete it.
//
// Throws std::runtime_error when the file cannot be opened.  The freshly
// allocated stream is released before throwing so it does not leak.  OS has
// already been deleted at that point and must not be used again by the caller.
static std::ofstream* ofstream_open (std::ofstream *os, const char *outdir, const char *file)
{
  delete os;  // deleting a null pointer is a no-op, no guard needed

  const std::string ofile = MeCab::createFileName (std::string (outdir), std::string (file));
  std::ofstream *opened = new std::ofstream;
  opened->open (ofile.c_str(), MKMECABDIC_OUTPUT_MODE);
  if (!*opened) {
    delete opened;
    throw std::runtime_error (ofile + ": permission denied.");
  }
  return opened;
}

int mecab_make_dictionary (char *dicdir, char *outdir)
{  
  using namespace MeCab;
  
  try {
    Param                   param;
    DictionaryMaker         dm1, dm2, dm3;
    std::vector <std::string>         rule1, rule2, rule3; 
    std::vector <unsigned short> score;
    Csv csv;
    std::ofstream *ofs = 0;
    std::ifstream *ifs = 0;
    std::string line;

    ///////////////////////////////////////////
    //
    // Parameter Processing
    //
    std::string rcFile = createFileName (dicdir, "dicrc");
    if (! param.load (rcFile.c_str()))
      throw std::runtime_error (param.what ());

    ///////////////////////////////////////////
    //
    // Read connection filen
    //
    std::cerr << "(1/5): reading connection matrix ... " << std::flush;

    ifs = ifstream_open (ifs, dicdir, TEXT_CONNECTION_FILE);

    while (std::getline (*ifs, line)) {
      if (csv.split (line) != 4)
	throw std::runtime_error (std::string ("format error: ") + line);

      dm1.add (csv[0]); rule1.push_back(csv[0]); 
      dm2.add (csv[1]); rule2.push_back(csv[1]); 
      dm3.add (csv[2]); rule3.push_back(csv[2]);
      score.push_back (atoi (csv[3].c_str()));
    }

    dm1.build (); 
    dm2.build (); 
    dm3.build ();

    std::cerr << " done!" << std::endl;

    ///////////////////////////////////////////
    //
    // Write connection rules
    //
    unsigned short size1   = dm1.size ();
    unsigned short size2   = dm2.size ();
    unsigned short size3   = dm3.size ();
    unsigned int ruleSize  = rule1.size();
    unsigned short *matrix = new unsigned short [size1 * size2 * size3];

    std::cerr << "(2/5): writing connection matrix ("  
	 << size1 << " x " << size2 << " x " << size3 
	 << " = " << size1*size2*size3 << ") ..." << std::flush;

    unsigned short default_cost = param.getProfileInt ("default-connection-cost", true);

    for (unsigned int i = 0; i < static_cast<unsigned int>(size1 * size2 * size3); i++)
       matrix [i] = default_cost;

    ofs = ofstream_open (ofs, outdir, CONNECTION_FILE);

    for (unsigned int i = 0; i < ruleSize; i++) {
      std::vector <unsigned int> &r1 = dm1.getRuleIdList (rule1[i]);
      std::vector <unsigned int> &r2 = dm2.getRuleIdList (rule2[i]);
      std::vector <unsigned int> &r3 = dm3.getRuleIdList (rule3[i]);

      for (std::vector <unsigned int>::iterator i1 = r1.begin(); i1 != r1.end(); i1++) 
	for (std::vector <unsigned int>::iterator i2 = r2.begin(); i2 != r2.end(); i2++)
	  for (std::vector <unsigned int>::iterator i3 = r3.begin(); i3 != r3.end(); i3++)
	     matrix[size3 * (size2 * (*i1) + (*i2)) + (*i3)] = score [i];
    }

    ofs->write ((char *)&size1, sizeof (unsigned short));
    ofs->write ((char *)&size2, sizeof (unsigned short));
    ofs->write ((char *)&size3, sizeof (unsigned short));

    for (unsigned int i1 = 0; i1 < (unsigned int)size1; i1++)
      for (unsigned int i2 = 0; i2 < (unsigned int)size2; i2++)
	for (unsigned int i3 = 0; i3 < (unsigned int)size3; i3++) 
	  ofs->write ((char *)&matrix [size3 * (size2 * i1 + i2) + i3], sizeof (unsigned short));
     
    delete [] matrix;

    std::cerr << " done!" << std::endl;

    ///////////////////////////////////////////
    //
    // BOS EOS UNKNOWN handler
    //
    std::string bos_pos = param.getProfileString ("bos-pos", true);
    std::string eos_pos = param.getProfileString ("eos-pos", true);
    std::string unk_pos = param.getProfileString ("unk-pos", true);

    ofs = ofstream_open (ofs, outdir, OTHERS_FILE);

    Token token;
    memset (&token, 0, sizeof (Token));

    token.rcAttr2 = dm1.getDicId (bos_pos);
    token.rcAttr1 = dm2.getDicId (bos_pos);
    token.lcAttr  = dm3.getDicId (bos_pos);
    ofs->write (reinterpret_cast<char *>(&token), sizeof (Token));

    token.rcAttr2 = dm1.getDicId (eos_pos);
    token.rcAttr1 = dm2.getDicId (eos_pos);
    token.lcAttr  = dm3.getDicId (eos_pos);
    ofs->write (reinterpret_cast<char *>(&token), sizeof (Token));

    token.rcAttr2 = dm1.getDicId (unk_pos);
    token.rcAttr1 = dm2.getDicId (unk_pos);
    token.lcAttr  = dm3.getDicId (unk_pos);
    ofs->write (reinterpret_cast<char *>(&token), sizeof (Token));

    ///////////////////////////////////////////
    //
    // Read dictionary
    //
    std::cerr << "(3/5): reading tokens in dictionary ... " << std::flush;

    unsigned int pos_size = param.getProfileInt ("pos-size", true);
    unsigned int di = 0;
    unsigned int offset = 0;
    std::vector < std::pair <std::string, Token*> > dicList;

    ofs = ofstream_open (ofs, outdir, POS_FILE);
    ifs = ifstream_open (ifs, dicdir, TEXT_DIC_FILE);

    while (std::getline (*ifs, line)) {
      if (csv.split (line) < (pos_size + 3))
	throw std::runtime_error (std::string ("format error: ") + line);

      unsigned int start = csv.getFieldPos (2).first;
      unsigned int end   = csv.getFieldPos (2 + pos_size - 1).second;
      std::string key = line.substr (start, end - start);
      std::string pos = line.substr (start, std::string::npos);

      dicList.resize (di + 1);
      dicList[di].first  = csv[0];
      Token *token = new Token;
      memset (token, 0, sizeof (Token));
      dicList[di].second = token;
      dicList[di].second->rcAttr2  = dm1.getDicId (key);
      dicList[di].second->rcAttr1  = dm2.getDicId (key);
      dicList[di].second->lcAttr   = dm3.getDicId (key);
      dicList[di].second->posid    = dicList[di].second->lcAttr;
      dicList[di].second->feature  = offset;
      dicList[di].second->length   = csv[0].size();
      dicList[di].second->cost     = atoi (csv[1].c_str());
       
      offset += (pos.size () + 1);
      *ofs << pos << ends;

      if (++di % 10000 == 0) std::cerr << di << "... " << std::flush;
    }

    std::cerr << " done!" << std::endl;

    ///////////////////////////////////////////
    //
    // Sort lexs and write to file
    //
    std::cerr << "(4/5): sorting tokens ... " << std::flush;

    int   *value = new int    [dicList.size()];
    char  **key  = new char * [dicList.size()];
    unsigned int spos  = 0;
    unsigned int dsize = 0;
    unsigned int bsize = 0;
    std::string prev = "";

    std::sort (dicList.begin (), dicList.end ()); // go!

    ofs = ofstream_open (ofs, outdir, TOKEN_FILE);

    for (unsigned int i = 0; i < dicList.size (); i++) {
      if (prev != dicList[i].first && i != 0)  {
	key   [dsize] = const_cast<char *>(dicList[spos].first.c_str ());
	value [dsize] = bsize + (spos << 8);
	dsize++;
	bsize = 1;
	spos = i;
      } else {
	bsize++;
      }
      prev = dicList[i].first;
      ofs->write (reinterpret_cast<char *>(dicList[i].second), sizeof (Token));
    }

    key[dsize]   = const_cast<char *>(dicList[spos].first.c_str ());
    value[dsize] = bsize + (spos << 8);
    dsize++;

    std::cerr << " done!" << std::endl;

    ///////////////////////////////////////////
    //
    // Build Double Array
    //
    std::cerr << "(5/5): building `Double-Array` (size = " << dsize << ") ..." << std::flush;

    Darts::DoubleArray da;
    da.build (dsize, key, 0, value);
     
    if (-1 == da.save (DOUBLE_ARRAY_FILE, "wb"))
      throw std::runtime_error (std::string (DOUBLE_ARRAY_FILE) + " no such file or directory");

    delete [] key;
    delete [] value;

    std::cerr << " done!" << std::endl;
     
    ifs->close();
    ofs->close(); 

    delete ofs;
    delete ifs;

    return EXIT_SUCCESS;
  }

  catch (std::exception &e) {
    std::cerr << "\nFATAL: " << e.what () << std::endl;
    return EXIT_FAILURE;
  }
}
