# dictformat-text.rb: Converter module for plain text (tokenized with MeCab)
# $Id: dictformat-text.rb,v 1.2 2005/03/07 07:51:33 komatsu Exp $
#
# Copyright (C) 2003 Hiroyuki Komatsu <komatsu@taiyaki.org>
#     All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
# You can redistribute it and/or modify it under the terms of 
# the GNU General Public License version 2.

require 'shellwords'

require 'prime/taiyaki'
require 'prime/array-agent'
require 'prime/makedict/dictformat'
require 'MeCab'
# Select EUC ('e') as the interpreter's multibyte character handling.
# $KCODE is only honored by Ruby 1.8 and earlier.
# NOTE(review): the Japanese string/regexp literals in this file are
# presumably EUC-JP encoded -- confirm before re-encoding the file.
$KCODE = 'e'

## Converter that builds dictionary words from free-running text.
##
## The input is rendered with the external "w3m -dump" command, grouped into
## paragraphs separated by blank lines, tokenized with MeCab and scanned for
## combination words (see @combword_classes).
##
## NOTE(review): the non-ASCII literals below are presumably EUC-JP text and
## look garbled when the file is not read as EUC-JP; they are kept verbatim.
class DictFormatText < DictFormat
  ## is_interactive is handed to DictFormat#initialize unchanged.
  def initialize (is_interactive = true)
    super(is_interactive)

    mecab_arguments = ['mecab'] # Dummy data
    @mecab = MeCab::Tagger.new(mecab_arguments)

    ## Every class listed here is offered each token in #combine.
    @combword_classes = [KukuraCombinationWordEmail,
                         KukuraCombinationWordURL,
                         KukuraCombinationWordDate,
                         KukuraCombinationWordEnglishProperNoun,
                         KukuraCombinationWordNoun]
  end

  ## Returns the output of "w3m -dump FILENAME" as an array of lines.
  ## The file name is shell-escaped (so names containing spaces or shell
  ## metacharacters work) and the pipe is closed as soon as it has been read.
  def get_lines (filename)
    command = "w3m -dump #{Shellwords.escape(filename.to_s)}"
    return IO.popen(command) { | io | io.read }.split("\n")
  end


  ## Overwriting the method of the parent class.
  ##
  ## Reads FILENAME through #get_lines, joins consecutive non-blank lines
  ## into one paragraph and registers the words of every paragraph.
  def load_external_dict (filename)
    buffer_line = ""

    label = @is_interactive ? "INPUT_FILE LOADING" : nil

    lines = get_lines(filename)
    lines.each_with_pbar(label) { | line |
      line.chomp!
      if line =~ /^ *$/ then
        ## A blank line terminates the current paragraph.
        register_paragraph(buffer_line)
        buffer_line = ""
      else
        ## Considering spaces between the end character of a line and
        ## the beginning character of the next line.
        ## <komatsu@taiyaki.org> (2004-11-13)
        ## Lines are glued without a space only when both the end of the
        ## buffer and the beginning of the new line are non-ASCII.
        if non_ascii_char?(buffer_line[-1, 1]) and
            non_ascii_char?(line[0, 1]) then
          buffer_line += line
        else
          buffer_line += (" " + line)
        end
      end
    }
    if buffer_line != "" then
      register_paragraph(buffer_line)
    end
  end

  ## Merges two words with the parent implementation and then increments
  ## the frequency of the merged word by one.
  def merge_words(existent_word, new_word)
    (pron, pos, literal, freq, *attr) = super(existent_word, new_word)
    freq += 1
    return [pron, pos, literal, freq, *attr]
  end

  ## Returns the words found in TEXT (tokenize, then combine).
  def parse (text)
    return combine( get_tokens(text) )
  end

  ## Tokenizes LINE with MeCab and returns an array of KukuraToken.
  ## A gap between the end of one node and the beginning of the next is
  ## represented by an extra token whose literal and reading are " ".
  def get_tokens (line)
    node   = @mecab.parseToNode(line)
    tokens = []
    point_last_end = 0

    while (node.hasNode() != 0) do
      (pos0, pos1, pos2, pos3, form, type, base, reading, pron) =
        node.getFeature().split(',')
      literal  = node.getSurface()
      ## The POS string is the "-"-joined list of the first four feature
      ## fields, without the "*" placeholders.
      pos_list = [pos0, pos1, pos2, pos3]
      pos_list.delete("*")
      pos = pos_list.join("-")

      point_beginning = node.getBegin()
      point_end       = node.getEnd()

      if point_beginning != point_last_end then
        token = KukuraToken.new(" ", " ", "ʬΥ",
                                point_last_end, point_beginning)
        tokens.push( token )
      end

      if node.getPOSID() != 0 then
        token = KukuraToken.new(literal, reading.hiragana, pos,
                                point_beginning, point_end)
        tokens.push( token )
      elsif literal then
        ## POSID 0: the surface string itself is used as the reading.
        token = KukuraToken.new(literal, literal.hiragana, pos,
                                point_beginning, point_end)
        tokens.push( token )
      end
      point_last_end = point_end
      node = node.next()
    end

    tokens = preprocess_tokens(tokens)

    return tokens
  end

  ## Merges adjacent tokens in place and returns TOKENS:
  ## three tokens accepted by #is_dotted_word? or two tokens accepted by
  ## #is_zenkaku_number? are replaced by one merged token.  After a merge
  ## the same position is examined again, so runs collapse completely.
  def preprocess_tokens (tokens)
    index = 0
    ## Run while at least two tokens are left, so that the final pair is
    ## also tested by the two-token rule (token3 is nil there).
    while index < (tokens.length() - 1) do
      token1 = tokens[index]
      token2 = tokens[index + 1]
      token3 = tokens[index + 2]

      if token3 and is_dotted_word?(token1, token2, token3) then
        new_token = KukuraToken::merge(token1, token2, token3)
        tokens[index, 3] = new_token
      elsif is_zenkaku_number?(token1, token2) then
        new_token = KukuraToken::merge(token1, token2)
        tokens[index, 2] = new_token
      else
        index += 1
      end
    end

    return tokens
  end

  ## Truthy when all three literals consist only of ASCII letters, digits
  ## and dots.
  def is_dotted_word? (token1, token2, token3)
    return (token1.literal =~ /^[a-zA-Z0-9\.]+$/ and
              token2.literal =~ /^[a-zA-Z0-9\.]+$/ and
              token3.literal =~ /^[a-zA-Z0-9\.]+$/)
  end

  ## Truthy when both literals match the number pattern below.
  ## NOTE(review): the alternatives are presumably the ten full-width
  ## digits in EUC-JP -- confirm against the original file encoding.
  def is_zenkaku_number? (token1, token2)
    return (token1.literal =~ /^(|||||||||)+$/ and
              token2.literal =~ /^(|||||||||)+$/)
  end

  ## Feeds TOKENS to the combination-word classes and returns the words
  ## they produce, filtered by #delete_invalid_words.
  def combine (tokens)
    words   = []
    entries = []

    tokens.each { | token |

      ## If this token is attachable to an entry, the entry is updated.
      ## If not, the entry expands combination words and stores them to
      ## the result, and then the entry is discarded.
      ## <komatsu@taiyaki.org> (2004-11-13)
      new_entries = []
      entries.each { | entry |
        if entry.attachable?(token) then
          entry.attach(token)
          new_entries.push(entry)
        else
          words.concat( entry.make_words() )
        end
      }
      entries = new_entries

      ## If a token is able to be the beginning string of a combination word,
      ## a new combination word class is created with the token.
      ## <komatsu@taiyaki.org> (2004-11-13)
      @combword_classes.each { | combword_class |
        if combword_class::potential?(token) then
          entries.push( combword_class.new(token) )
        end
      }
    }

    ## Flush the entries that are still open at the end of the text.
    entries.each { | entry |
      words.concat( entry.make_words() )
    }

    return delete_invalid_words(words)
  end

  ## For every word whose category is :url, :email, :date or
  ## :english_proper_noun, deletes the other words that overlap it and are
  ## either :noun or of the same category.  Duplicates are removed at the
  ## end and WORDS is returned.
  ## NOTE(review): presumably ArrayAgent#delete_item removes the current
  ## element from WORDS itself -- verify against prime/array-agent.
  def delete_invalid_words (words)
    agent1 = ArrayAgent.new(words)
    agent2 = ArrayAgent.new(words)

    until agent1.is_last?() do
      src_word = agent1.get_item()
      (reading, pos, literal, score,
       point_beginning, point_end, category) = src_word

      if [:url, :email, :date, :english_proper_noun].member?(category) then
        until agent2.is_last?() do
          target_word = agent2.get_item()
          target_point_beginning = target_word[4]
          target_point_end       = target_word[5]
          target_category        = target_word[6]

          ## Only nouns and words of the same category are candidates.
          if target_category != :noun and target_category != category then
            agent2.go_next()
            next
          end

          ## Never delete the source word itself (or an identical word).
          if src_word == target_word then
            agent2.go_next()
            next
          end

          ## Checking an overlap of two words
          if target_point_end <= point_beginning or
              point_end <= target_point_beginning then
            ## not overlapped
            agent2.go_next()
            next
          end

          agent2.delete_item()
        end
        agent2.go_first()
      end
      agent1.go_next()
    end

    agent1.close()
    agent2.close()

    words.uniq!
    return words
  end

  private

  ## Parses one paragraph and registers every word found in it.
  ## Each parsed word is [pron, pos, literal, freq, point_beginning,
  ## point_end, category]; only the category of the trailing part is used.
  def register_paragraph (text)
    parse(text).each { | pron, pos, literal, freq, *attr |
      category = attr[2]
      set_word(pron, pos, literal, freq, category)
    }
  end

  ## True when CHAR (a one-element substring, or nil) starts with a byte
  ## above 127.  Works whether String#[] is byte- or character-based.
  def non_ascii_char? (char)
    return false if char.nil? or char.empty?
    return char.unpack("C")[0] > 127
  end
end


## Class for generation from Gaim logs to a PRIME dictionary.
##
## Only #get_lines differs from DictFormatText: the log header and system
## lines are dropped and the "(time) name: " prefix of each comment is
## stripped.
class DictFormatGaim < DictFormatText
  ## Returns the message lines of the Gaim log FILENAME.
  ## The raw dump is obtained from DictFormatText#get_lines so that the
  ## w3m invocation lives in a single place.
  def get_lines (filename)
    lines = []
    raw_lines = super(filename)

    # The first line is a header.
    raw_lines.shift()

    raw_lines.each { | line |
      # Each comment would start "(time) name: ...\n".
      if line =~ /^\([0-9][0-9]:[0-9][0-9]:[0-9][0-9]\) .*: / then
        # Insert a delimiter (a blank line ends the previous paragraph),
        # then keep everything after the first ": ".
        lines.push( "" )
        lines.push( line.chomp.split(": ")[1..-1].join(": ") )

      # It would be a system log line
      elsif line =~ /^\([0-9][0-9]:[0-9][0-9]:[0-9][0-9]\) / then
        # Do nothing
      else
        # Continuation of the previous comment.
        lines.push( line )
      end
    }
    return lines
  end
end

# ------------------------------------------------------------

## One token of the analyzed text: its surface string, reading, part of
## speech and the begin/end positions reported for it.
class KukuraToken
  attr_reader :literal, :reading, :pos, :point_beginning, :point_end

  def initialize(literal, reading, pos, point_beginning, point_end)
    @literal, @reading, @pos = literal, reading, pos
    @point_beginning, @point_end = point_beginning, point_end
  end

  ## Builds a single token out of TOKENS: literals and readings are
  ## concatenated in order, the POS and end position come from the last
  ## token and the begin position from the first one.
  def self.merge(*tokens)
    joined_literal = tokens.inject("") { |text, token| text + token.literal }
    joined_reading = tokens.inject("") { |text, token| text + token.reading }
    tail = tokens[-1]
    head = tokens[0]
    KukuraToken.new(joined_literal, joined_reading, tail.pos,
                    head.point_beginning, tail.point_end)
  end
end

## Base class of the combination-word recognizers.
##
## An instance starts from one token and collects further tokens through
## #attach.  This base class never starts (potential?), never grows
## (attachable?) and never yields words (make_words); subclasses override
## these.
class KukuraCombinationWord
  ## Whether TOKEN may begin a combination word of this class.
  def self.potential?(token)
    false
  end

  def initialize(token)
    @tokens       = [token]
    @category     = :none
    @default_pos  = nil
    @default_freq = 0
    @default_attr = nil
  end

  ## Whether TOKEN may be appended to the tokens collected so far.
  def attachable?(token)
    false
  end

  ## Appends TOKEN to the collected tokens.
  def attach(token)
    @tokens << token
  end

  ## Concatenation of the literals of all collected tokens.
  def get_literal
    @tokens.inject("") { |text, token| text + token.literal }
  end

  ## Concatenation of the readings of all collected tokens.
  def get_reading
    @tokens.inject("") { |text, token| text + token.reading }
  end

  def get_pos
    nil
  end

  ## Words produced from the collected tokens (none in the base class).
  def make_words
    []
  end
end

## Combination word made of up to five consecutive noun-like tokens.
## Every prefix of the collected tokens that passes the validity checks in
## #make_words becomes a word of category :noun.
##
## NOTE(review): the non-ASCII literals below are presumably EUC-JP text
## (part-of-speech names) and look garbled when the file is not read as
## EUC-JP; they are kept verbatim.
class KukuraCombinationWordNoun < KukuraCombinationWord
  @@POSRegexp_Potential          = /^(̾|̤θ|Ƭ)/
  @@POSRegexp_Potential_Invalid  = /^̾-(Ω|)/
  @@POSRegexp_Attachable         = /^(̾|̤θ|Ƭ|ʬΥ)/
  @@POSRegexp_Unknown            = /^̤θ/

  @@POSRegexp_Invalid   =
             /^(Ƭ|̾-(Ω|̾|ü|ǽ)|ʬΥ)/
  ## Literals rejected by #make_words: unbalanced brackets or quotes and
  ## leading/trailing punctuation.
  @@WordRegexp_Invalid  = Regexp.new(['\([^\)]*$',
                                      '^[^\(]*\)',
                                      '\[[^\]]*$',
                                      '^[^\[]*\]',
                                      '<[^>]*$',
                                      '^[^<]*>',
                                      '^[^<\(]*,[^>\)]*$',
                                      '^',
                                      '^"[^"]*$',  #' # <= dummy
                                      '^[:,/\|\-]',
                                      '::$',
                                      '[,/\|\-]$'].join('|'))

  ## Results of the external "wordnet" lookups, keyed by literal, so that
  ## each distinct word spawns the command at most once.
  @@EnglishNounCache = {}

  def initialize (token)
    super(token)
    @category = :noun
    @pos = "̾"
  end

  ## Returns true when the external "wordnet" command reports the literal
  ## of TOKEN as a noun, i.e. prints a line of the form
  ## "Information available for noun <literal>", false otherwise.
  ## The literal is escaped for both the shell and the regexp, and the
  ## pipe is closed once its output has been read.
  def KukuraCombinationWordNoun::english_noun? (token)
    literal = token.literal
    if not @@EnglishNounCache.key?(literal) then
      regexp = Regexp.new("^Information available for (.*) " +
                          Regexp.escape(literal))
      command = "wordnet #{Shellwords.escape(literal)}"
      output  = IO.popen(command) { | io | io.read }
      @@EnglishNounCache[literal] = output.split("\n").any? { | line |
        match_data = regexp.match(line)
        match_data and match_data[1] == "noun"
      }
    end
    return @@EnglishNounCache[literal]
  end

  ## True when LITERAL consists of few distinct characters repeated many
  ## times, such as "------" or "*+*+*+*+" (character count divided by
  ## distinct character count is 4 or more).  An empty literal is not
  ## repetitive (this also avoids dividing by zero).
  def KukuraCombinationWordNoun::repetitive? (literal)
    chars = literal.split(//)
    return false if chars.empty?
    return (chars.length / chars.uniq.length) >= 4
  end

  ## Whether TOKEN may begin a noun combination word.
  def KukuraCombinationWordNoun::potential? (token)
    ## Check the potential of the token with its POS.
    if token.pos !~ @@POSRegexp_Potential then
      return false
    elsif token.pos =~ @@POSRegexp_Potential_Invalid then
      return false
    elsif token.literal.length < 4 and  token.literal =~ /^[a-z]*$/ then
      ## Lowercase ASCII words shorter than four characters are ignored.
      return false
    end

    if token.pos =~ @@POSRegexp_Unknown then
      ## Reject strings such as "------" or "*+*+*+*+".
      if KukuraCombinationWordNoun::repetitive?(token.literal) then
        return false
      end

      ## An all-lowercase token with this POS must be a noun according to
      ## wordnet.
      if token.literal =~ /^[a-z]*$/ then
        return KukuraCombinationWordNoun::english_noun?(token)
      end
    end

    return true
  end

  ## Whether TOKEN may be appended to the tokens collected so far.
  def attachable? (token)
    ## At most five tokens are combined.
    if @tokens.length >= 5 then
      return false
    end

    if token.pos !~ @@POSRegexp_Attachable then
      return false
    end

    if token.pos =~ @@POSRegexp_Unknown then
      ## Reject strings such as "------" or "*+*+*+*+".
      if KukuraCombinationWordNoun::repetitive?(token.literal) then
        return false
      end
    end

    ## Unlike ::potential?, the wordnet check applies to every
    ## all-lowercase token here, whatever its POS.
    if token.literal =~ /^[a-z]*$/ then
      return KukuraCombinationWordNoun::english_noun?(token)
    end

    ## A token with this POS is accepted only directly after an ASCII
    ## letter, digit or apostrophe.
    if token.pos == "ʬΥ" then
      if @tokens[-1].literal !~ /[a-zA-Z0-9']$/ then  #'
        return false
      end
    end

    return true
  end

  ## Returns one word per valid prefix of the collected tokens.
  ## Each word is [reading, pos, literal, freq, point_beginning, point_end,
  ## category].
  def make_words ()
    words = []
    literal = ""
    reading = ""

    point_beginning = @tokens[0].point_beginning

    @tokens.each { | token |
      literal += token.literal
      reading += token.reading
      if literal =~ @@WordRegexp_Invalid then
        next
      elsif token.pos =~ @@POSRegexp_Invalid then
        ## The prefix must not end with a token of an invalid POS.
        next
      elsif literal.length == 1 and literal == reading then
        next
      end

      words.push( [reading, @pos, literal, @default_freq,
                   point_beginning, token.point_end, @category] )
    }
    return words
  end
end

## Combination word for English proper nouns: two or more capitalized
## ASCII words joined by spaces and/or hyphens.
##
## NOTE(review): the non-ASCII literals below are presumably EUC-JP text
## and are kept verbatim.
class KukuraCombinationWordEnglishProperNoun < KukuraCombinationWord
  def initialize(token)
    super(token)
    @category = :english_proper_noun
    @pos = "ͭ̾"
  end

  ## Truthy when the literal of TOKEN starts with an uppercase ASCII letter.
  def self.potential?(token)
    token.literal =~ /^[A-Z]+/
  end

  ## True when STRING as a whole is a proper noun of at least two words.
  def self.match_exact?(string)
    capitalized = '[A-Z][a-zA-Z0-9]*'
    separator   = '[ -]+'
    pattern = /^#{capitalized}(#{separator}#{capitalized})+$/
    (string =~ pattern) == 0
  end

  ## This returns true if the string is a prefix of an English proper noun,
  ## or false.
  def self.match_prefix?(string)
    capitalized = '[A-Z][a-zA-Z0-9]*'
    separator   = '[ -]+'
    pattern = /^#{capitalized}(#{separator}(#{capitalized})?)*$/
    (string =~ pattern) == 0
  end

  ## TOKEN is attachable while the extended literal is still a prefix of a
  ## proper noun.
  def attachable?(token)
    candidate = get_literal() + token.literal
    KukuraCombinationWordEnglishProperNoun::match_prefix?(candidate)
  end

  ## Returns the single word for the collected tokens, or [] when they do
  ## not form a proper noun.  If the literal does not match as it is and the
  ## last token has the delimiter POS, that token is removed from the
  ## collected tokens and the match is tried once more.
  def make_words
    recognizer = KukuraCombinationWordEnglishProperNoun
    return [build_word()] if recognizer::match_exact?(get_literal())

    ## Delete the last delimiter
    return [] if @tokens[-1].pos != "ʬΥ"

    @tokens.pop()
    recognizer::match_exact?(get_literal()) ? [build_word()] : []
  end

  private

  ## Word entry covering all currently collected tokens.
  def build_word
    [get_reading(), @pos, get_literal(), @default_freq,
     @tokens[0].point_beginning, @tokens[-1].point_end, @category]
  end
end


## Combination word for date/time-like strings: digit groups separated by
## ":", "/" or "-", or the Japanese-style pattern below.
##
## NOTE(review): the non-ASCII characters in the Japanese patterns are
## presumably EUC-JP text (some separators may be missing in this view);
## they are kept verbatim.
class KukuraCombinationWordDate < KukuraCombinationWord
  def initialize(token)
    super(token)
    @category = :date
    @pos = "̾"
  end

  ## Truthy when the literal of TOKEN starts with an ASCII digit.
  def self.potential?(token)
    token.literal =~ /^[0-9]+/
  end

  ## True when STRING as a whole matches one of the two date patterns.
  def self.match_exact?(string)
    digits    = '[0-9]+'
    separator = '[:/-]+'
    separated = /^#{digits}(#{separator}#{digits})+$/

    japanese = /^(#{digits}ǯ)?(#{digits})?(#{digits})?$/
    return true if (string =~ separated) == 0
    (string =~ japanese) == 0
  end

  ## This returns true if the string is a prefix of a date string, or false.
  def self.match_prefix?(string)
    digits    = '[0-9]+'
    separator = '[:/-]+'
    separated_prefix = /^#{digits}(#{separator}(#{digits})?)*$/
    japanese_prefix =
          /^(#{digits}ǯ?)?(#{digits}?)?(#{digits}?)?$/
    return true if (string =~ separated_prefix) == 0
    (string =~ japanese_prefix) == 0
  end

  ## TOKEN is attachable while the extended literal is still a date prefix.
  def attachable?(token)
    candidate = get_literal() + token.literal
    KukuraCombinationWordDate::match_prefix?( candidate )
  end

  ## Returns the single word for the collected tokens, or [] when their
  ## literal is not a complete date.
  def make_words
    return [] unless KukuraCombinationWordDate::match_exact?( get_literal() )

    [[get_reading(), @pos, get_literal(), @default_freq,
      @tokens[0].point_beginning, @tokens[-1].point_end, @category]]
  end
end


## Combination word for e-mail addresses (user@domain.domain...).
##
## NOTE(review): the non-ASCII @pos literal is presumably EUC-JP text and
## is kept verbatim.
class KukuraCombinationWordEmail < KukuraCombinationWord
  def initialize (token)
    super(token)
    @category = :email
    @pos = "ͭ̾"
  end

  ## A token may begin an address when its literal is an address prefix.
  def KukuraCombinationWordEmail::potential? (token)
    return match_prefix?( token.literal )
  end

  ## True when STRING as a whole is an address: a user part, "@" and two
  ## or more dot-separated domain labels.
  def KukuraCombinationWordEmail::match_exact? (string)
    user   = '[0-9A-Za-z_!#\$%&*+\-\/=\?^_{|}\~\.]+'
    domain = '[0-9A-Za-z_\-]+'
    regexp_email = /^#{user}@#{domain}(\.#{domain})+$/
    return ((string =~ regexp_email) == 0)
  end

  ## This returns true if the string is a prefix of an email address, or false.
  ##
  ## The label after each dot is optional ("?") rather than repeated ("*"):
  ## since a label is already "one or more characters", both forms accept
  ## the same strings, but the repeated form nests two quantifiers and
  ## backtracks exponentially when a long label is followed by a character
  ## that cannot be part of an address.
  def KukuraCombinationWordEmail::match_prefix? (string)
    user   = '[0-9A-Za-z_!#\$%&*+\-\/=\?^_{|}\~\.]+'
    domain = '[0-9A-Za-z_\-]+'
    regexp_email_prefix = /^#{user}(@(#{domain}(\.(#{domain})?)*)?)?$/
    return ((string =~ regexp_email_prefix) == 0)
  end

  ## TOKEN is attachable while the extended literal is still an address
  ## prefix.
  def attachable? (token)
    next_literal = get_literal() + token.literal
    return KukuraCombinationWordEmail::match_prefix?( next_literal )
  end

  ## Returns the single word for the collected tokens, or [] when their
  ## literal is not a complete address.
  def make_words
    if KukuraCombinationWordEmail::match_exact?( get_literal() ) then
      word = [get_reading(), @pos, get_literal(), @default_freq,
              @tokens[0].point_beginning, @tokens[-1].point_end, @category]
      return [word]
    else
      return []
    end
  end
end

## Combination word for URLs of the form "scheme://rest".
##
## NOTE(review): the non-ASCII @pos literal is presumably EUC-JP text and
## is kept verbatim.
class KukuraCombinationWordURL < KukuraCombinationWord
  def initialize(token)
    super(token)
    @category = :url
    @pos = "ͭ̾"
  end

  ## A token may begin a URL when its literal is a URL prefix.
  def self.potential?(token)
    match_prefix?( token.literal )
  end

  ## True when STRING as a whole is a URL: a scheme of lowercase letters
  ## and hyphens, "://" and a non-empty remainder.
  def self.match_exact?(string)
    remainder = '[0-9A-Za-z_!#\$%&*+\-\/=\?^_{|}\~\.]+'
    pattern = /^[a-z-]+:\/\/#{remainder}$/
    (string =~ pattern) == 0
  end

  ## This returns true if the string is a prefix of a URL, or false.
  def self.match_prefix?(string)
    remainder = '[0-9A-Za-z_!#\$%&*+\-\/=\?^_{|}\~\.]+'
    pattern = /^[a-z-]+(:\/\/(#{remainder})?)?$/
    (string =~ pattern) == 0
  end

  ## TOKEN is attachable while the extended literal is still a URL prefix.
  def attachable?(token)
    candidate = get_literal() + token.literal
    KukuraCombinationWordURL::match_prefix?( candidate )
  end

  ## Returns the single word for the collected tokens, or [] when their
  ## literal is not a complete URL.
  def make_words
    return [] unless KukuraCombinationWordURL::match_exact?( get_literal() )

    [[get_reading(), @pos, get_literal(), @default_freq,
      @tokens[0].point_beginning, @tokens[-1].point_end, @category]]
  end
end
