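/*
 * splitter: splits a sentence into segments
 *
 * Detects segment boundaries:
 *  anthy_init_split_context() creates the context used for splitting,
 *  anthy_mark_border() performs the split, and
 *  anthy_release_split_context() releases the context.
 *
 *  anthy_commit_border() learns from the committed result.
 *
 *  anthy_get_nr_seginfo()
 *  anthy_get_nth_seginfo() return the seg_info entries covering a range.
 *
 * Funded by IPA Exploratory Software Project 2001 9/22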
 * Copyright (C) 2000-2004 TABATA Yusuke
 * Copyright (C) 2000-2001 UGAWA Tomoharu
 *
 * $Id: splitter.c,v 1.48 2002/11/18 11:39:18 yusuke Exp $
 */
/*
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
 */
#include <stdlib.h>
#include <string.h>

#include <alloc.h>
#include <record.h>
#include <splitter.h>
#include <logger.h>
#include "wordborder.h"

/* Limit passed to anthy_truncate_section() after a new entry is added
 * to the "EXPANDPAIR" section (see proc_expanded_segment()). */
#define MAX_EXPAND_PAIR_ENTRY_COUNT 1000

/* Allocator for struct seg_info; created in anthy_init_splitter(). */
static allocator seginfo_ator;
/* SPLITTER_DEBUG_* bits; set in anthy_init_splitter() and exposed through
 * anthy_splitter_debug_flags(). */
static int splitter_debug_flags;

/* Word types resolved by name in anthy_init_splitter(). */
wtype_t anthy_wtype_noun;
wtype_t anthy_wtype_name_noun;
wtype_t anthy_wtype_num_noun;
wtype_t anthy_wtype_prefix;
wtype_t anthy_wtype_num_prefix;
wtype_t anthy_wtype_postfix;
wtype_t anthy_wtype_num_postfix;
wtype_t anthy_wtype_name_postfix;
wtype_t anthy_wtype_sv_postfix;
wtype_t anthy_wtype_a_tail_of_v_renyou;
wtype_t anthy_wtype_v_renyou;
wtype_t anthy_wtype_noun_tail;/* NOTE(review): original example comment was mis-encoded; presumably a noun-forming tail -- confirm */
wtype_t anthy_wtype_n1;
wtype_t anthy_wtype_n10;
wtype_t anthy_wtype_noun_and_postfix;


/** Destructor registered for meta_word objects.
 *  Hands the attached seg_info (if one was created) back to seginfo_ator.
 */
static void
metaword_dtor(void *p)
{
  struct meta_word *meta = p;

  if (!meta->si) {
    /* no seg_info was ever built for this metaword */
    return;
  }
  anthy_sfree(seginfo_ator, meta->si);
}

/** Destructor registered for seg_info objects.
 *  Releases the candidate string and the word_info array owned by the
 *  seg_info.
 */
static void
seginfo_dtor(void *p)
{
  struct seg_info *seg = p;

  /* free(NULL) is a no-op, so unset members need no guard */
  free(seg->cand.str);
  free(seg->word_info);
}

/** Releases everything that make_word_cache() attached to the context:
 *  the three allocators, the per-character arrays and the cache itself.
 */
static void
release_info_cache(struct splitter_context *sc)
{
  struct word_split_info_cache *cache = sc->word_split_info;
  int pos;

  anthy_free_allocator(cache->MwAllocator);
  anthy_free_allocator(cache->WlAllocator);
  anthy_free_allocator(cache->ExAllocator);

  /* only entries 0 .. char_count-1 are visited here */
  for (pos = 0; pos < sc->char_count; pos++) {
    /* free(NULL) is a no-op */
    free(cache->cnode[pos].ex);
  }

  free(cache->cnode);
  free(cache->seq_len);
  free(cache->rev_seq_len);
  free(cache);
}

/** Ƥʬʸåơʸθ󤹤
 *  ǳݤƤrelease_info_cacheǲ 
 */
static void
make_word_cache(struct splitter_context *sc)
{
  int i;
  struct word_split_info_cache *info;

  /* åΥǡ */
  sc->word_split_info = malloc(sizeof(struct word_split_info_cache));
  info = sc->word_split_info;
  info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word),
					     metaword_dtor);
  info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0);
  info->ExAllocator = anthy_create_allocator(sizeof(struct extent), 0);
  info->cnode =
    malloc(sizeof(struct char_node) * (sc->char_count + 1));

  info->seq_len = malloc(sizeof(int) * (sc->char_count + 1));
  info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1));

  /* ʸǥåФƽԤ */
  for (i = 0; i <= sc->char_count; i++) {
    info->seq_len[i] = 0;
    info->rev_seq_len[i] = 0;
    info->cnode[i].wl = 0;
    info->cnode[i].mw = 0;
    info->cnode[i].ex = 0;
    info->cnode[i].max_len = 0;
  }

  /* word_listƤmetaword */
  anthy_make_word_list_all(sc);
  anthy_make_metaword_all(sc);
}

static struct seg_info *
init_seginfo(struct meta_word *mw)
{
  mw->si = anthy_smalloc(seginfo_ator);
  mw->si->cand.str = 0;
  mw->si->word_info = 0;
  mw->si->nr_word_info = 0;
  mw->si->core_word_info_index = -1;
  mw->si->info_len = 0;
  mw->si->struct_ratio = RATIO_BASE;
  mw->si->score = 0;
  mw->si->seg_class = mw->seg_class;
  return mw->si;
}

/** Appends the non-empty parts of a word_list to the word_info array of
 *  a seg_info.
 *
 *  For every part with a non-zero length one word_info entry (wt, len,
 *  ratio) is appended and info_len grows by the part length.  When the
 *  appended part is PART_CORE its index is stored in
 *  core_word_info_index.
 *
 *  If the array cannot be grown the function stops and leaves the
 *  seg_info as it was after the last successful append; the previous
 *  array stays valid and owned by the seg_info.
 */
static void
seginfo_pushback_word_list(struct seg_info *si,
			   struct word_list *wl)
{
  int i;
  /* walk every part of the word_list */
  for (i = 0; i < NR_PARTS; i++) {
    struct part_info *part = &wl->part[i];
    struct word_info *grown;
    if (part->len == 0) {
      /* parts without length are ignored */
      continue ;
    }
    /* grow through a temporary so the old array survives a failure */
    grown = realloc(si->word_info,
		    (si->nr_word_info+1) * sizeof(struct word_info));
    if (!grown) {
      /* out of memory: keep what has been collected so far */
      return ;
    }
    si->word_info = grown;
    if (i == PART_CORE) {
      /* record the index only once the slot really exists */
      si->core_word_info_index = si->nr_word_info;
    }
    /* copy the part */
    si->word_info[si->nr_word_info].wt = part->wt;
    si->word_info[si->nr_word_info].len = part->len;
    si->word_info[si->nr_word_info].ratio = part->ratio;
    si->info_len += part->len;
    si->nr_word_info ++;
  }
}

/** Adjusts si->struct_ratio from the structure of the seg_info.
 *
 *  Nothing is changed when the seg_info has no word_info entries.
 *  Otherwise the ratio is scaled by a penalty for uncovered characters
 *  and by the ratio of every word_info entry, and is never left at 0.
 *
 *  NOTE(review): the reference length is read from si->info_len itself,
 *  so the uncovered count starts at 0 and the penalty factor is always
 *  RATIO_BASE/RATIO_BASE here.  Confirm whether the length of the
 *  metaword was meant to be used instead.
 */
static void
eval_seginfo_by_struct(struct seg_info *si)
{
  int total = si->info_len;
  int penalty = total - si->info_len;
  int idx;

  /* without word_info the default ratio is kept */
  if (si->nr_word_info == 0) {
    return ;
  }

  /* clamp the uncovered length to 7 ... */
  if (penalty > 7) {
    penalty = 7;
  }
  /* ... and do not count the first two characters */
  penalty -= 2;
  if (penalty < 0) {
    penalty = 0;
  }
  si->struct_ratio *= (RATIO_BASE - penalty *
		       (RATIO_BASE/16));
  si->struct_ratio /= RATIO_BASE;

  /* apply the weight of every part */
  for (idx = 0; idx < si->nr_word_info; idx++) {
    si->struct_ratio *= si->word_info[idx].ratio;
    si->struct_ratio /= RATIO_BASE;
  }

  /* keep the ratio non-zero */
  if (si->struct_ratio == 0) {
    si->struct_ratio = 1;
  }
}

/** 礵줿metawordseg_infoФ */
static struct seg_info *
get_seginfo_from_combined_metaword(struct meta_word *mw)
{
  struct seg_info *si;
  si = init_seginfo(mw);
  si->type = SI_NORMAL;
  seginfo_pushback_word_list(si, mw->mw1->wl);
  seginfo_pushback_word_list(si, mw->mw2->wl);

  si->struct_ratio = mw->mw2->wl->part[PART_DEPWORD].ratio;
  eval_seginfo_by_struct(mw->si);
  return si;
}

/** Builds the seg_info of a metaword that directly carries a word_list.
 *  The parts of the word_list are appended, the structure is evaluated
 *  and the ratio is then scaled by the PART_DEPWORD and PART_CORE ratios.
 */
static struct seg_info *
get_seginfo_from_simple_metaword(struct meta_word *mw)
{
  struct word_list *words = mw->wl;
  struct seg_info *result = init_seginfo(mw);

  result->type = SI_NORMAL;

  /* copy every non-empty part of the word_list */
  seginfo_pushback_word_list(result, words);

  /* score derived from the structure */
  eval_seginfo_by_struct(result);

  /* weight of the dependent-word part */
  result->struct_ratio *= words->part[PART_DEPWORD].ratio;
  result->struct_ratio /= RATIO_BASE;

  /* weight of the core part */
  result->struct_ratio *= words->part[PART_CORE].ratio;
  result->struct_ratio /= RATIO_BASE;

  return result;
}

/**
 * Returns the seg_info of a metaword, creating it on first use.
 *
 * An already attached seg_info is returned as is.  A metaword with a
 * non-empty word_list is handled by get_seginfo_from_simple_metaword();
 * otherwise the handling is picked from anthy_metaword_type_tab.
 * Returns 0 when the metaword type provides no seg_info.
 */
static struct seg_info *
get_seginfo_from_metaword(struct meta_word *mw)
{
  struct seg_info *made;

  /* already built */
  if (mw->si) {
    return mw->si;
  }

  /* metaword that carries its own word_list */
  if (mw->wl && mw->wl->len) {
    return get_seginfo_from_simple_metaword(mw);
  }

  /* otherwise the metaword type decides */
  switch (anthy_metaword_type_tab[mw->type].si) {
  case MW_SEGINFO_COMBINED:
    /* two metawords joined together */
    return get_seginfo_from_combined_metaword(mw);

  case MW_SEGINFO_WRAPPED:
    /* use the seg_info of the wrapped metaword */
    return get_seginfo_from_metaword(mw->mw1);

  case MW_SEGINFO_OCHAIRE_LEAF:
    /* candidate taken from cand_hint */
    made = init_seginfo(mw);
    made->type = SI_CAND;
    made->cand.str = anthy_xstr_dup_str(mw->cand_hint);
    made->cand.len = mw->cand_hint->len;
    /* the seg_info spans the whole metaword */
    made->info_len = mw->len;
    eval_seginfo_by_struct(made);
    return made;

  case MW_SEGINFO_NONE:
  default:
    /* no seg_info for this type */
    break;
  }
  return 0;
}

/** Counts the metawords that start at index from, have length len and
 *  yield a seg_info.
 */
int
anthy_get_nr_seginfo(struct splitter_context *sc,
		     int from, int len)
{
  struct meta_word *cur = sc->word_split_info->cnode[from].mw;
  int count = 0;

  while (cur) {
    if (cur->len == len && get_seginfo_from_metaword(cur)) {
      count++;
    }
    cur = cur->next;
  }
  return count;
}

/** Returns the nth (0-based) seg_info among the metawords that start at
 *  index from and have length len, counting only metawords that yield a
 *  seg_info.  Returns NULL when there is no such entry.
 */
struct seg_info *
anthy_get_nth_seginfo(struct splitter_context *sc,
		      int from, int len, int nth)
{
  struct meta_word *cur;
  int seen = 0;

  for (cur = sc->word_split_info->cnode[from].mw; cur; cur = cur->next) {
    struct seg_info *found;

    if (cur->len != len) {
      continue;
    }
    found = get_seginfo_from_metaword(cur);
    if (!found) {
      continue;
    }
    if (seen == nth) {
      return found;
    }
    seen++;
  }
  return NULL;
}

/** Top-level entry point of the word splitter.
 *
 *  Copies the current seg_border marks of the context into a scratch
 *  array, lets anthy_eval_border() decide the borders for [from, to),
 *  clears the marks strictly between from and from2, and writes the
 *  marks for [from, to) back into sc->ce.
 *
 *  Does nothing when the range is empty (to <= from) or when the
 *  scratch array cannot be allocated.
 *
 *  The scratch array lives on the heap for the duration of the call
 *  only: info->seg_border is reset to NULL before returning so the
 *  long-lived cache never keeps a pointer to released storage.
 */
void
anthy_mark_border(struct splitter_context *sc,
		  int from, int from2, int to)
{
  int i;
  int nr_slots;
  int *scratch;
  struct word_split_info_cache *info;

  /* sanity check */
  if ((to - from) <= 0) {
    return ;
  }

  /* scratch area for the border marks; its size follows the input
   * length, so it is taken from the heap rather than the stack */
  info = sc->word_split_info;
  nr_slots = sc->char_count + 1;
  scratch = malloc(sizeof(int) * nr_slots);
  if (!scratch) {
    return ;
  }
  info->seg_border = scratch;
  for (i = 0; i < nr_slots; i++) {
    info->seg_border[i] = sc->ce[i].seg_border;
  }

  /* decide the borders */
  anthy_eval_border(sc, from, to);

  /* no border is allowed strictly between from and from2 */
  for (i = from+1; i < from2; i++) {
    info->seg_border[i] = 0;
  }
  /* publish the result */
  for (i = from; i < to; i++) {
    sc->ce[i].seg_border = info->seg_border[i];
  }

  /* the scratch area must not outlive this call */
  info->seg_border = NULL;
  free(scratch);
}

/* Records that the segment starting at index from was expanded from its
 * initial length to len.  The pair is stored in the "EXPANDPAIR" section
 * keyed by the initial string; nothing is written when the expanded
 * string is already stored or a stored value cannot be read.
 */
static void
proc_expanded_segment(struct splitter_context *sc,
		      int from, int len)
{
  xstr before, after;
  int idx, count;

  /* both strings start at the same character and differ only in length */
  before.str = sc->ce[from].c;
  before.len = sc->ce[from].initial_seg_len;
  after.str = sc->ce[from].c;
  after.len = len;

  if (anthy_select_section("EXPANDPAIR", 1) == -1 ||
      anthy_select_column(&before, 1) == -1) {
    return ;
  }

  count = anthy_get_nr_values();
  for (idx = 0; idx < count; idx++) {
    xstr *stored = anthy_get_nth_xstr(idx);

    if (!stored) {
      return ;
    }
    if (anthy_xstrcmp(stored, &after) == 0) {
      /* already recorded */
      return ;
    }
  }

  /* append as a new value and keep the section bounded */
  anthy_set_nth_xstr(count, &after);
  anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT);
}

/* Learns from the committed segmentation.
 *
 * Walks the nr_segments committed segments (lengths in seg_len, optional
 * seg_info in info).  A segment is reported to proc_expanded_segment()
 * when all of the following hold:
 *  - an initial segment length is recorded at its start and that initial
 *    segment does not end exactly at the end of the input,
 *  - the committed length covers at least the initial segment plus the
 *    initial segment that followed it,
 *  - info[i] is set and its info_len is longer than the initial length.
 */
void
anthy_commit_border(struct splitter_context *sc, int nr_segments,
		    struct seg_info **info, int *seg_len)
{
  int seg;
  int start = 0;

  for (seg = 0; seg < nr_segments; seg++) {
    int committed_len = seg_len[seg];
    int initial_len = sc->ce[start].initial_seg_len;

    if (initial_len && start + initial_len != sc->char_count) {
      int next_len = sc->ce[start + initial_len].initial_seg_len;

      /* expanded far enough to swallow the following segment? */
      if (initial_len + next_len <= committed_len) {
	int real_len = info[seg] ? info[seg]->info_len : 0;

	if (real_len > initial_len) {
	  proc_expanded_segment(sc, start, real_len);
	}
      }
    }
    start += committed_len;
  }
}

/** Returns the SPLITTER_DEBUG_* bits selected by anthy_init_splitter(). */
int
anthy_splitter_debug_flags(void)
{
  const int flags = splitter_debug_flags;

  return flags;
}

/** Prepares a splitter context for the string xs.
 *
 *  Allocates one char_ent per character plus a terminating entry, points
 *  each entry at its character in xs (the context refers to xs->str, it
 *  does not copy it), marks both ends as segment borders and builds the
 *  word cache.
 */
void
anthy_init_split_context(xstr *xs, struct splitter_context *sc)
{
  struct char_ent *ents;
  int pos;

  sc->char_count = xs->len;
  ents = malloc(sizeof(struct char_ent) * (xs->len + 1));
  sc->ce = ents;

  for (pos = 0; pos <= xs->len; pos++) {
    ents[pos].c = &xs->str[pos];
    ents[pos].seg_border = 0;
    ents[pos].initial_seg_len = 0;
  }

  /* both ends of the string are always borders */
  ents[0].seg_border = 1;
  ents[xs->len].seg_border = 1;

  make_word_cache(sc);
}

/** Releases what anthy_init_split_context() attached to the context and
 *  clears the pointers, so calling it again on the same context is
 *  harmless.
 */
void
anthy_release_split_context(struct splitter_context *sc)
{
  if (sc->word_split_info) {
    release_info_cache(sc);
    sc->word_split_info = 0;
  }

  /* free(NULL) is a no-op, so no guard is needed */
  free(sc->ce);
  sc->ce = 0;
}

/** Initializes the splitter as a whole.
 *
 *  Reads the debug-print environment variables, initializes the
 *  dependent-word table, creates the seg_info allocator and resolves the
 *  word types used by the splitter.
 *
 *  Returns -1 when anthy_init_depword_tab() reports failure, otherwise
 *  the result of anthy_init_wordlist().
 */
int
anthy_init_splitter(void)
{
  /* Debug-print configuration: flags are only considered when
   * ANTHY_ENABLE_DEBUG_PRINT is set to a non-empty value and
   * ANTHY_DISABLE_DEBUG_PRINT is not set at all. */
  char *en = getenv("ANTHY_ENABLE_DEBUG_PRINT");
  char *dis = getenv("ANTHY_DISABLE_DEBUG_PRINT");
  splitter_debug_flags = SPLITTER_DEBUG_NONE;
  if (!dis && en && strlen(en)) {
    /* each letter found in ANTHY_SPLITTER_PRINT turns on one flag */
    char *fs = getenv("ANTHY_SPLITTER_PRINT");
    if (fs) {
      if (strchr(fs, 'w')) {
	splitter_debug_flags |= SPLITTER_DEBUG_WL;
      }
      if (strchr(fs, 'm')) {
	splitter_debug_flags |= SPLITTER_DEBUG_MW;
      }
      if (strchr(fs, 'a')) {
	splitter_debug_flags |= SPLITTER_DEBUG_AN;
      }
      if (strchr(fs, 'i')) {
	splitter_debug_flags |= SPLITTER_DEBUG_ID;
      }
      if (strchr(fs, '1')) {
	splitter_debug_flags |= SPLITTER_DEBUG_1;
      }
      if (strchr(fs, 'S')) {
	splitter_debug_flags |= SPLITTER_DEBUG_S;
      }
    }
  }
  /* initialize the dependent-word table */
  if (anthy_init_depword_tab()) {
    anthy_log(0, "Failed to init dependent word table.\n");
    return -1;
  }
  /* allocator for seg_info objects, released through seginfo_dtor */
  seginfo_ator = anthy_create_allocator(sizeof(struct seg_info), seginfo_dtor);
  /* Resolve the word types by name.
   * NOTE(review): the name literals below are not ASCII and appear
   * mis-decoded in this view; they are runtime lookup keys and must be
   * kept byte-for-byte in the file's original encoding. */
  anthy_name_to_wtype("̾", &anthy_wtype_noun);
  anthy_name_to_wtype("̾", &anthy_wtype_name_noun);
  anthy_name_to_wtype("", &anthy_wtype_num_noun);
  anthy_name_to_wtype("ƻ첽", &anthy_wtype_a_tail_of_v_renyou);
  anthy_name_to_wtype("ưϢѷ", &anthy_wtype_v_renyou);
  anthy_name_to_wtype("̾첽", &anthy_wtype_noun_tail);
  anthy_name_to_wtype("̾", &anthy_wtype_noun_and_postfix);
  anthy_name_to_wtype("̾Ƭ", &anthy_wtype_prefix);
  anthy_name_to_wtype("Ƭ", &anthy_wtype_num_prefix);
  anthy_name_to_wtype("̾", &anthy_wtype_postfix);
  anthy_name_to_wtype("", &anthy_wtype_num_postfix);
  anthy_name_to_wtype("̾", &anthy_wtype_name_postfix);
  anthy_name_to_wtype("", &anthy_wtype_sv_postfix);
  anthy_name_to_wtype("1", &anthy_wtype_n1);
  anthy_name_to_wtype("10", &anthy_wtype_n10);
  /* finally initialize the word-list module; its result is returned */
  return anthy_init_wordlist();
}

/** Shuts the splitter down by releasing the dependent-word table that
 *  anthy_init_splitter() set up.
 */
void
anthy_quit_splitter(void)
{
  /* counterpart of anthy_init_depword_tab() */
  anthy_release_depword_tab();
}
