#! /usr/bin/env ruby
# coding: utf-8


# ==============================================================================
# convert_alt_cannadic_to_mozcdic
# ==============================================================================

# Convert the alt-cannadic dictionary named by $filename (EUC-JP text) into a
# Mozc dictionary written to $dicname (UTF-8 text).
#
# Each input line has the form
#   あきびん #T35*202 空き瓶 空瓶 #T35*151 空きビン 空ビン #T35*150 空きびん
# i.e. a reading followed by "#<part of speech>*<value>" tags, each of which
# applies to the surface forms that follow it until the next tag.
#
# Each output line is "reading<TAB>id<TAB>id<TAB>cost<TAB>surface form". The
# output is de-duplicated and sorted. Part-of-speech IDs are looked up in
# "../mozc/id.def", resolved relative to the current directory.
#
# Reads the globals $filename and $dicname; takes no arguments.
# Raises RuntimeError when a required part of speech is missing from id.def.
def convert_alt_cannadic_to_mozcdic
	# Read id.def explicitly as UTF-8 so that matching against the UTF-8
	# patterns below does not depend on the process locale.
	id_lines = File.read("../mozc/id.def", encoding: "UTF-8").split("\n")
	id_t35 = find_mozc_pos_id(id_lines, /\ 名詞,一般,\*,\*,\*,\*,\*/)
	id_cn = find_mozc_pos_id(id_lines, /\ 名詞,固有名詞,地域,一般,\*,\*,\*/)

	# The source dictionary is EUC-JP; read raw bytes and transcode to UTF-8.
	text = File.binread($filename).encode("UTF-8", "EUC-JP")

	entries = []

	text.each_line do |line|
		# The first field is the reading; blank lines yield no fields at all.
		yomi, *tokens = line.split(" ")

		# Most recently seen cannadic part-of-speech tag on this line.
		hinsi = ""

		tokens.each do |token|
			if token.start_with?("#")
				hinsi = token
				next
			end

			# Person names get the ID of "名詞,一般,*,*,*,*,*".
			# "名詞,固有名詞,人名,一般,*,*" has low priority, so an entry such as
			# 「明石屋さんま」 under "名詞,固有名詞,一般,*,*,*" would win over it.
			# "名詞,固有名詞,一般,*,*,*" itself is avoided because it is a
			# filtering target.
			pos_id =
				if hinsi.start_with?("#T3", "#T0", "#JN", "#KK")
					id_t35
				elsif hinsi.start_with?("#CN")
					id_cn
				end

			# Any other part of speech (or a surface form with no preceding
			# tag) is skipped.
			next unless pos_id

			# In alt-cannadic a larger value means higher priority, so it is
			# subtracted from 7000 to give a smaller cost to preferred entries.
			cost = 7000 - hinsi.split("*")[1].to_i

			entries << [yomi, pos_id, pos_id, cost, token].join("\t")
		end
	end

	# The block form closes the file even if writing raises.
	File.open($dicname, "w:UTF-8") { |out| out.puts entries.uniq.sort }
end

# Return the numeric ID (first whitespace-separated field) of the first
# id.def line matching +pattern+; raise when no line matches.
def find_mozc_pos_id(id_lines, pattern)
	entry = id_lines.grep(pattern).first
	raise "part of speech #{pattern.inspect} not found in ../mozc/id.def" unless entry

	entry.split(" ")[0]
end


# ==============================================================================
# main
# ==============================================================================

# Download alt-cannadic, extract the two source dictionaries, convert each one
# into a Mozc dictionary, then remove the temporary files.
require "fileutils"

archive_dir = "alt-cannadic-110208"
archive = "#{archive_dir}.tar.bz2"

# -N only re-downloads when the remote copy is newer. A failed download is
# tolerated as long as a previously downloaded tarball is still present.
system("wget", "-N", "https://ja.osdn.net/dl/alt-cannadic/#{archive}")
abort "#{archive} is not available" unless File.exist?(archive)

FileUtils.rm_rf(archive_dir)
system("tar", "xf", archive) || abort("failed to extract #{archive}")

# Move the files one by one instead of relying on shell brace expansion,
# which /bin/sh does not provide on every system.
%w[gcanna.ctd g_fname.ctd].each do |name|
	FileUtils.mv(File.join(archive_dir, name), ".")
end

$filename = "gcanna.ctd"
$dicname = "mozcdic-ut-alt-cannadic.txt"
convert_alt_cannadic_to_mozcdic

$filename = "g_fname.ctd"
$dicname = "mozcdic-ut-alt-cannadic-jinmei.txt"
convert_alt_cannadic_to_mozcdic

FileUtils.rm_rf(archive_dir)
FileUtils.rm_f(Dir.glob("*.ctd"))
