#!/usr/local/bin/perl
#
# morogram.5.8.pl (N-gram tool for Perl 5.8.x)
# by Shigeki Moro (s-moro@hanazono.ac.jp)
#
# $Id: morogram.5.8.pl,v 1.3 2008/08/28 13:00:02 moroshigeki Exp $
#
# This file is part of "morogram".
# "morogram" is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
# 
# このスクリプトは以下の論文にあるアルゴリズムに基づいています。
# Makoto NAGAO and Shinsuke MORI.
# 	"A New Method of N-gram Statistics for Large Number of n and Automatic
# 	Extraction of Words and Phrases from Large Text Data of Japanese",
# 	In Proceedings of the 15th International Conference on Computational
# 	Linguistics (1994), pp.611-615.
# 	http://www-lab25.kuee.kyoto-u.ac.jp/member/mori/postscript/Coling94.ps

use strict;
use Encode qw(from_to decode encode);
use Encode::Guess;
use charnames ':full';	# \pM etc.
use utf8;



#-----------------------------------------------------------------//
# 定数定義
#-----------------------------------------------------------------//
my $VERSION			= '0.7.3';
my $VERSION2		= ''; # a b c...
my $FREQ_MIN		= 2;
my $FREQ_MAX		= 0;
my $GRAM_MIN		= 1;
my $GRAM_MAX		= 256;
my $OFFSET_FILE		= 'morogram.offset.bin';
my $POINTER_FILE	= 'morogram.pointer.bin';
my $COINCID_FILE	= 'morogram.coincidence.bin';
my $UTF8TEMP_FILE	= 'morogram.utf8';
my $UTF8WORD_FILE	= 'morogram.word.utf8';
my $DEFAULT_CODE	= 'utf8';
my $SORTER			= 'morogram-sort-'.$VERSION.$VERSION2.'.exe';



#-----------------------------------------------------------------//
# 著作権表示用文字列定義
#-----------------------------------------------------------------//
my $AUTHOR_INFO	=<< "__EOT__";
N-gram tool version $VERSION
  by Shigeki Moro (s-moro\@hanazono.ac.jp).

$0
  Win32 freestanding executable version
  by Toshihiro Yasuda (FZH01112\@nifty.com)
__EOT__



#-----------------------------------------------------------------//
# 使い方文字列定義
#-----------------------------------------------------------------//
my $USAGE_INFO	=<< "__EOT__";
Usage: $0 [switches] input_file >output_file
  --help, --?  Display this help.
  --f=min,max  Set min and max frequency
               (default: min=$FREQ_MIN, max=$FREQ_MAX).
  --g=min,max  Set min and max gram
               (default: min=$GRAM_MIN, max=$GRAM_MAX).
  --p          Delete punctuations.
  --e          Regard &Mnnnnnn; as a character.
  --BOM        Print Byte Order Mark (BOM).
  --c          Case sensitive.
  --w          Word-gram (need segmented text)
  --w1/--w2    Incl. spaces/char-gram by word.
  --w5/--w6    Word-gram (use internal segmenter).
  --I=encoding Input char encoding (default: utf8).
  --O=encoding Output char encoding.
  --V          Show version and available encodings.
  --d          debug (output/leave temp files)
__EOT__



#-----------------------------------------------------------------//
# 引数解析
#-----------------------------------------------------------------//
my($input_file, $output_file);
my($input_code, $output_code);
my($fmin,$fmax) = ($FREQ_MIN,$FREQ_MAX);
my($gmin,$gmax) = ($GRAM_MIN,$GRAM_MAX);
my $entity2char		= 0;
my $del_punct		= 0;
my $print_BOM		= 0;
my $ignore_case		= 1;
my $word_mode		= 0;
my $space_mode		= 0;
my $space_mode_chr	= "";
my $show_help		= 0;
my $show_enclist	= 0;
my $debug			= 0;

foreach(@ARGV){
	# 頻度設定
	if(m/^--f=(\d+),(\d+)$/){
		$fmin		= $1;
		$fmax		= $2;
	}
	elsif(m/^--f=,(\d+)$/){			$fmax		= $1	}
	elsif(m/^--f=(\d+),?$/){		$fmin		= $1	}

	# 文字数設定
	elsif(m/^--g=(\d+),(\d+)$/){
		$gmin		= $1;
		$gmax		= $2;
	}
	elsif(m/^--g=,(\d+)$/){			$gmax		= $1	}
	elsif(m/^--g=(\d+),?$/){		$gmin		= $1	}

	# その他
	elsif(m/^[\-\/]{1,2}[\?hH]$/){	$show_help		= 1		}
	elsif(m/^[\-\/]{1,2}help$/i){	$show_help		= 1		}
	elsif(m/^--V$/){				$show_enclist	= 1		}
	elsif(m/^--e$/){				$entity2char	= 1		}
	elsif(m/^--p$/){				$del_punct		= 1		}
	elsif(m/^--c$/){				$ignore_case	= 0		}
	elsif(m/^--BOM$/){				$print_BOM		= 1		}
	elsif(m/^--w$/){				$word_mode		= 1		}
	elsif(m/^--w1$/){ $space_mode = 1; $space_mode_chr = "\x20"; }
	elsif(m/^--w2$/){ $space_mode = 2; $space_mode_chr = "\x20"; }
	elsif(m/^--w5$/){ $space_mode = 1; $word_mode	= 1		}
	elsif(m/^--w6$/){ $space_mode = 2; $word_mode	= 1		}
	elsif(m/^--d$/){				$debug = 1;				}

	# 入出力文字コード
	elsif(m/^--I=(.+)$/){			$input_code		= $1	}
	elsif(m/^--O=(.+)$/){			$output_code	= $1	}

	elsif(defined $input_file or (!-e $_ and m/^--?[A-Z]/i)){
		print STDERR "invalid argument: $_\n";
		exit 255;
	}elsif(!-f $_){
		print STDERR "no such file or directory: $_\n";
		exit 255;
	}else{
		$input_file	= $_;
	}
}

$gmin ||= 1;
$fmin ||= 1;
$gmax	= $gmin if($gmax < $gmin and $gmax);
$fmax	= $fmin if($fmax < $fmin and $fmax);



#-----------------------------------------------------------------//
# ヘルプ表示
#-----------------------------------------------------------------//
if($show_help){
	print $USAGE_INFO;
	exit 8;
}



#-----------------------------------------------------------------//
# 対応文字コード一覧表示
#-----------------------------------------------------------------//
if($show_enclist){
	print qq(Available encodings:\n);
	my $i;
	
	foreach(Encode->encodings(":all")){
		printf("%-20s\t",$_);
		if(++$i % 3 == 0){ $i = 0; print "\n"; }
	}
	
	print "\n" if $i % 3;
	print "\n";
	exit 8;
}



#-----------------------------------------------------------------//
# ファイル指定がないときは GUI を表示する。
#----------------------------------------------------------------//
print STDERR $AUTHOR_INFO,"\n";
eval q'
	# Windows only
	while(not defined $input_file){	# Windows only
		use Win32::FileOp qw/
					OpenDialog SaveAsDialog
					OFN_FILEMUSTEXIST OFN_OVERWRITEPROMPT/;
	
		# Open/SaveDialog 表示には msvcr71.dll が必要
		my $dll = PerlApp::extract_bound_file("msvcr71.dll");
		$dll =~ s/\/[^\/]+$//;
		$ENV{PATH} = $dll.";".$ENV{PATH};
#		print `dir $dll`;

		my $file;
	
		# 入力ファイル名
		$file = OpenDialog({options => OFN_FILEMUSTEXIST});
		-e $file ? ($input_file = $file) : last;
	
		# 出力ファイル名
		$file = SaveAsDialog({options => OFN_OVERWRITEPROMPT});
		$output_file = $file || "con";
	
		# GUI 使用時は文字コードを自動判別
		$input_code ||= "Guess";
		last;
	}
';



#-----------------------------------------------------------------//
# ファイル指定がないときは使い方を表示して終了する。
#----------------------------------------------------------------//
if(not defined $input_file){
	print $USAGE_INFO;
	exit 8;
}



#-----------------------------------------------------------------//
# 入出力文字コードの設定
#-----------------------------------------------------------------//
if($input_code =~ m/^Guess$/i){
	open(FILE,"<",$input_file) or die qq($input_file : $!\n);
	local $/;
	local $_ = <FILE>;
	close FILE;

	my $suspects = $0;
	my @suspects;

#	while($suspects =~ s/[CJKT]//){
#		$& eq 'C' and push(@suspects,qw/euc-cn/);
#		$& eq 'J' and push(@suspects,qw/shiftjis euc-jp 7bit-jis/);
#		$& eq 'K' and push(@suspects,qw/euc-kr/);
#		$& eq 'T' and push(@suspects,qw/big5-eten/);
#	}

	push(@suspects,qw/euc-cn/);
	push(@suspects,qw/shiftjis euc-jp 7bit-jis/);
	push(@suspects,qw/euc-kr/);
	push(@suspects,qw/big5-eten/);

	my $enc = guess_encoding($_,@suspects);
	$input_code = ref($enc) ? $enc->name : $DEFAULT_CODE;

	# 後に行単位で読み込むので、２行目以降、ファイル先頭の
	# BOM はあてにできない。代わりにここで BE/LE を確定させる。
	if($input_code =~ m/^u.+\d/i){
		if(m/^\xFE\xFF/){		$input_code .= "BE" }
		elsif(m/^\xFF\xFE/){	$input_code .= "LE" }
	}
}

$input_code ||= $DEFAULT_CODE;
$output_code ||= $input_code;



#-----------------------------------------------------------------//
# sorter の選択
#-----------------------------------------------------------------//
my $sorter = $SORTER;;
eval{
	# Perl2Exe only
	no strict qw(subs);
	$sorter	= qq/"@{[PerlApp::extract_bound_file($SORTER)]}"/;
};



#-----------------------------------------------------------------//
# 引数解析結果の表示
#-----------------------------------------------------------------//
print STDERR << "__EOT__";
  frequency range  : [$fmin .. $fmax]
  gram size range  : [$gmin .. $gmax]
  delete punct     : @{[$del_punct   ? 'yes' : 'no']}
  regard &Mnnnnnn; : @{[$entity2char ? 'yes' : 'no']}
  print BOM        : @{[$print_BOM   ? 'yes' : 'no']}
  case sensitive   : @{[$ignore_case ? 'no' : 'yes']}
  word recognition : @{[$word_mode   ? 'yes' : 'no']}
  special mode(-wN): @{[$space_mode  ? 'yes:'.$space_mode : 'no']}
  input encoding   : $input_code
  output encoding  : $output_code
  filename         : $input_file
__EOT__
print STDERR "\n";



#-----------------------------------------------------------------//
# 前処理
#-----------------------------------------------------------------//
print STDERR "------- First Stage -------\n";
unlink($OFFSET_FILE, $POINTER_FILE, $COINCID_FILE, $UTF8TEMP_FILE);
print STDERR "\tcreating offset table...";

open(OFFSET,">:raw",$OFFSET_FILE) or die "can't open $OFFSET_FILE\n";
open(FILE,"<:raw",$input_file) or die "can't open $input_file\n";

my @utf8;

if($word_mode){
	local $/;
	
	if($input_code =~ m/u.+32LE$/i){
		$/ = "\x0A\x00\x00\x00";
	}elsif($input_code =~ m/u.+16LE$/i or $input_code =~ m/u.+2LE$/i){		$/ = "\x0A\x00";
	}

	if($space_mode and $debug){
		open(WORD, ">", $UTF8WORD_FILE)
			or die "can't open $UTF8WORD_FILE\n";
		print WORD "$_\n";
	}

	my(%id,$id);
	while(<FILE>){
		$_ = decode($input_code,$_);	# $input_code => internal
		s/\x{FEFF}//g;
		s/&M(\d{6});/chr(0x0EFFFF+$1)/ge	if $entity2char;

		if($space_mode){
			s/\p{InBasicLatin}+/ $& /g;
			s/\p{InHiragana}+/ $& /g;
			s/\p{InKatakana}+/ $& /g;
			s/\p{InCJKUnifiedIdeographs}+/ $& /g;

			if($space_mode == 2){
				s/(\p{InCJKUnifiedIdeographs})\s+
				([\p{InHiragana}\p{InKatakana}])/$1$2/gx;
				
				s/(\p{InKatakana})\s+
				([\p{InHiragana}\p{InCJKUnifiedIdeographs}])/$1$2/gx;
			}
		}

		s/[　\x0D\x0A\x09\x20]+/\x20/go;
		if($ignore_case){	tr/A-Z/a-z/;	tr/Ａ-Ｚ/ａ-ｚ/;	}
		s/[\pM\pP\pS\pZ]+/\x20/go if $del_punct;

		foreach my $utf8 (split m/\x20+/){
			if(not exists $id{$utf8}){
				next unless length $utf8;
				$utf8[++$id] = encode($DEFAULT_CODE,$utf8);
#				$utf8[++$id] = $utf8;
				$id{$utf8} = $id;
				# $id == 0xFFFFFFFF なら ID オーバフローだが、
				# そんな巨大なデータはあり得ない。
			}
			print OFFSET pack('N',$id{$utf8});
		}

		if($space_mode and $debug){
			print WORD "$_\n";
		}
	}
}else{
	local $/;
	
	if($input_code =~ m/u.+32LE$/i){
		$/ = "\x0A\x00\x00\x00";
	}elsif($input_code =~ m/u.+16LE$/i or $input_code =~ m/u.+2LE$/i){		$/ = "\x0A\x00";
	}

	while(<FILE>){
		$_ = decode($input_code,$_);	# $input_code => internal
		s/^\x{FEFF}//;
		s/&M(\d{6});/chr(0x0EFFFF+$1)/ge	if $entity2char;
		s/[　\x0D\x0A\x09\x20]+/$space_mode_chr/go;
		if($ignore_case){	tr/A-Z/a-z/;	tr/Ａ-Ｚ/ａ-ｚ/;	}
		s/[\pM\pP\pS\pZ]+//go if $del_punct;
		print OFFSET encode('UTF-32BE',$_);	# utf8 => ucs4
	}
}

close FILE;
close OFFSET;

my $du_byte = -s $OFFSET_FILE;
my $byte4	= $du_byte / 4;
$gmax	= $byte4	if($gmax > $byte4 or !$gmax);
$gmin	= $byte4	if $gmin > $byte4;
$fmax	= $byte4	if($fmax > $byte4 or !$fmax);
$fmin	= $byte4	if $fmin > $byte4;
printf STDERR "done.\n\tnumber of characters is $byte4.\n";

if($byte4 == 1){
	print STDERR "\t*** too few items for N-gram statistics ***\n";
	exit 255;
}



#-----------------------------------------------------------------//
# 主処理
#-----------------------------------------------------------------//
print STDERR qq(\n);
print STDERR qq(------- Second Stage ------\n);

if(defined $output_file){
	open(STDOUT,">",$output_file) or die "$! $output_file\n";
}

print "\x{FEFF}" if $print_BOM;

if($word_mode){
	my $opt = "-t";
	system "$sorter -$byte4 $gmin $gmax $fmin $fmax $opt";
	$du_byte += (-s $POINTER_FILE) + (-s $COINCID_FILE);
	unlink($OFFSET_FILE, $POINTER_FILE, $COINCID_FILE) unless $debug;
	open(FILE,"<:raw",$UTF8TEMP_FILE)
		or die "$!: $UTF8TEMP_FILE\n";
	while(<FILE>){
		s/^(\d+\t)((?:....)+?)(\t\d+)\D*/$1@{[
			join(" ",@utf8[unpack('N*',$2)])
		]}$3\n/s or do{ $_ .= <FILE>; redo; };

		from_to($_,$DEFAULT_CODE => $output_code);
		print $_;
	}
	close FILE;
}else{
	my $opt = ($space_mode or $output_code ne $DEFAULT_CODE) ?
				"-t" : undef;
	system "$sorter +$byte4 $gmin $gmax $fmin $fmax $opt";
	$du_byte += (-s $POINTER_FILE) + (-s $COINCID_FILE);
	unlink($OFFSET_FILE, $POINTER_FILE, $COINCID_FILE) unless $debug;
	if($opt){
		open(FILE,"<:raw",$UTF8TEMP_FILE)
			or die "$!: $UTF8TEMP_FILE\n";

		if($space_mode){
			while(<FILE>){
				if(m|$space_mode_chr|go){
					next if $space_mode == 2;
				}else{
					next if m|\t$space_mode_chr|go;
					next if m|$space_mode_chr\t|go;
				}

				from_to($_,$DEFAULT_CODE => $output_code);
				print $_;
			}
		}else{
			while(<FILE>){
				from_to($_,$DEFAULT_CODE => $output_code);
				print $_;
			}
		}
		close FILE;
	}
}

print STDERR "\n";



#-----------------------------------------------------------------//
# 後処理
#-----------------------------------------------------------------//
END{
	$du_byte += -s $UTF8TEMP_FILE;
	unlink($UTF8TEMP_FILE) unless $debug;
	printf STDERR "  Disk usage: %.1lf\[%s]\n",
		($du_byte /= 1024) < 1024 ? ($du_byte, "KB") :
		($du_byte /= 1024) < 1024 ? ($du_byte, "MB") :
		($du_byte /= 1024, "GB");

#	if($debug == 0){
#		my $first_message	= "  Deleting temporary file(s)...";
#		my $isok	= 1;
#
#		foreach my $file ($OFFSET_FILE, $POINTER_FILE, $COINCID_FILE,
#							$UTF8TEMP_FILE){
#			if(-e $file){
#				$du += -s $file;
#				$isok	*= unlink $file;
#				if(defined $first_message){
#					print STDERR $first_message;
#					undef $first_message;
#				}
#			}
#		}
#
#		exit 0 if $first_message;
#		print STDERR $isok ? "done.\n" : "failed\n";
#	}
	
	my $now = time - $^T;
	printf STDERR
			("  Total time: %d hour(s) %d minute(s) %d second(s)\n",
			$now / 3600, $now % 3600 / 60, $now % 60);

	exit 0;
}

__END__
