#!/usr/bin/perl -w

use strict;
use vars qw($VERSION);

require 5.005;

$VERSION = sprintf "%d.%d", q$Revision: 1.48 $ =~ /(\d+)/g; # jhi@iki.fi

use Getopt::Long;

my $n = 1;
my ($help, $debug, $input, $latin1, $output, $tables, $version);
my $fold = 1;

sub help {
    print <<EOF;
Usage: $0 [options] [files]
Compute the ngram frequencies and output the word lengths, ngrams,
initials ngrams, medial (non-initial non-final) ngrams, and final
ngrams.  The frequencies will be shown as absolute, as relative to
the total frequency, and as relative to the maximum frequency.
Options:
--n=N		The default is one character ngrams.
--nofold	The default is to lowercase the ngrams.
--input=C	The default is to use native 8-bit bytes and native definition
		of "letters".  Using this option requires Perl 5.8.
		The input will be mapped from C to Unicode, and the Unicode
		Letter definition will be used for "letters".
		For example to input ISO 8859-1 characters, use -i=latin1.
--latin1	Assume the input to be ISO 8859-1.  Does not require Perl 5.8.
--output=F	Output to files F.ngl, F.ngr, F.ngi, F.ngm, F.ngf.
		The default is to use STDOUT.
--tables	Instead of a linear list of ngram frequencies output a list
		of two-dimensional tables of ngram frequencies (for n > 1).
--help		Show this help.
--version	Show version.
--debug		Show the ngrams in STDERR.
The options can be shortened to their unique prefixes and
the two dashes to one dash.  No files means using STDIN.
EOF
    exit(1);
}

help()
    unless
      GetOptions('n=i'        => \$n,
		 'fold!'      => \$fold,
		 'input=s'    => \$input,
		 'latin1'     => \$latin1,
		 'output=s'   => \$output,
		 'tables'     => \$tables,
		 'help'       => \$help,
		 'version'    => \$version,
		 'debug'      => \$debug);

help() if $n < 1 || int($n) != $n;

sub version {
    print $VERSION, "\n";
    exit(1);
}

help()    if $help;
version() if $version;

require 5.008 if $input;

my %length_freq;
my %ngram_freq;
my %initial_freq;
my %medial_freq;
my %final_freq;

@ARGV = ("-") unless @ARGV;

my $letter = $latin1 ?
    '[A-Za-zÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ]' :
    $] >= 5.008 ? '\pL' : '\w';

for my $fn (@ARGV) {
    if (open(my $fh, $fn)) {
	binmode($fh, ":encoding($input)") if $input;
	while (<$fh>) {
	    chomp;
	    while (/($letter+)/go) {
		my $s = $1;
		my $l = length $s;
		$length_freq{$l}++;
		if ($l >= $n) {
		    if ($fold) {
			if ($latin1) {
			    $s =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ/abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ/;
			} else {
			    $s = lc $s;
			}
		    }
		    my @n = ();
		    while ($s =~ /(.{$n})/g) {
			$ngram_freq{$1}++;
			push @n, $1;
			pos($s) -= $n - 1;
		    }
		    print STDERR "@n\n" if $debug;
		    $initial_freq{my $ini = shift @n}++;
		    $final_freq  {@n ? pop @n : $ini}++;
		    $medial_freq {$_}++ for @n;
		}
	    }
	}
    } else {
	warn "$0: Failed to open '$fn' for reading: $!\n";
    }
}

sub summax {
    my $freq = shift;

    my $max = 0;
    my $sum = 0;
    for my $a (keys %{$freq}) {
	next unless defined $freq->{$a};
	$max  = $freq->{$a} if $freq->{$a} > $max;
	$sum += $freq->{$a};
    }
    
    return ($sum, $max);
}

sub table_show {
    my ($out, $freq, $m, $sum, $max, $n, $s) = @_;
    if ($n == 2) {
	print $out length $s ? $s : " ";
	for my $b (@$m) {
	    print $out "\t$b";
	}
	print $out "\n";
	for my $a (@$m) {
	    print $out $a;
	    for my $b (@$m) {
		my $sab = $s . $a . $b;
		my $f = $freq->{$sab} || 0;
		printf $out "\t%6d", $f;
	    }
	    print $out "\n";
	}
    } else {
	for my $a (@$m) {
	    table_show($out, $freq, $m, $sum, $max, $n - 1, $s . $a);
	}
    }
}

sub show {
    my ($out, $freq, $k, $m, $title) = @_;
    
    print $out "$title Frequencies\n";
    my ($sum, $max) = summax($freq);
    if ($tables && defined $m && $n > 1) {
	table_show($out, $freq, $m, $sum, $max, $n, '');
    } else {
	for my $a (@{ $k }) {
	    my $f = $freq->{$a} || 0;
	    printf $out "%${n}s\t%6d\t%9.6f\t%.6f\n",
	           $a, $f, $sum ? $f / $sum : 0, $max ? $f / $max : 0;
	}
	printf $out "%${n}s\t%6d\t%9.6f\n",
	       "Sum", $sum, 1.0;
    }
}

my $outputfh = *STDOUT;

binmode(STDOUT, ":encoding($input)") if $input && !defined $output;

sub output_open {
    my $suffix = shift;
    my $outputfn = "$output.ng$suffix";
    if (open($outputfh, ">$outputfn")) {
	binmode($outputfh, ":encoding($input)") if $input;
    } else {
	warn "$0: Failed to open '$outputfn' for writing: $!\n";
    }
}

sub freq_show {
    my ($outputfh, $freq, $k, $m, $suffix, $title) = @_;
    output_open($suffix) if defined $output;
    show($outputfh, $freq, $k, $m, $title);
}

my @l = sort { $a <=> $b } keys %length_freq;

freq_show($outputfh, \%length_freq,
	  @l ? [ 1 .. $l[-1] ] : [], undef, 'l', "Word Length");

my @k = sort keys %ngram_freq;
my @m;

if ($tables && $n > 1) {
    my %s;
    @s{ split // } = () for @k;
    @m = sort keys %s;
}

freq_show($outputfh, \%ngram_freq,   \@k, \@m, 'r', "Ngram");
freq_show($outputfh, \%initial_freq, \@k, \@m, 'i', "Initial Ngram");
freq_show($outputfh, \%medial_freq,  \@k, \@m, 'm', "Medial Ngram");
freq_show($outputfh, \%final_freq,   \@k, \@m, 'f', "Final Ngram");

exit(0);

__END__
=head1 NAME

ngram - compute and display frequencies of ngrams

=head1 SYNOPIS

  ngram [--help] [--n=2] [input files]

=head1 DESCRIPTION

This script computes and display the ngram frequencies in its input.
If no ngram length is specified, 1 (one) is assumed.
If no files are specified, STDIN is read.

=head2 Sample Output

With the input C<banana and bandana> and ngram length of one would get
the following output.  The initial, medial, and final ngrams refer to
I<word-initial>, and so forth -- but note that the script doesn't have
artificial intelligence built in: C<doesn't> is two words, as is
C<inter-continental>.  The C<#> are comments added here for the sake
of documentation, they are not part of the real output.

  Word Length Frequencies
  1	     0	 0.000000	0.000000
  2	     0	 0.000000	0.000000
  3	     1	 0.333333	1.000000 # We had one word of length 3.
  4	     0	 0.000000	0.000000
  5	     0	 0.000000	0.000000
  6	     1	 0.333333	1.000000
  7	     1	 0.333333	1.000000
  Sum	     3	 1.000000                # We had three words total.
  Ngram Frequencies
  a	     7	 0.437500	1.000000 # We had seven letters "a".
  b	     2	 0.125000	0.285714
  d	     2	 0.125000	0.285714
  n	     5	 0.312500	0.714286
  Sum	    16	 1.000000                # We had sixteen letters total.
  Initial Ngram Frequencies
  a	     1	 0.333333	0.500000 # We had one word beginning with "a".
  b	     2	 0.666667	1.000000 # We had two words beginning with "b".
  d	     0	 0.000000	0.000000 # We had no words beginning with "d".
  n	     0	 0.000000	0.000000
  Sum	     3	 1.000000
  Medial Ngram Frequencies
  a	     4	 0.400000	0.800000
  b	     0	 0.000000	0.000000 # We had no "b"s in the middle.
  d	     1	 0.100000	0.200000 # We only one "d" in the middle.
  n	     5	 0.500000	1.000000
  Sum	    10	 1.000000
  Final Ngram Frequencies
  a	     2	 0.666667	1.000000
  b	     0	 0.000000	0.000000 # We had no "b"s ending words.
  d	     1	 0.333333	0.500000
  n	     0	 0.000000	0.000000
  Sum	     3	 1.000000

The sum of initial, medial, and final ngram frequencies equals the
number of all ngrams.

=head2 Options

  --n=N

The default is to use one character ngrams, in other words, single
characters.  With the C<-n> option you can change the ngram length.
I<N> must be greater than or equal to one.

  --nofold

The default is to lowercase all the letters before counting the frequencies:
for example C<Ab> is lowercased to C<ab>.  With the C<--nofold> option this
lowercasing is not done.

  --input=C

The default is to use the native 8-bit bytes and the native definition of
"letters".  With the C<--input> option any character set and encoding
recognized by the Encode extension can be used as the input.  Using this
option unfortunately requires at least Perl 5.8.0, but for a common
character set see the C<--latin1> option.

  --latin1

Assume the input to be in ISO 8859-1 (Latin 1).  Using this option
does not require Perl 5.8.

  --output=F

Output the resulting ngram frequencies to files C<F.ngl>, C<F.ngr>,
C<F.ngi>, C<F.ngm>, C<F.ngf> for the lengths, ngrams, initial ngrams,
medial ngrams, and final ngrams, respectively.  The default is to output
all the results to the standard output, and in a different order (lengths,
ngrams, initial ngrams, medial ngrams, final ngrams). 

  --tables

Relevant only if I<N> is two or more: usually a linear list of the
ngram frequencies is output, but with the C<--table> option a list of
two-dimensional tables is shown.  This may make it easier to visualize
the distribution of the ngrams.  The relative frequencies and sums are
not shown, only the absolute frequencies.

  --help

Show a concise help message.

  --version

Show version of the F<ngram.pl> script.

  --debug

Output various debugging information.  Currently shows the detected
ngrams to the standard error output.

All the options can be shortened to their unique prefixes, and also the
leading C<--> be shortened to a single C<->.

=head1 PREREQUISITES

Getopt::Long
strict
vars

=head1 COREQUISITES

Encode

=head1 SCRIPT CATEGORIES

Text::Statistics

=head1 README

Compute ngram (one letter, digram, trigram, ...) frequencies.
Useful for generating text using Markov chains or for cryptogeeks.

=head1 SEE ALSO

Simon Cozen's Text::Ngram module in CPAN

=head1 COPYRIGHT

(C) 2003 by Jarkko Hietaniemi <jhi@iki.fi>

All rights reserved. You may distribute this code under the terms
of either the GNU General Public License or the Artistic License,
as specified in the Perl README file.

=cut