#!/usr/bin/perl -w use strict; use vars qw($VERSION); require 5.005; $VERSION = sprintf "%d.%d", q$Revision: 1.48 $ =~ /(\d+)/g; # jhi@iki.fi use Getopt::Long; my $n = 1; my ($help, $debug, $input, $latin1, $output, $tables, $version); my $fold = 1; sub help { print < 1). --help Show this help. --version Show version. --debug Show the ngrams in STDERR. The options can be shortened to their unique prefixes and the two dashes to one dash. No files means using STDIN. EOF exit(1); } help() unless GetOptions('n=i' => \$n, 'fold!' => \$fold, 'input=s' => \$input, 'latin1' => \$latin1, 'output=s' => \$output, 'tables' => \$tables, 'help' => \$help, 'version' => \$version, 'debug' => \$debug); help() if $n < 1 || int($n) != $n; sub version { print $VERSION, "\n"; exit(1); } help() if $help; version() if $version; require 5.008 if $input; my %length_freq; my %ngram_freq; my %initial_freq; my %medial_freq; my %final_freq; @ARGV = ("-") unless @ARGV; my $letter = $latin1 ? '[A-Za-zÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ]' : $] >= 5.008 ? '\pL' : '\w'; for my $fn (@ARGV) { if (open(my $fh, $fn)) { binmode($fh, ":encoding($input)") if $input; while (<$fh>) { chomp; while (/($letter+)/go) { my $s = $1; my $l = length $s; $length_freq{$l}++; if ($l >= $n) { if ($fold) { if ($latin1) { $s =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ/abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ/; } else { $s = lc $s; } } my @n = (); while ($s =~ /(.{$n})/g) { $ngram_freq{$1}++; push @n, $1; pos($s) -= $n - 1; } print STDERR "@n\n" if $debug; $initial_freq{my $ini = shift @n}++; $final_freq {@n ? pop @n : $ini}++; $medial_freq {$_}++ for @n; } } } } else { warn "$0: Failed to open '$fn' for reading: $!\n"; } } sub summax { my $freq = shift; my $max = 0; my $sum = 0; for my $a (keys %{$freq}) { next unless defined $freq->{$a}; $max = $freq->{$a} if $freq->{$a} > $max; $sum += $freq->{$a}; } return ($sum, $max); } sub table_show { my ($out, $freq, $m, $sum, $max, $n, $s) = @_; if ($n == 2) { print $out length $s ? $s : " "; for my $b (@$m) { print $out "\t$b"; } print $out "\n"; for my $a (@$m) { print $out $a; for my $b (@$m) { my $sab = $s . $a . $b; my $f = $freq->{$sab} || 0; printf $out "\t%6d", $f; } print $out "\n"; } } else { for my $a (@$m) { table_show($out, $freq, $m, $sum, $max, $n - 1, $s . $a); } } } sub show { my ($out, $freq, $k, $m, $title) = @_; print $out "$title Frequencies\n"; my ($sum, $max) = summax($freq); if ($tables && defined $m && $n > 1) { table_show($out, $freq, $m, $sum, $max, $n, ''); } else { for my $a (@{ $k }) { my $f = $freq->{$a} || 0; printf $out "%${n}s\t%6d\t%9.6f\t%.6f\n", $a, $f, $sum ? $f / $sum : 0, $max ? $f / $max : 0; } printf $out "%${n}s\t%6d\t%9.6f\n", "Sum", $sum, 1.0; } } my $outputfh = *STDOUT; binmode(STDOUT, ":encoding($input)") if $input && !defined $output; sub output_open { my $suffix = shift; my $outputfn = "$output.ng$suffix"; if (open($outputfh, ">$outputfn")) { binmode($outputfh, ":encoding($input)") if $input; } else { warn "$0: Failed to open '$outputfn' for writing: $!\n"; } } sub freq_show { my ($outputfh, $freq, $k, $m, $suffix, $title) = @_; output_open($suffix) if defined $output; show($outputfh, $freq, $k, $m, $title); } my @l = sort { $a <=> $b } keys %length_freq; freq_show($outputfh, \%length_freq, @l ? [ 1 .. $l[-1] ] : [], undef, 'l', "Word Length"); my @k = sort keys %ngram_freq; my @m; if ($tables && $n > 1) { my %s; @s{ split // } = () for @k; @m = sort keys %s; } freq_show($outputfh, \%ngram_freq, \@k, \@m, 'r', "Ngram"); freq_show($outputfh, \%initial_freq, \@k, \@m, 'i', "Initial Ngram"); freq_show($outputfh, \%medial_freq, \@k, \@m, 'm', "Medial Ngram"); freq_show($outputfh, \%final_freq, \@k, \@m, 'f', "Final Ngram"); exit(0); __END__ =head1 NAME ngram - compute and display frequencies of ngrams =head1 SYNOPIS ngram [--help] [--n=2] [input files] =head1 DESCRIPTION This script computes and display the ngram frequencies in its input. If no ngram length is specified, 1 (one) is assumed. If no files are specified, STDIN is read. =head2 Sample Output With the input C and ngram length of one would get the following output. The initial, medial, and final ngrams refer to I, and so forth -- but note that the script doesn't have artificial intelligence built in: C is two words, as is C. The C<#> are comments added here for the sake of documentation, they are not part of the real output. Word Length Frequencies 1 0 0.000000 0.000000 2 0 0.000000 0.000000 3 1 0.333333 1.000000 # We had one word of length 3. 4 0 0.000000 0.000000 5 0 0.000000 0.000000 6 1 0.333333 1.000000 7 1 0.333333 1.000000 Sum 3 1.000000 # We had three words total. Ngram Frequencies a 7 0.437500 1.000000 # We had seven letters "a". b 2 0.125000 0.285714 d 2 0.125000 0.285714 n 5 0.312500 0.714286 Sum 16 1.000000 # We had sixteen letters total. Initial Ngram Frequencies a 1 0.333333 0.500000 # We had one word beginning with "a". b 2 0.666667 1.000000 # We had two words beginning with "b". d 0 0.000000 0.000000 # We had no words beginning with "d". n 0 0.000000 0.000000 Sum 3 1.000000 Medial Ngram Frequencies a 4 0.400000 0.800000 b 0 0.000000 0.000000 # We had no "b"s in the middle. d 1 0.100000 0.200000 # We only one "d" in the middle. n 5 0.500000 1.000000 Sum 10 1.000000 Final Ngram Frequencies a 2 0.666667 1.000000 b 0 0.000000 0.000000 # We had no "b"s ending words. d 1 0.333333 0.500000 n 0 0.000000 0.000000 Sum 3 1.000000 The sum of initial, medial, and final ngram frequencies equals the number of all ngrams. =head2 Options --n=N The default is to use one character ngrams, in other words, single characters. With the C<-n> option you can change the ngram length. I must be greater than or equal to one. --nofold The default is to lowercase all the letters before counting the frequencies: for example C is lowercased to C. With the C<--nofold> option this lowercasing is not done. --input=C The default is to use the native 8-bit bytes and the native definition of "letters". With the C<--input> option any character set and encoding recognized by the Encode extension can be used as the input. Using this option unfortunately requires at least Perl 5.8.0, but for a common character set see the C<--latin1> option. --latin1 Assume the input to be in ISO 8859-1 (Latin 1). Using this option does not require Perl 5.8. --output=F Output the resulting ngram frequencies to files C, C, C, C, C for the lengths, ngrams, initial ngrams, medial ngrams, and final ngrams, respectively. The default is to output all the results to the standard output, and in a different order (lengths, ngrams, initial ngrams, medial ngrams, final ngrams). --tables Relevant only if I is two or more: usually a linear list of the ngram frequencies is output, but with the C<--table> option a list of two-dimensional tables is shown. This may make it easier to visualize the distribution of the ngrams. The relative frequencies and sums are not shown, only the absolute frequencies. --help Show a concise help message. --version Show version of the F script. --debug Output various debugging information. Currently shows the detected ngrams to the standard error output. All the options can be shortened to their unique prefixes, and also the leading C<--> be shortened to a single C<->. =head1 PREREQUISITES Getopt::Long strict vars =head1 COREQUISITES Encode =head1 SCRIPT CATEGORIES Text::Statistics =head1 README Compute ngram (one letter, digram, trigram, ...) frequencies. Useful for generating text using Markov chains or for cryptogeeks. =head1 SEE ALSO Simon Cozen's Text::Ngram module in CPAN =head1 COPYRIGHT (C) 2003 by Jarkko Hietaniemi All rights reserved. You may distribute this code under the terms of either the GNU General Public License or the Artistic License, as specified in the Perl README file. =cut