#! /usr/bin/perl

#*****************************************************************************
# IrstLM: IRST Language Model Toolkit
# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy

# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.

# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA

#******************************************************************************



#first pass: read dictionary and generate 1-grams
#second pass:
#for n=2 to N
#  foreach (n-1)-gram
#      foreach n-gram having that (n-1)-gram as history
#          compute smoothing statistics
#          store successors
#      compute back-off probability
#      compute smoothed probabilities
#      write the (n-1)-gram with its back-off probability
#      write all n-grams with their smoothed probabilities
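#
#intermediate files: the first pass writes ${sublm}.1gr.gz ("count word" per
#line); each pass n>=2 writes ${sublm}.<n>gr.gz ("log10prob w1 ... wn" per
#line) and rewrites the (n-1)-gram file with back-off weights appended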

use strict;
use Getopt::Long "GetOptions";
use File::Basename;

my $gzip=`which gzip 2> /dev/null`;
my $gunzip=`which gunzip 2> /dev/null`;
chomp($gzip);
chomp($gunzip);
my $cutoffword="<CUTOFF>"; #special word for Google 1T-ngram cut-offs 
my $cutoffvalue=39;   #cut-off threshold for Google 1T-ngram cut-offs 
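#in the Google 1T collection, n-grams below a frequency threshold (40) are not
#listed; <CUTOFF> stands for the aggregated mass of those hidden successors,
#hence the threshold value 39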

#set defaults for optional parameters
my ($verbose,$size,$ngrams,$sublm)=(0, 0, undef, undef);
my ($witten_bell,$good_turing,$kneser_ney,$improved_kneser_ney)=(0, 0, "", "");
my ($witten_bell_flag,$good_turing_flag,$kneser_ney_flag,$improved_kneser_ney_flag)=(0, 0, 0, 0);
my ($freqshift,$prune_singletons,$cross_sentence)=(0, 0, 0);

my $help = 0;
$help = 1 unless
  &GetOptions('size=i' => \$size,
	      'freq-shift=i' => \$freqshift, 
	      'ngrams=s' => \$ngrams,
	      'sublm=s' => \$sublm,
	      'witten-bell' => \$witten_bell,
	      'good-turing' => \$good_turing,
	      'kneser-ney=s' => \$kneser_ney,
	      'improved-kneser-ney=s' => \$improved_kneser_ney,
	      'prune-singletons' => \$prune_singletons,
	      'cross-sentence' => \$cross_sentence,
	      'h|help' => \$help,
	      'verbose' => \$verbose);


if ($help || !$size || !$ngrams || !$sublm) {
	my $cmnd = basename($0);
  print "\n$cmnd - estimates single LMs\n",
	"\nUSAGE:\n",
	"       $cmnd [options]\n",
	"\nOPTIONS:\n",
    "       --size <int>          maximum n-gram size for the language model\n",
    "       --ngrams <string>     input file or command to read the ngram table\n",
    "       --sublm <string>      output file prefix to write the sublm statistics \n",
    "       --freq-shift <int>    (optional) value to be subtracted from all frequencies\n",
    "       --witten-bell         (optional) use witten bell linear smoothing (default)\n",
    "       --kneser-ney <string> (optional) use kneser-ney smoothing with statistics in <string> \n",
    "       --improved-kneser-ney <string> (optional) use improved kneser-ney smoothing with statistics in <string> \n",
    "       --good-turing         (optional) use good-turing linear smoothing\n",
    "       --prune-singletons    (optional) remove n-grams occurring once, for n=3,4,5,... (disabled by default)\n",
    "       --cross-sentence      (optional) include cross-sentence bounds (disabled by default)\n",
    "       --verbose             (optional) print debugging info\n",
    "       -h, --help            (optional) print these instructions\n",
    "\n";

  exit(1);
}
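#map the smoothing options onto flags; Witten-Bell is the default
#when no smoothing method is selected on the command line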

$witten_bell_flag = 1 if ($witten_bell);
$good_turing_flag = 1 if ($good_turing);
$kneser_ney_flag = 1 if ($kneser_ney);
$improved_kneser_ney_flag = 1 if ($improved_kneser_ney);
$witten_bell = $witten_bell_flag = 1 if ($witten_bell_flag + $kneser_ney_flag + $improved_kneser_ney_flag + $good_turing_flag) == 0;

warn "build-sublm: size $size ngrams $ngrams sublm $sublm witten-bell $witten_bell kneser-ney $kneser_ney improved-kneser-ney $improved_kneser_ney good-turing $good_turing prune-singletons $prune_singletons cross-sentence $cross_sentence\n" if $verbose;

die "build-sublm: value of --size must be larger than 0\n" if $size<1;
die "build-sublm: choose only one smoothing method\n" if ($witten_bell_flag + $kneser_ney_flag + $improved_kneser_ney_flag + $good_turing_flag) > 1;

my $log10=log(10.0);	   #constant to convert natural logs into base-10 logs
my $oldwrd="";		   #last word seen, to detect when the 1-gram changes
my @cnt=();		   #n-gram counts
my $totcnt=0;		   #total n-gram count
my ($ng,@ng);		   #current n-gram (line and fields)
my $ngcnt=0;		   #frequency of the current n-gram
my $n;

warn "Collecting 1-gram counts\n";

open(INP,"$ngrams") || open(INP,"$ngrams|")  || die "cannot open $ngrams\n";
open(GR,"|$gzip -c >${sublm}.1gr.gz") || die "cannot create ${sublm}.1gr.gz\n";

while ($ng=<INP>) {
  
  chomp($ng);  @ng=split(/[ \t]+/,$ng);  $ngcnt=(pop @ng) - $freqshift;
  
  if ($oldwrd ne $ng[0]) {
    printf (GR "%s %s\n",$totcnt,$oldwrd) if $oldwrd ne '';
    $totcnt=0;$oldwrd=$ng[0];
  }
  
  #update counter
  $totcnt+=$ngcnt;
}

printf GR "%s %s\n",$totcnt,$oldwrd;
close(INP);
close(GR);
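
#the 1-gram file now contains one "count word" entry per line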

my (@h,$h,$hpr);	      #n-gram history 
my (@dict,$code);	      #sorted dictionary of history successors
my ($diff,$singlediff,$diff1,$diff2,$diff3); #numbers of distinct successors of the history
my (@n1,@n2,@n3,@n4,@uno3);  #IKN: count-of-counts (n-grams occurring once, twice, ...)
my (@beta,$beta);	     #IKN/KN: discounting parameters
my $locfreq;

#collect global statistics for (Improved) Kneser-Ney smoothing
if ($kneser_ney || $improved_kneser_ney) {
  my $statfile=$kneser_ney || $improved_kneser_ney;
  warn "load \& merge IKN statistics from $statfile \n";
  open(IKN,"$statfile") || open(IKN,"$statfile|")  || die "cannot open $statfile\n";
  while (<IKN>) {
    my($lev,$n1,$n2,$n3,$n4,$uno3)=$_=~/level: (\d+)  n1: (\d+) n2: (\d+) n3: (\d+) n4: (\d+) unover3: (\d+)/;
    $n1[$lev]+=$n1;$n2[$lev]+=$n2;$n3[$lev]+=$n3;$n4[$lev]+=$n4;$uno3[$lev]+=$uno3;
  }
  for (my $lev=1;$lev<=$#n1;$lev++) {
    warn "level $lev: $n1[$lev] $n2[$lev]  $n3[$lev] $n4[$lev] $uno3[$lev]\n";
  }
  close(IKN);
}
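
#n1[l]..n4[l]: numbers of l-grams occurring exactly once, twice, three and
#four times (count-of-counts); uno3[l] carries the "unover3" statistic from
#the same file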


warn "Computing n-gram probabilities:\n"; 

for ($n=2;$n<=$size;$n++) {
	
  $code=-1;@cnt=(); @dict=(); $totcnt=0;$diff=0; $singlediff=1; $diff1=0; $diff2=0; $diff3=0; $oldwrd=""; 
	
  #compute smoothing statistics
  my (@beta,$beta);
	
  if ($kneser_ney) {
    if ($n1[$n]==0 || $n2[$n]==0) {
      warn "Error in Kneser-Ney smoothing statistics: resorting to Witten-Bell\n";
      $beta=0;  
    } else {
      $beta=$n1[$n]/($n1[$n] + 2 * $n2[$n]); 
      warn "beta $n: $beta\n";
    }
  }
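
  #Kneser-Ney applies a single absolute discount estimated from the
  #count-of-counts (Ney et al., 1994): beta = n1/(n1 + 2*n2), e.g. with
  #n1=1000 and n2=400, beta = 1000/1800 ~ 0.56 is subtracted from each count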
	
  if ($improved_kneser_ney) {
		
    my $Y=$n1[$n]/($n1[$n] + 2 * $n2[$n]);
		
    if ($n3[$n] == 0 || $n4[$n] == 0 || $n2[$n] <= $n3[$n] || $n3[$n] <= $n4[$n]) {
      warn "Warning: higher order count-of-counts are wrong\n";
      warn "Fixing this problem by resorting only on the lower order count-of-counts\n";      
      $beta[1] = $Y;
      $beta[2] = $Y;
      $beta[3] = $Y;
    } else {
      $beta[1] = 1 - 2 * $Y * $n2[$n] / $n1[$n];
      $beta[2] = 2 - 3 * $Y * $n3[$n] / $n2[$n];
      $beta[3] = 3 - 4 * $Y * $n4[$n] / $n3[$n];
    }
  }
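
  #improved (modified) Kneser-Ney uses three discounts (Chen & Goodman, 1999):
  #with Y = n1/(n1 + 2*n2),
  #  D1  = 1 - 2*Y*n2/n1   applied to counts equal to 1
  #  D2  = 2 - 3*Y*n3/n2   applied to counts equal to 2
  #  D3+ = 3 - 4*Y*n4/n3   applied to counts of 3 or more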
	
  open(HGR,"$gunzip -c ${sublm}.".($n-1)."gr.gz|") || die "cannot open ${sublm}.".($n-1)."gr.gz\n";
  open(INP,"$ngrams") || open(INP,"$ngrams|")  || die "cannot open $ngrams\n";
  open(GR,"|$gzip -c >${sublm}.${n}gr.gz");
  open(NHGR,"|$gzip -c > ${sublm}.".($n-1)."ngr.gz") || die "cannot open ${sublm}.".($n-1)."ngr.gz";

  my $ngram;
  my ($reduced_h, $reduced_ng) = ("", "");

  $ng=<INP>; chomp($ng); @ng=split(/[ \t]+/,$ng); $ngcnt=(pop @ng) - $freqshift;
  $h=<HGR>; chomp($h); @h=split(/ +/,$h); $hpr=shift @h;
  $reduced_ng=join(" ",@ng[0..$n-2]);
  $reduced_h=join(" ",@h[0..$n-2]);
        
  @cnt=(); @dict=();
  $code=-1; $totcnt=0; $diff=0; $singlediff=1; $diff1=0; $diff2=0; $diff3=0; $oldwrd=""; 

  do{
		
    #load all n-grams starting with history h, and collect useful statistics 
		
    while ($reduced_h eq $reduced_ng){ #must be true the first time!  
      #print join(" ",@h[0..$n-2]),"--",join(" ",@ng[0..$n-1]),"--\n";
      #print "oldwrd $oldwrd -- code $code\n";
			 
      if ($oldwrd ne $ng[$n-1]) { #could this be otherwise? [Marcello 22/5/09]
	$dict[++$code]=$oldwrd=$ng[$n-1];
	$diff++;
	$singlediff++ if $ngcnt==1;
      }

      if ($diff>1 && $ng[$n-1] eq $cutoffword) { #in Google n-grams
	#estimate the remaining diff and singlediff proportionally
	$diff--;		#remove the cut-off word itself
	my $concentration=1.0-($diff-1)/$totcnt;
	my $mass=1;		#$totcnt/($totcnt+$ngcnt);
	my $index=(1-($concentration * $mass))/(1-1/$cutoffvalue) + (1/$cutoffvalue);
	my $cutoffdiff=int($ngcnt * $index);
	$cutoffdiff=1 if $cutoffdiff==0;
	#print "diff $diff $totcnt cutofffreq $ngcnt -- cutoffdiff: $cutoffdiff\n";
	#print "concentration:",$concentration," mass:", $mass,"\n";
	$diff+=$cutoffdiff;
      }
      $cnt[$code]+=$ngcnt; $totcnt+=$ngcnt;  

      $ng=<INP>;
      if (defined($ng)){
        chomp($ng);
        @ng=split(/[ \t]+/,$ng);$ngcnt=(pop @ng) - $freqshift;  
	$reduced_ng=join(" ",@ng[0..$n-2]);
      }else{
        last;
      }
    }		
		
    if ($improved_kneser_ney) { 
      for (my $c=0;$c<=$code;$c++) {
	$diff1++ if $cnt[$c]==1;
	$diff2++ if $cnt[$c]==2;
	$diff3++ if $cnt[$c]>=3;
      }
    }
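
    #diff1/diff2/diff3: numbers of successor types seen exactly once, exactly
    #twice, and three or more times; they determine the IKN back-off mass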
		
    #print smoothed probabilities
		
    my $boprob=0;		#accumulate pruned probabilities 
    my $prob=0;
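
    #per-successor probability estimates used below:
    #  Kneser-Ney:       ($cnt[$c]-$beta)/$totcnt
    #  improved KN:      ($cnt[$c]-D_c)/$totcnt, with D_c selected by count
    #  Good-Turing-like: $cnt[$c]/($totcnt+$singlediff)
    #  Witten-Bell:      $cnt[$c]/($totcnt+$diff)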
		
    for (my $c=0;$c<=$code;$c++) {
			
      if ($kneser_ney && $beta>0) {
	$prob=($cnt[$c]-$beta)/$totcnt;
      } elsif ($improved_kneser_ney) {
	my $b=($cnt[$c]>= 3? $beta[3]:$beta[$cnt[$c]]);
	$prob=($cnt[$c] - $b)/$totcnt;
      } elsif ($good_turing && $singlediff>0) {
	$prob=$cnt[$c]/($totcnt+$singlediff);
      } else {
	$prob=$cnt[$c]/($totcnt+$diff);
      }
			
      $ngram=join(" ",$reduced_h,$dict[$c]);
			
      #prune singleton n-grams (n>=3) if --prune-singletons is set
      #prune n-grams (n>=2) spanning sentence boundaries unless --cross-sentence is set
      #prune n-grams containing <unk> or the cut-off word (1-grams are kept)

      #warn "considering $size $n |$ngram|\n";
      if (($prune_singletons && $n>=3 && $cnt[$c]==1) ||
	  (!$cross_sentence && $n>=2 && &CrossSentence($ngram)) ||
	  ($dict[$c]=~/<UNK>/i) || ($n>=2 && $h=~/<UNK>/i) ||	
	  ($dict[$c] eq $cutoffword) 
	 ) {	
					
	$boprob+=$prob;
					
	if ($n<$size) {	#write it anyway: it will be a history at order n+1
	  printf GR "%f %s %s\n",-10000,$reduced_h,$dict[$c];
	}
      } else {			#print unpruned n-gram
	printf(GR "%f %s %s\n",log($prob)/$log10,$reduced_h,$dict[$c]);
      }
    }
		
    #rewrite history including back-off weight
    print "$reduced_h --- $h --- $reduced_ng --- $ng --- $totcnt $diff \n" if $totcnt+$diff==0 && defined($ng);
        
    #check if history has to be pruned out
    if ($hpr==-10000) {
      #skip this history
    } elsif ($kneser_ney && $beta>0) {
      printf NHGR "%s %f\n",$h,log($boprob+($beta * $diff/$totcnt))/$log10;
    } elsif ($improved_kneser_ney) {
      my $lambda=($beta[1] * $diff1 + $beta[2] * $diff2 + $beta[3] * $diff3)/$totcnt; 	  
      printf NHGR "%s %f\n",$h,log($boprob+$lambda)/$log10;
    } elsif ($good_turing && $singlediff>0) {
      printf NHGR "%s %f\n",$h,log($boprob+($singlediff/($totcnt+$singlediff)))/$log10;
    } else {
      printf NHGR "%s %f\n",$h,log($boprob+($diff/($totcnt+$diff)))/$log10;
    }     
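
    #back-off masses written above, added to the mass of pruned successors:
    #  Kneser-Ney:       $beta*$diff/$totcnt
    #  improved KN:      ($beta[1]*$diff1+$beta[2]*$diff2+$beta[3]*$diff3)/$totcnt
    #  Good-Turing-like: $singlediff/($totcnt+$singlediff)
    #  Witten-Bell:      $diff/($totcnt+$diff)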
		
    #reset smoothing statistics
    $code=-1;@cnt=(); @dict=(); $totcnt=0;$diff=0;$singlediff=0;$oldwrd="";$diff1=0;$diff2=0;$diff3=0;$locfreq=0;
		
    #read next history
    $h=<HGR>;
    if (defined($h)){
      chomp($h); @h=split(/ +/,$h); $hpr=shift @h;
      $reduced_h=join(" ",@h[0..$n-2]);
    }else{
      die "ERROR: Somehing could be wrong: history are terminated before ngrams!" if defined($ng);
    }
  } until (!defined($ng));	#n-grams are over
	
  close(HGR); close(INP); close(GR); close(NHGR);
  rename("${sublm}.".($n-1)."ngr.gz","${sublm}.".($n-1)."gr.gz");
}   
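
#hypothetical invocation (file names are illustrative only; within IRSTLM this
#script is normally driven by build-lm.sh, but it can also be run directly):
#
#  build-sublm.pl --size 3 --ngrams "gunzip -c train.3grams.gz" \
#    --sublm lm.dict00 --witten-bell --prune-singletons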


#check whether an n-gram spans a sentence boundary; this happens if
# either <s> occurs anywhere but the first position
# or     </s> occurs anywhere but the last position

sub CrossSentence {
  my ($ngram) = @_;
# warn "check CrossSentence |$ngram|\n";
  if (($ngram=~/ <s>/i) || ($ngram=~/<\/s> /i)) { 
#    warn "delete $ngram\n";
    return 1;
  }
  return 0;
}
