#! /usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #merge prefix LMs into one single file use strict; use Getopt::Long "GetOptions"; use File::Basename; my ($help,$lm,$size,$sublm)=(); $help=1 unless &GetOptions('size=i' => \$size, 'lm=s' => \$lm, 'sublm=s' => \$sublm, 'h|help' => \$help,); if ($help || !$size || !$lm || !$sublm) { my $cmnd = basename($0); print "\n$cmnd - merge single LMs\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nOPTIONS:\n", " --size maximum n-gram size for the language model\n", " --sublm path identifying all input prefix sub LMs\n", " --lm name of the output LM file (will be gzipped)\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } my $gzip=`which gzip 2> /dev/null`; my $gunzip=`which gunzip 2> /dev/null`; chomp($gzip); chomp($gunzip); warn "merge-sublm.pl --size $size --sublm $sublm --lm $lm\n"; warn "Compute total sizes of n-grams\n"; my @size=(); #number of n-grams for each level my $tot1gr=0; #total frequency of 1-grams my $unk=0; #frequency of my $pr; #probability of 1-grams my (@files,$files); #sublm files for a given n-gram size for (my $n=1;$n<=$size;$n++){ @files=map { glob($_) } "${sublm}*.${n}gr*"; $files=join(" ",@files); $files || die "cannot find sublm files\n"; warn "join files $files\n"; if ($n==1){ open(INP,"$gunzip -c $files|") || die "cannot open $files\n"; while(my $line = ){ $size[$n]++; chomp($line); warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/; my @words = split(/ +/,$line); #cut down counts for sentence initial $words[0]=1 if $words[1]=~//; #there could be more independent words #generated by ngt with -sd option $size[$n]-- if $unk && $words[1] eq ""; $unk+=$words[0] if $words[1]=~//i; $tot1gr+=$words[0]; } close(INP); if ($unk==0){ warn "implicitely add word to counters\n"; $tot1gr+=$size[$n]; #equivalent to WB smoothing $size[$n]++; } }else{ for (my $j=0;$j wc$$") or die; open(INP,"wc$$") || die "cannot open wc$$\n"; my $wc = ; chomp($wc); $size[$n] += $wc; close(INP); unlink("wc$$"); } } warn "n:$n size:$size[$n] unk:$unk\n"; } warn "Merge all sub LMs\n"; $lm.=".gz" if $lm!~/.gz$/; open(LM,"|$gzip -c > $lm") || die "Cannot open $lm\n"; warn "Write LM Header\n"; printf LM "iARPA\n"; printf LM "\n\\data\\\n"; for (my $n=1;$n<=$size;$n++){ printf LM "ngram $n= $size[$n]\n"; } printf LM "\n\n"; close(LM); warn "Writing LM Tables\n"; for (my $n=1;$n<=$size;$n++){ warn "Level $n\n"; @files=map { glob($_) } "${sublm}*.${n}gr*"; $files=join(" ",@files); warn "input from: $files\n"; if ($n==1){ open(INP,"$gunzip -c $files|") || die "cannot open $files\n"; open(LM,"|$gzip -c >> $lm"); printf LM "\\$n-grams:\n"; while(my $line = ){ chomp($line); warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/; #lowercase some expressions of google n-grams $line=~s///g; $line=~s/<\/S>/<\/s>/g; $line=~s///g; my @words = split(/ +/,$line); #always print unk a the eqnd next if $words[1]=~//i; #cut down counts for sentence initial $words[0]=1 if $words[1]=~//i; #apply witten-bell smoothing on 1-grams $pr=(log($words[0]+1)-log($tot1gr+$size[1]))/log(10.0); shift @words; printf LM "%f %s\n",$pr,join(" ",@words); } close(INP); #print final #witten-bell smoothing of probability if ($unk){ $pr=(log($unk+1)-log($tot1gr+$size[1]))/log(10.0); }else{ $pr=(log($size[1]-1+1)-log($tot1gr+$size[1]))/log(10.0); } printf LM "%f \n",$pr; close(LM); }else{ open(LM,"|$gzip -c >> $lm"); printf LM "\\$n-grams:\n"; close(LM); for (my $j=0;$j> $lm") or die; } } } open(LM,"|$gzip -c >> $lm") || die "Cannot open $lm\n"; printf LM "\\end\\\n"; close(LM); sub safesystem { print STDERR "Executing: @_\n"; system(@_); if ($? == -1) { print STDERR "Failed to execute: @_\n $!\n"; exit(1); } elsif ($? & 127) { printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", ($? & 127), ($? & 128) ? 'with' : 'without'; exit(1); } else { my $exitcode = $? >> 8; print STDERR "Exit code: $exitcode\n" if $exitcode; return ! $exitcode; } }