00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 using namespace std;
00024
00025 #include <iostream>
00026 #include <fstream>
00027 #include <vector>
00028 #include <string>
00029 #include <stdlib.h>
00030 #include "cmd.h"
00031 #include "util.h"
00032 #include "math.h"
00033 #include "lmtable.h"
00034
00035
00036
00037
00038 void print_help(int TypeFlag=0){
00039 std::cerr << std::endl << "prune-lm - prunes language models" << std::endl;
00040 std::cerr << std::endl << "USAGE:" << std::endl;
00041 std::cerr << " prune-lm [options] <inputfile> [<outputfile>]" << std::endl;
00042 std::cerr << std::endl << "DESCRIPTION:" << std::endl;
00043 std::cerr << " prune-lm reads a LM in either ARPA or compiled format and" << std::endl;
00044 std::cerr << " prunes out n-grams (n=2,3,..) for which backing-off to the" << std::endl;
00045 std::cerr << " lower order n-gram results in a small difference in probability." << std::endl;
00046 std::cerr << " The pruned LM is saved in ARPA format" << std::endl;
00047 std::cerr << std::endl << "OPTIONS:" << std::endl;
00048
00049 FullPrintParams(TypeFlag, 0, 1, stderr);
00050 }
00051
00052 void usage(const char *msg = 0)
00053 {
00054 if (msg){
00055 std::cerr << msg << std::endl;
00056 }
00057 if (!msg){
00058 print_help();
00059 }
00060 exit(1);
00061 }
00062
00063 void s2t(string cps, float *thr)
00064 {
00065 int i;
00066 char *s=strdup(cps.c_str()), *tk;
00067
00068 thr[0]=0;
00069 for(i=1,tk=strtok(s, ","); tk; tk=strtok(0, ","),i++) thr[i]=atof(tk);
00070 for(; i<MAX_NGRAM; i++) thr[i]=thr[i-1];
00071 }
00072
00073 int main(int argc, char **argv)
00074 {
00075 float thr[MAX_NGRAM];
00076 char *spthr=NULL;
00077 int aflag=0;
00078 std::vector<std::string> files;
00079
00080 bool help=false;
00081
00082 DeclareParams((char*)
00083 "threshold", CMDSTRINGTYPE|CMDMSG, &spthr, "pruning thresholds for 2-grams, 3-grams, 4-grams,...; if less thresholds are specified, the last one is applied to all following n-gram levels; default is 0",
00084 "t", CMDSTRINGTYPE|CMDMSG, &spthr, "pruning thresholds for 2-grams, 3-grams, 4-grams,...; if less thresholds are specified, the last one is applied to all following n-gram levels; default is 0",
00085
00086 "abs", CMDBOOLTYPE|CMDMSG, &aflag, "uses absolute value of weighted difference; default is 0",
00087
00088 "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00089 "h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00090
00091 (char *)NULL
00092 );
00093
00094 if (argc == 1){
00095 usage();
00096 }
00097
00098 int first_file=1;
00099 for (int i=1; i < argc; i++) {
00100 if (strcmp(argv[i],"-") == 0){
00101 if (first_file == 1){
00102 files.push_back("/dev/stdin");
00103 }else if (first_file == 2){
00104 files.push_back("/dev/stdout");
00105 }else{
00106 usage("Warning: You can use the value for the input or output file only");
00107 }
00108 first_file++;
00109 }else if(argv[i][0] != '-'){
00110 files.push_back(argv[i]);
00111 first_file++;
00112 }
00113 }
00114
00115
00116 GetParams(&argc, &argv, (char*) NULL);
00117
00118 if (help){
00119 usage();
00120 }
00121
00122 if (files.size() > 2) {
00123 usage("Warning: Too many arguments");
00124 }
00125
00126 if (files.size() < 1) {
00127 usage("Warning: Specify a LM file to read from");
00128 }
00129
00130 memset(thr, 0, sizeof(thr));
00131 if(spthr != NULL) s2t(spthr, thr);
00132 std::string infile = files[0];
00133 std::string outfile= "";
00134
00135 if (files.size() == 1) {
00136 outfile=infile;
00137
00138
00139 std::string::size_type p = outfile.rfind('/');
00140 if (p != std::string::npos && ((p+1) < outfile.size()))
00141 outfile.erase(0,p+1);
00142
00143
00144 if (outfile.compare(outfile.size()-3,3,".gz")==0)
00145 outfile.erase(outfile.size()-3,3);
00146
00147 outfile+=".plm";
00148 } else
00149 outfile = files[1];
00150
00151 lmtable lmt;
00152 inputfilestream inp(infile.c_str());
00153 if (!inp.good()) {
00154 std::cerr << "Failed to open " << infile << "!" << std::endl;
00155 exit(1);
00156 }
00157
00158 lmt.load(inp,infile.c_str(),outfile.c_str(),0,NONE);
00159 std::cerr << "pruning LM with thresholds: \n";
00160
00161 for (int i=1; i<lmt.maxlevel(); i++) std::cerr<< " " << thr[i];
00162 std::cerr << "\n";
00163 lmt.wdprune((float*)thr, aflag);
00164 lmt.savetxt(outfile.c_str());
00165 return 0;
00166 }
00167