00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 using namespace std;
00024
00025 #include <iostream>
00026 #include <fstream>
00027 #include <vector>
00028 #include <string>
00029 #include <stdlib.h>
00030 #include "util.h"
00031 #include "math.h"
00032 #include "lmtable.h"
00033
00034
00035
// Raw command-line option values. handle_option() overwrites them with the
// strings given by the user; main() interprets them afterwards.
std::string sinvert("no");          // --invert|-i        : "yes" requests an inverted table
std::string stxt("no");             // --text|-t          : "yes" selects text output
std::string sscore("no");           // --score|-s         : "yes" scores n-grams read from stdin
std::string seval;                  // --eval|-e          : text file to evaluate (empty = off)
std::string srandcalls("0");        // --randcalls|-r     : number of random calls, parsed with atoi
std::string sfilter;                // --filter|-f        : word list used for filtering (empty = off)
std::string sdebug("0");            // --debug|-d         : verbosity level, parsed with atoi
std::string smemmap("0");           // --memmap|-mm|-m    : memory-map flag, parsed with atoi
std::string sdub("10000000");       // --dub              : dictionary upper bound, parsed with atoi
std::string skeepunigrams("yes");   // --keepunigrams|-ku : "yes" keeps all unigrams when filtering
std::string tmpdir;                 // --tmpdir           : exported as TMP when non-empty
00047
00048
00049 void usage(const char *msg = 0) {
00050
00051 if (msg) { std::cerr << msg << std::endl; }
00052 std::cerr << "Usage: compile-lm [options] input-file.lm [output-file.blm]" << std::endl;
00053 if (!msg) std::cerr << std::endl
00054 << " compile-lm reads a standard LM file in ARPA format and produces" << std::endl
00055 << " a compiled representation that the IRST LM toolkit can quickly" << std::endl
00056 << " read and process. LM file can be compressed with gzip." << std::endl << std::endl;
00057 std::cerr << "Options:\n"
00058 << "--text|-t [yes|no] (output is again in text format)" << std::endl
00059 << "--invert|-i [yes|no] (build an inverted n-gram binary table for fast access: default no)" << std::endl
00060 << "--filter|-f wordlist (filter a binary language model with a word list)"<< std::endl
00061 << "--keepunigrams|-ku [yes|no] (filter by keeping all unigrams in the table: default yes)"<< std::endl
00062 << "--eval|-e text-file (computes perplexity of text-file and returns)"<< std::endl
00063 << "--randcalls|-r N (computes N random calls on the eval text-file)"<< std::endl
00064 << "--dub dict-size (dictionary upperbound to compute OOV word penalty: default 10^7)"<< std::endl
00065 << "--score|-s [yes|no] (computes log-prob scores from standard input)"<< std::endl
00066 << "--debug|-d 1 (verbose output for --eval option)"<< std::endl
00067 << "--memmap|-mm 1 (uses memory map to read a binary LM)"<< std::endl
00068 << "--tmpdir directory (directory for temporary computation, default is either the environment variable TMP if defined or \"/tmp\")"
00069 << std::endl;
00070 }
00071
// Tells whether command-line token `s` names the option `pre`.
//
// Despite the name this is not a plain prefix test: it returns true only when
// `s` is exactly `pre`, or when `s` begins with `pre` immediately followed by
// '=' (the "--opt=value" form). "--textual" therefore does not match "--text".
bool starts_with(const std::string &s, const std::string &pre) {
  const std::string::size_type name_len = pre.size();

  // s must begin with the option name (a shorter s can never compare equal)
  if (s.compare(0, name_len, pre) != 0) return false;

  // either nothing follows the name, or the value separator does
  return s.size() == name_len || s[name_len] == '=';
}
00080
00081 std::string get_param(const std::string& opt, int argc, const char **argv, int& argi)
00082 {
00083 std::string::size_type equals = opt.find_first_of('=');
00084 if (equals != std::string::npos && equals < opt.size()-1) {
00085 return opt.substr(equals+1);
00086 }
00087 std::string nexto;
00088 if (argi + 1 < argc) {
00089 nexto = argv[++argi];
00090 } else {
00091 usage((opt + " requires a value!").c_str());
00092 exit(1);
00093 }
00094 return nexto;
00095 }
00096
00097 void handle_option(const std::string& opt, int argc, const char **argv, int& argi)
00098 {
00099 if (opt == "--help" || opt == "-h") { usage(); exit(1); }
00100
00101 if (starts_with(opt, "--text") || starts_with(opt, "-t"))
00102 stxt = get_param(opt, argc, argv, argi);
00103 else
00104 if (starts_with(opt, "--filter") || starts_with(opt, "-f"))
00105 sfilter = get_param(opt, argc, argv, argi);
00106 else
00107 if (starts_with(opt, "--keepunigrams") || starts_with(opt, "-ku"))
00108 skeepunigrams = get_param(opt, argc, argv, argi);
00109 else
00110 if (starts_with(opt, "--eval") || starts_with(opt, "-e"))
00111 seval = get_param(opt, argc, argv, argi);
00112 else
00113 if (starts_with(opt, "--randcalls") || starts_with(opt, "-r"))
00114 srandcalls = get_param(opt, argc, argv, argi);
00115 else
00116 if (starts_with(opt, "--score") || starts_with(opt, "-s"))
00117 sscore = get_param(opt, argc, argv, argi);
00118 else
00119 if (starts_with(opt, "--debug") || starts_with(opt, "-d"))
00120 sdebug = get_param(opt, argc, argv, argi);
00121 else
00122 if (starts_with(opt, "--memmap") || starts_with(opt, "-mm") || starts_with(opt, "-m"))
00123 smemmap = get_param(opt, argc, argv, argi);
00124 else
00125 if (starts_with(opt, "--dub") || starts_with(opt, "-dub"))
00126 sdub = get_param(opt, argc, argv, argi);
00127 else
00128 if (starts_with(opt, "--tmpdir") || starts_with(opt, "-tmpdir"))
00129 tmpdir = get_param(opt, argc, argv, argi);
00130 else
00131 if (starts_with(opt, "--invert") || starts_with(opt, "-i"))
00132 sinvert = get_param(opt, argc, argv, argi);
00133
00134 else {
00135 usage(("Don't understand option " + opt).c_str());
00136 exit(1);
00137 }
00138 }
00139
00140 int main(int argc, const char **argv)
00141 {
00142
00143 if (argc < 2) { usage(); exit(1); }
00144 std::vector<std::string> files;
00145 for (int i=1; i < argc; i++) {
00146 std::string opt = argv[i];
00147 if (opt[0] == '-') { handle_option(opt, argc, argv, i); }
00148 else files.push_back(opt);
00149 }
00150
00151 if (files.size() > 2) { usage("Too many arguments"); exit(1); }
00152 if (files.size() < 1) { usage("Please specify a LM file to read from"); exit(1); }
00153
00154 bool textoutput = (stxt == "yes"? true : false);
00155 bool invert = (sinvert == "yes"? true : false);
00156
00157
00158 OUTFILE_TYPE outtype;
00159 if (textoutput)
00160 outtype=TEXT;
00161 else if (seval != "" || sscore == "yes")
00162 outtype=NONE;
00163 else
00164 outtype=BINARY;
00165
00166
00167 int debug = atoi(sdebug.c_str());
00168 int memmap = atoi(smemmap.c_str());
00169 int dub = atoi(sdub.c_str());
00170 int randcalls = atoi(srandcalls.c_str());
00171
00172 std::string infile = files[0];
00173 std::string outfile="";
00174
00175 if (files.size() == 1) {
00176 outfile=infile;
00177
00178
00179 std::string::size_type p = outfile.rfind('/');
00180 if (p != std::string::npos && ((p+1) < outfile.size()))
00181 outfile.erase(0,p+1);
00182
00183
00184 if (outfile.compare(outfile.size()-3,3,".gz")==0)
00185 outfile.erase(outfile.size()-3,3);
00186
00187 outfile+=(textoutput?".lm":".blm");
00188 }
00189 else
00190 outfile = files[1];
00191
00192 std::cerr << "inpfile: " << infile << std::endl;
00193 if (sscore=="" && seval=="") std::cerr << "outfile: " << outfile << std::endl;
00194 if (sscore=="") std::cerr << "interactive: " << sscore << std::endl;
00195 if (memmap) std::cerr << "memory mapping: " << memmap << std::endl;
00196 std::cerr << "dub: " << dub<< std::endl;
00197 if (tmpdir != ""){
00198 if (setenv("TMP",tmpdir.c_str(),1))
00199 std::cerr << "temporary directory has not been set" << std::endl;
00200 std::cerr << "tmpdir: " << tmpdir << std::endl;
00201 }
00202
00203 lmtable* lmt=new lmtable();
00204
00205
00206 if (invert) lmt->is_inverted(invert);
00207
00208 std::cerr << "Reading " << infile << "..." << std::endl;
00209 inputfilestream inp(infile.c_str());
00210
00211 if (!inp.good()) {
00212 std::cerr << "Failed to open " << infile << "!" << std::endl;
00213 exit(1);
00214 }
00215
00216 if (sfilter != ""){
00217 std::cerr << "loading filtered version of LM ... \n";
00218 lmt->load(inp,infile.c_str(),outfile.c_str(),memmap=1,outtype);
00219 dictionary *dict; dict=new dictionary((char *)sfilter.c_str());
00220 lmtable* sublmt; sublmt=lmt->cpsublm(dict,(skeepunigrams=="yes"));
00221 delete lmt; lmt=sublmt;
00222 delete dict;
00223 std::cerr << "...done\n";
00224 }
00225 else{
00226 lmt->load(inp,infile.c_str(),outfile.c_str(),memmap,outtype);
00227
00228 }
00229
00230
00231 if (debug)
00232 std::cout << "lmtable has " << (lmt->is_inverted()?"inverted":"direct") << " ngrams\n";
00233
00234 if (dub) lmt->setlogOOVpenalty((int)dub);
00235
00236 if (seval != "")
00237
00238 if (randcalls>0){
00239
00240 cerr << "perform random " << randcalls << " using dictionary of test set\n";
00241 dictionary *dict; dict=new dictionary((char *)seval.c_str());
00242
00243
00244 int histo[dict->totfreq()];
00245 int totfreq=0;
00246 for (int n=0;n<dict->size();n++)
00247 for (int m=0;m<dict->freq(n);m++)
00248 histo[totfreq++]=n;
00249
00250 ngram ng(lmt->dict);
00251 srand(1234);
00252 double bow; int bol=0;
00253
00254 if (debug>1) ResetUserTime();
00255
00256 for (int n=0;n<randcalls;n++){
00257
00258 int w=histo[rand() % totfreq];
00259
00260
00261 ng.pushc(lmt->dict->encode(dict->decode(w)));
00262 lmt->lprob(ng,&bow,&bol);
00263
00264 if (debug==1){
00265 std::cout << ng.dict->decode(*ng.wordp(1)) << "[" << lmt->maxlevel()-bol << "]" << " ";
00266 std::cout << std::endl;
00267 }
00268
00269 if ((n % 100000)==0){
00270 std::cerr << ".";
00271
00272 }
00273 }
00274 std::cerr << "\n";
00275 if (debug>1) PrintUserTime("Finished in");
00276 if (debug>1) lmt->stat();
00277
00278 delete lmt;
00279 return 0;
00280
00281 }
00282 else
00283 {
00284 std::cerr << "Start Eval" << std::endl;
00285 std::cerr << "OOV code: " << lmt->dict->oovcode() << std::endl;
00286 ngram ng(lmt->dict);
00287 std::cout.setf(ios::fixed);
00288 std::cout.precision(2);
00289
00290 if (debug>0) std::cout.precision(8);
00291 std::fstream inptxt(seval.c_str(),std::ios::in);
00292
00293 int Nbo=0,Nw=0,Noov=0;
00294 double logPr=0,PP=0,PPwp=0,Pr;
00295
00296 ng.dict->incflag(1);
00297 int bos=ng.dict->encode(ng.dict->BoS());
00298 int eos=ng.dict->encode(ng.dict->EoS());
00299 ng.dict->incflag(0);
00300
00301
00302 #ifdef TRACE_CACHE
00303 lmt.init_probcache();
00304 #endif
00305 double bow; int bol=0; char *msp; unsigned int statesize;
00306 while(inptxt >> ng){
00307
00308 if (ng.size>lmt->maxlevel()) ng.size=lmt->maxlevel();
00309
00310
00311
00312 if (*ng.wordp(1)==bos) {ng.size=1;continue;}
00313
00314 if (ng.size>=1){
00315 logPr+=(Pr=lmt->lprob(ng,&bow,&bol,&msp,&statesize));
00316
00317 if (debug==1){
00318 std::cout << ng.dict->decode(*ng.wordp(1)) << "[" << ng.size-bol << "]" << " ";
00319 if (*ng.wordp(1)==eos) std::cout << std::endl;
00320 }
00321 if (debug==2)
00322 std::cout << ng << "[" << ng.size-bol << "-gram]" << " " << Pr << std::endl;
00323
00324 if (debug==3)
00325 std::cout << ng << "[" << ng.size-bol << "-gram]" << " " << Pr << " bow:" << bow << std::endl;
00326
00327 if (debug==4){
00328 std::cout << ng << "[" << ng.size-bol << "-gram: recombine:" << statesize << "]" << " " << Pr << " bow:" << bow << "\n";
00329 }
00330 if (debug>4){
00331 std::cout << ng << "[" << ng.size-bol << "-gram: recombine:" << statesize << "]" << " " << Pr << " bow:" << bow;
00332 double totp=0.0; int oldw=*ng.wordp(1);
00333 double oovp=lmt->getlogOOVpenalty();lmt->setlogOOVpenalty2(0);
00334
00335 for (int c=0;c<ng.dict->size();c++){
00336 *ng.wordp(1)=c;totp+=pow(10.0,lmt->lprob(ng));
00337 }
00338 *ng.wordp(1)=oldw;
00339
00340 if ( totp < (1.0 - 1e-5) || totp > (1.0 + 1e-5))
00341 std::cout << " [t=" << totp << "] POSSIBLE ERROR\n";
00342 else
00343 std::cout << "\n";
00344
00345 lmt->setlogOOVpenalty2((double)oovp);
00346 }
00347
00348 if (*ng.wordp(1) == lmt->dict->oovcode()) Noov++;
00349 if (bol) Nbo++;
00350 Nw++;
00351 }
00352 }
00353
00354 PP=exp((-logPr * log(10.0)) /Nw);
00355
00356 PPwp= PP * (1 - 1/exp((Noov * lmt->getlogOOVpenalty()) * log(10.0) / Nw));
00357
00358 std::cout << "%% Nw=" << Nw << " PP=" << PP << " PPwp=" << PPwp
00359 << " Nbo=" << Nbo << " Noov=" << Noov
00360 << " OOV=" << (float)Noov/Nw * 100.0 << "%" << std::endl;
00361
00362 if (debug>1) lmt->stat();
00363
00364 delete lmt;
00365 return 0;
00366 };
00367
00368
00369 if (sscore == "yes"){
00370
00371 ngram ng(lmt->dict);
00372 std::cout.setf(ios::scientific);
00373 std::cout << "> ";
00374
00375
00376
00377
00378
00379 unsigned int n=0; int bol; double bow;
00380 while(std::cin >> ng){
00381
00382 if (ng.size>=lmt->maxlevel()){
00383 ng.size=lmt->maxlevel();
00384 ++n;
00385 std::cout << ng << " p= " << lmt->lprob(ng,&bow,&bol) * M_LN10;
00386
00387 std::cout << " bo= " << bol << std::endl;
00388 if ((n % 10000000)==0){
00389 std::cerr << "check cache levels" << std::endl;
00390 lmt->check_cache_levels();
00391 }
00392
00393 }
00394 else
00395 std::cout << ng << " p= NULL" << std::endl;
00396 std::cout << "> ";
00397 }
00398 if (debug>1) lmt->stat();
00399 delete lmt;
00400 return 0;
00401 }
00402
00403 if (textoutput) {
00404 std::cout << "Saving in txt format to " << outfile << std::endl;
00405 lmt->savetxt(outfile.c_str());
00406 } else if (sfilter != "" || !memmap) {
00407 std::cout << "Saving in bin format to " << outfile << std::endl;
00408 lmt->savebin(outfile.c_str());
00409 }
00410 delete lmt;
00411 return 0;
00412 }
00413