00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 using namespace std;
00024
00025 #include <iostream>
00026 #include <fstream>
00027 #include <vector>
00028 #include <string>
00029 #include <stdlib.h>
00030 #include <assert.h>
00031 #include "cmd.h"
00032 #include "math.h"
00033 #include "util.h"
00034
00035
00036
00037
00038
00039
00040
00041
/**
 * One value to be quantized, together with its bookkeeping data.
 *
 * `pt` must remain the first member: cmpFloatEntry() is handed pointers to
 * DataItem elements by qsort() and reads them as plain floats.
 */
struct DataItem {
  float pt;             ///< value to cluster (main() stores a log-probability or a back-off weight here)
  unsigned int idx;     ///< position of the entry before sorting, used to map codes back
  unsigned short code;  ///< cluster code assigned by ComputeCluster()
};
00047
00048
/**
 * qsort()-compatible comparator that orders entries by ascending float value.
 *
 * Both arguments are read as pointers to float; when used on DataItem arrays
 * this picks up the leading `pt` member.
 *
 * @return 1 if *a > *b, -1 if *a < *b, 0 otherwise.
 */
int cmpFloatEntry(const void* a,const void* b)
{
  const float lhs = *static_cast<const float*>(a);
  const float rhs = *static_cast<const float*>(b);

  // (greater) - (less) yields exactly 1, -1 or 0.
  return static_cast<int>(lhs > rhs) - static_cast<int>(lhs < rhs);
}
00058
00059
00060
00061
00062
00063 int ComputeCluster(int nc, double* cl,unsigned int N,DataItem* Pts);
00064
00065
00066
00067
00068
// Number of cluster centers main() assigns to every n-gram level.
int k = 256;

// Highest n-gram order supported; per-level arrays in main() are sized MAXLEV + 1.
const int MAXLEV = 11;
00071
00072
00073
00074
00075
00076 void print_help(int TypeFlag=0){
00077 std::cerr << std::endl << "quantize-lm - quantizes probabilities and back-off weights" << std::endl;
00078 std::cerr << std::endl << "USAGE:" << std::endl;
00079 std::cerr << " quantize-lm <input-file.lm> [<output-file.qlm> [<tmpfile>]]" << std::endl;
00080 std::cerr << std::endl << "DESCRIPTION:" << std::endl;
00081 std::cerr << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl;
00082 std::cerr << " a version of it with quantized probabilities and back-off weights"<< std::endl;
00083 std::cerr << " that the IRST LM toolkit can compile. Accepts LMs with .gz suffix." << std::endl;
00084 std::cerr << " You can specify the output file to be created and also the pathname" << std::endl;
00085 std::cerr << " of a temporary file used by the program. As default, the temporary " << std::endl;
00086 std::cerr << " file is created in the /tmp directory." << std::endl;
00087 std::cerr << " Output file can be written to standard output by using the special name -." << std::endl;
00088 std::cerr << std::endl << "OPTIONS:" << std::endl;
00089
00090 FullPrintParams(TypeFlag, 0, 1, stderr);
00091 }
00092
00093 void usage(const char *msg = 0)
00094 {
00095 if (msg){
00096 std::cerr << msg << std::endl;
00097 }
00098 else{
00099 print_help();
00100 }
00101 exit(1);
00102 }
00103
00104 int main(int argc, char **argv)
00105 {
00106
00107 std::vector<std::string> files;
00108
00109 bool help=false;
00110
00111 DeclareParams((char*)
00112 "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00113 "h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00114
00115 (char *)NULL
00116 );
00117
00118 if (argc == 1){
00119 usage();
00120 }
00121
00122 int first_file=1;
00123 for (int i=1; i < argc; i++) {
00124 if (strcmp(argv[i],"-") == 0){
00125 if (first_file == 1){
00126 files.push_back("/dev/stdin");
00127 }else if (first_file == 2){
00128 files.push_back("/dev/stdout");
00129 }else{
00130 usage("Warning: You can use the value for the input and/or output file only");
00131 }
00132 first_file++;
00133 }else if(argv[i][0] != '-'){
00134 files.push_back(argv[i]);
00135 first_file++;
00136 }
00137 }
00138
00139 GetParams(&argc, &argv, (char*) NULL);
00140
00141 if (help){
00142 usage();
00143 }
00144 if (files.size() > 3) {
00145 usage("Warning: Too many arguments");
00146 }
00147
00148 if (files.size() < 1) {
00149 usage("Warning: Please specify a LM file to read from");
00150 }
00151
00152 std::string infile = files[0];
00153 std::string outfile="";
00154 std::string tmpfile="";
00155
00156 if (files.size() == 1) {
00157
00158 outfile=infile;
00159
00160
00161 std::string::size_type p = outfile.rfind('/');
00162 if (p != std::string::npos && ((p+1) < outfile.size()))
00163 outfile.erase(0,p+1);
00164
00165
00166 if (outfile.compare(outfile.size()-3,3,".gz")==0)
00167 outfile.erase(outfile.size()-3,3);
00168
00169 outfile+=".qlm";
00170 } else
00171 outfile = files[1];
00172
00173
00174 if (files.size()==3) {
00175
00176 tmpfile = files[2];
00177 mfstream dummy(tmpfile.c_str(),ios::out);
00178 dummy.close();
00179 } else {
00180
00181 mfstream dummy;
00182 createtempfile(dummy,tmpfile,ios::out);
00183 dummy.close();
00184 }
00185
00186 std::cerr << "Reading " << infile << "..." << std::endl;
00187
00188 inputfilestream inp(infile.c_str());
00189 if (!inp.good()) {
00190 std::cerr << "Failed to open " << infile << "!\n";
00191 exit(1);
00192 }
00193
00194 std::ofstream* out;
00195 if (outfile == "-")
00196 out = (ofstream *)&std::cout;
00197 else {
00198 out=new std::ofstream;
00199 out->open(outfile.c_str());
00200 }
00201 if (!out->good()) {
00202 std::cerr << "Failed to open " << outfile << "!\n";
00203 exit(1);
00204 }
00205
00206 std::cerr << "Writing " << outfile << "..." << std::endl;
00207
00208
00209
00210
00211 std::cerr << "Using temporary file " << tmpfile << std::endl;
00212 fstream filebuff(tmpfile.c_str(),ios::out|ios::in|ios::binary);
00213
00214 unsigned int nPts = 0;
00215
00216
00217
00218 unsigned int numNgrams[MAXLEV + 1];
00219 int Order=0,MaxOrder=0;
00220 int n=0;
00221
00222 float logprob,logbow;
00223
00224 DataItem* dataPts;
00225
00226 double* centersP=NULL;
00227 double* centersB=NULL;
00228
00229
00230 unsigned short* mapP=NULL;
00231 unsigned short* mapB=NULL;
00232
00233 int centers[MAXLEV + 1];
00234 streampos iposition;
00235
00236 for (int i=1; i<=MAXLEV; i++) numNgrams[i]=0;
00237 for (int i=1; i<=MAXLEV; i++) centers[i]=k;
00238
00239
00240
00241 char line[MAX_LINE];
00242
00243 while (inp.getline(line,MAX_LINE)) {
00244
00245 bool backslash = (line[0] == '\\');
00246
00247 if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) {
00248 numNgrams[Order] = n;
00249 MaxOrder=Order;
00250 continue;
00251 }
00252
00253 if (!strncmp(line, "\\data\\", 6) || strlen(line)==0)
00254 continue;
00255
00256 if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) {
00257
00258
00259 if (Order == 1) {
00260 *out << "qARPA " << MaxOrder;
00261 for (int i=1; i<=MaxOrder; i++)
00262 *out << " " << centers[i];
00263 *out << "\n\n\\data\\\n";
00264
00265 for (int i=1; i<=MaxOrder; i++)
00266 *out << "ngram " << i << "= " << numNgrams[i] << "\n";
00267 }
00268
00269 *out << "\n";
00270 *out << line << "\n";
00271 cerr << "-- Start processing of " << Order << "-grams\n";
00272 assert(Order <= MAXLEV);
00273
00274 unsigned int N=numNgrams[Order];
00275
00276 const char* words[MAXLEV+3];
00277 dataPts=new DataItem[N];
00278
00279
00280 filebuff.seekg((streampos)0);
00281
00282 for (nPts=0; nPts<N; nPts++) {
00283 inp.getline(line,MAX_LINE);
00284 filebuff << line << std::endl;
00285 if (!filebuff.good()) {
00286 std::cerr << "Cannot write in temporary file " << tmpfile << std::endl
00287 << " Probably there is not enough space in this filesystem " << std::endl
00288 << " Eventually rerun quantize-lm by specifyng the pathname" << std::endl
00289 << " of the temporary file to be used. " << std::endl;
00290 removefile(tmpfile.c_str());
00291 exit(1);
00292 }
00293 int howmany = parseWords(line, words, Order + 3);
00294 assert(howmany == Order+2 || howmany == Order+1);
00295 sscanf(words[0],"%f",&logprob);
00296 dataPts[nPts].pt=logprob;
00297 dataPts[nPts].idx=nPts;
00298 }
00299
00300 cerr << "quantizing " << N << " probabilities\n";
00301
00302 centersP=new double[centers[Order]];
00303 mapP=new unsigned short[N];
00304
00305 ComputeCluster(centers[Order],centersP,N,dataPts);
00306
00307
00308 for (unsigned int p=0; p<N; p++) {
00309 mapP[dataPts[p].idx]=dataPts[p].code;
00310 }
00311
00312 if (Order<MaxOrder) {
00313
00314
00315 filebuff.seekg((streampos)0);
00316
00317 for (nPts=0; nPts<N; nPts++) {
00318
00319 filebuff.getline(line,MAX_LINE);
00320 int howmany = parseWords(line, words, Order + 3);
00321 if (howmany==Order+2)
00322 sscanf(words[Order+1],"%f",&logbow);
00323 else
00324 logbow=0;
00325
00326 dataPts[nPts].pt=logbow;
00327 dataPts[nPts].idx=nPts;
00328 }
00329
00330 centersB=new double[centers[Order]];
00331 mapB=new unsigned short[N];
00332
00333 cerr << "quantizing " << N << " backoff weights\n";
00334 ComputeCluster(centers[Order],centersB,N,dataPts);
00335
00336 for (unsigned int p=0; p<N; p++) {
00337 mapB[dataPts[p].idx]=dataPts[p].code;
00338 }
00339
00340 }
00341
00342
00343 *out << centers[Order] << "\n";
00344 for (int c=0; c<centers[Order]; c++) {
00345 *out << centersP[c];
00346 if (Order<MaxOrder) *out << " " << centersB[c];
00347 *out << "\n";
00348 }
00349
00350 filebuff.seekg(0);
00351
00352 for (nPts=0; nPts<numNgrams[Order]; nPts++) {
00353
00354 filebuff.getline(line,MAX_LINE);
00355
00356 parseWords(line, words, Order + 3);
00357
00358 *out << mapP[nPts];
00359
00360 for (int i=1; i<=Order; i++) *out << "\t" << words[i];
00361
00362 if (Order < MaxOrder) *out << "\t" << mapB[nPts];
00363
00364 *out << "\n";
00365
00366 }
00367
00368 if (mapP) {
00369 delete [] mapP;
00370 mapP=NULL;
00371 }
00372 if (mapB) {
00373 delete [] mapB;
00374 mapB=NULL;
00375 }
00376
00377 if (centersP) {
00378 delete [] centersP;
00379 centersP=NULL;
00380 }
00381 if (centersB) {
00382 delete [] centersB;
00383 centersB=NULL;
00384 }
00385
00386 delete [] dataPts;
00387
00388 continue;
00389
00390
00391 }
00392
00393 }
00394
00395 *out << "\\end\\\n";
00396 cerr << "---- done\n";
00397
00398 out->flush();
00399
00400 out->close();
00401 inp.close();
00402
00403 removefile(tmpfile.c_str());
00404 }
00405
00406
00407
00408 int ComputeCluster(int centers,double* ctrs,unsigned int N,DataItem* bintable)
00409 {
00410
00411
00412
00413 double log10=log(10.0);
00414
00415 for (unsigned int i=0; i<N; i++) bintable[i].code=0;
00416
00417
00418 qsort(bintable,N,sizeof(DataItem),cmpFloatEntry);
00419
00420 unsigned int different=1;
00421
00422 for (unsigned int i=1; i<N; i++)
00423 if (bintable[i].pt!=bintable[i-1].pt)
00424 different++;
00425
00426 unsigned int interval=different/centers;
00427 if (interval==0) interval++;
00428
00429 unsigned int* population=new unsigned int[centers];
00430 unsigned int* species=new unsigned int[centers];
00431
00432
00433
00434
00435 for (int i=0; i<centers; i++) {
00436 population[i]=species[i]=0;
00437 ctrs[i]=0;
00438 }
00439
00440
00441 bintable[0].code=0;
00442 population[0]=1;
00443 species[0]=1;
00444
00445 int currcode=0;
00446 different=1;
00447
00448 for (unsigned int i=1; i<N; i++) {
00449
00450 if ((bintable[i].pt!=bintable[i-1].pt)) {
00451 different++;
00452 if ((different % interval) == 0)
00453 if ((currcode+1) < centers
00454 &&
00455 population[currcode]>0) {
00456 currcode++;
00457 }
00458 }
00459
00460 if (bintable[i].pt == bintable[i-1].pt)
00461 bintable[i].code=bintable[i-1].code;
00462 else {
00463 bintable[i].code=currcode;
00464 species[currcode]++;
00465 }
00466
00467 population[bintable[i].code]++;
00468
00469 assert(bintable[i].code < centers);
00470
00471 ctrs[bintable[i].code]=ctrs[bintable[i].code]+exp(bintable[i].pt * log10);
00472
00473 }
00474
00475 for (int i=0; i<centers; i++) {
00476 if (population[i]>0)
00477 ctrs[i]=log(ctrs[i]/population[i])/log10;
00478 else
00479 ctrs[i]=-99;
00480
00481 if (ctrs[i]<-99) {
00482 cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n";
00483 ctrs[i]=-99;
00484 }
00485
00486 cerr << i << " ctr " << ctrs[i] << " population " << population[i] << " species " << species[i] <<"\n";
00487 }
00488
00489 cout.flush();
00490
00491 delete [] population;
00492 delete [] species;
00493
00494
00495 return 1;
00496
00497 }
00498
00499
00500
00501
00502
00503
00504
00505