00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 using namespace std;
00024
00025 #include <iostream>
00026 #include <fstream>
00027 #include <vector>
00028 #include <string>
00029 #include <stdlib.h>
00030 #include <assert.h>
00031 #include "math.h"
00032 #include "util.h"
00033
00034 #define MAX_LINE 1024
00035
00036
00037
00038
00039
00040
00041
00042 typedef struct{
00043 float pt;
00044 unsigned int idx;
00045 unsigned short code;
00046 }DataItem;
00047
00048
00049 int cmpFloatEntry(const void* a,const void* b){
00050 if (*(float *)a > *(float*)b)
00051 return 1;
00052 else if (*(float *)a < *(float *)b)
00053 return -1;
00054 else
00055 return 0;
00056 }
00057
00058
00059
00060
00061
00062 int parseWords(char *sentence, const char **words, int max);
00063
00064 int ComputeCluster(int nc, double* cl,unsigned int N,DataItem* Pts);
00065
00066
00067
00068
00069
00070 int k = 256;
00071 const int MAXLEV = 11;
00072
00073
00074
00075
00076
00077 void usage(const char *msg = 0) {
00078 if (msg) { std::cerr << msg << std::endl; }
00079 std::cerr << "Usage: quantize-lm input-file.lm [output-file.qlm [tmpfile]] " << std::endl;
00080 if (!msg) std::cerr << std::endl
00081 << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl
00082 << " a version of it with quantized probabilities and back-off weights"<< std::endl
00083 << " that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." << std::endl
00084 << " You can specify the output file to be created and also the pathname " << std::endl
00085 << " of a temporary file used by the program. As default, the temporary " << std::endl
00086 << " file is created in the /tmp directory. Output file can be " << std::endl
00087 << " written to standard output by using the special name -. " << std::endl;
00088 }
00089
00090
00091 int main(int argc, const char **argv)
00092 {
00093
00094
00095
00096 if (argc < 2) { usage(); exit(1); }
00097 std::vector<std::string> files;
00098 for (int i=1; i < argc; i++) {
00099 std::string opt = argv[i];
00100 files.push_back(opt);
00101 }
00102 if (files.size() > 3) { usage("Too many arguments"); exit(1); }
00103 if (files.size() < 1) { usage("Please specify a LM file to read from"); exit(1); }
00104
00105
00106 std::string infile = files[0];
00107 std::string outfile="";
00108 std::string tmpfile="";
00109
00110 if (files.size() == 1){
00111
00112 outfile=infile;
00113
00114
00115 std::string::size_type p = outfile.rfind('/');
00116 if (p != std::string::npos && ((p+1) < outfile.size()))
00117 outfile.erase(0,p+1);
00118
00119
00120 if (outfile.compare(outfile.size()-3,3,".gz")==0)
00121 outfile.erase(outfile.size()-3,3);
00122
00123 outfile+=".qlm";
00124 }
00125 else
00126 outfile = files[1];
00127
00128
00129 if (files.size()==3){
00130
00131 tmpfile = files[2];
00132 ofstream dummy(tmpfile.c_str(),ios::out);
00133 dummy.close();
00134 }
00135 else{
00136
00137 ofstream dummy;
00138 createtempfile(dummy,tmpfile,ios::out);
00139 dummy.close();
00140 }
00141
00142 std::cerr << "Reading " << infile << "..." << std::endl;
00143
00144 inputfilestream inp(infile.c_str());
00145 if (!inp.good()) {
00146 std::cerr << "Failed to open " << infile << "!\n";
00147 exit(1);
00148 }
00149
00150
00151 std::ofstream* out;
00152 if (outfile == "-")
00153 out = (ofstream *)&std::cout;
00154 else{
00155 out=new std::ofstream;
00156 out->open(outfile.c_str());
00157 }
00158
00159 std::cerr << "Writing " << outfile << "..." << std::endl;
00160
00161
00162
00163
00164 std::cerr << "Using temporary file " << tmpfile << std::endl;
00165 fstream filebuff(tmpfile.c_str(),ios::out|ios::in|ios::binary);
00166
00167 unsigned int nPts = 0;
00168
00169
00170
00171 unsigned int numNgrams[MAXLEV + 1];
00172 int Order=0,MaxOrder=0;
00173 int n=0;
00174
00175 float logprob,logbow;
00176
00177 DataItem* dataPts;
00178
00179 double* centersP=NULL;
00180 double* centersB=NULL;
00181
00182
00183 unsigned short* mapP=NULL; unsigned short* mapB=NULL;
00184
00185 int centers[MAXLEV + 1];
00186 streampos iposition;
00187
00188 for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0;
00189 for (int i=1;i<=MAXLEV;i++) centers[i]=k;
00190
00191
00192
00193 char line[MAX_LINE];
00194
00195 while (inp.getline(line,MAX_LINE)){
00196
00197 bool backslash = (line[0] == '\\');
00198
00199 if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) {
00200 numNgrams[Order] = n;
00201 MaxOrder=Order;
00202 continue;
00203 }
00204
00205 if (!strncmp(line, "\\data\\", 6) || strlen(line)==0)
00206 continue;
00207
00208 if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) {
00209
00210
00211 if (Order == 1) {
00212 *out << "qARPA " << MaxOrder;
00213 for (int i=1;i<=MaxOrder;i++)
00214 *out << " " << centers[i];
00215 *out << "\n\n\\data\\\n";
00216
00217 for (int i=1;i<=MaxOrder;i++)
00218 *out << "ngram " << i << "= " << numNgrams[i] << "\n";
00219 }
00220
00221 *out << "\n";
00222 *out << line << "\n";
00223 cerr << "-- Start processing of " << Order << "-grams\n";
00224 assert(Order <= MAXLEV);
00225
00226 unsigned int N=numNgrams[Order];
00227
00228 const char* words[MAXLEV+3];
00229 dataPts=new DataItem[N];
00230
00231
00232 filebuff.seekg((streampos)0);
00233
00234 for (nPts=0;nPts<N;nPts++){
00235 inp.getline(line,MAX_LINE);
00236 filebuff << line << std::endl;
00237 if (!filebuff.good()){
00238 std::cerr << "Cannot write in temporary file " << tmpfile << std::endl
00239 << " Probably there is not enough space in this filesystem " << std::endl
00240 << " Eventually rerun quantize-lm by specifyng the pathname" << std::endl
00241 << " of the temporary file to be used. " << std::endl;
00242 removefile(tmpfile.c_str());
00243 exit(1);
00244 }
00245 int howmany = parseWords(line, words, Order + 3);
00246 assert(howmany == Order+2 || howmany == Order+1);
00247 sscanf(words[0],"%f",&logprob);
00248 dataPts[nPts].pt=logprob;
00249 dataPts[nPts].idx=nPts;
00250 }
00251
00252 cerr << "quantizing " << N << " probabilities\n";
00253
00254 centersP=new double[centers[Order]];
00255 mapP=new unsigned short[N];
00256
00257 ComputeCluster(centers[Order],centersP,N,dataPts);
00258
00259
00260 for (unsigned int p=0;p<N;p++){
00261 mapP[dataPts[p].idx]=dataPts[p].code;
00262 }
00263
00264 if (Order<MaxOrder){
00265
00266
00267 filebuff.seekg((streampos)0);
00268
00269 for (nPts=0;nPts<N;nPts++){
00270
00271 filebuff.getline(line,MAX_LINE);
00272 int howmany = parseWords(line, words, Order + 3);
00273 if (howmany==Order+2)
00274 sscanf(words[Order+1],"%f",&logbow);
00275 else
00276 logbow=0;
00277
00278 dataPts[nPts].pt=logbow;
00279 dataPts[nPts].idx=nPts;
00280 }
00281
00282 centersB=new double[centers[Order]];
00283 mapB=new unsigned short[N];
00284
00285 cerr << "quantizing " << N << " backoff weights\n";
00286 ComputeCluster(centers[Order],centersB,N,dataPts);
00287
00288 for (unsigned int p=0;p<N;p++){
00289 mapB[dataPts[p].idx]=dataPts[p].code;
00290 }
00291
00292 }
00293
00294
00295 *out << centers[Order] << "\n";
00296 for (int c=0;c<centers[Order];c++){
00297 *out << centersP[c];
00298 if (Order<MaxOrder) *out << " " << centersB[c];
00299 *out << "\n";
00300 }
00301
00302 filebuff.seekg(0);
00303
00304 for (nPts=0;nPts<numNgrams[Order];nPts++){
00305
00306 filebuff.getline(line,MAX_LINE);
00307
00308 parseWords(line, words, Order + 3);
00309
00310 *out << mapP[nPts];
00311
00312 for (int i=1;i<=Order;i++) *out << "\t" << words[i];
00313
00314 if (Order < MaxOrder) *out << "\t" << mapB[nPts];
00315
00316 *out << "\n";
00317
00318 }
00319
00320 if (mapP){delete [] mapP;mapP=NULL;}
00321 if (mapB){delete [] mapB;mapB=NULL;}
00322
00323 if (centersP){delete [] centersP; centersP=NULL;}
00324 if (centersB){delete [] centersB; centersB=NULL;}
00325
00326 delete [] dataPts;
00327
00328 continue;
00329
00330
00331 }
00332
00333 }
00334
00335 *out << "\\end\\\n";
00336 cerr << "---- done\n";
00337
00338 out->flush();
00339
00340 out->close();
00341 inp.close();
00342
00343 removefile(tmpfile.c_str());
00344 }
00345
00346
00347
00348 int ComputeCluster(int centers,double* ctrs,unsigned int N,DataItem* bintable){
00349
00350
00351
00352 double log10=log(10.0);
00353
00354 for (unsigned int i=0;i<N;i++) bintable[i].code=0;
00355
00356
00357 qsort(bintable,N,sizeof(DataItem),cmpFloatEntry);
00358
00359 unsigned int different=1;
00360
00361 for (unsigned int i=1;i<N;i++)
00362 if (bintable[i].pt!=bintable[i-1].pt)
00363 different++;
00364
00365 unsigned int interval=different/centers;
00366 if (interval==0) interval++;
00367
00368 unsigned int* population=new unsigned int[centers];
00369 unsigned int* species=new unsigned int[centers];
00370
00371
00372
00373
00374 for (int i=0;i<centers;i++){
00375 population[i]=species[i]=0;
00376 ctrs[i]=0;
00377 }
00378
00379
00380 bintable[0].code=0;
00381 population[0]=1;
00382 species[0]=1;
00383
00384 int currcode=0;
00385 different=1;
00386
00387 for (unsigned int i=1;i<N;i++){
00388
00389 if ((bintable[i].pt!=bintable[i-1].pt)){
00390 different++;
00391 if ((different % interval) == 0)
00392 if ((currcode+1) < centers
00393 &&
00394 population[currcode]>0){
00395 currcode++;
00396 }
00397 }
00398
00399 if (bintable[i].pt == bintable[i-1].pt)
00400 bintable[i].code=bintable[i-1].code;
00401 else{
00402 bintable[i].code=currcode;
00403 species[currcode]++;
00404 }
00405
00406 population[bintable[i].code]++;
00407
00408 assert(bintable[i].code < centers);
00409
00410 ctrs[bintable[i].code]=ctrs[bintable[i].code]+exp(bintable[i].pt * log10);
00411
00412 }
00413
00414 for (int i=0;i<centers;i++){
00415 if (population[i]>0)
00416 ctrs[i]=log(ctrs[i]/population[i])/log10;
00417 else
00418 ctrs[i]=-99;
00419
00420 if (ctrs[i]<-99){
00421 cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n";
00422 ctrs[i]=-99;
00423 }
00424
00425 cerr << i << " ctr " << ctrs[i] << " population " << population[i] << " species " << species[i] <<"\n";
00426 }
00427
00428 cout.flush();
00429
00430 delete [] population;
00431 delete [] species;
00432
00433
00434 return 1;
00435
00436 }
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446 int parseWords(char *sentence, const char **words, int max)
00447 {
00448 const char *word;
00449 int i = 0;
00450
00451 const char *const wordSeparators = " \t\r\n";
00452
00453 for (word = strtok(sentence, wordSeparators);
00454 i < max && word != 0;
00455 i++, word = strtok(0, wordSeparators))
00456 {
00457 words[i] = word;
00458 }
00459 if (i < max) {
00460 words[i] = 0;
00461 }
00462
00463 return i;
00464 }
00465
00466