00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <cstdlib>
00024 #include <stdlib.h>
00025 #include <fcntl.h>
00026 #include <iostream>
00027 #include <fstream>
00028 #include <stdexcept>
00029 #include <string>
00030 #include <set>
00031 #include <cassert>
00032 #include <limits>
00033 #include "math.h"
00034 #include "mempool.h"
00035 #include "htable.h"
00036 #include "ngramcache.h"
00037 #include "dictionary.h"
00038 #include "n_gram.h"
00039 #include "lmtable.h"
00040
00041 #include "util.h"
00042
// Compile-time debug switch for this translation unit (0 = disabled).
00043 #define DEBUG 0
00044
00045
// Sentinel probability value: get() reports an n-gram as not found when its
// stored probability equals NOPROB, and dumplm() skips such entries when the
// table is flagged as pruned.
00046 #define NOPROB ((float)-1.329227995784915872903807060280344576e36)
00047
00048 using namespace std;
00049
// Reports a fatal condition: the message is echoed on stderr (followed by a
// newline) and then raised as a std::runtime_error carrying the same text.
inline void error(const char* message)
{
  std::cerr << message << '\n';
  throw std::runtime_error(message);
}
00054
00055
00056
00057 lmtable::lmtable(){
00058
00059 configure(1,false);
00060
00061 dict=new dictionary((char *)NULL,1000000);
00062
00063 memset(cursize, 0, sizeof(cursize));
00064 memset(tbltype, 0, sizeof(tbltype));
00065 memset(maxsize, 0, sizeof(maxsize));
00066 memset(info, 0, sizeof(info));
00067 memset(NumCenters, 0, sizeof(NumCenters));
00068
00069 max_cache_lev=0;
00070 for (int i=0;i<=LMTMAXLEV+1;i++) lmtcache[i]=NULL;
00071
00072 probcache=NULL;
00073 statecache=NULL;
00074 statesizecache=NULL;
00075
00076 memmap=0;
00077
00078 isPruned=false;
00079 isInverted=false;
00080
00081
00082 for (int i=0;i<=LMTMAXLEV+1;i++) totget[i]=totbsearch[i]=0;
00083
00084 logOOVpenalty=0.0;
00085
00086
00087 setOrderQuery(false);
00088 };
00089
00090 void lmtable::init_probcache(){
00091 assert(probcache==NULL);
00092 probcache=new ngramcache(maxlev,sizeof(double),400000);
00093 #ifdef TRACE_CACHE
00094 cacheout=new std::fstream(get_temp_folder()++"tracecache",std::ios::out);
00095 sentence_id=0;
00096 #endif
00097 }
00098
00099 void lmtable::init_statecache(){
00100 assert(statecache==NULL);
00101 if(maxlev > 1){
00102 statecache=new ngramcache(maxlev-1,sizeof(char *),200000);
00103 statesizecache=new ngramcache(maxlev-1,sizeof(int),200000);
00104 }
00105 }
00106
00107 void lmtable::init_lmtcaches(int uptolev){
00108 max_cache_lev=uptolev;
00109 for (int i=2;i<=max_cache_lev;i++){
00110 assert(lmtcache[i]==NULL);
00111 lmtcache[i]=new ngramcache(i,sizeof(char *),200000);
00112 }
00113 }
00114
00115 void lmtable::check_cache_levels(){
00116 if (probcache && probcache->isfull()) probcache->reset(probcache->cursize());
00117 if (statecache && statecache->isfull()) {
00118 statecache->reset(statecache->cursize());
00119 statesizecache->reset(statesizecache->cursize());
00120 }
00121 for (int i=2;i<=max_cache_lev;i++)
00122 if (lmtcache[i]->isfull()) lmtcache[i]->reset(lmtcache[i]->cursize());
00123 }
00124
00125 void lmtable::reset_caches(){
00126 if (probcache) probcache->reset(MAX(probcache->cursize(),probcache->maxsize()));
00127 if (statecache){
00128 statecache->reset(MAX(statecache->cursize(),statecache->maxsize()));
00129 statesizecache->reset(MAX(statesizecache->cursize(),statesizecache->maxsize()));
00130 }
00131 for (int i=2;i<=max_cache_lev;i++)
00132 lmtcache[i]->reset(MAX(lmtcache[i]->cursize(),lmtcache[i]->maxsize()));
00133 }
00134
00135 void lmtable::configure(int n,bool quantized){
00136 maxlev=n;
00137 if (n==1)
00138 tbltype[1]=(quantized?QLEAF:LEAF);
00139 else{
00140 for (int i=1;i<n;i++) tbltype[i]=(quantized?QINTERNAL:INTERNAL);
00141 tbltype[n]=(quantized?QLEAF:LEAF);
00142 }
00143 }
00144
00145
00146
00147
00148
00149 void lmtable::load(istream& inp,const char* filename,const char* outfilename,int keep_on_disk,OUTFILE_TYPE outtype){
00150
00151 #ifdef WIN32
00152 if (keep_on_disk>0){
00153 std::cerr << "lmtable::load memory mapping not yet available under WIN32\n";
00154 keep_on_disk = 0;
00155 }
00156 #endif
00157
00158
00159 char header[MAX_LINE];
00160 inp >> header; cerr << header << "\n";
00161
00162 if (strncmp(header,"Qblmt",5)==0 || strncmp(header,"blmt",4)==0){
00163 loadbin(inp,header,filename,keep_on_disk);
00164 }
00165 else{
00166
00167 if (keep_on_disk && outfilename==NULL) {
00168 cerr << "Load Error: inconsistent setting. Passed input file: textual. Memory map: yes. Outfilename: not specified.\n";
00169 exit(0);
00170 }
00171
00172 loadtxt(inp,header,outfilename,keep_on_disk);
00173 }
00174
00175 cerr << "OOV code is " << lmtable::getDict()->oovcode() << "\n";
00176 }
00177
00178
00179
00180
00181 int lmtable::reload(std::set<string> words){
00182
00183
00184
00185 dictionary dict(NULL,(int)words.size()); dict.incflag(1);
00186
00187 std::set<string>::iterator w;
00188 for (w = words.begin(); w != words.end(); ++w);
00189 dict.encode((*w).c_str());
00190
00191
00192
00193 return 1;
00194 }
00195
// Splits `sentence` in place into whitespace-separated tokens.
//
// sentence  NUL-terminated buffer; it is modified by strtok()
// words     receives pointers into `sentence`, one per token
// max       capacity of `words`; at most `max` tokens are stored
//
// Returns the number of tokens stored. When fewer than `max` tokens were
// stored, words[count] is set to 0 as an end marker.
int parseWords(char *sentence, const char **words, int max)
{
  const char *const separators = " \t\r\n";

  int count = 0;
  char *token = strtok(sentence, separators);

  while (count < max && token != 0) {
    words[count] = token;
    ++count;
    token = strtok(0, separators);
  }

  if (count < max) {
    words[count] = 0;
  }

  return count;
}
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225 int parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow){
00226
00227 const char* words[1+ LMTMAXLEV + 1 + 1];
00228 int howmany;
00229 char line[MAX_LINE];
00230
00231 inp.getline(line,MAX_LINE);
00232 if (strlen(line)==MAX_LINE-1){
00233 cerr << "parseline: input line exceed MAXLINE ("
00234 << MAX_LINE << ") chars " << line << "\n";
00235 exit(1);
00236 }
00237
00238 howmany = parseWords(line, words, Order + 3);
00239
00240 if (!(howmany == (Order+ 1) || howmany == (Order + 2)))
00241 assert(howmany == (Order+ 1) || howmany == (Order + 2));
00242
00243
00244 ng.size=0;
00245 for (int i=1;i<=Order;i++)
00246 ng.pushw(strcmp(words[i],"<unk>")?words[i]:ng.dict->OOV());
00247
00248
00249 assert(sscanf(words[0],"%f",&prob));
00250 if (howmany==(Order+2))
00251 assert(sscanf(words[Order+1],"%f",&bow));
00252 else
00253 bow=0.0;
00254
00263 return 1;
00264 }
00265
00266
00267 void lmtable::loadcenters(istream& inp,int Order){
00268 char line[MAX_LINE];
00269
00270
00271 cerr << Order << " read code book ";
00272 inp >> NumCenters[Order];
00273 Pcenters[Order]=new float[NumCenters[Order]];
00274 Bcenters[Order]=(Order<maxlev?new float[NumCenters[Order]]:NULL);
00275
00276 for (int c=0;c<NumCenters[Order];c++){
00277 inp >> Pcenters[Order][c];
00278 if (Order<maxlev) inp >> Bcenters[Order][c];
00279 };
00280
00281 inp.getline((char*)line,MAX_LINE);
00282 }
00283
00284 void lmtable::loadtxt(istream& inp,const char* header,const char* outfilename,int mmap){
00285 if (mmap>0)
00286 loadtxtmmap(inp,header,outfilename);
00287 else {
00288 loadtxt(inp,header);
00289 lmtable::getDict()->genoovcode();
00290 }
00291 }
00292
00293 void lmtable::loadtxtmmap(istream& inp,const char* header,const char* outfilename){
00294
00295 char nameNgrams[BUFSIZ];
00296 char nameHeader[BUFSIZ];
00297
00298 FILE *fd = NULL;
00299 table_pos_t filesize=0;
00300
00301 int Order,n;
00302
00303 int maxlevel_h;
00304
00305
00306
00307 char line[MAX_LINE];
00308
00309
00310
00311 lmtable::getDict()->incflag(1);
00312
00313
00314 ngram ng(lmtable::getDict());
00315 ngram ing(lmtable::getDict());
00316
00317 float pb,bow;;
00318
00319
00320 isQtable=(strncmp(header,"qARPA",5)==0?true:false);
00321
00322
00323 isItable=(strncmp(header,"iARPA",5)==0?true:false);
00324
00325 if (isQtable){
00326
00327 inp >> line;
00328 if (!(maxlevel_h=atoi(line))){
00329 cerr << "loadtxt with mmap requires new qARPA header. Please regenerate the file.\n";
00330 exit(1);
00331 }
00332
00333 for (n=1;n<=maxlevel_h;n++){
00334 inp >> line;
00335 if (!(NumCenters[n]=atoi(line))){
00336 cerr << "loadtxt with mmap requires new qARPA header. Please regenerate the file.\n";
00337 exit(0);
00338 }
00339 }
00340 }
00341
00342
00343 bool yetconfigured=false;
00344
00345 cerr << "loadtxtmmap()\n";
00346
00347
00348
00349 while (inp.getline(line,MAX_LINE)){
00350
00351 if (strlen(line)==MAX_LINE-1){
00352 cerr << "lmtable::loadtxtmmap: input line exceed MAXLINE ("
00353 << MAX_LINE << ") chars " << line << "\n";
00354 exit(1);
00355 }
00356
00357 bool backslash = (line[0] == '\\');
00358
00359 if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) {
00360 maxsize[Order] = n; maxlev=Order;
00361 cerr << "size[" << Order << "]=" << maxsize[Order] << "\n";
00362 }
00363
00364 if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) {
00365
00366
00367 if (!yetconfigured){
00368 configure(maxlev,isQtable);
00369 yetconfigured=true;
00370
00371
00372 strcpy(nameNgrams,outfilename);
00373 strcat(nameNgrams, "-ngrams");
00374
00375 cerr << "saving ngrams probs in " << nameNgrams << "\n";
00376
00377 fd = fopen(nameNgrams, "w+");
00378
00379
00380 for (int l=1;l<=maxlev;l++){
00381 if (l<maxlev)
00382 filesize += (table_pos_t) maxsize[l] * nodesize(tbltype[l]) + 2 * NumCenters[l] * sizeof(float);
00383 else
00384 filesize += (table_pos_t) maxsize[l] * nodesize(tbltype[l]) + NumCenters[l] * sizeof(float);
00385 }
00386
00387 cerr << "global filesize = " << filesize << "\n";
00388
00389 ftruncate(fileno(fd),filesize);
00390 table[0]=(char *)(MMap(fileno(fd),PROT_READ|PROT_WRITE,0,filesize,&tableGaps[0]));
00391
00392
00393
00394 if (maxlev>1)
00395 table[1]=table[0] + (table_pos_t) (2 * NumCenters[1] * sizeof(float));
00396 else
00397 table[1]=table[0] + (table_pos_t) (NumCenters[1] * sizeof(float));
00398
00399 for (int l=2;l<=maxlev;l++)
00400 if (l<maxlev)
00401 table[l]=(char *)(table[l-1] + (table_pos_t) maxsize[l-1]*nodesize(tbltype[l-1]) +
00402 2 * NumCenters[l] * sizeof(float));
00403 else
00404 table[l]=(char *)(table[l-1] + (table_pos_t) maxsize[l-1]*nodesize(tbltype[l-1]) +
00405 NumCenters[l] * sizeof(float));
00406
00407 for (int l=2;l<=maxlev;l++){
00408 cerr << "table[" << l << "]-table[" << l-1 << "]="
00409 << (table_pos_t) table[l]-(table_pos_t) table[l-1] << " (nodesize=" << nodesize(tbltype[l-1]) << ")\n";
00410 }
00411 }
00412
00413 cerr << Order << "-grams: reading ";
00414 if (isQtable) {
00415 loadcenters(inp,Order);
00416
00417 if (Order<maxlev){
00418 memcpy(table[Order] - 2 * NumCenters[Order] * sizeof(float),
00419 Pcenters[Order],
00420 NumCenters[Order] * sizeof(float));
00421 memcpy(table[Order] - NumCenters[Order] * sizeof(float),
00422 Bcenters[Order],
00423 NumCenters[Order] * sizeof(float));
00424 } else
00425 memcpy(table[Order] - NumCenters[Order] * sizeof(float),
00426 Pcenters[Order],
00427 NumCenters[Order] * sizeof(float));
00428 }
00429
00430
00431 if (maxlev>1 && Order<maxlev) {
00432 startpos[Order]=new table_entry_pos_t[maxsize[Order]];
00433 for (table_entry_pos_t c=0;c<maxsize[Order];c++) startpos[Order][c]=BOUND_EMPTY1;
00434 }
00435
00436 cerr << maxsize[Order] << " entries\n";
00437
00438
00439 for (table_entry_pos_t c=0;c<maxsize[Order];c++){
00440
00441 if (parseline(inp,Order,ng,pb,bow)){
00442
00443
00444
00445
00446 if (isInverted & Order>1){
00447 ing.invert(ng);
00448 ng=ing;
00449 }
00450
00451 if (isItable && Order>1){
00452
00453 get(ng,ng.size,ng.size-1);
00454 float rbow=0.0;
00455 if (ng.lev==ng.size-1){
00456 rbow=ng.bow;
00457
00458 }
00459
00460 int tmp=maxlev;
00461 maxlev=Order-1;
00462 pb= log(exp((double)pb * M_LN10) + exp(((double)rbow + lprob(ng)) * M_LN10))/M_LN10;
00463 maxlev=tmp;
00464 }
00465
00466 if (isQtable) add(ng, (qfloat_t)pb, (qfloat_t)bow);
00467 else add(ng, pb, bow);
00468
00469 }
00470 }
00471
00472 msync(table[0],filesize,MS_SYNC);
00473
00474
00475
00476 if (maxlev>1 && Order>1){
00477 checkbounds(Order-1);
00478 delete startpos[Order-1];
00479 }
00480 }
00481 }
00482
00483 cerr << "closing output file: " << nameNgrams << "\n";
00484 for (int i=1;i<=maxlev;i++)
00485 if (maxsize[i] != cursize[i]) {
00486 for (int l=1;l<=maxlev;l++)
00487 cerr << "Level " << l << ": starting ngrams=" << maxsize[l] << " - actual stored ngrams=" << cursize[l] << "\n";
00488 break;
00489 }
00490
00491 Munmap(table[0],filesize,MS_SYNC);
00492 for (int l=1;l<=maxlev;l++)
00493 table[l]=0;
00494 cerr << "running fclose...\n";
00495 fclose(fd);
00496 cerr << "done\n";
00497
00498 lmtable::getDict()->incflag(0);
00499 lmtable::getDict()->genoovcode();
00500
00501
00502
00503 strcpy(nameHeader,outfilename);
00504 strcat(nameHeader, "-header");
00505 cerr << "saving header+dictionary in " << nameHeader << "\n";
00506 fstream out(nameHeader,ios::out);
00507
00508
00509 if (isQtable){
00510 out << "Qblmt" << (isInverted?"I ":" ") << maxlev;
00511 for (int i=1;i<=maxlev;i++) out << " " << maxsize[i];
00512 out << "\nNumCenters";
00513 for (int i=1;i<=maxlev;i++) out << " " << NumCenters[i];
00514 out << "\n";
00515
00516 }else{
00517 out << "blmt" << (isInverted?"I ":" ") << maxlev;
00518 for (int i=1;i<=maxlev;i++) out << " " << maxsize[i];
00519 out << "\n";
00520 }
00521
00522 lmtable::getDict()->save(out);
00523
00524 out.close();
00525 cerr << "done\n";
00526
00527
00528
00529 char cmd[MAX_LINE];
00530 sprintf(cmd,"cat %s >> %s", nameNgrams, nameHeader);
00531 cerr << "run cmd <" << cmd << ">\n";
00532 system(cmd);
00533
00534 sprintf(cmd,"mv %s %s", nameHeader, outfilename);
00535 cerr << "run cmd <" << cmd << ">\n";
00536 system(cmd);
00537
00538 sprintf(cmd,"rm %s", nameNgrams);
00539 cerr << "run cmd <" << cmd << ">\n";
00540 system(cmd);
00541
00542
00543 exit(0);
00544 return;
00545 }
00546
00547 void lmtable::loadtxt(istream& inp,const char* header){
00548
00549
00550
00551 char line[MAX_LINE];
00552
00553
00554
00555 dictionary(NULL,1000000);
00556 lmtable::getDict()->incflag(1);
00557
00558
00559 ngram ng(lmtable::getDict());
00560 ngram ing(lmtable::getDict());
00561
00562 float prob,bow;
00563
00564
00565 isQtable=(strncmp(header,"qARPA",5)==0?true:false);
00566
00567
00568 isItable=(strncmp(header,"iARPA",5)==0?true:false);
00569
00570
00571 bool yetconfigured=false;
00572
00573 cerr << "loadtxt()\n";
00574
00575
00576 int Order,n;
00577
00578 while (inp.getline(line,MAX_LINE)){
00579
00580 if (strlen(line)==MAX_LINE-1){
00581 cerr << "lmtable::loadtxt: input line exceed MAXLINE ("
00582 << MAX_LINE << ") chars " << line << "\n";
00583 exit(1);
00584 }
00585
00586 bool backslash = (line[0] == '\\');
00587
00588 if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) {
00589 maxsize[Order] = n; maxlev=Order;
00590
00591 }
00592
00593 if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) {
00594
00595
00596 if (!yetconfigured){
00597 configure(maxlev,isQtable);yetconfigured=true;
00598
00599 for (int i=1;i<=maxlev;i++)
00600 table[i] = new char[(table_pos_t) maxsize[i] * nodesize(tbltype[i])];
00601 }
00602
00603 cerr << Order << "-grams: reading ";
00604
00605 if (isQtable) loadcenters(inp,Order);
00606
00607
00608 if (maxlev>1 && Order<maxlev) {
00609 startpos[Order]=new table_entry_pos_t[maxsize[Order]];
00610 for (table_entry_pos_t c=0;c<maxsize[Order];c++){
00611 startpos[Order][c]=BOUND_EMPTY1;
00612 }
00613 }
00614
00615
00616 cerr << maxsize[Order] << " entries\n";
00617
00618
00619
00620 for (table_entry_pos_t c=0;c<maxsize[Order];c++){
00621
00622 if (parseline(inp,Order,ng,prob,bow)){
00623
00624
00625 if (isInverted & Order>1){
00626 ing.invert(ng);
00627 ng=ing;
00628 }
00629
00630
00631
00632
00633
00634
00635 if (isItable && Order>1) {
00636
00637 get(ng,ng.size,ng.size-1);
00638 float rbow=0.0;
00639 if (ng.lev==ng.size-1){
00640 rbow=ng.bow;
00641
00642 }
00643
00644 int tmp=maxlev;
00645 maxlev=Order-1;
00646
00647 prob= log(exp((double)prob * M_LN10) + exp(((double)rbow + lprob(ng)) * M_LN10))/M_LN10;
00648
00649
00650 maxlev=tmp;
00651 }
00652
00653 if (isQtable) add(ng, (qfloat_t)prob, (qfloat_t)bow);
00654 else add(ng, prob, bow);
00655
00656
00657
00658
00659
00660
00661 }
00662 }
00663
00664 if (maxlev>1 && Order>1) checkbounds(Order-1);
00665 }
00666 }
00667
00668 lmtable::getDict()->incflag(0);
00669 cerr << "done\n";
00670
00671 }
00672
00673
00674 void lmtable::printTable(int level) {
00675 char* tbl=table[level];
00676 LMT_TYPE ndt=tbltype[level];
00677 int ndsz=nodesize(ndt);
00678 table_entry_pos_t printEntryN=1000;
00679 if (cursize[level]>0)
00680 printEntryN=(printEntryN<cursize[level])?printEntryN:cursize[level];
00681
00682 cout << "level = " << level << "\n";
00683
00684
00685 float p;
00686 for (table_entry_pos_t c=0;c<printEntryN;c++){
00687 p=prob(tbl,ndt);
00688 cout << p << " " << word(tbl) << "\n";
00689
00690 tbl+=ndsz;
00691 }
00692 return;
00693 }
00694
00695
00696
00697 void lmtable::checkbounds(int level){
00698
00699 char* tbl=table[level];
00700 char* succtbl=table[level+1];
00701
00702 LMT_TYPE ndt=tbltype[level], succndt=tbltype[level+1];
00703 int ndsz=nodesize(ndt), succndsz=nodesize(succndt);
00704
00705
00706
00707 ofstream out;string filePath;
00708 createtempfile(out,filePath,ios::out|ios::binary);
00709
00710 table_entry_pos_t start,end,newstart;
00711
00712
00713 newstart=0;
00714
00715 for (table_entry_pos_t c=0;c<cursize[level];c++){
00716 start=startpos[level][c]; end=bound(tbl+ (table_pos_t) c*ndsz,ndt);
00717
00718
00719
00720 if (start==BOUND_EMPTY1) end=BOUND_EMPTY2;
00721 if (end==BOUND_EMPTY2) end=start;
00722
00723 assert(start<=end);
00724 assert(newstart+(end-start)<=cursize[level+1]);
00725 assert(end == BOUND_EMPTY1 || end<=cursize[level+1]);
00726
00727
00728 if (start<end){
00729 out.write((char*)(succtbl + (table_pos_t) start * succndsz),(table_pos_t) (end-start) * succndsz);
00730 if (!out.good()){
00731 std::cerr << " Something went wrong while writing temporary file " << filePath
00732 << " Maybe there is not enough space on this filesystem\n";
00733
00734 out.close();
00735 removefile(filePath);
00736 }
00737 }
00738
00739 bound(tbl+(table_pos_t) c*ndsz,ndt,newstart+(end-start));
00740 newstart+=(end-start);
00741 }
00742
00743 out.close();
00744
00745 fstream inp(filePath.c_str(),ios::in|ios::binary);
00746
00747 inp.read(succtbl,(table_pos_t) cursize[level+1]*succndsz);
00748
00749 inp.close();
00750
00751
00752 }
00753
00754
00755
00756
00757
00758
00759 template<typename TA, typename TB>
00760 int lmtable::add(ngram& ng, TA iprob,TB ibow){
00761
00762 char *found;
00763 LMT_TYPE ndt=tbltype[1];
00764 int ndsz=nodesize(ndt);
00765 static int no_more_msg = 0;
00766
00767 if (ng.size>1){
00768
00769
00770 table_entry_pos_t start=0, end=cursize[1];
00771
00772
00773 for (int l=1;l<ng.size;l++){
00774
00775 ndt=tbltype[l]; ndsz=nodesize(ndt);
00776
00777 if (search(l,start,(end-start),ndsz,
00778 ng.wordp(ng.size-l+1),LMT_FIND, &found)){
00779
00780
00781 if (l< (ng.size-1)){
00782
00783 if (found==table[l]) start=0;
00784 else start=bound(found - ndsz,ndt);
00785
00786
00787 end=bound(found,ndt);
00788 }
00789 }
00790 else {
00791 if (!no_more_msg)
00792 cerr << "warning: missing back-off (at level " << l << ") for ngram " << ng << " (and possibly for others)\n";
00793
00794 no_more_msg++;
00795 if (!(no_more_msg % 5000000))
00796 cerr << "!";
00797
00798 return 0;
00799 }
00800 }
00801
00802
00803
00804 table_entry_pos_t position=(table_entry_pos_t) (((table_pos_t) found-(table_pos_t) table[ng.size-1])/ndsz);
00805
00806
00807 if (startpos[ng.size-1][position]==BOUND_EMPTY1)
00808 startpos[ng.size-1][position]=cursize[ng.size];
00809
00810
00811 bound(found,ndt,cursize[ng.size]+1);
00812
00813 }
00814
00815
00816
00817 assert(cursize[ng.size]< maxsize[ng.size]);
00818 ndt=tbltype[ng.size];ndsz=nodesize(ndt);
00819
00820 found=table[ng.size] + ((table_pos_t) cursize[ng.size] * ndsz);
00821
00822 word(found,*ng.wordp(1));
00823 prob(found,ndt,iprob);
00824 if (ng.size<maxlev){bow(found,ndt,ibow);bound(found,ndt,BOUND_EMPTY2);}
00825
00826 cursize[ng.size]++;
00827
00828 if (!(cursize[ng.size]%5000000))
00829 cerr << ".";
00830
00831 return 1;
00832
00833 }
00834
00835
00836 void *lmtable::search(int lev,
00837 table_entry_pos_t offs,
00838 table_entry_pos_t n,
00839 int sz,
00840 int *ngp,
00841 LMT_ACTION action,
00842 char **found){
00843
00844
00845
00846
00847
00848
00849
00850
00851 if (lev==1) return *found=(*ngp < (float) n ? table[1] + (table_pos_t)*ngp * sz:NULL);
00852
00853
00854
00855 char* tb;
00856 tb=table[lev] + (table_pos_t) offs * sz;
00857
00858 char w[LMTCODESIZE];putmem(w,ngp[0],0,LMTCODESIZE);
00859
00860 table_entry_pos_t idx=0;
00861 *found=NULL;
00862
00863 totbsearch[lev]++;
00864 switch(action){
00865 case LMT_FIND:
00866
00867 if (!tb || !mybsearch(tb,n,sz,w,&idx)) return NULL;
00868 else
00869
00870 return *found=tb + ((table_pos_t)idx * sz);
00871 default:
00872 error((char*)"lmtable::search: this option is available");
00873 };
00874
00875 return NULL;
00876 }
00877
00878
00879
00880 int lmtable::mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx)
00881 {
00882 register table_entry_pos_t low, high;
00883 register char *p;
00884 register int result=0;
00885
00886
00887
00888
00889
00890
00891
00892
00893
00894
00895 low = 0;high = n; *idx=0;
00896 while (low < high)
00897 {
00898
00899 *idx = (low + high) / 2;
00900
00901 p = (char *) (ar + ((table_pos_t)*idx * size));
00902
00903
00904
00905
00906
00907
00908
00909
00910
00911
00912 result=word(key)-word(p);
00913
00914 if (result < 0)
00915 high = *idx;
00916 else if (result > 0)
00917 low = *idx + 1;
00918 else
00919 return 1;
00920 }
00921
00922 *idx=low;
00923
00924 return 0;
00925
00926 }
00927
00928
00929
00930
00931 lmtable* lmtable::cpsublm(dictionary* subdict,bool keepunigr){
00932
00933
00934
00935
00936 lmtable* slmt=new lmtable();
00937 slmt->configure(maxlev,isQtable);
00938 slmt->dict=new dictionary((keepunigr?dict:subdict),false);
00939 std::cerr << "subdict size: " << slmt->dict->size() << "\n";
00940
00941 if (isQtable){
00942 for (int i=1;i<=maxlev;i++) {
00943 slmt->NumCenters[i]=NumCenters[i];
00944 slmt->Pcenters[i]=new float [NumCenters[i]];
00945 memcpy(slmt->Pcenters[i],Pcenters[i],NumCenters[i] * sizeof(float));
00946 slmt->Bcenters[i]=new float [NumCenters[i]];
00947 memcpy(slmt->Bcenters[i],Bcenters[i],NumCenters[i] * sizeof(float));
00948 }
00949 }
00950
00951
00952
00953
00954 dict->genoovcode(); slmt->dict->genoovcode(); subdict->genoovcode();
00955 std::cerr << "subdict size: " << slmt->dict->size() << "\n";
00956 int* lookup;lookup=new int [dict->size()];
00957 for (int c=0;c<dict->size();c++){
00958 lookup[c]=subdict->encode(dict->decode(c));
00959 if (c != dict->oovcode() && lookup[c] == subdict->oovcode())
00960 lookup[c]=-1;
00961 }
00962
00963
00964 LMT_TYPE ndt,pndt; int ndsz,pndsz;
00965 char *entry, *newentry;
00966 table_entry_pos_t start, end, origin;
00967
00968 for (int l=1;l<=maxlev;l++){
00969
00970 slmt->cursize[l]=0;
00971 slmt->table[l]=NULL;
00972
00973 if (l==1){
00974
00975 ndt=tbltype[l]; ndsz=nodesize(ndt);
00976
00977 for (table_entry_pos_t p=0;p<cursize[l];p++){
00978
00979 entry=table[l] + (table_pos_t) p * ndsz;
00980 if (lookup[word(entry)]!=-1 || keepunigr){
00981
00982 if ((slmt->cursize[l] % slmt->dict->size()) ==0)
00983 slmt->table[l]=(char *)realloc(slmt->table[l],((table_pos_t) slmt->cursize[l] + (table_pos_t) slmt->dict->size()) * ndsz);
00984
00985 newentry=slmt->table[l] + (table_pos_t) slmt->cursize[l] * ndsz;
00986 memcpy(newentry,entry,ndsz);
00987 if (!keepunigr)
00988 slmt->word(newentry,lookup[word(entry)]);
00989
00990 if (l<maxlev)
00991 slmt->bound(newentry,ndt,p);
00992 slmt->cursize[l]++;
00993 }
00994 }
00995 }
00996
00997 else{
00998
00999 pndt=tbltype[l-1]; pndsz=nodesize(pndt);
01000 ndt=tbltype[l]; ndsz=nodesize(ndt);
01001
01002 for (table_entry_pos_t p=0; p<slmt->cursize[l-1]; p++){
01003
01004
01005 origin=slmt->bound(slmt->table[l-1] + (table_pos_t)p * pndsz,pndt);
01006 if (origin == 0) start=0;
01007 else start=bound(table[l-1] + (table_pos_t)(origin-1) * pndsz,pndt);
01008 end=bound(table[l-1] + (table_pos_t)origin * pndsz,pndt);
01009
01010 if (!keepunigr || lookup[word(table[l-1] + (table_pos_t)origin * pndsz)]!=-1){
01011 while (start < end){
01012
01013 entry=table[l] + (table_pos_t) start * ndsz;
01014
01015 if (lookup[word(entry)]!=-1){
01016
01017 if ((slmt->cursize[l] % slmt->dict->size()) ==0)
01018 slmt->table[l]=(char *)realloc(slmt->table[l],(table_pos_t) (slmt->cursize[l]+slmt->dict->size()) * ndsz);
01019
01020 newentry=slmt->table[l] + (table_pos_t) slmt->cursize[l] * ndsz;
01021 memcpy(newentry,entry,ndsz);
01022 if (!keepunigr)
01023 slmt->word(newentry,lookup[word(entry)]);
01024 if (l<maxlev)
01025 slmt->bound(newentry,ndt,start);
01026 slmt->cursize[l]++;
01027 }
01028
01029 start++;
01030
01031 }
01032 }
01033
01034
01035 slmt->bound(slmt->table[l-1] + (table_pos_t) p * pndsz, pndt,slmt->cursize[l]);
01036
01037 }
01038
01039 }
01040
01041 }
01042
01043
01044 return slmt;
01045 }
01046
01047
01048
01049
01050
01051 void lmtable::savetxt(const char *filename){
01052
01053 fstream out(filename,ios::out);
01054 table_entry_pos_t cnt[1+MAX_NGRAM];
01055 int l;
01056
01057 out.precision(7);
01058
01059 if (isQtable){
01060 out << "qARPA " << maxlev;
01061 for (l=1;l<=maxlev;l++)
01062 out << " " << NumCenters[l];
01063 out << endl;
01064 }
01065
01066 ngram ng(lmtable::getDict(),0);
01067
01068 cerr << "savetxt: " << filename << "\n";
01069
01070 if (isPruned) ngcnt(cnt);
01071
01072 out << "\n\\data\\\n";
01073 for (l=1;l<=maxlev;l++){
01074 out << "ngram " << l << "= " << (isPruned?cnt[l]:cursize[l]) << "\n";
01075 }
01076
01077 for (l=1;l<=maxlev;l++){
01078
01079 out << "\n\\" << l << "-grams:\n";
01080 cerr << "save: " << (isPruned?cnt[l]:cursize[l]) << " " << l << "-grams\n";
01081 if (isQtable){
01082 out << NumCenters[l] << "\n";
01083 for (int c=0;c<NumCenters[l];c++){
01084 out << Pcenters[l][c];
01085 if (l<maxlev) out << " " << Bcenters[l][c];
01086 out << "\n";
01087 }
01088 }
01089
01090 ng.size=0;
01091 dumplm(out,ng,1,l,0,cursize[1]);
01092
01093 }
01094
01095 out << "\\end\\\n";
01096 cerr << "done\n";
01097 }
01098
01099
01100 void lmtable::savebin(const char *filename){
01101
01102 if (isPruned){
01103 cerr << "savebin: pruned LM cannot be saved in binary form\n";
01104 exit(0);
01105 }
01106
01107 fstream out(filename,ios::out);
01108 cerr << "savebin: " << filename << "\n";
01109
01110
01111 if (isQtable){
01112 out << "Qblmt" << (isInverted?"I":"") << " " << maxlev;
01113 for (int i=1;i<=maxlev;i++) out << " " << cursize[i];
01114 out << "\nNumCenters";
01115 for (int i=1;i<=maxlev;i++) out << " " << NumCenters[i];
01116 out << "\n";
01117
01118 }else{
01119 out << "blmt" << (isInverted?"I":"") << " " << maxlev;
01120 for (int i=1;i<=maxlev;i++) out << " " << cursize[i] ;
01121 out << "\n";
01122 }
01123
01124 lmtable::getDict()->save(out);
01125
01126 for (int i=1;i<=maxlev;i++){
01127 cerr << "saving " << cursize[i] << " " << i << "-grams\n";
01128 if (isQtable){
01129 out.write((char*)Pcenters[i],NumCenters[i] * sizeof(float));
01130 if (i<maxlev)
01131 out.write((char *)Bcenters[i],NumCenters[i] * sizeof(float));
01132 }
01133 out.write(table[i],(table_pos_t) cursize[i]*nodesize(tbltype[i]));
01134 }
01135
01136 cerr << "done\n";
01137 }
01138
01139
01140
01141
01142
01143 void lmtable::loadbinheader(istream& inp,const char* header){
01144
01145
01146 inp >> maxlev;
01147
01148 if (strncmp(header,"Qblmt",5)==0){
01149 isQtable=1;
01150 if (strncmp(header,"QblmtI",6)==0)
01151 isInverted=1;
01152 }
01153 else if(strncmp(header,"blmt",4)==0){
01154 isQtable=0;
01155 if (strncmp(header,"blmtI",5)==0)
01156 isInverted=1;
01157 }
01158 else error((char*)"loadbin: LM file is not in binary format");
01159
01160 configure(maxlev,isQtable);
01161
01162 for (int l=1;l<=maxlev;l++){
01163 inp >> cursize[l]; maxsize[l]=cursize[l];
01164 }
01165
01166 if (isQtable){
01167 char header2[100];
01168 inp >> header2;
01169 for (int i=1;i<=maxlev;i++){
01170 inp >> NumCenters[i];
01171 cerr << "reading " << NumCenters[i] << " centers\n";
01172 }
01173 }
01174 }
01175
01176
01177
01178 void lmtable::loadbincodebook(istream& inp,int l){
01179
01180 Pcenters[l]=new float [NumCenters[l]];
01181 inp.read((char*)Pcenters[l],NumCenters[l] * sizeof(float));
01182 if (l<maxlev){
01183 Bcenters[l]=new float [NumCenters[l]];
01184 inp.read((char *)Bcenters[l],NumCenters[l]*sizeof(float));
01185 }
01186
01187 }
01188
01189
01190
01191
01192 void lmtable::loadbin(istream& inp, const char* header,const char* filename,int mmap){
01193
01194 cerr << "loadbin()\n";
01195 loadbinheader(inp,header);
01196 lmtable::getDict()->load(inp);
01197
01198
01199 if (filename && mmap>0){
01200
01201 #ifdef WIN32
01202 error("lmtable::loadbin mmap facility not yet supported under WIN32\n");
01203 #else
01204
01205 if (mmap <= maxlev) memmap=mmap;
01206 else error((char*)"keep_on_disk value is out of range\n");
01207
01208 if ((diskid=open(filename, O_RDONLY))<0){
01209 std::cerr << "cannot open " << filename << "\n";
01210 error((char*)"dying");
01211 }
01212
01213
01214 char miniheader[4];
01215 read(diskid,miniheader,4);
01216 if (strncmp(miniheader,"Qblm",4) && strncmp(miniheader,"blmt",4))
01217 error((char*)"mmap functionality does not work with compressed binary LMs\n");
01218 #endif
01219 }
01220
01221 for (int l=1;l<=maxlev;l++){
01222 if (isQtable) loadbincodebook(inp,l);
01223 if ((memmap == 0) || (l < memmap)){
01224 cerr << "loading " << cursize[l] << " " << l << "-grams\n";
01225 table[l]=new char[(table_pos_t) cursize[l] * nodesize(tbltype[l])];
01226 inp.read(table[l],(table_pos_t) cursize[l] * nodesize(tbltype[l]));
01227 }
01228 else{
01229
01230 #ifdef WIN32
01231 error((char*)"mmap not available under WIN32\n");
01232 #else
01233 cerr << "mapping " << cursize[l] << " " << l << "-grams\n";
01234 tableOffs[l]=inp.tellg();
01235 table[l]=(char *)MMap(diskid,PROT_READ,
01236 tableOffs[l], (table_pos_t) cursize[l]*nodesize(tbltype[l]),
01237 &tableGaps[l]);
01238 table[l]+=(table_pos_t) tableGaps[l];
01239 inp.seekg((table_pos_t) cursize[l]*nodesize(tbltype[l]),ios_base::cur);
01240 #endif
01241
01242 }
01243 };
01244
01245 cerr << "done\n";
01246
01247 }
01248
01249
01250
01251 int lmtable::get(ngram& ng,int n,int lev){
01252
01253
01254
01255
01256 totget[lev]++;
01257
01258 if (lev > maxlev) error((char*)"get: lev exceeds maxlevel");
01259 if (n < lev) error((char*)"get: ngram is too small");
01260
01261
01262 table_entry_pos_t offset=0,limit=cursize[1];
01263
01264
01265 table_entry_pos_t hit;
01266 char* found; LMT_TYPE ndt;
01267 ng.link=NULL;
01268 ng.lev=0;
01269
01270 for (int l=1;l<=lev;l++){
01271
01272
01273 hit = 0 ; found = NULL; ndt=tbltype[l];
01274
01275 if (lmtcache[l] && lmtcache[l]->get(ng.wordp(n),(char *)&found))
01276 hit=1;
01277 else
01278 search(l,
01279 offset,
01280 (limit-offset),
01281 nodesize(ndt),
01282 ng.wordp(n-l+1),
01283 LMT_FIND,
01284 &found);
01285
01286
01287 if (lmtcache[l] && hit==0)
01288 lmtcache[l]->add(ng.wordp(n),(char *)&found);
01289
01290 if (!found) return 0;
01291 if (prob(found,ndt)==NOPROB) return 0;
01292 ng.path[l]=found;
01293 ng.bow=(l<maxlev?bow(found,ndt):0);
01294 ng.prob=prob(found,ndt);
01295 ng.link=found;
01296 ng.info=ndt;
01297 ng.lev=l;
01298
01299 if (l<maxlev){
01300
01301
01302 if (offset+1==cursize[l]) limit=cursize[l+1];
01303 else limit=bound(found,ndt);
01304
01305
01306 if (found==table[l]) offset=0;
01307 else offset=bound((found - nodesize(ndt)),ndt);
01308
01309 assert(offset!=BOUND_EMPTY1); assert(limit!=BOUND_EMPTY1);
01310 }
01311 }
01312
01313
01314 ng.size=n; ng.freq=0;
01315 ng.succ=(lev<maxlev?limit-offset:0);
01316
01317 #ifdef TRACE_CACHE
01318 if (ng.size==maxlev && sentence_id>0){
01319 *cacheout << sentence_id << " miss " << ng << " " << (unsigned int) ng.link << "\n";
01320 }
01321 #endif
01322
01323 return 1;
01324 }
01325
01326
01327
01328
01329 void lmtable::dumplm(fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos){
01330
01331 LMT_TYPE ndt=tbltype[ilev];
01332 ngram ing(ng.dict);
01333 int ndsz=nodesize(ndt);
01334
01335 assert(ng.size==ilev-1);
01336 assert(ipos>=0 && epos<=cursize[ilev] && ipos<epos);
01337 ng.pushc(0);
01338
01339 for (table_entry_pos_t i=ipos;i<epos;i++){
01340 *ng.wordp(1)=word(table[ilev]+(table_pos_t) i*ndsz);
01341 float ipr=prob(table[ilev]+(table_pos_t) i*ndsz,ndt);
01342
01343
01344
01345 if(isPruned && ipr==NOPROB) continue;
01346
01347 if (ilev<elev){
01348
01349 table_entry_pos_t isucc=(i>0?bound(table[ilev]+ (table_pos_t) (i-1) * ndsz,ndt):0);
01350 table_entry_pos_t esucc=bound(table[ilev]+ (table_pos_t) i * ndsz,ndt);
01351 if (isucc < esucc)
01352 dumplm(out,ng,ilev+1,elev,isucc,esucc);
01353
01354
01355 }
01356 else{
01357
01358 out << ipr <<"\t";
01359
01360
01361
01362 if (isInverted & ng.size>1){
01363 ing.invert(ng);
01364 ng=ing;
01365 }
01366
01367 for (int k=ng.size;k>=1;k--){
01368 if (k<ng.size) out << " ";
01369 out << lmtable::getDict()->decode(*ng.wordp(k));
01370 }
01371
01372 if (ilev<maxlev){
01373 float ibo=bow(table[ilev]+ (table_pos_t)i * ndsz,ndt);
01374 if (isQtable) out << "\t" << ibo;
01375 else if (ibo!=0.0) out << "\t" << ibo;
01376
01377
01378
01379
01380
01381
01382
01383 }
01384 out << "\n";
01385 }
01386 }
01387 }
01388
01389
01390
01391
01392
01393 int lmtable::succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){
01394 assert(lev==h.lev+1 && h.size==lev && lev<=maxlev);
01395
01396 LMT_TYPE ndt=tbltype[h.lev];
01397 int ndsz=nodesize(ndt);
01398
01399 table_entry_pos_t offset;
01400 switch (action){
01401
01402 case LMT_INIT:
01403
01404
01405 ng.size=lev;
01406 ng.trans(h);
01407
01408 ng.midx[lev]=0;
01409 offset=(h.link>table[h.lev]?bound(h.link-ndsz,ndt):0);
01410 h.succ=bound(h.link,ndt)-offset;
01411 h.succlink=table[lev]+(table_pos_t) offset * nodesize(tbltype[lev]);
01412 return 1;
01413
01414 case LMT_CONT:
01415
01416 if (ng.midx[lev] < h.succ)
01417 {
01418
01419 *ng.wordp(1)=word(h.succlink+(table_pos_t) ng.midx[lev]*nodesize(tbltype[lev]));
01420 ng.midx[lev]++;
01421 return 1;
01422 }
01423 else
01424 return 0;
01425
01426 default:
01427 cerr << "succscan: only permitted options are LMT_INIT and LMT_CONT\n";
01428 exit(0);
01429 }
01430
01431 }
01432
01433
01434
01435
01436
01437
01438 const char *lmtable::maxsuffptr(ngram ong, unsigned int* size){
01439
01440
01441
01442
01443 if (ong.size==0){
01444 if (size!=NULL) *size=0;
01445 return (char*) NULL;
01446 }
01447
01448 if (ong.size>=maxlev) ong.size=maxlev-1;
01449
01450 if (size!=NULL) *size=ong.size;
01451
01452 ngram ng=ong;
01453
01454
01455
01456
01457 if (get(ng,ng.size,ng.size)){
01458 if (ng.succ==0) (*size)--;
01459 return ng.link;
01460 }
01461 else{
01462 ong.size--;
01463 return maxsuffptr(ong,size);
01464 }
01465 }
01466
01467
01468 const char *lmtable::cmaxsuffptr(ngram ong, unsigned int* size){
01469
01470
01471
01472
01473 if (size!=NULL) *size=ong.size;
01474 if (ong.size==0) return (char*) NULL;
01475 if (ong.size>=maxlev) ong.size=maxlev-1;
01476
01477 char* found;
01478 unsigned int isize;
01479
01480 if (statecache && (ong.size==maxlev-1) && statecache->get(ong.wordp(maxlev-1),(char *)&found)){
01481 if (size!=NULL) statesizecache->get(ong.wordp(maxlev-1),(char *)size);
01482 return found;
01483 }
01484
01485 found=(char *)maxsuffptr(ong,&isize);
01486
01487 if (statecache && ong.size==maxlev-1){
01488
01489 statecache->add(ong.wordp(maxlev-1),(char *)&found);
01490 statesizecache->add(ong.wordp(maxlev-1),(char *)&isize);
01491 };
01492
01493 if (size!=NULL) *size=isize;
01494
01495 return found;
01496 }
01497
01498
01499
01500
01501
01502
01503
01504
01505
01506
01507
01508
01509
01510
01511
01512
01513
01514
01515
01516
01517
01518
01519
01520
01521
01522
01523
01524
01525
01526
01527
01528
01529
01530
01531
01532
01533
01534
01535
01536
01537
01538
01539
01540
01541
01542
01543
01544
01545
01546
01547
01548
01549
01550
01551
01552
01553
01554
01555
01556
01557
01558 double lmtable::lprob(ngram ong,double* bow, int* bol, char** maxsuffptr,unsigned int* statesize){
01559
01560 if (ong.size==0) return 0.0;
01561 if (ong.size>maxlev) ong.size=maxlev;
01562
01563 if (bow) *bow=0;
01564 if (bol) *bol=0;
01565
01566
01567 double rbow=0,lpr=0;
01568 float ibow,iprob;
01569
01570
01571 if (isInverted){
01572 ngram ing=ong;
01573
01574 ing.invert(ong);
01575
01576 get(ing,ing.size,ing.size);
01577 if (ing.lev >0){
01578 iprob=ing.prob;
01579 lpr = (double)(isQtable?Pcenters[ing.size][(qfloat_t)iprob]:iprob);
01580 if (*ong.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty;
01581 if (statesize) *statesize=MIN(ing.lev,(ing.size-1));
01582 if (maxsuffptr) *maxsuffptr=ing.path[MIN(ing.lev,(ing.size-1))];
01583 }else{
01584 lpr=-log(UNIGRAM_RESOLUTION)/M_LN10;
01585 if (statesize) *statesize=0;
01586 if (maxsuffptr) *maxsuffptr=NULL;
01587 }
01588
01589 if (ing.lev < ing.size){
01590 int depth=(ing.lev>0?ing.lev:1);
01591 if (bol) *bol=ing.size-depth;
01592 ing.size--;
01593 get(ing,ing.size,ing.size);
01594 if (ing.lev>0){
01595
01596 for (int l=depth;l<=ing.lev;l++){
01597
01598 assert(ing.path[l]!=NULL);
01599 ibow=this->bow(ing.path[l],tbltype[l]);
01600 rbow+= (double) (isQtable?Bcenters[l][(qfloat_t)ibow]:ibow);
01601
01602 if (isQtable && (*ing.wordp(1)==dict->oovcode()))
01603 rbow-=(double)Bcenters[l][(qfloat_t)ibow];
01604 }
01605 }
01606 }
01607
01608 if (bow) (*bow)=rbow;
01609 return rbow + lpr;
01610 }
01611 else{
01612
01613 for (ngram ng=ong;ng.size>0;ng.size--){
01614
01615 if (get(ng,ng.size,ng.size)){
01616 iprob=ng.prob;
01617 lpr = (double)(isQtable?Pcenters[ng.size][(qfloat_t)iprob]:iprob);
01618 if (*ng.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty;
01619 if (maxsuffptr || statesize){
01620 if (ong.size==ng.size){
01621 ng.size--;
01622 get(ng,ng.size,ng.size);
01623 }
01624 if (statesize) *statesize=ng.size;
01625 if (maxsuffptr) *maxsuffptr=ng.link;
01626 }
01627 return rbow+lpr;
01628 }else{
01629 if (ng.size==1){
01630 if (maxsuffptr) *maxsuffptr=NULL;
01631 if (statesize) *statesize=0;
01632 return rbow -log(UNIGRAM_RESOLUTION)/M_LN10;
01633 }
01634 else{
01635 if (bol) (*bol)++;
01636 if (ng.lev==(ng.size-1)){
01637 ibow=ng.bow;
01638 rbow+= (double) (isQtable?Bcenters[ng.lev][(qfloat_t)ibow]:ibow);
01639
01640 if (isQtable && (*ng.wordp(2)==dict->oovcode()))
01641 rbow-=(double)Bcenters[ng.lev][(qfloat_t)ibow];
01642 }
01643 if (bow) (*bow)=rbow;
01644 }
01645
01646 }
01647
01648 }
01649 }
01650 assert(0);
01651 return 1.0;
01652 }
01653
01654
01655
01656 double lmtable::clprob(ngram ong){
01657
01658 if (ong.size==0) return 0.0;
01659
01660 if (ong.size>maxlev) ong.size=maxlev;
01661
01662 double logpr;
01663
01664 #ifdef TRACE_CACHE
01665 if (probcache && ong.size==maxlev && sentence_id>0){
01666 *cacheout << sentence_id << " " << ong << "\n";
01667 }
01668 #endif
01669
01670
01671 if (probcache && ong.size==maxlev && probcache->get(ong.wordp(maxlev),(char *)&logpr)){
01672 return logpr;
01673 }
01674
01675
01676
01677 logpr=lmtable::lprob(ong);
01678
01679 if (probcache && ong.size==maxlev){
01680 probcache->add(ong.wordp(maxlev),(char *)&logpr);
01681 };
01682
01683 return logpr;
01684
01685 };
01686
01687
01688
01689 void lmtable::stat(int level){
01690 table_pos_t totmem=0,memory;
01691 float mega=1024 * 1024;
01692
01693 cout.precision(2);
01694
01695 cout << "lmtable class statistics\n";
01696
01697 cout << "levels " << maxlev << "\n";
01698 for (int l=1;l<=maxlev;l++){
01699 memory=(table_pos_t) cursize[l] * nodesize(tbltype[l]);
01700 cout << "lev " << l
01701 << " entries "<< cursize[l]
01702 << " used mem " << memory/mega << "Mb\n";
01703 totmem+=memory;
01704 }
01705
01706 cout << "total allocated mem " << totmem/mega << "Mb\n";
01707
01708 cout << "total number of get and binary search calls\n";
01709 for (int l=1;l<=maxlev;l++){
01710 cout << "level " << l << " get: " << totget[l] << " bsearch: " << totbsearch[l] << "\n";
01711 }
01712
01713 if (level >1 ) lmtable::getDict()->stat();
01714
01715 }
01716
01717 void lmtable::reset_mmap(){
01718 #ifndef WIN32
01719 if (memmap>0 and memmap<=maxlev)
01720 for (int l=memmap;l<=maxlev;l++){
01721
01722 Munmap(table[l]-tableGaps[l],(table_pos_t) cursize[l]*nodesize(tbltype[l])+tableGaps[l],0);
01723 table[l]=(char *)MMap(diskid,PROT_READ,
01724 tableOffs[l], (table_pos_t)cursize[l]*nodesize(tbltype[l]),
01725 &tableGaps[l]);
01726 table[l]+=(table_pos_t)tableGaps[l];
01727 }
01728 #endif
01729 }
01730
01731
01732
01733
01734
01735
01736
01737 double lmtable::lprobx(ngram ong,
01738 double *lkp,
01739 double *bop,
01740 int *bol)
01741 {
01742 double bo, lbo, pr;
01743 float ipr;
01744
01745 ngram ng(dict), ctx(dict);
01746
01747 if(bol) *bol=0;
01748 if(ong.size==0) {
01749 if(lkp) *lkp=0;
01750 return 0;
01751 }
01752 if(ong.size>maxlev) ong.size=maxlev;
01753 ctx = ng = ong;
01754 bo=0;
01755 ctx.shift();
01756 while(!get(ng)) {
01757
01758
01759 if(ng.size==1) {
01760 pr = -log(UNIGRAM_RESOLUTION)/M_LN10;
01761 if(lkp) *lkp=pr;
01762 pr += bo;
01763 return pr;
01764 }
01765
01766 lbo = 0.0;
01767 if(get(ctx)){
01768 ipr = ctx.bow;
01769 lbo = isQtable?Bcenters[ng.size][(qfloat_t)ipr]:ipr;
01770
01771 }
01772 if(bop) *bop++=lbo;
01773 if(bol) ++*bol;
01774 bo += lbo;
01775 ng.size--;
01776 ctx.size--;
01777 }
01778 ipr = ng.prob;
01779 pr = isQtable?Pcenters[ng.size][(qfloat_t)ipr]:ipr;
01780
01781 if(lkp) *lkp=pr;
01782 pr += bo;
01783 return pr;
01784 }
01785
01786
01787
01788 table_entry_pos_t lmtable::wdprune(float *thr,
01789 int aflag)
01790 {
01791 int l;
01792 ngram ng(lmtable::getDict(),0);
01793
01794 isPruned=true;
01795
01796 ng.size=0;
01797 for(l=2; l<=maxlev; l++) wdprune(thr, aflag, ng, 1, l, 0, cursize[1]);
01798 return 0;
01799 }
01800
01801
01802
01803 table_entry_pos_t lmtable::wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos, double tlk,
01804 double bo, double *ts, double *tbs)
01805 {
01806 LMT_TYPE ndt=tbltype[ilev];
01807 int ndsz=nodesize(ndt);
01808 char *ndp;
01809 float lk;
01810 float ipr, ibo;
01811
01812 table_entry_pos_t i, k, nk;
01813
01814 assert(ng.size==ilev-1);
01815 assert(ipos>=0 && epos<=cursize[ilev] && ipos<epos);
01816
01817 ng.pushc(0);
01818
01819 for(i=ipos, nk=0; i<epos; i++) {
01820
01821
01822 ndp = table[ilev]+(table_pos_t)i*ndsz;
01823 *ng.wordp(1) = word(ndp);
01824
01825
01826 ipr = prob(ndp, ndt);
01827 if(ipr==NOPROB) continue;
01828 lk = ipr;
01829
01830
01831 if(ilev<elev) {
01832
01833
01834 ibo = bow(ndp, ndt);
01835 bo = ibo;
01836
01837
01838
01839 table_entry_pos_t isucc = i>0 ? bound(ndp-ndsz, ndt) : 0;
01840 table_entry_pos_t esucc = bound(ndp, ndt);
01841 if(isucc>=esucc) continue;
01842
01843
01844
01845 prune: double ts=0, tbs=0;
01846 k = wdprune(thr, aflag, ng, ilev+1, elev, isucc, esucc,
01847 tlk+lk, bo, &ts, &tbs);
01848
01849 if(ilev!=elev-1) continue;
01850 if(ts>=1 || tbs>=1) {
01851 cerr << "ng: " << ng
01852 <<" ts=" << ts
01853 <<" tbs=" << tbs
01854 <<" k=" << k
01855 <<" ns=" << esucc-isucc
01856 << "\n";
01857 if(ts>=1) {
01858 pscale(ilev+1, isucc, esucc,
01859 0.999999/ts);
01860 goto prune;
01861 }
01862 }
01863
01864
01865 bo = log((1-ts)/(1-tbs))/M_LN10;
01867 ibo=(float)bo;
01868
01869 bow(ndp, ndt, ibo);
01870 } else {
01871
01872
01873 ngram bng = ng; --bng.size;
01874 double blk = lprob(bng);
01875
01876 double wd = pow(10., tlk+lk) * (lk-bo-blk);
01877 if(aflag&&wd<0) wd=-wd;
01878 if(wd > thr[elev-1]) {
01879 *ts += pow(10., lk);
01880 *tbs += pow(10., blk);
01881 } else {
01882 ++nk;
01883 prob(ndp, ndt, NOPROB);
01884 }
01885 }
01886 }
01887 return nk;
01888 }
01889
01890 int lmtable::pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s)
01891 {
01892 LMT_TYPE ndt=tbltype[lev];
01893 int ndsz=nodesize(ndt);
01894 char *ndp;
01895 float ipr;
01896
01897
01898 s=log(s)/M_LN10;
01899 ndp = table[lev]+ (table_pos_t) ipos*ndsz;
01900 for(table_entry_pos_t i=ipos; i<epos; ndp+=ndsz,i++) {
01901 ipr = prob(ndp, ndt);
01902 if(ipr==NOPROB) continue;
01904 ipr+=(float) s;
01905
01906 prob(ndp, ndt, ipr);
01907 }
01908 return 0;
01909 }
01910
01911
01912 table_entry_pos_t lmtable::ngcnt(table_entry_pos_t *cnt)
01913 {
01914 ngram ng(lmtable::getDict(),0);
01915 memset(cnt, 0, (maxlev+1)*sizeof(*cnt));
01916 ngcnt(cnt, ng, 1, 0, cursize[1]);
01917 return 0;
01918 }
01919
01920
01921 table_entry_pos_t lmtable::ngcnt(table_entry_pos_t *cnt, ngram ng, int l, table_entry_pos_t ipos, table_entry_pos_t epos){
01922
01923 table_entry_pos_t i, isucc, esucc;
01924 float ipr;
01925
01926 char *ndp;
01927 LMT_TYPE ndt=tbltype[l];
01928 int ndsz=nodesize(ndt);
01929
01930 ng.pushc(0);
01931 for(i=ipos; i<epos; i++) {
01932 ndp = table[l]+(table_pos_t) i*ndsz;
01933 *ng.wordp(1)=word(ndp);
01934 ipr=prob(ndp, ndt);
01935 if(ipr==NOPROB) continue;
01936 ++cnt[l];
01937 if(l==maxlev) continue;
01938 isucc = (i>0)?bound(ndp-ndsz, ndt):0;
01939 esucc = bound(ndp, ndt);
01940 if(isucc < esucc) ngcnt(cnt, ng, l+1, isucc, esucc);
01941 }
01942 return 0;
01943 }
01944
01945
01946
01947