00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <cmath>
00022 #include <string>
00023 #include <assert.h>
00024 #include "util.h"
00025 #include "mfstream.h"
00026 #include "mempool.h"
00027 #include "htable.h"
00028 #include "dictionary.h"
00029 #include "n_gram.h"
00030 #include "mempool.h"
00031 #include "ngramcache.h"
00032 #include "ngramtable.h"
00033 #include "normcache.h"
00034 #include "interplm.h"
00035 #include "mdiadapt.h"
00036 #include "shiftlm.h"
00037 #include "lmtable.h"
00038
00039 using namespace std;
00040
00041
00042
00043
00044 mdiadaptlm::mdiadaptlm(char* ngtfile,int depth,TABLETYPE tbtype):
00045 interplm(ngtfile,depth,tbtype)
00046 {
00047 adaptlev=0;
00048 forelm=NULL;
00049 cache=NULL;
00050 m_save_per_level=true;
00051 };
00052
00053 mdiadaptlm::~mdiadaptlm()
00054 {
00055 if (cache) delete cache;
00056 delete_caches();
00057 };
00058
00059 void mdiadaptlm::delete_caches(int level)
00060 {
00061 if (probcache[level]) delete probcache[level];
00062 if (backoffcache[level]) delete backoffcache[level];
00063 };
00064
00065 void mdiadaptlm::delete_caches()
00066 {
00067 #ifdef MDIADAPTLM_CACHE_ENABLE
00068 for (int i=0; i<=max_caching_level; i++) delete_caches(i);
00069
00070 delete [] probcache;
00071 delete [] backoffcache;
00072 #endif
00073 };
00074
00075 void mdiadaptlm::caches_stat()
00076 {
00077 #ifdef MDIADAPTLM_CACHE_ENABLE
00078 for (int i=1; i<=max_caching_level; i++) {
00079 if (probcache[i]) {
00080 cerr << "Statistics of probcache at level " << i << " (of " << lmsize() << ") ";
00081 probcache[i]->stat();
00082 }
00083 if (backoffcache[i]) {
00084 cerr << "Statistics of backoffcache at level " << i << " (of " << lmsize() << ") ";
00085 backoffcache[i]->stat();
00086 }
00087 }
00088 #endif
00089 };
00090
00091
00092 void mdiadaptlm::create_caches(int mcl)
00093 {
00094 max_caching_level=(mcl>=0 && mcl<lmsize())?mcl:lmsize()-1;
00095
00096 probcache = new NGRAMCACHE_t*[max_caching_level+1];
00097 backoffcache = new NGRAMCACHE_t*[max_caching_level+1];
00098 for (int i=0; i<=max_caching_level; i++) {
00099 probcache[i]=NULL;
00100 backoffcache[i]=NULL;
00101 }
00102
00103 init_caches();
00104 }
00105
00106
00107 void mdiadaptlm::init_caches(int level)
00108 {
00109 assert(probcache[level]==NULL);
00110 assert(backoffcache[level]==NULL);
00111 probcache[level]=new NGRAMCACHE_t(level,sizeof(double),400000);
00112 backoffcache[level]=new NGRAMCACHE_t(level,sizeof(double),400000);
00113 };
00114
00115 void mdiadaptlm::init_caches()
00116 {
00117 #ifdef MDIADAPTLM_CACHE_ENABLE
00118 for (int i=1; i<=max_caching_level; i++) init_caches(i);
00119 #endif
00120 };
00121
00122 void mdiadaptlm::check_cache_levels(int level)
00123 {
00124 if (probcache[level] && probcache[level]->isfull()) probcache[level]->reset(probcache[level]->cursize());
00125 if (backoffcache[level] && backoffcache[level]->isfull()) backoffcache[level]->reset(backoffcache[level]->cursize());
00126 };
00127
00128 void mdiadaptlm::check_cache_levels()
00129 {
00130 #ifdef MDIADAPTLM_CACHE_ENABLE
00131 for (int i=1; i<=max_caching_level; i++) check_cache_levels(i);
00132 #endif
00133 };
00134
00135 void mdiadaptlm::reset_caches(int level)
00136 {
00137 if (probcache[level]) probcache[level]->reset(MAX(probcache[level]->cursize(),probcache[level]->maxsize()));
00138 if (backoffcache[level]) backoffcache[level]->reset(MAX(backoffcache[level]->cursize(),backoffcache[level]->maxsize()));
00139 };
00140
00141 void mdiadaptlm::reset_caches()
00142 {
00143 #ifdef MDIADAPTLM_CACHE_ENABLE
00144 for (int i=1; i<=max_caching_level; i++) reset_caches(i);
00145 #endif
00146 };
00147
00148
00149 inline NGRAMCACHE_t* mdiadaptlm::get_probcache(int level)
00150 {
00151 return probcache[level];
00152 }
00153
00154 inline NGRAMCACHE_t* mdiadaptlm::get_backoffcache(int level)
00155 {
00156 return backoffcache[level];
00157 }
00158
00159 int mdiadaptlm::scalefact(char *ngtfile)
00160 {
00161 if (forelm!=NULL) delete forelm;
00162 if (cache!=NULL) delete cache;
00163 cache=new normcache(dict);
00164
00165 forelm=new shiftbeta(ngtfile,1);
00166 forelm->train();
00167
00168
00169 ngram fng(forelm->dict,1);
00170 ngram ng(dict,1);
00171 int* w=fng.wordp(1);
00172
00173 oovscaling=1.0;
00174 for ((*w)=0; (*w)<forelm->dict->size(); (*w)++)
00175 if ((*w) != forelm->dict->oovcode()) {
00176 ng.trans(fng);
00177 if (*ng.wordp(1)==dict->oovcode()) {
00178 cerr << "adaptation file contains new words: use -ao=yes option\n";
00179 exit(1);
00180 }
00181
00182 oovscaling-=backunig(ng);
00183 }
00184 *w=forelm->dict->oovcode();
00185 oovscaling=foreunig(fng)/oovscaling;
00186
00187 return 1;
00188 };
00189
00190 int mdiadaptlm::savescalefactor(char* filename)
00191 {
00192
00193 ngram ng(dict,1);
00194 int* w=ng.wordp(1);
00195
00196 mfstream out(filename,ios::out);
00197
00198 out << "\n\\data\\" << "\nngram 1=" << dict->size() << "\n\n1grams:\n";
00199
00200 for ((*w)=0; (*w)<dict->size(); (*w)++) {
00201 double ratio=scalefact(ng);
00202 out << (float) (ratio?log10(ratio):-99);
00203 if (*w==dict->oovcode())
00204 out << "\t" << "<unk>\n";
00205 else
00206 out << "\t" << (char *)dict->decode(*w) << "\n";
00207
00208 }
00209 out << "\\end\\\n";
00210
00211 return 1;
00212 }
00213
00214 double mdiadaptlm::scalefact(ngram ng)
00215 {
00216 ngram fng(forelm->dict,1);
00217 fng.trans(ng);
00218 if (*fng.wordp(1)==forelm->dict->oovcode())
00219 return pow(oovscaling,gis_step);
00220 else {
00221 double prback=backunig(ng);
00222 double prfore=foreunig(ng);
00223 return pow(prfore/prback,gis_step);
00224 }
00225 }
00226
00227
00228 double mdiadaptlm::foreunig(ngram ng)
00229 {
00230
00231 double fstar,lambda;
00232
00233 forelm->discount(ng,1,fstar,lambda);
00234
00235 return fstar;
00236 }
00237
00238 double mdiadaptlm::backunig(ngram ng)
00239 {
00240
00241 double fstar,lambda;
00242
00243 discount(ng,1,fstar,lambda,0);
00244
00245 return fstar;
00246 };
00247
00248
00249
00250 int mdiadaptlm::adapt(char* ngtfile,int alev,double step)
00251 {
00252
00253 if (alev > lmsize() || alev<=0) {
00254 cerr << "setting adaptation level to " << lmsize() << "\n";
00255 alev=lmsize();
00256 }
00257 adaptlev=alev;
00258
00259
00260 cerr << "adapt ....";
00261 gis_step=step;
00262
00263 if (ngtfile==NULL) {
00264 cerr << "adaptation file is missing\n";
00265 exit(1);
00266 }
00267
00268
00269
00270 scalefact(ngtfile);
00271
00272
00273 ngram ng(dict,2);
00274 int* w=ng.wordp(1);
00275
00276 cerr << "precomputing 1-gram normalization ...\n";
00277 zeta0=0;
00278 for ((*w)=0; (*w)<dict->size(); (*w)++)
00279 zeta0+=scalefact(ng) * backunig(ng);
00280
00281 if (alev==1) return 1 ;
00282
00283 cerr << "precomputing 2-gram normalization:\n";
00284
00285
00286 w=ng.wordp(2);
00287 *ng.wordp(1)=0;
00288
00289 for ((*w)=0; (*w)<dict->size(); (*w)++) {
00290 zeta(ng,2);
00291 if ((*w % 1000)==0) cerr << ".";
00292 }
00293
00294 cerr << "done\n";
00295
00296 return 1;
00297 };
00298
00299
00300 double mdiadaptlm::zeta(ngram ng,int size)
00301 {
00302
00303 assert(size>=1);
00304
00305 double z=0;
00306
00307 ng.size=size;
00308
00309 if (size==1) return zeta0;
00310 else {
00311
00312
00313 if (size <=3 && cache->get(ng,size,z)) return z;
00314
00315 double fstar,lambda;
00316 ngram histo=ng;
00317 int succ=0;
00318
00319 discount(ng,size,fstar,lambda,(int)0);
00320
00321 if ((lambda<1) && get(histo,size,size-1)) {
00322 ;
00323
00324
00325 succ=0;
00326
00327 succscan(histo,ng,INIT,size);
00328 while(succscan(histo,ng,CONT,size)) {
00329
00330 discount(ng,size,fstar,lambda,0);
00331 if (fstar>0) {
00332 z+=(scalefact(ng) * fstar);
00333 succ++;
00334
00335 }
00336 }
00337 }
00338
00339 z+=lambda*zeta(ng,size-1);
00340
00341 if (size<=3 && succ>1) cache->put(ng,size,z);
00342
00343 return z;
00344 }
00345
00346 }
00347
00348
00349 int mdiadaptlm::discount(ngram ng_,int size,double& fstar,double& lambda,int )
00350 {
00351
00352 ngram ng(dict);
00353 ng.trans(ng_);
00354
00355 double __fstar, __lambda;
00356 bool lambda_cached=0;
00357 int size_lambda=size-1;
00358
00359 ngram histo=ng;
00360 histo.shift();
00361
00362 if (size_lambda>0 && histo.size>=size_lambda) {
00363 #ifdef MDIADAPTLM_CACHE_ENABLE
00364 if (size_lambda<=max_caching_level) {
00365
00366 if (backoffcache[size_lambda] && backoffcache[size_lambda]->get(histo.wordp(size_lambda),__lambda))
00367 lambda_cached=1;
00368 }
00369 #endif
00370 }
00371
00372 discount(ng,size,__fstar,__lambda,0);
00373
00374 if ((size>0) && (size<=adaptlev) && (__lambda<1)) {
00375
00376 if (size>1) {
00377 double numlambda, numfstar, den;
00378 numfstar=scalefact(ng);
00379 den=zeta(ng,size);
00380 __fstar=__fstar * numfstar/den;
00381 if (!lambda_cached) {
00382 numlambda=zeta(ng,size-1);
00383 __lambda=__lambda * numlambda/den;
00384 }
00385 } else if (size==1) {
00386 double ratio;
00387 ratio=scalefact(ng)/zeta0;
00388 __fstar=__fstar * ratio;
00389 if (!lambda_cached) {
00390 __lambda=__lambda * ratio;
00391 }
00392 } else {
00393
00394 }
00395 }
00396
00397 #ifdef MDIADAPTLM_CACHE_ENABLE
00398
00399 if (!lambda_cached && size_lambda>0 && size_lambda<=max_caching_level && histo.size>=size_lambda && backoffcache[size_lambda])
00400 backoffcache[size_lambda]->add(histo.wordp(size_lambda),__lambda);
00401 #endif
00402
00403 lambda=__lambda;
00404 fstar=__fstar;
00405 return 1;
00406 }
00407
00408
00409 int mdiadaptlm::compute_backoff_per_level()
00410 {
00411
00412 double fstar,lambda;
00413
00414 this->backoff=1;
00415
00416 for (int size=1; size<lmsize(); size++) {
00417
00418 ngram hg(dict,size);
00419
00420 scan(hg,INIT,size);
00421
00422 while(scan(hg,CONT,size)) {
00423
00424 ngram ng=hg;
00425 ng.pushc(0);
00426
00427 double pr=1.0;
00428
00429 succscan(hg,ng,INIT,size+1);
00430 while(succscan(hg,ng,CONT,size+1)) {
00431
00432 mdiadaptlm::discount(ng,ng.size,fstar,lambda);
00433
00434 if (fstar>0){
00435 ng.size=ng.size-1;
00436 pr -= mdiadaptlm::prob(ng,size);
00437 }
00438 }
00439
00440 assert(pr>0 && pr<=1);
00441
00442 boff(hg.link,pr);
00443 }
00444
00445 }
00446
00447 cerr << "done\n";
00448
00449 return 1;
00450 }
00451
00452
00453 int mdiadaptlm::compute_backoff_per_word()
00454 {
00455 cerr << "Current implementation does not support the usage of backoff (-bo=yes) mixture models (-lm=mix) combined with the per-word saving (-saveperllevel=no)." << endl;
00456 cerr << "Please, either choose a per-level saving (-saveperllevel=yes) or do not use backoff (-bo=no) " << endl;
00457
00458 exit(1);
00459 }
00460
00461
00462 double mdiadaptlm::prob2(ngram ng,int size,double& fstar)
00463 {
00464
00465 double lambda;
00466
00467 mdiadaptlm::discount(ng,size,fstar,lambda);
00468
00469 if (size>1)
00470 return fstar + lambda * prob(ng,size-1);
00471 else
00472 return fstar;
00473 }
00474
00475
00476
00477 double mdiadaptlm::prob(ngram ng,int size)
00478 {
00479 double fstar,lambda,bo;
00480 return prob(ng,size,fstar,lambda,bo);
00481 }
00482
00483 double mdiadaptlm::prob(ngram ng,int size,double& fstar,double& lambda, double& bo)
00484 {
00485 double pr;
00486
00487 #ifdef MDIADAPTLM_CACHE_ENABLE
00488
00489 if (size<=max_caching_level && probcache[size] && ng.size>=size && probcache[size]->get(ng.wordp(size),pr))
00490 return pr;
00491 #endif
00492
00493
00494 mdiadaptlm::bodiscount(ng,size,fstar,lambda,bo);
00495
00496 if (fstar>UPPER_SINGLE_PRECISION_OF_1 || lambda>UPPER_SINGLE_PRECISION_OF_1) {
00497 cerr << "wrong probability: " << ng
00498 << " , size " << size
00499 << " , fstar " << fstar
00500 << " , lambda " << lambda << "\n";
00501 exit(1);
00502 }
00503 if (backoff) {
00504
00505 if (size>1) {
00506 if (fstar>0){
00507 pr=fstar;
00508 }else {
00509 if (lambda<1){
00510 pr = lambda/bo * prob(ng,size-1);
00511 }else {
00512 assert(lambda<UPPER_SINGLE_PRECISION_OF_1);
00513 pr = prob(ng,size-1);
00514 }
00515 }
00516 } else
00517 pr = fstar;
00518 }
00519
00520 else {
00521
00522 if (size>1)
00523 pr = fstar + lambda * prob(ng,size-1);
00524 else
00525 pr = fstar;
00526 }
00527
00528 #ifdef MDIADAPTLM_CACHE_ENABLE
00529
00530 if (size<=max_caching_level && probcache[size] && ng.size>=size)
00531 probcache[size]->add(ng.wordp(size),pr);
00532 #endif
00533
00534 return pr;
00535 }
00536
00537
00538 int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo)
00539 {
00540 ngram ng(dict);
00541 ng.trans(ng_);
00542
00543 mdiadaptlm::discount(ng,size,fstar,lambda);
00544
00545 bo=1.0;
00546
00547 if (backoff) {
00548
00549 if (size>1 && lambda<1) {
00550
00551 ngram hg=ng;
00552
00553
00554 if (! get(hg,size,size-1)){
00555 cerr << "ERROR: int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo) -> get(hg,size,size-1) returns NULL\n";
00556 }
00557 assert(get(hg,size,size-1));
00558
00559 bo=boff(hg.link);
00560
00561
00562
00563
00564
00565 }
00566 }
00567
00568 return 1;
00569 }
00570
00571
00572 double mdiadaptlm::txclprob(ngram ng,int size)
00573 {
00574
00575 double fstar,lambda;
00576
00577 if (size>1) {
00578 mdiadaptlm::discount(ng,size,fstar,lambda);
00579 return fstar + lambda * txclprob(ng,size-1);
00580 } else {
00581 double freq=1;
00582 if ((*ng.wordp(1)!=dict->oovcode()) && get(ng,1,1))
00583 freq+=ng.freq;
00584
00585 double N=totfreq()+dict->dub()-dict->size();
00586 return freq/N;
00587 }
00588 }
00589
00590
00591 int mdiadaptlm::netsize()
00592 {
00593 double fstar,lambda;
00594 int size,totsize;
00595 ngram ng(dict);
00596
00597 cerr << "Computing LM size:\n";
00598
00599 totsize=dict->size() * 2;
00600
00601 cout << "1-gram " << totsize << "\n";
00602
00603 for (int i=2; i<=maxlevel(); i++) {
00604
00605 size=0;
00606
00607 scan(ng,INIT,i);
00608
00609 while (scan(ng,CONT,i)) {
00610
00611 mdiadaptlm::discount(ng,i,fstar,lambda);
00612
00613 if (fstar>0) size++;
00614
00615 }
00616
00617 size+=size * (i<maxlevel());
00618
00619 totsize+=size;
00620
00621 cout << i << "-gram " << totsize << "\n";
00622
00623 }
00624
00625 return totsize;
00626 }
00627
00628
00629
00630
00631
00632
00633
00634
00635
00636
00637
00638
00639
00640
00641
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662
00663
00664
00665
00666
00667
00668
00669
00670
00671
00672
00673
00674
00675
// Reverses, in place, the byte order of each of `n` consecutive elements of
// `sz` bytes starting at `p`.
//
// Does nothing when n < 1 or sz < 2. Always returns 0.
int swapbytes(char *p, int sz, int n)
{
  if (n<1 || sz<2) {
    return 0;
  }

  char* element=p;
  for (int k=0; k<n; ++k) {
    char* low=element;
    char* high=element+sz-1;

    // Swap the outermost pair and walk inwards.
    while (low<high) {
      const char tmp=*high;
      *high=*low;
      *low=tmp;
      ++low;
      --high;
    }

    element+=sz;
  }

  return 0;
}
00687
00688 void fwritex(char *p,int sz,int n,FILE* f)
00689 {
00690
00691 if(*(short *)"AB"==0x4241) {
00692 swapbytes((char*)p, sz,n);
00693 }
00694
00695 fwrite((char *)p,sz,n,f);
00696
00697 if(*(short *)"AB"==0x4241) swapbytes((char*)p, sz,n);
00698
00699 }
00700
00701 void ifwrite(long loc,void *ptr,int size,int ,FILE* f)
00702 {
00703 fflush(f);
00704
00705 long pos=ftell(f);
00706
00707 fseek(f,loc,SEEK_SET);
00708
00709 fwritex((char *)ptr,size,1,f);
00710
00711 fseek(f,pos,SEEK_SET);
00712
00713 fflush(f);
00714 }
00715
00716 void writeNull(unsigned short nullCode,float nullProb,FILE* f)
00717 {
00718 fwritex((char *)&nullCode,sizeof(short),1,f);
00719 fwritex((char *)&nullProb,sizeof(float),1,f);
00720 }
00721
00722
00723 int mdiadaptlm::saveASR(char *filename,int ,char* subdictfile)
00724 {
00725 int totbg,tottr;
00726
00727 dictionary* subdict;
00728
00729 if (subdictfile)
00730 subdict=new dictionary(subdictfile);
00731 else
00732 subdict=dict;
00733
00734 typedef unsigned short code;
00735
00736 system("date");
00737
00738 if (lmsize()>3 || lmsize()<1) {
00739 cerr << "wrong lmsize\n";
00740 exit(1);
00741 }
00742
00743 if (dict->size()>=0xffff && subdict->size()>=0xffff) {
00744 cerr << "save bin requires unsigned short codes\n";
00745 exit(1);
00746 }
00747
00748 FILE* f=fopen(filename,"w");
00749
00750 double fstar,lambda,boff;
00751 float pr;
00752 long succ1pos,succ2pos;
00753 code succ1,succ2,w,h1,h2;
00754 code stop=0xffff;
00755
00756
00757
00758
00759 code oovcode=subdict->oovcode();
00760
00761
00762 code subdictsz=subdict->size()+1;
00763
00764 fwritex((char *)&subdictsz,sizeof(code),1,f);
00765
00766 subdictsz--;
00767 for (w=0; w<subdictsz; w++)
00768 fprintf(f,"%s\n",(char *)subdict->decode(w));
00769
00770 fprintf(f,"____\n");
00771
00772
00773
00774
00775 h1=subdictsz;
00776 fwritex((char *)&h1,sizeof(code),1,f);
00777
00778 succ1=0;
00779 succ1pos=ftell(f);
00780 fwritex((char *)&succ1,sizeof(code),1,f);
00781
00782 ngram ng(dict);
00783 ngram sng(subdict);
00784
00785 ng.size=sng.size=1;
00786
00787 scan(ng,INIT,1);
00788 while(scan(ng,CONT,1)) {
00789 sng.trans(ng);
00790 if (sng.containsWord(subdict->OOV(),1))
00791 continue;
00792
00793 pr=(float)mdiadaptlm::prob(ng,1);
00794 if (pr>1e-50) {
00795 succ1++;
00796 w=*sng.wordp(1);
00797 fwritex((char *)&w,sizeof(code),1,f);
00798 fwritex((char *)&pr,sizeof(float),1,f);
00799 } else {
00800 cerr << "small prob word " << ng << "\n";
00801 }
00802 }
00803
00804
00805 ifwrite(succ1pos,&succ1,sizeof(code),1,f);
00806
00807 cerr << "finito unigrammi " << succ1 << "\n";
00808 fflush(f);
00809
00810 if (lmsize()==1) {
00811 fclose(f);
00812 return 1;
00813 }
00814
00815
00816
00817
00818 succ1=0;
00819 h1=subdictsz;
00820 totbg=subdictsz;
00821
00822 ngram hg1(dict,1);
00823
00824 ng.size=sng.size=2;
00825
00826 scan(hg1,INIT,1);
00827 while(scan(hg1,CONT,1)) {
00828
00829 if (hg1.containsWord(dict->OOV(),1)) continue;
00830
00831 assert((*hg1.wordp(1))<dict->size());
00832
00833 *ng.wordp(2)=*hg1.wordp(1);
00834 *ng.wordp(1)=0;
00835
00836 sng.trans(ng);
00837 if (sng.containsWord(dict->OOV(),1)) continue;
00838
00839 mdiadaptlm::bodiscount(ng,2,fstar,lambda,boff);
00840
00841 if (lambda < 1.0) {
00842
00843 h1=*sng.wordp(2);
00844
00845 fwritex((char *)&h1,sizeof(code),1,f);
00846
00847 succ1=0;
00848 succ1pos=ftell(f);
00849 fwritex((char *)&succ1,sizeof(code),1,f);
00850
00851 ngram shg=hg1;
00852 get(shg,1,1);
00853
00854 succscan(shg,ng,INIT,2);
00855 while(succscan(shg,ng,CONT,2)) {
00856
00857 if (*ng.wordp(1)==oovcode) continue;
00858
00859 sng.trans(ng);
00860 if (sng.containsWord(dict->OOV(),2)) continue;
00861
00862 mdiadaptlm::discount(ng,2,fstar,lambda);
00863
00864 if (fstar>1e-50) {
00865 w=*sng.wordp(1);
00866 fwritex((char *)&w,sizeof(code),1,f);
00867 pr=(float)mdiadaptlm::prob(ng,2);
00868
00869
00870 fwritex((char *)&pr,sizeof(float),1,f);
00871 succ1++;
00872 }
00873 }
00874
00875 if (succ1) {
00876 lambda/=boff;
00877 writeNull(subdictsz,(float)lambda,f);
00878 succ1++;
00879 totbg+=succ1;
00880 ifwrite(succ1pos,&succ1,sizeof(code),1,f);
00881 } else {
00882
00883 fseek(f,succ1pos-(streampos)sizeof(code),SEEK_SET);
00884 }
00885 }
00886 }
00887
00888 fwritex((char *)&stop,sizeof(code),1,f);
00889
00890 cerr << " finito bigrammi! " << subdictsz << "\n";
00891 fflush(f);
00892
00893 system("date");
00894
00895 if (lmsize()<3) {
00896 fclose(f);
00897 return 1;
00898 }
00899
00900
00901
00902 h1=subdictsz;
00903 h2=subdictsz;
00904 tottr=0;
00905 succ1=0;
00906 succ2=0;
00907
00908 ngram hg2(dict,2);
00909
00910 ng.size=sng.size=3;
00911
00912 scan(hg1,INIT,1);
00913 while(scan(hg1,CONT,1)) {
00914
00915 if ((*hg1.wordp(1)==oovcode)) continue;
00916
00917 *ng.wordp(3)=*hg1.wordp(1);
00918
00919 sng.trans(ng);
00920 if (sng.containsWord(dict->OOV(),1)) continue;
00921
00922 assert((*sng.wordp(3))<subdictsz);
00923
00924 h1=*sng.wordp(3);
00925 fwritex((char *)&h1,sizeof(code),1,f);
00926
00927 succ1=0;
00928 succ1pos=ftell(f);
00929 fwritex((char *)&succ1,sizeof(code),1,f);
00930
00931 ngram shg1=ng;
00932 get(shg1,3,1);
00933
00934 succscan(shg1,hg2,INIT,2);
00935 while(succscan(shg1,hg2,CONT,2)) {
00936
00937 if (*hg2.wordp(1)==oovcode) continue;
00938
00939 *ng.wordp(2)=*hg2.wordp(1);
00940 *ng.wordp(1)=0;
00941
00942 sng.trans(ng);
00943 if (sng.containsWord(dict->OOV(),2)) continue;
00944
00945 mdiadaptlm::bodiscount(ng,3,fstar,lambda,boff);
00946
00947 if (lambda < 1.0) {
00948
00949 h2=*sng.wordp(2);
00950 fwritex((char *)&h2,sizeof(code),1,f);
00951
00952 succ2=0;
00953 succ2pos=ftell(f);
00954 fwritex((char *)&succ2,sizeof(code),1,f);
00955
00956 ngram shg2=ng;
00957 get(shg2,3,2);
00958
00959 succscan(shg2,ng,INIT,3);
00960 while(succscan(shg2,ng,CONT,3)) {
00961
00962 if (*ng.wordp(1)==oovcode) continue;
00963
00964 sng.trans(ng);
00965 if (sng.containsWord(dict->OOV(),3)) continue;
00966
00967 mdiadaptlm::discount(ng,3,fstar,lambda);
00968
00969
00970 if (fstar>1e-50) {
00971
00972 w=*sng.wordp(1);
00973 fwritex((char *)&w,sizeof(code),1,f);
00974
00975 pr=(float)mdiadaptlm::prob(ng,3);
00976
00977
00978 fwritex((char *)&pr,sizeof(float),1,f);
00979 succ2++;
00980 }
00981 }
00982
00983 if (succ2) {
00984 lambda/=boff;
00985 writeNull(subdictsz,(float)lambda,f);
00986 succ2++;
00987 tottr+=succ2;
00988 ifwrite(succ2pos,&succ2,sizeof(code),1,f);
00989 succ1++;
00990 } else {
00991
00992 fseek(f,succ2pos-(long)sizeof(code),SEEK_SET);
00993 }
00994 }
00995 }
00996
00997 if (succ1)
00998 ifwrite(succ1pos,&succ1,sizeof(code),1,f);
00999 else
01000 fseek(f,succ1pos-(long)sizeof(code),SEEK_SET);
01001 }
01002
01003 fwritex((char *)&stop,sizeof(code),1,f);
01004
01005 fclose(f);
01006
01007 cerr << "Tot bg: " << totbg << " tg: " << tottr<< "\n";
01008
01009 system("date");
01010
01011 return 1;
01012 };
01013
01014
01016
01017 int mdiadaptlm::saveMT(char *filename,int backoff,
01018 char* subdictfile,int resolution,double decay)
01019 {
01020
01021 double logalpha=log(decay);
01022 dictionary* subdict;
01023
01024 if (subdictfile)
01025 subdict=new dictionary(subdictfile);
01026 else
01027 subdict=dict;
01028
01029 ngram ng(dict,lmsize());
01030 ngram sng(subdict,lmsize());
01031
01032 cerr << "Adding unigram of OOV word if missing\n";
01033
01034 for (int i=1; i<=maxlevel(); i++)
01035 *ng.wordp(i)=dict->oovcode();
01036
01037 if (!get(ng,maxlevel(),1)) {
01038 cerr << "oov is missing in the ngram-table\n";
01039
01040 ng.freq=dict->freq(dict->oovcode());
01041 cerr << "adding oov unigram " << ng << "\n";
01042 put(ng);
01043 }
01044
01045 cerr << "Eventually adding OOV symbol to subdictionary\n";
01046 subdict->encode(OOV_);
01047
01048 system("date");
01049
01050 mfstream out(filename,ios::out);
01051
01052
01053
01054 subdict->incflag(1);
01055 int bo_code=subdict->encode(BACKOFF_);
01056 int du_code=subdict->encode(DUMMY_);
01057 subdict->incflag(0);
01058
01059 out << "nGrAm " << lmsize() << " " << 0
01060 << " " << "LM_ "
01061 << resolution << " "
01062 << decay << "\n";
01063
01064 subdict->save(out);
01065
01066
01067
01068 cerr << "write unigram of oov probability\n";
01069 ng.size=1;
01070 *ng.wordp(1)=dict->oovcode();
01071 double pr=(float)mdiadaptlm::prob(ng,1);
01072 sng.trans(ng);
01073 sng.size=lmsize();
01074 for (int s=2; s<=lmsize(); s++) *sng.wordp(s)=du_code;
01075 sng.freq=(int)ceil(pr * (double)10000000)-1;
01076 out << sng << "\n";
01077
01078 for (int i=1; i<=lmsize(); i++) {
01079 cerr << "LEVEL " << i << "\n";
01080
01081 double fstar,lambda,bo,dummy;
01082
01083 scan(ng,INIT,i);
01084 while(scan(ng,CONT,i)) {
01085
01086 sng.trans(ng);
01087
01088 sng.size=lmsize();
01089 for (int s=i+1; s<=lmsize(); s++)
01090 *sng.wordp(s)=du_code;
01091
01092 if (i>=1 && sng.containsWord(subdict->OOV(),sng.size)) {
01093 cerr << "skipping : " << sng << "\n";
01094 continue;
01095 }
01096
01097
01098
01099
01100
01101 mdiadaptlm::discount(ng,i,fstar,dummy);
01102
01103
01104
01105
01106
01107
01108 if (fstar>0) {
01109
01110 double pr=(float)mdiadaptlm::prob(ng,i);
01111
01112 if (i>1 && resolution<10000000) {
01113 sng.freq=resolution-(int)(log(pr)/logalpha)-1;
01114 sng.freq=(sng.freq>=0?sng.freq:0);
01115 } else
01116 sng.freq=(int)ceil(pr * (double)10000000)-1;
01117
01118 out << sng << "\n";
01119
01120 }
01121
01122 if (i<lmsize()) {
01123
01124 ngram ng2=ng;
01125 ng2.pushc(0);
01126 mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);
01127 assert(!backoff || (lambda ==1 || bo<1 ));
01128
01129 sng.pushc(bo_code);
01130 sng.size=lmsize();
01131
01132 if (lambda<1) {
01133 if (resolution<10000000) {
01134 sng.freq=resolution-(int)((log(lambda) - log(bo))/logalpha)-1;
01135 sng.freq=(sng.freq>=0?sng.freq:0);
01136 } else
01137 sng.freq=(int)ceil(lambda/bo * (double)10000000)-1;
01138
01139 out << sng << "\n";
01140 }
01141 }
01142 }
01143 cerr << "LEVEL " << i << "DONE \n";
01144 }
01145 return 1;
01146 };
01147
01149
01150 int mdiadaptlm::saveBIN_per_word(char *filename,int backoff,char* subdictfile,int mmap)
01151 {
01152 VERBOSE(2,"mdiadaptlm::saveBIN_per_word START\n");
01153 system("date");
01154
01155
01156 dictionary* subdict;
01157
01158
01159
01160
01161
01162
01163 if (subdictfile) subdict=new dictionary(subdictfile);
01164 else subdict=dict;
01165
01166 if (mmap) {
01167 VERBOSE(2,"savebin with memory map: " << filename << "\n");
01168 } else {
01169 VERBOSE(2,"savebin: " << filename << "\n");
01170 }
01171
01172
01173 streampos pos[lmsize()+1];
01174 int maxlev=lmsize();
01175 char buff[100];
01176 int isQuant=0;
01177
01178
01179 char tmpfilename[BUFSIZ];
01180
01181
01182 assert(strlen(filename)<1000);
01183 char tfilename[MAX_NGRAM][1000];
01184 mfstream *tout[MAX_NGRAM];
01185
01186 for (int i=1; i<=lmsize(); i++) {
01187 sprintf(tfilename[i],"%s-%dgrams",filename,i);
01188 tout[i]=new mfstream(tfilename[i],ios::out);
01189 }
01190
01191
01192 mfstream out(filename,ios::out);
01193 out << "blmt " << maxlev;
01194
01195 for (int i=1; i<=maxlev; i++) {
01196 pos[i]=out.tellp();
01197 sprintf(buff," %10d",0);
01198 out << buff;
01199 }
01200 out << "\n";
01201 subdict->save(out);
01202 out.flush();
01203
01204 ngram ng(dict,lmsize());
01205 ngram oldng(dict,lmsize());
01206 ngram locng(dict,lmsize());
01207
01208 ngram sng(subdict,lmsize());
01209
01210 double fstar,lambda,bo,dummy,dummy2,pr,ibow;
01211
01212
01213 table_entry_pos_t num[lmsize()+1];
01214 for (int i=1; i<=lmsize(); i++) num[i]=0;
01215
01216 lmtable* lmt = new lmtable();
01217
01218 lmt->configure(maxlev,isQuant);
01219 lmt->setDict(subdict);
01220 lmt->expand_level(1,dict->size(),filename,mmap);
01221
01222
01223 for (int w=0; w<dict->size(); w++) {
01224 sprintf(tmpfilename,"%s_tmp_%d",filename,w);
01225
01226 if (!w % 10000) cerr << ".";
01227
01228
01229 ngram ung(dict,1);
01230 *ung.wordp(1)=w;
01231 sng.trans(ung);
01232
01233
01234 if (sng.containsWord(subdict->OOV(),1) && !ung.containsWord(dict->OOV(),1)) continue;
01235
01236
01237 pr=mdiadaptlm::prob(ung,1);
01238 pr=(pr?log10(pr):-99);
01239
01240 if (lmsize()>1) {
01241 ung.pushc(0);
01242 mdiadaptlm::bodiscount(ung,2,fstar,lambda,bo);
01243 ung.shift();
01244
01245 assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1 ));
01246
01247 if (backoff){
01248 ibow=log10(lambda) - log10(bo);
01249 }else{
01250 if (lambda<LOWER_SINGLE_PRECISION_OF_1){
01251 ibow = log10(lambda);
01252 }else {
01253 ibow = 0.0;
01254 }
01255 }
01256 }
01257 else {
01258 ibow=0.0;
01259 }
01260
01261 lmt->addwithoffset(ung,(float)pr,(float)ibow);
01262 num[1]++;
01263
01264
01265 if (get(ung,1,1)) {
01266
01267
01268 *ng.wordp(lmsize())=w;
01269
01270
01271 for (int i=1; i<=lmsize(); i++) *oldng.wordp(i)=-1;
01272
01273
01274 for (int i=2; i<=lmsize(); i++)
01275 lmt->expand_level(i,entries(i),tmpfilename,mmap);
01276
01277 scan(ung.link,ung.info,1,ng,INIT,lmsize());
01278 while(scan(ung.link,ung.info,1,ng,CONT,lmsize())) {
01279 sng.trans(ng);
01280 locng=ng;
01281
01282
01283 int f=lmsize()-1;
01284 while (f>1 && (*oldng.wordp(f)==*ng.wordp(f))){ f--; }
01285
01286 for (int l=lmsize()-(f-1); l<=lmsize(); l++){
01287
01288 locng=ng;
01289 if (l<lmsize()) locng.shift(lmsize()-l);
01290
01291 if (sng.containsWord(subdict->OOV(),l)) continue;
01292
01293
01294 if (sng.containsWord(dict->EoS(),l-1)) continue;
01295
01296 pr=mdiadaptlm::prob(locng,l,fstar,dummy,dummy2);
01297
01298
01299
01300 if (!(pr<=1.0 && pr > 1e-10)) {
01301 cerr << ng << " " << pr << "\n";
01302 assert(pr<=1.0);
01303 cerr << "prob modified to 1e-10\n";
01304 pr=1e-10;
01305 }
01306
01307 if (l<lmsize()) {
01308
01309 locng.pushc(0);
01310
01311 mdiadaptlm::bodiscount(locng,l+1,dummy,lambda,bo);
01312
01313 locng.shift();
01314
01315 if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
01316 ibow=log10(lambda) - log10(bo);
01317 if (lmt->addwithoffset(locng,(float)log10(pr),(float)ibow)){
01318 num[l]++;
01319 }else{
01320 continue;
01321 }
01322 }
01323 else{
01324 continue;
01325 }
01326 } else {
01327 if (fstar>=UPPER_SINGLE_PRECISION_OF_0) {
01328 ibow=0.0;
01329 if (lmt->addwithoffset(locng,(float)log10(pr),(float)ibow)){
01330 num[l]++;
01331 }else{
01332 continue;
01333 }
01334 }
01335 else{
01336 continue;
01337 }
01338 }
01339 }
01340 oldng=ng;
01341 }
01342 }
01343 else{
01344
01345 for (int i=2; i<=lmsize(); i++)
01346 lmt->expand_level(i,0,tmpfilename,mmap);
01347 }
01348
01349
01350
01351
01352
01353 for (int i=2; i<=lmsize(); i++){
01354
01355 if (i>2) {
01356 lmt->checkbounds(i-1);
01357 lmt->appendbin_level(i-1, *tout[i-1], mmap);
01358 }
01359
01360
01361 lmt->resize_level(i, tmpfilename, mmap);
01362 }
01363
01364
01365 if (lmsize()>1){
01366 lmt->appendbin_level(maxlev, *tout[maxlev], mmap);
01367 }
01368
01369
01370 for (int i=2; i<=lmsize(); i++) lmt->delete_level(i, tmpfilename, mmap);
01371
01372
01373 for (int i=2; i<=lmsize(); i++) lmt->update_offset(i,num[i]);
01374 }
01375
01376 for (int i=2; i<=lmsize(); i++) tout[i]->close();
01377
01378
01379
01380 lmt->savebin_level(1, filename, mmap);
01381
01382
01383 for (int i=1; i<=lmsize(); i++) {
01384 sprintf(buff," %10d",num[i]);
01385 out.seekp(pos[i]);
01386 out << buff;
01387 }
01388
01389 out.close();
01390
01391
01392
01393 lmt->compact_all_levels(filename);
01394
01395 cerr << "\n";
01396 system("date");
01397
01398 VERBOSE(2,"mdiadaptlm::saveBIN_per_word END\n");
01399 return 1;
01400 };
01401
01403 int mdiadaptlm::saveBIN_per_level(char *filename,int backoff,char* subdictfile,int mmap)
01404 {
01405 VERBOSE(2,"mdiadaptlm::saveBIN_per_level START\n");
01406 system("date");
01407
01408
01409 dictionary* subdict;
01410
01411
01412 double oovprob=0;
01413
01414 if (subdictfile) subdict=new dictionary(subdictfile);
01415 else subdict=dict;
01416
01417 if (mmap) {
01418 VERBOSE(2,"savebin with memory map: " << filename << "\n");
01419 } else {
01420 VERBOSE(2,"savebin: " << filename << "\n");
01421 }
01422
01423 streampos pos[lmsize()+1];
01424 int maxlev=lmsize();
01425 char buff[100];
01426 int isQuant=0;
01427
01428
01429 fstream out(filename,ios::out);
01430 out << "blmt " << maxlev;
01431
01432 for (int i=1; i<=maxlev; i++) {
01433 pos[i]=out.tellp();
01434 sprintf(buff," %10d",0);
01435 out << buff;
01436 }
01437 out << "\n";
01438 lmtable* lmt = new lmtable();
01439
01440 lmt->configure(maxlev,isQuant);
01441
01442 lmt->setDict(subdict);
01443 subdict->save(out);
01444 out.flush();
01445
01446
01447
01448
01449 for (int i=1; i<=lmsize(); i++) {
01450 cerr << "saving level " << i << "...\n";
01451 table_entry_pos_t numberofentries;
01452 if (i==1) {
01453 numberofentries = (table_entry_pos_t) subdict->size();
01454 } else {
01455 numberofentries = (table_entry_pos_t) entries(i);
01456 }
01457 system("date");
01458 lmt->expand_level(i,numberofentries,filename,mmap);
01459
01460 double totp=0;
01461 double fstar,lambda,bo,dummy,dummy2,pr,ibow;
01462
01463 ngram ng(dict,1);
01464 ngram ng2(dict);
01465 ngram sng(subdict,1);
01466
01467 if (i==1) {
01468
01469
01470 for (int w=0; w<dict->size(); w++) {
01471 *ng.wordp(1)=w;
01472
01473 sng.trans(ng);
01474 pr=mdiadaptlm::prob(ng,1);
01475 totp+=pr;
01476
01477 if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)) {
01478 oovprob+=pr;
01479 continue;
01480 }
01481
01482
01483 if (ng.containsWord(dict->OOV(),i)) pr+=oovprob;
01484
01485
01486 pr=(pr?log10(pr):-99);
01487
01488 if (w==dict->oovcode()){
01489
01490 *ng.wordp(1)=lmt->getDict()->oovcode();
01491 ibow=0.0;
01492 }
01493 else {
01494
01495
01496 if (lmsize()>1) {
01497 ngram ng2=ng;
01498 ng2.pushc(0);
01499
01500
01501
01502 mdiadaptlm::bodiscount(ng2,i+1,fstar,lambda,bo);
01503 assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1));
01504
01505 if (backoff){
01506 ibow = log10(lambda) - log10(bo);
01507 }else{
01508 if (lambda<LOWER_SINGLE_PRECISION_OF_1){
01509 ibow = log10(lambda);
01510 }else {
01511 ibow = 0.0;
01512 }
01513 }
01514 }else {
01515 ibow=0.0;
01516 }
01517 }
01518 lmt->add(ng,(float)pr,(float)ibow);
01519 }
01520
01521 }
01522 else {
01523 *ng.wordp(1)=0;
01524 get(ng,1,1);
01525 scan(ng,INIT,i);
01526 while(scan(ng,CONT,i)) {
01527 sng.trans(ng);
01528
01529 if (sng.containsWord(subdict->OOV(),i)) continue;
01530
01531
01532 if (sng.containsWord(dict->EoS(),i-1)) continue;
01533
01534
01535
01536 pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2);
01537
01538 if (!(pr<=1.0 && pr > 1e-10)) {
01539 cerr << ng << " " << pr << "\n";
01540 assert(pr<=1.0);
01541 cerr << "prob modified to 1e-10\n";
01542 pr=1e-10;
01543 }
01544
01545 if (i<lmsize()) {
01546 ng2=ng;
01547 ng2.pushc(0);
01548
01549 mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);
01550
01551 if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
01552 ibow=log10(lambda) - log10(bo);
01553 lmt->add(ng,(float)log10(pr),(float)ibow);
01554 }
01555 } else {
01556 if (fstar >= UPPER_SINGLE_PRECISION_OF_0) {
01557 ibow=0.0;
01558 lmt->add(ng,(float)log10(pr),(float)ibow);
01559 }
01560 }
01561 }
01562 }
01563
01564
01565
01566
01567 if (maxlev>1 && i>1) {
01568 lmt->checkbounds(i-1);
01569 lmt->savebin_level(i-1, filename, mmap);
01570 }
01571
01572
01573 lmt->resize_level(i, filename, mmap);
01574
01575 }
01576
01577 lmt->savebin_level(maxlev, filename, mmap);
01578
01579
01580 for (int i=1; i<=lmsize(); i++) {
01581 sprintf(buff," %10d",lmt->getCurrentSize(i));
01582 out.seekp(pos[i]);
01583 out << buff;
01584 }
01585 out.close();
01586
01587
01588
01589 lmt->compact_all_levels(filename);
01590
01591 VERBOSE(2,"mdiadaptlm::saveBIN_per_level END\n");
01592 return 1;
01593 }
01594
01595
01597 int mdiadaptlm::saveARPA_per_word(char *filename,int backoff,char* subdictfile )
01598 {
01599 VERBOSE(2,"mdiadaptlm::saveARPA_per_word START\n");
01600 system("date");
01601
01602
01603 dictionary* subdict;
01604
01605
01606
01607
01608
01609
01610 if (subdictfile) subdict=new dictionary(subdictfile);
01611 else subdict=dict;
01612
01613
01614 mfstream out(filename,ios::out);
01615
01616
01617 assert(strlen(filename)<1000);
01618 char tfilename[MAX_NGRAM][1000];
01619 mfstream *tout[MAX_NGRAM];
01620
01621 for (int i=1; i<=lmsize(); i++) {
01622 sprintf(tfilename[i],"%s.%d",filename,i);
01623 tout[i]=new mfstream(tfilename[i],ios::out);
01624 *tout[i] << "\n\\" << i << "-grams:\n";
01625 }
01626
01627
01628 ngram ng(dict,lmsize());
01629 ngram oldng(dict,lmsize());
01630 ngram locng(dict,lmsize());
01631
01632 ngram sng(subdict,lmsize());
01633
01634 double fstar,lambda,bo,dummy,dummy2, pr;
01635
01636
01637 table_entry_pos_t num[lmsize()+1];
01638 for (int i=1; i<=lmsize(); i++) num[i]=0;
01639
01640
01641
01642 for (int w=0; w<dict->size(); w++) {
01643
01644 if (!w % 10000) cerr << ".";
01645
01646
01647 ngram ung(dict,1);
01648 *ung.wordp(1)=w;
01649 sng.trans(ung);
01650
01651
01652 if (sng.containsWord(subdict->OOV(),1) && !ung.containsWord(dict->OOV(),1)) continue;
01653
01654 pr=mdiadaptlm::prob(ung,1);
01655 pr=(pr?log10(pr):-99);
01656
01657 if (w==dict->oovcode())
01658 *tout[1] << (float) pr << "\t" << "<unk>";
01659 else
01660 *tout[1] << (float) pr << "\t" << (char *)dict->decode(w);
01661
01662 num[1]++;
01663
01664 if (lmsize()>1) {
01665 ung.pushc(0);
01666 mdiadaptlm::bodiscount(ung,2,fstar,lambda,bo);
01667 ung.shift();
01668
01669 assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1 ));
01670
01671 if (backoff){
01672 *tout[1] << "\t" << (float) (log10(lambda) - log10(bo));
01673 }else{
01674 if (lambda<LOWER_SINGLE_PRECISION_OF_1){
01675 *tout[1] << "\t" << (float) log10(lambda);
01676 }
01677 }
01678 }
01679 *tout[1] << "\n";
01680
01681
01682 if (get(ung,1,1)) {
01683
01684
01685 *ng.wordp(lmsize())=w;
01686
01687
01688 for (int i=1; i<=lmsize(); i++) *oldng.wordp(i)=-1;
01689
01690 scan(ung.link,ung.info,1,ng,INIT,lmsize());
01691 while(scan(ung.link,ung.info,1,ng,CONT,lmsize())) {
01692
01693 sng.trans(ng);
01694 locng=ng;
01695
01696
01697 int f=lmsize()-1;
01698 while (f>1 && (*oldng.wordp(f)==*ng.wordp(f))){ f--; }
01699
01700 for (int l=lmsize(); l>lmsize()-f;l--){
01701
01702 if (l<lmsize()) locng.shift();
01703
01704 if (sng.containsWord(subdict->OOV(),l)) continue;
01705
01706
01707 if (sng.containsWord(dict->EoS(),l-1)) continue;
01708
01709 pr=mdiadaptlm::prob(locng,l,fstar,dummy,dummy2);
01710
01711
01712
01713 if (!(pr<=1.0 && pr > 1e-10)) {
01714 cerr << ng << " " << pr << "\n";
01715 assert(pr<=1.0);
01716 cerr << "prob modified to 1e-10\n";
01717 pr=1e-10;
01718 }
01719
01720 if (l<lmsize()) {
01721
01722 locng.pushc(0);
01723
01724 mdiadaptlm::bodiscount(locng,l+1,dummy,lambda,bo);
01725
01726 locng.shift();
01727
01728 if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
01729 *tout[l] << (float) log10(pr);
01730 *tout[l] << "\t" << (char *)dict->decode(*locng.wordp(l));
01731 for (int j=l-1; j>0; j--)
01732 *tout[l] << " " << (char *)dict->decode(*locng.wordp(j));
01733
01734 if (lambda < LOWER_SINGLE_PRECISION_OF_1)
01735 *tout[l] << "\t" << (float) (log10(lambda) -log10(bo));
01736 *tout[l] << "\n";
01737
01738 num[l]++;
01739 } else continue;
01740 } else {
01741 if (fstar>=UPPER_SINGLE_PRECISION_OF_0 ) {
01742 *tout[l] << (float) log10(pr);
01743 *tout[l] << "\t" << (char *)dict->decode(*locng.wordp(l));
01744 for (int j=l-1; j>0; j--)
01745 *tout[l] << " " << (char *)dict->decode(*locng.wordp(j));
01746 *tout[l] << "\n";
01747 num[l]++;
01748 } else continue;
01749 }
01750
01751 }
01752 oldng=ng;
01753 }
01754 }
01755
01756 }
01757
01758
01759
01760 out << "\n\\data\\" << "\n";
01761 char buff[100];
01762 for (int i=1; i<=lmsize(); i++) {
01763 sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
01764 out << buff;
01765 }
01766 out << "\n";
01767
01768
01769 for (int i=1; i<=lmsize(); i++) {
01770 delete tout[i];
01771 tout[i]=new mfstream(tfilename[i],ios::in);
01772 out << tout[i]->rdbuf();
01773 delete tout[i];
01774 removefile(tfilename[i]);
01775 }
01776
01777 out << "\\end\\" << "\n";
01778
01779 cerr << "\n";
01780 system("date");
01781
01782 VERBOSE(2,"mdiadaptlm::saveARPA_per_word END\n");
01783 return 1;
01784 };
01785
01787 int mdiadaptlm::saveARPA_per_level(char *filename,int backoff,char* subdictfile )
01788 {
01789 VERBOSE(2,"mdiadaptlm::saveARPA_per_level START\n");
01790 system("date");
01791
01792
01793 dictionary* subdict;
01794
01795
01796 double oovprob=0;
01797
01798 if (subdictfile) {
01799 subdict=new dictionary(subdictfile);
01800 } else
01801 subdict=dict;
01802
01803 fstream out(filename,ios::out);
01804
01805
01806 streampos pos[lmsize()+1];
01807 table_entry_pos_t num[lmsize()+1];
01808 char buff[100];
01809
01810
01811 out << "\n\\data\\" << "\n";
01812
01813 for (int i=1; i<=lmsize(); i++) {
01814 num[i]=0;
01815 pos[i]=out.tellp();
01816 sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
01817 out << buff;
01818 }
01819
01820 out << "\n";
01821
01822
01823
01824 for (int i=1; i<=lmsize(); i++) {
01825 cerr << "saving level " << i << "...\n";
01826
01827
01828 out << "\n\\" << i << "-grams:\n";
01829
01830 double totp=0;
01831 double fstar,lambda,bo,dummy,dummy2,pr;
01832
01833
01834 ngram ng(dict,1);
01835 ngram ng2(dict);
01836 ngram sng(subdict,1);
01837
01838 if (i==1) {
01839
01840
01841
01842 for (int w=0; w<dict->size(); w++) {
01843 *ng.wordp(1)=w;
01844
01845 sng.trans(ng);
01846 pr=mdiadaptlm::prob(ng,1);
01847 totp+=pr;
01848
01849 if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)) {
01850 oovprob+=pr;
01851 continue;
01852 }
01853
01854
01855 if (ng.containsWord(dict->OOV(),i)) pr+=oovprob;
01856
01857
01858 out << (float) (pr?log10(pr):-99);
01859
01860 num[i]++;
01861
01862 if (w==dict->oovcode())
01863 out << "\t" << "<unk>\n";
01864 else {
01865 out << "\t" << (char *)dict->decode(w);
01866
01867 if (lmsize()>1) {
01868 ngram ng2=ng;
01869 ng2.pushc(0);
01870
01871 mdiadaptlm::bodiscount(ng2,i+1,fstar,lambda,bo);
01872
01873 assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1 ));
01874
01875 if (backoff){
01876 out << "\t" << (float) (log10(lambda) - log10(bo));
01877 }else{
01878 if (lambda<LOWER_SINGLE_PRECISION_OF_1){
01879 out << "\t" << (float) log10(lambda);
01880 }
01881 }
01882 }
01883 out << "\n";
01884 }
01885 }
01886
01887 }
01888 else {
01889 *ng.wordp(1)=0;
01890 get(ng,1,1);
01891 scan(ng,INIT,i);
01892 while(scan(ng,CONT,i)) {
01893
01894 sng.trans(ng);
01895 if (sng.containsWord(subdict->OOV(),i)) continue;
01896
01897
01898 if (sng.containsWord(dict->EoS(),i-1)) continue;
01899
01900 pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2);
01901
01902
01903
01904 if (!(pr<=1.0 && pr > 1e-10)) {
01905 cerr << ng << " " << pr << "\n";
01906 assert(pr<=1.0);
01907 cerr << "prob modified to 1e-10\n";
01908 pr=1e-10;
01909 }
01910
01911 if (i<lmsize()) {
01912 ng2=ng;
01913 ng2.pushc(0);
01914
01915 mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);
01916
01917 if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
01918 out << (float) log10(pr);
01919 out << "\t" << (char *)dict->decode(*ng.wordp(i));
01920 for (int j=i-1; j>0; j--)
01921 out << " " << (char *)dict->decode(*ng.wordp(j));
01922 if (backoff){
01923 out << "\t" << (float) (log10(lambda) - log10(bo));
01924 }else{
01925 if (lambda<LOWER_SINGLE_PRECISION_OF_1){
01926 out << "\t" << (float) log10(lambda);
01927 }
01928 }
01929 out << "\n";
01930 num[i]++;
01931 }
01932 } else {
01933 if (fstar>=UPPER_SINGLE_PRECISION_OF_0) {
01934 out << (float) log10(pr);
01935 out << "\t" << (char *)dict->decode(*ng.wordp(i));
01936 for (int j=i-1; j>0; j--)
01937 out << " " << (char *)dict->decode(*ng.wordp(j));
01938 out << "\n";
01939
01940 num[i]++;
01941 }
01942 }
01943 }
01944 }
01945
01946 cerr << i << "grams tot:" << num[i] << "\n";
01947 }
01948
01949 streampos last=out.tellp();
01950
01951
01952 for (int i=1; i<=lmsize(); i++) {
01953 sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
01954 out.seekp(pos[i]);
01955 out << buff;
01956 }
01957
01958 out.seekp(last);
01959 out << "\\end\\" << "\n";
01960 system("date");
01961
01962 VERBOSE(2,"mdiadaptlm::saveARPA_per_level END\n");
01963 return 1;
01964 };
01965
01966
01967
01968
01969
01970
01971
01972
01973
01974
01975
01976
01977
01978
01979
01980
01981
01982
01983
01984
01985
01986
01987
01988
01989
01990
01991