00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
#include <cmath>
#include <string>
#include <vector>
#include <assert.h>
#include "mfstream.h"
#include "mempool.h"
#include "htable.h"
#include "dictionary.h"
#include "n_gram.h"
#include "mempool.h"
#include "ngramtable.h"
#include "normcache.h"
#include "interplm.h"
#include "mdiadapt.h"
#include "shiftlm.h"
00035
00036 using namespace std;
00037
00038
00039
00040
00041
00042 mdiadaptlm::mdiadaptlm(char* ngtfile,int depth,TABLETYPE tbtype):
00043 interplm(ngtfile,depth,tbtype){
00044 adaptlev=0;
00045 forelm=NULL;
00046 cache=NULL;
00047 };
00048
00049
00050 int mdiadaptlm::scalefact(char *ngtfile){
00051 if (forelm!=NULL) delete forelm;
00052 if (cache!=NULL) delete cache;
00053 cache=new normcache(dict);
00054
00055 forelm=new shiftbeta(ngtfile,1);
00056 forelm->train();
00057
00058
00059 ngram fng(forelm->dict,1);
00060 ngram ng(dict,1);
00061 int* w=fng.wordp(1);
00062
00063 oovscaling=1.0;
00064 for ((*w)=0;(*w)<forelm->dict->size();(*w)++)
00065 if ((*w) != forelm->dict->oovcode()){
00066 ng.trans(fng);
00067 if (*ng.wordp(1)==dict->oovcode())
00068 {
00069 cerr << "fg lm contains new words: use -ao=yes option\n";
00070 exit(1);
00071 }
00072
00073 oovscaling-=backunig(ng);
00074 }
00075 *w=forelm->dict->oovcode();
00076 oovscaling=foreunig(fng)/oovscaling;
00077
00078 return 1;
00079 };
00080
00081
00082 double mdiadaptlm::scalefact(ngram ng){
00083 ngram fng(forelm->dict,1);
00084 fng.trans(ng);
00085 if (*fng.wordp(1)==forelm->dict->oovcode())
00086 return pow(oovscaling,gis_step);
00087 else{
00088 double prback=backunig(ng);
00089 double prfore=foreunig(ng);
00090 return pow(prfore/prback,gis_step);
00091 }
00092 }
00093
00094
00095 double mdiadaptlm::foreunig(ngram ng){
00096
00097 double fstar,lambda;
00098
00099 forelm->discount(ng,1,fstar,lambda);
00100
00101 return fstar;
00102 }
00103
00104 double mdiadaptlm::backunig(ngram ng){
00105
00106 double fstar,lambda;
00107
00108 discount(ng,1,fstar,lambda,0);
00109
00110 return fstar;
00111 };
00112
00113
00114
00115 int mdiadaptlm::adapt(char* ngtfile,int alev,double step){
00116
00117 if (alev > lmsize() || alev<=0){
00118 cerr << "setting adaptation level to " << lmsize() << "\n";
00119 alev=lmsize();
00120 }
00121 adaptlev=alev;
00122
00123
00124 cerr << "adapt ....";
00125 gis_step=step;
00126
00127 if (ngtfile==NULL){
00128 cerr << "adaptation file is missing\n";
00129 exit(1);
00130 }
00131
00132
00133
00134 scalefact(ngtfile);
00135
00136
00137 ngram ng(dict,2);
00138 int* w=ng.wordp(1);
00139
00140 cerr << "precomputing 1-gram normalization ...\n";
00141 zeta0=0;
00142 for ((*w)=0;(*w)<dict->size();(*w)++)
00143 zeta0+=scalefact(ng) * backunig(ng);
00144
00145 if (alev==1) return 1 ;
00146
00147 cerr << "precomputing 2-gram normalization:\n";
00148
00149
00150 w=ng.wordp(2);
00151 *ng.wordp(1)=0;
00152
00153 for ((*w)=0;(*w)<dict->size();(*w)++){
00154 zeta(ng,2);
00155 if ((*w % 1000)==0) cerr << ".";
00156 }
00157
00158 cerr << "done\n";
00159
00160 return 1;
00161 };
00162
00163
00164 double mdiadaptlm::zeta(ngram ng,int size){
00165
00166 assert(size>=1);
00167
00168 double z;
00169
00170 ng.size=size;
00171
00172 if (size==1) return zeta0;
00173 else{
00174
00175
00176 if (cache->get(ng,size,z)) return z;
00177
00178 double fstar,lambda;
00179 ngram histo=ng;
00180 int succ=0;
00181
00182 discount(ng,size,fstar,lambda,(int)0);
00183
00184 if ((lambda<1) && get(histo,size,size-1)){;
00185
00186
00187 succ=0;
00188
00189 succscan(histo,ng,INIT,size);
00190 while(succscan(histo,ng,CONT,size)){
00191
00192 discount(ng,size,fstar,lambda,0);
00193 if (fstar>0){
00194 z+=(scalefact(ng) * fstar);
00195 succ++;
00196
00197 }
00198 }
00199 }
00200
00201 z+=lambda*zeta(ng,size-1);
00202
00203 if ((size==2) || (succ>1))
00204 cache->put(ng,size,z);
00205
00206 return z;
00207 }
00208
00209 }
00210
00211
00212 int mdiadaptlm::discount(ngram ng_,int size,double& fstar,double& lambda,int cv){
00213
00214 ngram ng(dict);ng.trans(ng_);
00215
00216 discount(ng,size,fstar,lambda,0);
00217
00218 if ((size>0) && (size<=adaptlev) && (lambda<1)){
00219
00220 if (size>1){
00221
00222
00223 double z=zeta(ng,size);
00224 double z1=zeta(ng,size-1);
00225
00226 lambda=lambda * z1/z;
00227 fstar=scalefact(ng) * fstar/z;
00228 }
00229 else if (size==1){
00230 double alpha=scalefact(ng);
00231
00232 fstar=alpha*fstar/zeta0;
00233 lambda=lambda*alpha/zeta0;
00234 }
00235 else{
00236
00237 }
00238 }
00239
00240 return 1;
00241 }
00242
00243
00244 int mdiadaptlm::compute_backoff(){
00245
00246 double fstar,lambda;
00247
00248 cerr << "compute backoff probabilities ...";
00249
00250 this->backoff=1;
00251
00252 for (int size=1;size<lmsize();size++){
00253
00254 ngram hg(dict,size);
00255
00256
00257
00258 scan(hg,INIT,size);
00259
00260 while(scan(hg,CONT,size)){
00261
00262
00263
00264 ngram ng=hg;ng.pushc(0);
00265
00266 double pr=1.0;
00267
00268 succscan(hg,ng,INIT,size+1);
00269
00270 while(succscan(hg,ng,CONT,size+1)){
00271
00272 mdiadaptlm::discount(ng,size+1,fstar,lambda);
00273
00274 if (fstar>0)
00275
00276 pr-=mdiadaptlm::prob(ng,size);
00277
00278 }
00279
00280 assert(pr>0 && pr<=1);
00281
00282 boff(hg.link,pr);
00283
00284 }
00285
00286 }
00287
00288
00289
00290 cerr << "done\n";
00291
00292 return 1;
00293 }
00294
00295
00296
00297 double mdiadaptlm::prob2(ngram ng,int size,double& fstar){
00298
00299 double lambda;
00300
00301 mdiadaptlm::discount(ng,size,fstar,lambda);
00302
00303 if (size>1)
00304 return fstar + lambda * prob(ng,size-1);
00305 else
00306 return fstar;
00307 }
00308
00309
00310
00311 double mdiadaptlm::prob(ngram ng,int size){
00312 double fstar,lambda,bo;
00313 return prob(ng,size,fstar,lambda,bo);
00314 }
00315
00316 double mdiadaptlm::prob(ngram ng,int size,double& fstar,double& lambda, double& bo){
00317 mdiadaptlm::bodiscount(ng,size,fstar,lambda,bo);
00318
00319 if (fstar >1.0000001 || lambda >1.0000001){
00320 cerr << "wrong probability: " << ng
00321 << " , size " << size
00322 << " , fstar " << fstar
00323 << " , lambda " << lambda << "\n";
00324 exit(1);
00325 }
00326
00327 if (backoff){
00328
00329 if (size>1){
00330 if (fstar>0) return fstar;
00331 else{
00332 if (lambda<1)
00333 return lambda/bo * prob(ng,size-1);
00334 else{
00335 assert(lambda < 1.00000001);
00336 return prob(ng,size-1);
00337 }
00338 }
00339 }
00340 else
00341 return fstar;
00342 }
00343
00344 else{
00345
00346 if (size>1)
00347 return fstar + lambda * prob(ng,size-1);
00348 else
00349 return fstar;
00350 }
00351 }
00352
00353
00354
00355
00356 int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo){
00357 ngram ng(dict);ng.trans(ng_);
00358
00359 mdiadaptlm::discount(ng,size,fstar,lambda);
00360
00361 bo=1.0;
00362
00363 if (backoff){
00364
00365 if (size>1 && lambda<1){
00366
00367 ngram hg=ng;
00368
00369 assert(get(hg,size,size-1));
00370
00371 bo=boff(hg.link);
00372
00373 }
00374 }
00375
00376 return 1;
00377 }
00378
00379
00380 double mdiadaptlm::txclprob(ngram ng,int size){
00381
00382 double fstar,lambda;
00383
00384 if (size>1){
00385 mdiadaptlm::discount(ng,size,fstar,lambda);
00386 return fstar + lambda * txclprob(ng,size-1);
00387 }
00388 else{
00389 double freq=1;
00390 if ((*ng.wordp(1)!=dict->oovcode()) && get(ng,1,1))
00391 freq+=ng.freq;
00392
00393 double N=totfreq()+dict->dub()-dict->size();
00394 return freq/N;
00395 }
00396 }
00397
00398
00399 int mdiadaptlm::netsize(){
00400 double fstar,lambda;
00401 int size,totsize;
00402 ngram ng(dict);
00403
00404 cerr << "Computing LM size:\n";
00405
00406 totsize=dict->size() * 2;
00407
00408 cout << "1-gram " << totsize << "\n";
00409
00410 for (int i=2;i<=maxlevel();i++){
00411
00412 size=0;
00413
00414 scan(ng,INIT,i);
00415
00416 while (scan(ng,CONT,i)){
00417
00418 mdiadaptlm::discount(ng,i,fstar,lambda);
00419
00420 if (fstar>0) size++;
00421
00422 }
00423
00424 size+=size * (i<maxlevel());
00425
00426 totsize+=size;
00427
00428 cout << i << "-gram " << totsize << "\n";
00429
00430 }
00431
00432 return totsize;
00433 }
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
// Reverses, in place, the byte order of each of `n` consecutive items of
// `sz` bytes starting at `p`. Does nothing when n < 1 or sz < 2.
// Always returns 0.
int swapbytes(char *p, int sz, int n)
{
  if (n >= 1 && sz >= 2) {
    char* item = p;
    for (int k = 0; k < n; ++k, item += sz) {
      char* lo = item;
      char* hi = item + sz - 1;
      while (lo < hi) {
        const char tmp = *lo;
        *lo = *hi;
        *hi = tmp;
        ++lo;
        --hi;
      }
    }
  }
  return 0;
}
00490
00491 void fwritex(char *p,int sz,int n,FILE* f){
00492
00493 if(*(short *)"AB"==0x4241){
00494 swapbytes((char*)p, sz,n);
00495 }
00496
00497 fwrite((char *)p,sz,n,f);
00498
00499 if(*(short *)"AB"==0x4241) swapbytes((char*)p, sz,n);
00500
00501 }
00502
00503 void ifwrite(long loc,void *ptr,int size,int n,FILE* f)
00504 {
00505 fflush(f);
00506
00507 long pos=ftell(f);
00508
00509 fseek(f,loc,SEEK_SET);
00510
00511 fwritex((char *)ptr,size,1,f);
00512
00513 fseek(f,pos,SEEK_SET);
00514
00515 fflush(f);
00516 }
00517
00518 void writeNull(unsigned short nullCode,float nullProb,FILE* f){
00519 fwritex((char *)&nullCode,sizeof(short),1,f);
00520 fwritex((char *)&nullProb,sizeof(float),1,f);
00521 }
00522
00523
00524 int mdiadaptlm::saveASR(char *filename,int backoff,char* subdictfile){
00525 int totbg,tottr;
00526
00527 dictionary* subdict;
00528
00529 if (subdictfile)
00530 subdict=new dictionary(subdictfile);
00531 else
00532 subdict=dict;
00533
00534 typedef unsigned short code;
00535
00536 system("date");
00537
00538 if (lmsize()>3 || lmsize()<1)
00539 {
00540 cerr << "wrong lmsize\n";
00541 exit(1);
00542 }
00543
00544 if (dict->size()>=0xffff && subdict->size()>=0xffff)
00545 {
00546 cerr << "save bin requires unsigned short codes\n";
00547 exit(1);
00548 }
00549
00550 FILE* f=fopen(filename,"w");
00551
00552 double fstar,lambda,boff;
00553 float pr;
00554 long succ1pos,succ2pos;
00555 code succ1,succ2,w,h1,h2;
00556 code stop=0xffff;
00557
00558
00559
00560
00561 code oovcode=subdict->oovcode();
00562
00563
00564 code subdictsz=subdict->size()+1;
00565
00566 fwritex((char *)&subdictsz,sizeof(code),1,f);
00567
00568 subdictsz--;
00569 for (w=0;w<subdictsz;w++)
00570 fprintf(f,"%s\n",(char *)subdict->decode(w));
00571
00572 fprintf(f,"____\n");
00573
00574
00575
00576
00577 h1=subdictsz;
00578 fwritex((char *)&h1,sizeof(code),1,f);
00579
00580 succ1=0;succ1pos=ftell(f);
00581 fwritex((char *)&succ1,sizeof(code),1,f);
00582
00583 ngram ng(dict);
00584 ngram sng(subdict);
00585
00586 ng.size=sng.size=1;
00587
00588 scan(ng,INIT,1);
00589 while(scan(ng,CONT,1))
00590 {
00591 sng.trans(ng);
00592 if (sng.containsWord(subdict->OOV(),1))
00593 continue;
00594
00595 pr=(float)mdiadaptlm::prob(ng,1);
00596 if (pr>1e-50){
00597 succ1++;
00598 w=*sng.wordp(1);
00599 fwritex((char *)&w,sizeof(code),1,f);
00600 fwritex((char *)&pr,sizeof(float),1,f);
00601 }
00602 else{
00603 cerr << "small prob word " << ng << "\n";
00604 }
00605 }
00606
00607
00608 ifwrite(succ1pos,&succ1,sizeof(code),1,f);
00609
00610 cerr << "finito unigrammi " << succ1 << "\n";
00611 fflush(f);
00612
00613 if (lmsize()==1){
00614 fclose(f);
00615 return 1;
00616 }
00617
00618
00619
00620
00621 succ1=0; h1=subdictsz; totbg=subdictsz;
00622
00623 ngram hg1(dict,1);
00624
00625 ng.size=sng.size=2;
00626
00627 scan(hg1,INIT,1);
00628 while(scan(hg1,CONT,1)){
00629
00630 if (hg1.containsWord(dict->OOV(),1)) continue;
00631
00632 assert((*hg1.wordp(1))<dict->size());
00633
00634 *ng.wordp(2)=*hg1.wordp(1);
00635 *ng.wordp(1)=0;
00636
00637 sng.trans(ng);
00638 if (sng.containsWord(dict->OOV(),1)) continue;
00639
00640 mdiadaptlm::bodiscount(ng,2,fstar,lambda,boff);
00641
00642 if (lambda < 1.0){
00643
00644 h1=*sng.wordp(2);
00645
00646 fwritex((char *)&h1,sizeof(code),1,f);
00647
00648 succ1=0;succ1pos=ftell(f);
00649 fwritex((char *)&succ1,sizeof(code),1,f);
00650
00651 ngram shg=hg1;
00652 get(shg,1,1);
00653
00654 succscan(shg,ng,INIT,2);
00655 while(succscan(shg,ng,CONT,2)){
00656
00657 if (*ng.wordp(1)==oovcode) continue;
00658
00659 sng.trans(ng);
00660 if (sng.containsWord(dict->OOV(),2)) continue;
00661
00662 mdiadaptlm::discount(ng,2,fstar,lambda);
00663
00664 if (fstar>1e-50){
00665 w=*sng.wordp(1);
00666 fwritex((char *)&w,sizeof(code),1,f);
00667 pr=(float)mdiadaptlm::prob(ng,2);
00668
00669
00670 fwritex((char *)&pr,sizeof(float),1,f);
00671 succ1++;
00672 }
00673 }
00674
00675 if (succ1){
00676 lambda/=boff;
00677 writeNull(subdictsz,(float)lambda,f);
00678 succ1++;
00679 totbg+=succ1;
00680 ifwrite(succ1pos,&succ1,sizeof(code),1,f);
00681 }
00682 else{
00683
00684 fseek(f,succ1pos-(streampos)sizeof(code),SEEK_SET);
00685 }
00686 }
00687 }
00688
00689 fwritex((char *)&stop,sizeof(code),1,f);
00690
00691 cerr << " finito bigrammi! " << subdictsz << "\n";
00692 fflush(f);
00693
00694 system("date");
00695
00696 if (lmsize()<3){
00697 fclose(f);
00698 return 1;
00699 }
00700
00701
00702
00703 h1=subdictsz; h2=subdictsz; tottr=0;
00704 succ1=0; succ2=0;
00705
00706 ngram hg2(dict,2);
00707
00708 ng.size=sng.size=3;
00709
00710 scan(hg1,INIT,1);
00711 while(scan(hg1,CONT,1)){
00712
00713 if ((*hg1.wordp(1)==oovcode)) continue;
00714
00715 *ng.wordp(3)=*hg1.wordp(1);
00716
00717 sng.trans(ng);
00718 if (sng.containsWord(dict->OOV(),1)) continue;
00719
00720 assert((*sng.wordp(3))<subdictsz);
00721
00722 h1=*sng.wordp(3);
00723 fwritex((char *)&h1,sizeof(code),1,f);
00724
00725 succ1=0;succ1pos=ftell(f);
00726 fwritex((char *)&succ1,sizeof(code),1,f);
00727
00728 ngram shg1=ng; get(shg1,3,1);
00729
00730 succscan(shg1,hg2,INIT,2);
00731 while(succscan(shg1,hg2,CONT,2)){
00732
00733 if (*hg2.wordp(1)==oovcode) continue;
00734
00735 *ng.wordp(2)=*hg2.wordp(1);
00736 *ng.wordp(1)=0;
00737
00738 sng.trans(ng);
00739 if (sng.containsWord(dict->OOV(),2)) continue;
00740
00741 mdiadaptlm::bodiscount(ng,3,fstar,lambda,boff);
00742
00743 if (lambda < 1.0){
00744
00745 h2=*sng.wordp(2);
00746 fwritex((char *)&h2,sizeof(code),1,f);
00747
00748 succ2=0;succ2pos=ftell(f);
00749 fwritex((char *)&succ2,sizeof(code),1,f);
00750
00751 ngram shg2=ng; get(shg2,3,2);
00752
00753 succscan(shg2,ng,INIT,3);
00754 while(succscan(shg2,ng,CONT,3)){
00755
00756 if (*ng.wordp(1)==oovcode) continue;
00757
00758 sng.trans(ng);
00759 if (sng.containsWord(dict->OOV(),3)) continue;
00760
00761 mdiadaptlm::discount(ng,3,fstar,lambda);
00762
00763
00764 if (fstar>1e-50){
00765
00766 w=*sng.wordp(1);
00767 fwritex((char *)&w,sizeof(code),1,f);
00768
00769 pr=(float)mdiadaptlm::prob(ng,3);
00770
00771
00772 fwritex((char *)&pr,sizeof(float),1,f);
00773 succ2++;
00774 }
00775 }
00776
00777 if (succ2){
00778 lambda/=boff;
00779 writeNull(subdictsz,(float)lambda,f);
00780 succ2++;
00781 tottr+=succ2;
00782 ifwrite(succ2pos,&succ2,sizeof(code),1,f);
00783 succ1++;
00784 }
00785 else{
00786
00787 fseek(f,succ2pos-(long)sizeof(code),SEEK_SET);
00788 }
00789 }
00790 }
00791
00792 if (succ1)
00793 ifwrite(succ1pos,&succ1,sizeof(code),1,f);
00794 else
00795 fseek(f,succ1pos-(long)sizeof(code),SEEK_SET);
00796 }
00797
00798 fwritex((char *)&stop,sizeof(code),1,f);
00799
00800 fclose(f);
00801
00802 cerr << "Tot bg: " << totbg << " tg: " << tottr<< "\n";
00803
00804 system("date");
00805
00806 return 1;
00807 };
00808
00809
00811
00812 int mdiadaptlm::saveMT(char *filename,int backoff,
00813 char* subdictfile,int resolution,double decay){
00814
00815 double logalpha=log(decay);
00816 dictionary* subdict;
00817
00818 if (subdictfile)
00819 subdict=new dictionary(subdictfile);
00820 else
00821 subdict=dict;
00822
00823 ngram ng(dict,lmsize());
00824 ngram sng(subdict,lmsize());
00825
00826 cerr << "Adding unigram of OOV word if missing\n";
00827
00828 for (int i=1;i<=maxlevel();i++)
00829 *ng.wordp(i)=dict->oovcode();
00830
00831 if (!get(ng,maxlevel(),1)){
00832 cerr << "oov is missing in the ngram-table\n";
00833
00834 ng.freq=dict->freq(dict->oovcode());
00835 cerr << "adding oov unigram " << ng << "\n";
00836 put(ng);
00837 }
00838
00839 cerr << "Eventually adding OOV symbol to subdictionary\n";
00840 subdict->encode(OOV_);
00841
00842 system("date");
00843
00844 mfstream out(filename,ios::out);
00845
00846
00847
00848 subdict->incflag(1);
00849 int bo_code=subdict->encode(BACKOFF_);
00850 int du_code=subdict->encode(DUMMY_);
00851 subdict->incflag(0);
00852
00853 out << "nGrAm " << lmsize() << " " << 0
00854 << " " << "LM_ "
00855 << resolution << " "
00856 << decay << "\n";
00857
00858 subdict->save(out);
00859
00860
00861
00862 cerr << "write unigram of oov probability\n";
00863 ng.size=1; *ng.wordp(1)=dict->oovcode();
00864 double pr=(float)mdiadaptlm::prob(ng,1);
00865 sng.trans(ng);sng.size=lmsize();
00866 for (int s=2;s<=lmsize();s++) *sng.wordp(s)=du_code;
00867 sng.freq=(int)ceil(pr * (double)10000000)-1;
00868 out << sng << "\n";
00869
00870 for (int i=1;i<=lmsize();i++){
00871 cerr << "LEVEL " << i << "\n";
00872
00873 double fstar,lambda,bo,dummy;
00874
00875 scan(ng,INIT,i);
00876 while(scan(ng,CONT,i)){
00877
00878 sng.trans(ng);
00879
00880 sng.size=lmsize();
00881 for (int s=i+1;s<=lmsize();s++)
00882 *sng.wordp(s)=du_code;
00883
00884 if (i>=1 && sng.containsWord(subdict->OOV(),sng.size)){
00885 cerr << "skipping : " << sng << "\n";
00886 continue;
00887 }
00888
00889
00890
00891
00892
00893 mdiadaptlm::discount(ng,i,fstar,dummy);
00894
00895
00896
00897
00898
00899
00900 if (fstar>0){
00901
00902 double pr=(float)mdiadaptlm::prob(ng,i);
00903
00904 if (i>1 && resolution<10000000){
00905 sng.freq=resolution-(int)(log(pr)/logalpha)-1;
00906 sng.freq=(sng.freq>=0?sng.freq:0);
00907 }
00908 else
00909 sng.freq=(int)ceil(pr * (double)10000000)-1;
00910
00911 out << sng << "\n";
00912
00913 }
00914
00915 if (i<lmsize()){
00916
00917 ngram ng2=ng; ng2.pushc(0);
00918 mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);
00919 assert(!backoff || (lambda ==1 || bo<1 ));
00920
00921 sng.pushc(bo_code);
00922 sng.size=lmsize();
00923
00924 if (lambda<1){
00925 if (resolution<10000000){
00926 sng.freq=resolution-(int)(log(lambda/bo)/logalpha)-1;
00927 sng.freq=(sng.freq>=0?sng.freq:0);
00928 }
00929 else
00930 sng.freq=(int)ceil(lambda/bo * (double)10000000)-1;
00931
00932 out << sng << "\n";
00933 }
00934 }
00935 }
00936 cerr << "LEVEL " << i << "DONE \n";
00937 }
00938 return 1;
00939 };
00940
00941
00942
00944
00945 int mdiadaptlm::saveARPA(char *filename,int backoff,char* subdictfile )
00946 {
00947
00948 system("date");
00949
00950
00951 dictionary* subdict; double oovprob=0;
00952 if (subdictfile){
00953 subdict=new dictionary(subdictfile);
00954
00955 assert(dict->oovcode()==(dict->size()-1));
00956 }
00957 else
00958 subdict=dict;
00959
00960 fstream out(filename,ios::out);
00961
00962
00963 streampos pos[lmsize()+1];
00964 int num[lmsize()+1];
00965 char buff[100];
00966
00967
00968 out << "\n\\data\\" << "\n";
00969
00970 for (int i=1;i<=lmsize();i++){
00971 num[i]=0;
00972 pos[i]=out.tellp();
00973 sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
00974 out << buff;
00975 }
00976
00977 out << "\n";
00978
00979
00980
00981 for (int i=1;i<=lmsize();i++){
00982
00983 out << "\n\\" << i << "-grams:\n";
00984
00985 double totp=0;
00986 double fstar,lambda,bo,dummy,dummy2,pr;
00987
00988
00989 ngram ng(dict,1);
00990 ngram ng2(dict);
00991 ngram sng(subdict,1);
00992
00993 if (i==1){
00994
00995
00996
00997 for (int w=0;w<dict->size();w++)
00998 {
00999 *ng.wordp(1)=w;
01000
01001 sng.trans(ng);
01002 pr=mdiadaptlm::prob(ng,1);
01003 totp+=pr;
01004
01005 if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)){
01006 oovprob+=pr;
01007 continue;
01008 }
01009
01010
01011 if (ng.containsWord(dict->OOV(),i)) pr+=oovprob;
01012
01013
01014
01015 out << (float) (pr?log10(pr):-99);
01016
01017 num[i]++;
01018
01019 if (w==dict->oovcode())
01020 out << "\t" << "<unk>\n";
01021 else{
01022 out << "\t" << (char *)dict->decode(w);
01023
01024 if (lmsize()>1){
01025 ngram ng2=ng; ng2.pushc(0);
01026
01027
01028
01029 mdiadaptlm::bodiscount(ng2,i+1,fstar,lambda,bo);
01030 assert(!backoff || ((lambda<1.00000001 && lambda>0.99999999) || bo< 1.0000001 ));
01031
01032 if (lmsize()> 1 && lambda < 1.0000001)
01033
01034 out << "\t" << (float) (log10(lambda) - log10(bo));
01035 }
01036 out << "\n";
01037 }
01038 }
01039
01040 }
01041 else{
01042 scan(ng,INIT,i);
01043 while(scan(ng,CONT,i)){
01044
01045 sng.trans(ng);
01046 if (sng.containsWord(subdict->OOV(),i)) continue;
01047
01048
01049 if (sng.containsWord(dict->EoS(),i-1)) continue;
01050
01051
01052
01053 pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2);
01054
01055
01056
01057
01058
01059
01060
01061
01062 if (!(pr<=1.0 && pr > 1e-10)){
01063 cerr << ng << " " << pr << "\n";
01064 assert(pr<=1.0);
01065 cerr << "prob modified to 1e-10\n";
01066 pr=1e-10;
01067 };
01068
01069 if (i<lmsize()){
01070 ng2=ng; ng2.pushc(0);
01071
01072 mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);
01073
01074 if (fstar>=0.0000000001 || lambda <= 0.9999999999){
01075
01076 out << (float) log10(pr);
01077 out << "\t" << (char *)dict->decode(*ng.wordp(i));
01078 for (int j=i-1;j>0;j--)
01079 out << " " << (char *)dict->decode(*ng.wordp(j));
01080
01081 if (lambda < 0.9999999999) out << "\t" << (float) (log10(lambda) - log10(bo));
01082 out << "\n";
01083 num[i]++;
01084 }
01085 }
01086 else
01087 if (fstar >0.0000000001){
01088
01089 out << (float) log10(pr);
01090 out << "\t" << (char *)dict->decode(*ng.wordp(i));
01091 for (int j=i-1;j>0;j--)
01092 out << " " << (char *)dict->decode(*ng.wordp(j));
01093 out << "\n";
01094
01095 num[i]++;
01096 }
01097 }
01098 }
01099
01100 cerr << i << "grams tot:" << num[i] << "\n";
01101 }
01102
01103 streampos last=out.tellp();
01104
01105
01106 for (int i=1;i<=lmsize();i++){
01107 sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
01108 out.seekp(pos[i]);
01109 out << buff;
01110 }
01111
01112 out.seekp(last);
01113 out << "\\end\\" << "\n";
01114 system("date");
01115
01116 return 1;
01117 };
01118
01119
01120
01121
01122
01123
01124
01125
01126
01127
01128
01129
01130
01131
01132
01133
01134
01135
01136
01137
01138
01139
01140
01141
01142
01143
01144
01145