00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef MF_NGRAMTABLE_H
00024 #define MF_NGRAMTABLE_H
00025
00026
00027 #ifndef BACKOFF_
00028 #define BACKOFF_ "_backoff_"
00029 #endif
00030
00031
00032 #ifndef DUMMY_
00033 #define DUMMY_ "_dummy_"
00034 #endif
00035
00036
00037
00038 #ifdef MYCODESIZE
00039 #define DEFCODESIZE MYCODESIZE
00040 #else
00041 #define DEFCODESIZE (int)2
00042 #endif
00043
00044 #define SHORTSIZE (int)2
00045 #define PTRSIZE (int)sizeof(char *)
00046 #define INTSIZE (int)4
00047 #define CHARSIZE (int)1
00048
00049
00050
00051 #define FREQ1 (unsigned char) 1
00052 #define FREQ2 (unsigned char) 2
00053 #define FREQ4 (unsigned char) 4
00054 #define INODE (unsigned char) 8
00055 #define LNODE (unsigned char) 16
00056 #define SNODE (unsigned char) 32
00057 #define FREQ6 (unsigned char) 64
00058 #define FREQ3 (unsigned char) 128
00059
00060 typedef char* node;
00061 typedef char* table;
00062
00063 typedef unsigned char NODETYPE;
00064
00065
00066 typedef enum {FIND,
00067 ENTER,
00068 DELETE,
00069 INIT,
00070 CONT
00071 } ACTION;
00072
00073
00074 typedef enum {COUNT,
00075 LEAFPROB,
00076 FLEAFPROB,
00077 LEAFPROB2,
00078 LEAFPROB3,
00079 LEAFPROB4,
00080 LEAFCODE,
00081 SIMPLE_I,
00082 SIMPLE_B,
00083 SHIFTBETA_I,
00084 SHIFTBETA_B,
00085 MSHIFTBETA_I,
00086 MSHIFTBETA_B,
00087 FULL,
00088
00089 } TABLETYPE;
00090
00091
00092
00093 class tabletype{
00094
00095 TABLETYPE ttype;
00096
00097 public:
00098
00099 int CODESIZE;
00100 long long code_range[7];
00101
00102
00103 int WORD_OFFS;
00104 int MSUCC_OFFS;
00105 int MTAB_OFFS;
00106 int FLAGS_OFFS;
00107 int SUCC1_OFFS;
00108 int SUCC2_OFFS;
00109 int BOFF_OFFS;
00110 int I_FREQ_OFFS;
00111 int I_FREQ_NUM;
00112 int L_FREQ_NUM;
00113 int L_FREQ_SIZE;
00114
00115
00116 int L_FREQ_OFFS;
00117
00118 TABLETYPE tbtype(){return ttype;}
00119
00120 tabletype(TABLETYPE tt,int codesize=DEFCODESIZE){
00121
00122 if (codesize<=4 && codesize>0)
00123 CODESIZE=codesize;
00124 else{
00125 cerr << "ngramtable wrong codesize\n";
00126 exit(1);
00127 }
00128
00129 code_range[1]=255;
00130 code_range[2]=65535;
00131 code_range[3]=16777214;
00132 code_range[4]=2147483640;
00133 code_range[6]=140737488360000LL;
00134
00135
00136
00137
00138 L_FREQ_SIZE=FREQ1;
00139
00140 WORD_OFFS =0;
00141 MSUCC_OFFS =CODESIZE;
00142 MTAB_OFFS =MSUCC_OFFS+CODESIZE;
00143 FLAGS_OFFS =MTAB_OFFS+PTRSIZE;
00144
00145 switch (tt){
00146
00147 case COUNT:
00148 SUCC1_OFFS =0;
00149 SUCC2_OFFS =0;
00150 BOFF_OFFS =0;
00151 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE;
00152 I_FREQ_NUM=1;
00153 L_FREQ_NUM=1;
00154
00155 ttype=tt;
00156 break;
00157
00158 case FULL:case MSHIFTBETA_B:
00159 SUCC1_OFFS =FLAGS_OFFS+CHARSIZE;
00160 SUCC2_OFFS =SUCC1_OFFS+CODESIZE;
00161 BOFF_OFFS =SUCC2_OFFS+CODESIZE;
00162 I_FREQ_OFFS=BOFF_OFFS+INTSIZE;
00163 L_FREQ_OFFS=CODESIZE;
00164 I_FREQ_NUM=2;
00165 L_FREQ_NUM=1;
00166
00167 ttype=tt;
00168 break;
00169
00170 case MSHIFTBETA_I:
00171 SUCC1_OFFS =FLAGS_OFFS+CHARSIZE;
00172 SUCC2_OFFS =SUCC1_OFFS+CODESIZE;
00173 BOFF_OFFS =0;
00174 I_FREQ_OFFS=SUCC2_OFFS+CODESIZE;
00175 L_FREQ_OFFS=CODESIZE;
00176 I_FREQ_NUM=2;
00177 L_FREQ_NUM=1;
00178
00179 ttype=tt;
00180 break;
00181
00182 case SIMPLE_I:
00183 SUCC1_OFFS = 0;
00184 SUCC2_OFFS = 0;
00185 BOFF_OFFS = 0;
00186 I_FREQ_OFFS= FLAGS_OFFS+CHARSIZE;
00187 L_FREQ_OFFS=CODESIZE;
00188 I_FREQ_NUM=1;
00189 L_FREQ_NUM=1;
00190
00191 ttype=tt;
00192 break;
00193
00194 case SIMPLE_B:
00195
00196 SUCC1_OFFS = 0;
00197 SUCC2_OFFS = 0;
00198 BOFF_OFFS = FLAGS_OFFS+CHARSIZE;
00199 I_FREQ_OFFS = BOFF_OFFS+INTSIZE;
00200 L_FREQ_OFFS = CODESIZE;
00201 I_FREQ_NUM = 1;
00202 L_FREQ_NUM = 1;
00203
00204 ttype=tt;
00205 break;
00206
00207 case SHIFTBETA_I:
00208 SUCC1_OFFS = FLAGS_OFFS+CHARSIZE;
00209 SUCC2_OFFS = 0;
00210 BOFF_OFFS = 0;
00211 I_FREQ_OFFS= SUCC1_OFFS+CODESIZE;
00212 L_FREQ_OFFS=CODESIZE;
00213 I_FREQ_NUM=1;
00214 L_FREQ_NUM=1;
00215
00216 ttype=tt;
00217 break;
00218
00219 case SHIFTBETA_B:
00220
00221 SUCC1_OFFS = FLAGS_OFFS+CHARSIZE;
00222 SUCC2_OFFS = 0;
00223 BOFF_OFFS = SUCC1_OFFS+CODESIZE;
00224 I_FREQ_OFFS = BOFF_OFFS+INTSIZE;
00225 L_FREQ_OFFS = CODESIZE;
00226 I_FREQ_NUM = 1;
00227 L_FREQ_NUM = 1;
00228
00229 ttype=tt;
00230 break;
00231
00232 case LEAFPROB: case FLEAFPROB:
00233 SUCC1_OFFS = 0;
00234 SUCC2_OFFS = 0;
00235 BOFF_OFFS = 0;
00236 I_FREQ_OFFS = FLAGS_OFFS+CHARSIZE;
00237 I_FREQ_NUM = 0;
00238 L_FREQ_NUM = 1;
00239
00240 ttype=tt;
00241 break;
00242
00243 case LEAFPROB2:
00244 SUCC1_OFFS =0;
00245 SUCC2_OFFS =0;
00246 BOFF_OFFS =0;
00247 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE;
00248 I_FREQ_NUM=0;
00249 L_FREQ_NUM=2;
00250
00251 ttype=LEAFPROB;
00252 break;
00253
00254 case LEAFPROB3:
00255 SUCC1_OFFS =0;
00256 SUCC2_OFFS =0;
00257 BOFF_OFFS =0;
00258 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE;
00259 I_FREQ_NUM=0;
00260 L_FREQ_NUM=3;
00261
00262 ttype=LEAFPROB;
00263 break;
00264
00265 case LEAFPROB4:
00266 SUCC1_OFFS =0;
00267 SUCC2_OFFS =0;
00268 BOFF_OFFS =0;
00269 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE;
00270 I_FREQ_NUM=0;
00271 L_FREQ_NUM=4;
00272
00273 ttype=LEAFPROB;
00274 break;
00275
00276 default:
00277 assert(tt==COUNT);
00278 }
00279
00280 L_FREQ_OFFS=CODESIZE;
00281 }
00282
00283 int inodesize(int s){
00284
00285 return I_FREQ_OFFS + I_FREQ_NUM * s;
00286
00287 }
00288
00289 int lnodesize(int s){
00290 return L_FREQ_OFFS + L_FREQ_NUM * s;
00291 }
00292
00293 };
00294
00295
00296 class ngramtable:tabletype{
00297
00298 node tree;
00299 int maxlev;
00300 NODETYPE treeflags;
00301 char info[100];
00302 int resolution;
00303 double decay;
00304
00305 storage* mem;
00306
00307 int* memory;
00308 int* occupancy;
00309 long long* mentr;
00310 long long card;
00311
00312 int idx[MAX_NGRAM+1];
00313
00314 int oov_code,oov_size,du_code, bo_code;
00315
00316 int backoff_state;
00317
00318 public:
00319
00320 int corrcounts;
00321
00322 dictionary *dict;
00323
00324
00325
00326
00327 dictionary *filterdict;
00328
00329
00330
00331
00332
00333 ngramtable(char* filename,int maxl,char* is,char *oovlex,
00334 char* filterdictfile,
00335 int googletable=0,
00336 int dstco=0,char* hmask=NULL,int inplen=0,
00337 TABLETYPE tt=FULL,int codesize=DEFCODESIZE);
00338
00339 inline char* ngtype(char *str=NULL){if (str!=NULL) strcpy(info,str);return info;}
00340
00341 ~ngramtable();
00342 void freetree(node nd);
00343 void stat(int level=4);
00344
00345 inline long long totfreq(long long v=-1){
00346 return (v==-1?freq(tree,INODE):freq(tree,INODE,v));
00347 }
00348
00349 inline long long btotfreq(long long v=-1){
00350 return (v==-1?getfreq(tree,treeflags,1):setfreq(tree,treeflags,v,1));
00351 }
00352
00353 inline long long entries(int lev){
00354 return mentr[lev];
00355 }
00356
00357 int maxlevel(){return maxlev;}
00358
00359
00360 void savetxt(char *filename,int sz=0,int googleformat=0);
00361 void loadtxt(char *filename,int googletable=0);
00362
00363
00364 void savebin(char *filename,int sz=0);
00365 void savebin(mfstream& out);
00366 void savebin(mfstream& out,node nd,NODETYPE ndt,int lev,int mlev);
00367
00368 void loadbin(const char *filename);
00369 void loadbin(mfstream& inp);
00370 void loadbin(mfstream& inp,node nd,NODETYPE ndt,int lev);
00371
00372 void loadbinold(char *filename);
00373 void loadbinold(mfstream& inp,node nd,NODETYPE ndt,int lev);
00374
00375 void generate(char *filename);
00376 void generate_dstco(char *filename,int dstco);
00377 void generate_hmask(char *filename,char* hmask,int inplen=0);
00378
00379 void augment(ngramtable* ngt);
00380
00381 int scan(ngram& ng,ACTION action=CONT,int maxlev=-1){
00382 return scan(tree,INODE,0,ng,action,maxlev);}
00383
00384 int succscan(ngram& h,ngram& ng,ACTION action,int lev){
00385
00386 return scan(h.link,h.info,lev-1,ng,action,lev);
00387 }
00388
00389 double prob(ngram ng);
00390
00391 int scan(node nd,NODETYPE ndt,int lev,ngram& ng,ACTION action=CONT,int maxl=-1);
00392
00393 void show();
00394
00395 void *search(table *tb,NODETYPE ndt,int lev,int n,int sz,int *w,
00396 ACTION action,char **found=(char **)NULL);
00397
00398 int mybsearch(char *ar, int n, int size, unsigned char *key, int *idx);
00399
00400 int put(ngram& ng);
00401 int put(ngram& ng,node nd,NODETYPE ndt,int lev);
00402
00403 int get(ngram& ng){return get(ng,maxlev,maxlev);}
00404 int get(ngram& ng,int n,int lev);
00405
00406 int comptbsize(int n);
00407 table *grow(table *tb,NODETYPE ndt,int lev,int n,int sz,NODETYPE oldndt=0);
00408
00409 inline int putmem(char* ptr,int value,int offs,int size){
00410 assert(ptr!=NULL);
00411 for (int i=0;i<size;i++)
00412 ptr[offs+i]=(value >> (8 * i)) & 0xff;
00413 return value;
00414 }
00415
00416 inline int getmem(char* ptr,int* value,int offs,int size){
00417 assert(ptr!=NULL);
00418 *value=ptr[offs] & 0xff;
00419 for (int i=1;i<size;i++)
00420 *value= *value | ( ( ptr[offs+i] & 0xff ) << (8 *i));
00421 return *value;
00422 }
00423
00424 inline long putmem(char* ptr,long long value,int offs,int size){
00425 assert(ptr!=NULL);
00426 for (int i=0;i<size;i++)
00427 ptr[offs+i]=(value >> (8 * i)) & 0xffLL;
00428 return value;
00429 }
00430
00431 inline long getmem(char* ptr,long long* value,int offs,int size){
00432 assert(ptr!=NULL);
00433 *value=ptr[offs] & 0xff;
00434 for (int i=1;i<size;i++)
00435 *value= *value | ( ( ptr[offs+i] & 0xffLL ) << (8 *i));
00436 return *value;
00437 }
00438
00439 inline void tb2ngcpy(int* wordp,char* tablep,int n=1){
00440 for (int i=0;i<n;i++)
00441 getmem(tablep,&wordp[i],i*CODESIZE,CODESIZE);
00442 }
00443
00444 inline void ng2tbcpy(char* tablep,int* wordp,int n=1){
00445 for (int i=0;i<n;i++)
00446 putmem(tablep,wordp[i],i*CODESIZE,CODESIZE);
00447 }
00448
00449 inline int ngtbcmp(int* wordp,char* tablep,int n=1){
00450 int word;
00451 for (int i=0;i<n;i++){
00452 getmem(tablep,&word,i*CODESIZE,CODESIZE);
00453 if (wordp[i]!=word) return 1;
00454 }
00455 return 0;
00456 }
00457
00458 inline int word(node nd,int value)
00459 {
00460 putmem(nd,value,WORD_OFFS,CODESIZE);
00461 return value;
00462 }
00463
00464 inline int word(node nd)
00465 {
00466 int v;
00467 getmem(nd,&v,WORD_OFFS,CODESIZE);
00468 return v;
00469 }
00470
00471 unsigned char mtflags(node nd,unsigned char value){
00472 return *(unsigned char *)(nd+FLAGS_OFFS)=value;
00473 }
00474
00475 unsigned char mtflags(node nd){
00476 return *(unsigned char *)(nd+FLAGS_OFFS);
00477 }
00478
00479
00480 int update(ngram ng){
00481
00482 if (!get(ng,ng.size,ng.size))
00483 {
00484 cerr << "cannot find " << ng << "\n";
00485 exit (1);
00486 }
00487
00488 freq(ng.link,ng.pinfo,ng.freq);
00489
00490 return 1;
00491 }
00492
00493 long long freq(node nd,NODETYPE ndt,long long value)
00494 {
00495 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS;
00496
00497 if (ndt & FREQ1)
00498 putmem(nd,value,offs,1);
00499 else if (ndt & FREQ2)
00500 putmem(nd,value,offs,2);
00501 else if (ndt & FREQ3)
00502 putmem(nd,value,offs,3);
00503 else if (ndt & FREQ4)
00504 putmem(nd,value,offs,4);
00505 else
00506 putmem(nd,value,offs,6);
00507 return value;
00508 }
00509
00510 long long freq(node nd,NODETYPE ndt)
00511 {
00512 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS;
00513 long long value;
00514
00515 if (ndt & FREQ1)
00516 getmem(nd,&value,offs,1);
00517 else if (ndt & FREQ2)
00518 getmem(nd,&value,offs,2);
00519 else if (ndt & FREQ3)
00520 getmem(nd,&value,offs,3);
00521 else if (ndt & FREQ4)
00522 getmem(nd,&value,offs,4);
00523 else
00524 getmem(nd,&value,offs,6);
00525
00526 return value;
00527 }
00528
00529
00530 long long setfreq(node nd,NODETYPE ndt,long long value,int index=0)
00531 {
00532 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS;
00533
00534 if (ndt & FREQ1)
00535 putmem(nd,value,offs+index * 1,1);
00536 else if (ndt & FREQ2)
00537 putmem(nd,value,offs+index * 2,2);
00538 else if (ndt & FREQ3)
00539 putmem(nd,value,offs+index * 3,3);
00540 else if (ndt & FREQ4)
00541 putmem(nd,value,offs+index * 4,4);
00542 else
00543 putmem(nd,value,offs+index * 6,6);
00544
00545 return value;
00546 }
00547
00548 long long getfreq(node nd,NODETYPE ndt,int index=0)
00549 {
00550 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS;
00551
00552 long long value;
00553
00554 if (ndt & FREQ1)
00555 getmem(nd,&value,offs+ index * 1,1);
00556 else if (ndt & FREQ2)
00557 getmem(nd,&value,offs+ index * 2,2);
00558 else if (ndt & FREQ3)
00559 getmem(nd,&value,offs+ index * 3,3);
00560 else if (ndt & FREQ4)
00561 getmem(nd,&value,offs+ index * 4,4);
00562 else
00563 getmem(nd,&value,offs+ index * 6,6);
00564
00565 return value;
00566 }
00567
00568
00569 double boff(node nd)
00570 {
00571 int value=0;
00572 getmem(nd,&value,BOFF_OFFS,INTSIZE);
00573
00574 return double (value/(double)1000000000.0);
00575 }
00576
00577
00578 double myround(double x){
00579 long int i=(long int)(x);
00580 return (x-i)>0.500?i+1.0:(double)i;
00581 }
00582
00583 int boff(node nd,double value)
00584 {
00585 int v=(int)myround(value * 1000000000.0);
00586 putmem(nd,v,BOFF_OFFS,INTSIZE);
00587
00588 return 1;
00589 }
00590
00591 int succ2(node nd,int value)
00592 {
00593 putmem(nd,value,SUCC2_OFFS,CODESIZE);
00594 return value;
00595 }
00596
00597 int succ2(node nd)
00598 {
00599 int value=0;
00600 getmem(nd,&value,SUCC2_OFFS,CODESIZE);
00601 return value;
00602 }
00603
00604 int succ1(node nd,int value)
00605 {
00606 putmem(nd,value,SUCC1_OFFS,CODESIZE);
00607 return value;
00608 }
00609
00610 int succ1(node nd)
00611 {
00612 int value=0;
00613 getmem(nd,&value,SUCC1_OFFS,CODESIZE);
00614 return value;
00615 }
00616
00617 int msucc(node nd,int value)
00618 {
00619 putmem(nd,value,MSUCC_OFFS,CODESIZE);
00620 return value;
00621 }
00622
00623 int msucc(node nd)
00624 {
00625 int value;
00626 getmem(nd,&value,MSUCC_OFFS,CODESIZE);
00627 return value;
00628 }
00629
00630 table mtable(node nd)
00631 {
00632 char v[PTRSIZE];;
00633 for (int i=0;i<PTRSIZE;i++)
00634 v[i]=nd[MTAB_OFFS+i];
00635
00636 return *(table *)v;
00637 }
00638
00639 table mtable(node nd,table value)
00640 {
00641 char *v=(char *)&value;
00642 for (int i=0;i<PTRSIZE;i++)
00643 nd[MTAB_OFFS+i]=v[i];
00644 return value;
00645 }
00646
00647 int mtablesz(node nd)
00648 {
00649 if (mtflags(nd) & LNODE){
00650 if (mtflags(nd) & FREQ1)
00651 return lnodesize(1);
00652 else if (mtflags(nd) & FREQ2)
00653 return lnodesize(2);
00654 else if (mtflags(nd) & FREQ3)
00655 return lnodesize(3);
00656 else if (mtflags(nd) & FREQ4)
00657 return lnodesize(4);
00658 else
00659 return lnodesize(6);
00660 }
00661 else if (mtflags(nd) & INODE){
00662 if (mtflags(nd) & FREQ1)
00663 return inodesize(1);
00664 else if (mtflags(nd) & FREQ2)
00665 return inodesize(2);
00666 else if (mtflags(nd) & FREQ3)
00667 return inodesize(3);
00668 else if (mtflags(nd) & FREQ4)
00669 return inodesize(4);
00670 else
00671 return inodesize(6);
00672 }
00673 else{
00674 cerr << "node has wrong flags\n";
00675 exit(1);
00676 }
00677 }
00678
00679
00680 int bo_state(int value=-1){
00681 return (value==-1?backoff_state:backoff_state=value);
00682 }
00683
00684 };
00685
00686 #endif
00687
00688
00689
00690