00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef MF_LMTABLE_H
00025 #define MF_LMTABLE_H
00026
#ifndef WIN32
#include <sys/types.h>
#include <sys/mman.h>
#endif

#include <math.h>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <set>
#include <string>
#include "util.h"
#include "ngramcache.h"
#include "dictionary.h"
#include "n_gram.h"
00040
// Function-like min/max helpers.
// NOTE: each argument may be evaluated twice, so never pass an expression
// with side effects (e.g. MAX(i++, j)).
#define MAX(a,b) (((a)>(b))?(a):(b))
#define MIN(a,b) (((a)<(b))?(a):(b))

// Highest n-gram level supported; per-level arrays are sized LMTMAXLEV+1.
#define LMTMAXLEV 20
#define MAX_LINE 1024

// Number of bytes used to store a word code inside a node.
// May be overridden from the build command line.
#ifndef LMTCODESIZE
#define LMTCODESIZE (int)3
#endif

// Byte sizes of the primitive fields.
#define SHORTSIZE (int)2
#define PTRSIZE (int)sizeof(char *)
#define INTSIZE (int)4
#define CHARSIZE (int)1

#define PROBSIZE (int)4 //use float
#define QPROBSIZE (int)1 //use qfloat_t

// Index of an entry inside one level of the table.
typedef unsigned int table_entry_pos_t;
typedef unsigned long table_pos_t;
// One-byte value stored in place of a float in the Q* node layouts.
typedef unsigned char qfloat_t;

#define BOUNDSIZE (int)sizeof(table_entry_pos_t) //use table_entry_pos_t

#define UNIGRAM_RESOLUTION 10000000.0

// Node layouts: INTERNAL/QINTERNAL nodes carry a bound field, LEAF/QLEAF
// nodes do not; the Q* variants store one-byte values instead of floats.
typedef enum {INTERNAL,QINTERNAL,LEAF,QLEAF} LMT_TYPE;
typedef enum {BINARY,TEXT,YRANIB,NONE} OUTFILE_TYPE;
// A node is addressed through a raw pointer to its first byte.
typedef char* node;

typedef enum {
  LMT_FIND,
  LMT_ENTER,
  LMT_INIT,
  LMT_CONT
} LMT_ACTION;

// The two largest-but-one values of table_entry_pos_t.
// NOTE(review): presumably reserved as "empty" markers for bound fields --
// confirm against the implementation file.
// Fully qualified so the header does not depend on a `using namespace std`
// leaking in from another include; requires <limits>.
#define BOUND_EMPTY1 (std::numeric_limits<table_entry_pos_t>::max() - 2)
#define BOUND_EMPTY2 (std::numeric_limits<table_entry_pos_t>::max() - 1)
00083
00084
00085
00086
00087
00088
00089 class lmtable{
00090
00091 protected:
00092 char* table[LMTMAXLEV+1];
00093 LMT_TYPE tbltype[LMTMAXLEV+1];
00094 table_entry_pos_t cursize[LMTMAXLEV+1];
00095 table_entry_pos_t maxsize[LMTMAXLEV+1];
00096 table_entry_pos_t* startpos[LMTMAXLEV+1];
00097
00098 int maxlev;
00099 char info[100];
00100
00101
00102 int totget[LMTMAXLEV+1];
00103 int totbsearch[LMTMAXLEV+1];
00104
00105
00106 bool isQtable;
00107
00108
00109 bool isItable;
00110
00111
00112 bool isInverted;
00113
00114
00115 bool isPruned;
00116
00117 int NumCenters[LMTMAXLEV+1];
00118 float* Pcenters[LMTMAXLEV+1];
00119 float* Bcenters[LMTMAXLEV+1];
00120
00121 double logOOVpenalty;
00122 int dictionary_upperbound;
00123 int backoff_state;
00124
00125
00126 ngramcache* lmtcache[LMTMAXLEV+1];
00127 ngramcache* probcache;
00128 ngramcache* statecache;
00129 ngramcache* statesizecache;
00130 int max_cache_lev;
00131
00132
00133 int memmap;
00134 int diskid;
00135 off_t tableOffs[LMTMAXLEV+1];
00136 off_t tableGaps[LMTMAXLEV+1];
00137
00138
00139
00140 bool orderQuery;
00141
00142 public:
00143
00144 #ifdef TRACE_CACHE
00145 std::fstream* cacheout;
00146 int sentence_id;
00147 #endif
00148
00149 dictionary *dict;
00150
00151 lmtable();
00152
00153 virtual ~lmtable(){
00154 for (int i=2;i<=LMTMAXLEV;i++)
00155 if (lmtcache[i]){
00156
00157 delete lmtcache[i];
00158 }
00159
00160 if (probcache){
00161
00162 delete probcache;
00163 #if TRACE_CACHE
00164 cacheout->close();
00165 delete cacheout;
00166 #endif
00167
00168 }
00169 if (statecache){
00170
00171 delete statecache;
00172 delete statesizecache;
00173 }
00174
00175
00176 for (int l=1;l<=maxlev;l++){
00177 if (table[l]){
00178 if (memmap > 0 && l >= memmap)
00179 Munmap(table[l]-tableGaps[l],cursize[l]*nodesize(tbltype[l])+tableGaps[l],0);
00180 else
00181 delete [] table[l];
00182 }
00183 if (isQtable){
00184 if (Pcenters[l]) delete [] Pcenters[l];
00185 if (l<maxlev)
00186 if (Bcenters[l]) delete [] Bcenters[l];
00187 }
00188 }
00189
00190 delete dict;
00191 }
00192
00193
00194
00195 table_entry_pos_t wdprune(float *thr, int aflag=0);
00196
00197
00198 table_entry_pos_t wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos,
00199 double lk=0, double bo=0, double *ts=0, double *tbs=0);
00200 double lprobx(ngram ong, double *lkp=0, double *bop=0, int *bol=0);
00201
00202
00203 table_entry_pos_t ngcnt(table_entry_pos_t *cnt);
00204
00205 table_entry_pos_t ngcnt(table_entry_pos_t *cnt, ngram ng, int l, table_entry_pos_t ipos, table_entry_pos_t epos);
00206
00207 int pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s);
00208
00209 void init_probcache();
00210 void init_statecache();
00211 void init_lmtcaches(int uptolev);
00212
00213 void check_cache_levels();
00214 void reset_caches();
00215
00216 void reset_mmap();
00217
00218 bool is_probcache_active(){return probcache!=NULL;}
00219 bool is_statecache_active(){return statecache!=NULL;}
00220 bool are_lmtcaches_active(){return lmtcache[2]!=NULL;}
00221
00222 bool is_inverted(const bool flag){return isInverted=flag;}
00223 bool is_inverted(){return isInverted;}
00224
00225 void configure(int n,bool quantized);
00226
00227
00228 double getlogOOVpenalty() const { return logOOVpenalty; }
00229
00230 double setlogOOVpenalty(int dub){
00231 assert(dub > dict->size());
00232 return logOOVpenalty=log((double)(dub - dict->size()))/log(10.0);
00233 }
00234
00235 double setlogOOVpenalty2(double oovp){
00236 return logOOVpenalty=oovp;
00237 }
00238
00239 int maxlevel() const {return maxlev;}
00240 bool isQuantized() const {return isQtable;}
00241
00242
00243 void savetxt(const char *filename);
00244 void savebin(const char *filename);
00245
00246 void dumplm(std::fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos);
00247
00248 void load(std::istream& inp,const char* filename=NULL,const char* outfilename=NULL,int mmap=0,OUTFILE_TYPE outtype=NONE);
00249 void loadtxt(std::istream& inp,const char* header,const char* outfilename,int mmap);
00250 void loadtxt(std::istream& inp,const char* header);
00251 void loadtxtmmap(std::istream& inp,const char* header,const char* outfilename);
00252 void loadbin(std::istream& inp,const char* header,const char* filename=NULL,int mmap=0);
00253
00254 void loadbinheader(std::istream& inp, const char* header);
00255 void loadbincodebook(std::istream& inp,int l);
00256 void loadcenters(std::istream& inp,int Order);
00257
00258
00259 lmtable* cpsublm(dictionary* subdict,bool keepunigr=true);
00260
00261 int reload(std::set<string> words);
00262
00263 void filter(const char* lmfile){}
00264
00265
00266 virtual double lprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL);
00267 virtual double clprob(ngram ng);
00268
00269
00270
00271 void *search(int lev,table_entry_pos_t offs,table_entry_pos_t n,int sz,int *w, LMT_ACTION action,char **found=(char **)NULL);
00272
00273
00274
00275 int mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx);
00276
00277
00278
00279 template<typename TA, typename TB> int add(ngram& ng, TA prob,TB bow);
00280
00281 void checkbounds(int level);
00282
00283 int get(ngram& ng){return get(ng,ng.size,ng.size);}
00284 int get(ngram& ng,int n,int lev);
00285
00286 int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev);
00287
00288 virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
00289 virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
00290
00291 inline void putmem(char* ptr,int value,int offs,int size){
00292 assert(ptr!=NULL);
00293 for (int i=0;i<size;i++)
00294 ptr[offs+i]=(value >> (8 * i)) & 0xff;
00295 }
00296
00297 inline void getmem(char* ptr,int* value,int offs,int size){
00298 assert(ptr!=NULL);
00299 *value=ptr[offs] & 0xff;
00300 for (int i=1;i<size;i++)
00301 *value= *value | ( ( ptr[offs+i] & 0xff ) << (8 *i));
00302 }
00303
00304 template<typename T>
00305 inline void putmem(char* ptr,T value,int offs){
00306 assert(ptr!=NULL);
00307 memcpy(ptr+offs, &value, sizeof(T));
00308 }
00309
00310 template<typename T>
00311 inline void getmem(char* ptr,T* value,int offs){
00312 assert(ptr!=NULL);
00313 memcpy((void*)value, ptr+offs, sizeof(T));
00314 }
00315
00316
00317 int nodesize(LMT_TYPE ndt){
00318 switch (ndt){
00319 case INTERNAL:
00320 return LMTCODESIZE + PROBSIZE + PROBSIZE + BOUNDSIZE;
00321 case QINTERNAL:
00322 return LMTCODESIZE + QPROBSIZE + QPROBSIZE + BOUNDSIZE;
00323 case LEAF:
00324 return LMTCODESIZE + PROBSIZE;
00325 case QLEAF:
00326 return LMTCODESIZE + QPROBSIZE;
00327 default:
00328 assert(0);
00329 return 0;
00330 }
00331 }
00332
00333 inline int word(node nd,int value=-1)
00334 {
00335 int offset=0;
00336
00337 if (value==-1)
00338 getmem(nd,&value,offset,LMTCODESIZE);
00339 else
00340 putmem(nd,value,offset,LMTCODESIZE);
00341
00342 return value;
00343 }
00344
00345 inline float prob(node nd,LMT_TYPE ndt)
00346 {
00347 int offs=LMTCODESIZE;
00348
00349 float fv;
00350 unsigned char cv;
00351 switch (ndt){
00352 case INTERNAL:
00353 getmem(nd,&fv,offs);
00354 return fv;
00355 case QINTERNAL:
00356 getmem(nd,&cv,offs);
00357 return (float) cv;
00358 case LEAF:
00359 getmem(nd,&fv,offs);
00360 return fv;
00361 case QLEAF:
00362 getmem(nd,&cv,offs);
00363 return (float) cv;
00364 default:
00365 assert(0);
00366 return 0;
00367 }
00368 }
00369
00370 template<typename T>
00371 inline float prob(node nd,LMT_TYPE ndt, T value)
00372 {
00373 int offs=LMTCODESIZE;
00374
00375 putmem(nd,value,offs);
00376
00377 return (float) value;
00378 }
00379
00380 inline float bow(node nd,LMT_TYPE ndt)
00381 {
00382 int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
00383
00384 float fv;
00385 unsigned char cv;
00386 switch (ndt){
00387 case INTERNAL:
00388 getmem(nd,&fv,offs);
00389 return fv;
00390 case QINTERNAL:
00391 getmem(nd,&cv,offs);
00392 return (float) cv;
00393 case LEAF:
00394 getmem(nd,&fv,offs);
00395 return fv;
00396 case QLEAF:
00397 getmem(nd,&cv,offs);
00398 return (float) cv;
00399 default:
00400 assert(0);
00401 return 0;
00402 }
00403 }
00404
00405 template<typename T>
00406 inline T bow(node nd,LMT_TYPE ndt, T value)
00407 {
00408 int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
00409
00410 putmem(nd,value,offs);
00411
00412 return value;
00413 }
00414
00415
00416 inline table_entry_pos_t bound(node nd,LMT_TYPE ndt)
00417 {
00418 int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
00419
00420 table_entry_pos_t v;
00421 getmem(nd,&v,offs);
00422 return v;
00423 }
00424
00425
00426 template<typename T>
00427 inline T bound(node nd,LMT_TYPE ndt, T value)
00428 {
00429 int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
00430
00431 putmem(nd,value,offs);
00432
00433 return value;
00434 }
00435
00436 void stat(int lev=0);
00437
00438
00439 void printTable(int level);
00440
00441 virtual inline dictionary* getDict() const {
00442 return dict;
00443 }
00444
00445 inline void setOrderQuery(bool v)
00446 {
00447 orderQuery = v;
00448 }
00449 inline bool isOrderQuery() const
00450 {
00451 return orderQuery;
00452 }
00453 };
00454
00455
00456 #endif
00457