00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef MF_DICTIONARY_H
00024 #define MF_DICTIONARY_H
00025
#include "mfstream.h"
#include <cstdio>
#include <cstring>
#include <iostream>
00029
00030
// Size in bytes of the fixed word buffer used by dictionary::isprintable().
#define MAX_WORD 1000
// NOTE(review): not referenced in this header; presumably the hash-table
// load factor used by the implementation -- confirm.
#define LOAD_FACTOR 5

// NOTE(review): not referenced in this header; presumably the number of
// entries added by dictionary::grow() -- confirm. Overridable at build time.
#if !defined(GROWTH_STEP)
#define GROWTH_STEP 100000
#endif

// Default value of the `size` argument of dictionary's filename constructor.
// Overridable at build time.
#if !defined(DICT_INITSIZE)
#define DICT_INITSIZE 100000
#endif

// String returned by dictionary::BoS(). Overridable at build time.
#if !defined(BOS_)
#define BOS_ "<s>"
#endif

// String returned by dictionary::EoS(). Overridable at build time.
#if !defined(EOS_)
#define EOS_ "</s>"
#endif

// String returned by dictionary::OOV(). Overridable at build time.
#if !defined(OOV_)
#define OOV_ "<unk>"
#endif
00057
// One dictionary record: a word together with its integer code and its
// frequency count.
struct dict_entry {
  const char *word;  // the word string (storage is not managed by this struct)
  int code;          // integer code associated with the word
  long long freq;    // frequency count of the word
};
00063
00064 class strstack;
00065 class htable;
00066
00067 class dictionary{
00068 strstack *st;
00069 dict_entry *tb;
00070 htable *htb;
00071 int n;
00072 long long N;
00073 int lim;
00074 int oov_code;
00075 char ifl;
00076 int dubv;
00077 char* oov_str;
00078
00079 public:
00080
00081 friend class dictionary_iter;
00082
00083 dictionary* oovlex;
00084
00085 inline int dub(){return dubv;}
00086 inline int dub(int value){return (dubv=value);}
00087
00088 inline const char *OOV(){return ((char*)OOV_);}
00089 inline const char *BoS(){return ((char*)BOS_);}
00090 inline const char *EoS(){return ((char*)EOS_);}
00091
00092 inline int oovcode(int v=-1){return oov_code=(v>=0?v:oov_code);}
00093
00094 inline int incflag(){return ifl;}
00095 inline int incflag(int v){return ifl=v;}
00096
00097 int getword(fstream& inp , char* buffer);
00098 int isprintable(char* w){
00099 char buffer[MAX_WORD];
00100 sprintf(buffer,"%s",w);
00101 return strcmp(w,buffer)==0;
00102 }
00103
00104 inline void genoovcode(){
00105 int c=encode(OOV());
00106 std::cerr << "OOV code is "<< c << std::endl;
00107 oovcode(c);
00108 }
00109
00110
00111 inline int setoovrate(double oovrate){
00112 encode(OOV());
00113 int oovfreq=(int)(oovrate * totfreq());
00114 std::cerr << "setting OOV rate to: " << oovrate << " -- freq= " << oovfreq << std::endl;
00115 return freq(oovcode(),oovfreq);
00116 }
00117
00118
00119 inline long long incfreq(int code,long long value){N+=value;return tb[code].freq+=value;}
00120
00121 inline long long multfreq(int code,double value){
00122 N+=(long long)(value * tb[code].freq)-tb[code].freq;
00123 return tb[code].freq=(long long)(value * tb[code].freq);
00124 }
00125
00126 inline long freq(int code,long long value=-1){
00127 if (value>=0){
00128 N+=value-tb[code].freq;
00129 tb[code].freq=value;
00130 }
00131 return tb[code].freq;
00132 }
00133
00134 inline long long totfreq(){return N;}
00135
00136 void grow();
00137 void sort();
00138
00139 dictionary(char *filename,int size=DICT_INITSIZE);
00140 dictionary(dictionary* d, bool sortflag=true);
00141
00142 ~dictionary();
00143 void generate(char *filename);
00144 void load(char *filename);
00145 void save(char *filename, int freqflag=0);
00146 void load(std::istream& fd);
00147 void save(std::ostream& fd);
00148
00149 int size(){return n;}
00150 int getcode(const char *w);
00151 int encode(const char *w);
00152 const char *decode(int c);
00153 void stat();
00154
00155 void print_curve(int curvesize, float* testOOV=NULL);
00156 float* test(int curvesize, const char *filename, int listflag=0);
00157
00158 void cleanfreq(){
00159 for (int i=0;i<n;tb[i++].freq=0);
00160 N=0;
00161 }
00162
00163 };
00164
00165 class dictionary_iter {
00166 public:
00167 dictionary_iter(dictionary *dict);
00168 dict_entry* next();
00169 private:
00170 dictionary* m_dict;
00171 };
00172
00173 #endif
00174