00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "mfstream.h"
00024 #include <stdio.h>
00025 #include <stdlib.h>
00026 #include <iomanip>
00027 #include <iostream>
00028 #include <fstream>
00029 #include "mempool.h"
00030 #include "htable.h"
00031 #include "dictionary.h"
00032 #include "index.h"
00033
00034 using namespace std;
00035
00036 dictionary::dictionary(char *filename,int size){
00037
00038
00039 htb = new htable(size/LOAD_FACTOR);
00040 tb = new dict_entry[size];
00041 st = new strstack(size * 10);
00042
00043 for (int i=0;i<size;i++) tb[i].freq=0;
00044
00045 oov_code = -1;
00046
00047 n = 0;
00048 N = 0;
00049 dubv = 0;
00050 lim = size;
00051 ifl=0;
00052
00053 if (filename==NULL) return;
00054
00055 mfstream inp(filename,ios::in);
00056
00057 if (!inp){
00058 cerr << "cannot open " << filename << "\n";
00059 exit(1);
00060 }
00061
00062 char buffer[100];
00063
00064 inp >> setw(100) >> buffer;
00065
00066 inp.close();
00067
00068 if ((strncmp(buffer,"dict",4)==0) ||
00069 (strncmp(buffer,"DICT",4)==0))
00070 load(filename);
00071 else
00072 generate(filename);
00073
00074 cerr << "loaded \n";
00075
00076 }
00077
00078
00079
00080 int dictionary::getword(fstream& inp , char* buffer){
00081
00082
00083 while(inp >> setw(MAX_WORD) >> buffer){
00084
00085
00086
00087 if (strlen(buffer)==(MAX_WORD-1)){
00088 cerr << "getword: a very long word was read ("
00089 << buffer << ")\n";
00090 }
00091
00092
00093 if (strlen(buffer)==0){
00094 cerr << "zero lenght word!\n";
00095 continue;
00096 }
00097
00098
00099 return 1;
00100
00101 }
00102
00103 return 0;
00104 }
00105
00106
00107 void dictionary::generate(char *filename){
00108
00109 char buffer[MAX_WORD];
00110 int counter=0;
00111
00112 mfstream inp(filename,ios::in);
00113
00114 if (!inp){
00115 cerr << "cannot open " << filename << "\n";
00116 exit(1);
00117 }
00118
00119 cerr << "dict:";
00120
00121 ifl=1;
00122
00123 while (getword(inp,buffer)){
00124
00125 incfreq(encode(buffer),1);
00126
00127 if (!(++counter % 1000000)) cerr << ".";
00128 }
00129
00130 ifl=0;
00131
00132 cerr << "\n";
00133
00134 inp.close();
00135
00136 }
00137
00138
00139
00140
00141 void dictionary::print_curve(int curvesize, float* testOOV) {
00142
00143 int* curve = new int[curvesize];
00144 for (int i=0;i<curvesize;i++) curve[i]=0;
00145
00146
00147 for (int i=0;i<n;i++){
00148 if(tb[i].freq > curvesize-1)
00149 curve[curvesize-1]++;
00150 else
00151 curve[tb[i].freq-1]++;
00152 }
00153
00154
00155 for (int i=curvesize-2; i>=0; i--) {
00156 curve[i] = curve[i] + curve[i+1];
00157 }
00158
00159 cout.setf(ios::fixed);
00160 cout << "Dict size: " << n << "\n";
00161 cout << "**************** DICTIONARY GROWTH CURVE ****************\n";
00162 cout << "Freq\tEntries\tPercent";
00163 if(testOOV!=NULL)
00164 cout << "\t\tFreq\tOOV onTest";
00165 cout << "\n";
00166
00167 for (int i=0;i<curvesize;i++) {
00168
00169 cout << ">" << i << "\t" << curve[i] << "\t" << setprecision(2) << (float)curve[i]/n * 100.0 << "%";
00170
00171
00172 if(testOOV!=NULL)
00173 cout << "\t\t<" << i+1<< "\t" << testOOV[i] << "%";
00174 cout << "\n";
00175 }
00176 cout << "*********************************************************\n";
00177 }
00178
00179
00180
00181
00182
00183
00184 float* dictionary::test(int curvesize, const char *filename, int listflag) {
00185
00186 int NwTest=0;
00187 int* OOVchart = new int[curvesize];
00188 for (int j=0; j<curvesize; j++) OOVchart[j]=0;
00189 char buffer[MAX_WORD];
00190
00191 const char* bos = BoS();
00192
00193 int k;
00194 mfstream inp(filename,ios::in);
00195
00196 if (!inp){
00197 cerr << "cannot open test: " << filename << "\n";
00198
00199 return NULL;
00200 }
00201 cerr << "test:";
00202
00203 k=0;
00204 while (getword(inp,buffer)){
00205
00206
00207 if (strcmp(buffer,bos)==0)
00208 continue;
00209
00210 int freq = 0; int wCode = getcode(buffer);
00211 if(wCode!=-1) freq = tb[wCode].freq;
00212
00213 if(freq==0) {
00214 OOVchart[0]++;
00215 if(listflag) { cerr << "<OOV>" << buffer << "</OOV>\n"; }
00216 }
00217 else{
00218 if(freq < curvesize) OOVchart[freq]++;
00219 }
00220 NwTest++;
00221 if (!(++k % 1000000)) cerr << ".";
00222 }
00223 cerr << "\n";
00224 inp.close();
00225 cout << "nb words of test: " << NwTest << "\n";
00226
00227
00228 for (int i=1; i<curvesize; i++)
00229 OOVchart[i] = OOVchart[i] + OOVchart[i-1];
00230
00231
00232 float* OOVrates = new float[curvesize];
00233 for (int i=0; i<curvesize; i++)
00234 OOVrates[i] = (float)OOVchart[i]/NwTest * 100.0;
00235
00236 return OOVrates;
00237 }
00238
00239 void dictionary::load(char* filename){
00240 char header[100];
00241 char buffer[MAX_WORD];
00242 char *addr;
00243 int freqflag=0;
00244
00245 mfstream inp(filename,ios::in);
00246
00247 if (!inp){
00248 cerr << "\ncannot open " << filename << "\n";
00249 exit(1);
00250 }
00251
00252 cerr << "dict:";
00253
00254 inp.getline(header,100);
00255 if (strncmp(header,"DICT",4)==0)
00256 freqflag=1;
00257 else
00258 if (strncmp(header,"dict",4)!=0){
00259 cerr << "\ndictionary file " << filename << " has a wrong header\n";
00260 exit(1);
00261 }
00262
00263
00264 while (getword(inp,buffer)){
00265
00266 tb[n].word=st->push(buffer);
00267 tb[n].code=n;
00268
00269 if (freqflag)
00270 inp >> tb[n].freq;
00271 else
00272 tb[n].freq=0;
00273
00274 if ((addr=htb->search((char *)&tb[n].word,HT_ENTER)))
00275 if (addr!=(char *)&tb[n].word){
00276 cerr << "dictionary::loadtxt wrong entry was found ("
00277 << buffer << ") in position " << n << "\n";
00278
00279 continue;
00280 }
00281
00282 N+=tb[n].freq;
00283
00284 if (strcmp(buffer,OOV())==0) oov_code=n;
00285
00286 if (++n==lim) grow();
00287
00288 }
00289
00290 inp.close();
00291 }
00292
00293
00294 void dictionary::load(std::istream& inp){
00295
00296 char buffer[MAX_WORD];
00297 char *addr;
00298 int size;
00299
00300 inp >> size;
00301
00302 for (int i=0;i<size;i++){
00303
00304 inp >> setw(MAX_WORD) >> buffer;
00305
00306 tb[n].word=st->push(buffer);
00307 tb[n].code=n;
00308 inp >> tb[n].freq;
00309 N+=tb[n].freq;
00310
00311 if ((addr=htb->search((char *)&tb[n].word,HT_ENTER)))
00312 if (addr!=(char *)&tb[n].word){
00313 cerr << "dictionary::loadtxt wrong entry was found ("
00314 << buffer << ") in position " << n << "\n";
00315 exit(1);
00316 }
00317
00318 if (strcmp(tb[n].word,OOV())==0)
00319 oov_code=n;
00320
00321 if (++n==lim) grow();
00322 }
00323 inp.getline(buffer,MAX_WORD-1);
00324 }
00325
00326
00327 void dictionary::save(std::ostream& out){
00328 out << n << "\n";
00329 for (int i=0;i<n;i++)
00330 out << tb[i].word << " " << tb[i].freq << "\n";
00331 }
00332
00333 int cmpdictentry(const void *a,const void *b){
00334 dict_entry *ae=(dict_entry *)a;
00335 dict_entry *be=(dict_entry *)b;
00336 if (be->freq-ae->freq)
00337 return be->freq-ae->freq;
00338 else
00339 return strcmp(ae->word,be->word);
00340 }
00341
00342
00343 dictionary::dictionary(dictionary* d, bool sortflag){
00344
00345
00346
00347 n=d->n;
00348 N=d->N;
00349 lim=d->lim;
00350 oov_code=-1;
00351 ifl=0;
00352 dubv=d->dubv;
00353
00354
00355
00356 tb = new dict_entry[lim];
00357 htb = new htable(lim/LOAD_FACTOR);
00358 st = new strstack(lim * 10);
00359
00360 for (int i=0;i<n;i++){
00361 tb[i].code=d->tb[i].code;
00362 tb[i].freq=d->tb[i].freq;
00363 tb[i].word=st->push(d->tb[i].word);
00364 }
00365
00366 if (sortflag){
00367
00368 cerr << "sorting dictionary ...";
00369 qsort(tb,n,sizeof(dict_entry),cmpdictentry);
00370 cerr << "done\n";
00371 }
00372
00373 for (int i=0;i<n;i++){
00374
00375
00376 if (d->oov_code==tb[i].code) oov_code=i;
00377
00378 tb[i].code=i;
00379 htb->search((char *)&tb[i].word,HT_ENTER);
00380 };
00381
00382 };
00383
00384 void dictionary::sort(){
00385
00386
00387 if (htb != NULL ) delete htb;
00388 htb = new htable(lim/LOAD_FACTOR);
00389
00390
00391 cerr << "sorting dictionary ...";
00392 qsort(tb,n,sizeof(dict_entry),cmpdictentry);
00393 cerr << "done\n";
00394
00395 for (int i=0;i<n;i++){
00396
00397 if (oov_code==tb[i].code) oov_code=i;
00398 tb[i].code=i;
00399 htb->search((char *)&tb[i].word,HT_ENTER);
00400 };
00401
00402 }
00403
00404 dictionary::~dictionary(){
00405 delete htb;
00406 delete st;
00407 delete [] tb;
00408 }
00409
00410 void dictionary::stat(){
00411 cout << "dictionary class statistics\n";
00412 cout << "size " << n
00413 << " used memory "
00414 << (lim * sizeof(int) +
00415 htb->used() +
00416 st->used())/1024 << " Kb\n";
00417 }
00418
00419 void dictionary::grow(){
00420
00421 delete htb;
00422
00423 cerr << "+\b";
00424
00425 dict_entry *tb2=new dict_entry[lim+GROWTH_STEP];
00426
00427 memcpy(tb2,tb,sizeof(dict_entry) * lim );
00428
00429 delete [] tb; tb=tb2;
00430
00431 htb=new htable((lim+GROWTH_STEP)/LOAD_FACTOR);
00432
00433 for (int i=0;i<lim;i++)
00434
00435 htb->search((char *)&tb[i].word,HT_ENTER);
00436
00437 for (int i=lim;i<lim+GROWTH_STEP;i++) tb[i].freq=0;
00438
00439 lim+=GROWTH_STEP;
00440
00441
00442 }
00443
00444 void dictionary::save(char *filename,int freqflag){
00445
00446 std::ofstream out(filename,ios::out);
00447
00448 if (!out){
00449 cerr << "cannot open " << filename << "\n";
00450 }
00451
00452
00453 if (freqflag)
00454 out << "DICTIONARY 0 " << n << "\n";
00455 else
00456 out << "dictionary 0 " << n << "\n";
00457
00458 for (int i=0;i<n;i++)
00459 if (tb[i].freq){
00460 out << tb[i].word;
00461 if (freqflag)
00462 out << " " << tb[i].freq;
00463 out << "\n";
00464 }
00465
00466 out.close();
00467 }
00468
00469
00470 int dictionary::getcode(const char *w){
00471 dict_entry* ptr=(dict_entry *)htb->search((char *)&w,HT_FIND);
00472 if (ptr==NULL) return -1;
00473 return ptr->code;
00474 }
00475
00476 int dictionary::encode(const char *w){
00477
00478
00479 if (strlen(w)==0){cerr << "0";w=OOV();}
00480
00481 dict_entry* ptr;
00482
00483 if ((ptr=(dict_entry *)htb->search((char *)&w,HT_FIND))!=NULL)
00484 return ptr->code;
00485 else{
00486 if (!ifl){
00487 if (oov_code==-1){
00488 cerr << "starting to use OOV words [" << w << "]\n";
00489 tb[n].word=st->push(OOV());
00490 htb->search((char *)&tb[n].word,HT_ENTER);
00491 tb[n].code=n;
00492 tb[n].freq=0;
00493 oov_code=n;
00494 if (++n==lim) grow();
00495 }
00496
00497 return encode(OOV());
00498 }
00499 else{
00500 tb[n].word=st->push((char *)w);
00501 htb->search((char *)&tb[n].word,HT_ENTER);
00502 tb[n].code=n;
00503 tb[n].freq=0;
00504 if (++n==lim) grow();
00505 return n-1;
00506 }
00507 }
00508 }
00509
00510
00511 const char *dictionary::decode(int c){
00512 if (c>=0 && c < n)
00513 return tb[c].word;
00514 else{
00515 cerr << "decode: code out of boundary\n";
00516 return OOV();
00517 }
00518 }
00519
00520
00521 dictionary_iter::dictionary_iter(dictionary *dict) : m_dict(dict) {
00522 m_dict->htb->scan(HT_INIT);
00523 }
00524
00525 dict_entry* dictionary_iter::next() {
00526 return (dict_entry*)m_dict->htb->scan(HT_CONT);
00527 }
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542