00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cassert>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include "math.h"
#include "mempool.h"
#include "htable.h"
#include "ngramcache.h"
#include "dictionary.h"
#include "n_gram.h"
#include "lmtable.h"
#include "lmmacro.h"
#include "util.h"
00038
00039 using namespace std;
00040
00041
00042
// Forward declaration; the definition is not in this part of the file.
// Used below to split a text line into tokens stored in `words`; its return
// value is treated by the callers as the number of tokens found.
int parseWords(char *sentence, const char **words, int max);

// Reports a fatal problem: echoes `message` (plus a newline) on standard
// error, then throws std::runtime_error carrying the same text.
inline void error(const char* message){
  std::cerr << message << "\n";
  throw std::runtime_error(message);
}
00049
00050 void lmmacro::cutLex(ngram *in, ngram *out)
00051 {
00052 *out=*in;
00053
00054 const char *curr_macro = out->dict->decode(*(out->wordp(1)));
00055 out->shift();
00056 const char *p = strrchr(curr_macro, '_');
00057 int lexLen;
00058 if (p)
00059 lexLen=strlen(p);
00060 else
00061 lexLen=0;
00062 char curr_NoLexMacro[BUFSIZ];
00063 memset(&curr_NoLexMacro,0,BUFSIZ);
00064 strncpy(curr_NoLexMacro,curr_macro,strlen(curr_macro)-lexLen);
00065 out->pushw(curr_NoLexMacro);
00066 return;
00067 }
00068
00069
00070
00071
00072
00073 lmmacro::lmmacro(string lmfilename, istream& inp, istream& inpMap){
00074 dict = new dictionary((char *)NULL,1000000);
00075 microMacroMap = NULL;
00076 microMacroMapN = 0;
00077 lexicaltoken2classMap = NULL;
00078 lexicaltoken2classMapN = 0;
00079
00080 if (!loadmap(lmfilename, inp, inpMap))
00081 error((char*)"Error in loadmap\n");
00082
00083 };
00084
00085
00086 bool lmmacro::loadmap(string lmfilename, istream& inp, istream& inpMap) {
00087
00088 char line[MAX_LINE];
00089 const char* words[MAX_TOKEN_N_MAP];
00090 const char *macroW; const char *microW;
00091 int tokenN;
00092
00093 microMacroMap = (int *)calloc(BUFSIZ, sizeof(int));
00094
00095
00096 #ifdef WIN32
00097 lmtable::load(inp);
00098 #else
00099 if (lmfilename.compare(lmfilename.size()-3,3,".mm")==0)
00100 lmtable::load(inp,lmfilename.c_str(),NULL,1);
00101 else
00102 lmtable::load(inp,lmfilename.c_str(),NULL,0);
00103 #endif
00104
00105
00106 inpMap.getline(line,MAX_LINE,'\n');
00107 tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
00108 if (tokenN < 2 || strcmp(words[0],"FIELD")!=0)
00109 error((char*)"ERROR: wrong header format of map file\n[correct: FIELD <int> (file of lexical classes, only if <int> > 9)]\n");
00110 selectedField = atoi(words[1]);
00111 if ( (selectedField==-1 || selectedField==-2) && tokenN==2)
00112 cerr << "no selected field: the whole string is used\n";
00113 else if ((selectedField>=0 && selectedField<10) && tokenN==2)
00114 cerr << "selected field n. " << selectedField << "\n";
00115 else if (selectedField>9 && selectedField<100 && tokenN==2)
00116 cerr << "selected field is " << selectedField/10 << " lexicalized with field " << selectedField%10 << " (no lexical classes)\n";
00117 else if (selectedField>9 && selectedField<100 && tokenN==3)
00118 cerr << "selected field is " << selectedField/10 << " lexicalized with classes from field " << selectedField%10 << "\n";
00119 else
00120 error((char*)"ERROR: wrong header format of map file\n[correct: FIELD <int> (file of lexical classes, only if <int> > 9)]\n");
00121
00122
00123 if (tokenN==3)
00124 loadLexicalClasses(words[2]);
00125
00126
00127 getDict()->incflag(1);
00128 while (inpMap.getline(line,MAX_LINE,'\n')){
00129 tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
00130 if (tokenN != 2)
00131 error((char*)"ERROR: wrong format of map file\n");
00132 microW = words[0];
00133 macroW = words[1];
00134 getDict()->encode(microW);
00135
00136 #ifdef DEBUG
00137 cout << "\nmicroW = " << microW << "\n";
00138 cout << "macroW = " << macroW << "\n";
00139 cout << "microMacroMapN = " << microMacroMapN << "\n";
00140 cout << "code of micro = " << getDict()->getcode(microW) << "\n";
00141 cout << "code of macro = " << lmtable::getDict()->getcode(macroW) << "\n";
00142 #endif
00143
00144 if (microMacroMapN && !(microMacroMapN%BUFSIZ))
00145 microMacroMap = (int *)realloc(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ));
00146 microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(macroW);
00147 }
00148
00149 getDict()->genoovcode();
00150
00151 #ifdef DEBUG
00152 cout << "oovcode(micro)=" << getDict()->oovcode() << "\n";
00153 cout << "oovcode(macro)=" << lmtable::getDict()->oovcode() << "\n";
00154
00155 cout << "microMacroMapN = " << microMacroMapN << "\n";
00156 cout << "macrodictsize = " << lmtable::getDict()->size() << "\n";
00157 cout << "microdictsize = " << getDict()->size() << "\n";
00158
00159 for (int i=0; i<microMacroMapN; i++) {
00160 cout << "micro[" << getDict()->decode(i) << "] -> " << lmtable::getDict()->decode(microMacroMap[i]) << "\n";
00161 }
00162 #endif
00163 return true;
00164 };
00165
00166
00167 void lmmacro::loadLexicalClasses(const char *fn)
00168 {
00169 char line[MAX_LINE];
00170 const char* words[MAX_TOKEN_N_MAP];
00171 int tokenN;
00172
00173 lexicaltoken2classMap = (int *)calloc(BUFSIZ, sizeof(int));
00174 lexicaltoken2classMapN = BUFSIZ;
00175
00176 lmtable::getDict()->incflag(1);
00177
00178 inputfilestream inp(fn);
00179 while (inp.getline(line,MAX_LINE,'\n')){
00180 tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
00181 if (tokenN != 2)
00182 error((char*)"ERROR: wrong format of lexical classes file\n");
00183 else {
00184 int classIdx = atoi(words[1]);
00185 int wordCode = lmtable::getDict()->encode(words[0]);
00186
00187 if (wordCode>=lexicaltoken2classMapN) {
00188 int r = (wordCode-lexicaltoken2classMapN)/BUFSIZ;
00189 lexicaltoken2classMapN += (r+1)*BUFSIZ;
00190 lexicaltoken2classMap = (int *)realloc(lexicaltoken2classMap, sizeof(int)*lexicaltoken2classMapN);
00191 }
00192 lexicaltoken2classMap[wordCode] = classIdx;
00193 }
00194 }
00195
00196 lmtable::getDict()->incflag(0);
00197
00198 #ifdef DEBUG
00199 for (int x=0; x<lmtable::getDict()->size(); x++)
00200 cout << "class of <" << lmtable::getDict()->decode(x) << "> (code=" << x << ") = " << lexicaltoken2classMap[x] << endl;
00201 #endif
00202
00203 return;
00204 }
00205
00206
00207 double lmmacro::lprob(ngram micro_ng) {
00208
00209 #ifdef DEBUG
00210 cout << " lmmacro::lprob, parameter = <" << micro_ng << ">\n";
00211 #endif
00212
00213 ngram macro_ng(lmtable::getDict());
00214
00215 if (micro_ng.dict == macro_ng.dict)
00216 macro_ng.trans(micro_ng);
00217 else
00218 map(µ_ng, ¯o_ng);
00219
00220 #ifdef DEBUG
00221 cout << "lmmacro::lprob: micro_ng = " << micro_ng << "\n";
00222 cout << "lmmacro::lprob: macro_ng = " << macro_ng << "\n";
00223 #endif
00224
00225
00226 double prob;
00227 prob = lmtable::lprob(macro_ng);
00228 #ifdef DEBUG
00229 cout << "prob = " << prob << "\n";
00230 #endif
00231
00232 return prob;
00233 };
00234
00235
00236 double lmmacro::clprob(ngram micro_ng) {
00237 #ifdef DEBUG
00238 cout << " lmmacro::clprob, parameter = <" << micro_ng << ">\n";
00239 #endif
00240
00241 double logpr;
00242 ngram macro_ng(lmtable::getDict());
00243 ngram macroNoLex_ng(lmtable::getDict());
00244
00245
00246 map(µ_ng, ¯o_ng);
00247
00248 ngram prevMicro_ng(micro_ng);
00249 ngram prevMacro_ng(lmtable::getDict());
00250 prevMicro_ng.shift();
00251
00252
00253 map(&prevMicro_ng, &prevMacro_ng);
00254
00255
00256 #ifdef DEBUG
00257 cout << "lmmacro::clprob: micro_ng = " << micro_ng << "\n";
00258 cout << "lmmacro::clprob: macro_ng = " << macro_ng << "\n";
00259 cout << "lmmacro::clprob: prevMicro_ng = " << prevMicro_ng << "\n";
00260 cout << "lmmacro::clprob: prevMacro_ng = " << prevMacro_ng << "\n";
00261 #endif
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274 if (selectedField<10) {
00275
00276 if (prevMacro_ng == macro_ng)
00277 return 0.0;
00278
00279 #ifdef DEBUG
00280 cout << " QUERY MACRO LM on " << macro_ng << "\n";
00281 #endif
00282
00283 if (macro_ng.size==0) return 0.0;
00284
00285 if (macro_ng.size>maxlev) macro_ng.size=maxlev;
00286
00287
00288 if (probcache && macro_ng.size==maxlev && probcache->get(macro_ng.wordp(maxlev),(char *)&logpr))
00289 return logpr;
00290
00291
00292 logpr=lmmacro::lprob(macro_ng);
00293
00294 if (probcache && macro_ng.size==maxlev)
00295 probcache->add(macro_ng.wordp(maxlev),(char *)&logpr);
00296
00297 } else {
00298
00299 cutLex(¯o_ng, ¯oNoLex_ng);
00300 #ifdef DEBUG
00301 cout << " macroNoLex_ng = " << macroNoLex_ng << endl;
00302 #endif
00303 if (prevMacro_ng == macroNoLex_ng)
00304 {
00305 #ifdef DEBUG
00306 cout << " DO NOT QUERY MACRO LM " << endl;
00307 #endif
00308 return 0.0;
00309 }
00310
00311 #ifdef DEBUG
00312 cout << " QUERY MACRO LM on " << prevMacro_ng << "\n";
00313 #endif
00314
00315 if (prevMacro_ng.size==0) return 0.0;
00316
00317 if (prevMacro_ng.size>maxlev) prevMacro_ng.size=maxlev;
00318
00319
00320 if (probcache && prevMacro_ng.size==maxlev && probcache->get(prevMacro_ng.wordp(maxlev),(char *)&logpr))
00321 return logpr;
00322
00323
00324 logpr=lmmacro::lprob(prevMacro_ng);
00325
00326 if (probcache && prevMacro_ng.size==maxlev)
00327 probcache->add(prevMacro_ng.wordp(maxlev),(char *)&logpr);
00328 }
00329
00330 return logpr;
00331 };
00332
00333
00334
00335
00336
00337
00338 const char *lmmacro::maxsuffptr(ngram micro_ng, unsigned int* size){
00339
00340
00341
00342
00343
00344
00345
00346
00347 ngram macro_ng(lmtable::getDict());
00348
00349 if (micro_ng.dict == macro_ng.dict)
00350 macro_ng.trans(micro_ng);
00351 else
00352 map(µ_ng, ¯o_ng);
00353
00354 #ifdef DEBUG
00355 cout << "lmmacro::lprob: micro_ng = " << micro_ng << "\n";
00356 cout << "lmmacro::lprob: macro_ng = " << macro_ng << "\n";
00357 #endif
00358
00359 return lmtable::maxsuffptr(macro_ng,size);
00360
00361 }
00362
00363 const char *lmmacro::cmaxsuffptr(ngram micro_ng, unsigned int* size){
00364
00365
00366
00367
00368
00369
00370
00371
00372 ngram macro_ng(lmtable::getDict());
00373
00374 if (micro_ng.dict == macro_ng.dict)
00375 macro_ng.trans(micro_ng);
00376 else
00377 map(µ_ng, ¯o_ng);
00378
00379 #ifdef DEBUG
00380 cout << "lmmacro::lprob: micro_ng = " << micro_ng << "\n";
00381 cout << "lmmacro::lprob: macro_ng = " << macro_ng << "\n";
00382 #endif
00383
00384 return lmtable::cmaxsuffptr(macro_ng,size);
00385
00386 }
00387
00388
00389 void lmmacro::map(ngram *in, ngram *out)
00390 {
00391
00392 #ifdef DEBUG
00393 cout << "In lmmacro::map, in = " << *in << endl;
00394 cout << " (selectedField = " << selectedField << " )\n";
00395 #endif
00396 if (selectedField==-2)
00397 One2OneMapping(in, out);
00398
00399 else if (selectedField==-1)
00400 Micro2MacroMapping(in, out);
00401
00402 else if (selectedField<10) {
00403 ngram field_ng(getDict());
00404
00405 int microsize = in->size;
00406 for (int i=microsize; i>0; i--) {
00407 char curr_token[BUFSIZ];
00408 strcpy(curr_token, getDict()->decode(*(in->wordp(i))));
00409 char *field;
00410 if (strcmp(curr_token,"<s>") &&
00411 strcmp(curr_token,"</s>") &&
00412 strcmp(curr_token,"_unk_")) {
00413 field = strtok(curr_token, "#");
00414 for (int j=0; j<selectedField; j++)
00415 field = strtok(0, "#");
00416 } else
00417 field = curr_token;
00418
00419 if (field)
00420 field_ng.pushw(field);
00421 else {
00422 field_ng.pushw((char*)"_unk_");
00423
00424
00425 }
00426 }
00427 if (microMacroMapN>0)
00428 Micro2MacroMapping(&field_ng, out);
00429 else
00430 out->trans(field_ng);
00431 } else {
00432
00433
00434
00435
00436 int tagIdx = selectedField/10;
00437 int lemmaIdx = selectedField%10;
00438
00439
00440 ngram tag_ng(getDict());
00441 char *lemmas[BUFSIZ];
00442
00443 int microsize = in->size;
00444 for (int i=microsize; i>0; i--) {
00445 char curr_token[BUFSIZ];
00446 strcpy(curr_token, getDict()->decode(*(in->wordp(i))));
00447 char *tag = NULL, *lemma = NULL;
00448
00449 if (strcmp(curr_token,"<s>") &&
00450 strcmp(curr_token,"</s>") &&
00451 strcmp(curr_token,"_unk_")) {
00452
00453 if (tagIdx<lemmaIdx) {
00454 tag = strtok(curr_token, "#");
00455 for (int j=0; j<tagIdx; j++)
00456 tag = strtok(0, "#");
00457 for (int j=tagIdx; j<lemmaIdx; j++)
00458 lemma = strtok(0, "#");
00459 } else {
00460 lemma = strtok(curr_token, "#");
00461 for (int j=0; j<lemmaIdx; j++)
00462 lemma = strtok(0, "#");
00463 for (int j=lemmaIdx; j<tagIdx; j++)
00464 tag = strtok(0, "#");
00465 }
00466
00467 #ifdef DEBUG
00468 printf("(tag,lemma) = %s %s\n", tag, lemma);
00469 #endif
00470 } else {
00471 tag = curr_token;
00472 lemma = curr_token;
00473 #ifdef DEBUG
00474 printf("(tag=lemma) = %s %s\n", tag, lemma);
00475 #endif
00476 }
00477 if (tag) {
00478 tag_ng.pushw(tag);
00479 lemmas[i] = strdup(lemma);
00480 } else {
00481 tag_ng.pushw((char*)"_unk_");
00482 lemmas[i] = strdup("_unk_");
00483
00484
00485 }
00486 }
00487 if (microMacroMapN>0)
00488 Micro2MacroMapping(&tag_ng, out, lemmas);
00489 else
00490 out->trans(tag_ng);
00491
00492 #ifdef DEBUG
00493 cout << "In lmmacro::map, FINAL out = " << *out << endl;
00494 #endif
00495
00496 }
00497 }
00498
00499 void lmmacro::One2OneMapping(ngram *in, ngram *out)
00500 {
00501
00502 int insize = in->size;
00503
00504
00505
00506 for (int i=insize; i>0; i--) {
00507
00508 const char *outtoken =
00509 lmtable::getDict()->decode((*(in->wordp(i))<microMacroMapN)?microMacroMap[*(in->wordp(i))]:lmtable::getDict()->oovcode());
00510 out->pushw(outtoken);
00511 }
00512 return;
00513 }
00514
00515
00516 void lmmacro::Micro2MacroMapping(ngram *in, ngram *out)
00517 {
00518
00519 int microsize = in->size;
00520
00521
00522
00523 for (int i=microsize; i>0; i--) {
00524
00525 int curr_code = *(in->wordp(i));
00526 const char *curr_macrotag = lmtable::getDict()->decode((curr_code<microMacroMapN)?microMacroMap[curr_code]:lmtable::getDict()->oovcode());
00527
00528 if (i==microsize) {
00529 out->pushw(curr_macrotag);
00530
00531 } else {
00532 int prev_code = *(in->wordp(i+1));
00533
00534 const char *prev_microtag = getDict()->decode(prev_code);
00535 const char *curr_microtag = getDict()->decode(curr_code);
00536 const char *prev_macrotag = lmtable::getDict()->decode((prev_code<microMacroMapN)?microMacroMap[prev_code]:lmtable::getDict()->oovcode());
00537
00538
00539 int prev_len = strlen(prev_microtag)-1;
00540 int curr_len = strlen(curr_microtag)-1;
00541
00542 if (strcmp(curr_macrotag,prev_macrotag) != 0 ||
00543 !(
00544 (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(')) ||
00545 (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && curr_microtag[curr_len]=='+' ) ||
00546 (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) ||
00547 (prev_microtag[prev_len]== '+' && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(' ))))
00548 out->pushw(curr_macrotag);
00549 }
00550 }
00551 return;
00552 }
00553
00554 void lmmacro::Micro2MacroMapping(ngram *in, ngram *out, char **lemmas)
00555 {
00556
00557 #ifdef DEBUG
00558 cout << "In Micro2MacroMapping, in = " << *in << "\n";
00559 #endif
00560
00561 int microsize = in->size;
00562
00563 #ifdef DEBUG
00564 cout << "In Micro2MacroMapping, lemmas:\n";
00565 if (lexicaltoken2classMap)
00566 for (int i=microsize; i>0; i--)
00567 cout << "lemmas[" << i << "]=" << lemmas[i] << " -> class -> " << lexicaltoken2classMap[lmtable::getDict()->encode(lemmas[i])] << endl;
00568 else
00569 for (int i=microsize; i>0; i--)
00570 cout << "lemmas[" << i << "]=" << lemmas[i] << endl;
00571 #endif
00572
00573
00574
00575 char tag_lemma[BUFSIZ];
00576
00577 for (int i=microsize; i>0; i--) {
00578
00579 int curr_code = *(in->wordp(i));
00580
00581 const char *curr_microtag = getDict()->decode(curr_code);
00582 const char *curr_lemma = lemmas[i];
00583 const char *curr_macrotag = lmtable::getDict()->decode((curr_code<microMacroMapN)?microMacroMap[curr_code]:lmtable::getDict()->oovcode());
00584 int curr_len = strlen(curr_microtag)-1;
00585
00586 if (i==microsize) {
00587 if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' ))
00588 sprintf(tag_lemma, "%s", curr_macrotag);
00589 else
00590 if (lexicaltoken2classMap)
00591 sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]);
00592 else
00593 sprintf(tag_lemma, "%s_%s", curr_macrotag, lemmas[microsize]);
00594
00595 #ifdef DEBUG
00596 cout << "In Micro2MacroMapping, starting tag_lemma = >" << tag_lemma << "<\n";
00597 #endif
00598
00599 out->pushw(tag_lemma);
00600 free(lemmas[microsize]);
00601
00602
00603 } else {
00604
00605 int prev_code = *(in->wordp(i+1));
00606 const char *prev_microtag = getDict()->decode(prev_code);
00607 const char *prev_macrotag = lmtable::getDict()->decode((prev_code<microMacroMapN)?microMacroMap[prev_code]:lmtable::getDict()->oovcode());
00608
00609
00610 int prev_len = strlen(prev_microtag)-1;
00611
00612 if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' ))
00613 sprintf(tag_lemma, "%s", curr_macrotag);
00614 else
00615 if (lexicaltoken2classMap)
00616 sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]);
00617 else
00618 sprintf(tag_lemma, "%s_%s", curr_macrotag, curr_lemma);
00619
00620 #ifdef DEBUG
00621 cout << "In Micro2MacroMapping, tag_lemma = >" << tag_lemma << "<\n";
00622 #endif
00623
00624 if (strcmp(curr_macrotag,prev_macrotag) != 0 ||
00625 !(
00626 (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!=')' )) && curr_microtag[curr_len]==')' && curr_microtag[0]!='(') ||
00627 (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')')) && curr_microtag[curr_len]=='+' ) ||
00628 (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) ||
00629 (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]==')' && curr_microtag[0]!='(' ))) {
00630
00631
00632 #ifdef DEBUG
00633 cout << "In Micro2MacroMapping, before pushw, out = " << *out << endl;
00634 #endif
00635 out->pushw(tag_lemma);
00636 #ifdef DEBUG
00637 cout << "In Micro2MacroMapping, after pushw, out = " << *out << endl;
00638 #endif
00639 } else {
00640 #ifdef DEBUG
00641 cout << "In Micro2MacroMapping, before shift, out = " << *out << endl;
00642 #endif
00643 out->shift();
00644 #ifdef DEBUG
00645 cout << "In Micro2MacroMapping, after shift, out = " << *out << endl;
00646 #endif
00647 out->pushw(tag_lemma);
00648 #ifdef DEBUG
00649 cout << "In Micro2MacroMapping, after push, out = " << *out << endl;
00650 #endif
00651 }
00652 free(lemmas[i]);
00653 }
00654 }
00655 return;
00656 }