00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <fcntl.h>
00025 #include <iostream>
00026 #include <fstream>
00027 #include <stdexcept>
00028 #include <cassert>
00029 #include "math.h"
00030 #include "mempool.h"
00031 #include "htable.h"
00032 #include "ngramcache.h"
00033 #include "dictionary.h"
00034 #include "n_gram.h"
00035 #include "lmtable.h"
00036 #include "lmclass.h"
00037 #include "util.h"
00038
00039 using namespace std;
00040
00041
00042
// Tokenizer used by lmclass::load() and lmclass::loadMap() below. The callers
// pass a mutable line buffer, an array that receives pointers to the tokens,
// and the maximum number of tokens to return; they treat the int result as
// the number of tokens found.
00043 int parseWords(char *sentence, const char **words, int max);
00044
// Report a fatal problem: the message is written to standard error followed
// by a newline, then raised as a std::runtime_error carrying the same text.
// This function never returns normally.
inline void error(const char* message)
{
  const std::runtime_error failure(message);
  std::cerr << failure.what() << "\n";
  throw failure;
}
00050
00051
00052
00053
00054
00055 lmclass::lmclass(float nlf, float dlfi):lmtable(nlf,dlfi)
00056 {
00057 MaxMapSize=1000000;
00058 MapScore= (double *)malloc(MaxMapSize*sizeof(double));
00059 memset(MapScore,0,MaxMapSize*sizeof(double));
00060 MapScoreN=0;
00061 dict = new dictionary((char *)NULL,MaxMapSize);
00062 };
00063
00064 lmclass::~lmclass()
00065 {
00066 free (MapScore);
00067 delete dict;
00068 }
00069
00070 void lmclass::load(const std::string filename,int memmap)
00071 {
00072 VERBOSE(2,"lmclass::load(const std::string filename,int memmap)" << std::endl);
00073
00074
00075 fstream inp(filename.c_str(),ios::in|ios::binary);
00076
00077 char line[MAX_LINE];
00078 const char* words[MAX_TOKEN];
00079 int tokenN;
00080 inp.getline(line,MAX_LINE,'\n');
00081 tokenN = parseWords(line,words,MAX_TOKEN);
00082
00083 if (tokenN != 2 || ((strcmp(words[0],"LMCLASS") != 0) && (strcmp(words[0],"lmclass")!=0)))
00084 error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map");
00085
00086 maxlev = atoi(words[1]);
00087 std::string lmfilename;
00088 if (inp.getline(line,MAX_LINE,'\n')) {
00089 tokenN = parseWords(line,words,MAX_TOKEN);
00090 lmfilename = words[0];
00091 } else {
00092 error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map");
00093 }
00094
00095 std::string W2Cdict = "";
00096 if (inp.getline(line,MAX_LINE,'\n')) {
00097 tokenN = parseWords(line,words,MAX_TOKEN);
00098 W2Cdict = words[0];
00099 } else {
00100 error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map");
00101 }
00102 inp.close();
00103
00104 std::cerr << "lmfilename:" << lmfilename << std::endl;
00105 if (W2Cdict != "") {
00106 std::cerr << "mapfilename:" << W2Cdict << std::endl;
00107 } else {
00108 error((char*)"ERROR: you must specify a map!");
00109 }
00110
00111
00112
00113 inputfilestream inpLM(lmfilename.c_str());
00114 if (!inpLM.good()) {
00115 std::cerr << "Failed to open " << lmfilename << "!" << std::endl;
00116 exit(1);
00117 }
00118 lmtable::load(inpLM,lmfilename.c_str(),NULL,memmap);
00119
00120 inputfilestream inW2C(W2Cdict);
00121 if (!inW2C.good()) {
00122 std::cerr << "Failed to open " << W2Cdict << "!" << std::endl;
00123 exit(1);
00124 }
00125 loadMap(inW2C);
00126 getDict()->genoovcode();
00127
00128 VERBOSE(2,"OOV code of lmclass is " << getDict()->oovcode() << " mapped into " << getMap(getDict()->oovcode())<< "\n");
00129 getDict()->incflag(1);
00130 }
00131
00132 void lmclass::loadMap(istream& inW2C)
00133 {
00134
00135 double lprob=0.0;
00136 int howmany=0;
00137
00138 const char* words[1 + LMTMAXLEV + 1 + 1];
00139
00140
00141 char line[MAX_LINE];
00142
00143 dict->incflag(1);
00144
00145 cerr<<"loadW2Cdict()...\n";
00146
00147
00148 loadMapElement(dict->BoS(),lmtable::dict->BoS(),0.0);
00149 loadMapElement(dict->EoS(),lmtable::dict->EoS(),0.0);
00150
00151
00152 loadMapElement(dict->OOV(),lmtable::dict->OOV(),0.0);
00153
00154 while (inW2C.getline(line,MAX_LINE)) {
00155 if (strlen(line)==MAX_LINE-1) {
00156 cerr << "lmtable::loadW2Cdict: input line exceed MAXLINE ("
00157 << MAX_LINE << ") chars " << line << "\n";
00158 exit(1);
00159 }
00160
00161 howmany = parseWords(line, words, 4);
00162
00163 if(howmany == 3) {
00164 assert(sscanf(words[2], "%lf", &lprob));
00165 lprob=(double)log10(lprob);
00166 } else if(howmany==2) {
00167
00168 VERBOSE(3,"No score for the pair (" << words[0] << "," << words[1] << "); set to default 1.0\n");
00169
00170 lprob=0.0;
00171 } else {
00172 cerr << "parseline: not enough entries" << line << "\n";
00173 exit(1);
00174 }
00175 loadMapElement(words[0],words[1],lprob);
00176
00177
00178 checkMap();
00179 }
00180
00181 VERBOSE(2,"There are " << MapScoreN << " entries in the map\n");
00182
00183 dict->incflag(0);
00184 }
00185
00186 void lmclass::checkMap()
00187 {
00188 if (MapScoreN > MaxMapSize) {
00189 MaxMapSize=2*MapScoreN;
00190 MapScore = (double*) realloc(MapScore, sizeof(double)*(MaxMapSize));
00191 VERBOSE(2,"In lmclass::checkMap(...) MaxMapSize=" << MaxMapSize << " MapScoreN=" << MapScoreN << "\n");
00192 }
00193 }
00194
00195 void lmclass::loadMapElement(const char* in, const char* out, double sc)
00196 {
00197
00198
00199 int wcode=dict->encode(in);
00200 dict->freq(wcode,lmtable::dict->encode(out));
00201 MapScore[wcode]=sc;
00202 VERBOSE(3,"In lmclass::loadMapElement(...) in=" << in << " wcode=" << wcode << " out=" << out << " ccode=" << lmtable::dict->encode(out) << " MapScoreN=" << MapScoreN << "\n");
00203
00204 if (wcode >= MapScoreN) MapScoreN++;
00205 }
00206
00207 double lmclass::lprob(ngram ong,double* bow, int* bol, char** maxsuffptr,unsigned int* statesize,bool* extendible)
00208 {
00209 double lpr=getMapScore(*ong.wordp(1));
00210
00211 VERBOSE(3,"In lmclass::lprob(...) Mapscore = " << lpr << "\n");
00212
00213
00214 ngram mapped_ng(lmtable::getDict());
00215
00216 mapping(ong,mapped_ng);
00217
00218 lpr+=lmtable::clprob(mapped_ng,bow,bol,maxsuffptr,statesize, extendible);
00219
00220 VERBOSE(3,"In lmclass::lprob(...) global prob = " << lpr << "\n");
00221 return lpr;
00222 }
00223
00224 void lmclass::mapping(ngram &in, ngram &out)
00225 {
00226 int insize = in.size;
00227 VERBOSE(3,"In lmclass::mapping(ngram &in, ngram &out) in = " << in << "\n");
00228
00229
00230 for (int i=insize; i>0; i--) {
00231 out.pushc(getMap(*in.wordp(i)));
00232 }
00233
00234 VERBOSE(3,"In lmclass::mapping(ngram &in, ngram &out) out = " << out << "\n");
00235 return;
00236 }