00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <cstdlib>
00021 #include <vector>
00022 #include <string>
00023
00024 #include "util/exception.hh"
00025 #include "moses/Util.h"
00026 #include "InputFileStream.h"
00027 #include "OutputFileStream.h"
00028 #include "PropertiesConsolidator.h"
00029
00030
// ---------------------------------------------------------------------------
// Feature switches. All default to off and are turned on by main() while it
// parses the command line.
// ---------------------------------------------------------------------------
bool countsProperty = false;                 // --Counts
bool goodTuringFlag = false;                 // --GoodTuring
bool hierarchicalFlag = false;               // --Hierarchical
bool kneserNeyFlag = false;                  // --KneserNey
bool logProbFlag = false;                    // --LogProb
bool lowCountFlag = false;                   // --LowCountFeature
bool onlyDirectFlag = false;                 // --OnlyDirect
bool partsOfSpeechFlag = false;              // --PartsOfSpeech
bool phraseCountFlag = false;                // --PhraseCount
bool sourceLabelsFlag = false;               // --SourceLabels
bool targetSyntacticPreferencesFlag = false; // --TargetSyntacticPreferences
bool sparseCountBinFeatureFlag = false;      // --SparseCountBinFeature

// Upper bounds of the count bins, in the order given on the command line.
std::vector< int > countBin;

// Pruning thresholds set by --MinScore for field 0 and field 2; a value of 0
// leaves the corresponding filter switched off.
float minScore0 = 0.0f;
float minScore2 = 0.0f;

// Filled by loadCountOfCounts(). countOfCounts[0] is a 0.0 placeholder so that
// the entry read from the n-th data line sits at index n.
std::vector< float > countOfCounts;
std::vector< float > goodTuringDiscount;

// Kneser-Ney discounts, computed by loadCountOfCounts().
float kneserNey_D1 = 0.0f;
float kneserNey_D2 = 0.0f;
float kneserNey_D3 = 0.0f;

// First value of the count-of-counts file; stays negative until it was read.
float totalCount = -1.0f;
00051
00052
00053 void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& );
00054 void loadCountOfCounts( const std::string& );
00055 void breakdownCoreAndSparse( const std::string &combined, std::string &core, std::string &sparse );
00056 bool getLine( Moses::InputFileStream &file, std::vector< std::string > &item );
00057
00058
00059 inline float maybeLogProb( float a )
00060 {
00061 return logProbFlag ? std::log(a) : a;
00062 }
00063
00064
/**
 * Tells whether word is written as a non-terminal: at least three characters
 * long, opening with '[' and closing with ']'.
 */
inline bool isNonTerminal( const std::string &word )
{
  const std::string::size_type len = word.length();
  if (len < 3) {
    return false;
  }
  const bool opensWithBracket = (word[0] == '[');
  const bool closesWithBracket = (word[len - 1] == ']');
  return opensWithBracket && closesWithBracket;
}
00069
00070
00071 int main(int argc, char* argv[])
00072 {
00073 std::cerr << "Consolidate v2.0 written by Philipp Koehn" << std::endl
00074 << "consolidating direct and indirect rule tables" << std::endl;
00075
00076 if (argc < 4) {
00077 std::cerr <<
00078 "syntax: "
00079 "consolidate phrase-table.direct "
00080 "phrase-table.indirect "
00081 "phrase-table.consolidated "
00082 "[--Hierarchical] [--OnlyDirect] [--PhraseCount] "
00083 "[--GoodTuring counts-of-counts-file] "
00084 "[--KneserNey counts-of-counts-file] [--LowCountFeature] "
00085 "[--SourceLabels source-labels-file] "
00086 "[--PartsOfSpeech parts-of-speech-file] "
00087 "[--MinScore id:threshold[,id:threshold]*]"
00088 << std::endl;
00089 exit(1);
00090 }
00091 const std::string fileNameDirect = argv[1];
00092 const std::string fileNameIndirect = argv[2];
00093 const std::string fileNameConsolidated = argv[3];
00094 std::string fileNameCountOfCounts;
00095 std::string fileNameSourceLabelSet;
00096 std::string fileNamePartsOfSpeechVocabulary;
00097 std::string fileNameTargetSyntacticPreferencesLabelSet;
00098
00099 for(int i=4; i<argc; i++) {
00100 if (strcmp(argv[i],"--Hierarchical") == 0) {
00101 hierarchicalFlag = true;
00102 std::cerr << "processing hierarchical rules" << std::endl;
00103 } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
00104 onlyDirectFlag = true;
00105 std::cerr << "only including direct translation scores p(e|f)" << std::endl;
00106 } else if (strcmp(argv[i],"--PhraseCount") == 0) {
00107 phraseCountFlag = true;
00108 std::cerr << "including the phrase count feature" << std::endl;
00109 } else if (strcmp(argv[i],"--GoodTuring") == 0) {
00110 goodTuringFlag = true;
00111 UTIL_THROW_IF2(i+1==argc, "specify count of count files for Good Turing discounting!");
00112 fileNameCountOfCounts = argv[++i];
00113 std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl;
00114 } else if (strcmp(argv[i],"--KneserNey") == 0) {
00115 kneserNeyFlag = true;
00116 UTIL_THROW_IF2(i+1==argc, "specify count of count files for Kneser Ney discounting!");
00117 fileNameCountOfCounts = argv[++i];
00118 std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl;
00119 } else if (strcmp(argv[i],"--LowCountFeature") == 0) {
00120 lowCountFlag = true;
00121 std::cerr << "including the low count feature" << std::endl;
00122 } else if (strcmp(argv[i],"--CountBinFeature") == 0 ||
00123 strcmp(argv[i],"--SparseCountBinFeature") == 0) {
00124 if (strcmp(argv[i],"--SparseCountBinFeature") == 0)
00125 sparseCountBinFeatureFlag = true;
00126 std::cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
00127 int prev = 0;
00128 while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
00129 int binCount = std::atoi( argv[++i] );
00130 countBin.push_back( binCount );
00131 if (prev+1 == binCount) {
00132 std::cerr << " " << binCount;
00133 } else {
00134 std::cerr << " " << (prev+1) << "-" << binCount;
00135 }
00136 prev = binCount;
00137 }
00138 std::cerr << " " << (prev+1) << "+" << std::endl;
00139 } else if (strcmp(argv[i],"--LogProb") == 0) {
00140 logProbFlag = true;
00141 std::cerr << "using log-probabilities" << std::endl;
00142 } else if (strcmp(argv[i],"--Counts") == 0) {
00143 countsProperty = true;
00144 std::cerr << "output counts as a property" << std::endl;;
00145 } else if (strcmp(argv[i],"--SourceLabels") == 0) {
00146 sourceLabelsFlag = true;
00147 UTIL_THROW_IF2(i+1==argc, "specify source label set file!");
00148 fileNameSourceLabelSet = argv[++i];
00149 std::cerr << "processing source labels property" << std::endl;
00150 } else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
00151 partsOfSpeechFlag = true;
00152 UTIL_THROW_IF2(i+1==argc, "specify parts-of-speech file!");
00153 fileNamePartsOfSpeechVocabulary = argv[++i];
00154 std::cerr << "processing parts-of-speech property" << std::endl;
00155 } else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
00156 targetSyntacticPreferencesFlag = true;
00157 UTIL_THROW_IF2(i+1==argc, "specify target syntactic preferences label set file!");
00158 fileNameTargetSyntacticPreferencesLabelSet = argv[++i];
00159 std::cerr << "processing target syntactic preferences property" << std::endl;
00160 } else if (strcmp(argv[i],"--MinScore") == 0) {
00161 std::string setting = argv[++i];
00162 bool done = false;
00163 while (!done) {
00164 std::string single_setting;
00165 size_t pos;
00166 if ((pos = setting.find(",")) != std::string::npos) {
00167 single_setting = setting.substr(0, pos);
00168 setting.erase(0, pos + 1);
00169 } else {
00170 single_setting = setting;
00171 done = true;
00172 }
00173 pos = single_setting.find(":");
00174 UTIL_THROW_IF2(pos == std::string::npos, "faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'");
00175 unsigned int field = atoll( single_setting.substr(0,pos).c_str() );
00176 float threshold = std::atof( single_setting.substr(pos+1).c_str() );
00177 if (field == 0) {
00178 minScore0 = threshold;
00179 std::cerr << "setting minScore0 to " << threshold << std::endl;
00180 } else if (field == 2) {
00181 minScore2 = threshold;
00182 std::cerr << "setting minScore2 to " << threshold << std::endl;
00183 } else {
00184 UTIL_THROW2("MinScore currently only supported for indirect (0) and direct (2) phrase translation probabilities");
00185 }
00186 }
00187 } else {
00188 UTIL_THROW2("unknown option " << argv[i]);
00189 }
00190 }
00191
00192 processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary, fileNameTargetSyntacticPreferencesLabelSet );
00193 }
00194
00195
00196 void loadCountOfCounts( const std::string& fileNameCountOfCounts )
00197 {
00198 Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts);
00199 UTIL_THROW_IF2(fileCountOfCounts.fail(), "could not open count of counts file " << fileNameCountOfCounts);
00200
00201 countOfCounts.push_back(0.0);
00202
00203 std::string line;
00204 while (getline(fileCountOfCounts, line)) {
00205 if (totalCount < 0)
00206 totalCount = std::atof( line.c_str() );
00207 else
00208 countOfCounts.push_back( std::atof( line.c_str() ) );
00209 }
00210 fileCountOfCounts.Close();
00211
00212
00213 if (goodTuringFlag) {
00214 goodTuringDiscount.push_back(0.01);
00215 for( size_t i=1; i<countOfCounts.size()-1; i++ ) {
00216 goodTuringDiscount.push_back(((float)i+1)/(float)i*((countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)));
00217 if (goodTuringDiscount[i]>1)
00218 goodTuringDiscount[i] = 1;
00219 if (goodTuringDiscount[i]<goodTuringDiscount[i-1])
00220 goodTuringDiscount[i] = goodTuringDiscount[i-1];
00221 }
00222 }
00223
00224
00225 float Y = countOfCounts[1] / (countOfCounts[1] + 2*countOfCounts[2]);
00226 kneserNey_D1 = 1 - 2*Y * countOfCounts[2] / countOfCounts[1];
00227 kneserNey_D2 = 2 - 3*Y * countOfCounts[3] / countOfCounts[2];
00228 kneserNey_D3 = 3 - 4*Y * countOfCounts[4] / countOfCounts[3];
00229
00230 if (kneserNey_D1 > 0.9) kneserNey_D1 = 0.9;
00231 if (kneserNey_D2 > 1.9) kneserNey_D2 = 1.9;
00232 if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
00233 }
00234
00235
00236 void processFiles( const std::string& fileNameDirect,
00237 const std::string& fileNameIndirect,
00238 const std::string& fileNameConsolidated,
00239 const std::string& fileNameCountOfCounts,
00240 const std::string& fileNameSourceLabelSet,
00241 const std::string& fileNamePartsOfSpeechVocabulary,
00242 const std::string& fileNameTargetSyntacticPreferencesLabelSet )
00243 {
00244 if (goodTuringFlag || kneserNeyFlag)
00245 loadCountOfCounts( fileNameCountOfCounts );
00246
00247
00248 Moses::InputFileStream fileDirect(fileNameDirect);
00249 UTIL_THROW_IF2(fileDirect.fail(), "could not open phrase table file " << fileNameDirect);
00250 Moses::InputFileStream fileIndirect(fileNameIndirect);
00251 UTIL_THROW_IF2(fileIndirect.fail(), "could not open phrase table file " << fileNameIndirect);
00252
00253
00254 Moses::OutputFileStream fileConsolidated;
00255 bool success = fileConsolidated.Open(fileNameConsolidated);
00256 UTIL_THROW_IF2(!success, "could not open output file " << fileNameConsolidated);
00257
00258
00259
00260 MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
00261 if (sourceLabelsFlag) {
00262 propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
00263 }
00264 if (partsOfSpeechFlag) {
00265 propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary);
00266 }
00267 if (targetSyntacticPreferencesFlag) {
00268 propertiesConsolidator.ActivateTargetSyntacticPreferencesProcessing(fileNameTargetSyntacticPreferencesLabelSet);
00269 }
00270
00271
00272 int i=0;
00273 while(true) {
00274
00275 i++;
00276 if (i%100000 == 0) std::cerr << "." << std::flush;
00277
00278 std::vector< std::string > itemDirect, itemIndirect;
00279 if (! getLine(fileIndirect, itemIndirect) ||
00280 ! getLine(fileDirect, itemDirect))
00281 break;
00282
00283
00284
00285
00286
00287 UTIL_THROW_IF2(itemDirect[0].compare( itemIndirect[0] ) != 0,
00288 "target phrase does not match in line " << i << ": '" << itemDirect[0] << "' != '" << itemIndirect[0] << "'");
00289 UTIL_THROW_IF2(itemDirect[1].compare( itemIndirect[1] ) != 0,
00290 "source phrase does not match in line " << i << ": '" << itemDirect[1] << "' != '" << itemIndirect[1] << "'");
00291
00292
00293 std::string directScores, directSparseScores, indirectScores, indirectSparseScores;
00294 breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
00295 breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );
00296
00297 std::vector<std::string> directCounts;
00298 Moses::Tokenize( directCounts, itemDirect[4] );
00299 std::vector<std::string> indirectCounts;
00300 Moses::Tokenize( indirectCounts, itemIndirect[4] );
00301 float countF = std::atof( directCounts[0].c_str() );
00302 float countE = std::atof( indirectCounts[0].c_str() );
00303 float countEF = std::atof( indirectCounts[1].c_str() );
00304 float n1_F, n1_E;
00305 if (kneserNeyFlag) {
00306 n1_F = std::atof( directCounts[2].c_str() );
00307 n1_E = std::atof( indirectCounts[2].c_str() );
00308 }
00309
00310
00311 float adjustedCountEF = countEF;
00312 if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
00313 adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
00314 float adjustedCountEF_indirect = adjustedCountEF;
00315
00316
00317 if (kneserNeyFlag) {
00318 float D = kneserNey_D3;
00319 if (countEF < 2) D = kneserNey_D1;
00320 else if (countEF < 3) D = kneserNey_D2;
00321 if (D > countEF) D = countEF - 0.01;
00322
00323 float p_b_E = n1_E / totalCount;
00324 float alpha_F = D * n1_F / countF;
00325 adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;
00326
00327
00328 float p_b_F = n1_F / totalCount;
00329 float alpha_E = D * n1_E / countE;
00330 adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
00331 }
00332
00333
00334 if ((minScore0 > 0 && adjustedCountEF_indirect/countE < minScore0) ||
00335 (minScore2 > 0 && adjustedCountEF /countF < minScore2)) {
00336 continue;
00337 }
00338
00339
00340 fileConsolidated << itemDirect[0] << " ||| ";
00341
00342 if (partsOfSpeechFlag) {
00343
00344 std::vector<std::string> targetTokens;
00345 Moses::Tokenize( targetTokens, itemDirect[1] );
00346 std::vector<std::string> propertyValuePOS;
00347 propertiesConsolidator.GetPOSPropertyValueFromPropertiesString(itemDirect[5], propertyValuePOS);
00348 size_t targetTerminalIndex = 0;
00349 for (std::vector<std::string>::const_iterator targetTokensIt=targetTokens.begin();
00350 targetTokensIt!=targetTokens.end(); ++targetTokensIt) {
00351 fileConsolidated << *targetTokensIt;
00352 if (!isNonTerminal(*targetTokensIt)) {
00353 assert(propertyValuePOS.size() > targetTerminalIndex);
00354 fileConsolidated << "|" << propertyValuePOS[targetTerminalIndex];
00355 ++targetTerminalIndex;
00356 }
00357 fileConsolidated << " ";
00358 }
00359 fileConsolidated << "|||";
00360
00361 } else {
00362
00363 fileConsolidated << itemDirect[1] << " |||";
00364 }
00365
00366
00367
00368 if (!onlyDirectFlag) {
00369 fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
00370 fileConsolidated << " " << indirectScores;
00371 }
00372
00373
00374 fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
00375 fileConsolidated << " " << directScores;
00376
00377
00378 if (phraseCountFlag) {
00379 fileConsolidated << " " << maybeLogProb(2.718);
00380 }
00381
00382
00383 if (lowCountFlag) {
00384 fileConsolidated << " " << maybeLogProb(std::exp(-1.0/countEF));
00385 }
00386
00387
00388 if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
00389 bool foundBin = false;
00390 for(size_t i=0; i < countBin.size(); i++) {
00391 if (!foundBin && countEF <= countBin[i]) {
00392 fileConsolidated << " " << maybeLogProb(2.718);
00393 foundBin = true;
00394 } else {
00395 fileConsolidated << " " << maybeLogProb(1);
00396 }
00397 }
00398 fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
00399 }
00400
00401
00402 fileConsolidated << " |||";
00403 if (!itemDirect[2].empty()) {
00404 fileConsolidated << " " << itemDirect[2];;
00405 }
00406
00407
00408 fileConsolidated << " ||| " << countE << " " << countF << " " << countEF;
00409
00410
00411 fileConsolidated << " |||";
00412 if (directSparseScores.compare("") != 0)
00413 fileConsolidated << " " << directSparseScores;
00414 if (indirectSparseScores.compare("") != 0)
00415 fileConsolidated << " " << indirectSparseScores;
00416
00417
00418 if (sparseCountBinFeatureFlag) {
00419 bool foundBin = false;
00420 for(size_t i=0; i < countBin.size(); i++) {
00421 if (!foundBin && countEF <= countBin[i]) {
00422 fileConsolidated << " cb_";
00423 if (i == 0 && countBin[i] > 1)
00424 fileConsolidated << "1_";
00425 else if (i > 0 && countBin[i-1]+1 < countBin[i])
00426 fileConsolidated << (countBin[i-1]+1) << "_";
00427 fileConsolidated << countBin[i] << " 1";
00428 foundBin = true;
00429 }
00430 }
00431 if (!foundBin) {
00432 fileConsolidated << " cb_max 1";
00433 }
00434 }
00435
00436
00437 fileConsolidated << " |||";
00438 if (itemDirect.size() >= 6) {
00439 propertiesConsolidator.ProcessPropertiesString(itemDirect[5], fileConsolidated);
00440 }
00441
00442 if (countsProperty) {
00443 fileConsolidated << " {{Counts " << countE << " " << countF << " " << countEF << "}}";
00444 }
00445
00446 fileConsolidated << std::endl;
00447 }
00448
00449 fileDirect.Close();
00450 fileIndirect.Close();
00451 fileConsolidated.Close();
00452
00453
00454 std::cerr << std::endl;
00455 }
00456
00457
00458 void breakdownCoreAndSparse( const std::string &combined, std::string &core, std::string &sparse )
00459 {
00460 core = "";
00461 sparse = "";
00462 std::vector<std::string> score;
00463 Moses::Tokenize( score, combined );
00464 for(size_t i=0; i<score.size(); i++) {
00465 if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size())
00466 core += " " + score[i];
00467 else {
00468 sparse += " " + score[i];
00469 sparse += " " + score[++i];
00470 }
00471 }
00472 if (core.size() > 0 ) core = core.substr(1);
00473 if (sparse.size() > 0 ) sparse = sparse.substr(1);
00474 }
00475
00476
00477 bool getLine( Moses::InputFileStream &file, std::vector< std::string > &item )
00478 {
00479 if (file.eof())
00480 return false;
00481
00482 std::string line;
00483 if (!getline(file, line))
00484 return false;
00485
00486 Moses::TokenizeMultiCharSeparator(item, line, " ||| ");
00487
00488 return true;
00489 }
00490