00001 #include <cassert>
00002 #include "PermutationScorer.h"
00003
00004 using namespace std;
00005
00006 namespace MosesTuning
00007 {
00008
00009
// NOTE(review): SCORE_PRECISION is not referenced in this file; presumably it is
// the number of decimal digits represented by SCORE_MULTFACT (10^5) - confirm.
00010 const int PermutationScorer::SCORE_PRECISION = 5;
// Scale factor: prepareStats() multiplies the distance value by it before
// writing the statistic, and calculateScore() divides it back out.
00011 const int PermutationScorer::SCORE_MULTFACT = 100000;
00012
00013 PermutationScorer::PermutationScorer(const string &distanceMetric, const string &config)
00014 :StatisticsBasedScorer(distanceMetric,config)
00015 {
00016
00017
00018 static string KEY_REFCHOICE = "refchoice";
00019 static string REFCHOICE_AVERAGE = "average";
00020 static string REFCHOICE_CLOSEST = "closest";
00021
00022 string refchoice = getConfig(KEY_REFCHOICE,REFCHOICE_CLOSEST);
00023 if (refchoice == REFCHOICE_AVERAGE) {
00024 m_refChoiceStrategy = REFERENCE_CHOICE_AVERAGE;
00025 } else if (refchoice == REFCHOICE_CLOSEST) {
00026 m_refChoiceStrategy = REFERENCE_CHOICE_CLOSEST;
00027 } else {
00028 throw runtime_error("Unknown reference choice strategy: " + refchoice);
00029 }
00030 cerr << "Using reference choice strategy: " << refchoice << endl;
00031
00032 if (distanceMetric.compare("HAMMING") == 0) {
00033 m_distanceMetric = HAMMING_DISTANCE;
00034 } else if (distanceMetric.compare("KENDALL") == 0) {
00035 m_distanceMetric = KENDALL_DISTANCE;
00036 }
00037 cerr << "Using permutation distance metric: " << distanceMetric << endl;
00038
00039
00040 static string KEY_ALIGNMENT_FILES = "refalign";
00041 string refalign = getConfig(KEY_ALIGNMENT_FILES,"");
00042
00043 if (refalign.length() > 0) {
00044 string substring;
00045 while (!refalign.empty()) {
00046 getNextPound(refalign, substring, "+");
00047 m_referenceAlignments.push_back(substring);
00048 }
00049 }
00050
00051
00052
00053 static string KEY_SOURCE_FILE = "source";
00054 string sourceFile = getConfig(KEY_SOURCE_FILE,"");
00055 if (sourceFile.length() > 0) {
00056 cerr << "Loading source sentence lengths from " << sourceFile << endl;
00057 ifstream sourcein(sourceFile.c_str());
00058 if (!sourcein) {
00059 throw runtime_error("Unable to open: " + sourceFile);
00060 }
00061 string line;
00062 while (getline(sourcein,line)) {
00063 size_t wordNumber = 0;
00064 string word;
00065 while(!line.empty()) {
00066 getNextPound(line, word, " ");
00067 wordNumber++;
00068 }
00069 m_sourceLengths.push_back(wordNumber);
00070 }
00071 sourcein.close();
00072 }
00073 }
00074
00075 void PermutationScorer::setReferenceFiles(const vector<string>& referenceFiles)
00076 {
00077 cout << "*******setReferenceFiles" << endl;
00078
00079 m_referencePerms.clear();
00080
00081 vector< vector< int> > targetLengths;
00082
00083 for (size_t i = 0; i < referenceFiles.size(); ++i) {
00084 vector <int> lengths;
00085 cout << "Loading reference from " << referenceFiles[i] << endl;
00086 ifstream refin(referenceFiles[i].c_str());
00087 if (!refin) {
00088 cerr << "Unable to open: " << referenceFiles[i] << endl;
00089 throw runtime_error("Unable to open alignment file");
00090 }
00091 string line;
00092 while (getline(refin,line)) {
00093 int count = getNumberWords(line);
00094 lengths.push_back(count);
00095 }
00096 targetLengths.push_back(lengths);
00097 }
00098
00099
00100
00101 for (size_t i = 0; i < m_referenceAlignments.size(); ++i) {
00102 vector<Permutation> referencePerms;
00103 cout << "Loading reference from " << m_referenceAlignments[i] << endl;
00104 ifstream refin(m_referenceAlignments[i].c_str());
00105 if (!refin) {
00106 cerr << "Unable to open: " << m_referenceAlignments[i] << endl;
00107 throw runtime_error("Unable to open alignment file");
00108 }
00109 string line;
00110 size_t sid = 0;
00111 while (getline(refin,line)) {
00112
00113
00114
00115 Permutation perm(line, m_sourceLengths[sid],targetLengths[i][sid]);
00116
00117 referencePerms.push_back(perm);
00118
00119 if (perm.getLength() != m_sourceLengths[sid]) {
00120 cerr << "Permutation Length: " << perm.getLength() << endl;
00121 cerr << "Source length: " << m_sourceLengths[sid] << " for sid " << sid << endl;
00122 throw runtime_error("Source sentence lengths not the same: ");
00123 }
00124
00125 sid++;
00126 }
00127 m_referencePerms.push_back(referencePerms);
00128 }
00129 }
00130
00131 int PermutationScorer::getNumberWords (const string& text) const
00132 {
00133 int count = 0;
00134 string line = trimStr(text);
00135 if (line.length()>0) {
00136 int pos = line.find(" ");
00137 while (pos!=int(string::npos)) {
00138 count++;
00139 pos = line.find(" ",pos+1);
00140 }
00141 count++;
00142 }
00143 return count;
00144 }
00145
00146
00147 void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
00148 {
00149
00150 bool debug=false;
00151 if (debug) {
00152 cout << "*******prepareStats" ;
00153 cout << text << endl;
00154 cout << sid << endl;
00155 cout << "Reference0align:" << endl;
00156 m_referencePerms[0][sid].dump();
00157 }
00158
00159 string sentence = "";
00160 string align = text;
00161 size_t alignmentData = text.find("|||");
00162
00163 if(alignmentData != string::npos) {
00164 getNextPound(align,sentence, "|||");
00165 } else {
00166 align = text;
00167 }
00168 int translationLength = getNumberWords(sentence);
00169
00170
00171
00172 vector< vector<Permutation> > nBestPerms;
00173 float distanceValue;
00174
00175
00176
00177 bool isWordAlignment=true;
00178 string alignCopy = align;
00179 string align1;
00180 getNextPound(alignCopy,align1," ");
00181 if (align1.length() > 0) {
00182 size_t phraseDelimeter = align1.find("=");
00183 if(phraseDelimeter!= string::npos)
00184 isWordAlignment=false;
00185 }
00186 string standardFormat = align;
00187 if(!isWordAlignment)
00188 standardFormat= Permutation::convertMosesToStandard(align);
00189
00190 if (debug) {
00191 cerr << "Nbest alignment: " << align << endl;
00192 cerr << "-->std alignment: " << standardFormat << endl;
00193 }
00194
00195 Permutation perm(standardFormat, m_sourceLengths[sid],translationLength);
00196
00197
00198 if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) {
00199 float total = 0;
00200 for (size_t i = 0; i < m_referencePerms.size(); ++i) {
00201 float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
00202 total += dist;
00203
00204 }
00205 float mean = (float)total/m_referencePerms.size();
00206
00207 distanceValue = mean;
00208 } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) {
00209 float max_val = 0;
00210
00211 for (size_t i = 0; i < m_referencePerms.size(); ++i) {
00212
00213 float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
00214
00215 if (value > max_val) {
00216 max_val = value;
00217 }
00218 }
00219 distanceValue = max_val;
00220
00221 } else {
00222 throw runtime_error("Unsupported reflength strategy");
00223 }
00224
00225
00226 distanceValue*=SCORE_MULTFACT;
00227 ostringstream tempStream;
00228 tempStream.precision(0);
00229 tempStream << std::fixed << distanceValue << " 1";
00230 string str = tempStream.str();
00231 entry.set(str);
00232
00233
00234 }
00235
00236
00237 statscore_t PermutationScorer::calculateScore(const vector<ScoreStatsType>& comps) const
00238 {
00239
00240
00241 return (((statscore_t) comps[0]) / comps[1]) / SCORE_MULTFACT;
00242 }
00243
00244 }
00245