00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "PropertiesConsolidator.h"
00021
00022 #include <sstream>
00023 #include <limits>
00024 #include <vector>
00025
00026 #include "moses/Util.h"
00027 #include "phrase-extract/InputFileStream.h"
00028 #include "phrase-extract/OutputFileStream.h"
00029
00030
00031 namespace MosesTraining
00032 {
00033
00034 void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
00035 {
00036 Moses::InputFileStream inFile(sourceLabelSetFile);
00037
00038
00039 m_sourceLabels.clear();
00040 std::string line;
00041 while (getline(inFile, line)) {
00042 std::istringstream tokenizer(line);
00043 std::string label;
00044 size_t index;
00045 try {
00046 tokenizer >> label >> index;
00047 } catch (const std::exception &e) {
00048 UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
00049 }
00050 std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
00051 UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
00052 }
00053
00054 inFile.Close();
00055
00056 m_sourceLabelsFlag = true;
00057 }
00058
00059
00060 void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile)
00061 {
00062 Moses::InputFileStream inFile(partsOfSpeechFile);
00063
00064
00065 m_partsOfSpeechVocabulary.clear();
00066 std::string line;
00067 while (getline(inFile, line)) {
00068 std::istringstream tokenizer(line);
00069 std::string label;
00070 size_t index;
00071 try {
00072 tokenizer >> label >> index;
00073 } catch (const std::exception &e) {
00074 UTIL_THROW2("Error reading part-of-speech vocabulary file " << partsOfSpeechFile << " .");
00075 }
00076 std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_partsOfSpeechVocabulary.insert( std::pair<std::string,size_t>(label,index) );
00077 UTIL_THROW_IF2(!inserted.second,"Part-of-speech vocabulary file " << partsOfSpeechFile << " should contain each POS tag only once.");
00078 }
00079
00080 inFile.Close();
00081
00082 m_partsOfSpeechFlag = true;
00083 }
00084
00085
00086 void PropertiesConsolidator::ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile)
00087 {
00088 Moses::InputFileStream inFile(targetSyntacticPreferencesLabelSetFile);
00089
00090
00091 m_targetSyntacticPreferencesLabels.clear();
00092 std::string line;
00093 while (getline(inFile, line)) {
00094 std::istringstream tokenizer(line);
00095 std::string label;
00096 size_t index;
00097 try {
00098 tokenizer >> label >> index;
00099 } catch (const std::exception &e) {
00100 UTIL_THROW2("Error reading target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " .");
00101 }
00102 std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_targetSyntacticPreferencesLabels.insert( std::pair<std::string,size_t>(label,index) );
00103 UTIL_THROW_IF2(!inserted.second,"Target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " should contain each syntactic label only once.");
00104 }
00105
00106 inFile.Close();
00107
00108 m_targetSyntacticPreferencesFlag = true;
00109 }
00110
00111
00112 void PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const
00113 {
00114 if ( propertiesString.empty() ) {
00115 return;
00116 }
00117
00118 std::vector<std::string> toks;
00119 Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
00120 for (size_t i = 1; i < toks.size(); ++i) {
00121 std::string &tok = toks[i];
00122 if (tok.empty()) {
00123 continue;
00124 }
00125 size_t endPos = tok.rfind("}");
00126 tok = tok.substr(0, endPos - 1);
00127 std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
00128 assert(keyValue.size() == 2);
00129
00130 if ( !keyValue[0].compare("SourceLabels") ) {
00131
00132 if ( m_sourceLabelsFlag ) {
00133
00134
00135 out << " {{" << keyValue[0];
00136 ProcessSourceLabelsPropertyValue(keyValue[1], out);
00137 out << "}}";
00138
00139 } else {
00140 out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
00141 }
00142
00143 } else if ( !keyValue[0].compare("POS") ) {
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158 } else if ( !keyValue[0].compare("TargetPreferences") ) {
00159
00160 if ( m_targetSyntacticPreferencesFlag ) {
00161
00162
00163 out << " {{" << keyValue[0];
00164 ProcessTargetSyntacticPreferencesPropertyValue(keyValue[1], out);
00165 out << "}}";
00166
00167 } else {
00168 out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
00169 }
00170
00171 } else {
00172
00173
00174 out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
00175 }
00176 }
00177 }
00178
00179
00180 void PropertiesConsolidator::ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
00181 {
00182
00183 std::istringstream tokenizer(value);
00184
00185 size_t nNTs;
00186 double totalCount;
00187
00188 if (! (tokenizer >> nNTs)) {
00189 UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
00190 << "Flawed SourceLabels property?");
00191 }
00192 assert( nNTs > 0 );
00193 out << " " << nNTs;
00194
00195 if (! (tokenizer >> totalCount)) {
00196 UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
00197 << "Flawed SourceLabels property?");
00198 }
00199 assert( totalCount > 0.0 );
00200 out << " " << totalCount;
00201
00202 while (tokenizer.peek() != EOF) {
00203 try {
00204
00205 size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
00206
00207 std::string token;
00208
00209 if (nNTs > 1) {
00210 for (size_t i=0; i<nNTs-1; ++i) {
00211 tokenizer >> token;
00212 std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
00213 UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
00214 out << " " << found->second;
00215 }
00216
00217 tokenizer >> token;
00218 out << " " << token;
00219
00220 tokenizer >> numberOfLHSsGivenRHS;
00221 out << " " << numberOfLHSsGivenRHS;
00222 }
00223
00224 for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) {
00225 tokenizer >> token;
00226 std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
00227 UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
00228 out << " " << found->second;
00229
00230 tokenizer >> token;
00231 out << " " << token;
00232 }
00233
00234 } catch (const std::exception &e) {
00235 UTIL_THROW2("Flawed item in SourceLabels property?");
00236 }
00237 }
00238 }
00239
00240
00241 void PropertiesConsolidator::ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
00242 {
00243 std::istringstream tokenizer(value);
00244 while (tokenizer.peek() != EOF) {
00245 std::string token;
00246 tokenizer >> token;
00247 std::map<std::string,size_t>::const_iterator found = m_partsOfSpeechVocabulary.find(token);
00248 UTIL_THROW_IF2(found == m_partsOfSpeechVocabulary.end() ,"Part-of-speech \"" << token << "\" from the phrase table not found in given part-of-speech vocabulary.");
00249 out << " " << found->second;
00250 }
00251 }
00252
00253
00254 bool PropertiesConsolidator::GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const
00255 {
00256 out.clear();
00257 if ( propertiesString.empty() ) {
00258 return false;
00259 }
00260
00261 std::vector<std::string> toks;
00262 Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
00263 for (size_t i = 1; i < toks.size(); ++i) {
00264 std::string &tok = toks[i];
00265 if (tok.empty()) {
00266 continue;
00267 }
00268 size_t endPos = tok.rfind("}");
00269 tok = tok.substr(0, endPos - 1);
00270 std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
00271 assert(keyValue.size() == 2);
00272
00273 if ( !keyValue[0].compare("POS") ) {
00274 std::istringstream tokenizer(keyValue[1]);
00275 while (tokenizer.peek() != EOF) {
00276 std::string token;
00277 tokenizer >> token;
00278 out.push_back(token);
00279 }
00280 return true;
00281 }
00282 }
00283
00284 return false;
00285 }
00286
00287
00288 void PropertiesConsolidator::ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
00289 {
00290
00291 std::istringstream tokenizer(value);
00292
00293 size_t nNTs;
00294 double totalCount;
00295
00296 if (! (tokenizer >> nNTs)) {
00297 UTIL_THROW2("Not able to read number of non-terminals from TargetPreferences property. "
00298 << "Flawed TargetPreferences property?");
00299 }
00300 assert( nNTs > 0 );
00301 out << " " << nNTs;
00302
00303 if (! (tokenizer >> totalCount)) {
00304 UTIL_THROW2("Not able to read overall rule count from TargetPreferences property. "
00305 << "Flawed TargetPreferences property?");
00306 }
00307 assert( totalCount > 0.0 );
00308 out << " " << totalCount;
00309
00310 while (tokenizer.peek() != EOF) {
00311 try {
00312
00313 size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
00314
00315 std::string token;
00316
00317 if (nNTs > 1) {
00318 for (size_t i=0; i<nNTs-1; ++i) {
00319 tokenizer >> token;
00320 std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
00321 UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
00322 out << " " << found->second;
00323 }
00324
00325 tokenizer >> token;
00326 out << " " << token;
00327
00328 tokenizer >> numberOfLHSsGivenRHS;
00329 out << " " << numberOfLHSsGivenRHS;
00330 }
00331
00332 for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) {
00333 tokenizer >> token;
00334 std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
00335 UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
00336 out << " " << found->second;
00337
00338 tokenizer >> token;
00339 out << " " << token;
00340 }
00341
00342 } catch (const std::exception &e) {
00343 UTIL_THROW2("Flawed item in TargetPreferences property?");
00344 }
00345 }
00346 }
00347
00348
00349 }
00350