00001 #include <vector>
00002 #include <limits>
00003 #include <cassert>
00004 #include "SoftSourceSyntacticConstraintsFeature.h"
00005 #include "moses/StaticData.h"
00006 #include "moses/InputFileStream.h"
00007 #include "moses/ScoreComponentCollection.h"
00008 #include "moses/Hypothesis.h"
00009 #include "moses/ChartHypothesis.h"
00010 #include "moses/ChartManager.h"
00011 #include "moses/FactorCollection.h"
00012 #include "moses/TreeInput.h"
00013 #include "moses/PP/SourceLabelsPhraseProperty.h"
00014
00015
00016 using namespace std;
00017
00018 namespace Moses
00019 {
00020
00021
00022 SoftSourceSyntacticConstraintsFeature::SoftSourceSyntacticConstraintsFeature(const std::string &line)
00023 : StatelessFeatureFunction(6, line)
00024 , m_useCoreSourceLabels(false)
00025 , m_useLogprobs(true)
00026 , m_useSparse(false)
00027 , m_useSparseLabelPairs(false)
00028 , m_noMismatches(false)
00029 , m_floor(1e-7)
00030 {
00031 VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
00032 ReadParameters();
00033 VERBOSE(1, " Done.");
00034 VERBOSE(1, " Config:");
00035 VERBOSE(1, " Log probabilities");
00036 if ( m_useLogprobs ) {
00037 VERBOSE(1, " active.");
00038 } else {
00039 VERBOSE(1, " inactive.");
00040 }
00041 VERBOSE(1, " Sparse scores");
00042 if ( m_useSparse ) {
00043 VERBOSE(1, " active.");
00044 } else {
00045 VERBOSE(1, " inactive.");
00046 }
00047 VERBOSE(1, " Sparse label pair scores");
00048 if ( m_useSparseLabelPairs ) {
00049 VERBOSE(1, " active.");
00050 } else {
00051 VERBOSE(1, " inactive.");
00052 }
00053 VERBOSE(1, " Core labels");
00054 if ( m_useCoreSourceLabels ) {
00055 VERBOSE(1, " active.");
00056 } else {
00057 VERBOSE(1, " inactive.");
00058 }
00059 VERBOSE(1, " No mismatches");
00060 if ( m_noMismatches ) {
00061 VERBOSE(1, " active.");
00062 } else {
00063 VERBOSE(1, " inactive.");
00064 }
00065 VERBOSE(1, std::endl);
00066 }
00067
00068
00069 void SoftSourceSyntacticConstraintsFeature::SetParameter(const std::string& key, const std::string& value)
00070 {
00071 if (key == "sourceLabelSetFile") {
00072 m_sourceLabelSetFile = value;
00073 } else if (key == "coreSourceLabelSetFile") {
00074 m_coreSourceLabelSetFile = value;
00075 m_useCoreSourceLabels = true;
00076 } else if (key == "targetSourceLeftHandSideJointCountFile") {
00077 m_targetSourceLHSJointCountFile = value;
00078 } else if (key == "noMismatches") {
00079 m_noMismatches = Scan<bool>(value);
00080 } else if (key == "logProbabilities") {
00081 m_useLogprobs = Scan<bool>(value);
00082 } else if (key == "sparse") {
00083 m_useSparse = Scan<bool>(value);
00084 } else if (key == "sparseLabelPairs") {
00085 m_useSparseLabelPairs = Scan<bool>(value);
00086 } else {
00087 StatelessFeatureFunction::SetParameter(key, value);
00088 }
00089 }
00090
00091 void SoftSourceSyntacticConstraintsFeature::Load(AllOptions::ptr const& opts)
00092 {
00093 m_options = opts;
00094
00095 LoadSourceLabelSet();
00096 if (!m_coreSourceLabelSetFile.empty()) {
00097 LoadCoreSourceLabelSet();
00098 }
00099 if (!m_targetSourceLHSJointCountFile.empty()) {
00100 LoadTargetSourceLeftHandSideJointCountFile();
00101 }
00102 }
00103
00104 void SoftSourceSyntacticConstraintsFeature::LoadSourceLabelSet()
00105 {
00106 FEATUREVERBOSE(2, "Loading source label set from file " << m_sourceLabelSetFile << " ...");
00107 InputFileStream inFile(m_sourceLabelSetFile);
00108
00109 FactorCollection &factorCollection = FactorCollection::Instance();
00110
00111
00112 std::string line;
00113 m_sourceLabels.clear();
00114 m_sourceLabelsByIndex.clear();
00115 m_sourceLabelsByIndex_RHS_1.clear();
00116 m_sourceLabelsByIndex_RHS_0.clear();
00117 m_sourceLabelsByIndex_LHS_1.clear();
00118 m_sourceLabelsByIndex_LHS_0.clear();
00119 m_sourceLabelIndexesByFactor.clear();
00120 while (getline(inFile, line)) {
00121 std::istringstream tokenizer(line);
00122 std::string label;
00123 size_t index;
00124 try {
00125 tokenizer >> label >> index;
00126 } catch (const std::exception &e) {
00127 UTIL_THROW2(GetScoreProducerDescription()
00128 << ": Error reading source label set file " << m_sourceLabelSetFile << " .");
00129 }
00130 std::pair< boost::unordered_map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
00131 UTIL_THROW_IF2(!inserted.second, GetScoreProducerDescription()
00132 << ": Source label set file " << m_sourceLabelSetFile << " should contain each syntactic label only once.");
00133
00134 if (index >= m_sourceLabelsByIndex.size()) {
00135 m_sourceLabelsByIndex.resize(index+1);
00136 m_sourceLabelsByIndex_RHS_1.resize(index+1);
00137 m_sourceLabelsByIndex_RHS_0.resize(index+1);
00138 m_sourceLabelsByIndex_LHS_1.resize(index+1);
00139 m_sourceLabelsByIndex_LHS_0.resize(index+1);
00140 }
00141 m_sourceLabelsByIndex[index] = label;
00142 m_sourceLabelsByIndex_RHS_1[index] = "RHS_1_" + label;
00143 m_sourceLabelsByIndex_RHS_0[index] = "RHS_0_" + label;
00144 m_sourceLabelsByIndex_LHS_1[index] = "LHS_1_" + label;
00145 m_sourceLabelsByIndex_LHS_0[index] = "LHS_0_" + label;
00146 const Factor* sourceLabelFactor = factorCollection.AddFactor(label,true);
00147 m_sourceLabelIndexesByFactor[sourceLabelFactor] = index;
00148 }
00149
00150 inFile.Close();
00151
00152 std::list<std::string> specialLabels;
00153 specialLabels.push_back("GlueTop");
00154 specialLabels.push_back("GlueX");
00155
00156
00157 for (std::list<std::string>::const_iterator iter=specialLabels.begin();
00158 iter!=specialLabels.end(); ++iter) {
00159 boost::unordered_map<std::string,size_t>::iterator found = m_sourceLabels.find(*iter);
00160 UTIL_THROW_IF2(found == m_sourceLabels.end(), GetScoreProducerDescription()
00161 << ": Source label set file " << m_sourceLabelSetFile << " should contain an entry for the special label \"" << *iter << "\".");
00162 if (!(found->first).compare("GlueTop")) {
00163 m_GlueTopLabel = found->second;
00164
00165
00166
00167
00168 }
00169 }
00170 FEATUREVERBOSE2(2, " Done." << std::endl);
00171 }
00172
00173
00174 void SoftSourceSyntacticConstraintsFeature::LoadCoreSourceLabelSet()
00175 {
00176 FEATUREVERBOSE(2, "Loading core source label set from file " << m_coreSourceLabelSetFile << " ...");
00177
00178 LoadLabelSet(m_coreSourceLabelSetFile, m_coreSourceLabels);
00179 FEATUREVERBOSE2(2, " Done." << std::endl);
00180 }
00181
00182 void SoftSourceSyntacticConstraintsFeature::LoadLabelSet(std::string &filename,
00183 boost::unordered_set<size_t> &labelSet)
00184 {
00185 InputFileStream inFile(filename);
00186 std::string line;
00187 labelSet.clear();
00188 while (getline(inFile, line)) {
00189 istringstream tokenizer(line);
00190 std::string label;
00191 tokenizer >> label;
00192 boost::unordered_map<std::string,size_t>::iterator foundSourceLabelIndex = m_sourceLabels.find( label );
00193 if ( foundSourceLabelIndex != m_sourceLabels.end() ) {
00194 labelSet.insert(foundSourceLabelIndex->second);
00195 } else {
00196 FEATUREVERBOSE(2, "Ignoring undefined source label \"" << label << "\" "
00197 << "from core source label set file " << filename << "."
00198 << std::endl);
00199 }
00200 }
00201 inFile.Close();
00202 }
00203
00204
00205 void SoftSourceSyntacticConstraintsFeature::LoadTargetSourceLeftHandSideJointCountFile()
00206 {
00207
00208 FEATUREVERBOSE(2, "Loading target/source label joint counts from file " << m_targetSourceLHSJointCountFile << " ...");
00209 InputFileStream inFile(m_targetSourceLHSJointCountFile);
00210
00211 for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
00212 iter!=m_labelPairProbabilities.end(); ++iter) {
00213 delete iter->second;
00214 }
00215 m_labelPairProbabilities.clear();
00216
00217
00218 std::string line;
00219 FactorCollection &factorCollection = FactorCollection::Instance();
00220 boost::unordered_map<const Factor*,float> targetLHSCounts;
00221 std::vector<float> sourceLHSCounts(m_sourceLabels.size(),0.0);
00222
00223 while (getline(inFile, line)) {
00224 istringstream tokenizer(line);
00225 std::string targetLabel;
00226 std::string sourceLabel;
00227 float count;
00228 tokenizer >> targetLabel;
00229 tokenizer >> sourceLabel;
00230 tokenizer >> count;
00231
00232 boost::unordered_map<std::string,size_t>::iterator foundSourceLabelIndex = m_sourceLabels.find( sourceLabel );
00233 UTIL_THROW_IF2(foundSourceLabelIndex == m_sourceLabels.end(), GetScoreProducerDescription()
00234 << ": Target/source label joint count file " << m_targetSourceLHSJointCountFile
00235 << " contains undefined source label \"" << sourceLabel << "\".");
00236
00237 const Factor* targetLabelFactor = factorCollection.AddFactor(targetLabel,true);
00238
00239 sourceLHSCounts[foundSourceLabelIndex->second] += count;
00240 std::pair< boost::unordered_map<const Factor*,float >::iterator, bool > insertedTargetLHSCount =
00241 targetLHSCounts.insert( std::pair<const Factor*,float>(targetLabelFactor,count) );
00242 if (!insertedTargetLHSCount.second) {
00243 (insertedTargetLHSCount.first)->second += count;
00244 boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator jointCountIt =
00245 m_labelPairProbabilities.find( targetLabelFactor );
00246 assert(jointCountIt != m_labelPairProbabilities.end());
00247 (jointCountIt->second)->at(foundSourceLabelIndex->second).first += count;
00248 (jointCountIt->second)->at(foundSourceLabelIndex->second).second += count;
00249 } else {
00250 std::pair<float,float> init(0.0,0.0);
00251 std::vector< std::pair<float,float> >* sourceVector = new std::vector< std::pair<float,float> >(m_sourceLabels.size(),init);
00252 sourceVector->at(foundSourceLabelIndex->second) = std::pair<float,float>(count,count);
00253 std::pair< boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator, bool > insertedJointCount =
00254 m_labelPairProbabilities.insert( std::pair<const Factor*, std::vector< std::pair<float,float> >* >(targetLabelFactor,sourceVector) );
00255 UTIL_THROW_IF2(!insertedJointCount.second, GetScoreProducerDescription()
00256 << ": Loading target/source label joint counts from file " << m_targetSourceLHSJointCountFile << " failed.");
00257 }
00258 }
00259
00260
00261 for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
00262 iter!=m_labelPairProbabilities.end(); ++iter) {
00263 float targetLHSCount = 0;
00264 boost::unordered_map<const Factor*,float >::const_iterator targetLHSCountIt = targetLHSCounts.find( iter->first );
00265 if ( targetLHSCountIt != targetLHSCounts.end() ) {
00266 targetLHSCount = targetLHSCountIt->second;
00267 }
00268 std::vector< std::pair<float,float> > &probabilities = *(iter->second);
00269 for (size_t index=0; index<probabilities.size(); ++index) {
00270
00271 if ( probabilities[index].first != 0 ) {
00272 assert(targetLHSCount != 0);
00273 probabilities[index].first /= targetLHSCount;
00274 }
00275 if ( probabilities[index].second != 0 ) {
00276 assert(sourceLHSCounts[index] != 0);
00277 probabilities[index].second /= sourceLHSCounts[index];
00278 }
00279 }
00280 }
00281
00282 inFile.Close();
00283 FEATUREVERBOSE2(2, " Done." << std::endl);
00284 }
00285
00286
00287 void SoftSourceSyntacticConstraintsFeature::EvaluateWithSourceContext(const InputType &input
00288 , const InputPath &inputPath
00289 , const TargetPhrase &targetPhrase
00290 , const StackVec *stackVec
00291 , ScoreComponentCollection &scoreBreakdown
00292 , ScoreComponentCollection *estimatedScores) const
00293 {
00294 assert(stackVec);
00295
00296 IFFEATUREVERBOSE(3) {
00297 FEATUREVERBOSE(3, targetPhrase << std::endl);
00298 FEATUREVERBOSE(3, inputPath << std::endl);
00299 for (size_t i = 0; i < stackVec->size(); ++i) {
00300 const ChartCellLabel &cell = *stackVec->at(i);
00301 const Range &ntRange = cell.GetCoverage();
00302 FEATUREVERBOSE(3, "stackVec[ " << i << " ] : " << ntRange.GetStartPos() << " - " << ntRange.GetEndPos() << std::endl);
00303 }
00304
00305 for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignNonTerm().begin();
00306 it!=targetPhrase.GetAlignNonTerm().end(); ++it) {
00307 FEATUREVERBOSE(3, "alignNonTerm " << it->first << " " << it->second << std::endl);
00308 }
00309 }
00310
00311
00312 std::vector<float> newScores(m_numScoreComponents,0);
00313
00314 const TreeInput& treeInput = static_cast<const TreeInput&>(input);
00315
00316
00317
00318 size_t nNTs = 1;
00319 bool treeInputMismatchLHSBinary = true;
00320 size_t treeInputMismatchRHSCount = 0;
00321 bool hasCompleteTreeInputMatch = false;
00322 float ruleLabelledProbability = 0.0;
00323 float treeInputMatchProbRHS = 0.0;
00324 float treeInputMatchProbLHS = 0.0;
00325
00326
00327 const Factor* targetLHS = targetPhrase.GetTargetLHS()[0];
00328 bool isGlueGrammarRule = false;
00329 bool isUnkRule = false;
00330
00331 if (const PhraseProperty *property = targetPhrase.GetProperty("SourceLabels")) {
00332
00333 const SourceLabelsPhraseProperty *sourceLabelsPhraseProperty = static_cast<const SourceLabelsPhraseProperty*>(property);
00334
00335 nNTs = sourceLabelsPhraseProperty->GetNumberOfNonTerminals();
00336 float totalCount = sourceLabelsPhraseProperty->GetTotalCount();
00337
00338
00339 std::vector< boost::unordered_set<size_t> > treeInputLabelsRHS(nNTs-1);
00340 boost::unordered_set<size_t> treeInputLabelsLHS;
00341
00342
00343 const Range& range = inputPath.GetWordsRange();
00344 size_t startPos = range.GetStartPos();
00345 size_t endPos = range.GetEndPos();
00346 const Phrase *sourcePhrase = targetPhrase.GetRuleSource();
00347
00348 if (nNTs > 1) {
00349 size_t nonTerminalNumber = 0;
00350 size_t sourceSentPos = startPos;
00351
00352 for (size_t sourcePhrasePos=0; sourcePhrasePos<sourcePhrase->GetSize(); ++sourcePhrasePos) {
00353
00354 const Word &word = sourcePhrase->GetWord(sourcePhrasePos);
00355 size_t symbolStartPos = sourceSentPos;
00356 size_t symbolEndPos = sourceSentPos;
00357 if ( word.IsNonTerminal() ) {
00358
00359 const ChartCellLabel &cell = *stackVec->at(nonTerminalNumber);
00360 const Range& prevWordsRange = cell.GetCoverage();
00361 symbolStartPos = prevWordsRange.GetStartPos();
00362 symbolEndPos = prevWordsRange.GetEndPos();
00363 }
00364
00365 const NonTerminalSet& treeInputLabels = treeInput.GetLabelSet(symbolStartPos,symbolEndPos);
00366
00367 for (NonTerminalSet::const_iterator treeInputLabelsIt = treeInputLabels.begin();
00368 treeInputLabelsIt != treeInputLabels.end(); ++treeInputLabelsIt) {
00369 if (*treeInputLabelsIt != m_options->syntax.output_default_non_terminal) {
00370 boost::unordered_map<const Factor*,size_t>::const_iterator foundTreeInputLabel
00371 = m_sourceLabelIndexesByFactor.find((*treeInputLabelsIt)[0]);
00372 if (foundTreeInputLabel != m_sourceLabelIndexesByFactor.end()) {
00373 size_t treeInputLabelIndex = foundTreeInputLabel->second;
00374 treeInputLabelsRHS[sourcePhrasePos].insert(treeInputLabelIndex);
00375 }
00376 }
00377 }
00378
00379 if ( word.IsNonTerminal() ) {
00380 ++nonTerminalNumber;
00381 }
00382 sourceSentPos = symbolEndPos + 1;
00383 }
00384 }
00385
00386
00387 const NonTerminalSet& treeInputLabels = treeInput.GetLabelSet(startPos,endPos);
00388
00389 for (NonTerminalSet::const_iterator treeInputLabelsIt = treeInputLabels.begin();
00390 treeInputLabelsIt != treeInputLabels.end(); ++treeInputLabelsIt) {
00391 if (*treeInputLabelsIt != m_options->syntax.output_default_non_terminal) {
00392 boost::unordered_map<const Factor*,size_t>::const_iterator foundTreeInputLabel
00393 = m_sourceLabelIndexesByFactor.find((*treeInputLabelsIt)[0]);
00394 if (foundTreeInputLabel != m_sourceLabelIndexesByFactor.end()) {
00395 size_t treeInputLabelIndex = foundTreeInputLabel->second;
00396 treeInputLabelsLHS.insert(treeInputLabelIndex);
00397 }
00398 }
00399 }
00400
00401
00402
00403
00404 std::vector< boost::unordered_set<size_t> > sparseScoredTreeInputLabelsRHS(nNTs-1);
00405 boost::unordered_set<size_t> sparseScoredTreeInputLabelsLHS;
00406
00407 std::vector<bool> sourceLabelSeenAsLHS(m_sourceLabels.size(),false);
00408 std::vector<bool> treeInputMatchRHSCountByNonTerminal(nNTs-1,false);
00409 std::vector<float> treeInputMatchProbRHSByNonTerminal(nNTs-1,0.0);
00410
00411 const std::list<SourceLabelsPhrasePropertyItem> &sourceLabelItems = sourceLabelsPhraseProperty->GetSourceLabelItems();
00412
00413 for (std::list<SourceLabelsPhrasePropertyItem>::const_iterator sourceLabelItem = sourceLabelItems.begin();
00414 sourceLabelItem != sourceLabelItems.end() && !hasCompleteTreeInputMatch; ++sourceLabelItem) {
00415
00416 const std::list<size_t> &sourceLabelsRHS = sourceLabelItem->GetSourceLabelsRHS();
00417 const std::list< std::pair<size_t,float> > &sourceLabelsLHSList = sourceLabelItem->GetSourceLabelsLHSList();
00418 float sourceLabelsRHSCount = sourceLabelItem->GetSourceLabelsRHSCount();
00419
00420 assert(sourceLabelsRHS.size() == nNTs-1);
00421
00422 bool currentSourceLabelItemIsCompleteTreeInputMatch = true;
00423
00424 size_t nonTerminalNumber=0;
00425 for (std::list<size_t>::const_iterator sourceLabelsRHSIt = sourceLabelsRHS.begin();
00426 sourceLabelsRHSIt != sourceLabelsRHS.end(); ++sourceLabelsRHSIt, ++nonTerminalNumber) {
00427
00428 if (treeInputLabelsRHS[nonTerminalNumber].find(*sourceLabelsRHSIt) != treeInputLabelsRHS[nonTerminalNumber].end()) {
00429
00430 treeInputMatchRHSCountByNonTerminal[nonTerminalNumber] = true;
00431 treeInputMatchProbRHSByNonTerminal[nonTerminalNumber] += sourceLabelsRHSCount;
00432
00433 if ( m_useSparse &&
00434 (!m_useCoreSourceLabels || m_coreSourceLabels.find(*sourceLabelsRHSIt) != m_coreSourceLabels.end()) ) {
00435
00436 if (sparseScoredTreeInputLabelsRHS[nonTerminalNumber].find(*sourceLabelsRHSIt) == sparseScoredTreeInputLabelsRHS[nonTerminalNumber].end()) {
00437
00438 float score_RHS_1 = (float)1/treeInputLabelsRHS[nonTerminalNumber].size();
00439 scoreBreakdown.PlusEquals(this,
00440 m_sourceLabelsByIndex_RHS_1[*sourceLabelsRHSIt],
00441 score_RHS_1);
00442 sparseScoredTreeInputLabelsRHS[nonTerminalNumber].insert(*sourceLabelsRHSIt);
00443 }
00444 }
00445
00446 } else {
00447
00448 currentSourceLabelItemIsCompleteTreeInputMatch = false;
00449
00450 }
00451 }
00452
00453 for (std::list< std::pair<size_t,float> >::const_iterator sourceLabelsLHSIt = sourceLabelsLHSList.begin();
00454 sourceLabelsLHSIt != sourceLabelsLHSList.end(); ++sourceLabelsLHSIt) {
00455
00456 if ( sourceLabelsLHSIt->first == m_GlueTopLabel ) {
00457 isGlueGrammarRule = true;
00458 }
00459
00460 if (treeInputLabelsLHS.find(sourceLabelsLHSIt->first) != treeInputLabelsLHS.end()) {
00461
00462 treeInputMismatchLHSBinary = false;
00463 treeInputMatchProbLHS += sourceLabelsLHSIt->second;
00464
00465 if ( m_useSparse &&
00466 (!m_useCoreSourceLabels || m_coreSourceLabels.find(sourceLabelsLHSIt->first) != m_coreSourceLabels.end()) ) {
00467
00468 if (sparseScoredTreeInputLabelsLHS.find(sourceLabelsLHSIt->first) == sparseScoredTreeInputLabelsLHS.end()) {
00469
00470 float score_LHS_1 = (float)1/treeInputLabelsLHS.size();
00471 scoreBreakdown.PlusEquals(this,
00472 m_sourceLabelsByIndex_LHS_1[sourceLabelsLHSIt->first],
00473 score_LHS_1);
00474 sparseScoredTreeInputLabelsLHS.insert(sourceLabelsLHSIt->first);
00475 }
00476 }
00477
00478 if ( currentSourceLabelItemIsCompleteTreeInputMatch ) {
00479 ruleLabelledProbability += sourceLabelsLHSIt->second;
00480 hasCompleteTreeInputMatch = true;
00481 }
00482
00483 }
00484 }
00485 }
00486
00487
00488 for (std::vector<float>::iterator treeInputMatchProbRHSByNonTerminalIt = treeInputMatchProbRHSByNonTerminal.begin();
00489 treeInputMatchProbRHSByNonTerminalIt != treeInputMatchProbRHSByNonTerminal.end(); ++treeInputMatchProbRHSByNonTerminalIt) {
00490 *treeInputMatchProbRHSByNonTerminalIt /= totalCount;
00491 if ( *treeInputMatchProbRHSByNonTerminalIt != 0 ) {
00492 treeInputMatchProbRHS += ( m_useLogprobs ? TransformScore(*treeInputMatchProbRHSByNonTerminalIt) : *treeInputMatchProbRHSByNonTerminalIt );
00493 }
00494 }
00495 treeInputMatchProbLHS /= totalCount;
00496 ruleLabelledProbability /= totalCount;
00497
00498
00499 if ( !hasCompleteTreeInputMatch ) {
00500 treeInputMismatchRHSCount = nNTs-1;
00501 for (std::vector<bool>::const_iterator treeInputMatchRHSCountByNonTerminalIt = treeInputMatchRHSCountByNonTerminal.begin();
00502 treeInputMatchRHSCountByNonTerminalIt != treeInputMatchRHSCountByNonTerminal.end(); ++treeInputMatchRHSCountByNonTerminalIt) {
00503 if (*treeInputMatchRHSCountByNonTerminalIt) {
00504 --treeInputMismatchRHSCount;
00505 }
00506 }
00507 }
00508
00509
00510 if ( m_useSparse ) {
00511
00512
00513
00514 for (size_t nonTerminalNumber = 0; nonTerminalNumber < nNTs-1; ++nonTerminalNumber) {
00515
00516
00517 float score_RHS_0 = (float)1/treeInputLabelsRHS[nonTerminalNumber].size();
00518 for (boost::unordered_set<size_t>::const_iterator treeInputLabelsRHSIt = treeInputLabelsRHS[nonTerminalNumber].begin();
00519 treeInputLabelsRHSIt != treeInputLabelsRHS[nonTerminalNumber].end(); ++treeInputLabelsRHSIt) {
00520
00521 if ( !m_useCoreSourceLabels || m_coreSourceLabels.find(*treeInputLabelsRHSIt) != m_coreSourceLabels.end() ) {
00522
00523 if (sparseScoredTreeInputLabelsRHS[nonTerminalNumber].find(*treeInputLabelsRHSIt) == sparseScoredTreeInputLabelsRHS[nonTerminalNumber].end()) {
00524
00525 scoreBreakdown.PlusEquals(this,
00526 m_sourceLabelsByIndex_RHS_0[*treeInputLabelsRHSIt],
00527 score_RHS_0);
00528 }
00529 }
00530 }
00531 }
00532
00533
00534
00535 float score_LHS_0 = (float)1/treeInputLabelsLHS.size();
00536 for (boost::unordered_set<size_t>::const_iterator treeInputLabelsLHSIt = treeInputLabelsLHS.begin();
00537 treeInputLabelsLHSIt != treeInputLabelsLHS.end(); ++treeInputLabelsLHSIt) {
00538
00539 if ( !m_useCoreSourceLabels || m_coreSourceLabels.find(*treeInputLabelsLHSIt) != m_coreSourceLabels.end() ) {
00540
00541 if (sparseScoredTreeInputLabelsLHS.find(*treeInputLabelsLHSIt) == sparseScoredTreeInputLabelsLHS.end()) {
00542
00543 scoreBreakdown.PlusEquals(this,
00544 m_sourceLabelsByIndex_LHS_0[*treeInputLabelsLHSIt],
00545 score_LHS_0);
00546 }
00547 }
00548 }
00549
00550 }
00551
00552 if ( m_useSparseLabelPairs && !isGlueGrammarRule ) {
00553
00554
00555 float t2sLabelsScore = 0.0;
00556 float s2tLabelsScore = 0.0;
00557 for (boost::unordered_set<size_t>::const_iterator treeInputLabelsLHSIt = treeInputLabelsLHS.begin();
00558 treeInputLabelsLHSIt != treeInputLabelsLHS.end(); ++treeInputLabelsLHSIt) {
00559
00560 scoreBreakdown.PlusEquals(this,
00561 "LHSPAIR_" + targetLHS->GetString().as_string() + "_" + m_sourceLabelsByIndex[*treeInputLabelsLHSIt],
00562 (float)1/treeInputLabelsLHS.size());
00563
00564 if (!m_targetSourceLHSJointCountFile.empty()) {
00565 std::pair<float,float> probPair = GetLabelPairProbabilities( targetLHS, *treeInputLabelsLHSIt);
00566 t2sLabelsScore += probPair.first;
00567 s2tLabelsScore += probPair.second;
00568 }
00569 }
00570 if ( treeInputLabelsLHS.size() == 0 ) {
00571 scoreBreakdown.PlusEquals(this,
00572 "LHSPAIR_" + targetLHS->GetString().as_string() + "_"
00573 + m_options->syntax.output_default_non_terminal[0]
00574 ->GetString().as_string(),
00575 1);
00576 if (!m_targetSourceLHSJointCountFile.empty()) {
00577 t2sLabelsScore = TransformScore(m_floor);
00578 s2tLabelsScore = TransformScore(m_floor);
00579 }
00580 } else {
00581 if (!m_targetSourceLHSJointCountFile.empty()) {
00582 float norm = TransformScore(treeInputLabelsLHS.size());
00583 t2sLabelsScore = TransformScore(t2sLabelsScore) - norm;
00584 s2tLabelsScore = TransformScore(s2tLabelsScore) - norm;
00585 }
00586 }
00587 if (!m_targetSourceLHSJointCountFile.empty()) {
00588 scoreBreakdown.PlusEquals(this, "LHST2S", t2sLabelsScore);
00589 scoreBreakdown.PlusEquals(this, "LHSS2T", s2tLabelsScore);
00590 }
00591 }
00592
00593 } else {
00594
00595
00596 UTIL_THROW_IF2(!targetPhrase.GetWord(0).IsOOV(), GetScoreProducerDescription()
00597 << ": Missing SourceLabels property. "
00598 << "Please check phrase table and glue rules.");
00599
00600
00601 isUnkRule = true;
00602
00603
00604 }
00605
00606
00607
00608
00609 newScores[0] = !hasCompleteTreeInputMatch;
00610 if ( m_noMismatches ) {
00611 newScores[0] = ( (hasCompleteTreeInputMatch || isGlueGrammarRule || isUnkRule) ? 0 : -std::numeric_limits<float>::infinity() );
00612 }
00613 newScores[1] = treeInputMismatchLHSBinary;
00614 newScores[2] = treeInputMismatchRHSCount;
00615
00616 if ( m_useLogprobs ) {
00617 if ( ruleLabelledProbability != 0 ) {
00618 ruleLabelledProbability = TransformScore(ruleLabelledProbability);
00619 }
00620 if ( treeInputMatchProbLHS != 0 ) {
00621 treeInputMatchProbLHS = TransformScore(treeInputMatchProbLHS);
00622 }
00623 }
00624
00625 newScores[3] = ruleLabelledProbability;
00626 newScores[4] = treeInputMatchProbLHS;
00627 newScores[5] = treeInputMatchProbRHS;
00628
00629 scoreBreakdown.PlusEquals(this, newScores);
00630 }
00631
00632
00633 std::pair<float,float> SoftSourceSyntacticConstraintsFeature::GetLabelPairProbabilities(
00634 const Factor* target,
00635 const size_t source) const
00636 {
00637 boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::const_iterator found =
00638 m_labelPairProbabilities.find(target);
00639 if ( found == m_labelPairProbabilities.end() ) {
00640 return std::pair<float,float>(m_floor,m_floor);
00641 }
00642 std::pair<float,float> ret = found->second->at(source);
00643 if ( ret == std::pair<float,float>(0,0) ) {
00644 return std::pair<float,float>(m_floor,m_floor);
00645 }
00646 return ret;
00647 }
00648
00649
00650 }
00651