00001 #include "moses/PP/SourceLabelsPhraseProperty.h"
00002 #include <iostream>
00003 #include <cstdio>
00004 #include <cstdlib>
00005 #include <sstream>
00006 #include <string>
00007 #include <queue>
00008 #include <cassert>
00009 #include <limits>
00010
00011 namespace Moses
00012 {
00013
00014 void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
00015 {
00016 std::istringstream tokenizer(value);
00017
00018 if (! (tokenizer >> m_nNTs)) {
00019 UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value);
00020 }
00021 assert( m_nNTs > 0 );
00022
00023 if (! (tokenizer >> m_totalCount)) {
00024 UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value);
00025 }
00026 assert( m_totalCount > 0.0 );
00027
00028
00029
00030
00031
00032 std::priority_queue<float> ruleLabelledCountsPQ;
00033
00034 while (tokenizer.peek() != EOF) {
00035
00036
00037 SourceLabelsPhrasePropertyItem item;
00038 size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
00039
00040 if (m_nNTs == 1) {
00041
00042 item.m_sourceLabelsRHSCount = m_totalCount;
00043
00044 } else {
00045
00046 for (size_t i=0; i<m_nNTs-1; ++i) {
00047 size_t sourceLabelRHS;
00048 if (! (tokenizer >> sourceLabelRHS) ) {
00049 UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value);
00050 }
00051 item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
00052 }
00053
00054 if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
00055 UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value);
00056 }
00057
00058 if (! (tokenizer >> numberOfLHSsGivenRHS)) {
00059 UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value);
00060 }
00061 }
00062
00063 for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) {
00064 size_t sourceLabelLHS;
00065 if (! (tokenizer >> sourceLabelLHS)) {
00066 UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value);
00067 }
00068 float ruleSourceLabelledCount;
00069 if (! (tokenizer >> ruleSourceLabelledCount)) {
00070 UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value);
00071 }
00072 item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
00073 ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
00074 }
00075
00076 m_sourceLabelItems.push_back(item);
00077
00078
00079
00080
00081 }
00082
00083
00084 const size_t N=50;
00085
00086 if (ruleLabelledCountsPQ.size() > N) {
00087
00088 float topNRuleLabelledCount = std::numeric_limits<int>::max();
00089 for (size_t i=0; !ruleLabelledCountsPQ.empty() && i<N; ++i) {
00090 topNRuleLabelledCount = ruleLabelledCountsPQ.top();
00091 ruleLabelledCountsPQ.pop();
00092 }
00093
00094 size_t nKept=0;
00095 std::list<SourceLabelsPhrasePropertyItem>::iterator itemIter=m_sourceLabelItems.begin();
00096 while (itemIter!=m_sourceLabelItems.end()) {
00097 if (itemIter->m_sourceLabelsRHSCount < topNRuleLabelledCount) {
00098 itemIter = m_sourceLabelItems.erase(itemIter);
00099 } else {
00100 std::list< std::pair<size_t,float> >::iterator itemLHSIter=(itemIter->m_sourceLabelsLHSList).begin();
00101 while (itemLHSIter!=(itemIter->m_sourceLabelsLHSList).end()) {
00102 if (itemLHSIter->second < topNRuleLabelledCount) {
00103 itemLHSIter = (itemIter->m_sourceLabelsLHSList).erase(itemLHSIter);
00104 } else {
00105 if (nKept >= N) {
00106 itemLHSIter = (itemIter->m_sourceLabelsLHSList).erase(itemLHSIter,(itemIter->m_sourceLabelsLHSList).end());
00107 } else {
00108 ++nKept;
00109 ++itemLHSIter;
00110 }
00111 }
00112 }
00113 if ((itemIter->m_sourceLabelsLHSList).empty()) {
00114 itemIter = m_sourceLabelItems.erase(itemIter);
00115 } else {
00116 ++itemIter;
00117 }
00118 }
00119 }
00120 }
00121 };
00122
00123 }
00124