00001
00002
00003 #include "domain.h"
00004 #include "tables-core.h"
00005 #include "InputFileStream.h"
00006 #include "SafeGetline.h"
00007
00008 #define TABLE_LINE_MAX_LENGTH 1000
00009
00010 using namespace std;
00011
00012 namespace MosesTraining
00013 {
00014
00015
00016 void Domain::load( const std::string &domainFileName )
00017 {
00018 Moses::InputFileStream fileS( domainFileName );
00019 istream *fileP = &fileS;
00020 while(true) {
00021 char line[TABLE_LINE_MAX_LENGTH];
00022 SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
00023 if (fileP->eof()) break;
00024
00025 vector< string > domainSpecLine = tokenize( line );
00026 int lineNumber;
00027 if (domainSpecLine.size() != 2 ||
00028 ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
00029 cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
00030 exit(1);
00031 }
00032
00033 string &name = domainSpecLine[1];
00034 spec.push_back( make_pair( lineNumber, name ));
00035 if (name2id.find( name ) == name2id.end()) {
00036 name2id[ name ] = list.size();
00037 list.push_back( name );
00038 }
00039 }
00040 }
00041
00042
00043 string Domain::getDomainOfSentence( int sentenceId ) const
00044 {
00045 for(size_t i=0; i<spec.size(); i++) {
00046 if (sentenceId <= spec[i].first) {
00047 return spec[i].second;
00048 }
00049 }
00050 return "undefined";
00051 }
00052
00053 DomainFeature::DomainFeature(const string& domainFile)
00054 {
00055
00056 m_domain.load(domainFile);
00057 }
00058
00059 void DomainFeature::add(const ScoreFeatureContext& context,
00060 std::vector<float>& denseValues,
00061 std::map<std::string,float>& sparseValues) const
00062 {
00063 map< string, float > domainCount;
00064 for(size_t i=0; i<context.phrasePair.size(); i++) {
00065 string d = m_domain.getDomainOfSentence(context.phrasePair[i]->sentenceId );
00066 if (domainCount.find( d ) == domainCount.end()) {
00067 domainCount[d] = context.phrasePair[i]->count;
00068 } else {
00069 domainCount[d] += context.phrasePair[i]->count;
00070 }
00071 }
00072 add(domainCount, context.count, context.maybeLog, denseValues, sparseValues);
00073 }
00074
00075 void SubsetDomainFeature::add(const map<string,float>& domainCount,float count,
00076 const MaybeLog& maybeLog,
00077 std::vector<float>& denseValues,
00078 std::map<std::string,float>& sparseValues) const
00079 {
00080 if (m_domain.list.size() > 6) {
00081 UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
00082 "too many domains for core domain subset features");
00083 }
00084 size_t bitmap = 0;
00085 for(size_t bit = 0; bit < m_domain.list.size(); bit++) {
00086 if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) {
00087 bitmap += 1 << bit;
00088 }
00089 }
00090 for(size_t i = 1; i < (1 << m_domain.list.size()); i++) {
00091 denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));
00092 }
00093 }
00094
00095 void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,
00096 const MaybeLog& maybeLog,
00097 std::vector<float>& denseValues,
00098 std::map<std::string,float>& sparseValues) const
00099 {
00100 typedef vector<string>::const_iterator I;
00101 ostringstream key;
00102 key << "doms";
00103 for (I i = m_domain.list.begin(); i != m_domain.list.end(); ++i) {
00104 if (domainCount.find(*i) != domainCount.end()) {
00105 key << "_" << *i;
00106 }
00107 }
00108 sparseValues[key.str()] = 1;
00109 }
00110
00111
00112 void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
00113 const MaybeLog& maybeLog,
00114 std::vector<float>& denseValues,
00115 std::map<std::string,float>& sparseValues) const
00116 {
00117 typedef vector< string >::const_iterator I;
00118 for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
00119 map<string,float>::const_iterator dci = domainCount.find(*i);
00120 if (dci == domainCount.end() ) {
00121 denseValues.push_back(maybeLog( 1 ));
00122 } else {
00123 denseValues.push_back(maybeLog(exp( dci->second / count ) ));
00124 }
00125 }
00126 }
00127
00128
00129 void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,
00130 const MaybeLog& maybeLog,
00131 std::vector<float>& denseValues,
00132 std::map<std::string,float>& sparseValues) const
00133 {
00134 typedef map< string, float >::const_iterator I;
00135 for (I i=domainCount.begin(); i != domainCount.end(); i++) {
00136 sparseValues["domr_" + i->first] = (i->second / count);
00137 }
00138 }
00139
00140
00141 void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
00142 const MaybeLog& maybeLog,
00143 std::vector<float>& denseValues,
00144 std::map<std::string,float>& sparseValues) const
00145 {
00146 typedef vector< string >::const_iterator I;
00147 for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
00148 map<string,float>::const_iterator dci = domainCount.find(*i);
00149 if (dci == domainCount.end() ) {
00150 denseValues.push_back(maybeLog( 1 ));
00151 } else {
00152 denseValues.push_back(maybeLog(2.718));
00153 }
00154 }
00155
00156 }
00157
00158 void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
00159 const MaybeLog& maybeLog,
00160 std::vector<float>& denseValues,
00161 std::map<std::string,float>& sparseValues) const
00162 {
00163 typedef map< string, float >::const_iterator I;
00164 for (I i=domainCount.begin(); i != domainCount.end(); i++) {
00165 sparseValues["dom_" + i->first] = 1;
00166 }
00167 }
00168
00169 bool DomainFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
00170 {
00171 return m_domain.getDomainOfSentence(lhs.sentenceId) ==
00172 m_domain.getDomainOfSentence( rhs.sentenceId);
00173 }
00174
00175
00176 }
00177