00001 #include <fstream>
00002 #include "GlobalLexicalModel.h"
00003 #include "moses/StaticData.h"
00004 #include "moses/InputFileStream.h"
00005 #include "moses/TranslationOption.h"
00006 #include "moses/TranslationTask.h"
00007 #include "moses/FactorCollection.h"
00008 #include "util/exception.hh"
00009
00010 using namespace std;
00011
00012 namespace Moses
00013 {
00014 GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
00015 : StatelessFeatureFunction(1, line)
00016 {
00017 std::cerr << "Creating global lexical model...\n";
00018 ReadParameters();
00019
00020
00021 FactorCollection &factorCollection = FactorCollection::Instance();
00022 m_bias = new Word();
00023 const Factor* factor = factorCollection.AddFactor( Input, m_inputFactorsVec[0], "**BIAS**" );
00024 m_bias->SetFactor( m_inputFactorsVec[0], factor );
00025
00026 }
00027
00028 void GlobalLexicalModel::SetParameter(const std::string& key, const std::string& value)
00029 {
00030 if (key == "path") {
00031 m_filePath = value;
00032 } else if (key == "input-factor") {
00033 m_inputFactorsVec = Tokenize<FactorType>(value,",");
00034 } else if (key == "output-factor") {
00035 m_outputFactorsVec = Tokenize<FactorType>(value,",");
00036 } else {
00037 StatelessFeatureFunction::SetParameter(key, value);
00038 }
00039 }
00040
00041 GlobalLexicalModel::~GlobalLexicalModel()
00042 {
00043
00044 DoubleHash::const_iterator iter;
00045 for(iter = m_hash.begin(); iter != m_hash.end(); iter++ ) {
00046 boost::unordered_map< const Word*, float, UnorderedComparer<Word>, UnorderedComparer<Word> >::const_iterator iter2;
00047 for(iter2 = iter->second.begin(); iter2 != iter->second.end(); iter2++ ) {
00048 delete iter2->first;
00049 }
00050 delete iter->first;
00051 }
00052 }
00053
00054 void GlobalLexicalModel::Load(AllOptions::ptr const& opts)
00055 {
00056 m_options = opts;
00057 FactorCollection &factorCollection = FactorCollection::Instance();
00058 const std::string& oFactorDelimiter = opts->output.factor_delimiter;
00059 const std::string& iFactorDelimiter = opts->input.factor_delimiter;
00060
00061
00062 VERBOSE(2, "Loading global lexical model from file " << m_filePath << endl);
00063
00064 m_inputFactors = FactorMask(m_inputFactorsVec);
00065 m_outputFactors = FactorMask(m_outputFactorsVec);
00066 InputFileStream inFile(m_filePath);
00067
00068
00069 size_t lineNum = 0;
00070 string line;
00071 while(getline(inFile, line)) {
00072 ++lineNum;
00073 vector<string> token = Tokenize<string>(line, " ");
00074
00075 if (token.size() != 3) {
00076 UTIL_THROW2("Syntax error at " << m_filePath << ":" << lineNum << ":" << line);
00077 }
00078
00079
00080 Word *outWord = new Word();
00081 vector<string> factorString = Tokenize( token[0], oFactorDelimiter );
00082 for (size_t i=0 ; i < m_outputFactorsVec.size() ; i++) {
00083 const FactorDirection& direction = Output;
00084 const FactorType& factorType = m_outputFactorsVec[i];
00085 const Factor* factor
00086 = factorCollection.AddFactor( direction, factorType, factorString[i] );
00087 outWord->SetFactor( factorType, factor );
00088 }
00089
00090
00091 Word *inWord = new Word();
00092 factorString = Tokenize( token[1], iFactorDelimiter );
00093 for (size_t i=0 ; i < m_inputFactorsVec.size() ; i++) {
00094 const FactorDirection& direction = Input;
00095 const FactorType& factorType = m_inputFactorsVec[i];
00096 const Factor* factor
00097 = factorCollection.AddFactor( direction, factorType, factorString[i] );
00098 inWord->SetFactor( factorType, factor );
00099 }
00100
00101
00102 float score = Scan<float>(token[2]);
00103
00104
00105
00106
00107 DoubleHash::iterator keyOutWord = m_hash.find( outWord );
00108 if( keyOutWord == m_hash.end() ) {
00109 m_hash[outWord][inWord] = score;
00110 } else {
00111 (keyOutWord->second)[inWord] = score;
00112 delete outWord;
00113 }
00114 }
00115 }
00116
00117 void GlobalLexicalModel::InitializeForInput(ttasksptr const& ttask)
00118 {
00119 UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput,
00120 "GlobalLexicalModel works only with sentence input.");
00121 Sentence const* s = reinterpret_cast<Sentence const*>(ttask->GetSource().get());
00122 m_local.reset(new ThreadLocalStorage);
00123 m_local->input = s;
00124 }
00125
00126 float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const
00127 {
00128 const Sentence& input = *(m_local->input);
00129 float score = 0;
00130 for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
00131 float sum = 0;
00132 const Word& targetWord = targetPhrase.GetWord( targetIndex );
00133 VERBOSE(2,"glm " << targetWord << ": ");
00134 const DoubleHash::const_iterator targetWordHash = m_hash.find( &targetWord );
00135 if( targetWordHash != m_hash.end() ) {
00136 SingleHash::const_iterator inputWordHash = targetWordHash->second.find( m_bias );
00137 if( inputWordHash != targetWordHash->second.end() ) {
00138 VERBOSE(2,"*BIAS* " << inputWordHash->second);
00139 sum += inputWordHash->second;
00140 }
00141
00142 boost::unordered_set< const Word*, UnorderedComparer<Word>, UnorderedComparer<Word> > alreadyScored;
00143 for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) {
00144 const Word& inputWord = input.GetWord( inputIndex );
00145 if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) {
00146 SingleHash::const_iterator inputWordHash = targetWordHash->second.find( &inputWord );
00147 if( inputWordHash != targetWordHash->second.end() ) {
00148 VERBOSE(2," " << inputWord << " " << inputWordHash->second);
00149 sum += inputWordHash->second;
00150 }
00151 alreadyScored.insert( &inputWord );
00152 }
00153 }
00154 }
00155
00156 VERBOSE(2," p=" << FloorScore( log(1/(1+exp(-sum))) ) << endl);
00157 score += FloorScore( log(1/(1+exp(-sum))) );
00158 }
00159 return score;
00160 }
00161
00162 float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetPhrase ) const
00163 {
00164 LexiconCache& m_cache = m_local->cache;
00165 const LexiconCache::const_iterator query = m_cache.find( &targetPhrase );
00166 if ( query != m_cache.end() ) {
00167 return query->second;
00168 }
00169
00170 float score = ScorePhrase( targetPhrase );
00171 m_cache.insert( pair<const TargetPhrase*, float>(&targetPhrase, score) );
00172
00173 return score;
00174 }
00175
00176 void GlobalLexicalModel::EvaluateWithSourceContext(const InputType &input
00177 , const InputPath &inputPath
00178 , const TargetPhrase &targetPhrase
00179 , const StackVec *stackVec
00180 , ScoreComponentCollection &scoreBreakdown
00181 , ScoreComponentCollection *estimatedScores) const
00182 {
00183 scoreBreakdown.PlusEquals( this, GetFromCacheOrScorePhrase(targetPhrase) );
00184 }
00185
00186 bool GlobalLexicalModel::IsUseable(const FactorMask &mask) const
00187 {
00188 for (size_t i = 0; i < m_outputFactors.size(); ++i) {
00189 if (m_outputFactors[i]) {
00190 if (!mask[i]) {
00191 return false;
00192 }
00193 }
00194 }
00195
00196 return true;
00197 }
00198
00199 }