00001 #include <fstream>
00002 #include "GlobalLexicalModel.h"
00003 #include "moses/StaticData.h"
00004 #include "moses/InputFileStream.h"
00005 #include "moses/TranslationOption.h"
00006 #include "moses/UserMessage.h"
00007
00008 using namespace std;
00009
00010 namespace Moses
00011 {
00012 GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
00013 : StatelessFeatureFunction("GlobalLexicalModel",1, line)
00014 {
00015 std::cerr << "Creating global lexical model...\n";
00016
00017 size_t ind = 0;
00018 while (ind < m_args.size()) {
00019 vector<string> &args = m_args[ind];
00020 bool consumed = SetParameter(args[0], args[1]);
00021 if (consumed) {
00022 m_args.erase(m_args.begin() + ind);
00023 } else {
00024 ++ind;
00025 }
00026 }
00027 CHECK(m_args.size() == 0);
00028
00029
00030 FactorCollection &factorCollection = FactorCollection::Instance();
00031 m_bias = new Word();
00032 const Factor* factor = factorCollection.AddFactor( Input, m_inputFactorsVec[0], "**BIAS**" );
00033 m_bias->SetFactor( m_inputFactorsVec[0], factor );
00034
00035 }
00036
00037 bool GlobalLexicalModel::SetParameter(const std::string& key, const std::string& value)
00038 {
00039 if (key == "file") {
00040 m_filePath = value;
00041 } else if (key == "inputFactors") {
00042 m_inputFactorsVec = Tokenize<FactorType>(value,",");
00043 } else if (key == "outputFactors") {
00044 m_outputFactorsVec = Tokenize<FactorType>(value,",");
00045 } else {
00046 return StatelessFeatureFunction::SetParameter(key, value);
00047 }
00048 return true;
00049 }
00050
00051 GlobalLexicalModel::~GlobalLexicalModel()
00052 {
00053
00054 DoubleHash::const_iterator iter;
00055 for(iter = m_hash.begin(); iter != m_hash.end(); iter++ ) {
00056 map< const Word*, float, WordComparer >::const_iterator iter2;
00057 for(iter2 = iter->second.begin(); iter2 != iter->second.end(); iter2++ ) {
00058 delete iter2->first;
00059 }
00060 delete iter->first;
00061 }
00062 }
00063
00064 void GlobalLexicalModel::Load()
00065 {
00066 FactorCollection &factorCollection = FactorCollection::Instance();
00067 const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
00068
00069 VERBOSE(2, "Loading global lexical model from file " << m_filePath << endl);
00070
00071 m_inputFactors = FactorMask(m_inputFactorsVec);
00072 m_outputFactors = FactorMask(m_outputFactorsVec);
00073 InputFileStream inFile(m_filePath);
00074
00075
00076 size_t lineNum = 0;
00077 string line;
00078 while(getline(inFile, line)) {
00079 ++lineNum;
00080 vector<string> token = Tokenize<string>(line, " ");
00081
00082 if (token.size() != 3) {
00083 stringstream errorMessage;
00084 errorMessage << "Syntax error at " << m_filePath << ":" << lineNum << endl << line << endl;
00085 UserMessage::Add(errorMessage.str());
00086 abort();
00087 }
00088
00089
00090 Word *outWord = new Word();
00091 vector<string> factorString = Tokenize( token[0], factorDelimiter );
00092 for (size_t i=0 ; i < m_outputFactorsVec.size() ; i++) {
00093 const FactorDirection& direction = Output;
00094 const FactorType& factorType = m_outputFactorsVec[i];
00095 const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] );
00096 outWord->SetFactor( factorType, factor );
00097 }
00098
00099
00100 Word *inWord = new Word();
00101 factorString = Tokenize( token[1], factorDelimiter );
00102 for (size_t i=0 ; i < m_inputFactorsVec.size() ; i++) {
00103 const FactorDirection& direction = Input;
00104 const FactorType& factorType = m_inputFactorsVec[i];
00105 const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] );
00106 inWord->SetFactor( factorType, factor );
00107 }
00108
00109
00110 float score = Scan<float>(token[2]);
00111
00112
00113
00114
00115 DoubleHash::iterator keyOutWord = m_hash.find( outWord );
00116 if( keyOutWord == m_hash.end() ) {
00117 m_hash[outWord][inWord] = score;
00118 } else {
00119 (keyOutWord->second)[inWord] = score;
00120 delete outWord;
00121 }
00122 }
00123 }
00124
00125 void GlobalLexicalModel::InitializeForInput( Sentence const& in )
00126 {
00127 m_local.reset(new ThreadLocalStorage);
00128 m_local->input = ∈
00129 }
00130
00131 float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const
00132 {
00133 const Sentence& input = *(m_local->input);
00134 float score = 0;
00135 for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
00136 float sum = 0;
00137 const Word& targetWord = targetPhrase.GetWord( targetIndex );
00138 VERBOSE(2,"glm " << targetWord << ": ");
00139 const DoubleHash::const_iterator targetWordHash = m_hash.find( &targetWord );
00140 if( targetWordHash != m_hash.end() ) {
00141 SingleHash::const_iterator inputWordHash = targetWordHash->second.find( m_bias );
00142 if( inputWordHash != targetWordHash->second.end() ) {
00143 VERBOSE(2,"*BIAS* " << inputWordHash->second);
00144 sum += inputWordHash->second;
00145 }
00146
00147 set< const Word*, WordComparer > alreadyScored;
00148 for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) {
00149 const Word& inputWord = input.GetWord( inputIndex );
00150 if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) {
00151 SingleHash::const_iterator inputWordHash = targetWordHash->second.find( &inputWord );
00152 if( inputWordHash != targetWordHash->second.end() ) {
00153 VERBOSE(2," " << inputWord << " " << inputWordHash->second);
00154 sum += inputWordHash->second;
00155 }
00156 alreadyScored.insert( &inputWord );
00157 }
00158 }
00159 }
00160
00161 VERBOSE(2," p=" << FloorScore( log(1/(1+exp(-sum))) ) << endl);
00162 score += FloorScore( log(1/(1+exp(-sum))) );
00163 }
00164 return score;
00165 }
00166
00167 float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetPhrase ) const
00168 {
00169 LexiconCache& m_cache = m_local->cache;
00170 const LexiconCache::const_iterator query = m_cache.find( &targetPhrase );
00171 if ( query != m_cache.end() ) {
00172 return query->second;
00173 }
00174
00175 float score = ScorePhrase( targetPhrase );
00176 m_cache.insert( pair<const TargetPhrase*, float>(&targetPhrase, score) );
00177
00178 return score;
00179 }
00180
00181 void GlobalLexicalModel::Evaluate
00182 (const PhraseBasedFeatureContext& context,
00183 ScoreComponentCollection* accumulator) const
00184 {
00185 accumulator->PlusEquals( this,
00186 GetFromCacheOrScorePhrase(context.GetTargetPhrase()) );
00187 }
00188
00189 bool GlobalLexicalModel::IsUseable(const FactorMask &mask) const
00190 {
00191 for (size_t i = 0; i < m_outputFactors.size(); ++i) {
00192 if (m_outputFactors[i]) {
00193 if (!mask[i]) {
00194 return false;
00195 }
00196 }
00197 }
00198
00199 return true;
00200 }
00201
00202 }