00001 #include <utility>
00002 #include "moses/StaticData.h"
00003 #include "moses/InputFileStream.h"
00004 #include "DynamicCacheBasedLanguageModel.h"
00005
00006 namespace Moses
00007 {
00008
00009 std::map< const std::string, DynamicCacheBasedLanguageModel * > DynamicCacheBasedLanguageModel::s_instance_map;
00010 DynamicCacheBasedLanguageModel *DynamicCacheBasedLanguageModel::s_instance = NULL;
00011
00012 DynamicCacheBasedLanguageModel::DynamicCacheBasedLanguageModel(const std::string &line)
00013 : StatelessFeatureFunction(1, line)
00014 {
00015 VERBOSE(2,"Initializing DynamicCacheBasedLanguageModel feature..." << std::endl);
00016
00017 m_query_type = CBLM_QUERY_TYPE_ALLSUBSTRINGS;
00018 m_score_type = CBLM_SCORE_TYPE_HYPERBOLA;
00019 m_maxAge = 1000;
00020 m_name = "default";
00021 m_constant = false;
00022
00023 ReadParameters();
00024 UTIL_THROW_IF2(s_instance_map.find(m_name) != s_instance_map.end(), "Only 1 DynamicCacheBasedLanguageModel feature named " + m_name + " is allowed");
00025 s_instance_map[m_name] = this;
00026 s_instance = this;
00027
00028 SetPreComputedScores();
00029 }
00030
00031 DynamicCacheBasedLanguageModel::~DynamicCacheBasedLanguageModel() {};
00032
00033 void DynamicCacheBasedLanguageModel::SetPreComputedScores()
00034 {
00035 #ifdef WITH_THREADS
00036 boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
00037 #endif
00038 precomputedScores.clear();
00039 for (unsigned int i=0; i<m_maxAge; i++) {
00040 precomputedScores.push_back(decaying_score(i));
00041 }
00042
00043 if ( m_score_type == CBLM_SCORE_TYPE_HYPERBOLA
00044 || m_score_type == CBLM_SCORE_TYPE_POWER
00045 || m_score_type == CBLM_SCORE_TYPE_EXPONENTIAL
00046 || m_score_type == CBLM_SCORE_TYPE_COSINE ) {
00047 precomputedScores.push_back(decaying_score(m_maxAge));
00048 } else {
00049 precomputedScores.push_back(0.0);
00050 }
00051 m_lower_score = precomputedScores[m_maxAge];
00052 VERBOSE(3, "SetPreComputedScores(): lower_age:|" << m_maxAge << "| lower_score:|" << m_lower_score << "|" << std::endl);
00053 }
00054
00055 float DynamicCacheBasedLanguageModel::GetPreComputedScores(const unsigned int age)
00056 {
00057 VERBOSE(2, "float DynamicCacheBasedLanguageModel::GetPreComputedScores" << std::endl);
00058 VERBOSE(2, "age:|"<< age << "|" << std::endl);
00059
00060 if (age < m_maxAge) {
00061 return precomputedScores.at(age);
00062 } else {
00063 VERBOSE(2, "is to big reduced to m)_maxAge:|"<< m_maxAge << "|" << std::endl);
00064 return precomputedScores.at(m_maxAge);
00065 }
00066 }
00067
00068 void DynamicCacheBasedLanguageModel::SetParameter(const std::string& key, const std::string& value)
00069 {
00070 VERBOSE(2, "DynamicCacheBasedLanguageModel::SetParameter key:|" << key << "| value:|" << value << "|" << std::endl);
00071 if (key == "cblm-query-type") {
00072 SetQueryType(Scan<size_t>(value));
00073 } else if (key == "cblm-score-type") {
00074 SetScoreType(Scan<size_t>(value));
00075 } else if (key == "cblm-max-age") {
00076 SetMaxAge(Scan<unsigned int>(value));
00077 } else if (key == "cblm-file") {
00078 m_initfiles = Scan<std::string>(value);
00079 } else if (key == "cblm-name") {
00080 m_name = Scan<std::string>(value);
00081 } else if (key == "cblm-constant") {
00082 m_constant = Scan<bool>(value);
00083 } else {
00084 StatelessFeatureFunction::SetParameter(key, value);
00085 }
00086 }
00087
00088 void DynamicCacheBasedLanguageModel::EvaluateInIsolation(const Phrase &sp
00089 , const TargetPhrase &tp
00090 , ScoreComponentCollection &scoreBreakdown
00091 , ScoreComponentCollection &estimatedScores) const
00092 {
00093 float score = m_lower_score;
00094 switch(m_query_type) {
00095 case CBLM_QUERY_TYPE_WHOLESTRING:
00096 score = Evaluate_Whole_String(tp);
00097 break;
00098 case CBLM_QUERY_TYPE_ALLSUBSTRINGS:
00099 score = Evaluate_All_Substrings(tp);
00100 break;
00101 default:
00102 UTIL_THROW_IF2(false, "This score type (" << m_query_type << ") is unknown.");
00103 }
00104
00105 scoreBreakdown.Assign(this, score);
00106 }
00107
00108 float DynamicCacheBasedLanguageModel::Evaluate_Whole_String(const TargetPhrase& tp) const
00109 {
00110
00111
00112
00113
00114 decaying_cache_t::const_iterator it;
00115 float score = m_lower_score;
00116
00117 std::string w = "";
00118 size_t endpos = tp.GetSize();
00119 for (size_t pos = 0 ; pos < endpos ; ++pos) {
00120 w += tp.GetWord(pos).GetFactor(0)->GetString().as_string();
00121 if (pos < endpos - 1) {
00122 w += " ";
00123 }
00124 }
00125 it = m_cache.find(w);
00126
00127 VERBOSE(4,"cblm::Evaluate_Whole_String: searching w:|" << w << "|" << std::endl);
00128 if (it != m_cache.end()) {
00129 score = ((*it).second).second;
00130 VERBOSE(4,"cblm::Evaluate_Whole_String: found w:|" << w << "|" << std::endl);
00131 }
00132
00133 VERBOSE(4,"cblm::Evaluate_Whole_String: returning score:|" << score << "|" << std::endl);
00134 return score;
00135 }
00136
00137 float DynamicCacheBasedLanguageModel::Evaluate_All_Substrings(const TargetPhrase& tp) const
00138 {
00139
00140
00141
00142
00143 decaying_cache_t::const_iterator it;
00144 float score = 0.0;
00145
00146 for (size_t startpos = 0 ; startpos < tp.GetSize() ; ++startpos) {
00147 std::string w = "";
00148 for (size_t endpos = startpos; endpos < tp.GetSize() ; ++endpos) {
00149 w += tp.GetWord(endpos).GetFactor(0)->GetString().as_string();
00150 it = m_cache.find(w);
00151
00152 if (it != m_cache.end()) {
00153 score += ((*it).second).second;
00154 VERBOSE(3,"cblm::Evaluate_All_Substrings: found w:|" << w << "| actual score:|" << ((*it).second).second << "| score:|" << score << "|" << std::endl);
00155 } else {
00156 score += m_lower_score;
00157 }
00158
00159 if (endpos == startpos) {
00160 w += " ";
00161 }
00162
00163 }
00164 }
00165 VERBOSE(3,"cblm::Evaluate_All_Substrings: returning score:|" << score << "|" << std::endl);
00166 return score;
00167 }
00168
00169 void DynamicCacheBasedLanguageModel::Print() const
00170 {
00171 #ifdef WITH_THREADS
00172 boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
00173 #endif
00174 decaying_cache_t::const_iterator it;
00175 std::cout << "Content of the cache of Cache-Based Language Model" << std::endl;
00176 std::cout << "Size of the cache of Cache-Based Language Model:|" << m_cache.size() << "|" << std::endl;
00177 for ( it=m_cache.begin() ; it != m_cache.end(); it++ ) {
00178 std::cout << "word:|" << (*it).first << "| age:|" << ((*it).second).first << "| score:|" << ((*it).second).second << "|" << std::endl;
00179 }
00180 }
00181
00182 void DynamicCacheBasedLanguageModel::Decay()
00183 {
00184 #ifdef WITH_THREADS
00185 boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
00186 #endif
00187 decaying_cache_t::iterator it;
00188
00189 unsigned int age;
00190 float score;
00191 for ( it=m_cache.begin() ; it != m_cache.end(); it++ ) {
00192 age=((*it).second).first + 1;
00193 if (age > m_maxAge) {
00194 m_cache.erase(it);
00195 it--;
00196 } else {
00197 score = GetPreComputedScores(age);
00198
00199 decaying_cache_value_t p (age, score);
00200 (*it).second = p;
00201 }
00202 }
00203 }
00204
00205 void DynamicCacheBasedLanguageModel::Update(std::vector<std::string> words, int age)
00206 {
00207 #ifdef WITH_THREADS
00208 boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
00209 #endif
00210 VERBOSE(3,"words.size():|" << words.size() << "|" << std::endl);
00211 for (size_t j=0; j<words.size(); j++) {
00212 words[j] = Trim(words[j]);
00213
00214
00215 VERBOSE(3,"CacheBasedLanguageModel::Update word[" << j << "]:"<< words[j] << " age:" << age << " GetPreComputedScores(age):" << GetPreComputedScores(age) << std::endl);
00216 decaying_cache_value_t p (age,GetPreComputedScores(age));
00217 std::pair<std::string, decaying_cache_value_t> e (words[j],p);
00218 m_cache.erase(words[j]);
00219 m_cache.insert(e);
00220 }
00221 }
00222
00223 void DynamicCacheBasedLanguageModel::ClearEntries(std::string &entries)
00224 {
00225 if (entries != "") {
00226 VERBOSE(3,"entries:|" << entries << "|" << std::endl);
00227 std::vector<std::string> elements = TokenizeMultiCharSeparator(entries, "||");
00228 VERBOSE(3,"elements.size() after:|" << elements.size() << "|" << std::endl);
00229 ClearEntries(elements);
00230 }
00231 }
00232
00233 void DynamicCacheBasedLanguageModel::ClearEntries(std::vector<std::string> words)
00234 {
00235 #ifdef WITH_THREADS
00236 boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
00237 #endif
00238 VERBOSE(3,"words.size():|" << words.size() << "|" << std::endl);
00239 for (size_t j=0; j<words.size(); j++) {
00240 words[j] = Trim(words[j]);
00241 VERBOSE(3,"CacheBasedLanguageModel::ClearEntries word[" << j << "]:"<< words[j] << std::endl);
00242 m_cache.erase(words[j]);
00243 }
00244 }
00245
00246 void DynamicCacheBasedLanguageModel::Insert(std::string &entries)
00247 {
00248 if (entries != "") {
00249 VERBOSE(3,"entries:|" << entries << "|" << std::endl);
00250 std::vector<std::string> elements = TokenizeMultiCharSeparator(entries, "||");
00251 VERBOSE(3,"elements.size() after:|" << elements.size() << "|" << std::endl);
00252 Insert(elements);
00253 }
00254 }
00255
00256 void DynamicCacheBasedLanguageModel::Insert(std::vector<std::string> ngrams)
00257 {
00258 VERBOSE(3,"DynamicCacheBasedLanguageModel Insert ngrams.size():|" << ngrams.size() << "|" << std::endl);
00259 if (m_constant == false) {
00260 Decay();
00261 }
00262 Update(ngrams,1);
00263 IFVERBOSE(3) Print();
00264 }
00265
00266 void DynamicCacheBasedLanguageModel::ExecuteDlt(std::map<std::string, std::string> dlt_meta)
00267 {
00268 if (dlt_meta.find("cblm") != dlt_meta.end()) {
00269 Insert(dlt_meta["cblm"]);
00270 }
00271 if (dlt_meta.find("cblm-command") != dlt_meta.end()) {
00272 Execute(dlt_meta["cblm-command"]);
00273 }
00274 if (dlt_meta.find("cblm-file") != dlt_meta.end()) {
00275 Load(dlt_meta["cblm-file"]);
00276 }
00277 if (dlt_meta.find("cblm-clear-entries") != dlt_meta.end()) {
00278 ClearEntries(dlt_meta["cblm-clear-entries"]);
00279 }
00280 if (dlt_meta.find("cblm-clear-all") != dlt_meta.end()) {
00281 Clear();
00282 }
00283
00284 }
00285
00286 void DynamicCacheBasedLanguageModel::Execute(std::string command)
00287 {
00288 VERBOSE(2,"DynamicCacheBasedLanguageModel::Execute(std::string command:|" << command << "|" << std::endl);
00289 std::vector<std::string> commands = Tokenize(command, "||");
00290 Execute(commands);
00291 }
00292
00293 void DynamicCacheBasedLanguageModel::Execute(std::vector<std::string> commands)
00294 {
00295 for (size_t j=0; j<commands.size(); j++) {
00296 Execute_Single_Command(commands[j]);
00297 }
00298 IFVERBOSE(2) Print();
00299 }
00300
00301 void DynamicCacheBasedLanguageModel::Execute_Single_Command(std::string command)
00302 {
00303 VERBOSE(2,"CacheBasedLanguageModel::Execute_Single_Command(std::string command:|" << command << "|" << std::endl);
00304 if (command == "clear") {
00305 VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "|. Cache cleared." << std::endl);
00306 Clear();
00307 } else if (command == "settype_wholestring") {
00308 VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "|. Query type set to " << CBLM_QUERY_TYPE_WHOLESTRING << " (CBLM_QUERY_TYPE_WHOLESTRING)." << std::endl);
00309 SetQueryType(CBLM_QUERY_TYPE_WHOLESTRING);
00310 } else if (command == "settype_allsubstrings") {
00311 VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "|. Query type set to " << CBLM_QUERY_TYPE_ALLSUBSTRINGS << " (CBLM_QUERY_TYPE_ALLSUBSTRINGS)." << std::endl);
00312 SetQueryType(CBLM_QUERY_TYPE_ALLSUBSTRINGS);
00313 } else {
00314 VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "| is unknown. Skipped." << std::endl);
00315 }
00316 }
00317
00318 void DynamicCacheBasedLanguageModel::Clear()
00319 {
00320 #ifdef WITH_THREADS
00321 boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
00322 #endif
00323 m_cache.clear();
00324 }
00325
00326 void DynamicCacheBasedLanguageModel::Load(AllOptions::ptr const& opts)
00327 {
00328 m_options = opts;
00329
00330 VERBOSE(2,"DynamicCacheBasedLanguageModel::Load()" << std::endl);
00331 Load(m_initfiles);
00332 }
00333
00334 void DynamicCacheBasedLanguageModel::Load(const std::string filestr)
00335 {
00336 VERBOSE(2,"DynamicCacheBasedLanguageModel::Load(const std::string filestr)" << std::endl);
00337
00338 std::vector<std::string> files = Tokenize(filestr, "||");
00339 Load_Multiple_Files(files);
00340 }
00341
00342
00343 void DynamicCacheBasedLanguageModel::Load_Multiple_Files(std::vector<std::string> files)
00344 {
00345 VERBOSE(2,"DynamicCacheBasedLanguageModel::Load_Multiple_Files(std::vector<std::string> files)" << std::endl);
00346 for(size_t j = 0; j < files.size(); ++j) {
00347 Load_Single_File(files[j]);
00348 }
00349 }
00350
00351 void DynamicCacheBasedLanguageModel::Load_Single_File(const std::string file)
00352 {
00353 VERBOSE(2,"DynamicCacheBasedLanguageModel::Load_Single_File(const std::string file)" << std::endl);
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365 VERBOSE(2,"Loading data from the cache file " << file << std::endl);
00366 InputFileStream cacheFile(file);
00367
00368 std::string line;
00369 int age;
00370 std::vector<std::string> words;
00371
00372 while (getline(cacheFile, line)) {
00373 std::vector<std::string> vecStr = TokenizeMultiCharSeparator( line , "||" );
00374 if (vecStr.size() >= 2) {
00375 age = Scan<int>(vecStr[0]);
00376 vecStr.erase(vecStr.begin());
00377 Update(vecStr,age);
00378 } else {
00379 UTIL_THROW_IF2(false, "The format of the loaded file is wrong: " << line);
00380 }
00381 }
00382 IFVERBOSE(2) Print();
00383 }
00384
00385 void DynamicCacheBasedLanguageModel::SetQueryType(size_t type)
00386 {
00387 #ifdef WITH_THREADS
00388 boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
00389 #endif
00390
00391 m_query_type = type;
00392 if ( m_query_type != CBLM_QUERY_TYPE_WHOLESTRING
00393 && m_query_type != CBLM_QUERY_TYPE_ALLSUBSTRINGS ) {
00394 VERBOSE(2, "This query type " << m_query_type << " is unknown. Instead used " << CBLM_QUERY_TYPE_ALLSUBSTRINGS << "." << std::endl);
00395 m_query_type = CBLM_QUERY_TYPE_ALLSUBSTRINGS;
00396 }
00397 VERBOSE(2, "CacheBasedLanguageModel QueryType: " << m_query_type << std::endl);
00398
00399 };
00400
00401 void DynamicCacheBasedLanguageModel::SetScoreType(size_t type)
00402 {
00403 #ifdef WITH_THREADS
00404 boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
00405 #endif
00406 m_score_type = type;
00407 if ( m_score_type != CBLM_SCORE_TYPE_HYPERBOLA
00408 && m_score_type != CBLM_SCORE_TYPE_POWER
00409 && m_score_type != CBLM_SCORE_TYPE_EXPONENTIAL
00410 && m_score_type != CBLM_SCORE_TYPE_COSINE
00411 && m_score_type != CBLM_SCORE_TYPE_HYPERBOLA_REWARD
00412 && m_score_type != CBLM_SCORE_TYPE_POWER_REWARD
00413 && m_score_type != CBLM_SCORE_TYPE_EXPONENTIAL_REWARD ) {
00414 VERBOSE(2, "This score type " << m_score_type << " is unknown. Instead used " << CBLM_SCORE_TYPE_HYPERBOLA << "." << std::endl);
00415 m_score_type = CBLM_SCORE_TYPE_HYPERBOLA;
00416 }
00417 VERBOSE(2, "CacheBasedLanguageModel ScoreType: " << m_score_type << std::endl);
00418 };
00419
00420 void DynamicCacheBasedLanguageModel::SetMaxAge(unsigned int age)
00421 {
00422 #ifdef WITH_THREADS
00423 boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
00424 #endif
00425 m_maxAge = age;
00426 VERBOSE(2, "CacheBasedLanguageModel MaxAge: " << m_maxAge << std::endl);
00427 };
00428
00429 float DynamicCacheBasedLanguageModel::decaying_score(const unsigned int age)
00430 {
00431 float sc;
00432 switch(m_score_type) {
00433 case CBLM_SCORE_TYPE_HYPERBOLA:
00434 sc = (float) 1.0/age - 1.0;
00435 break;
00436 case CBLM_SCORE_TYPE_POWER:
00437 sc = (float) pow(age, -0.25) - 1.0;
00438 break;
00439 case CBLM_SCORE_TYPE_EXPONENTIAL:
00440 sc = (age == 1) ? 0.0 : (float) exp( 1.0/age ) / exp(1.0) - 1.0;
00441 break;
00442 case CBLM_SCORE_TYPE_COSINE:
00443 sc = (float) cos( (age-1) * (PI/2) / m_maxAge ) - 1.0;
00444 break;
00445 case CBLM_SCORE_TYPE_HYPERBOLA_REWARD:
00446 sc = (float) 1.0/age;
00447 break;
00448 case CBLM_SCORE_TYPE_POWER_REWARD:
00449 sc = (float) pow(age, -0.25);
00450 break;
00451 case CBLM_SCORE_TYPE_EXPONENTIAL_REWARD:
00452 sc = (age == 1) ? 1.0 : (float) exp( 1.0/age ) / exp(1.0);
00453 break;
00454 default:
00455 sc = -1.0;
00456 }
00457 return sc;
00458 }
00459 }