00001 #include <limits>
00002 #include <iostream>
00003 #include <fstream>
00004
00005 #include "moses/FactorCollection.h"
00006 #include "moses/Phrase.h"
00007 #include "moses/InputFileStream.h"
00008 #include "moses/StaticData.h"
00009 #include "ORLM.h"
00010
00011 using namespace std;
00012
00013 namespace Moses
00014 {
00015 bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
00016 size_t nGramOrder)
00017 {
00018 cerr << "Loading LanguageModelORLM..." << endl;
00019 m_filePath = filePath;
00020 m_factorType = factorType;
00021 m_nGramOrder = nGramOrder;
00022 FileHandler fLmIn(m_filePath, std::ios::in|std::ios::binary, true);
00023 m_lm = new OnlineRLM<T>(&fLmIn, m_nGramOrder);
00024 fLmIn.close();
00025
00026
00027 m_oov_id = m_lm->vocab_->GetWordID("<unk>");
00028 CreateFactors();
00029 return true;
00030 }
00031 void LanguageModelORLM::CreateFactors()
00032 {
00033 FactorCollection &factorCollection = FactorCollection::Instance();
00034 size_t maxFactorId = 0;
00035 std::map<size_t, wordID_t> m_lmids_map;
00036
00037 for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
00038 vIter != m_lm->vocab_->VocabEnd(); vIter++) {
00039
00040 size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
00041 m_lmids_map[factorId] = vIter->second;
00042 maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
00043 }
00044
00045 size_t factorId;
00046 m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, "<s>");
00047 factorId = m_sentenceStart->GetId();
00048 maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
00049 m_sentenceStartWord[m_factorType] = m_sentenceStart;
00050
00051 m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, "</s>");
00052 factorId = m_sentenceEnd->GetId();
00053 maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
00054 m_sentenceEndWord[m_factorType] = m_sentenceEnd;
00055
00056 lm_ids_vec_.resize(maxFactorId+1);
00057
00058 fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);
00059
00060 for (map<size_t, wordID_t>::const_iterator iter = m_lmids_map.begin();
00061 iter != m_lmids_map.end() ; ++iter)
00062 lm_ids_vec_[iter->first] = iter->second;
00063 }
00064 wordID_t LanguageModelORLM::GetLmID(const std::string& str) const
00065 {
00066 return m_lm->vocab_->GetWordID(str);
00067 }
00068 wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const
00069 {
00070 size_t factorId = factor->GetId();
00071 return (factorId >= lm_ids_vec_.size()) ? m_oov_id : lm_ids_vec_[factorId];
00072 }
00073 LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
00074 State* finalState) const
00075 {
00076 FactorType factorType = GetFactorType();
00077
00078
00079
00080 wordID_t ngram[MAX_NGRAM_SIZE];
00081 int count = contextFactor.size();
00082 for (int i = 0; i < count; i++) {
00083 ngram[i] = GetLmID((*contextFactor[i])[factorType]);
00084
00085 }
00086
00087 LMResult ret;
00088 ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, finalState)));
00089 ret.unknown = count && (ngram[count - 1] == m_oov_id);
00090
00091
00092
00093
00094
00095 return ret;
00096 }
00097 bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value)
00098 {
00099
00100
00101
00102
00103 m_lm->vocab_->MakeOpen();
00104 bool res = m_lm->update(ngram, value);
00105 m_lm->vocab_->MakeClosed();
00106 return res;
00107 }
00108 }