00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <limits>
00021 #include <iostream>
00022 #include <fstream>
00023 #include <string>
00024 #include <vector>
00025
00026 #include "SingleFactor.h"
00027 #include "RandLM.h"
00028 #include "Rand.h"
00029 #include "moses/Factor.h"
00030 #include "moses/Util.h"
00031 #include "moses/FactorCollection.h"
00032 #include "moses/Phrase.h"
00033 #include "moses/InputFileStream.h"
00034 #include "moses/StaticData.h"
00035 #include "util/check.hh"
00036
00037
00038 namespace Moses
00039 {
00040 namespace
00041 {
00042 using namespace std;
00043
00044 class LanguageModelRandLM : public LanguageModelSingleFactor
00045 {
00046 public:
00047 LanguageModelRandLM(const std::string &line)
00048 :LanguageModelSingleFactor("RandLM", line)
00049 , m_lm(0)
00050 {}
00051 bool Load(const std::string &filePath, FactorType factorType, size_t nGramOrder);
00052 virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL) const;
00053 ~LanguageModelRandLM() {
00054 delete m_lm;
00055 }
00056 void InitializeForInput(InputType const& source) {
00057 m_lm->initThreadSpecificData();
00058 }
00059 void CleanUpAfterSentenceProcessing(const InputType& source) {
00060 m_lm->clearCaches();
00061 }
00062 protected:
00063 std::vector<randlm::WordID> m_randlm_ids_vec;
00064 randlm::RandLM* m_lm;
00065 randlm::WordID m_oov_id;
00066 void CreateFactors(FactorCollection &factorCollection);
00067 randlm::WordID GetLmID( const std::string &str ) const;
00068 randlm::WordID GetLmID( const Factor *factor ) const {
00069 size_t factorId = factor->GetId();
00070 return ( factorId >= m_randlm_ids_vec.size()) ? m_oov_id : m_randlm_ids_vec[factorId];
00071 };
00072
00073 };
00074
00075
00076 bool LanguageModelRandLM::Load(const std::string &filePath, FactorType factorType,
00077 size_t nGramOrder)
00078 {
00079 cerr << "Loading LanguageModelRandLM..." << endl;
00080 FactorCollection &factorCollection = FactorCollection::Instance();
00081 m_filePath = filePath;
00082 m_factorType = factorType;
00083 m_nGramOrder = nGramOrder;
00084 int cache_MB = 50;
00085 m_lm = randlm::RandLM::initRandLM(filePath, nGramOrder, cache_MB);
00086 CHECK(m_lm != NULL);
00087
00088 m_oov_id = m_lm->getWordID(m_lm->getOOV());
00089 CreateFactors(factorCollection);
00090 m_lm->initThreadSpecificData();
00091 return true;
00092 }
00093
00094 void LanguageModelRandLM::CreateFactors(FactorCollection &factorCollection)
00095 {
00096
00097
00098 std::map<size_t, randlm::WordID> randlm_ids_map;
00099 size_t maxFactorId = 0;
00100 for(std::map<randlm::Word, randlm::WordID>::const_iterator vIter = m_lm->vocabStart();
00101 vIter != m_lm->vocabEnd(); vIter++) {
00102
00103 size_t factorId=factorCollection.AddFactor(Output,m_factorType,vIter->first)->GetId();
00104 randlm_ids_map[factorId] = vIter->second;
00105 maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
00106 }
00107
00108 size_t factorId;
00109 m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, m_lm->getBOS());
00110 factorId = m_sentenceStart->GetId();
00111 maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
00112 m_sentenceStartWord[m_factorType] = m_sentenceStart;
00113
00114 m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, m_lm->getEOS());
00115 factorId = m_sentenceEnd->GetId();
00116 maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
00117 m_sentenceEndWord[m_factorType] = m_sentenceEnd;
00118
00119
00120 m_randlm_ids_vec.resize(maxFactorId+1);
00121
00122 fill(m_randlm_ids_vec.begin(), m_randlm_ids_vec.end(), m_oov_id);
00123
00124 for (map<size_t, randlm::WordID>::const_iterator iter = randlm_ids_map.begin();
00125 iter != randlm_ids_map.end() ; ++iter)
00126 m_randlm_ids_vec[iter->first] = iter->second;
00127
00128 }
00129
00130 randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
00131 {
00132 return m_lm->getWordID(str);
00133 }
00134
00135 LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
00136 State* finalState) const
00137 {
00138 FactorType factorType = GetFactorType();
00139
00140 randlm::WordID ngram[MAX_NGRAM_SIZE];
00141 int count = contextFactor.size();
00142 for (int i = 0 ; i < count ; i++) {
00143 ngram[i] = GetLmID((*contextFactor[i])[factorType]);
00144
00145 }
00146 int found = 0;
00147 LMResult ret;
00148 ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, &found, finalState)));
00149 ret.unknown = count && (ngram[count - 1] == m_oov_id);
00150
00151
00152
00153
00154 return ret;
00155 }
00156
00157 }
00158
00159 }
00160