00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include <limits>
00027 #include <iostream>
00028 #include <fstream>
00029
00030 #include "Rand.h"
00031 #include "moses/Factor.h"
00032 #include "moses/Util.h"
00033 #include "moses/FactorCollection.h"
00034 #include "moses/Phrase.h"
00035 #include "moses/InputFileStream.h"
00036 #include "moses/StaticData.h"
00037 #include "RandLM.h"
00038
00039 using namespace std;
00040
00041 namespace Moses
00042 {
00043
00044 LanguageModelRandLM::LanguageModelRandLM(const std::string &line)
00045 :LanguageModelSingleFactor(line)
00046 , m_lm(0)
00047 {
00048 }
00049
00050 LanguageModelRandLM::~LanguageModelRandLM()
00051 {
00052 delete m_lm;
00053 }
00054
00055 void LanguageModelRandLM::Load(AllOptions::ptr const& opts)
00056 {
00057 cerr << "Loading LanguageModelRandLM..." << endl;
00058 FactorCollection &factorCollection = FactorCollection::Instance();
00059 int cache_MB = 50;
00060 m_lm = randlm::RandLM::initRandLM(m_filePath, m_nGramOrder, cache_MB);
00061 UTIL_THROW_IF2(m_lm == NULL, "RandLM object not created");
00062
00063 m_oov_id = m_lm->getWordID(m_lm->getOOV());
00064 CreateFactors(factorCollection);
00065 m_lm->initThreadSpecificData();
00066 }
00067
00068 void LanguageModelRandLM::CreateFactors(FactorCollection &factorCollection)
00069 {
00070
00071
00072 std::map<size_t, randlm::WordID> randlm_ids_map;
00073 size_t maxFactorId = 0;
00074 for(std::map<randlm::Word, randlm::WordID>::const_iterator vIter = m_lm->vocabStart();
00075 vIter != m_lm->vocabEnd(); vIter++) {
00076
00077 size_t factorId=factorCollection.AddFactor(Output,m_factorType,vIter->first)->GetId();
00078 randlm_ids_map[factorId] = vIter->second;
00079 maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
00080 }
00081
00082 size_t factorId;
00083 m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, m_lm->getBOS());
00084 factorId = m_sentenceStart->GetId();
00085 maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
00086 m_sentenceStartWord[m_factorType] = m_sentenceStart;
00087
00088 m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, m_lm->getEOS());
00089 factorId = m_sentenceEnd->GetId();
00090 maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
00091 m_sentenceEndWord[m_factorType] = m_sentenceEnd;
00092
00093
00094 m_randlm_ids_vec.resize(maxFactorId+1);
00095
00096 fill(m_randlm_ids_vec.begin(), m_randlm_ids_vec.end(), m_oov_id);
00097
00098 for (map<size_t, randlm::WordID>::const_iterator iter = randlm_ids_map.begin();
00099 iter != randlm_ids_map.end() ; ++iter)
00100 m_randlm_ids_vec[iter->first] = iter->second;
00101
00102 }
00103
00104 randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
00105 {
00106 return m_lm->getWordID(str);
00107 }
00108
00109 randlm::WordID LanguageModelRandLM::GetLmID( const Factor *factor ) const
00110 {
00111 size_t factorId = factor->GetId();
00112 return ( factorId >= m_randlm_ids_vec.size()) ? m_oov_id : m_randlm_ids_vec[factorId];
00113 }
00114
00115 LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
00116 State* finalState) const
00117 {
00118 FactorType factorType = GetFactorType();
00119
00120 randlm::WordID ngram[MAX_NGRAM_SIZE];
00121 int count = contextFactor.size();
00122 for (int i = 0 ; i < count ; i++) {
00123 ngram[i] = GetLmID((*contextFactor[i])[factorType]);
00124
00125 }
00126 int found = 0;
00127 LMResult ret;
00128 ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, &found, finalState)));
00129 ret.unknown = count && (ngram[count - 1] == m_oov_id);
00130
00131
00132
00133
00134 return ret;
00135 }
00136
00137 void LanguageModelRandLM::InitializeForInput(ttasksptr const& ttask)
00138 {
00139 m_lm->initThreadSpecificData();
00140 }
00141 void LanguageModelRandLM::CleanUpAfterSentenceProcessing(const InputType& source)
00142 {
00143 m_lm->clearCaches();
00144 }
00145
00146 }
00147