00001 #ifndef LM_MODEL__
00002 #define LM_MODEL__
00003
00004 #include "lm/bhiksha.hh"
00005 #include "lm/binary_format.hh"
00006 #include "lm/config.hh"
00007 #include "lm/facade.hh"
00008 #include "lm/quantize.hh"
00009 #include "lm/search_hashed.hh"
00010 #include "lm/search_trie.hh"
00011 #include "lm/state.hh"
00012 #include "lm/value.hh"
00013 #include "lm/vocab.hh"
00014 #include "lm/weights.hh"
00015
00016 #include "util/murmur_hash.hh"
00017
00018 #include <algorithm>
00019 #include <vector>
00020
00021 #include <string.h>
00022
00023 namespace util { class FilePiece; }
00024
00025 namespace lm {
00026 namespace ngram {
00027 namespace detail {
00028
00029
00030
00031 template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
00032 private:
00033 typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P;
00034 public:
00035
00036 static const ModelType kModelType;
00037
00038 static const unsigned int kVersion = Search::kVersion;
00039
00040
00041
00042
00043
00044 static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
00045
00046
00047
00048
00049
00050
00051
00052 explicit GenericModel(const char *file, const Config &config = Config());
00053
00054
00055
00056
00057
00058 FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
00059
00060
00061
00062
00063
00064
00065
00066
00067 FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077 void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
00078
00079
00080
00081
00082
00083 FullScoreReturn ExtendLeft(
00084
00085 const WordIndex *add_rbegin, const WordIndex *add_rend,
00086
00087 const float *backoff_in,
00088
00089 uint64_t extend_pointer,
00090
00091 unsigned char extend_length,
00092
00093 float *backoff_out,
00094
00095 unsigned char &next_use) const;
00096
00097
00098
00099
00100
00101 float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const {
00102
00103 return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0;
00104 }
00105
00106 private:
00107 friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);
00108
00109 static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
00110
00111 FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;
00112
00113
00114 void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const;
00115
00116
00117 void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config);
00118
00119 void InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd);
00120
00121 void InitializeFromARPA(const char *file, const Config &config);
00122
00123 float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;
00124
00125 Backing &MutableBacking() { return backing_; }
00126
00127 Backing backing_;
00128
00129 VocabularyT vocab_;
00130
00131 Search search_;
00132 };
00133
00134 }
00135
00136
00137
// Expands to a single comma.  Lets a template-id with several template
// arguments travel through one macro parameter, since a bare comma would
// otherwise be read as a macro argument separator.
#define LM_COMMA() ,

// Declares class `name` as a public subclass of `from` whose only member is a
// constructor forwarding (file, config) to `from`.  The expansion already
// ends with ';'.
#define LM_NAME_MODEL(name, from)                             \
  class name : public from {                                  \
   public:                                                    \
    name(const char *file, const Config &config = Config())   \
        : from(file, config) {}                               \
  };
00144
00145 LM_NAME_MODEL(ProbingModel, detail::GenericModel<detail::HashedSearch<BackoffValue> LM_COMMA() ProbingVocabulary>);
00146 LM_NAME_MODEL(RestProbingModel, detail::GenericModel<detail::HashedSearch<RestValue> LM_COMMA() ProbingVocabulary>);
00147 LM_NAME_MODEL(TrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>);
00148 LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>);
00149 LM_NAME_MODEL(QuantTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>);
00150 LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>);
00151
00152
00153 typedef ::lm::ngram::ProbingVocabulary Vocabulary;
00154 typedef ProbingModel Model;
00155
00156 }
00157 }
00158
00159 #endif // LM_MODEL__