Moses: /disk4/html/www/moses/doxygen/mosesdecoder/lm/search

00001 /* This is where the trie is built.  It's on-disk.  */
00002 #include "lm/search_trie.hh"
00003 
00004 #include "lm/bhiksha.hh"
00005 #include "lm/binary_format.hh"
00006 #include "lm/blank.hh"
00007 #include "lm/lm_exception.hh"
00008 #include "lm/max_order.hh"
00009 #include "lm/quantize.hh"
00010 #include "lm/trie.hh"
00011 #include "lm/trie_sort.hh"
00012 #include "lm/vocab.hh"
00013 #include "lm/weights.hh"
00014 #include "lm/word_index.hh"
00015 #include "util/ersatz_progress.hh"
00016 #include "util/mmap.hh"
00017 #include "util/proxy_iterator.hh"
00018 #include "util/scoped.hh"
00019 #include "util/sized_iterator.hh"
00020 
00021 #include <algorithm>
00022 #include <cstring>
00023 #include <cstdio>
00024 #include <cstdlib>
00025 #include <queue>
00026 #include <limits>
00027 #include <numeric>
00028 #include <vector>
00029 
00030 #if defined(_WIN32) || defined(_WIN64)
00031 #include <windows.h>
00032 #endif
00033 
00034 namespace lm {
00035 namespace ngram {
00036 namespace trie {
00037 namespace {
00038 
// Reads one item of `size` bytes from `from` into `data`.
// Raises util::ErrnoException with the text "Short read" (through
// UTIL_THROW_IF) whenever fread does not report exactly one item read.
00039 void ReadOrThrow(FILE *from, void *data, size_t size) {
00040   UTIL_THROW_IF(1 != std::fread(data, size, 1, from), util::ErrnoException, "Short read");
00041 }
00042 
00043 int Compare(unsigned char order, const void *first_void, const void *second_void) {
00044   const WordIndex *first = reinterpret_cast<const WordIndex*>(first_void), *second = reinterpret_cast<const WordIndex*>(second_void);
00045   const WordIndex *end = first + order;
00046   for (; first != end; ++first, ++second) {
00047     if (*first < *second) return -1;
00048     if (*first > *second) return 1;
00049   }
00050   return 0;
00051 }
00052 
// Address of one float held by SRISucks: `array` selects the per-order value
// vector (SRISucks::Send stores order - 1 here) and `index` is the position
// inside that vector.  BackoffMessages::Apply uses it as base[array][index].
00053 struct ProbPointer {
00054   unsigned char array;
00055   uint64_t index;
00056 };
00057 
00058 // Array of n-grams and float indices.
00059 class BackoffMessages {
00060   public:
00061     void Init(std::size_t entry_size) {
00062       current_ = NULL;
00063       allocated_ = NULL;
00064       entry_size_ = entry_size;
00065     }
00066 
00067     void Add(const WordIndex *to, ProbPointer index) {
00068       while (current_ + entry_size_ > allocated_) {
00069         std::size_t allocated_size = allocated_ - (uint8_t*)backing_.get();
00070         Resize(std::max<std::size_t>(allocated_size * 2, entry_size_));
00071       }
00072       memcpy(current_, to, entry_size_ - sizeof(ProbPointer));
00073       *reinterpret_cast<ProbPointer*>(current_ + entry_size_ - sizeof(ProbPointer)) = index;
00074       current_ += entry_size_;
00075     }
00076 
00077     void Apply(float *const *const base, FILE *unigrams) {
00078       FinishedAdding();
00079       if (current_ == allocated_) return;
00080       rewind(unigrams);
00081       ProbBackoff weights;
00082       WordIndex unigram = 0;
00083       ReadOrThrow(unigrams, &weights, sizeof(weights));
00084       for (; current_ != allocated_; current_ += entry_size_) {
00085         const WordIndex &cur_word = *reinterpret_cast<const WordIndex*>(current_);
00086         for (; unigram < cur_word; ++unigram) {
00087           ReadOrThrow(unigrams, &weights, sizeof(weights));
00088         }
00089         if (!HasExtension(weights.backoff)) {
00090           weights.backoff = kExtensionBackoff;
00091           UTIL_THROW_IF(fseek(unigrams, -sizeof(weights), SEEK_CUR), util::ErrnoException, "Seeking backwards to denote unigram extension failed.");
00092           util::WriteOrThrow(unigrams, &weights, sizeof(weights));
00093         }
00094         const ProbPointer &write_to = *reinterpret_cast<const ProbPointer*>(current_ + sizeof(WordIndex));
00095         base[write_to.array][write_to.index] += weights.backoff;
00096       }
00097       backing_.reset();
00098     }
00099 
00100     void Apply(float *const *const base, RecordReader &reader) {
00101       FinishedAdding();
00102       if (current_ == allocated_) return;
00103       // We'll also use the same buffer to record messages to blanks that they extend.
00104       WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_);
00105       const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex);
00106       for (reader.Rewind(); reader && (current_ != allocated_); ) {
00107         switch (Compare(order, reader.Data(), current_)) {
00108           case -1:
00109             ++reader;
00110             break;
00111           case 1:
00112             // Message but nobody to receive it.  Write it down at the beginning of the buffer so we can inform this blank that it extends.
00113             for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w;
00114             current_ += entry_size_;
00115             break;
00116           case 0:
00117             float &backoff = reinterpret_cast<ProbBackoff*>((uint8_t*)reader.Data() + order * sizeof(WordIndex))->backoff;
00118             if (!HasExtension(backoff)) {
00119               backoff = kExtensionBackoff;
00120               reader.Overwrite(&backoff, sizeof(float));
00121             } else {
00122               const ProbPointer &write_to = *reinterpret_cast<const ProbPointer*>(current_ + entry_size_ - sizeof(ProbPointer));
00123               base[write_to.array][write_to.index] += backoff;
00124             }
00125             current_ += entry_size_;
00126             break;
00127         }
00128       }
00129       // Now this is a list of blanks that extend right.
00130       entry_size_ = sizeof(WordIndex) * order;
00131       Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get()));
00132       current_ = (uint8_t*)backing_.get();
00133     }
00134 
00135     // Call after Apply
00136     bool Extends(unsigned char order, const WordIndex *words) {
00137       if (current_ == allocated_) return false;
00138       assert(order * sizeof(WordIndex) == entry_size_);
00139       while (true) {
00140         switch(Compare(order, words, current_)) {
00141           case 1:
00142             current_ += entry_size_;
00143             if (current_ == allocated_) return false;
00144             break;
00145           case -1:
00146             return false;
00147           case 0:
00148             return true;
00149         }
00150       }
00151     }
00152 
00153   private:
00154     void FinishedAdding() {
00155       Resize(current_ - (uint8_t*)backing_.get());
00156       // Sort requests in same order as files.
00157       std::sort(
00158           util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)),
00159           util::SizedIterator(util::SizedProxy(current_, entry_size_)),
00160           util::SizedCompare<EntryCompare>(EntryCompare((entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex))));
00161       current_ = (uint8_t*)backing_.get();
00162     }
00163 
00164     void Resize(std::size_t to) {
00165       std::size_t current = current_ - (uint8_t*)backing_.get();
00166       backing_.call_realloc(to);
00167       current_ = (uint8_t*)backing_.get() + current;
00168       allocated_ = (uint8_t*)backing_.get() + to;
00169     }
00170 
00171     util::scoped_malloc backing_;
00172 
00173     uint8_t *current_, *allocated_;
00174 
00175     std::size_t entry_size_;
00176 };
00177 
// Sentinel for "no usable probability".  BlankManager stores it in basis_
// slots that must not serve as a basis, and SRISucks::Send asserts that it is
// never handed this value.
00178 const float kBadProb = std::numeric_limits<float>::infinity();
00179 
00180 class SRISucks {
00181   public:
00182     SRISucks() {
00183       for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 1; ++i)
00184         i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1));
00185     }
00186 
00187     void Send(unsigned char begin, unsigned char order, const WordIndex *to, float prob_basis) {
00188       assert(prob_basis != kBadProb);
00189       ProbPointer pointer;
00190       pointer.array = order - 1;
00191       pointer.index = values_[order - 1].size();
00192       for (unsigned char i = begin; i < order; ++i) {
00193         messages_[i - 1].Add(to, pointer);
00194       }
00195       values_[order - 1].push_back(prob_basis);
00196     }
00197 
00198     void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) {
00199       for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) {
00200         it_[i] = values_[i].empty() ? NULL : &*values_[i].begin();
00201       }
00202       messages_[0].Apply(it_, unigram_file);
00203       BackoffMessages *messages = messages_ + 1;
00204       const RecordReader *end = reader + total_order - 2 /* exclude unigrams and longest order */;
00205       for (; reader != end; ++messages, ++reader) {
00206         messages->Apply(it_, *reader);
00207       }
00208     }
00209 
00210     ProbBackoff GetBlank(unsigned char total_order, unsigned char order, const WordIndex *indices) {
00211       assert(order > 1);
00212       ProbBackoff ret;
00213       ret.prob = *(it_[order - 1]++);
00214       ret.backoff = ((order != total_order - 1) && messages_[order - 1].Extends(order, indices)) ? kExtensionBackoff : kNoExtensionBackoff;
00215       return ret;
00216     }
00217 
00218     const std::vector<float> &Values(unsigned char order) const {
00219       return values_[order - 1];
00220     }
00221 
00222   private:
00223     // This used to be one array.  Then I needed to separate it by order for quantization to work.
00224     std::vector<float> values_[KENLM_MAX_ORDER - 1];
00225     BackoffMessages messages_[KENLM_MAX_ORDER - 1];
00226 
00227     float *it_[KENLM_MAX_ORDER - 1];
00228 };
00229 
00230 class FindBlanks {
00231   public:
00232     FindBlanks(unsigned char order, const ProbBackoff *unigrams, SRISucks &messages)
00233       : counts_(order), unigrams_(unigrams), sri_(messages) {}
00234 
00235     float UnigramProb(WordIndex index) const {
00236       return unigrams_[index].prob;
00237     }
00238 
00239     void Unigram(WordIndex /*index*/) {
00240       ++counts_[0];
00241     }
00242 
00243     void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char lower, float prob_basis) {
00244       sri_.Send(lower, order, indices + 1, prob_basis);
00245       ++counts_[order - 1];
00246     }
00247 
00248     void Middle(const unsigned char order, const void * /*data*/) {
00249       ++counts_[order - 1];
00250     }
00251 
00252     void Longest(const void * /*data*/) {
00253       ++counts_.back();
00254     }
00255 
00256     const std::vector<uint64_t> &Counts() const {
00257       return counts_;
00258     }
00259 
00260   private:
00261     std::vector<uint64_t> counts_;
00262 
00263     const ProbBackoff *unigrams_;
00264 
00265     SRISucks &sri_;
00266 };
00267 
00268 // Phase to actually write n-grams to the trie.
00269 template <class Quant, class Bhiksha> class WriteEntries {
00270   public:
00271     WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :
00272       contexts_(contexts),
00273       quant_(quant),
00274       unigrams_(unigrams),
00275       middle_(middle),
00276       longest_(longest),
00277       bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
00278       order_(order),
00279       sri_(sri) {}
00280 
00281     float UnigramProb(WordIndex index) const { return unigrams_[index].weights.prob; }
00282 
00283     void Unigram(WordIndex word) {
00284       unigrams_[word].next = bigram_pack_.InsertIndex();
00285     }
00286 
00287     void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char /*lower*/, float /*prob_base*/) {
00288       ProbBackoff weights = sri_.GetBlank(order_, order, indices);
00289       typename Quant::MiddlePointer(quant_, order - 2, middle_[order - 2].Insert(indices[order - 1])).Write(weights.prob, weights.backoff);
00290     }
00291 
00292     void Middle(const unsigned char order, const void *data) {
00293       RecordReader &context = contexts_[order - 1];
00294       const WordIndex *words = reinterpret_cast<const WordIndex*>(data);
00295       ProbBackoff weights = *reinterpret_cast<const ProbBackoff*>(words + order);
00296       if (context && !memcmp(data, context.Data(), sizeof(WordIndex) * order)) {
00297         SetExtension(weights.backoff);
00298         ++context;
00299       }
00300       typename Quant::MiddlePointer(quant_, order - 2, middle_[order - 2].Insert(words[order - 1])).Write(weights.prob, weights.backoff);
00301     }
00302 
00303     void Longest(const void *data) {
00304       const WordIndex *words = reinterpret_cast<const WordIndex*>(data);
00305       typename Quant::LongestPointer(quant_, longest_.Insert(words[order_ - 1])).Write(reinterpret_cast<const Prob*>(words + order_)->prob);
00306     }
00307 
00308   private:
00309     RecordReader *contexts_;
00310     const Quant &quant_;
00311     UnigramValue *const unigrams_;
00312     BitPackedMiddle<Bhiksha> *const middle_;
00313     BitPackedLongest &longest_;
00314     BitPacked &bigram_pack_;
00315     const unsigned char order_;
00316     SRISucks &sri_;
00317 };
00318 
00319 struct Gram {
00320   Gram(const WordIndex *in_begin, unsigned char order) : begin(in_begin), end(in_begin + order) {}
00321 
00322   const WordIndex *begin, *end;
00323 
00324   // For queue, this is the direction we want.
00325   bool operator<(const Gram &other) const {
00326     return std::lexicographical_compare(other.begin, other.end, begin, end);
00327   }
00328 };
00329 
// Tracks the most recently visited n-gram and, for every prefix of a newly
// visited n-gram that was not itself visited, reports a blank to `doing`
// through Doing::MiddleBlank.
00330 template <class Doing> class BlankManager {
00331   public:
      // No n-gram has been visited yet.  Only the first KENLM_MAX_ORDER - 1
      // slots of basis_ are set to kBadProb here.
00332     BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) {
00333       for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = kBadProb;
00334     }
00335
      // Visits the n-gram of `length` words at `to` with probability `prob`.
      // Throws FormatLoadException when the very first word differs from the
      // previously visited n-gram, i.e. a unigram used as context is missing.
00336     void Visit(const WordIndex *to, unsigned char length, float prob) {
00337       basis_[length - 1] = prob;
        // Count how many leading words (at most length - 1) match the
        // previously visited n-gram stored in been_.
00338       unsigned char overlap = std::min<unsigned char>(length - 1, been_length_);
00339       const WordIndex *cur;
00340       WordIndex *pre;
00341       for (cur = to, pre = been_; cur != to + overlap; ++cur, ++pre) {
00342         if (*pre != *cur) break;
00343       }
        // Every proper prefix matched: only the last word is new.
00344       if (cur == to + length - 1) {
00345         *pre = *cur;
00346         been_length_ = length;
00347         return;
00348       }
00349       // There are blanks to insert starting with order blank.
00350       unsigned char blank = cur - to + 1;
00351       UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
        // Walk down to the nearest lower order whose basis is not kBadProb.
00352       const float *lower_basis;
00353       for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {}
00354       unsigned char based_on = lower_basis - basis_ + 1;
        // Report one blank per missing prefix, from order `blank` up to
        // length - 1, copying each word into been_ as we go.
00355       for (; cur != to + length - 1; ++blank, ++cur, ++pre) {
00356         assert(*lower_basis != kBadProb);
00357         doing_.MiddleBlank(blank, to, based_on, *lower_basis);
00358         *pre = *cur;
00359         // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
00360         basis_[blank - 1] = kBadProb;
00361       }
        // Finally record the last word of the visited n-gram itself.
00362       *pre = *cur;
00363       been_length_ = length;
00364     }
00365
00366   private:
      // Stored by the constructor; not read by any code in this class.
00367     const unsigned char total_order_;
00368
      // Words and length of the most recently visited n-gram.
00369     WordIndex been_[KENLM_MAX_ORDER];
00370     unsigned char been_length_;
00371
      // basis_[k - 1] is the probability last visited at order k, or kBadProb
      // when that order was last filled by a blank.
00372     float basis_[KENLM_MAX_ORDER];
00373
00374     Doing &doing_;
00375 };
00376 
00377 template <class Doing> void RecursiveInsert(const unsigned char total_order, const WordIndex unigram_count, RecordReader *input, std::ostream *progress_out, const char *message, Doing &doing) {
00378   util::ErsatzProgress progress(unigram_count + 1, progress_out, message);
00379   WordIndex unigram = 0;
00380   std::priority_queue<Gram> grams;
00381   if (unigram_count) grams.push(Gram(&unigram, 1));
00382   for (unsigned char i = 2; i <= total_order; ++i) {
00383     if (input[i-2]) grams.push(Gram(reinterpret_cast<const WordIndex*>(input[i-2].Data()), i));
00384   }
00385 
00386   BlankManager<Doing> blank(total_order, doing);
00387 
00388   while (!grams.empty()) {
00389     Gram top = grams.top();
00390     grams.pop();
00391     unsigned char order = top.end - top.begin;
00392     if (order == 1) {
00393       blank.Visit(&unigram, 1, doing.UnigramProb(unigram));
00394       doing.Unigram(unigram);
00395       progress.Set(unigram);
00396       if (++unigram < unigram_count) grams.push(top);
00397     } else {
00398       if (order == total_order) {
00399         blank.Visit(top.begin, order, reinterpret_cast<const Prob*>(top.end)->prob);
00400         doing.Longest(top.begin);
00401       } else {
00402         blank.Visit(top.begin, order, reinterpret_cast<const ProbBackoff*>(top.end)->prob);
00403         doing.Middle(order, top.begin);
00404       }
00405       RecordReader &reader = input[order - 2];
00406       if (++reader) grams.push(top);
00407     }
00408   }
00409 }
00410 
00411 void SanityCheckCounts(const std::vector<uint64_t> &initial, const std::vector<uint64_t> &fixed) {
00412   if (fixed[0] != initial[0]) UTIL_THROW(util::Exception, "Unigram count should be constant but initial is " << initial[0] << " and recounted is " << fixed[0]);
00413   if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant but it changed from " << initial.back() << " to " << fixed.back());
00414   for (unsigned char i = 0; i < initial.size(); ++i) {
00415     if (fixed[i] < initial[i]) UTIL_THROW(util::Exception, "Counts came out lower than expected.  This shouldn't happen");
00416   }
00417 }
00418 
00419 template <class Quant> void TrainQuantizer(uint8_t order, uint64_t count, const std::vector<float> &additional, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) {
00420   std::vector<float> probs(additional), backoffs;
00421   probs.reserve(count + additional.size());
00422   backoffs.reserve(count);
00423   for (reader.Rewind(); reader; ++reader) {
00424     const ProbBackoff &weights = *reinterpret_cast<const ProbBackoff*>(reinterpret_cast<const uint8_t*>(reader.Data()) + sizeof(WordIndex) * order);
00425     probs.push_back(weights.prob);
00426     if (weights.backoff != 0.0) backoffs.push_back(weights.backoff);
00427     ++progress;
00428   }
00429   quant.Train(order, probs, backoffs);
00430 }
00431 
00432 template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) {
00433   std::vector<float> probs, backoffs;
00434   probs.reserve(count);
00435   for (reader.Rewind(); reader; ++reader) {
00436     const Prob &weights = *reinterpret_cast<const Prob*>(reinterpret_cast<const uint8_t*>(reader.Data()) + sizeof(WordIndex) * order);
00437     probs.push_back(weights.prob);
00438     ++progress;
00439   }
00440   quant.TrainProb(order, probs);
00441 }
00442 
00443 void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) {
00444   // Fill unigram probabilities.
00445   try {
00446     rewind(file);
00447     for (WordIndex i = 0; i < unigram_count; ++i) {
00448       ReadOrThrow(file, &unigrams[i].weights, sizeof(ProbBackoff));
00449       if (contexts && *reinterpret_cast<const WordIndex*>(contexts.Data()) == i) {
00450         SetExtension(unigrams[i].weights.backoff);
00451         ++contexts;
00452       }
00453     }
00454   } catch (util::Exception &e) {
00455     e << " while re-reading unigram probabilities";
00456     throw;
00457   }
00458 }
00459 
00460 } // namespace
00461 
00462 template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing) {
00463   RecordReader inputs[KENLM_MAX_ORDER - 1];
00464   RecordReader contexts[KENLM_MAX_ORDER - 1];
00465 
00466   for (unsigned char i = 2; i <= counts.size(); ++i) {
00467     inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
00468     contexts[i-2].Init(files.Context(i), (i-1) * sizeof(WordIndex));
00469   }
00470 
00471   SRISucks sri;
00472   std::vector<uint64_t> fixed_counts;
00473   util::scoped_FILE unigram_file;
00474   util::scoped_fd unigram_fd(files.StealUnigram());
00475   {
00476     util::scoped_memory unigrams;
00477     MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
00478     FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
00479     RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder);
00480     fixed_counts = finder.Counts();
00481   }
00482   unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
00483   for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) {
00484     if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading");
00485   }
00486   SanityCheckCounts(counts, fixed_counts);
00487   counts = fixed_counts;
00488 
00489   sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);
00490 
00491   void *vocab_relocate;
00492   void *search_base = backing.GrowForSearch(TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), vocab.UnkCountChangePadding(), vocab_relocate);
00493   vocab.Relocate(vocab_relocate);
00494   out.SetupMemory(reinterpret_cast<uint8_t*>(search_base), fixed_counts, config);
00495 
00496   for (unsigned char i = 2; i <= counts.size(); ++i) {
00497     inputs[i-2].Rewind();
00498   }
00499   if (Quant::kTrain) {
00500     util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0),
00501                                   config.ProgressMessages(), "Quantizing");
00502     for (unsigned char i = 2; i < counts.size(); ++i) {
00503       TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant);
00504     }
00505     TrainProbQuantizer(counts.size(), counts.back(), inputs[counts.size() - 2], progress, quant);
00506     quant.FinishedLoading(config);
00507   }
00508 
00509   UnigramValue *unigrams = out.unigram_.Raw();
00510   PopulateUnigramWeights(unigram_file.get(), counts[0], contexts[0], unigrams);
00511   unigram_file.reset();
00512 
00513   for (unsigned char i = 2; i <= counts.size(); ++i) {
00514     inputs[i-2].Rewind();
00515   }
00516   // Fill entries except unigram probabilities.
00517   {
00518     WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri);
00519     RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer);
00520     // Write the last unigram entry, which is the end pointer for the bigrams.
00521     writer.Unigram(counts[0]);
00522   }
00523 
00524   // Do not disable this error message or else too little state will be returned.  Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
00525   for (unsigned char order = 2; order <= counts.size(); ++order) {
00526     const RecordReader &context = contexts[order - 2];
00527     if (context) {
00528       FormatLoadException e;
00529       e << "A " << static_cast<unsigned int>(order) << "-gram has context";
00530       const WordIndex *ctx = reinterpret_cast<const WordIndex*>(context.Data());
00531       for (const WordIndex *i = ctx; i != ctx + order - 1; ++i) {
00532         e << ' ' << *i;
00533       }
00534       e << " so this context must appear in the model as a " << static_cast<unsigned int>(order - 1) << "-gram but it does not";
00535       throw e;
00536     }
00537   }
00538 
00539   /* Set ending offsets so the last entry will be sized properly */
00540   // Last entry for unigrams was already set.
00541   if (out.middle_begin_ != out.middle_end_) {
00542     for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) {
00543       i->FinishedLoading((i+1)->InsertIndex(), config);
00544     }
00545     (out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config);
00546   }
00547 }
00548 
00549 template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
00550   quant_.SetupMemory(start, counts.size(), config);
00551   start += Quant::Size(counts.size(), config);
00552   unigram_.Init(start);
00553   start += Unigram::Size(counts[0]);
00554   FreeMiddles();
00555   middle_begin_ = static_cast<Middle*>(malloc(sizeof(Middle) * (counts.size() - 2)));
00556   middle_end_ = middle_begin_ + (counts.size() - 2);
00557   std::vector<uint8_t*> middle_starts(counts.size() - 2);
00558   for (unsigned char i = 2; i < counts.size(); ++i) {
00559     middle_starts[i-2] = start;
00560     start += Middle::Size(Quant::MiddleBits(config), counts[i-1], counts[0], counts[i], config);
00561   }
00562   // Crazy backwards thing so we initialize using pointers to ones that have already been initialized
00563   for (unsigned char i = counts.size() - 1; i >= 2; --i) {
00564     // use "placement new" syntax to initalize Middle in an already-allocated memory location
00565     new (middle_begin_ + i - 2) Middle(
00566         middle_starts[i-2],
00567         quant_.MiddleBits(config),
00568         counts[i-1],
00569         counts[0],
00570         counts[i],
00571         (i == counts.size() - 1) ? static_cast<const BitPacked&>(longest_) : static_cast<const BitPacked &>(middle_begin_[i-1]),
00572         config);
00573   }
00574   longest_.Init(start, quant_.LongestBits(config), counts[0]);
00575   return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]);
00576 }
00577 
00578 template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing) {
00579   std::string temporary_prefix;
00580   if (!config.temporary_directory_prefix.empty()) {
00581     temporary_prefix = config.temporary_directory_prefix;
00582   } else if (config.write_mmap) {
00583     temporary_prefix = config.write_mmap;
00584   } else {
00585     temporary_prefix = file;
00586   }
00587   // At least 1MB sorting memory.
00588   SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);
00589 
00590   BuildTrie(sorted, counts, config, *this, quant_, vocab, backing);
00591 }
00592 
// Explicit instantiations of TrieSearch for each combination of quantizer
// (DontQuantize, SeparatelyQuantize) and Bhiksha (DontBhiksha, ArrayBhiksha),
// so the member templates defined in this file are emitted here.
00593 template class TrieSearch<DontQuantize, DontBhiksha>;
00594 template class TrieSearch<DontQuantize, ArrayBhiksha>;
00595 template class TrieSearch<SeparatelyQuantize, DontBhiksha>;
00596 template class TrieSearch<SeparatelyQuantize, ArrayBhiksha>;
00597 
00598 } // namespace trie
00599 } // namespace ngram
00600 } // namespace lm
/disk4/html/www/moses/doxygen/mosesdecoder/lm/search_trie.cc