Moses: /disk4/html/www/moses/doxygen/mosesdecoder/lm/builder/dump_counts

#include "lm/common/print.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/read_compressed.hh"

#include <boost/lexical_cast.hpp>

#include <cstring>
#include <iostream>
#include <vector>
00010 
00011 int main(int argc, char *argv[]) {
00012   if (argc != 4) {
00013     std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n"
00014     "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n"
00015     "counts.  Each record has order many vocabulary ids.\n"
00016     "The vocabulary file contains the words delimited by NULL in order of id.\n"
00017     "The vocabulary file may not be compressed because it is mmapped but the counts\n"
00018     "file can be compressed.\n";
00019     return 1;
00020   }
00021   util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
00022   util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
00023   lm::VocabReconstitute vocab(vocab_file.get());
00024   unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
00025   std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
00026   while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {
00027     UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size());
00028     const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin());
00029     for (const lm::WordIndex *i = words; i != words + order; ++i) {
00030       UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ".  Are you sure you have the right order and vocab file for these counts?");
00031       std::cout << vocab.Lookup(*i) << ' ';
00032     }
00033     // TODO don't use std::cout because it is slow.  Add fast uint64_t printing support to FileStream.
00034     std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n';
00035   }
00036 }
/disk4/html/www/moses/doxygen/mosesdecoder/lm/builder/dump_counts_main.cc