#include "lm/common/print.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/read_compressed.hh"

#include <boost/lexical_cast.hpp>

#include <cstring>
#include <iostream>
#include <vector>

00011 int main(int argc, char *argv[]) {
00012 if (argc != 4) {
00013 std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n"
00014 "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n"
00015 "counts. Each record has order many vocabulary ids.\n"
00016 "The vocabulary file contains the words delimited by NULL in order of id.\n"
00017 "The vocabulary file may not be compressed because it is mmapped but the counts\n"
00018 "file can be compressed.\n";
00019 return 1;
00020 }
00021 util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
00022 util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
00023 lm::VocabReconstitute vocab(vocab_file.get());
00024 unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
00025 std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
00026 while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {
00027 UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size());
00028 const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin());
00029 for (const lm::WordIndex *i = words; i != words + order; ++i) {
00030 UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
00031 std::cout << vocab.Lookup(*i) << ' ';
00032 }
00033
00034 std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n';
00035 }
00036 }