00001 #include "lm/filter/vocab.hh"
00002
00003 #include <istream>
00004 #include <iostream>
00005
00006 #include <cctype>
00007
00008 namespace lm {
00009 namespace vocab {
00010
00011 void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out) {
00012 in.exceptions(std::istream::badbit);
00013 std::string word;
00014 while (in >> word) {
00015 out.insert(word);
00016 }
00017 }
00018
namespace {
// Consumes whitespace following a word and reports whether the line ended.
//
// Returns true when a '\n' is consumed or when reading a character fails
// (e.g. end of input).  Returns false when a non-whitespace character is
// found first; that character is put back so the next extraction sees it.
bool IsLineEnd(std::istream &in) {
  for (;;) {
    const int c = in.get();
    // A failed read or a newline both terminate the current line.
    if (!in || c == '\n') return true;
    if (!std::isspace(c)) {
      // Start of the next word on the same line: leave it in the stream.
      in.unget();
      return false;
    }
  }
}
} // namespace
00031
00032
00033
00034 unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
00035 in.exceptions(std::istream::badbit);
00036 unsigned int sentence = 0;
00037 bool used_id = false;
00038 std::string word;
00039 while (in >> word) {
00040 used_id = true;
00041 std::vector<unsigned int> &posting = out[word];
00042 if (posting.empty() || (posting.back() != sentence))
00043 posting.push_back(sentence);
00044 if (IsLineEnd(in)) {
00045 ++sentence;
00046 used_id = false;
00047 }
00048 }
00049 return sentence + used_id;
00050 }
00051
00052 }
00053 }