00001 #ifndef LM_FILTER_VOCAB_H
00002 #define LM_FILTER_VOCAB_H
00003
00004
00005
00006 #include "util/multi_intersection.hh"
00007 #include "util/string_piece.hh"
00008 #include "util/string_piece_hash.hh"
00009 #include "util/tokenize_piece.hh"
00010
00011 #include <boost/noncopyable.hpp>
00012 #include <boost/range/iterator_range.hpp>
00013 #include <boost/unordered/unordered_map.hpp>
00014 #include <boost/unordered/unordered_set.hpp>
00015
00016 #include <string>
00017 #include <vector>
00018
00019 namespace lm {
00020 namespace vocab {
00021
// Read a vocabulary from the stream `in` into the string set `out`.
// NOTE(review): the definition lives outside this header; the expected input
// layout (presumably whitespace-separated words) should be confirmed there.
void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out);
00023
00024
// Read vocabularies from the stream `in` into `out`, a map from each word to
// a list of unsigned ids. The map type is the same one that Union::Words and
// Multiple::Words name, so the result can be handed to those filters.
// NOTE(review): the definition lives outside this header; presumably each id
// identifies one vocabulary the word occurs in and the return value is the
// number of vocabularies read -- confirm against the implementation.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out);
00026
00027
00028
00029
00030
00031 inline bool IsTag(const StringPiece &value) {
00032
00033 assert(!value.empty());
00034 return (value.data()[0] == '<' && value.data()[value.size() - 1] == '>');
00035 }
00036
00037 class Single {
00038 public:
00039 typedef boost::unordered_set<std::string> Words;
00040
00041 explicit Single(const Words &vocab) : vocab_(vocab) {}
00042
00043 template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) {
00044 for (Iterator i = begin; i != end; ++i) {
00045 if (IsTag(*i)) continue;
00046 if (FindStringPiece(vocab_, *i) == vocab_.end()) return false;
00047 }
00048 return true;
00049 }
00050
00051 private:
00052 const Words &vocab_;
00053 };
00054
00055 class Union {
00056 public:
00057 typedef boost::unordered_map<std::string, std::vector<unsigned int> > Words;
00058
00059 explicit Union(const Words &vocabs) : vocabs_(vocabs) {}
00060
00061 template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) {
00062 sets_.clear();
00063
00064 for (Iterator i(begin); i != end; ++i) {
00065 if (IsTag(*i)) continue;
00066 Words::const_iterator found(FindStringPiece(vocabs_, *i));
00067 if (vocabs_.end() == found) return false;
00068 sets_.push_back(boost::iterator_range<const unsigned int*>(&*found->second.begin(), &*found->second.end()));
00069 }
00070 return (sets_.empty() || util::FirstIntersection(sets_));
00071 }
00072
00073 private:
00074 const Words &vocabs_;
00075
00076 std::vector<boost::iterator_range<const unsigned int*> > sets_;
00077 };
00078
00079 class Multiple {
00080 public:
00081 typedef boost::unordered_map<std::string, std::vector<unsigned int> > Words;
00082
00083 Multiple(const Words &vocabs) : vocabs_(vocabs) {}
00084
00085 private:
00086
00087 template <class Output> class Callback {
00088 public:
00089 Callback(Output &out, const StringPiece &line) : out_(out), line_(line) {}
00090
00091 void operator()(unsigned int index) {
00092 out_.SingleAddNGram(index, line_);
00093 }
00094
00095 private:
00096 Output &out_;
00097 const StringPiece &line_;
00098 };
00099
00100 public:
00101 template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {
00102 sets_.clear();
00103 for (Iterator i(begin); i != end; ++i) {
00104 if (IsTag(*i)) continue;
00105 Words::const_iterator found(FindStringPiece(vocabs_, *i));
00106 if (vocabs_.end() == found) return;
00107 sets_.push_back(boost::iterator_range<const unsigned int*>(&*found->second.begin(), &*found->second.end()));
00108 }
00109 if (sets_.empty()) {
00110 output.AddNGram(line);
00111 return;
00112 }
00113
00114 Callback<Output> cb(output, line);
00115 util::AllIntersection(sets_, cb);
00116 }
00117
00118 template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
00119 AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output);
00120 }
00121
00122 void Flush() const {}
00123
00124 private:
00125 const Words &vocabs_;
00126
00127 std::vector<boost::iterator_range<const unsigned int*> > sets_;
00128 };
00129
00130 }
00131 }
00132
00133 #endif // LM_FILTER_VOCAB_H