00001 #ifndef LM_FILTER_PHRASE_H
00002 #define LM_FILTER_PHRASE_H
00003
00004 #include "util/murmur_hash.hh"
00005 #include "util/string_piece.hh"
00006 #include "util/tokenize_piece.hh"
00007
00008 #include <boost/unordered_map.hpp>
00009
00010 #include <iosfwd>
00011 #include <vector>
00012
00013 #define LM_FILTER_PHRASE_METHOD(caps, lower) \
00014 bool Find##caps(Hash key, const std::vector<unsigned int> *&out) const {\
00015 Table::const_iterator i(table_.find(key));\
00016 if (i==table_.end()) return false; \
00017 out = &i->second.lower; \
00018 return true; \
00019 }
00020
00021 namespace lm {
00022 namespace phrase {
00023
00024 typedef uint64_t Hash;
00025
00026 class Substrings {
00027 private:
00028
00029
00030
00031
00032
00033
00034
00035
00036 struct SentenceRelation {
00037 std::vector<unsigned int> substring, left, right, phrase;
00038 };
00039
00040
00041
00042
00043
00044
00045 typedef boost::unordered_map<Hash, SentenceRelation> Table;
00046
00047 public:
00048 Substrings() {}
00049
00050
00051
00052
00053
00054
00055 LM_FILTER_PHRASE_METHOD(Substring, substring)
00056 LM_FILTER_PHRASE_METHOD(Left, left)
00057 LM_FILTER_PHRASE_METHOD(Right, right)
00058 LM_FILTER_PHRASE_METHOD(Phrase, phrase)
00059
00060 #pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization
00061
00062 template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) {
00063
00064 for (Iterator start = begin; start != end; ++start) {
00065 Hash hash = 0;
00066 SentenceRelation *relation;
00067 for (Iterator finish = start; finish != end; ++finish) {
00068 hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *finish);
00069
00070 relation = &table_[hash];
00071 AppendSentence(relation->substring, sentence_id);
00072 if (start == begin) AppendSentence(relation->left, sentence_id);
00073 }
00074 AppendSentence(relation->right, sentence_id);
00075 if (start == begin) AppendSentence(relation->phrase, sentence_id);
00076 }
00077 }
00078
00079 private:
00080 void AppendSentence(std::vector<unsigned int> &vec, unsigned int sentence_id) {
00081 if (vec.empty() || vec.back() != sentence_id) vec.push_back(sentence_id);
00082 }
00083
00084 Table table_;
00085 };
00086
00087
00088
00089 unsigned int ReadMultiple(std::istream &in, Substrings &out);
00090
00091 namespace detail {
00092 extern const StringPiece kEndSentence;
00093
00094 template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) {
00095 hashes.clear();
00096 if (i == end) return;
00097
00098 if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) {
00099 ++i;
00100 }
00101 for (; i != end && (*i != kEndSentence); ++i) {
00102 hashes.push_back(util::MurmurHashNative(i->data(), i->size()));
00103 }
00104 }
00105
00106 class Vertex;
00107 class Arc;
00108
00109 class ConditionCommon {
00110 protected:
00111 ConditionCommon(const Substrings &substrings);
00112 ConditionCommon(const ConditionCommon &from);
00113
00114 ~ConditionCommon();
00115
00116 detail::Vertex &MakeGraph();
00117
00118
00119 std::vector<Hash> hashes_;
00120
00121 private:
00122 std::vector<detail::Vertex> vertices_;
00123 std::vector<detail::Arc> arcs_;
00124
00125 const Substrings &substrings_;
00126 };
00127
00128 }
00129
00130 class Union : public detail::ConditionCommon {
00131 public:
00132 explicit Union(const Substrings &substrings) : detail::ConditionCommon(substrings) {}
00133
00134 template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) {
00135 detail::MakeHashes(begin, end, hashes_);
00136 return hashes_.empty() || Evaluate();
00137 }
00138
00139 private:
00140 bool Evaluate();
00141 };
00142
00143 class Multiple : public detail::ConditionCommon {
00144 public:
00145 explicit Multiple(const Substrings &substrings) : detail::ConditionCommon(substrings) {}
00146
00147 template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {
00148 detail::MakeHashes(begin, end, hashes_);
00149 if (hashes_.empty()) {
00150 output.AddNGram(line);
00151 } else {
00152 Evaluate(line, output);
00153 }
00154 }
00155
00156 template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
00157 AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output);
00158 }
00159
00160 void Flush() const {}
00161
00162 private:
00163 template <class Output> void Evaluate(const StringPiece &line, Output &output);
00164 };
00165
00166 }
00167 }
00168 #endif // LM_FILTER_PHRASE_H