00001 #ifndef LM_FILTER_FORMAT_H
00002 #define LM_FILTER_FORMAT_H
00003
00004 #include "lm/filter/arpa_io.hh"
00005 #include "lm/filter/count_io.hh"
00006
00007 #include <boost/lexical_cast.hpp>
00008 #include <boost/ptr_container/ptr_vector.hpp>
00009
00010 #include <iosfwd>
00011
00012 namespace lm {
00013
00014 template <class Single> class MultipleOutput {
00015 private:
00016 typedef boost::ptr_vector<Single> Singles;
00017 typedef typename Singles::iterator SinglesIterator;
00018
00019 public:
00020 MultipleOutput(const char *prefix, size_t number) {
00021 files_.reserve(number);
00022 std::string tmp;
00023 for (unsigned int i = 0; i < number; ++i) {
00024 tmp = prefix;
00025 tmp += boost::lexical_cast<std::string>(i);
00026 files_.push_back(new Single(tmp.c_str()));
00027 }
00028 }
00029
00030 void AddNGram(const StringPiece &line) {
00031 for (SinglesIterator i = files_.begin(); i != files_.end(); ++i)
00032 i->AddNGram(line);
00033 }
00034
00035 template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
00036 for (SinglesIterator i = files_.begin(); i != files_.end(); ++i)
00037 i->AddNGram(begin, end, line);
00038 }
00039
00040 void SingleAddNGram(size_t offset, const StringPiece &line) {
00041 files_[offset].AddNGram(line);
00042 }
00043
00044 template <class Iterator> void SingleAddNGram(size_t offset, const Iterator &begin, const Iterator &end, const StringPiece &line) {
00045 files_[offset].AddNGram(begin, end, line);
00046 }
00047
00048 protected:
00049 Singles files_;
00050 };
00051
00052 class MultipleARPAOutput : public MultipleOutput<ARPAOutput> {
00053 public:
00054 MultipleARPAOutput(const char *prefix, size_t number) : MultipleOutput<ARPAOutput>(prefix, number) {}
00055
00056 void ReserveForCounts(std::streampos reserve) {
00057 for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i)
00058 i->ReserveForCounts(reserve);
00059 }
00060
00061 void BeginLength(unsigned int length) {
00062 for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i)
00063 i->BeginLength(length);
00064 }
00065
00066 void EndLength(unsigned int length) {
00067 for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i)
00068 i->EndLength(length);
00069 }
00070
00071 void Finish() {
00072 for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i)
00073 i->Finish();
00074 }
00075 };
00076
00077 template <class Filter, class Output> class DispatchInput {
00078 public:
00079 DispatchInput(Filter &filter, Output &output) : filter_(filter), output_(output) {}
00080
00081
00082
00083
00084
00085 void AddNGram(const StringPiece &ngram, const StringPiece &line) {
00086 filter_.AddNGram(ngram, line, output_);
00087 }
00088
00089 protected:
00090 Filter &filter_;
00091 Output &output_;
00092 };
00093
00094 template <class Filter, class Output> class DispatchARPAInput : public DispatchInput<Filter, Output> {
00095 private:
00096 typedef DispatchInput<Filter, Output> B;
00097
00098 public:
00099 DispatchARPAInput(Filter &filter, Output &output) : B(filter, output) {}
00100
00101 void ReserveForCounts(std::streampos reserve) { B::output_.ReserveForCounts(reserve); }
00102 void BeginLength(unsigned int length) { B::output_.BeginLength(length); }
00103
00104 void EndLength(unsigned int length) {
00105 B::filter_.Flush();
00106 B::output_.EndLength(length);
00107 }
00108 void Finish() { B::output_.Finish(); }
00109 };
00110
00111 struct ARPAFormat {
00112 typedef ARPAOutput Output;
00113 typedef MultipleARPAOutput Multiple;
00114 static void Copy(util::FilePiece &in, Output &out) {
00115 ReadARPA(in, out);
00116 }
00117 template <class Filter, class Out> static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) {
00118 DispatchARPAInput<Filter, Out> dispatcher(filter, output);
00119 ReadARPA(in, dispatcher);
00120 }
00121 };
00122
00123 struct CountFormat {
00124 typedef CountOutput Output;
00125 typedef MultipleOutput<Output> Multiple;
00126 static void Copy(util::FilePiece &in, Output &out) {
00127 ReadCount(in, out);
00128 }
00129 template <class Filter, class Out> static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) {
00130 DispatchInput<Filter, Out> dispatcher(filter, output);
00131 ReadCount(in, dispatcher);
00132 }
00133 };
00134
00135
00136
00137
00138
00139 class InputBuffer {
00140 public:
00141 InputBuffer() : actual_(0) {}
00142
00143 void Reserve(size_t size) { lines_.reserve(size); }
00144
00145 template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
00146 if (lines_.size() == actual_) lines_.resize(lines_.size() + 1);
00147
00148 std::string &copied = lines_[actual_].line;
00149 copied.assign(line.data(), line.size());
00150 lines_[actual_].ngram.set(copied.data() + (ngram.data() - line.data()), ngram.size());
00151 ++actual_;
00152 }
00153
00154 template <class Filter, class Output> void CallFilter(Filter &filter, Output &output) const {
00155 for (std::vector<Line>::const_iterator i = lines_.begin(); i != lines_.begin() + actual_; ++i) {
00156 filter.AddNGram(i->ngram, i->line, output);
00157 }
00158 }
00159
00160 void Clear() { actual_ = 0; }
00161 bool Empty() { return actual_ == 0; }
00162 size_t Size() { return actual_; }
00163
00164 private:
00165 struct Line {
00166 std::string line;
00167 StringPiece ngram;
00168 };
00169
00170 size_t actual_;
00171
00172 std::vector<Line> lines_;
00173 };
00174
00175 class BinaryOutputBuffer {
00176 public:
00177 BinaryOutputBuffer() {}
00178
00179 void Reserve(size_t size) {
00180 lines_.reserve(size);
00181 }
00182
00183 void AddNGram(const StringPiece &line) {
00184 lines_.push_back(line);
00185 }
00186
00187 template <class Output> void Flush(Output &output) {
00188 for (std::vector<StringPiece>::const_iterator i = lines_.begin(); i != lines_.end(); ++i) {
00189 output.AddNGram(*i);
00190 }
00191 lines_.clear();
00192 }
00193
00194 private:
00195 std::vector<StringPiece> lines_;
00196 };
00197
00198 class MultipleOutputBuffer {
00199 public:
00200 MultipleOutputBuffer() : last_(NULL) {}
00201
00202 void Reserve(size_t size) {
00203 annotated_.reserve(size);
00204 }
00205
00206 void AddNGram(const StringPiece &line) {
00207 annotated_.resize(annotated_.size() + 1);
00208 annotated_.back().line = line;
00209 }
00210
00211 void SingleAddNGram(size_t offset, const StringPiece &line) {
00212 if ((line.data() == last_.data()) && (line.length() == last_.length())) {
00213 annotated_.back().systems.push_back(offset);
00214 } else {
00215 annotated_.resize(annotated_.size() + 1);
00216 annotated_.back().systems.push_back(offset);
00217 annotated_.back().line = line;
00218 last_ = line;
00219 }
00220 }
00221
00222 template <class Output> void Flush(Output &output) {
00223 for (std::vector<Annotated>::const_iterator i = annotated_.begin(); i != annotated_.end(); ++i) {
00224 if (i->systems.empty()) {
00225 output.AddNGram(i->line);
00226 } else {
00227 for (std::vector<size_t>::const_iterator j = i->systems.begin(); j != i->systems.end(); ++j) {
00228 output.SingleAddNGram(*j, i->line);
00229 }
00230 }
00231 }
00232 annotated_.clear();
00233 }
00234
00235 private:
00236 struct Annotated {
00237
00238
00239 std::vector<size_t> systems;
00240 StringPiece line;
00241 };
00242
00243 StringPiece last_;
00244
00245 std::vector<Annotated> annotated_;
00246 };
00247
00248 }
00249
00250 #endif // LM_FILTER_FORMAT_H