00001
00002
00003 #ifndef __ug_tsa_tree_iterator_h
00004 #define __ug_tsa_tree_iterator_h
00005
00006 #include "ug_tsa_array_entry.h"
00007 #include "ug_typedefs.h"
00008 #include "tpt_tokenindex.h"
00009 #include <iostream>
00010 #include "util/exception.hh"
00011 #include "moses/Util.h"
00012 #include "util/random.hh"
00013
00014 namespace sapt
00015 {
00016
00017 #ifndef _DISPLAY_CHAIN
00018 #define _DISPLAY_CHAIN
00019
00020 template<typename T>
00021 void display(T const* x, std::string label)
00022 {
00023 std::cout << label << ":";
00024 for (;x;x=next(x)) std::cout << " " << x->lemma;
00025 std::cout << std::endl;
00026 }
00027 #endif
00028
00029 template<typename T> class TSA;
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042 template<typename TKN>
00043 class
00044 TSA_tree_iterator
00045 {
00046 protected:
00047 std::vector<char const*> lower;
00048 std::vector<char const*> upper;
00049
00050
00051 void showBounds(std::ostream& out) const;
00052 public:
00053 typedef TKN Token;
00054
00055 virtual ~TSA_tree_iterator() {};
00056
00057 TSA<Token> const* root;
00058
00059
00060
00061
00062 TSA_tree_iterator(TSA<Token> const* s);
00063 TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other);
00064 TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len);
00065
00066 TSA_tree_iterator(TSA<Token> const* s,
00067 Token const* kstart,
00068 size_t const len,
00069 bool full_match_only=true);
00070 TSA_tree_iterator(TSA<Token> const* s,
00071 Token const* kstart,
00072 Token const* kend,
00073 bool full_match_only=true);
00074 TSA_tree_iterator(TSA<Token> const* s,
00075 TokenIndex const& V,
00076 std::string const& key);
00077
00078 char const* lower_bound(int p) const;
00079 char const* upper_bound(int p) const;
00080
00081 size_t size() const;
00082
00083 Token const* getToken(int p) const;
00084 id_type getSid() const;
00085 ushort getOffset(int p) const;
00086 size_t sntCnt(int p=-1) const;
00087 size_t rawCnt(int p=-1) const;
00088 uint64_t getPid(int p=-1) const;
00089
00090 virtual bool extend(Token const& id);
00091 virtual bool extend(id_type id);
00092 virtual bool down();
00093 virtual bool over();
00094 virtual bool up();
00095
00096 std::string str(TokenIndex const* V=NULL, int start=0, int stop=0) const;
00097
00098
00099 bool match(Token const* start, Token const* stop) const;
00100
00101 bool match(id_type sid) const;
00102
00103
00104 count_type
00105 fillBitSet(boost::dynamic_bitset<uint64_t>& bitset) const;
00106
00107 count_type
00108 markEndOfSequence(Token const* start, Token const* stop,
00109 boost::dynamic_bitset<uint64_t>& dest) const;
00110 count_type
00111 markSequence(Token const* start, Token const* stop, bitvector& dest) const;
00112
00113 count_type
00114 markSentences(boost::dynamic_bitset<uint64_t>& bitset) const;
00115
00116 count_type
00117 markOccurrences(boost::dynamic_bitset<uint64_t>& bitset,
00118 bool markOnlyStartPosition=false) const;
00119
00120 count_type
00121 markOccurrences(std::vector<ushort>& dest) const;
00122
00123 ::uint64_t
00124 getSequenceId() const;
00125
00126
00127
00128 bitvector& filterSentences(bitvector& foo) const;
00129
00131 void
00132 tfAndRoot(bitvector const& ref,
00133 bitvector const& snt,
00134 bitvector& dest) const;
00135
00136 size_t arrayByteSpanSize(int p = -1) const
00137 {
00138 if (lower.size()==0) return 0;
00139 if (p < 0) p = lower.size()+p;
00140 assert(p >=0 && p < int(lower.size()));
00141 return lower.size() ? upper[p]-lower[p] : 0;
00142 }
00143
00144 struct SortByApproximateCount
00145 {
00146 bool operator()(TSA_tree_iterator const& a,
00147 TSA_tree_iterator const& b) const
00148 {
00149 if (a.size()==0) return b.size() ? true : false;
00150 if (b.size()==0) return false;
00151 return a.arrayByteSpanSize() < b.arrayByteSpanSize();
00152 }
00153 };
00154
00155 double
00156 ca(int p=-1) const
00157 {
00158 assert(root);
00159 if (p < 0) p += lower.size();
00160 double ret = arrayByteSpanSize(p)/root->aveIndexEntrySize();
00161
00162
00163
00164 if (ret < 25) ret = rawCnt(p);
00165 UTIL_THROW_IF2(ret > root->corpus->numTokens(), "[" << HERE << "] "
00166 << "Word count mismatch.");
00167 assert(ret <= root->corpus->numTokens());
00168 return ret;
00169 }
00170
00171 inline
00172 double
00173 approxOccurrenceCount(int p=-1) const
00174 {
00175 return ca();
00176 }
00177
00178 size_t grow(Token const* t, Token const* stop)
00179 {
00180 while ((t != stop) && extend(*t)) t = t->next();
00181 return this->size();
00182 }
00183
00184 size_t grow(Token const* snt, bitvector const& cov)
00185 {
00186 size_t x = cov.find_first();
00187 while (x < cov.size() && extend(snt[x]))
00188 x = cov.find_next(x);
00189 return this->size();
00190 }
00191
00192 SPTR<std::vector<typename ttrack::Position> >
00193 randomSample(int level, size_t N) const;
00194
00195 };
00196
00197
00198
00199
00200
00201 template<typename TSA_TYPE>
00202 bool
00203 TSA_tree_iterator<TSA_TYPE>::
00204 down()
00205 {
00206 assert(root);
00207 if (lower.size() == 0)
00208 {
00209 char const* lo = root->arrayStart();
00210 assert(lo < root->arrayEnd());
00211 if (lo == root->arrayEnd()) return false;
00212 tsa::ArrayEntry A(root,lo);
00213 assert(root->corpus->getToken(A));
00214 assert(lo < root->getUpperBound(root->corpus->getToken(A)->id()));
00215 lower.push_back(lo);
00216 Token const* foo = this->getToken(0);
00217 upper.push_back(root->upper_bound(foo,lower.size()));
00218 return lower.size();
00219 }
00220 else
00221 {
00222 char const* lo = lower.back();
00223 tsa::ArrayEntry A(root,lo);
00224 Token const* a = root->corpus->getToken(A); assert(a);
00225 Token const* z = next(a);
00226 for (size_t i = 1; i < size(); ++i) z = next(z);
00227 if (z < root->corpus->sntStart(A.sid) || z >= root->corpus->sntEnd(A.sid))
00228 {
00229 char const* up = upper.back();
00230 lo = root->find_longer(lo,up,a,lower.size(),0);
00231 if (!lo) return false;
00232 root->readEntry(lo,A);
00233 a = root->corpus->getToken(A); assert(a);
00234 z = next(a);
00235 assert(z >= root->corpus->sntStart(A.sid) && z < root->corpus->sntEnd(A.sid));
00236 }
00237 lower.push_back(lo);
00238 char const* up = root->getUpperBound(a->id());
00239 char const* u = root->find_end(lo,up,a,lower.size(),0);
00240 assert(u);
00241 upper.push_back(u);
00242 return true;
00243 }
00244 }
00245
00246
00247
00248
00249
00250 template<typename Token>
00251 bool
00252 TSA_tree_iterator<Token>::
00253 over()
00254 {
00255 if (lower.size() == 0)
00256 return false;
00257 if (lower.size() == 1)
00258 {
00259 Token const* t = this->getToken(0);
00260 id_type wid = t->id();
00261 char const* hi = root->getUpperBound(wid);
00262 if (upper[0] < hi)
00263 {
00264 lower[0] = upper[0];
00265 Token const* foo = this->getToken(0);
00266 upper.back() = root->upper_bound(foo,lower.size());
00267 }
00268 else
00269 {
00270 for (++wid; wid < root->indexSize; ++wid)
00271 {
00272 char const* lo = root->getLowerBound(wid);
00273 if (lo == root->endArray) return false;
00274 char const* hi = root->getUpperBound(wid);
00275 if (!hi) return false;
00276 if (lo == hi) continue;
00277 assert(lo);
00278 lower[0] = lo;
00279 Token const* foo = this->getToken(0);
00280 upper.back() = root->upper_bound(foo,lower.size());
00281 break;
00282 }
00283 }
00284 return wid < root->indexSize;
00285 }
00286 else
00287 {
00288 if (upper.back() == root->arrayEnd())
00289 return false;
00290 tsa::ArrayEntry L(root,lower.back());
00291 tsa::ArrayEntry U(root,upper.back());
00292
00293
00294
00295
00296 int x = root->corpus->cmp(U,L,lower.size()-1);
00297
00298 if (x != 1)
00299 return false;
00300 lower.back() = upper.back();
00301
00302
00303
00304 Token const* foo = this->getToken(0);
00305
00306 upper.back() = root->upper_bound(foo,lower.size());
00307 return true;
00308 }
00309 }
00310
00311
00312
00313
00314
00315 template<typename Token>
00316 bool
00317 TSA_tree_iterator<Token>::
00318 up()
00319 {
00320 if (lower.size())
00321 {
00322 lower.pop_back();
00323 upper.pop_back();
00324 return true;
00325 }
00326 else
00327 return false;
00328 }
00329
00330
00331
00332
00333
00334 template<typename Token>
00335 TSA_tree_iterator<Token>::
00336 TSA_tree_iterator(TSA<Token> const* s)
00337 : root(s)
00338 {};
00339
00340 template<typename Token>
00341 TSA_tree_iterator<Token>::
00342 TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other)
00343 : root(s)
00344 {
00345 Token const* x = other.getToken(0);
00346 for (size_t i = 0; i < other.size() && this->extend(x->id()); ++i)
00347 x = x->next();
00348 };
00349
00350
00351
00352 template<typename Token>
00353 TSA_tree_iterator<Token>::
00354 TSA_tree_iterator
00355 (TSA<Token> const* r,
00356 id_type const* s,
00357 size_t const len)
00358 : root(r)
00359 {
00360 for (id_type const* e = s + len; s < e && extend(*s); ++s);
00361 };
00362
00363
00364
00365 #if 1
00366 template<typename Token>
00367 TSA_tree_iterator<Token>::
00368 TSA_tree_iterator(TSA<Token> const* s,
00369 TokenIndex const& V,
00370 std::string const& key)
00371 : root(s)
00372 {
00373 std::istringstream buf(key); std::string w;
00374 while (buf >> w)
00375 {
00376 if (this->extend(V[w]))
00377 continue;
00378 else
00379 {
00380 lower.clear();
00381 upper.clear();
00382 break;
00383 }
00384 }
00385 };
00386 #endif
00387
00388 #if 0
00389
00390
00391 template<typename Token>
00392 TSA_tree_iterator<Token>::
00393 TSA_tree_iterator(TSA_tree_iterator<Token> const& other)
00394 : root(other.root)
00395 {
00396 lower = other.lower;
00397 upper = other.upper;
00398 };
00399
00400
00401
00402 template<typename Token>
00403 TSA_tree_iterator<Token>::
00404 TSA_tree_iterator(TSA<Token> const* s, Token const& t)
00405 : root(s)
00406 {
00407 if (!root) return;
00408 char const* up = root->getUpperBound(t.id());
00409 if (!up) return;
00410 lower.push_back(root->getLowerBound(t.id()));
00411 upper.push_back(up);
00412 };
00413
00414
00415
00416 #endif
00417
00418 template<typename Token>
00419 TSA_tree_iterator<Token>::
00420 TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
00421 size_t const len, bool full_match_only)
00422 : root(s)
00423 {
00424 if (!root) return;
00425 size_t i = 0;
00426 for (; i < len && kstart && extend(*kstart); ++i)
00427 kstart = kstart->next();
00428 if (full_match_only && i != len)
00429 {
00430 lower.clear();
00431 upper.clear();
00432 }
00433 };
00434
00435
00436
00437 template<typename Token>
00438 TSA_tree_iterator<Token>::
00439 TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
00440 Token const* kend, bool full_match_only)
00441 : root(s)
00442 {
00443 for (;kstart != kend; kstart = kstart->next())
00444 if (!extend(*kstart))
00445 break;
00446 if (full_match_only && kstart != kend)
00447 {
00448 lower.clear();
00449 upper.clear();
00450 }
00451 };
00452
00453
00454
00455
00456
00457 template<typename Token>
00458 bool
00459 TSA_tree_iterator<Token>::
00460 extend(id_type const id)
00461 {
00462 return extend(Token(id));
00463 }
00464
00465
00466 template<typename Token>
00467 bool
00468 TSA_tree_iterator<Token>::
00469 extend(Token const& t)
00470 {
00471 if (lower.size())
00472 {
00473 char const* lo = lower.back();
00474 char const* hi = upper.back();
00475 lo = root->find_start(lo, hi, &t, 1, lower.size());
00476 if (!lo) return false;
00477 lower.push_back(lo);
00478 hi = root->find_end(lo, hi, getToken(-1), 1, lower.size()-1);
00479 upper.push_back(hi);
00480 }
00481 else
00482 {
00483 char const* lo = root->getLowerBound(t.id());
00484 char const* hi = root->getUpperBound(t.id());
00485
00486 if (lo==hi) return false;
00487 lo = root->find_start(lo, hi, &t, 1, lower.size());
00488 if (!lo) return false;
00489 lower.push_back(lo);
00490 #if 0
00491 tsa::ArrayEntry I;
00492 root->readEntry(lo,I);
00493 cout << I.sid << " " << I.offset << std::endl;
00494 cout << root->corpus->sntLen(I.sid) << std::endl;
00495 #endif
00496 hi = root->find_end(lo, hi, getToken(0), 1, 0);
00497 upper.push_back(hi);
00498 }
00499 return true;
00500 };
00501
00502
00503
00504 template<typename Token>
00505 size_t
00506 TSA_tree_iterator<Token>::
00507 size() const
00508 {
00509 return lower.size();
00510 }
00511
00512
00513
00514 template<typename Token>
00515 id_type
00516 TSA_tree_iterator<Token>::
00517 getSid() const
00518 {
00519 char const* p = (lower.size() ? lower.back() : root->startArray);
00520 char const* q = (upper.size() ? upper.back() : root->endArray);
00521 id_type sid;
00522 root->readSid(p,q,sid);
00523 return sid;
00524 }
00525
00526
00527
00528 template<typename Token>
00529 ::uint64_t
00530 TSA_tree_iterator<Token>::
00531 getPid(int p) const
00532 {
00533 if (this->size() == 0) return 0;
00534 if (p < 0) p += upper.size();
00535 char const* lb = lower_bound(p);
00536 char const* ub = upper_bound(p);
00537 ::uint64_t sid,off;
00538 root->readOffset(root->readSid(lb,ub,sid),ub,off);
00539 ::uint64_t ret = (sid<<32) + (off<<16) + ::uint64_t(p+1);
00540 return ret;
00541 }
00542
00543
00544
00545 template<typename Token>
00546 char const*
00547 TSA_tree_iterator<Token>::
00548 lower_bound(int p) const
00549 {
00550 if (p < 0) p += lower.size();
00551 assert(p >= 0 && p < int(lower.size()));
00552 return lower[p];
00553 }
00554
00555
00556
00557 template<typename Token>
00558 char const*
00559 TSA_tree_iterator<Token>::
00560 upper_bound(int p) const
00561 {
00562 if (p < 0) p += upper.size();
00563 assert(p >= 0 && p < int(upper.size()));
00564 return upper[p];
00565 }
00566
00567
00568
00569
00570
00571
00572 template<typename Token>
00573 Token const*
00574 TSA_tree_iterator<Token>::
00575 getToken(int p) const
00576 {
00577 if (lower.size()==0) return NULL;
00578 tsa::ArrayEntry A(root,lower.back());
00579 Token const* t = root->corpus->getToken(A); assert(t);
00580 #ifndef NDEBUG
00581 Token const* bos = root->corpus->sntStart(A.sid);
00582 Token const* eos = root->corpus->sntEnd(A.sid);
00583 #endif
00584 if (p < 0) p += lower.size();
00585
00586 while (p-- > 0)
00587 {
00588 t = next(t);
00589
00590 assert(t >= bos && t < eos);
00591 }
00592 return t;
00593 }
00594
00595
00596
00597 template<typename Token>
00598 size_t
00599 TSA_tree_iterator<Token>::
00600 sntCnt(int p) const
00601 {
00602 if (p < 0)
00603 p = lower.size()+p;
00604 assert(p>=0);
00605 if (lower.size() == 0) return root->getCorpusSize();
00606 return reinterpret_cast<TSA<Token> const* const>(root)->sntCnt(lower[p],upper[p]);
00607 }
00608
00609
00610
00611 template<typename Token>
00612 size_t
00613 TSA_tree_iterator<Token>::
00614 rawCnt(int p) const
00615 {
00616 if (p < 0) p += lower.size();
00617 assert(p>=0);
00618 if (lower.size() == 0) return root->getCorpusSize();
00619 return root->rawCnt(lower[p],upper[p]);
00620 }
00621
00622
00623
00624 template<typename Token>
00625 count_type
00626 TSA_tree_iterator<Token>::
00627 fillBitSet(boost::dynamic_bitset<uint64_t>& bitset) const
00628 {
00629 return markSentences(bitset);
00630 }
00631
00632
00633
00634 template<typename Token>
00635 count_type
00636 TSA_tree_iterator<Token>::
00637 markSentences(boost::dynamic_bitset<uint64_t>& bitset) const
00638 {
00639 assert(root && root->corpus);
00640 bitset.resize(root->corpus->size());
00641 bitset.reset();
00642 if (lower.size()==0) return 0;
00643 char const* lo = lower.back();
00644 char const* up = upper.back();
00645 char const* p = lo;
00646 id_type sid;
00647 ushort off;
00648 count_type wcount=0;
00649 while (p < up)
00650 {
00651 p = root->readSid(p,up,sid);
00652 p = root->readOffset(p,up,off);
00653 bitset.set(sid);
00654 wcount++;
00655 }
00656 return wcount;
00657 }
00658
00659
00660
00661 template<typename Token>
00662 count_type
00663 TSA_tree_iterator<Token>::
00664 markOccurrences(boost::dynamic_bitset<uint64_t>& bitset, bool markOnlyStartPosition) const
00665 {
00666 assert(root && root->corpus);
00667 if (bitset.size() != root->corpus->numTokens())
00668 bitset.resize(root->corpus->numTokens());
00669 bitset.reset();
00670 if (lower.size()==0) return 0;
00671 char const* lo = lower.back();
00672 char const* up = upper.back();
00673 return root->markOccurrences(lo,up,lower.size(),bitset,markOnlyStartPosition);
00674 }
00675
00676
00677 template<typename Token>
00678 count_type
00679 TSA_tree_iterator<Token>::
00680 markOccurrences(std::vector<ushort>& dest) const
00681 {
00682 assert(root && root->corpus);
00683 assert(dest.size() == root->corpus->numTokens());
00684 if (lower.size()==0) return 0;
00685 char const* lo = lower.back();
00686 char const* up = upper.back();
00687 char const* p = lo;
00688 id_type sid;
00689 ushort off;
00690 count_type wcount=0;
00691 Token const* crpStart = root->corpus->sntStart(0);
00692 while (p < up)
00693 {
00694 p = root->readSid(p,up,sid);
00695 p = root->readOffset(p,up,off);
00696 Token const* t = root->corpus->sntStart(sid)+off;
00697 for (size_t i = 1; i < lower.size(); ++i, t = t->next());
00698 dest[t-crpStart]++;
00699 wcount++;
00700 }
00701 return wcount;
00702 }
00703
00704
00705
00706
00707 template<typename Token>
00708 count_type
00709 TSA_tree_iterator<Token>::
00710 markEndOfSequence(Token const* start, Token const* stop,
00711 boost::dynamic_bitset<uint64_t>& dest) const
00712 {
00713 count_type matchCount=0;
00714 Token const* a = getToken(0);
00715 for (Token const* x = start; x < stop; ++x)
00716 {
00717 if (*x != *a) continue;
00718 Token const* y = x;
00719 Token const* b = a;
00720 size_t i;
00721 for (i = 0; *b==*y && ++i < this->size();)
00722 {
00723 b = b->next();
00724 y = y->next();
00725 if (y < start || y >= stop) break;
00726 }
00727 if (i == this->size())
00728 {
00729 dest.set(y-start);
00730 ++matchCount;
00731 }
00732 }
00733 return matchCount;
00734 }
00735
00736
00737
00738
00739 template<typename Token>
00740 count_type
00741 TSA_tree_iterator<Token>::
00742 markSequence(Token const* start,
00743 Token const* stop,
00744 bitvector& dest) const
00745 {
00746 count_type numMatches=0;
00747 Token const* a = getToken(0);
00748 for (Token const* x = start; x < stop; ++x)
00749 {
00750 if (*x != *a) continue;
00751 Token const* y = x;
00752 Token const* b = a;
00753 size_t i;
00754 for (i = 0; *b==*y && i++ < this->size();)
00755 {
00756 dest.set(y-start);
00757 b = b->next();
00758 y = y->next();
00759 if (y < start || y >= stop) break;
00760 }
00761 if (i == this->size()) ++numMatches;
00762 }
00763 return numMatches;
00764 }
00765
00766
00767 template<typename Token>
00768 ::uint64_t
00769 TSA_tree_iterator<Token>::
00770 getSequenceId() const
00771 {
00772 if (this->size() == 0) return 0;
00773 char const* p = this->lower_bound(-1);
00774 typename Token::ArrayEntry I;
00775 root->readEntry(p,I);
00776 return (::uint64_t(I.sid)<<32)+(I.offset<<16)+this->size();
00777 }
00778
00779 template<typename Token>
00780 std::string
00781 TSA_tree_iterator<Token>::
00782 str(TokenIndex const* V, int start, int stop) const
00783 {
00784 if (this->size()==0) return "";
00785 if (start < 0) start = this->size()+start;
00786 if (stop <= 0) stop = this->size()+stop;
00787 assert(start>=0 && start < int(this->size()));
00788 assert(stop > 0 && stop <= int(this->size()));
00789 Token const* x = this->getToken(0);
00790 std::ostringstream buf;
00791 for (int i = start; i < stop; ++i, x = x->next())
00792 {
00793 assert(x);
00794 buf << (i > start ? " " : "");
00795 if (V) buf << (*V)[x->id()];
00796 else buf << x->id();
00797 }
00798 return buf.str();
00799 }
00800
00801 #if 0
00802 template<typename Token>
00803 string
00804 TSA_tree_iterator<Token>::
00805 str(Vocab const& V, int start, int stop) const
00806 {
00807 if (this->size()==0) return "";
00808 if (start < 0) start = this->size()+start;
00809 if (stop <= 0) stop = this->size()+stop;
00810 assert(start>=0 && start < int(this->size()));
00811 assert(stop > 0 && stop <= int(this->size()));
00812 Token const* x = this->getToken(0);
00813 std::ostringstream buf;
00814 for (int i = start; i < stop; ++i, x = x->next())
00815 {
00816 assert(x);
00817 buf << (i > start ? " " : "");
00818 buf << V[x->id()].str;
00819 }
00820 return buf.str();
00821 }
00822 #endif
00823
00825 template<typename Token>
00826 bool
00827 TSA_tree_iterator<Token>::
00828 match(Token const* start, Token const* stop) const
00829 {
00830 Token const* a = getToken(0);
00831 for (Token const* t = start; t < stop; ++t)
00832 {
00833 if (*t != *a) continue;
00834 Token const* b = a;
00835 Token const* y = t;
00836 size_t i;
00837 for (i = 1; i < lower.size(); ++i)
00838 {
00839 y = y->next();
00840 if (y < start || y >= stop) break;
00841 b = b->next();
00842 if (*b != *y) break;
00843 }
00844 if (i == lower.size()) return true;
00845 }
00846 return false;
00847 }
00848
00850 template<typename Token>
00851 bool
00852 TSA_tree_iterator<Token>::
00853 match(id_type sid) const
00854 {
00855 return match(root->corpus->sntStart(sid),root->corpus->sntEnd(sid));
00856 }
00857
00859
00860
00861 template<typename Token>
00862 void
00863 TSA_tree_iterator<Token>::
00864 tfAndRoot(bitvector const& ref,
00865 bitvector const& snt,
00866 bitvector& dest) const
00867 {
00868 tsa::ArrayEntry I(lower.back());
00869 Token const* crpStart = root->corpus->sntStart(0);
00870 do
00871 {
00872 root->readEntry(I.next,I);
00873 if (!snt.test(I.sid)) continue;
00874
00875 Token const* t = root->corpus->getToken(I)->next(lower.size()-1);
00876 assert(t >= crpStart);
00877 size_t p = t-crpStart;
00878 if (ref.test(p))
00879 dest.set(p);
00880 } while (I.next != upper.back());
00881 }
00882
00883
00884
00885 template<typename Token>
00886 bitvector&
00887 TSA_tree_iterator<Token>::
00888 filterSentences(bitvector& bv) const
00889 {
00890 float aveSntLen = root->corpus->numTokens()/root->corpus->size();
00891 size_t ANDcost = bv.size()/8;
00892 float aveEntrySize = ((root->endArray-root->startArray)
00893 /root->corpus->numTokens());
00894 if (arrayByteSpanSize()+ANDcost < aveEntrySize*aveSntLen*bv.count())
00895 {
00896 bitvector tmp(bv.size());
00897 markSentences(tmp);
00898 bv &= tmp;
00899 }
00900 else
00901 {
00902 for (size_t i = bv.find_first(); i < bv.size(); i = bv.find_next(i))
00903 if (!match(i)) bv.reset(i);
00904 }
00905 return bv;
00906 }
00907
00909 template<typename Token>
00910 SPTR<std::vector<typename ttrack::Position> >
00911 TSA_tree_iterator<Token>::
00912 randomSample(int level, size_t N) const
00913 {
00914 if (level < 0) level += lower.size();
00915 assert(level >=0);
00916
00917 SPTR<std::vector<typename ttrack::Position> >
00918 ret(new std::vector<typename ttrack::Position>(N));
00919
00920 size_t m=0;
00921 typename Token::ArrayEntry I(lower.at(level));
00922
00923 char const* stop = upper.at(level);
00924 while (m < N && (I.next) < stop)
00925 {
00926 root->readEntry(I.next,I);
00927
00928
00929 const double t = (stop - I.pos)/root->aveIndexEntrySize();
00930 const double r = util::rand_excl(t);
00931 if (r < N-m)
00932 {
00933 ret->at(m).offset = I.offset;
00934 ret->at(m++).sid = I.sid;
00935 }
00936 }
00937 ret->resize(m);
00938
00939 return ret;
00940 }
00941
00942 }
00943 #endif