00001 #include "DynSuffixArray.h"
00002 #include "util/random.hh"
00003
00004 #include <iostream>
00005 #include <boost/foreach.hpp>
00006
00007 using namespace std;
00008
00009 namespace Moses
00010 {
00011
00012 DynSuffixArray::DynSuffixArray()
00013 {
00014 m_SA = new vuint_t();
00015 m_ISA = new vuint_t();
00016 m_F = new vuint_t();
00017 m_L = new vuint_t();
00018 std::cerr << "DYNAMIC SUFFIX ARRAY CLASS INSTANTIATED" << std::endl;
00019 }
00020
00021 DynSuffixArray::~DynSuffixArray()
00022 {
00023 delete m_SA;
00024 delete m_ISA;
00025 delete m_F;
00026 delete m_L;
00027 }
00028
00029 DynSuffixArray::DynSuffixArray(vuint_t* crp)
00030 {
00031
00032 m_corpus = crp;
00033 int size = m_corpus->size();
00034 int* tmpArr = new int[size];
00035 for(int i=0 ; i < size; ++i) tmpArr[i] = i;
00036
00037 Qsort(tmpArr, 0, size-1);
00038
00039 m_SA = new vuint_t(tmpArr, tmpArr + size);
00040
00041
00042 delete[] tmpArr;
00043 std::cerr << "DYNAMIC SUFFIX ARRAY CLASS INSTANTIATED WITH SIZE " << size << std::endl;
00044 BuildAuxArrays();
00045
00046 }
00047
00048 void DynSuffixArray::BuildAuxArrays()
00049 {
00050 int size = m_SA->size();
00051 m_ISA = new vuint_t(size);
00052 m_F = new vuint_t(size);
00053 m_L = new vuint_t(size);
00054
00055 for(int i=0; i < size; ++i) {
00056 m_ISA->at(m_SA->at(i)) = i;
00057
00058 (*m_F)[i] = (*m_corpus)[m_SA->at(i)];
00059 (*m_L)[i] = (*m_corpus)[(m_SA->at(i) == 0 ? size-1 : m_SA->at(i)-1)];
00060 }
00061 }
00062
00063 int DynSuffixArray::Rank(unsigned word, unsigned idx)
00064 {
00065
00066
00067 int r(0);
00068 for(unsigned i=0; i < idx; ++i)
00069 if(m_L->at(i) == word) ++r;
00070 return r;
00071 }
00072
00073
00074
00075 int DynSuffixArray::F_firstIdx(unsigned word)
00076 {
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086 int low = std::lower_bound(m_F->begin(), m_F->end(), word) - m_F->begin();
00087
00088 if((size_t)low >= m_F->size())
00089 return -1;
00090 else
00091 return low;
00092 }
00093
00094
00095 int DynSuffixArray::LastFirstFunc(unsigned L_idx)
00096 {
00097 int fIdx(-1);
00098
00099 unsigned word = m_L->at(L_idx);
00100 if((fIdx = F_firstIdx(word)) != -1) {
00101
00102 fIdx += Rank(word, L_idx);
00103 }
00104 return fIdx;
00105 }
00106
00107 void DynSuffixArray::Insert(vuint_t* newSent, unsigned newIndex)
00108 {
00109
00110
00111
00112
00113
00114
00115 UTIL_THROW_IF2(newIndex > m_SA->size(), "Error");
00116 int k(-1), kprime(-1);
00117 k = (newIndex < m_SA->size() ? m_ISA->at(newIndex) : m_ISA->at(0));
00118 int true_pos = LastFirstFunc(k);
00119 int Ltmp = m_L->at(k);
00120 m_L->at(k) = newSent->at(newSent->size()-1);
00121 for(int j = newSent->size()-1; j > -1; --j) {
00122 kprime = LastFirstFunc(k);
00123
00124
00125 kprime = (kprime > 0 ? kprime : m_SA->size());
00126 true_pos += (kprime <= true_pos ? 1 : 0);
00127
00128 m_F->insert(m_F->begin() + kprime, newSent->at(j));
00129 int theLWord = (j == 0 ? Ltmp : newSent->at(j-1));
00130
00131 m_L->insert(m_L->begin() + kprime, theLWord);
00132 for (vuint_t::iterator itr = m_SA->begin(); itr != m_SA->end(); ++itr) {
00133 if(*itr >= newIndex) ++(*itr);
00134 }
00135 m_SA->insert(m_SA->begin() + kprime, newIndex);
00136 for (vuint_t::iterator itr = m_ISA->begin(); itr != m_ISA->end(); ++itr) {
00137 if((int)*itr >= kprime) ++(*itr);
00138 }
00139
00140 m_ISA->insert(m_ISA->begin() + newIndex, kprime);
00141 k = kprime;
00142
00143 }
00144
00145 Reorder(true_pos, LastFirstFunc(kprime));
00146 }
00147
00148 void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
00149 {
00150 set<pair<unsigned, unsigned> > seen;
00151 while(j != jprime) {
00152
00153
00154 bool seenit = seen.insert(std::make_pair(j, jprime)).second;
00155 if(seenit) {
00156 for(size_t i=1; i < m_SA->size(); ++i) {
00157 if(m_corpus->at(m_SA->at(i)) < m_corpus->at(m_SA->at(i-1))) {
00158 cerr << "PROBLEM WITH SUFFIX ARRAY REORDERING. EXITING...\n";
00159 exit(1);
00160 }
00161 }
00162 return;
00163 }
00164
00165 int isaIdx(-1);
00166 int new_j = LastFirstFunc(j);
00167 UTIL_THROW_IF2(j > jprime, "Error");
00168
00169 m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
00170 m_L->erase(m_L->begin() + j);
00171 m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
00172 m_SA->erase(m_SA->begin() + j);
00173
00174 for(size_t i = 0; i < m_ISA->size(); ++i) {
00175 if((m_ISA->at(i) == j) && (isaIdx == -1))
00176 isaIdx = i;
00177 if((m_ISA->at(i) > j) && (m_ISA->at(i) <= jprime)) --(*m_ISA)[i];
00178 }
00179
00180
00181 m_ISA->at(isaIdx) = jprime;
00182 j = new_j;
00183 jprime = LastFirstFunc(jprime);
00184 }
00185
00186 }
00187
00188 void DynSuffixArray::Delete(unsigned index, unsigned num2del)
00189 {
00190 int ltmp = m_L->at(m_ISA->at(index));
00191 int true_pos = LastFirstFunc(m_ISA->at(index));
00192 for(size_t q = 0; q < num2del; ++q) {
00193 int row = m_ISA->at(index);
00194
00195
00196 true_pos -= (row <= true_pos ? 1 : 0);
00197 m_L->erase(m_L->begin() + row);
00198 m_F->erase(m_F->begin() + row);
00199
00200 m_ISA->erase(m_ISA->begin() + index);
00201 for (vuint_t::iterator itr = m_ISA->begin(); itr != m_ISA->end(); ++itr) {
00202 if((int)*itr > row) --(*itr);
00203 }
00204
00205 m_SA->erase(m_SA->begin() + row);
00206 for (vuint_t::iterator itr = m_SA->begin(); itr != m_SA->end(); ++itr) {
00207 if(*itr > index) --(*itr);
00208 }
00209 }
00210 m_L->at(m_ISA->at(index))= ltmp;
00211 Reorder(LastFirstFunc(m_ISA->at(index)), true_pos);
00212
00213 }
00214
00215 void DynSuffixArray::Substitute(vuint_t* , unsigned )
00216 {
00217 std::cerr << "NEEDS TO IMPLEMENT SUBSITITUTE FACTOR\n";
00218 return;
00219 }
00220
00221 ComparePosition::
00222 ComparePosition(vuint_t const& crp, vuint_t const& sfa)
00223 : m_crp(crp), m_sfa(sfa) { }
00224
00225 bool
00226 ComparePosition::
00227 operator()(unsigned const& i, vector<wordID_t> const& phrase) const
00228 {
00229 unsigned const* x = &m_crp.at(i);
00230 unsigned const* e = &m_crp.back();
00231 size_t k = 0;
00232 for (; k < phrase.size() && x < e; ++k, ++x)
00233 if (*x != phrase[k]) return *x < phrase[k];
00234 return (x == e && k < phrase.size());
00235 }
00236
00237 bool
00238 ComparePosition::
00239 operator()(vector<wordID_t> const& phrase, unsigned const& i) const
00240 {
00241 unsigned const* x = &m_crp.at(i);
00242 unsigned const* e = &m_crp.back();
00243 size_t k = 0;
00244 for (; k < phrase.size() && x < e; ++k, ++x)
00245 if (*x != phrase[k]) return phrase[k] < *x;
00246 return false;
00247 }
00248
00249 bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
00250 {
00251
00252 pair<vuint_t::iterator,vuint_t::iterator> bounds;
00253 indices->clear();
00254 size_t phrasesize = phrase->size();
00255
00256 bounds = std::equal_range(m_F->begin(), m_F->end(), phrase->at(0));
00257
00258 size_t lwrBnd = size_t(bounds.first - m_F->begin());
00259 size_t uprBnd = size_t(bounds.second - m_F->begin());
00260
00261
00262 if(uprBnd - lwrBnd == 0) return false;
00263 if(phrasesize == 1) {
00264 for(size_t i=lwrBnd; i < uprBnd; ++i) {
00265 indices->push_back(m_SA->at(i));
00266 }
00267 return (indices->size() > 0);
00268 }
00269
00270 for(size_t i = lwrBnd; i < uprBnd; ++i) {
00271 size_t crpIdx = m_SA->at(i);
00272 if((crpIdx + phrasesize) > m_corpus->size()) continue;
00273 for(size_t pos = 1; pos < phrasesize; ++pos) {
00274 if(m_corpus->at(crpIdx + pos) != phrase->at(pos)) {
00275 if(indices->size() > 0) i = uprBnd;
00276 break;
00277 } else if(pos == phrasesize-1) {
00278 indices->push_back(crpIdx + pos);
00279 }
00280 }
00281 }
00282
00283 return (indices->size() > 0);
00284 }
00285
00286 size_t
00287 DynSuffixArray::
00288 GetCount(vuint_t const& phrase) const
00289 {
00290 ComparePosition cmp(*m_corpus, *m_SA);
00291 vuint_t::const_iterator lb = lower_bound(m_SA->begin(), m_SA->end(), phrase, cmp);
00292 vuint_t::const_iterator ub = upper_bound(m_SA->begin(), m_SA->end(), phrase, cmp);
00293 return ub-lb;
00294 }
00295
00296 void DynSuffixArray::Save(FILE* fout)
00297 {
00298 fWriteVector(fout, *m_SA);
00299 }
00300
00301 void DynSuffixArray::Load(FILE* fin)
00302 {
00303 fReadVector(fin, *m_SA);
00304 }
00305
00306 int DynSuffixArray::Compare(int pos1, int pos2, int max)
00307 {
00308 for (size_t i = 0; i < (unsigned)max; ++i) {
00309 if((pos1 + i < m_corpus->size()) && (pos2 + i >= m_corpus->size()))
00310 return 1;
00311 if((pos2 + i < m_corpus->size()) && (pos1 + i >= m_corpus->size()))
00312 return -1;
00313
00314 int diff = m_corpus->at(pos1+i) - m_corpus->at(pos2+i);
00315 if(diff != 0) return diff;
00316 }
00317 return 0;
00318 }
00319
00320 namespace
00321 {
00323 inline void swap_ints(int array[], int one, int other)
00324 {
00325 const int tmp = array[one];
00326 array[one] = array[other];
00327 array[other] = tmp;
00328 }
00329 }
00330
00331 void DynSuffixArray::Qsort(int* array, int begin, int end)
00332 {
00333 if(end > begin) {
00334 int index = util::rand_incl(begin, end);
00335 {
00336 const int pivot = array[index];
00337 swap_ints(array, index, end);
00338 for(int i=index=begin; i < end; ++i) {
00339 if (Compare(array[i], pivot, 20) <= 0) {
00340 swap_ints(array, index, i);
00341 index++;
00342 }
00343 }
00344 swap_ints(array, index, end);
00345 }
00346 Qsort(array, begin, index - 1);
00347 Qsort(array, index + 1, end);
00348 }
00349 }
00350
00351
00352
00353 }