00001
00002 #ifndef INC_PERFECTHASH_H
00003 #define INC_PERFECTHASH_H
00004
00005 #include <map>
00006 #include <stdint.h>
00007 #include "hash.h"
00008 #include "RandLMFilter.h"
00009 #include "quantizer.h"
00010
00015 using randlm::Filter;
00016 using randlm::BitFilter;
00017 typedef std::map<string, count_t> hpDict_t;
00018 typedef hpDict_t::iterator hpdEntry_t;
00019 static count_t collisions_ = 0;
00020
00021
00022 template<typename T>
00023 class PerfectHash {
00024 public:
00025 PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase);
00026 PerfectHash(FileHandler* fin) {
00027 CHECK(fin != 0);
00028 }
00029 virtual ~PerfectHash();
00030 void analyze();
00031 count_t hpDictMemUse();
00032 count_t bucketsMemUse();
00033 protected:
00034 Filter<T>* filter_;
00035 Filter<T>* values_;
00036 hpDict_t dict_;
00037 uint64_t cells_;
00038 count_t hitMask_;
00039 int totBuckets_;
00040 uint8_t bucketRange_;
00041 uint8_t* idxTracker_;
00042 uint64_t insert(const wordID_t* IDs, const int len, const count_t value);
00043 bool update(const wordID_t* IDs, const int len, const count_t value,
00044 hpdEntry_t& hpdAddr, uint64_t& filterIdx);
00045 bool update2(const wordID_t* IDs, const int len, const count_t value,
00046 hpdEntry_t& hpdAddr, uint64_t& filterIdx);
00047 int query(const wordID_t* IDs, const int len,
00048 hpdEntry_t& hpdAddr, uint64_t& filterIdx);
00049 virtual void remove(const wordID_t* IDs, const int len);
00050 void remove(uint64_t index);
00051 void save(FileHandler* fout);
00052 void load(FileHandler* fin);
00053 virtual void markQueried(const uint64_t&)=0;
00054
00055 virtual void markQueried(hpdEntry_t&)=0;
00056 private:
00057 T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
00058 string hpDictKeyValue(const wordID_t* IDs, const int len);
00059 uint64_t memBound_;
00060 uint16_t cellWidth_;
00061 UnivHash_linear<count_t>* bucketHash_;
00062 UnivHash_linear<T>* fingerHash_;
00063 LogQtizer* qtizer_;
00064 };
00065
00066 template<typename T>
00067 PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
00068 float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
00069 cellWidth_(width) {
00070 bucketRange_ = static_cast<uint8_t>(bucketRange);
00071 if(bucketRange > 255) {
00072 cerr << "ERROR: Max bucket range is > 2^8\n";
00073 exit(1);
00074 }
00075 qtizer_ = new LogQtizer(qBase);
00076 int valBits = (int)ceil(log2((float)qtizer_->maxcode()));
00077 cerr << "BITS FOR VALUES ARRAY = " << valBits << endl;
00078 uint64_t totalBits = memBound_ << 3;
00079 cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits));
00080 cells_ += (cells_ % bucketRange_);
00081 totBuckets_ = (cells_ / bucketRange_) - 1;
00082 filter_ = new Filter<T>(cells_, cellWidth_);
00083 values_ = new Filter<T>(cells_, valBits);
00084 idxTracker_ = new uint8_t[totBuckets_];
00085 for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0;
00086
00087 bucketHash_ = new UnivHash_linear<count_t>(totBuckets_, 1, PRIME);
00088 fingerHash_ = new UnivHash_linear<T>(pow(2.0f, cellWidth_), MAX_HASH_FUNCS, PRIME);
00089 }
00090
00091 template<typename T>
00092 PerfectHash<T>::~PerfectHash() {
00093 delete[] idxTracker_;
00094 delete filter_;
00095 filter_ = NULL;
00096 delete fingerHash_;
00097 delete bucketHash_;
00098 delete qtizer_;
00099 delete values_;
00100 }
00101
00102 template<typename T>
00103 uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
00104 const count_t value) {
00105 count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
00106 if(idxTracker_[bucket] < (int)bucketRange_) {
00107
00108 T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
00109 uint64_t emptyidx = cells_ + 1;
00110 uint64_t index = bucket * bucketRange_,
00111 lastrow = index + bucketRange_;
00112 while(index < lastrow) {
00113 T filterVal = filter_->read(index);
00114 if((filterVal == 0) && (emptyidx == cells_ + 1)) {
00115 emptyidx = index;
00116 }
00117 else if(filterVal == fp) {
00118 ++collisions_;
00119 dict_[hpDictKeyValue(IDs, len)] = value;
00120 return cells_ + 1;
00121 }
00122 ++index;
00123 }
00124 CHECK((emptyidx < index) && (filter_->read(emptyidx) == 0));
00125 T code = (T)qtizer_->code(value);
00126 filter_->write(emptyidx, fp);
00127 values_->write(emptyidx, code);
00128 ++idxTracker_[bucket];
00129 return emptyidx;
00130 }
00131 else {
00132 dict_[hpDictKeyValue(IDs, len)] = value;
00133 return cells_ + 1;
00134 }
00135 }
00136
00137 template<typename T>
00138 bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
00139 const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
00140
00141 filterIdx = cells_ + 1;
00142 string skey = hpDictKeyValue(IDs, len);
00143 if((hpdAddr = dict_.find(skey)) != dict_.end()) {
00144 hpdAddr->second = value;
00145 return true;
00146 }
00147
00148
00149 count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
00150
00151 T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
00152 uint64_t index = bucket * bucketRange_,
00153 lastrow = index + bucketRange_;
00154 while(index < lastrow) {
00155 T filterVal = filter_->read(index);
00156 if(filterVal == fp) {
00157 values_->write(index, (T)qtizer_->code(value));
00158 filterIdx = index;
00159 return true;
00160 }
00161 ++index;
00162 }
00163
00164 return false;
00165 }
00166
00167 template<typename T>
00168 int PerfectHash<T>::query(const wordID_t* IDs, const int len,
00169 hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
00170
00171 string skey = hpDictKeyValue(IDs, len);
00172 if((hpdAddr = dict_.find(skey)) != dict_.end()) {
00173 filterIdx = cells_ + 1;
00174 return(hpdAddr->second);
00175 }
00176 else {
00177
00178
00179 count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
00180
00181 T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
00182
00183 uint64_t index = bucket * bucketRange_,
00184 lastrow = index + bucketRange_;
00185 for(; index < lastrow; ++index) {
00186 if(filter_->read(index) == fp) {
00187
00188
00189 filterIdx = index;
00190 hpdAddr = dict_.end();
00191 return (int)qtizer_->value(values_->read(index));
00192 }
00193 }
00194 }
00195 return -1;
00196 }
00197
00198 template<typename T>
00199 void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
00200
00201 string skey = hpDictKeyValue(IDs, len);
00202 if(dict_.find(skey) != dict_.end())
00203 dict_.erase(skey);
00204 else {
00205
00206
00207 count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
00208
00209 T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
00210
00211 uint64_t index = bucket * bucketRange_,
00212 lastrow = index + bucketRange_;
00213 for(; index < lastrow; ++index) {
00214 if(filter_->read(index) == fp) {
00215 filter_->write(index, 0);
00216 values_->write(index, 0);
00217 --idxTracker_[bucket];
00218 break;
00219 }
00220 }
00221 }
00222 }
00223
00224 template<typename T>
00225 void PerfectHash<T>::remove(uint64_t index) {
00226 CHECK(index < cells_);
00227 CHECK(filter_->read(index) != 0);
00228 filter_->write(index, 0);
00229 values_->write(index, 0);
00230
00231 count_t bucket = index / bucketRange_;
00232 --idxTracker_[bucket];
00233 }
00234
00235 template<typename T>
00236 T PerfectHash<T>::nonZeroSignature(const wordID_t* IDs, const int len,
00237 count_t bucket) {
00238 count_t h = bucket;
00239 T fingerprint(0);
00240 do {
00241 fingerprint = fingerHash_->hash(IDs, len, h);
00242 h += (h < fingerHash_->size() - 1 ? 1 : -h);
00243 } while((fingerprint == 0) && (h != bucket));
00244 if(fingerprint == 0)
00245 cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl;
00246 return fingerprint;
00247 }
00248
00249 template<typename T>
00250 string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
00251 string skey(" ");
00252 for(int i = 0; i < len; ++i)
00253 skey += Utils::IntToStr(IDs[i]) + "¬";
00254 Utils::trim(skey);
00255 return skey;
00256 }
00257
00258 template<typename T>
00259 count_t PerfectHash<T>::hpDictMemUse() {
00260
00261 return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20;
00262 }
00263
00264 template<typename T>
00265 count_t PerfectHash<T>::bucketsMemUse() {
00266
00267 return (count_t) (filter_->size() + values_->size());
00268 }
00269
00270 template<typename T>
00271 void PerfectHash<T>::save(FileHandler* fout) {
00272 CHECK(fout != 0);
00273 cerr << "\tSaving perfect hash parameters...\n";
00274 fout->write((char*)&hitMask_, sizeof(hitMask_));
00275 fout->write((char*)&memBound_, sizeof(memBound_));
00276 fout->write((char*)&cellWidth_, sizeof(cellWidth_));
00277 fout->write((char*)&cells_, sizeof(cells_));
00278 fout->write((char*)&totBuckets_, sizeof(totBuckets_));
00279 fout->write((char*)&bucketRange_, sizeof(bucketRange_));
00280 fout->write((char*)idxTracker_, totBuckets_ * sizeof(idxTracker_[0]));
00281 qtizer_->save(fout);
00282 cerr << "\tSaving hash functions...\n";
00283 fingerHash_->save(fout);
00284 bucketHash_->save(fout);
00285 cerr << "\tSaving bit filter...\n";
00286 filter_->save(fout);
00287 values_->save(fout);
00288 cerr << "\tSaving high performance dictionary...\n";
00289 count_t size = dict_.size();
00290 fout->write((char*)&size, sizeof(count_t));
00291 *fout << endl;
00292 iterate(dict_, t)
00293 *fout << t->first << "\t" << t->second << "\n";
00294 }
00295
00296 template<typename T>
00297 void PerfectHash<T>::load(FileHandler* fin) {
00298 CHECK(fin != 0);
00299 cerr << "\tLoading perfect hash parameters...\n";
00300 fin->read((char*)&hitMask_, sizeof(hitMask_));
00301 fin->read((char*)&memBound_, sizeof(memBound_));
00302 fin->read((char*)&cellWidth_, sizeof(cellWidth_));
00303 fin->read((char*)&cells_, sizeof(cells_));
00304 fin->read((char*)&totBuckets_, sizeof(totBuckets_));
00305 fin->read((char*)&bucketRange_, sizeof(bucketRange_));
00306 idxTracker_ = new uint8_t[totBuckets_];
00307 fin->read((char*)idxTracker_, totBuckets_ * sizeof(idxTracker_[0]));
00308 qtizer_ = new LogQtizer(fin);
00309 cerr << "\tLoading hash functions...\n";
00310 fingerHash_ = new UnivHash_linear<T>(fin);
00311 bucketHash_ = new UnivHash_linear<count_t>(fin);
00312 cerr << "\tLoading bit filter...\n";
00313 filter_ = new Filter<T>(fin);
00314 values_ = new Filter<T>(fin);
00315 cerr << "\tLoading HPD...\n";
00316 count_t size = 0;
00317 fin->read((char*)&size, sizeof(count_t));
00318 fin->ignore(256, '\n');
00319 string line;
00320 hpDict_t::key_type key;
00321 hpDict_t::mapped_type val;
00322 for(count_t i=0; i < size; ++i) {
00323 getline(*fin, line);
00324 Utils::trim(line);
00325 std::istringstream ss(line.c_str());
00326 ss >> key, ss >> val;
00327 dict_[key] = val;
00328 }
00329 cerr << "\tHPD size=" << dict_.size() << endl;
00330 cerr << "Finished loading ORLM." << endl;
00331 }
00332
00333 template<typename T>
00334 void PerfectHash<T>::analyze() {
00335 cerr << "Analyzing Dynamic Bloomier Filter...\n";
00336
00337 uint8_t* bucketCnt = new uint8_t[totBuckets_];
00338 unsigned largestBucket = 0, totalCellsSet = 0,
00339 smallestBucket = bucketRange_, totalZeroes = 0;
00340 int curBucket = -1, fullBuckets(0);
00341 for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0;
00342 for(uint64_t i =0; i < cells_; ++i) {
00343 if(i % bucketRange_ == 0) ++curBucket;
00344 if(filter_->read(i) != 0) {
00345 ++bucketCnt[curBucket];
00346 ++totalCellsSet;
00347 }
00348 else ++totalZeroes;
00349 }
00350 count_t bi = 0, si = 0;
00351 for(int i = 0; i < totBuckets_; ++i) {
00352 if(bucketCnt[i] > largestBucket) {
00353 largestBucket = bucketCnt[i];
00354 bi = i;
00355 }
00356 else if(bucketCnt[i] < smallestBucket) {
00357 smallestBucket = bucketCnt[i];
00358 si = i;
00359 }
00360 }
00361 count_t trackerCells(0);
00362 for(int i = 0; i < totBuckets_; i++) {
00363 trackerCells += idxTracker_[i];
00364 if(idxTracker_[i] == bucketRange_)
00365 ++fullBuckets;
00366 }
00367 for(int i = 0; i < totBuckets_; ++i) {
00368 if(bucketCnt[i] != idxTracker_[i])
00369 cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
00370 "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
00371 }
00372 cerr << "total cells= " << cells_ << endl;
00373 cerr << "total buckets= " << totBuckets_ << endl;
00374 cerr << "bucket range= " << (int)bucketRange_ << endl;
00375 cerr << "fingerprint bits= " << cellWidth_ << endl;
00376 cerr << "total cells set= " << totalCellsSet;
00377 cerr << " (idxTracker set = " << trackerCells << ")" << endl;
00378 cerr << "total zeroes=" << totalZeroes;
00379 cerr << " (idxTracker zeros = " << cells_ - trackerCells << ")" << endl;
00380 cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl;
00381 cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl;
00382 cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] <<
00383 " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
00384 cerr << "total buckets full = " << fullBuckets << endl;
00385 cerr << "total collision errors= " << collisions_ << endl;
00386 cerr << "high performance dictionary size= " << dict_.size() << endl;
00387 cerr << "high performance dictionary MBs= " << hpDictMemUse() << endl;
00388 cerr << "filter MBs= " << filter_->size() << endl;
00389 cerr << "values MBs= " << values_->size() << endl;
00390 delete[] bucketCnt;
00391 }
00392
00393 template<typename T>
00394 bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
00395 const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
00396
00397 filterIdx = cells_ + 1;
00398 string skey = hpDictKeyValue(IDs, len);
00399 if((hpdAddr = dict_.find(skey)) != dict_.end()) {
00400 hpdAddr->second += value;
00401 return true;
00402 }
00403
00404
00405 count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
00406
00407 T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
00408 uint64_t index = bucket * bucketRange_,
00409 lastrow = index + bucketRange_;
00410 while(index < lastrow) {
00411 T filterVal = filter_->read(index);
00412 if(filterVal == fp) {
00413 int oldval = (int)qtizer_->value(values_->read(index));
00414 values_->write(index, (T)qtizer_->code(oldval + value));
00415 filterIdx = index;
00416 return true;
00417 }
00418 ++index;
00419 }
00420
00421 insert(IDs, len, value);
00422 return false;
00423 }
00424
00425 #endif
00426