#include "huffmanish.hh"

#include <cstring>
#include <stdexcept>
00002
00003 Huffman::Huffman (const char * filepath)
00004 {
00005
00006 util::FilePiece filein(filepath);
00007
00008
00009 uniq_lines = 0;
00010
00011 line_text prev_line;
00012 int num_lines = 0 ;
00013
00014 while (true) {
00015 line_text new_line;
00016
00017 num_lines++;
00018
00019 try {
00020
00021 new_line = splitLine(filein.ReadLine());
00022 count_elements(new_line);
00023
00024 } catch (util::EndOfFileException e) {
00025 std::cerr << "Unique entries counted: ";
00026 break;
00027 }
00028
00029 if (new_line.source_phrase == prev_line.source_phrase) {
00030 continue;
00031 } else {
00032 uniq_lines++;
00033 prev_line = new_line;
00034 }
00035 }
00036
00037 std::cerr << uniq_lines << std::endl;
00038 }
00039
00040 void Huffman::count_elements(line_text linein)
00041 {
00042
00043 util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' '));
00044 while (it) {
00045
00046 std::map<std::string, unsigned int>::iterator mapiter;
00047 mapiter = target_phrase_words.find(it->as_string());
00048
00049 if (mapiter != target_phrase_words.end()) {
00050
00051 mapiter->second++;
00052 } else {
00053
00054 target_phrase_words.insert(std::pair<std::string, unsigned int>(it->as_string(), 1));
00055 }
00056 it++;
00057 }
00058
00059
00060 std::map<std::vector<unsigned char>, unsigned int>::iterator mapiter3;
00061 std::vector<unsigned char> numbers = splitWordAll1(linein.word_align);
00062 mapiter3 = word_all1.find(numbers);
00063
00064 if (mapiter3 != word_all1.end()) {
00065
00066 mapiter3->second++;
00067 } else {
00068
00069 word_all1.insert(std::pair<std::vector<unsigned char>, unsigned int>(numbers, 1));
00070 }
00071
00072 }
00073
00074
00075 void Huffman::assign_values()
00076 {
00077
00078
00079
00080 for(std::map<std::string, unsigned int>::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) {
00081 target_phrase_words_counts.push_back(*it);
00082 }
00083
00084 std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair());
00085
00086
00087 for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) {
00088 word_all1_counts.push_back(*it);
00089 }
00090
00091 std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec());
00092
00093
00094
00095 unsigned int i = 1;
00096 for(std::vector<std::pair<std::string, unsigned int> >::iterator it = target_phrase_words_counts.begin();
00097 it != target_phrase_words_counts.end(); it++) {
00098 target_phrase_huffman.insert(std::pair<std::string, unsigned int>(it->first, i));
00099 i++;
00100 }
00101
00102 i = 1;
00103 for(std::vector<std::pair<std::vector<unsigned char>, unsigned int> >::iterator it = word_all1_counts.begin();
00104 it != word_all1_counts.end(); it++) {
00105 word_all1_huffman.insert(std::pair<std::vector<unsigned char>, unsigned int>(it->first, i));
00106 i++;
00107 }
00108
00109
00110 target_phrase_words.clear();
00111 word_all1.clear();
00112
00113 target_phrase_words_counts.clear();
00114 word_all1_counts.clear();
00115
00116 std::cerr << "Finished generating huffman codes." << std::endl;
00117
00118 }
00119
00120 void Huffman::serialize_maps(const char * dirname)
00121 {
00122
00123 std::string basedir(dirname);
00124 std::string target_phrase_path(basedir + "/target_phrases");
00125 std::string probabilities_path(basedir + "/probs");
00126 std::string word_all1_path(basedir + "/Wall1");
00127
00128
00129 std::ofstream os (target_phrase_path.c_str(), std::ios::binary);
00130 boost::archive::text_oarchive oarch(os);
00131 oarch << lookup_target_phrase;
00132 os.close();
00133
00134
00135 std::ofstream os2 (word_all1_path.c_str(), std::ios::binary);
00136 boost::archive::text_oarchive oarch2(os2);
00137 oarch2 << lookup_word_all1;
00138 os2.close();
00139 }
00140
00141 std::vector<unsigned char> Huffman::full_encode_line(line_text line)
00142 {
00143 return vbyte_encode_line((encode_line(line)));
00144 }
00145
00146 std::vector<unsigned int> Huffman::encode_line(line_text line)
00147 {
00148 std::vector<unsigned int> retvector;
00149
00150
00151 util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' '));
00152 while (it) {
00153 retvector.push_back(target_phrase_huffman.find(it->as_string())->second);
00154 it++;
00155 }
00156
00157 retvector.push_back(0);
00158
00159
00160 util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' '));
00161 while (probit) {
00162
00163 double tempnum = atof(probit->data());
00164 float num = (float)tempnum;
00165 retvector.push_back(reinterpret_float(&num));
00166 probit++;
00167 }
00168
00169 retvector.push_back(0);
00170
00171
00172
00173 retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second);
00174 retvector.push_back(0);
00175
00176 return retvector;
00177 }
00178
00179 void Huffman::produce_lookups()
00180 {
00181
00182 for(std::map<std::string, unsigned int>::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) {
00183 lookup_target_phrase.insert(std::pair<unsigned int, std::string>(it->second, it->first));
00184 }
00185
00186 for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) {
00187 lookup_word_all1.insert(std::pair<unsigned int, std::vector<unsigned char> >(it->second, it->first));
00188 }
00189
00190 }
00191
00192 HuffmanDecoder::HuffmanDecoder (const char * dirname)
00193 {
00194
00195
00196
00197 std::string basedir(dirname);
00198 std::string target_phrase_path(basedir + "/target_phrases");
00199 std::string word_all1_path(basedir + "/Wall1");
00200
00201
00202 std::ifstream is (target_phrase_path.c_str(), std::ios::binary);
00203 boost::archive::text_iarchive iarch(is);
00204 iarch >> lookup_target_phrase;
00205 is.close();
00206
00207
00208 std::ifstream is2 (word_all1_path.c_str(), std::ios::binary);
00209 boost::archive::text_iarchive iarch2(is2);
00210 iarch2 >> lookup_word_all1;
00211 is2.close();
00212
00213 }
00214
00215 HuffmanDecoder::HuffmanDecoder (std::map<unsigned int, std::string> * lookup_target,
00216 std::map<unsigned int, std::vector<unsigned char> > * lookup_word1)
00217 {
00218 lookup_target_phrase = *lookup_target;
00219 lookup_word_all1 = *lookup_word1;
00220 }
00221
00222 std::vector<target_text> HuffmanDecoder::full_decode_line (std::vector<unsigned char> lines, int num_scores)
00223 {
00224 std::vector<target_text> retvector;
00225 std::vector<unsigned int> decoded_lines = vbyte_decode_line(lines);
00226 std::vector<unsigned int>::iterator it = decoded_lines.begin();
00227 std::vector<unsigned int> current_target_phrase;
00228
00229 short zero_count = 0;
00230 while(it != decoded_lines.end()) {
00231 if (zero_count == 1) {
00232
00233
00234
00235 for (int i = 0; i < num_scores; i++) {
00236 current_target_phrase.push_back(*it);
00237 it++;
00238 }
00239 }
00240
00241 if (zero_count == 3) {
00242
00243 retvector.push_back(decode_line(current_target_phrase, num_scores));
00244 current_target_phrase.clear();
00245 zero_count = 0;
00246 }
00247
00248 current_target_phrase.push_back(*it);
00249 if (*it == 0) {
00250 zero_count++;
00251 }
00252 it++;
00253 }
00254
00255 if (zero_count == 3) {
00256
00257 retvector.push_back(decode_line(current_target_phrase, num_scores));
00258 current_target_phrase.clear();
00259 zero_count = 0;
00260 }
00261
00262 return retvector;
00263
00264 }
00265
00266 target_text HuffmanDecoder::decode_line (std::vector<unsigned int> input, int num_scores)
00267 {
00268
00269 target_text ret;
00270
00271 std::vector<unsigned int> target_phrase;
00272 std::vector<unsigned int> probs;
00273 unsigned int wAll;
00274
00275
00276 short num_zeroes = 0;
00277 int counter = 0;
00278 while (num_zeroes < 3) {
00279 unsigned int num = input[counter];
00280 if (num == 0) {
00281 num_zeroes++;
00282 } else if (num_zeroes == 0) {
00283 target_phrase.push_back(num);
00284 } else if (num_zeroes == 1) {
00285
00286 for (int i = 0; i < num_scores; i++) {
00287 probs.push_back(num);
00288 counter++;
00289 num = input[counter];
00290 }
00291 continue;
00292 } else if (num_zeroes == 2) {
00293 wAll = num;
00294 }
00295 counter++;
00296 }
00297
00298 ret.target_phrase = target_phrase;
00299 ret.word_all1 = lookup_word_all1.find(wAll)->second;
00300
00301
00302 for (std::vector<unsigned int>::iterator it = probs.begin(); it != probs.end(); it++) {
00303 ret.prob.push_back(reinterpret_uint(&(*it)));
00304 }
00305
00306 return ret;
00307
00308 }
00309
00310 inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id)
00311 {
00312 return lookup_target_phrase.find(id)->second;
00313 }
00314
00315 std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector<unsigned int> ids)
00316 {
00317 std::string returnstring;
00318 for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) {
00319 returnstring.append(getTargetWordFromID(*it) + " ");
00320 }
00321
00322 return returnstring;
00323 }
00324
// Returns the word stored under `id` in the map `lookup_target_phrase`
// points to. Throws std::out_of_range when the id is not present, instead of
// dereferencing the map's end() iterator.
inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase)
{
  std::map<unsigned int, std::string>::const_iterator entry = lookup_target_phrase->find(id);
  if (entry == lookup_target_phrase->end()) {
    throw std::out_of_range("getTargetWordFromID: unknown target word id");
  }
  return entry->second;
}
00329
00330 std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase)
00331 {
00332 std::string returnstring;
00333 for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) {
00334 returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " ");
00335 }
00336
00337 return returnstring;
00338 }
00339
00340
00341
00342
00343
// Returns the object representation of the float `num` points to as an
// unsigned int, without any numeric conversion.
//
// The bytes are copied with memcpy: reading a float object through an
// unsigned int pointer breaks the aliasing rules. At most
// min(sizeof(float), sizeof(unsigned int)) bytes are copied; the two types
// are expected to have the same size.
inline unsigned int reinterpret_float(float * num)
{
  unsigned int converted_num = 0;
  const std::size_t width =
    sizeof(converted_num) < sizeof(*num) ? sizeof(converted_num) : sizeof(*num);
  std::memcpy(&converted_num, num, width);
  return converted_num;
}
00350
// Returns the float whose object representation equals the unsigned int
// `num` points to; the inverse of reinterpret_float().
//
// The bytes are copied with memcpy: reading an unsigned int object through a
// float pointer breaks the aliasing rules. At most
// min(sizeof(float), sizeof(unsigned int)) bytes are copied; the two types
// are expected to have the same size.
inline float reinterpret_uint(unsigned int * num)
{
  float converted_num = 0.0f;
  const std::size_t width =
    sizeof(converted_num) < sizeof(*num) ? sizeof(converted_num) : sizeof(*num);
  std::memcpy(&converted_num, num, width);
  return converted_num;
}
00357
00358
00359
00360
// Variable-byte encodes `num`, least-significant 7-bit group first.
// Every byte except the last has its top bit (0x80) set; the last byte holds
// the remaining bits with the top bit clear. At most five bytes are produced:
//   num < 0x80        -> 1 byte
//   num < 0x4000      -> 2 bytes
//   num < 0x200000    -> 3 bytes
//   num < 0x10000000  -> 4 bytes
//   otherwise         -> 5 bytes
inline std::vector<unsigned char> vbyte_encode(unsigned int num)
{
  // Work out the encoded length so the vector is allocated exactly once.
  short length = 1;
  for (unsigned int rest = num >> 7; rest != 0 && length < 5; rest >>= 7) {
    ++length;
  }

  std::vector<unsigned char> encoded;
  encoded.reserve(length);

  // All groups but the last carry the continuation bit.
  for (short emitted = 1; emitted < length; ++emitted) {
    encoded.push_back(static_cast<unsigned char>((num & 0x7f) | 0x80));
    num >>= 7;
  }
  // Final group: whatever is left, continuation bit clear.
  encoded.push_back(static_cast<unsigned char>(num));

  return encoded;
}
00408
00409 std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line)
00410 {
00411 std::vector<unsigned int> huffman_line;
00412 std::vector<unsigned char> current_num;
00413
00414 for (std::vector<unsigned char>::iterator it = line.begin(); it != line.end(); it++) {
00415 current_num.push_back(*it);
00416 if ((*it >> 7) != 1) {
00417
00418 huffman_line.push_back(bytes_to_int(current_num));
00419 current_num.clear();
00420 }
00421 }
00422 return huffman_line;
00423 }
00424
// Reassembles one variable-byte encoded number. The low 7 bits of every byte
// are combined, least-significant group first; the top bit of each byte is
// ignored. An empty vector yields 0.
//
// The shift is carried out on an unsigned int (the promoted `*it & 0x7f` is
// a signed int, and shifting bits into or past its sign bit is not
// portable). Groups that would start at or beyond the width of unsigned int
// are ignored, so an over-long sequence cannot trigger an out-of-range shift.
inline unsigned int bytes_to_int(std::vector<unsigned char> number)
{
  const unsigned int total_bits = static_cast<unsigned int>(sizeof(unsigned int) * 8);

  unsigned int retvalue = 0;
  unsigned int shift = 0;

  for (std::vector<unsigned char>::const_iterator it = number.begin();
       it != number.end() && shift < total_bits; ++it) {
    retvalue |= static_cast<unsigned int>(*it & 0x7f) << shift;
    shift += 7;
  }

  return retvalue;
}
00439
00440 std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line)
00441 {
00442 std::vector<unsigned char> retvec;
00443
00444
00445 for (std::vector<unsigned int>::iterator it = line.begin(); it != line.end(); it++) {
00446 std::vector<unsigned char> vbyte_encoded = vbyte_encode(*it);
00447 retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end());
00448 }
00449
00450 return retvec;
00451 }