00001 #pragma once
00002
00003
00004 #include "hash.hh"
00005 #include "line_splitter.hh"
00006 #include <cstdio>
00007 #include <fstream>
00008 #include <iostream>
00009 #include <sstream>
00010 #include <boost/serialization/serialization.hpp>
00011 #include <boost/serialization/vector.hpp>
00012 #include <boost/serialization/map.hpp>
00013 #include <boost/archive/text_iarchive.hpp>
00014 #include <boost/archive/text_oarchive.hpp>
00015
00016
/**
 * Comparison functor for (string, count) pairs.
 *
 * Returns true when the count (.second) of the left pair is strictly greater
 * than that of the right pair, so sorting with it yields descending counts.
 * The string (.first) is never inspected: pairs with equal counts compare
 * as equivalent, and their relative order after std::sort is unspecified.
 *
 * operator() is const so the functor can be invoked through const objects
 * and const references.
 */
struct sort_pair {
  bool operator()(const std::pair<std::string, unsigned int> &left,
                  const std::pair<std::string, unsigned int> &right) const {
    return left.second > right.second;
  }
};
00022
/**
 * Comparison functor for (byte-vector, count) pairs.
 *
 * Returns true when the count (.second) of the left pair is strictly greater
 * than that of the right pair, so sorting with it yields descending counts.
 * The byte vector (.first) is never inspected: pairs with equal counts
 * compare as equivalent, and their relative order after std::sort is
 * unspecified.
 *
 * operator() is const so the functor can be invoked through const objects
 * and const references.
 */
struct sort_pair_vec {
  bool operator()(const std::pair<std::vector<unsigned char>, unsigned int> &left,
                  const std::pair<std::vector<unsigned char>, unsigned int> &right) const {
    return left.second > right.second;
  }
};
00028
00029 class Huffman
00030 {
00031 unsigned long uniq_lines;
00032
00033
00034 std::map<std::string, unsigned int> target_phrase_words;
00035 std::map<std::vector<unsigned char>, unsigned int> word_all1;
00036
00037
00038 std::vector<std::pair<std::string, unsigned int> > target_phrase_words_counts;
00039 std::vector<std::pair<std::vector<unsigned char>, unsigned int> > word_all1_counts;
00040
00041
00042 std::map<std::string, unsigned int> target_phrase_huffman;
00043 std::map<std::vector<unsigned char>, unsigned int> word_all1_huffman;
00044
00045
00046 std::map<unsigned int, std::string> lookup_target_phrase;
00047 std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;
00048
00049 public:
00050 Huffman (const char *);
00051 void count_elements (line_text line);
00052 void assign_values();
00053 void serialize_maps(const char * dirname);
00054 void produce_lookups();
00055
00056 std::vector<unsigned int> encode_line(line_text line);
00057
00058
00059 std::vector<unsigned char> full_encode_line(line_text line);
00060
00061
00062 const std::map<unsigned int, std::string> get_target_lookup_map() const {
00063 return lookup_target_phrase;
00064 }
00065 const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const {
00066 return lookup_word_all1;
00067 }
00068
00069 unsigned long getUniqLines() {
00070 return uniq_lines;
00071 }
00072 };
00073
00074 class HuffmanDecoder
00075 {
00076 std::map<unsigned int, std::string> lookup_target_phrase;
00077 std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;
00078
00079 public:
00080 HuffmanDecoder (const char *);
00081 HuffmanDecoder (std::map<unsigned int, std::string> *, std::map<unsigned int, std::vector<unsigned char> > *);
00082
00083
00084 const std::map<unsigned int, std::string> get_target_lookup_map() const {
00085 return lookup_target_phrase;
00086 }
00087 const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const {
00088 return lookup_word_all1;
00089 }
00090
00091 inline std::string getTargetWordFromID(unsigned int id);
00092
00093 std::string getTargetWordsFromIDs(std::vector<unsigned int> ids);
00094
00095 target_text decode_line (std::vector<unsigned int> input, int num_scores);
00096
00097
00098 std::vector<target_text> full_decode_line (std::vector<unsigned char> lines, int num_scores);
00099 };
00100
// Free-function counterparts of the HuffmanDecoder members with the same
// names: the lookup table is passed in explicitly by pointer.
// Definitions are not part of this header.
std::string getTargetWordsFromIDs(
    std::vector<unsigned int> ids,
    std::map<unsigned int, std::string> * lookup_target_phrase);

// NOTE(review): declared inline without a definition in this header --
// confirm that every translation unit calling it can see the definition.
inline std::string getTargetWordFromID(
    unsigned int id,
    std::map<unsigned int, std::string> * lookup_target_phrase);
00104
// Conversions between float and unsigned int; both take their argument by
// pointer. Definitions are not part of this header.
// NOTE(review): presumably a bit-level reinterpretation rather than a
// numeric conversion -- confirm in the implementation file.
inline unsigned int reinterpret_float(float * num);

inline float reinterpret_uint(unsigned int * num);
00108
// Conversions between sequences of unsigned ints and sequences of bytes.
// Definitions are not part of this header.
// NOTE(review): the names suggest a variable-byte integer encoding --
// confirm the exact byte layout in the implementation file.

// Whole line: unsigned ints -> bytes.
std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line);

// Single number: unsigned int -> bytes.
inline std::vector<unsigned char> vbyte_encode(unsigned int num);

// Whole line: bytes -> unsigned ints.
std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line);

// Single number: bytes -> unsigned int.
inline unsigned int bytes_to_int(std::vector<unsigned char> number);