00001 #include "quering.hh"
00002
/**
 * Memory-maps a file read-only and returns a pointer to the mapped bytes.
 *
 * @param filename  Path of the file to map.
 * @param filesize  Number of bytes to map, starting at offset 0.
 * @return Pointer to the start of the mapping. The caller releases it with
 *         munmap(ptr, filesize).
 *
 * On any failure an error is reported with perror() and the process exits
 * with EXIT_FAILURE; the function never returns an invalid pointer.
 */
unsigned char * read_binary_file(const char * filename, size_t filesize)
{
  int fd = open(filename, O_RDONLY);
  if (fd == -1) {
    perror("Error opening file for reading");
    exit(EXIT_FAILURE);
  }

  void * map = mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0);
  if (map == MAP_FAILED) {
    // Report first: close() may overwrite the errno that mmap() set.
    perror("Error mmapping the file");
    close(fd);
    exit(EXIT_FAILURE);
  }

  // The mapping stays valid after the descriptor is closed, so release the
  // descriptor here instead of leaking one per call.
  close(fd);

  return static_cast<unsigned char *>(map);
}
00025
00026 QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
00027 {
00028
00029
00030 std::string basepath(filepath);
00031 std::string path_to_hashtable = basepath + "/probing_hash.dat";
00032 std::string path_to_data_bin = basepath + "/binfile.dat";
00033 std::string path_to_source_vocabid = basepath + "/source_vocabids";
00034
00036 read_map(&source_vocabids, path_to_source_vocabid.c_str());
00037
00038
00039 vocabids = decoder.get_target_lookup_map();
00040
00041
00042 std::string line;
00043 std::ifstream config ((basepath + "/config").c_str());
00044
00045 getline(config, line);
00046 if (atoi(line.c_str()) != API_VERSION) {
00047 std::cerr << "The ProbingPT API has changed, please rebinarize your phrase tables." << std::endl;
00048 exit(EXIT_FAILURE);
00049 }
00050
00051 getline(config, line);
00052 int tablesize = atoi(line.c_str());
00053
00054 getline(config, line);
00055 num_scores = atoi(line.c_str());
00056
00057 getline(config, line);
00058 std::transform(line.begin(), line.end(), line.begin(), ::tolower);
00059 is_reordering = false;
00060 if (line == "true") {
00061 is_reordering = true;
00062 std::cerr << "WARNING. REORDERING TABLES NOT SUPPORTED YET." << std::endl;
00063 }
00064 config.close();
00065
00066
00067 struct stat filestatus;
00068 stat(path_to_data_bin.c_str(), &filestatus);
00069 binary_filesize = filestatus.st_size;
00070 binary_mmaped = read_binary_file(path_to_data_bin.c_str(), binary_filesize);
00071
00072
00073 table_filesize = Table::Size(tablesize, 1.2);
00074 mem = readTable(path_to_hashtable.c_str(), table_filesize);
00075 Table table_init(mem, table_filesize);
00076 table = table_init;
00077
00078 std::cerr << "Initialized successfully! " << std::endl;
00079 }
00080
00081 QueryEngine::~QueryEngine()
00082 {
00083
00084 munmap(binary_mmaped, binary_filesize);
00085 munmap(mem, table_filesize);
00086
00087 }
00088
00089 std::pair<bool, std::vector<target_text> > QueryEngine::query(std::vector<uint64_t> source_phrase)
00090 {
00091 bool found;
00092 std::vector<target_text> translation_entries;
00093 const Entry * entry;
00094
00095
00096 uint64_t key = 0;
00097 for (int i = 0; i < source_phrase.size(); i++) {
00098 key += (source_phrase[i] << i);
00099 }
00100
00101
00102 found = table.Find(key, entry);
00103
00104 if (found) {
00105
00106
00107
00108 uint64_t initial_index = entry -> GetValue();
00109 unsigned int bytes_toread = entry -> bytes_toread;
00110
00111
00112 std::vector<unsigned char> encoded_text;
00113 encoded_text.reserve(bytes_toread);
00114 for (int i = 0; i < bytes_toread; i++) {
00115 encoded_text.push_back(binary_mmaped[i+initial_index]);
00116 }
00117
00118
00119 translation_entries = decoder.full_decode_line(encoded_text, num_scores);
00120
00121 }
00122
00123 std::pair<bool, std::vector<target_text> > output (found, translation_entries);
00124
00125 return output;
00126
00127 }
00128
00129 std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase)
00130 {
00131 bool found;
00132 std::vector<target_text> translation_entries;
00133 const Entry * entry;
00134
00135 std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase);
00136
00137
00138 uint64_t key = 0;
00139 for (int i = 0; i < source_phrase_vid.size(); i++) {
00140 key += (source_phrase_vid[i] << i);
00141 }
00142
00143 found = table.Find(key, entry);
00144
00145
00146 if (found) {
00147
00148
00149
00150 uint64_t initial_index = entry -> GetValue();
00151 unsigned int bytes_toread = entry -> bytes_toread;
00152
00153 std::cerr << "Entry size is bytes is: " << bytes_toread << std::endl;
00154
00155
00156 std::vector<unsigned char> encoded_text;
00157 encoded_text.reserve(bytes_toread);
00158 for (int i = 0; i < bytes_toread; i++) {
00159 encoded_text.push_back(binary_mmaped[i+initial_index]);
00160 }
00161
00162
00163 translation_entries = decoder.full_decode_line(encoded_text, num_scores);
00164
00165 }
00166
00167 std::pair<bool, std::vector<target_text> > output (found, translation_entries);
00168
00169 return output;
00170
00171 }
00172
00173 void QueryEngine::printTargetInfo(std::vector<target_text> target_phrases)
00174 {
00175 int entries = target_phrases.size();
00176
00177 for (int i = 0; i<entries; i++) {
00178 std::cout << "Entry " << i+1 << " of " << entries << ":" << std::endl;
00179
00180 std::cout << getTargetWordsFromIDs(target_phrases[i].target_phrase, &vocabids) << "\t";
00181
00182
00183 for (int j = 0; j<target_phrases[i].prob.size(); j++) {
00184 std::cout << target_phrases[i].prob[j] << " ";
00185 }
00186 std::cout << "\t";
00187
00188
00189 for (int j = 0; j<target_phrases[i].word_all1.size(); j++) {
00190 if (j%2 == 0) {
00191 std::cout << (short)target_phrases[i].word_all1[j] << "-";
00192 } else {
00193 std::cout << (short)target_phrases[i].word_all1[j] << " ";
00194 }
00195 }
00196 std::cout << std::endl;
00197 }
00198 }