00001
00002
00003 #include <sstream>
00004 #include <cstring>
00005 #include <algorithm>
00006 #include <iostream>
00007 #include <stdexcept>
00008
00009 #include <boost/pool/pool_alloc.hpp>
00010
00011 #include "tpt_tokenindex.h"
00012 #include "ug_typedefs.h"
00013
00014 using namespace std;
00015 namespace sapt
00016 {
00017
// Construct an empty, not-yet-opened token index.
// unkToken: surface string used for unknown words (maps to id 1 until
//           open() reads/derives the real unknown id).
TokenIndex::
TokenIndex(string unkToken)
  : ridx(0), unkLabel(unkToken), unkId(1), numTokens(0)
  , startIdx(0), endIdx(0)
{
  // lock guards lazy construction of the reverse index and the
  // dynamic word store (see operator[] overloads below)
  lock.reset(new boost::mutex());
};
00025
// Disabled legacy constructor that opened the file immediately;
// kept for reference only — use the default ctor plus open() instead.
#if 0
TokenIndex::
TokenIndex(string fname, string unkToken,bool dyna)
  : ridx(0),unkLabel(unkToken)
{
  this->open(fname,unkToken,dyna);
};
#endif
00034
// Open a token index file and interpret its raw bytes in place
// (file exposes data(), so it is presumably a memory-mapped source —
// confirm against the declaration in tpt_tokenindex.h).
// File layout (as produced by write_tokenindex_to_disk):
//   uint32_t numTokens | id_type unkId | Entry[numTokens] | '\0'-separated strings
// fname   : path of the index file; throws std::runtime_error when the
//           file is missing or cannot be opened
// unkToken: if non-empty, its id (looked up in the index) replaces the
//           unkId stored in the file header, falling back to numTokens
//           when the token is not present
// dyna    : if true, enable dynamic mode (unknown words get new ids)
void
TokenIndex::
open(string fname, string unkToken,bool dyna)
{
  if (access(fname.c_str(),F_OK))
    {
      ostringstream msg;
      msg << "TokenIndex::open: File '" << fname << "' does not exist.";
      throw std::runtime_error(msg.str().c_str());
    }

  file.open(fname);
  if (!file.is_open())
    {
      ostringstream msg;
      msg << "TokenIndex::open: Error opening file '" << fname << "'.";
      throw std::runtime_error(msg.str().c_str());
    }

  // header: 4-byte token count followed by the stored unknown-word id
  this->numTokens = *(reinterpret_cast<uint32_t const*>(file.data()));
  unkId = *(reinterpret_cast<id_type const*>(file.data()+4));

  // entry table follows the header; the string pool follows the table
  startIdx = reinterpret_cast<Entry const*>(file.data()+4+sizeof(id_type));
  endIdx = startIdx + numTokens;
  comp.base = reinterpret_cast<char const*>(endIdx);
  if (!unkToken.empty())
    {
      // entries are sorted by string, so binary search for unkToken
      Entry const* bla = lower_bound(startIdx,endIdx,unkToken.c_str(),comp);
      unkId = ((bla < endIdx && unkToken == comp.base+bla->offset)
               ? bla->id
               : numTokens);
    }
  this->dynamic=dyna;
  if (dyna)
    {
      // dynamic mode needs storage for words added after open()
      this->str2idExtra.reset(new map<string,id_type>());
      this->newWords.reset(new vector<string>());
    }
}
00074
// Close the underlying file; pointers into its data (startIdx, endIdx,
// comp.base) become invalid after this call.
void
TokenIndex::
close()
{
  file.close();
}
00081
// Default constructor; 'base' is assigned later, in TokenIndex::open().
TokenIndex::
CompFunc::
CompFunc()
{};
00086
00087 bool
00088 TokenIndex::
00089 CompFunc::
00090 operator()(Entry const& A, char const* w)
00091 {
00092 return strcmp(base+A.offset,w) < 0;
00093 };
00094
00095 id_type
00096 TokenIndex::
00097 operator[](char const* p) const
00098 {
00099 if (startIdx != endIdx)
00100 {
00101 Entry const* bla = lower_bound(startIdx,endIdx,p,comp);
00102 if (bla != endIdx && !strcmp(comp.base+bla->offset,p))
00103 return bla->id;
00104 if (!dynamic) return unkId;
00105 }
00106 else if (!dynamic) return strcmp(p,"NULL") && unkId;
00107
00108 boost::lock_guard<boost::mutex> lk(*this->lock);
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118 map<string,id_type>::value_type newItem(p,str2idExtra->size()+numTokens);
00119 pair<map<string,id_type>::iterator,bool> foo = str2idExtra->insert(newItem);
00120 if (foo.second)
00121 newWords->push_back(foo.first->first);
00122 return foo.first->second;
00123 }
00124
00125 id_type
00126 TokenIndex::
00127 operator[](string const& w) const
00128 {
00129 return (*this)[w.c_str()];
00130 }
00131
00132 vector<char const*>
00133 TokenIndex::
00134 reverseIndex() const
00135 {
00136 size_t numToks = endIdx-startIdx;
00137
00138
00139
00140 vector<char const*> v(numToks,NULL);
00141
00142 for (Entry const* x = startIdx; x != endIdx; x++)
00143 {
00144 if (x->id >= v.size())
00145 v.resize(x->id+1);
00146 v[x->id] = comp.base+x->offset;
00147 }
00148
00149 return v;
00150 }
00151
// Map an id back to its token string (const variant).
// The reverse index is built lazily on first use via a double-checked
// lock on ridx (ridx is mutated in a const method, so it is presumably
// declared mutable in the header — confirm there).
char const* const
TokenIndex::
operator[](id_type id) const
{
  if (!ridx.size())
    {
      boost::lock_guard<boost::mutex> lk(*this->lock);
      // re-check under the lock: another thread may have built it already
      if (!ridx.size()) ridx = reverseIndex();
    }
  if (id < ridx.size())
    return ridx[id];
  // ids past the static index may refer to dynamically added words
  boost::lock_guard<boost::mutex> lk(*this->lock);
  if (dynamic && id < ridx.size()+newWords->size())
    return (*newWords)[id-ridx.size()].c_str();
  return unkLabel.c_str(); // unknown id
}
00171
00172 void
00173 TokenIndex::
00174 iniReverseIndex()
00175 {
00176 if (!ridx.size())
00177 {
00178 boost::lock_guard<boost::mutex> lk(*this->lock);
00179 if (!ridx.size()) ridx = reverseIndex();
00180 }
00181 }
00182
00183
00184 char const* const
00185 TokenIndex::
00186 operator[](id_type id)
00187 {
00188 if (!ridx.size())
00189 {
00190 boost::lock_guard<boost::mutex> lk(*this->lock);
00191 if (!ridx.size()) ridx = reverseIndex();
00192 }
00193 if (id < ridx.size())
00194 return ridx[id];
00195 boost::lock_guard<boost::mutex> lk(*this->lock);
00196 if (dynamic && id < ridx.size()+newWords->size())
00197 return (*newWords)[id-ridx.size()].c_str();
00198 return unkLabel.c_str();
00199 }
00200
00201 string
00202 TokenIndex::
00203 toString(vector<id_type> const& v)
00204 {
00205 if (!ridx.size())
00206 {
00207 boost::lock_guard<boost::mutex> lk(*this->lock);
00208 if (!ridx.size()) ridx = reverseIndex();
00209 }
00210 ostringstream buf;
00211 for (size_t i = 0; i < v.size(); i++)
00212 buf << (i ? " " : "") << (*this)[v[i]];
00213 return buf.str();
00214 }
00215
00216 string
00217 TokenIndex::
00218 toString(vector<id_type> const& v) const
00219 {
00220 if (!ridx.size())
00221 {
00222 boost::lock_guard<boost::mutex> lk(*this->lock);
00223 if (!ridx.size()) ridx = reverseIndex();
00224 }
00225 ostringstream buf;
00226 for (size_t i = 0; i < v.size(); i++)
00227 buf << (i ? " " : "") << (*this)[v[i]];
00228 return buf.str();
00229 }
00230
00231 string
00232 TokenIndex::
00233 toString(id_type const* start, id_type const* const stop)
00234 {
00235 if (!ridx.size())
00236 {
00237 boost::lock_guard<boost::mutex> lk(*this->lock);
00238 if (!ridx.size()) ridx = reverseIndex();
00239 }
00240 ostringstream buf;
00241 if (start < stop)
00242 buf << (*this)[*start];
00243 while (++start < stop)
00244 buf << " " << (*this)[*start];
00245 return buf.str();
00246 }
00247
// Render the id range [start, stop) as a space-separated string
// (const variant: repeats the lazy reverse-index init inline because
// iniReverseIndex() is non-const).
string
TokenIndex::
toString(id_type const* start, id_type const* const stop) const
{
  if (!ridx.size())
    {
      boost::lock_guard<boost::mutex> lk(*this->lock);
      // re-check under the lock
      if (!ridx.size()) ridx = reverseIndex();
    }
  ostringstream buf;
  if (start < stop)
    buf << (*this)[*start];       // first token without a separator
  while (++start < stop)
    buf << " " << (*this)[*start];
  return buf.str();
}
00264
00265 vector<id_type>
00266 TokenIndex::
00267 toIdSeq(string const& line) const
00268 {
00269 istringstream buf(line);
00270 string w;
00271 vector<id_type> retval;
00272 while (buf>>w)
00273 retval.push_back((*this)[w]);
00274 return retval;
00275 }
00276
00278 bool
00279 TokenIndex::
00280 fillIdSeq(string const& line, vector<id_type> & v) const
00281 {
00282 bool allgood = true; string w;
00283 v.clear();
00284 for (istringstream buf(line); buf>>w;)
00285 {
00286 v.push_back((*this)[w]);
00287 allgood = allgood && v.back() > 1;
00288 }
00289 return allgood;
00290 }
00291
// Number of tokens in the static (on-disk) index.
id_type
TokenIndex::
getNumTokens() const
{
  return numTokens;
}
00298
// Id assigned to unknown words.
id_type
TokenIndex::
getUnkId() const
{
  return unkId;
}
00305
// Surface string of the unknown-word label; the pointer stays valid
// only while unkLabel is not reassigned (see setUnkLabel()).
char const* const
TokenIndex::
getUnkToken() const
{
  return unkLabel.c_str();
}
00313
// Size of the static vocabulary (excludes dynamically added words).
id_type
TokenIndex::
knownVocabSize() const
{
  return numTokens;
}
00320
// Short alias for knownVocabSize().
id_type
TokenIndex::
ksize() const
{
  return numTokens;
}
00327
// Total vocabulary size including dynamically added words (alias of tsize()).
id_type
TokenIndex::
totalVocabSize() const
{ return tsize(); }
00332
00333 id_type
00334 TokenIndex::
00335 tsize() const
00336 {
00337 return (newWords != NULL
00338 ? numTokens+newWords->size()
00339 : numTokens);
00340 }
00341
00342 void
00343 write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok,
00344 string const& ofile, string const& unkToken)
00345 {
00346 typedef pair<uint32_t,id_type> IndexEntry;
00347
00348
00349 vector<IndexEntry> index(tok.size());
00350 ostringstream data;
00351 id_type unkId = tok.size();
00352 for (size_t i = 0; i < tok.size(); i++)
00353 {
00354 if (tok[i].first == unkToken)
00355 unkId = tok[i].second;
00356 index[i].first = data.tellp();
00357 index[i].second = tok[i].second;
00358 data<<tok[i].first<<char(0);
00359 }
00360
00361
00362 ofstream out(ofile.c_str());
00363 uint32_t vsize = index.size();
00364 out.write(reinterpret_cast<char*>(&vsize),4);
00365 out.write(reinterpret_cast<char*>(&unkId),sizeof(id_type));
00366 for (size_t i = 0; i < index.size(); i++)
00367 {
00368 out.write(reinterpret_cast<char*>(&index[i].first),4);
00369 out.write(reinterpret_cast<char*>(&index[i].second),sizeof(id_type));
00370 }
00371 out<<data.str();
00372 }
00373
00374 void
00375 TokenIndex::
00376 write(string fname)
00377 {
00378 typedef pair<string,uint32_t> Token;
00379 vector<Token> tok(totalVocabSize());
00380 for (id_type i = 0; i < tok.size(); ++i)
00381 tok[i] = Token((*this)[i],i);
00382 sort(tok.begin(),tok.end());
00383 write_tokenindex_to_disk(tok,fname,unkLabel);
00384 }
00385
// Whether unknown words are currently assigned new ids on lookup.
bool
TokenIndex::
isDynamic() const
{
  return dynamic;
}
00392
00393 bool
00394 TokenIndex::
00395 setDynamic(bool on)
00396 {
00397 bool ret = dynamic;
00398 if (on && this->str2idExtra == NULL)
00399 {
00400 this->str2idExtra.reset(new map<string,id_type>());
00401 this->newWords.reset(new vector<string>());
00402 }
00403 dynamic = on;
00404 if (on)
00405 {
00406 (*this)["NULL"];
00407 (*this)[unkLabel];
00408 }
00409 return ret;
00410 }
00411
// Change the unknown-word label and record its id (in dynamic mode the
// lookup may add the new label to the vocabulary).
// NOTE(review): the id lookup deliberately happens before unkLabel is
// reassigned, so the lookup still runs under the old label — confirm
// this ordering is intended before reordering.
void
TokenIndex::
setUnkLabel(string unk)
{
  unkId = (*this)[unk];
  unkLabel = unk;
}
00419
00420 }