00001
00002
00003 #include <sstream>
00004 #include <cstring>
00005 #include <algorithm>
00006 #include <iostream>
00007 #include <stdexcept>
00008
00009 #include <boost/pool/pool_alloc.hpp>
00010
00011 #include "tpt_tokenindex.h"
00012 #include "ug_typedefs.h"
00013
00014 using namespace std;
00015 namespace sapt
00016 {
00017
// Construct an empty, not-yet-opened token index.
// unkToken: surface string used for unknown words (maps to id 1 until
//           open() reads/derives the real unknown id).
TokenIndex::
TokenIndex(string unkToken)
  : ridx(0), unkLabel(unkToken), unkId(1), numTokens(0)
  , startIdx(0), endIdx(0)
{
  // lock guards lazy construction of the reverse index and the
  // dynamic word store (see operator[] overloads below)
  lock.reset(new boost::mutex());
};
00025
// Disabled legacy constructor that opened the file immediately;
// kept for reference only — use the default ctor plus open() instead.
#if 0
TokenIndex::
TokenIndex(string fname, string unkToken,bool dyna)
  : ridx(0),unkLabel(unkToken)
{
  this->open(fname,unkToken,dyna);
};
#endif
00034
// Open a token index file and interpret its raw bytes in place
// (file exposes data(), so it is presumably a memory-mapped source —
// confirm against the declaration in tpt_tokenindex.h).
// File layout (as produced by write_tokenindex_to_disk):
//   uint32_t numTokens | id_type unkId | Entry[numTokens] | '\0'-separated strings
// fname   : path of the index file; throws std::runtime_error when the
//           file is missing or cannot be opened
// unkToken: if non-empty, its id (looked up in the index) replaces the
//           unkId stored in the file header, falling back to numTokens
//           when the token is not present
// dyna    : if true, enable dynamic mode (unknown words get new ids)
void
TokenIndex::
open(string fname, string unkToken,bool dyna)
{
  if (access(fname.c_str(),F_OK))
    {
      ostringstream msg;
      msg << "TokenIndex::open: File '" << fname << "' does not exist.";
      throw std::runtime_error(msg.str().c_str());
    }

  file.open(fname);
  if (!file.is_open())
    {
      ostringstream msg;
      msg << "TokenIndex::open: Error opening file '" << fname << "'.";
      throw std::runtime_error(msg.str().c_str());
    }

  // header: 4-byte token count followed by the stored unknown-word id
  this->numTokens = *(reinterpret_cast<uint32_t const*>(file.data()));
  unkId = *(reinterpret_cast<id_type const*>(file.data()+4));

  // entry table follows the header; the string pool follows the table
  startIdx = reinterpret_cast<Entry const*>(file.data()+4+sizeof(id_type));
  endIdx = startIdx + numTokens;
  comp.base = reinterpret_cast<char const*>(endIdx);
  if (!unkToken.empty())
    {
      // entries are sorted by string, so binary search for unkToken
      Entry const* bla = lower_bound(startIdx,endIdx,unkToken.c_str(),comp);
      unkId = ((bla < endIdx && unkToken == comp.base+bla->offset)
               ? bla->id
               : numTokens);
    }
  this->dynamic=dyna;
  if (dyna)
    {
      // dynamic mode needs storage for words added after open()
      this->str2idExtra.reset(new map<string,id_type>());
      this->newWords.reset(new vector<string>());
    }
}
00074
// Close the underlying file; pointers into its data (startIdx, endIdx,
// comp.base) become invalid after this call.
void
TokenIndex::
close()
{
  file.close();
}
00081
// Default constructor; 'base' is assigned later, in TokenIndex::open().
TokenIndex::
CompFunc::
CompFunc()
{};
00086
00087 bool
00088 TokenIndex::
00089 CompFunc::
00090 operator()(Entry const& A, char const* w)
00091 {
00092 return strcmp(base+A.offset,w) < 0;
00093 };
00094
00095 id_type
00096 TokenIndex::
00097 operator[](char const* p) const
00098 {
00099 if (startIdx != endIdx)
00100 {
00101 Entry const* bla = lower_bound(startIdx,endIdx,p,comp);
00102 if (bla != endIdx && !strcmp(comp.base+bla->offset,p))
00103 return bla->id;
00104 if (!dynamic) return unkId;
00105 }
00106 else if (!dynamic) return strcmp(p,"NULL") && unkId;
00107
00108 boost::lock_guard<boost::mutex> lk(*this->lock);
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118 map<string,id_type>::value_type newItem(p,str2idExtra->size()+numTokens);
00119 pair<map<string,id_type>::iterator,bool> foo = str2idExtra->insert(newItem);
00120 if (foo.second)
00121 newWords->push_back(foo.first->first);
00122 return foo.first->second;
00123 }
00124
00125 id_type
00126 TokenIndex::
00127 operator[](string const& w) const
00128 {
00129 return (*this)[w.c_str()];
00130 }
00131
00132 vector<char const*>
00133 TokenIndex::
00134 reverseIndex() const
00135 {
00136 size_t numToks = endIdx-startIdx;
00137
00138
00139
00140 vector<char const*> v(numToks,NULL);
00141
00142 for (Entry const* x = startIdx; x != endIdx; x++)
00143 {
00144 if (x->id >= v.size())
00145 v.resize(x->id+1);
00146 v[x->id] = comp.base+x->offset;
00147 }
00148
00149 return v;
00150 }
00151
// Map an id back to its token string (const variant).
// The reverse index is built lazily on first use via a double-checked
// lock on ridx (ridx is mutated in a const method, so it is presumably
// declared mutable in the header — confirm there).
char const* const
TokenIndex::
operator[](id_type id) const
{
  if (!ridx.size())
    {
      boost::lock_guard<boost::mutex> lk(*this->lock);
      // re-check under the lock: another thread may have built it already
      if (!ridx.size()) ridx = reverseIndex();
    }
  if (id < ridx.size())
    return ridx[id];
  // ids past the static index may refer to dynamically added words
  boost::lock_guard<boost::mutex> lk(*this->lock);
  if (dynamic && id < ridx.size()+newWords->size())
    return (*newWords)[id-ridx.size()].c_str();
  return unkLabel.c_str(); // unknown id
}
00171
00172 void
00173 TokenIndex::
00174 iniReverseIndex()
00175 {
00176 if (!ridx.size())
00177 {
00178 boost::lock_guard<boost::mutex> lk(*this->lock);
00179 if (!ridx.size()) ridx = reverseIndex();
00180 }
00181 }
00182
00183
00184 char const* const
00185 TokenIndex::
00186 operator[](id_type id)
00187 {
00188 if (!ridx.size())
00189 {
00190 boost::lock_guard<boost::mutex> lk(*this->lock);
00191 if (!ridx.size()) ridx = reverseIndex();
00192 }
00193 if (id < ridx.size())
00194 return ridx[id];
00195 boost::lock_guard<boost::mutex> lk(*this->lock);
00196 if (dynamic && id < ridx.size()+newWords->size())
00197 return (*newWords)[id-ridx.size()].c_str();
00198 return unkLabel.c_str();
00199 }
00200
00201 string
00202 TokenIndex::
00203 toString(vector<id_type> const& v)
00204 {
00205 if (!ridx.size())
00206 {
00207 boost::lock_guard<boost::mutex> lk(*this->lock);
00208 if (!ridx.size()) ridx = reverseIndex();
00209 }
00210 ostringstream buf;
00211 for (size_t i = 0; i < v.size(); i++)
00212 buf << (i ? " " : "") << (*this)[v[i]];
00213 return buf.str();
00214 }
00215
00216 string
00217 TokenIndex::
00218 toString(vector<id_type> const& v) const
00219 {
00220 if (!ridx.size())
00221 {
00222 boost::lock_guard<boost::mutex> lk(*this->lock);
00223 if (!ridx.size()) ridx = reverseIndex();
00224 }
00225 ostringstream buf;
00226 for (size_t i = 0; i < v.size(); i++)
00227 buf << (i ? " " : "") << (*this)[v[i]];
00228 return buf.str();
00229 }
00230
00231 string
00232 TokenIndex::
00233 toString(id_type const* start, id_type const* const stop)
00234 {
00235 if (!ridx.size())
00236 {
00237 boost::lock_guard<boost::mutex> lk(*this->lock);
00238 if (!ridx.size()) ridx = reverseIndex();
00239 }
00240 ostringstream buf;
00241 if (start < stop)
00242 buf << (*this)[*start];
00243 while (++start < stop)
00244 buf << " " << (*this)[*start];
00245 return buf.str();
00246 }
00247
// Render the id range [start, stop) as a space-separated string
// (const variant: repeats the lazy reverse-index init inline because
// iniReverseIndex() is non-const).
string
TokenIndex::
toString(id_type const* start, id_type const* const stop) const
{
  if (!ridx.size())
    {
      boost::lock_guard<boost::mutex> lk(*this->lock);
      // re-check under the lock
      if (!ridx.size()) ridx = reverseIndex();
    }
  ostringstream buf;
  if (start < stop)
    buf << (*this)[*start];       // first token without a separator
  while (++start < stop)
    buf << " " << (*this)[*start];
  return buf.str();
}
00264
00265 vector<id_type>
00266 TokenIndex::
00267 toIdSeq(string const& line) const
00268 {
00269 istringstream buf(line);
00270 string w;
00271 vector<id_type> retval;
00272 while (buf>>w)
00273 retval.push_back((*this)[w]);
00274 return retval;
00275 }
00276
00278 bool
00279 TokenIndex::
00280 fillIdSeq(string const& line, vector<id_type> & v) const
00281 {
00282 bool allgood = true; string w;
00283 v.clear();
00284 for (istringstream buf(line); buf>>w;)
00285 {
00286 v.push_back((*this)[w]);
00287 allgood = allgood && v.back() > 1;
00288 }
00289 return allgood;
00290 }
00291
// Number of tokens in the static (on-disk) index.
id_type
TokenIndex::
getNumTokens() const
{
  return numTokens;
}
00298
// Id assigned to unknown words.
id_type
TokenIndex::
getUnkId() const
{
  return unkId;
}
00305
// Surface string of the unknown-word label; the pointer stays valid
// only while unkLabel is not reassigned (see setUnkLabel()).
char const* const
TokenIndex::
getUnkToken() const
{
  return unkLabel.c_str();
}
00313
// Size of the static vocabulary (excludes dynamically added words).
id_type
TokenIndex::
knownVocabSize() const
{
  return numTokens;
}
00320
// Short alias for knownVocabSize().
id_type
TokenIndex::
ksize() const
{
  return numTokens;
}
00327
// Total vocabulary size including dynamically added words (alias of tsize()).
id_type
TokenIndex::
totalVocabSize() const
{ return tsize(); }
00332
00333 id_type
00334 TokenIndex::
00335 tsize() const
00336 {
00337 return (newWords != NULL
00338 ? numTokens+newWords->size()
00339 : numTokens);
00340 }
00341
00342 void
00343 write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok,
00344 string const& ofile, string const& unkToken)
00345 {
00346 typedef pair<uint32_t,id_type> IndexEntry;
00347
00348
00349 vector<IndexEntry> index(tok.size());
00350 ostringstream data;
00351 id_type unkId = tok.size();
00352 for (size_t i = 0; i < tok.size(); i++)
00353 {
00354 if (tok[i].first == unkToken)
00355 unkId = tok[i].second;
00356 index[i].first = data.tellp();
00357 index[i].second = tok[i].second;
00358 data<<tok[i].first<<char(0);
00359 }
00360
00361
00362 ofstream out(ofile.c_str());
00363 uint32_t vsize = index.size();
00364 out.write(reinterpret_cast<char*>(&vsize),4);
00365 out.write(reinterpret_cast<char*>(&unkId),sizeof(id_type));
00366 for (size_t i = 0; i < index.size(); i++)
00367 {
00368 out.write(reinterpret_cast<char*>(&index[i].first),4);
00369 out.write(reinterpret_cast<char*>(&index[i].second),sizeof(id_type));
00370 }
00371 out<<data.str();
00372 }
00373
00374 void
00375 TokenIndex::
00376 write(string fname)
00377 {
00378 typedef pair<string,uint32_t> Token;
00379 vector<Token> tok(totalVocabSize());
00380 for (id_type i = 0; i < tok.size(); ++i)
00381 tok[i] = Token((*this)[i],i);
00382 sort(tok.begin(),tok.end());
00383 write_tokenindex_to_disk(tok,fname,unkLabel);
00384 }
00385
// Whether unknown words are currently assigned new ids on lookup.
bool
TokenIndex::
isDynamic() const
{
  return dynamic;
}
00392
00393 bool
00394 TokenIndex::
00395 setDynamic(bool on)
00396 {
00397 bool ret = dynamic;
00398 if (on && this->str2idExtra == NULL)
00399 {
00400 this->str2idExtra.reset(new map<string,id_type>());
00401 this->newWords.reset(new vector<string>());
00402 }
00403 dynamic = on;
00404 if (on)
00405 {
00406 (*this)["NULL"];
00407 (*this)[unkLabel];
00408 }
00409 return ret;
00410 }
00411
// Change the unknown-word label and record its id (in dynamic mode the
// lookup may add the new label to the vocabulary).
// NOTE(review): the id lookup deliberately happens before unkLabel is
// reassigned, so the lookup still runs under the old label — confirm
// this ordering is intended before reordering.
void
TokenIndex::
setUnkLabel(string unk)
{
  unkId = (*this)[unk];
  unkLabel = unk;
}
00419
00420 }