00001 #include <sstream>
00002 #include "vocab.h"
00003
00004 namespace Moses
00005 {
00006
00007
00008 void Vocab::InitSpecialWords()
00009 {
00010 m_kBOSWord = InitSpecialWord(BOS_);
00011 m_kEOSWord = InitSpecialWord(EOS_);
00012 m_kOOVWord = InitSpecialWord(UNKNOWN_FACTOR);
00013 }
00014
00015 const Word Vocab::InitSpecialWord( const std::string& word_str)
00016 {
00017 FactorList factors;
00018 factors.push_back(0);
00019 Word word;
00020
00021 word.CreateFromString( Input, factors, word_str, false );
00022
00023
00024
00025
00026
00027 return word;
00028 }
00029 wordID_t Vocab::GetWordID(const std::string& word_str)
00030 {
00031 FactorList factors;
00032 factors.push_back(0);
00033 Word word;
00034 word.CreateFromString(Input, factors, word_str, false);
00035 return GetWordID(word);
00036 }
00037
00038
00039 wordID_t Vocab::GetWordID(const std::string& word_str,
00040 const FactorDirection& direction, const FactorList& factors, bool isNonTerminal)
00041 {
00042
00043 Word word;
00044 word.CreateFromString( direction, factors, word_str, isNonTerminal);
00045 return GetWordID( word);
00046 }
00047
00048 wordID_t Vocab::GetWordID(const Word& word)
00049 {
00050
00051 if(m_words2ids.find(word) == m_words2ids.end()) {
00052 if (!m_closed) {
00053 wordID_t id = m_words2ids.size() + 1;
00054 m_ids2words[id] = word;
00055
00056 m_words2ids[word] = id;
00057 } else {
00058 return m_kOOVWordID;
00059 }
00060 }
00061 wordID_t id = m_words2ids[word];
00062 return id;
00063 }
00064
00065 Word& Vocab::GetWord(wordID_t id)
00066 {
00067
00068 return (m_ids2words.find(id) == m_ids2words.end()) ? m_kOOVWord : m_ids2words[id];
00069 }
00070
00071 bool Vocab::InVocab(wordID_t id)
00072 {
00073 return m_ids2words.find(id) != m_ids2words.end();
00074 }
00075
00076 bool Vocab::InVocab(const Word& word)
00077 {
00078 return m_words2ids.find(word) != m_words2ids.end();
00079 }
00080
00081 bool Vocab::Save(const std::string & vocab_path)
00082 {
00083
00084 FileHandler vcbout(vocab_path, std::ios::out);
00085 return Save(&vcbout);
00086 }
00087
00088 bool Vocab::Save(FileHandler* vcbout)
00089 {
00090
00091 *vcbout << m_ids2words.size() << "\n";
00092 for (Id2Word::const_iterator iter = m_ids2words.begin();
00093 iter != m_ids2words.end(); ++iter) {
00094 *vcbout << iter->second << "\t" << iter->first << "\n";
00095 }
00096 return true;
00097 }
00098
00099 bool Vocab::Load(const std::string & vocab_path, const FactorDirection& direction,
00100 const FactorList& factors, bool closed)
00101 {
00102 FileHandler vcbin(vocab_path, std::ios::in);
00103 std::cerr << "Loading vocab from " << vocab_path << std::endl;
00104 return Load(&vcbin, direction, factors, closed);
00105 }
00106 bool Vocab::Load(FileHandler* vcbin)
00107 {
00108 FactorList factors;
00109 factors.push_back(0);
00110 return Load(vcbin, Input, factors);
00111 }
00112 bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
00113 const FactorList& factors, bool closed)
00114 {
00115
00116 m_words2ids.clear();
00117 m_ids2words.clear();
00118 std::string line, word_str;
00119 wordID_t id;
00120
00121 std::istream &ret = getline(*vcbin, line);
00122 UTIL_THROW_IF2(!ret, "Couldn't read file");
00123 std::istringstream first(line.c_str());
00124 uint32_t vcbsize(0);
00125 first >> vcbsize;
00126 uint32_t loadedsize = 0;
00127 while (loadedsize++ < vcbsize && getline(*vcbin, line)) {
00128 std::istringstream entry(line.c_str());
00129 entry >> word_str;
00130 Word word;
00131 word.CreateFromString( direction, factors, word_str, false);
00132 entry >> id;
00133
00134 if (id == 0 && word != GetkOOVWord())
00135 id = m_ids2words.size() + 1;
00136 UTIL_THROW_IF2(m_ids2words.count(id) != 0 || m_words2ids.count(word) != 0,
00137 "Error");
00138
00139 m_ids2words[id] = word;
00140 m_words2ids[word] = id;
00141 }
00142 m_closed = closed;
00143 std::cerr << "Loaded vocab with " << m_ids2words.size() << " words." << std::endl;
00144 return true;
00145 }
00146 void Vocab::PrintVocab()
00147 {
00148 for (Id2Word::const_iterator iter = m_ids2words.begin();
00149 iter != m_ids2words.end(); ++iter ) {
00150 std::cerr << iter->second << "\t" << iter->first << "\n";
00151 }
00152 for (Word2Id::const_iterator iter = m_words2ids.begin();
00153 iter != m_words2ids.end(); ++iter ) {
00154 std::cerr << iter->second << "\t" << iter->first << "\n";
00155 }
00156 }
00157
00158 }