00001 #ifndef moses_BilingualDynSuffixArray_h
00002 #define moses_BilingualDynSuffixArray_h
00003
00004 #include "DynSuffixArray.h"
00005 #include "moses/TranslationModel/DynSAInclude/vocab.h"
00006 #include "moses/TranslationModel/DynSAInclude/types.h"
00007 #include "moses/TranslationModel/DynSAInclude/utils.h"
00008 #include "moses/TranslationModel/WordCoocTable.h"
00009 #include "moses/InputFileStream.h"
00010 #include "moses/FactorTypeSet.h"
00011 #include "moses/TargetPhrase.h"
00012 #include <boost/dynamic_bitset.hpp>
00013 #include "moses/TargetPhraseCollection.h"
00014 #include <map>
00015
00016 namespace Moses
00017 {
00018 class PhraseDictionaryDynSuffixArray;
00019
00022 class SAPhrase
00023 {
00024 public:
00025 std::vector<wordID_t> words;
00026
00027 SAPhrase(size_t phraseSize)
00028 :words(phraseSize) {
00029 }
00030
00031 void SetId(size_t pos, wordID_t id) {
00032 words.at(pos) = id;
00033 }
00034 bool operator<(const SAPhrase& phr2) const {
00035 return words < phr2.words;
00036 }
00037 };
00038
/**
 * Start/end positions of a target span and a source span, together with
 * a sentence index. The size accessors treat both ends as inclusive.
 */
class PhrasePair
{
public:
  int m_startTarget;
  int m_endTarget;
  int m_startSource;
  int m_endSource;
  int m_sntIndex;

  /// Stores the five values unchanged; no validation is performed.
  PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
    : m_startTarget(startTarget), m_endTarget(endTarget),
      m_startSource(startSource), m_endSource(endSource),
      m_sntIndex(sntIndex)
  {}

  /// Number of target positions covered: end - start + 1.
  size_t GetTargetSize() const
  {
    const int span = m_endTarget - m_startTarget + 1;
    return span;
  }

  /// Number of source positions covered: end - start + 1.
  size_t GetSourceSize() const
  {
    const int span = m_endSource - m_startSource + 1;
    return span;
  }
};
00061
00064 class SentenceAlignment
00065 {
00066 public:
00067 SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
00068 int m_sntIndex;
00069 std::vector<wordID_t>* trgSnt;
00070 std::vector<wordID_t>* srcSnt;
00071 std::vector<int> numberAligned;
00072 std::vector< std::vector<int> > alignedList;
00073 bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret,
00074 int startSource, int endSource) const;
00075 };
00076
00077 class ScoresComp
00078 {
00079 public:
00080 ScoresComp(const std::vector<float>& weights) {
00081 }
00082 bool operator()(const Scores& s1, const Scores& s2) const {
00083 return s1[0] < s2[0];
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095 }
00096 };
00097
00098 struct BetterPhrase {
00099 ScoresComp const& cmp;
00100 BetterPhrase(ScoresComp const& sc);
00101
00102
00103 bool operator()(std::pair<Scores, SAPhrase const*> const& a,
00104 std::pair<Scores, SAPhrase const*> const& b) const;
00105 };
00106
00109 class BilingualDynSuffixArray
00110 {
00111 public:
00112 BilingualDynSuffixArray();
00113 ~BilingualDynSuffixArray();
00114 bool Load( const std::vector<FactorType>& inputFactors,
00115 const std::vector<FactorType>& outputTactors,
00116 std::string source, std::string target, std::string alignments,
00117 const std::vector<float> &weight);
00118
00119
00120
00121
00122 void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
00123
00124 void CleanUp(const InputType& source);
00125 void addSntPair(std::string& source, std::string& target, std::string& alignment);
00126 std::pair<float,float>
00127 GatherCands(Phrase const& src, std::map<SAPhrase, std::vector<float> >& pstats) const;
00128
00129 TargetPhrase*
00130 GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase, const PhraseDictionary *pt) const;
00131
00132 private:
00133
00134
00135 mutable WordCoocTable m_wrd_cooc;
00136 DynSuffixArray * m_srcSA;
00137 DynSuffixArray * m_trgSA;
00138 std::vector<wordID_t>* m_srcCorpus;
00139 std::vector<wordID_t>* m_trgCorpus;
00140 std::vector<FactorType> m_inputFactors;
00141 std::vector<FactorType> m_outputFactors;
00142
00143 std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
00144
00145 Vocab* m_srcVocab, *m_trgVocab;
00146 ScoresComp* m_scoreCmp;
00147
00148 std::vector<SentenceAlignment> m_alignments;
00149 std::vector<std::vector<short> > m_rawAlignments;
00150
00151 mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
00152 mutable std::set<wordID_t> m_freqWordsCached;
00153 const size_t m_maxPhraseLength, m_maxSampleSize;
00154 const size_t m_maxPTEntries;
00155 int LoadCorpus(FactorDirection direction,
00156 InputFileStream&, const std::vector<FactorType>& factors,
00157 std::vector<wordID_t>&, std::vector<wordID_t>&,
00158 Vocab*);
00159 int LoadAlignments(InputFileStream& aligs);
00160 int LoadRawAlignments(InputFileStream& aligs);
00161 int LoadRawAlignments(std::string& aligs);
00162
00163 bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
00164 SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
00165 int SampleSelection(std::vector<unsigned>&, int = 300) const;
00166
00167 std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
00168 SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
00169 bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
00170 void CacheWordProbs(wordID_t) const;
00171 void CacheFreqWords() const;
00172 void ClearWordInCache(wordID_t);
00173 std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
00174
00175 int GetSourceSentenceSize(size_t sentenceId) const;
00176 int GetTargetSentenceSize(size_t sentenceId) const;
00177
00178 };
00179 }
00180 #endif