00001 #ifndef moses_BilingualDynSuffixArray_h
00002 #define moses_BilingualDynSuffixArray_h
00003
00004 #include "TargetPhrase.h"
00005 #include "DynSuffixArray.h"
00006 #include "DynSAInclude/vocab.h"
00007 #include "DynSAInclude/types.h"
00008 #include "DynSAInclude/utils.h"
00009 #include "InputFileStream.h"
00010 #include "FactorTypeSet.h"
00011
00012 namespace Moses {
00013
00014 class SAPhrase
00015 {
00016 public:
00017 std::vector<wordID_t> words;
00018
00019 SAPhrase(size_t phraseSize)
00020 :words(phraseSize)
00021 {}
00022
00023 void SetId(size_t pos, wordID_t id)
00024 {
00025 CHECK(pos < words.size());
00026 words[pos] = id;
00027 }
00028 bool operator<(const SAPhrase& phr2) const
00029 { return words < phr2.words; }
00030 };
00031
/**
 * Plain record of a target span, a source span and a sentence index.
 * All five values are stored exactly as passed to the constructor.
 */
class PhrasePair
{
public:
  int m_startTarget;
  int m_endTarget;
  int m_startSource;
  int m_endSource;
  int m_sntIndex;

  PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
    : m_startTarget(startTarget),
      m_endTarget(endTarget),
      m_startSource(startSource),
      m_endSource(endSource),
      m_sntIndex(sntIndex)
  {
  }

  /**
   * Number of positions in the target span, treating both m_startTarget and
   * m_endTarget as included (end - start + 1).
   */
  size_t GetTargetSize() const
  {
    const int inclusiveLength = m_endTarget - m_startTarget + 1;
    return inclusiveLength;
  }
};
00047
00048 class SentenceAlignment
00049 {
00050 public:
00051 SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
00052 int m_sntIndex;
00053 std::vector<wordID_t>* trgSnt;
00054 std::vector<wordID_t>* srcSnt;
00055 std::vector<int> numberAligned;
00056 std::vector< std::vector<int> > alignedList;
00057 bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
00058 };
00059 class ScoresComp {
00060 public:
00061 ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
00062 bool operator()(const Scores& s1, const Scores& s2) const {
00063 return s1[0] < s2[0];
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075 }
00076 private:
00077 const std::vector<float>& m_weights;
00078 };
00079
00080 class BilingualDynSuffixArray {
00081 public:
00082 BilingualDynSuffixArray();
00083 ~BilingualDynSuffixArray();
00084 bool Load( const std::vector<FactorType>& inputFactors,
00085 const std::vector<FactorType>& outputTactors,
00086 std::string source, std::string target, std::string alignments,
00087 const std::vector<float> &weight);
00088 void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
00089 void CleanUp();
00090 void addSntPair(string& source, string& target, string& alignment);
00091 private:
00092 DynSuffixArray* m_srcSA;
00093 DynSuffixArray* m_trgSA;
00094 std::vector<wordID_t>* m_srcCorpus;
00095 std::vector<wordID_t>* m_trgCorpus;
00096 std::vector<FactorType> m_inputFactors;
00097 std::vector<FactorType> m_outputFactors;
00098
00099 std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
00100
00101 Vocab* m_srcVocab, *m_trgVocab;
00102 ScoresComp* m_scoreCmp;
00103
00104 std::vector<SentenceAlignment> m_alignments;
00105 std::vector<std::vector<short> > m_rawAlignments;
00106
00107 mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
00108 mutable std::set<wordID_t> m_freqWordsCached;
00109 const size_t m_maxPhraseLength, m_maxSampleSize;
00110
00111 int LoadCorpus(InputFileStream&, const std::vector<FactorType>& factors,
00112 std::vector<wordID_t>&, std::vector<wordID_t>&,
00113 Vocab*);
00114 int LoadAlignments(InputFileStream& aligs);
00115 int LoadRawAlignments(InputFileStream& aligs);
00116 int LoadRawAlignments(string& aligs);
00117
00118 bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
00119 SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
00120 int SampleSelection(std::vector<unsigned>&, int = 300) const;
00121
00122 std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
00123 TargetPhrase* GetMosesFactorIDs(const SAPhrase&) const;
00124 SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
00125 bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
00126 void CacheWordProbs(wordID_t) const;
00127 void CacheFreqWords() const;
00128 void ClearWordInCache(wordID_t);
00129 std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
00130
00131 int GetSourceSentenceSize(size_t sentenceId) const
00132 {
00133 return (sentenceId==m_srcSntBreaks.size()-1) ?
00134 m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
00135 m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
00136 }
00137 int GetTargetSentenceSize(size_t sentenceId) const
00138 {
00139 return (sentenceId==m_trgSntBreaks.size()-1) ?
00140 m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
00141 m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
00142 }
00143 };
00144 }
00145 #endif