00001 #include "line_splitter.hh" 00002 00003 namespace Moses 00004 { 00005 00006 line_text splitLine(const StringPiece &textin, bool scfg) 00007 { 00008 const char delim[] = "|||"; 00009 line_text output; 00010 00011 //Tokenize 00012 util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim)); 00013 //Get source phrase 00014 output.source_phrase = Trim(*it); 00015 //std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl; 00016 00017 //Get target_phrase 00018 it++; 00019 output.target_phrase = Trim(*it); 00020 //std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl; 00021 00022 if (scfg) { 00023 /* 00024 std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; 00025 std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; 00026 reformatSCFG(output); 00027 std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; 00028 std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; 00029 */ 00030 } 00031 00032 //Get probabilities 00033 it++; 00034 output.prob = Trim(*it); 00035 //std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl; 00036 00037 //Get WordAllignment 00038 it++; 00039 if (it == util::TokenIter<util::MultiCharacter>::end()) return output; 00040 output.word_align = Trim(*it); 00041 //std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl; 00042 00043 //Get count 00044 it++; 00045 if (it == util::TokenIter<util::MultiCharacter>::end()) return output; 00046 output.counts = Trim(*it); 00047 //std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl; 00048 00049 //Get sparse_score 00050 it++; 00051 if (it == util::TokenIter<util::MultiCharacter>::end()) return output; 00052 output.sparse_score = Trim(*it); 00053 //std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl; 00054 00055 //Get property 00056 it++; 00057 if (it == util::TokenIter<util::MultiCharacter>::end()) return output; 00058 output.property = Trim(*it); 00059 //std::cerr << "output.property=" << output.property << "AAAA" << std::endl; 00060 00061 return output; 00062 } 00063 00064 std::vector<unsigned char> splitWordAll1(const StringPiece &textin) 00065 { 00066 const char delim[] = " "; 00067 const char delim2[] = "-"; 00068 std::vector<unsigned char> output; 00069 00070 //Case with no word alignments. 00071 if (textin.size() == 0) { 00072 return output; 00073 } 00074 00075 //Split on space 00076 util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim)); 00077 00078 //For each int 00079 while (it) { 00080 //Split on dash (-) 00081 util::TokenIter<util::MultiCharacter> itInner(*it, 00082 util::MultiCharacter(delim2)); 00083 00084 //Insert the two entries in the vector. User will read entry 0 and 1 to get the first, 00085 //2 and 3 for second etc. Use unsigned char instead of int to save space, as 00086 //word allignments are all very small numbers that fit in a single byte 00087 output.push_back((unsigned char) (atoi(itInner->data()))); 00088 itInner++; 00089 output.push_back((unsigned char) (atoi(itInner->data()))); 00090 it++; 00091 } 00092 00093 return output; 00094 00095 } 00096 00097 void reformatSCFG(line_text &output) 00098 { 00099 00100 } 00101 00102 } 00103