00001
00002
00003 #include <cassert>
00004 #include <iomanip>
00005 #include <iostream>
00006 #include <fstream>
00007 #include <sstream>
00008 #include <stdexcept>
00009 #include <string>
00010 #include <list>
00011 #include <vector>
00012 #include <set>
00013 #include <algorithm>
00014 #include <cstring>
00015 #include "cmd.h"
00016
00017 using namespace std;
00018
00019 const int MAX_WORD = 10000;
00020 const int MAX_M = 400;
00021 const int MAX_N = 400;
00022
00023 enum Alignment {
00024 UNION = 1,
00025 INTERSECT,
00026 GROW,
00027 SRCTOTGT,
00028 TGTTOSRC,
00029 };
00030
00031 const Enum_T END_ENUM = {0, 0};
00032
00033 namespace
00034 {
00035 Enum_T AlignEnum [] = {
00036 { "union", UNION },
00037 { "u", UNION },
00038 { "intersect", INTERSECT},
00039 { "i", INTERSECT},
00040 { "grow", GROW },
00041 { "g", GROW },
00042 { "srctotgt", SRCTOTGT },
00043 { "s2t", SRCTOTGT },
00044 { "tgttosrc", TGTTOSRC },
00045 { "t2s", TGTTOSRC },
00046 END_ENUM
00047 };
00048
00049 Enum_T BoolEnum [] = {
00050 { "true", true },
00051 { "yes", true },
00052 { "y", true },
00053 { "false", false },
00054 { "no", false },
00055 { "n", false },
00056 END_ENUM
00057 };
00058
00059
00060
00061 int* fa;
00062 int* ea;
00063 int** A;
00064
00065 int verbose=0;
00066
00067
00068
00069 int lc = 0;
00070
00071 int getals(istream& inp,int& m, int *a,int& n, int *b)
00072 {
00073 char w[MAX_WORD], dummy[10];
00074 int i,j,freq;
00075 if (inp >> freq) {
00076 ++lc;
00077
00078 inp >> n;
00079 assert(n<MAX_N);
00080 for (i=1; i<=n; i++) {
00081 inp >> setw(MAX_WORD) >> w;
00082 if (strlen(w)>=MAX_WORD-1) {
00083 cerr << lc << ": target len=" << strlen(w) << " is not less than MAX_WORD-1="
00084 << MAX_WORD-1 << endl;
00085 assert(strlen(w)<MAX_WORD-1);
00086 }
00087 }
00088
00089 inp >> dummy;
00090
00091 for (i=1; i<=n; i++) inp >> b[i];
00092
00093
00094 inp >> m;
00095 assert(m<MAX_M);
00096 for (j=1; j<=m; j++) {
00097 inp >> setw(MAX_WORD) >> w;
00098 if (strlen(w)>=MAX_WORD-1) {
00099 cerr << lc << ": source len=" << strlen(w) << " is not less than MAX_WORD-1="
00100 << MAX_WORD-1 << endl;
00101 assert(strlen(w)<MAX_WORD-1);
00102 }
00103 }
00104
00105 inp >> dummy;
00106
00107
00108 for (j=1; j<=m; j++) {
00109 inp >> a[j];
00110 assert(0<=a[j] && a[j]<=n);
00111 }
00112
00113
00114 for (i=1; i<=n; i++)
00115 assert(0<=b[i] && b[i]<=m);
00116
00117 return 1;
00118
00119 } else
00120 return 0;
00121 }
00122
00123
00124
00125 int prunionalignment(ostream& out,int m,int *a,int n,int* b)
00126 {
00127
00128 ostringstream sout;
00129
00130 for (int j=1; j<=m; j++)
00131 if (a[j])
00132 sout << j-1 << "-" << a[j]-1 << " ";
00133
00134 for (int i=1; i<=n; i++)
00135 if (b[i] && a[b[i]]!=i)
00136 sout << b[i]-1 << "-" << i-1 << " ";
00137
00138
00139 string str = sout.str();
00140 if (str.length() == 0)
00141 str = "\n";
00142 else
00143 str.replace(str.length()-1,1,"\n");
00144
00145 out << str;
00146 out.flush();
00147
00148 return 1;
00149 }
00150
00151
00152
00153
00154 int printersect(ostream& out,int m,int *a,int n,int* b)
00155 {
00156
00157 ostringstream sout;
00158
00159 for (int j=1; j<=m; j++)
00160 if (a[j] && b[a[j]]==j)
00161 sout << j-1 << "-" << a[j]-1 << " ";
00162
00163
00164 string str = sout.str();
00165 if (str.length() == 0)
00166 str = "\n";
00167 else
00168 str.replace(str.length()-1,1,"\n");
00169
00170 out << str;
00171 out.flush();
00172
00173 return 1;
00174 }
00175
00176
00177
00178 int printtgttosrc(ostream& out,int m,int *a,int n,int* b)
00179 {
00180
00181 ostringstream sout;
00182
00183 for (int i=1; i<=n; i++)
00184 if (b[i])
00185 sout << b[i]-1 << "-" << i-1 << " ";
00186
00187
00188 string str = sout.str();
00189 if (str.length() == 0)
00190 str = "\n";
00191 else
00192 str.replace(str.length()-1,1,"\n");
00193
00194 out << str;
00195 out.flush();
00196
00197 return 1;
00198 }
00199
00200
00201
00202 int printsrctotgt(ostream& out,int m,int *a,int n,int* b)
00203 {
00204
00205 ostringstream sout;
00206
00207 for (int j=1; j<=m; j++)
00208 if (a[j])
00209 sout << j-1 << "-" << a[j]-1 << " ";
00210
00211
00212 string str = sout.str();
00213 if (str.length() == 0)
00214 str = "\n";
00215 else
00216 str.replace(str.length()-1,1,"\n");
00217
00218 out << str;
00219 out.flush();
00220
00221 return 1;
00222 }
00223
00224
00225
00226
00227
00228
00229
00230 int printgrow(ostream& out,int m,int *a,int n,int* b, bool diagonal=false,bool isfinal=false,bool bothuncovered=false)
00231 {
00232
00233 ostringstream sout;
00234
00235 vector <pair <int,int> > neighbors;
00236
00237 pair <int,int> entry;
00238
00239 neighbors.push_back(make_pair(-1,-0));
00240 neighbors.push_back(make_pair(0,-1));
00241 neighbors.push_back(make_pair(1,0));
00242 neighbors.push_back(make_pair(0,1));
00243
00244
00245 if (diagonal) {
00246 neighbors.push_back(make_pair(-1,-1));
00247 neighbors.push_back(make_pair(-1,1));
00248 neighbors.push_back(make_pair(1,-1));
00249 neighbors.push_back(make_pair(1,1));
00250 }
00251
00252
00253 int i,j;
00254 size_t o;
00255
00256
00257
00258
00259 memset(fa,0,(m+1)*sizeof(int));
00260 memset(ea,0,(n+1)*sizeof(int));
00261
00262
00263
00264
00265 for (int i=1; i<=n; i++) memset(A[i],0,(m+1)*sizeof(int));
00266
00267 set <pair <int,int> > currentpoints;
00268 set <pair <int,int> > unionalignment;
00269
00270 pair <int,int> point;
00271 set<pair <int,int> >::const_iterator k;
00272
00273
00274 for (j=1; j<=m; j++) {
00275 if (a[j]) {
00276 unionalignment.insert(make_pair(a[j],j));
00277 if (b[a[j]]==j) {
00278 fa[j]=1;
00279 ea[a[j]]=1;
00280 A[a[j]][j]=2;
00281 currentpoints.insert(make_pair(a[j],j));
00282 } else
00283 A[a[j]][j]=-1;
00284 }
00285 }
00286
00287 for (i=1; i<=n; i++)
00288 if (b[i] && a[b[i]]!=i) {
00289 unionalignment.insert(make_pair(i,b[i]));
00290 A[i][b[i]]=1;
00291 }
00292
00293
00294 int added=1;
00295
00296 while (added) {
00297 added=0;
00299 for (k=currentpoints.begin(); k!=currentpoints.end(); k++) {
00300
00301 for (o=0; o<neighbors.size(); o++) {
00302
00303 point.first=k->first+neighbors[o].first;
00304 point.second=k->second+neighbors[o].second;
00305
00306
00307 if (point.first>0 && point.first <=n && point.second>0 && point.second<=m)
00308
00309 if (b[point.first]==point.second || a[point.second]==point.first) {
00310
00311
00312 if (!(ea[point.first] && fa[point.second])) {
00313
00314 currentpoints.insert(point);
00315 A[point.first][point.second]=2;
00316 ea[point.first]=1;
00317 fa[point.second]=1;
00318 added=1;
00319
00320 }
00321 }
00322 }
00323 }
00324 }
00325
00326 if (isfinal) {
00327 for (k=unionalignment.begin(); k!=unionalignment.end(); k++)
00328 if (A[k->first][k->second]==1) {
00329 point.first=k->first;
00330 point.second=k->second;
00331
00332
00333 if ((bothuncovered && !ea[point.first] && !fa[point.second]) ||
00334 (!bothuncovered && !(ea[point.first] && fa[point.second]))) {
00335
00336 currentpoints.insert(point);
00337 A[point.first][point.second]=2;
00338
00339 ea[point.first]=1;
00340 fa[point.second]=1;
00341
00342
00343
00344 }
00345 }
00346
00347 for (k=unionalignment.begin(); k!=unionalignment.end(); k++)
00348 if (A[k->first][k->second]==-1) {
00349 point.first=k->first;
00350 point.second=k->second;
00351
00352
00353 if ((bothuncovered && !ea[point.first] && !fa[point.second]) ||
00354 (!bothuncovered && !(ea[point.first] && fa[point.second]))) {
00355
00356 currentpoints.insert(point);
00357 A[point.first][point.second]=2;
00358
00359 ea[point.first]=1;
00360 fa[point.second]=1;
00361
00362
00363
00364 }
00365 }
00366 }
00367
00368
00369 for (k=currentpoints.begin(); k!=currentpoints.end(); k++)
00370 sout << k->second-1 << "-" << k->first-1 << " ";
00371
00372
00373
00374 string str = sout.str();
00375 if (str.length() == 0)
00376 str = "\n";
00377 else
00378 str.replace(str.length()-1,1,"\n");
00379
00380 out << str;
00381 out.flush();
00382 return 1;
00383
00384 return 1;
00385 }
00386
00387 }
00388
00389
00390
00391
00392
00393 int main(int argc, char** argv)
00394 {
00395
00396 int alignment=0;
00397 char* input= NULL;
00398 char* output= NULL;
00399 int diagonal=false;
00400 int isfinal=false;
00401 int bothuncovered=false;
00402
00403
00404 DeclareParams("a", CMDENUMTYPE, &alignment, AlignEnum,
00405 "alignment", CMDENUMTYPE, &alignment, AlignEnum,
00406 "d", CMDENUMTYPE, &diagonal, BoolEnum,
00407 "diagonal", CMDENUMTYPE, &diagonal, BoolEnum,
00408 "f", CMDENUMTYPE, &isfinal, BoolEnum,
00409 "final", CMDENUMTYPE, &isfinal, BoolEnum,
00410 "b", CMDENUMTYPE, &bothuncovered, BoolEnum,
00411 "both", CMDENUMTYPE, &bothuncovered, BoolEnum,
00412 "i", CMDSTRINGTYPE, &input,
00413 "o", CMDSTRINGTYPE, &output,
00414 "v", CMDENUMTYPE, &verbose, BoolEnum,
00415 "verbose", CMDENUMTYPE, &verbose, BoolEnum,
00416
00417 NULL);
00418
00419 GetParams(&argc, &argv, NULL);
00420
00421 if (alignment==0) {
00422 cerr << "usage: symal [-i=<inputfile>] [-o=<outputfile>] -a=[u|i|g] -d=[yes|no] -b=[yes|no] -f=[yes|no] \n"
00423 << "Input file or std must be in .bal format (see script giza2bal.pl).\n";
00424
00425 exit(1);
00426 }
00427
00428 istream *inp = &std::cin;
00429 ostream *out = &std::cout;
00430
00431 try {
00432 if (input) {
00433 fstream *fin = new fstream(input,ios::in);
00434 if (!fin->is_open()) throw runtime_error("cannot open " + string(input));
00435 inp = fin;
00436 }
00437
00438 if (output) {
00439 fstream *fout = new fstream(output,ios::out);
00440 if (!fout->is_open()) throw runtime_error("cannot open " + string(output));
00441 out = fout;
00442 }
00443
00444 int a[MAX_M],b[MAX_N],m,n;
00445 fa=new int[MAX_M+1];
00446 ea=new int[MAX_N+1];
00447
00448
00449 int sents = 0;
00450 A=new int *[MAX_N+1];
00451 for (int i=1; i<=MAX_N; i++) A[i]=new int[MAX_M+1];
00452
00453 switch (alignment) {
00454 case UNION:
00455 cerr << "symal: computing union alignment\n";
00456 while(getals(*inp,m,a,n,b)) {
00457 prunionalignment(*out,m,a,n,b);
00458 sents++;
00459 }
00460 cerr << "Sents: " << sents << endl;
00461 break;
00462 case INTERSECT:
00463 cerr << "symal: computing intersect alignment\n";
00464 while(getals(*inp,m,a,n,b)) {
00465 printersect(*out,m,a,n,b);
00466 sents++;
00467 }
00468 cerr << "Sents: " << sents << endl;
00469 break;
00470 case GROW:
00471 cerr << "symal: computing grow alignment: diagonal ("
00472 << diagonal << ") final ("<< isfinal << ")"
00473 << "both-uncovered (" << bothuncovered <<")\n";
00474
00475 while(getals(*inp,m,a,n,b))
00476 printgrow(*out,m,a,n,b,diagonal,isfinal,bothuncovered);
00477
00478 break;
00479 case TGTTOSRC:
00480 cerr << "symal: computing target-to-source alignment\n";
00481
00482 while(getals(*inp,m,a,n,b)) {
00483 printtgttosrc(*out,m,a,n,b);
00484 sents++;
00485 }
00486 cerr << "Sents: " << sents << endl;
00487 break;
00488 case SRCTOTGT:
00489 cerr << "symal: computing source-to-target alignment\n";
00490
00491 while(getals(*inp,m,a,n,b)) {
00492 printsrctotgt(*out,m,a,n,b);
00493 sents++;
00494 }
00495 cerr << "Sents: " << sents << endl;
00496 break;
00497 default:
00498 throw runtime_error("Unknown alignment");
00499 }
00500
00501 delete [] fa;
00502 delete [] ea;
00503 for (int i=1; i<=MAX_N; i++) delete [] A[i];
00504 delete [] A;
00505
00506 if (inp != &std::cin) {
00507 delete inp;
00508 }
00509 if (out != &std::cout) {
00510 delete inp;
00511 }
00512 } catch (const std::exception &e) {
00513 cerr << e.what() << std::endl;
00514 exit(1);
00515 }
00516
00517 exit(0);
00518 }