#include <cstring>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <fstream>
#include <sstream>
#include <vector>
#include <iostream>
#include <set>
#include <boost/thread/tss.hpp>
#include <boost/thread.hpp>
#include <boost/unordered_map.hpp>
#include <boost/program_options.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include <unistd.h>
#include "mm/ug_bitext.h"
Go to the source code of this file.
Classes
class | Cache |
struct | SA |
struct | PTEntry |
struct | PfeComparer |
struct | NlogSigThresholder |
Typedefs
typedef sapt::L2R_Token < sapt::SimpleWordId > | Token |
typedef sapt::mmTtrack< Token > | ttrack_t |
typedef sapt::mmTSA< Token > | tsa_t |
typedef sapt::TokenIndex | tind_t |
typedef size_t | TextLenType |
typedef boost::shared_ptr < std::vector< TextLenType > > | SentIdSet |
Functions
void | usage () |
std::ostream & | operator<< (std::ostream &os, const PTEntry &pp) |
void | print (int a, int b, int c, int d, float p) |
double | fisher_exact (int cfe, int ce, int cf) |
template<class setType > | |
void | ordered_set_intersect (setType &out, const setType set_1, const setType set_2) |
void | lookup_phrase (SentIdSet &ids, const std::string &phrase, tsa_t &my_sa, tind_t &my_v, Cache &cache) |
void | lookup_multiple_phrases (SentIdSet &ids, std::vector< std::string > &phrases, tsa_t &my_sa, tind_t &my_v, const std::string &rule, Cache &cache) |
void | find_occurrences (SentIdSet &ids, const std::string &rule, tsa_t &my_sa, tind_t &my_v, Cache &cache) |
void | compute_cooc_stats_and_filter (std::vector< PTEntry * > &options) |
void | filter_thread (std::istream *in, std::ostream *out, int pfe_index) |
int | main (int argc, char *argv[]) |
Variables
const size_t | MINIMUM_SIZE_TO_KEEP = 10000 |
const std::string | SEPARATOR = " ||| " |
const double | ALPHA_PLUS_EPS = -1000.0 |
const double | ALPHA_MINUS_EPS = -2000.0 |
int | pfe_filter_limit = 0 |
bool | print_cooc_counts = false |
bool | print_neglog_significance = false |
double | sig_filter_limit = 0 |
bool | pef_filter_only = false |
bool | hierarchical = false |
double | p_111 = 0.0 |
size_t | pt_lines = 0 |
size_t | nremoved_sigfilter = 0 |
size_t | nremoved_pfefilter = 0 |
int | num_lines |
boost::mutex | in_mutex |
boost::mutex | out_mutex |
boost::mutex | err_mutex |
std::vector< boost::shared_ptr < SA > > | e_sas |
std::vector< boost::shared_ptr < SA > > | f_sas |
typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet |
Definition at line 69 of file filter-pt.cc.
typedef size_t TextLenType |
Definition at line 67 of file filter-pt.cc.
typedef sapt::TokenIndex tind_t |
Definition at line 59 of file filter-pt.cc.
typedef sapt::L2R_Token<sapt::SimpleWordId> Token |
Definition at line 56 of file filter-pt.cc.
typedef sapt::mmTSA<Token> tsa_t |
Definition at line 58 of file filter-pt.cc.
typedef sapt::mmTtrack<Token> ttrack_t |
Definition at line 57 of file filter-pt.cc.
void compute_cooc_stats_and_filter(std::vector< PTEntry * > &options)
Definition at line 379 of file filter-pt.cc.
References e_sas, f_sas, find_occurrences(), fisher_exact(), nremoved_pfefilter, nremoved_sigfilter, ordered_set_intersect(), pef_filter_only, pfe_filter_limit, and sig_filter_limit.
Referenced by filter_thread().
void filter_thread(std::istream *in, std::ostream *out, int pfe_index)
Definition at line 435 of file filter-pt.cc.
References compute_cooc_stats_and_filter(), e_sas, err_mutex, PTEntry::f_phrase, f_sas, in_mutex, nremoved_pfefilter, nremoved_sigfilter, out_mutex, and pt_lines.
Referenced by main().
void find_occurrences(SentIdSet &ids, const std::string &rule, tsa_t &my_sa, tind_t &my_v, Cache &cache)
Definition at line 344 of file filter-pt.cc.
References hierarchical, lookup_multiple_phrases(), and lookup_phrase().
Referenced by compute_cooc_stats_and_filter().
double fisher_exact(int cfe, int ce, int cf)
Definition at line 244 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), and main().
void lookup_multiple_phrases(SentIdSet &ids, std::vector< std::string > &phrases, tsa_t &my_sa, tind_t &my_v, const std::string &rule, Cache &cache)
Definition at line 312 of file filter-pt.cc.
References lookup_phrase(), and ordered_set_intersect().
Referenced by find_occurrences().
void lookup_phrase(SentIdSet &ids, const std::string &phrase, tsa_t &my_sa, tind_t &my_v, Cache &cache)
Definition at line 280 of file filter-pt.cc.
References sapt::TokenIndex::fillIdSeq(), Cache::get(), I, k, m, MINIMUM_SIZE_TO_KEEP, Cache::put(), sort(), stop, and unique().
Referenced by find_occurrences(), and lookup_multiple_phrases().
int main(int argc, char *argv[])
Definition at line 523 of file filter-pt.cc.
References ALPHA_MINUS_EPS, ALPHA_PLUS_EPS, e_sas, f_sas, filter_thread(), fisher_exact(), hierarchical, nremoved_pfefilter, nremoved_sigfilter, num_lines, optind, p_111, pef_filter_only, pfe_filter_limit, print_cooc_counts, print_neglog_significance, pt_lines, Cache::set_max_cache(), sig_filter_limit, store(), and usage().
std::ostream& operator<<(std::ostream &os, const PTEntry &pp)
Definition at line 225 of file filter-pt.cc.
References PTEntry::ce, PTEntry::cf, PTEntry::cfe, PTEntry::e_phrase, PTEntry::extra, PTEntry::f_phrase, PTEntry::nlog_pte, print_cooc_counts, print_neglog_significance, and PTEntry::scores.
void ordered_set_intersect(setType &out, const setType set_1, const setType set_2) [inline]
Definition at line 273 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), and lookup_multiple_phrases().
void print(int a, int b, int c, int d, float p)
Definition at line 235 of file filter-pt.cc.
void usage()
Definition at line 135 of file filter-pt.cc.
Referenced by main().
const double ALPHA_MINUS_EPS = -2000.0 |
const double ALPHA_PLUS_EPS = -1000.0 |
Definition at line 130 of file filter-pt.cc.
Referenced by main().
boost::mutex err_mutex |
Definition at line 131 of file filter-pt.cc.
Referenced by filter_thread().
bool hierarchical = false |
boost::mutex in_mutex |
const size_t MINIMUM_SIZE_TO_KEEP = 10000 |
size_t nremoved_pfefilter = 0 |
Definition at line 54 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), filter_thread(), and main().
size_t nremoved_sigfilter = 0 |
Definition at line 53 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), filter_thread(), and main().
int num_lines |
boost::mutex out_mutex |
double p_111 = 0.0 |
bool pef_filter_only = false |
Definition at line 48 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), and main().
int pfe_filter_limit = 0 |
Definition at line 43 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), and main().
bool print_cooc_counts = false |
bool print_neglog_significance = false |
size_t pt_lines = 0 |
const std::string SEPARATOR = " ||| " |
double sig_filter_limit = 0 |
Definition at line 46 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), and main().