-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvaryingrows.cpp
179 lines (149 loc) · 4.88 KB
/
varyingrows.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/* varyingrows: get (or remove) the most varying rows
*
* Copyright (C) 2018 Andrew D. Smith
*
* Authors: Andrew D. Smith
*
* This program is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <cstdlib>
#include <exception>
#include <fstream>
#include <functional>
#include <iostream>
#include <queue>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

#include <gsl/gsl_statistics_double.h>

#include "OptionParser.hpp"
#include "smithlab_utils.hpp"
#include "smithlab_os.hpp"
using std::string;
using std::vector;
using std::ifstream;
using std::cerr;
using std::cout;
using std::endl;
using std::unordered_set;
using std::istringstream;
using std::priority_queue;
using std::pair;
using std::make_pair;
using std::greater;
using std::runtime_error;
/* Parse the numeric columns of one table row.
 *
 * The first whitespace-delimited token of `row` is treated as the row
 * name and discarded. Every following token is read as a double and
 * appended to `values`, which is cleared first. Reading stops at the
 * end of the row or at the first token that cannot be read as a
 * double; tokens after that point are ignored.
 */
static void
parse_table_row(const string &row, vector<double> &values) {
  // Build the stream from the row itself. Pointing the stringbuf at the
  // row's storage with pubsetbuf() has an implementation-defined effect
  // and leaves the stream empty on standard libraries where setbuf() is
  // a no-op, so no values would be parsed there.
  std::istringstream is(row);

  string dummy;
  is >> dummy; // eliminate the row name

  values.clear();
  double val = 0.0;
  while (is >> val)
    values.push_back(val);
}
/* Extract the row name from a table line: everything that precedes the
 * first space or tab. When the line contains neither, the whole line is
 * the name. The result is written to `rowname`.
 */
static void
get_row_name(const string &line, string &rowname) {
  const string::size_type name_end = line.find_first_of(" \t");
  // name_end is npos when no delimiter exists, which selects the
  // remainder of the line
  rowname = string(line, 0, name_end);
}
/* Compute the variance of the numeric values in one table line.
 *
 * The line is parsed with parse_table_row() and the resulting values
 * are passed to gsl_stats_variance() with stride 1.
 *
 * Rows with fewer than two values return 0.0 without calling GSL. For
 * an empty row, taking the address of the first element would be
 * undefined behaviour. For a single value, the sample variance divides
 * by n - 1 = 0; the resulting NaN would break the ordering used when
 * rows are ranked by variance.
 */
static double
get_row_variance(const string &line) {
  vector<double> vals;
  parse_table_row(line, vals);
  if (vals.size() < 2)
    return 0.0;
  return gsl_stats_variance(vals.data(), 1, vals.size());
}
/* Entry point: select (or, with the invert option, remove) the rows of
 * a matrix file that have the largest variance.
 *
 * The input file is read twice. The first pass skips the header line,
 * computes the variance of each remaining line, and keeps the names of
 * the `n_top` highest-variance rows in a bounded min-heap. The second
 * pass echoes the header and then writes each line whose row name is
 * (or, when inverting, is not) among the retained names. Output goes
 * to the file given by the outfile option, or to standard output when
 * that option is empty.
 *
 * Returns EXIT_SUCCESS on success or when help/about text was
 * requested, and EXIT_FAILURE when the command line is unusable, a
 * file cannot be opened, or an exception is caught.
 */
int
main(int argc, const char **argv) {

  try {

    string outfile;
    bool VERBOSE = false;
    bool INVERT = false;
    size_t n_top = 10000;

    /****************** COMMAND LINE OPTIONS ********************/
    OptionParser opt_parse(strip_path(argv[0]), "select to keep or remove the top "
                           "most varying rows in the matrix", "<matrix>");
    opt_parse.add_opt("outfile", 'o', "output file", false, outfile);
    opt_parse.add_opt("top", 't', "number of top varying to get", false, n_top);
    opt_parse.add_opt("invert", 'I', "invert the selection", false, INVERT);
    opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
    vector<string> leftover_args;
    opt_parse.parse(argc, argv, leftover_args);
    if (argc == 1 || opt_parse.help_requested()) {
      cerr << opt_parse.help_message() << endl
           << opt_parse.about_message() << endl;
      return EXIT_SUCCESS;
    }
    if (opt_parse.about_requested()) {
      cerr << opt_parse.about_message() << endl;
      return EXIT_SUCCESS;
    }
    // An unusable command line is an error, so report failure to the
    // caller rather than success
    if (opt_parse.option_missing()) {
      cerr << opt_parse.option_missing_message() << endl;
      return EXIT_FAILURE;
    }
    if (leftover_args.size() != 1) {
      cerr << opt_parse.help_message() << endl;
      return EXIT_FAILURE;
    }
    const string table_file(leftover_args.back());
    /****************** END COMMAND LINE OPTIONS *****************/

    if (VERBOSE)
      cerr << "[extracting row variances]" << endl;

    ifstream in(table_file);
    if (!in)
      throw runtime_error("could not open file: " + table_file);

    // Min-heap on (variance, row name): the smallest retained variance
    // is on top, so popping whenever the heap exceeds n_top leaves the
    // n_top largest entries
    typedef pair<double, string> var_row;
    priority_queue<var_row, vector<var_row>, greater<var_row> > top_row_sorter;

    string header;
    getline(in, header); // first remove header

    string line;
    size_t lines_read = 0;
    while (getline(in, line)) {
      lines_read++;
      string curr_row;
      get_row_name(line, curr_row);
      const double var = get_row_variance(line);
      top_row_sorter.push(make_pair(var, curr_row));
      if (top_row_sorter.size() > n_top)
        top_row_sorter.pop();
      if (VERBOSE && (lines_read % 10000 == 0))
        cerr << "lines read: " << lines_read << '\r';
    }
    // The reported total is one more than the number of data rows
    // counted above; presumably this is meant to include the header
    if (VERBOSE)
      cerr << "lines read: " << lines_read + 1 << endl;
    in.close();

    if (VERBOSE)
      cerr << "[identifying top varying rows]" << endl;

    unordered_set<string> good_rows;
    while (!top_row_sorter.empty()) {
      good_rows.insert(top_row_sorter.top().second);
      top_row_sorter.pop();
    }

    // Second pass: the first pass left eof/fail bits set, so clear them
    // before reopening and verify that the reopen worked
    in.clear();
    in.open(table_file);
    if (!in)
      throw runtime_error("could not open file: " + table_file);

    std::ofstream of;
    if (!outfile.empty()) {
      of.open(outfile.c_str());
      // without this check a bad output path would silently drop all output
      if (!of)
        throw runtime_error("could not open output file: " + outfile);
    }
    std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());

    // '\n' rather than endl below: no need to flush after every row
    getline(in, line);
    out << line << '\n';

    if (VERBOSE)
      cerr << "[selecting top varying rows]" << endl;

    while (getline(in, line)) {
      string curr_row;
      get_row_name(line, curr_row);
      const bool found = (good_rows.find(curr_row) != good_rows.end());
      // keep retained rows normally; keep the others when inverting
      if (found != INVERT)
        out << line << '\n';
    }
    out.flush();
  }
  catch (const std::exception &e) {
    cerr << e.what() << endl;
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}