-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataframe-euclidean-dist.cpp
188 lines (154 loc) · 5.46 KB
/
dataframe-euclidean-dist.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
/* dataframe-euclidean-dist: produce a distance matrix for all pairs
* of columns in a matrix with very many rows
*
* Copyright (C) 2018 Andrew D. Smith
*
* Authors: Andrew D. Smith
*
* This program is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "OptionParser.hpp"
#include "smithlab_utils.hpp"
#include "smithlab_os.hpp"

using std::string;
using std::to_string;
using std::vector;
using std::ifstream;
using std::cerr;
using std::cout;
using std::endl;
using std::istringstream;
using std::runtime_error;
/* Count the lines of a file by memory-mapping it and scanning for
 * newline characters.
 *
 * filename: path of the file to examine
 *
 * Returns the number of '\n' characters, plus one if the file is
 * non-empty and its last byte is not '\n', so that the result equals
 * the number of lines a getline loop over the same file yields. An
 * empty file has 0 lines.
 *
 * Throws runtime_error if the file cannot be opened, examined, mapped
 * or unmapped. The descriptor is closed on every exit path.
 */
static size_t
count_lines_fast(const string &filename) {
  // closes the descriptor on normal return and when an exception
  // propagates
  struct fd_guard {
    int fd;
    ~fd_guard() { if (fd >= 0) ::close(fd); }
  };

  const fd_guard guard{open(filename.c_str(), O_RDONLY, 0)};
  if (guard.fd < 0)
    throw runtime_error("bad file: " + filename);

  // query the open descriptor so the size belongs to the file that is
  // actually mapped, and check the result before trusting st
  struct stat st;
  if (fstat(guard.fd, &st) != 0)
    throw runtime_error("failed stat for: " + filename);

  // a zero-length mapping is rejected by mmap; nothing to count anyway
  if (st.st_size <= 0)
    return 0;
  const size_t n_bytes = static_cast<size_t>(st.st_size);

  void *mapped = mmap(nullptr, n_bytes, PROT_READ,
                      MAP_PRIVATE | MAP_NORESERVE, guard.fd, 0);
  if (mapped == MAP_FAILED)
    throw runtime_error("failed mmap for: " + filename);

  const char *first = static_cast<const char *>(mapped);
  const char *last = first + n_bytes;
  size_t n_lines = static_cast<size_t>(std::count(first, last, '\n'));
  // a final line without a terminating newline is still a line
  if (*(last - 1) != '\n')
    ++n_lines;

  if (munmap(mapped, n_bytes) != 0)
    throw runtime_error("failed to release mmap for: " + filename);

  return n_lines;
}
/* Euclidean distance between two vectors.
 *
 * a, b: the vectors to compare; the first a.size() elements of each
 *       are used, so b must have at least a.size() elements
 * aux:  caller-supplied scratch buffer, passed in so it can be reused
 *       across calls; it is grown to a.size() if smaller and on return
 *       its first a.size() entries hold the squared differences
 *
 * Returns the square root of the sum of squared element-wise
 * differences (0 for empty input). Throws runtime_error if b is
 * shorter than a.
 */
template <typename T> T
euclidean_dist(const vector<T> &a, const vector<T> &b, vector<T> &aux) {
  if (b.size() < a.size())
    throw runtime_error("euclidean_dist: second vector shorter than first");

  // grow only, never shrink, so a reused buffer keeps its storage
  if (aux.size() < a.size())
    aux.resize(a.size());

  const auto aux_end =
    std::transform(a.begin(), a.end(), b.begin(), aux.begin(),
                   [](const T x, const T y) {
                     const T d = x - y; return d*d;});

  // sum only the entries written above: a scratch buffer longer than
  // the input may still hold values from an earlier call; the 0.0
  // seed makes the summation run in double
  return std::sqrt(std::accumulate(aux.begin(), aux_end, 0.0));
}
/* Parse one data row of the table into column-major storage.
 *
 * row:    a whitespace-separated line: a leading label field followed
 *         by one numeric field per column
 * id:     zero-based index of this row; used as the position written
 *         within each column
 * values: values[i][id] receives the i-th numeric field, so every
 *         column must already hold more than id elements
 *
 * Fields beyond the first values.size() numeric ones are ignored.
 * Throws runtime_error if id lies outside a column, or if a numeric
 * field is missing or cannot be parsed.
 */
template <typename T> static void
parse_table_row(const string &row, const size_t id, vector<vector<T> > &values) {
  // build the stream from the string itself: pointing the stream
  // buffer at the string with pubsetbuf has implementation-defined
  // effect for string streams and does nothing in some libraries
  std::istringstream is(row);
  string dummy;
  is >> dummy; // eliminate first column
  for (size_t i = 0; i < values.size(); ++i) {
    // refuse to write past the end of a column
    if (id >= values[i].size())
      throw runtime_error("row index out of range: " + to_string(id + 1));
    if (!(is >> values[i][id]))
      throw runtime_error("bad line: " + row + "[" + to_string(id + 1) + "]");
  }
}
/* Split a line into its whitespace-separated fields.
 *
 * line:  the text to split
 * parts: cleared, then filled with the fields in order of appearance;
 *        T must be implicitly constructible from string
 *
 * Leading, trailing and repeated whitespace produce no empty fields,
 * so an empty or all-blank line leaves parts empty.
 */
template <typename T> static void
parse_strings_whitespace(const string &line, vector<T> &parts) {
  // build the stream from the string itself: pointing the stream
  // buffer at the string with pubsetbuf has implementation-defined
  // effect for string streams and does nothing in some libraries
  istringstream parser(line);
  parts.clear();
  string buffer;
  while (parser >> buffer)
    parts.push_back(buffer);
}
/* Read a whitespace-separated data frame (a header line of column
 * names, then rows made of a label followed by one number per column)
 * and write the Euclidean distance between every pair of columns.
 *
 * The output is the original header line followed by one line per
 * column: the column name, then the tab-separated distances to each
 * earlier column (the lower triangle of the distance matrix). It goes
 * to the file given with -o, or to standard output.
 *
 * Returns EXIT_SUCCESS on success or when help/about is requested,
 * and EXIT_FAILURE on a usage error or any exception.
 */
int
main(int argc, const char **argv) {
  try {
    string outfile;
    bool VERBOSE = false;

    /****************** COMMAND LINE OPTIONS ********************/
    OptionParser opt_parse(strip_path(argv[0]), "make a dist matrix "
                           "from a data frame", "<data-frame>");
    opt_parse.add_opt("outfile", 'o', "output file", false, outfile);
    opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
    vector<string> leftover_args;
    opt_parse.parse(argc, argv, leftover_args);
    if (argc == 1 || opt_parse.help_requested()) {
      cerr << opt_parse.help_message() << endl
           << opt_parse.about_message() << endl;
      return EXIT_SUCCESS;
    }
    if (opt_parse.about_requested()) {
      cerr << opt_parse.about_message() << endl;
      return EXIT_SUCCESS;
    }
    // usage errors report failure so calling scripts can detect them
    if (opt_parse.option_missing()) {
      cerr << opt_parse.option_missing_message() << endl;
      return EXIT_FAILURE;
    }
    if (leftover_args.size() != 1) {
      cerr << opt_parse.help_message() << endl;
      return EXIT_FAILURE;
    }
    const string table_file(leftover_args.back());
    /****************** END COMMAND LINE OPTIONS *****************/

    ifstream in(table_file);
    if (!in)
      throw runtime_error("cannot open: " + table_file);

    string header;
    if (!getline(in, header))
      throw runtime_error("could not extract header from: " + table_file);

    // the line count includes the header; do not subtract from zero,
    // which happens when the file contains no newline at all
    const size_t n_lines = count_lines_fast(table_file);
    size_t n_rows = (n_lines > 0) ? n_lines - 1 : 0;
    if (VERBOSE)
      cerr << "n_rows=" << n_rows << endl;

    vector<string> column_names;
    parse_strings_whitespace(header, column_names);
    const size_t n_columns = column_names.size();
    if (VERBOSE)
      cerr << "n_columns=" << n_columns << endl;

    // column-major: the_table[c][r] is the value of column c in row r
    vector<vector<double> > the_table(n_columns, vector<double>(n_rows));
    string line;
    size_t row_idx = 0;
    while (getline(in, line)) {
      // the file may hold more lines than the count predicted (for
      // example a last line without a newline); make room instead of
      // writing past the end of the columns
      if (row_idx >= n_rows) {
        n_rows = row_idx + 1;
        for (auto &column : the_table)
          column.resize(n_rows);
      }
      parse_table_row(line, row_idx++, the_table);
    }

    // scratch space shared by all distance computations
    vector<double> auxiliary(n_rows, 0.0);
    vector<vector<double> > dist(n_columns, vector<double>(n_columns));
    for (size_t i = 0; i < the_table.size(); ++i)
      for (size_t j = 0; j < i; ++j)
        dist[i][j] = euclidean_dist(the_table[i], the_table[j], auxiliary);

    std::ofstream of;
    if (!outfile.empty()) {
      of.open(outfile.c_str());
      if (!of)
        throw runtime_error("cannot open output file: " + outfile);
    }
    std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());

    // terminate the header so the first row starts on its own line
    out << header << '\n';
    for (size_t i = 0; i < dist.size(); ++i) {
      out << column_names[i];
      for (size_t j = 0; j < i; ++j)
        out << '\t' << dist[i][j];
      out << '\n';
    }
    // single flush at the end; also surfaces write errors
    if (!out.flush())
      throw runtime_error("failed writing output");
  }
  // std::exception rather than runtime_error alone, so allocation
  // failures on large tables are reported instead of aborting
  catch (const std::exception &e) {
    cerr << e.what() << endl;
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}