Skip to content

Commit

Permalink
Merge pull request #2 from biocore/in-memory-one-off
Browse files Browse the repository at this point in the history
In memory one off
  • Loading branch information
wasade authored Mar 21, 2022
2 parents 22e0302 + 80aeab0 commit 864cc46
Show file tree
Hide file tree
Showing 9 changed files with 578 additions and 107 deletions.
1 change: 1 addition & 0 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ install: libssu.so ssu faithpd
mkdir -p ${PREFIX}/include/unifrac
cp task_parameters.hpp ${PREFIX}/include/unifrac/
cp api.hpp ${PREFIX}/include/unifrac/
cp status_enum.hpp ${PREFIX}/include/unifrac/

rapi_test: main
mkdir -p ~/.R
Expand Down
237 changes: 170 additions & 67 deletions src/api.cpp

Large diffs are not rendered by default.

95 changes: 85 additions & 10 deletions src/api.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "task_parameters.hpp"
#include "status_enum.hpp"

#ifdef __cplusplus
#include <vector>
Expand All @@ -14,10 +15,6 @@
#define PARTIAL_MAGIC_V2 0x088ABA02


typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error} ComputeStatus;
typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus;
typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus;

/* a result matrix
*
* n_samples <uint> the number of samples.
Expand Down Expand Up @@ -122,14 +119,49 @@ typedef struct partial_dyn_mat {
char* filename;
} partial_dyn_mat_t;

/* support structure to carry in biom table information
*
* obs_ids <char**> the observation IDs
* sample_ids <char**> the sample IDs
* indices <int32_t*> the indices of the data values
* indptr <int32_t*> the row offset of the data values
* data <double*> the actual matrix values
* n_obs <int> the number of observations, corresponding to length of obs_ids
* n_samples <int> the number of samples, corresponding to the length of sample_ids
* nnz <int> the number of nonzero values, corresponding to the length of data and indices
*/
typedef struct support_biom {
char** obs_ids;
char** sample_ids;
uint32_t* indices;
uint32_t* indptr;
double* data;
int n_obs;
int n_samples;
int nnz;
} support_biom_t;

/* support structure to carry in bptree information
*
* structure <bool*> the topology of the tree
* lengths <double*> the branch lengths of the tree
* names <char**> the names of the tips and internal nodes of hte tree
* n_parens <int> the length of the structure array
*/
typedef struct support_bptree {
bool* structure;
double* lengths;
char** names;
int n_parens;
} support_bptree_t;


void destroy_mat(mat_t** result);
void destroy_mat_full_fp64(mat_full_fp64_t** result);
void destroy_mat_full_fp32(mat_full_fp32_t** result);
void destroy_partial_mat(partial_mat_t** result);
void destroy_partial_dyn_mat(partial_dyn_mat_t** result);
void destroy_results_vec(r_vec** result);
EXTERN void destroy_mat(mat_t** result);
EXTERN void destroy_mat_full_fp64(mat_full_fp64_t** result);
EXTERN void destroy_mat_full_fp32(mat_full_fp32_t** result);
EXTERN void destroy_partial_mat(partial_mat_t** result);
EXTERN void destroy_partial_dyn_mat(partial_dyn_mat_t** result);
EXTERN void destroy_results_vec(r_vec** result);

/* Compute UniFrac - condensed form
*
Expand All @@ -154,6 +186,49 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int threads, mat_t** result);


/* Compute UniFrac - against in-memory objects returning full form matrix
*
* table <biom> a constructed BIOM object
* tree <BPTree> a constructed BPTree object
* unifrac_method <const char*> the requested unifrac method.
* variance_adjust <bool> whether to apply variance adjustment.
* alpha <double> GUniFrac alpha, only relevant if method == generalized.
* bypass_tips <bool> disregard tips, reduces compute by about 50%
* threads <uint> the number of threads to use.
* result <mat_full_fp64_t**> the resulting distance matrix in full form, this is initialized within the method so using **
*
* one_off_inmem returns the following error codes:
*
* okay : no problems encountered
* unknown_method : the requested method is unknown.
* table_empty : the table does not have any entries
*/
EXTERN ComputeStatus one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data,
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int threads, mat_full_fp64_t** result);

/* Compute UniFrac - against in-memory objects returning full form matrix, fp32
*
* table <biom> a constructed BIOM object
* tree <BPTree> a constructed BPTree object
* unifrac_method <const char*> the requested unifrac method.
* variance_adjust <bool> whether to apply variance adjustment.
* alpha <double> GUniFrac alpha, only relevant if method == generalized.
* bypass_tips <bool> disregard tips, reduces compute by about 50%
* threads <uint> the number of threads to use.
* result <mat_full_fp32_t**> the resulting distance matrix in full form, this is initialized within the method so using **
*
* one_off_inmem returns the following error codes:
*
* okay : no problems encountered
* unknown_method : the requested method is unknown.
* table_empty : the table does not have any entries
*/
EXTERN ComputeStatus one_off_inmem_fp32(const support_biom_t *table_data, const support_bptree_t *tree_data,
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int threads, mat_full_fp32_t** result);

/* Compute UniFrac - matrix form
*
* biom_filename <const char*> the filename to the biom table.
Expand Down
161 changes: 140 additions & 21 deletions src/biom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ const std::string SAMPLE_INDICES = std::string("/sample/matrix/indices");
const std::string SAMPLE_DATA = std::string("/sample/matrix/data");
const std::string SAMPLE_IDS = std::string("/sample/ids");

biom::biom(std::string filename) {
biom::biom(std::string filename) : has_hdf5_backing(true) {
file = H5File(filename.c_str(), H5F_ACC_RDONLY);

/* establish the datasets */
Expand Down Expand Up @@ -55,9 +55,51 @@ biom::biom(std::string filename) {
obs_id_index = std::unordered_map<std::string, uint32_t>();
sample_id_index = std::unordered_map<std::string, uint32_t>();

create_id_index(obs_ids, obs_id_index);
create_id_index(sample_ids, sample_id_index);
#pragma omp parallel for schedule(static)
for(int i = 0; i < 3; i++) {
if(i == 0)
create_id_index(obs_ids, obs_id_index);
else if(i == 1)
create_id_index(sample_ids, sample_id_index);
else if(i == 2)
malloc_resident(n_obs);
}

uint32_t *current_indices = NULL;
double *current_data = NULL;
for(unsigned int i = 0; i < obs_ids.size(); i++) {
std::string id_ = obs_ids[i];
unsigned int n = get_obs_data_direct(id_, current_indices, current_data);
obs_counts_resident[i] = n;
obs_indices_resident[i] = current_indices;
obs_data_resident[i] = current_data;
}
sample_counts = get_sample_counts();
}

biom::~biom() {
if(has_hdf5_backing) {
if(obs_indices_resident != NULL && obs_data_resident != NULL) {
for(unsigned int i = 0; i < n_obs; i++) {
if(obs_indices_resident[i] != NULL)
free(obs_indices_resident[i]);
if(obs_data_resident[i] != NULL)
free(obs_data_resident[i]);
}
}

if(obs_indices_resident != NULL)
free(obs_indices_resident);
if(obs_data_resident != NULL)
free(obs_data_resident);
if(obs_counts_resident != NULL)
free(obs_counts_resident);
}
// else, it is the responsibility of the entity constructing this object
// to clean itself up
}

void biom::malloc_resident(uint32_t n_obs) {
/* load obs sparse data */
obs_indices_resident = (uint32_t**)malloc(sizeof(uint32_t**) * n_obs);
if(obs_indices_resident == NULL) {
Expand All @@ -77,30 +119,82 @@ biom::biom(std::string filename) {
sizeof(unsigned int) * n_obs, __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
}

uint32_t *current_indices = NULL;
double *current_data = NULL;
for(unsigned int i = 0; i < obs_ids.size(); i++) {
std::string id_ = obs_ids[i];
unsigned int n = get_obs_data_direct(id_, current_indices, current_data);
obs_counts_resident[i] = n;
obs_indices_resident[i] = current_indices;
obs_data_resident[i] = current_data;
}
sample_counts = get_sample_counts();
biom::biom() : has_hdf5_backing(false) {
n_obs = 0;
malloc_resident(0);
}

biom::~biom() {
for(unsigned int i = 0; i < n_obs; i++) {
free(obs_indices_resident[i]);
free(obs_data_resident[i]);
// not using const on indices/indptr/data as the pointers are being borrowed
biom::biom(char** obs_ids_in,
char** samp_ids_in,
uint32_t* indices,
uint32_t* indptr,
double* data,
const int n_obs,
const int n_samples,
const int nnz) : has_hdf5_backing(false) {

this->nnz = nnz;
this->n_samples = n_samples;
this->n_obs = n_obs;

sample_ids = std::vector<std::string>();
sample_ids.resize(n_samples);
obs_ids = std::vector<std::string>();
obs_ids.resize(n_obs);

#pragma omp parallel for schedule(static)
for(int x = 0; x < 2; x++) {
if(x == 0) {
for(int i = 0; i < n_obs; i++) {
obs_ids[i] = std::string(obs_ids_in[i]);
}
} else {
for(int i = 0; i < n_samples; i++) {
sample_ids[i] = std::string(samp_ids_in[i]);
}
}
}
free(obs_indices_resident);
free(obs_data_resident);
free(obs_counts_resident);

/* define a mapping between an ID and its corresponding offset */
obs_id_index = std::unordered_map<std::string, uint32_t>();
sample_id_index = std::unordered_map<std::string, uint32_t>();

#pragma omp parallel for schedule(static)
for(int i = 0; i < 3; i++) {
if(i == 0)
create_id_index(obs_ids, obs_id_index);
else if(i == 1)
create_id_index(sample_ids, sample_id_index);
else if(i == 2)
malloc_resident(n_obs);
}

#pragma omp parallel for schedule(static)
for(unsigned int i = 0; i < n_obs; i++) {
int32_t start = indptr[i];
int32_t end = indptr[i + 1];
unsigned int count = end - start;

uint32_t* index_ptr = (indices + start);
double* data_ptr = (data + start);

obs_indices_resident[i] = index_ptr;
obs_data_resident[i] = data_ptr;
obs_counts_resident[i] = count;
}
sample_counts = get_sample_counts();
}

void biom::set_nnz() {
if(!has_hdf5_backing) {
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

// should these be cached?
DataType dtype = obs_data.getDataType();
DataSpace dataspace = obs_data.getSpace();
Expand All @@ -111,6 +205,12 @@ void biom::set_nnz() {
}

void biom::load_ids(const char *path, std::vector<std::string> &ids) {
if(!has_hdf5_backing) {
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

DataSet ds_ids = file.openDataSet(path);
DataType dtype = ds_ids.getDataType();
DataSpace dataspace = ds_ids.getSpace();
Expand Down Expand Up @@ -138,6 +238,12 @@ void biom::load_ids(const char *path, std::vector<std::string> &ids) {
}

void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
if(!has_hdf5_backing) {
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

DataSet ds = file.openDataSet(path);
DataType dtype = ds.getDataType();
DataSpace dataspace = ds.getSpace();
Expand All @@ -159,7 +265,7 @@ void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
free(dataout);
}

void biom::create_id_index(std::vector<std::string> &ids,
void biom::create_id_index(const std::vector<std::string> &ids,
std::unordered_map<std::string, uint32_t> &map) {
uint32_t count = 0;
map.reserve(ids.size());
Expand All @@ -169,6 +275,12 @@ void biom::create_id_index(std::vector<std::string> &ids,
}

unsigned int biom::get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
if(!has_hdf5_backing) {
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

uint32_t idx = obs_id_index.at(id);
uint32_t start = obs_indptr[idx];
uint32_t end = obs_indptr[idx + 1];
Expand Down Expand Up @@ -270,6 +382,12 @@ void biom::get_obs_data_range(const std::string &id, unsigned int start, unsigne
}

unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
if(!has_hdf5_backing) {
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

uint32_t idx = sample_id_index.at(id);
uint32_t start = sample_indptr[idx];
uint32_t end = sample_indptr[idx + 1];
Expand Down Expand Up @@ -310,6 +428,7 @@ unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& cur

double* biom::get_sample_counts() {
double *sample_counts = (double*)calloc(sizeof(double), n_samples);

for(unsigned int i = 0; i < n_obs; i++) {
unsigned int count = obs_counts_resident[i];
uint32_t *indices = obs_indices_resident[i];
Expand Down
Loading

0 comments on commit 864cc46

Please sign in to comment.