Skip to content

Commit

Permalink
Return reference sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
pkerpedjiev committed Sep 17, 2023
1 parent 43b2340 commit 2880b06
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 56 deletions.
66 changes: 34 additions & 32 deletions oxbow/src/bam.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ use arrow::array::{
ArrayRef, GenericStringBuilder, Int32Array, Int32Builder, StringArray, StringDictionaryBuilder,
UInt16Array, UInt16Builder, UInt8Array, UInt8Builder,
};
use arrow::ipc::writer::FileWriter;
use arrow::{datatypes::Int32Type, error::ArrowError, record_batch::RecordBatch};
use noodles::core::Region;
use noodles::vcf::record::info::field::key::SV_LENGTHS;
use noodles::{bam, bgzf, csi, sam};
use arrow::ipc::writer::FileWriter;

use crate::batch_builder::{write_ipc_err, BatchBuilder};

Expand Down Expand Up @@ -127,37 +127,38 @@ impl<R: Read + Seek> BamReader<R> {
.map(|i| i.map_err(|e| ArrowError::ExternalError(e.into())));
write_ipc_err(records, batch_builder)
}
}

/// Returns the reference sequences in the BAM in Apache Arrow IPC.
///
/// # Examples
///
/// ```no_run
/// use oxbow::bam::BamReader;
///
/// let mut reader = BamReader::new_from_path("sample.bam").unwrap();
/// let ipc = reader.references_to_ipc().unwrap();
/// ```
pub fn references_to_ipc(
&mut self,
) -> Result<Vec<u8>, ArrowError> {
let mut names = GenericStringBuilder::<i32>::new();
let mut lengths = Int32Array::builder(1024);

for (name, sequence) in self.header.reference_sequences() {
names.append_value(name.as_str());
lengths.append_value(sequence.length().get() as i32);
}
/// Reads the BAM header from `read` and returns its reference sequences
/// (one `name` and one `length` column) in Apache Arrow IPC.
///
/// # Examples
///
/// ```no_run
/// use oxbow::bam::references_to_ipc;
///
/// let file = std::fs::File::open("sample.bam").unwrap();
/// let ipc = references_to_ipc(file).unwrap();
/// ```
pub fn references_to_ipc<R: Read + Seek>(read: R) -> Result<Vec<u8>, ArrowError> {
let mut reader = bam::Reader::new(read);
let header = reader.read_header()?;

let mut names = GenericStringBuilder::<i32>::new();
let mut lengths = Int32Array::builder(1024);

for (name, sequence) in header.reference_sequences() {
names.append_value(name.as_str());
lengths.append_value(sequence.length().get() as i32);
}

let batch = RecordBatch::try_from_iter(vec![
("name", Arc::new(names.finish()) as ArrayRef),
("length", Arc::new(lengths.finish()) as ArrayRef),
])?;
let batch = RecordBatch::try_from_iter(vec![
("name", Arc::new(names.finish()) as ArrayRef),
("length", Arc::new(lengths.finish()) as ArrayRef),
])?;

let mut writer = FileWriter::try_new(Vec::new(), &batch.schema())?;
writer.write(&batch)?;
writer.finish()?;
writer.into_inner()
}
let mut writer = FileWriter::try_new(Vec::new(), &batch.schema())?;
writer.write(&batch)?;
writer.finish()?;
writer.into_inner()
}

struct BamBatchBuilder<'a> {
Expand Down Expand Up @@ -361,9 +362,10 @@ mod tests {
fn test_references() {
let mut dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
dir.push("../fixtures/sample.bam");
let mut reader = BamReader::new_from_path(dir.to_str().unwrap()).unwrap();
let read = std::fs::File::open(dir.to_str().unwrap()).unwrap();

let ipc = reader.references_to_ipc().unwrap();
dbg!(&ipc);
let ipc = references_to_ipc(read).unwrap();
let record_batch = record_batch_from_ipc(ipc);
assert_eq!(record_batch.num_rows(), 24);
}
}
4 changes: 2 additions & 2 deletions py-oxbow/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 10 additions & 22 deletions py-oxbow/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use pyo3::types::PyString;

use oxbow::bam;
use oxbow::bam::BamReader;
use oxbow::bam::references_to_ipc;
use oxbow::bcf;
use oxbow::bigbed::BigBedReader;
use oxbow::bigwig::BigWigReader;
Expand Down Expand Up @@ -105,29 +106,16 @@ fn read_bam_vpos(
#[pyfunction]
fn read_bam_references(
py: Python,
path_or_file_like: PyObject,
index: Option<PyObject>,
file_like: PyObject
) -> PyObject {
if let Ok(string_ref) = path_or_file_like.downcast::<PyString>(py) {
// If it's a string, treat it as a path
let mut reader = BamReader::new_from_path(string_ref.to_str().unwrap()).unwrap();
let ipc = reader.references_to_ipc().unwrap();
Python::with_gil(|py| PyBytes::new(py, &ipc).into())
} else {
// Otherwise, treat it as file-like
let file_like = match PyFileLikeObject::new(path_or_file_like, true, false, true) {
Ok(file_like) => file_like,
Err(_) => panic!("Unknown argument for `path_url_or_file_like`. Not a file path string or url, and not a file-like object."),
};
let index_file_like = match PyFileLikeObject::new(index.unwrap(), true, false, true) {
Ok(file_like) => file_like,
Err(_) => panic!("Unknown argument for `index`. Not a file path string or url, and not a file-like object."),
};
let index = bam::index_from_reader(index_file_like).unwrap();
let mut reader = BamReader::new(file_like, index).unwrap();
let ipc = reader.references_to_ipc().unwrap();
Python::with_gil(|py| PyBytes::new(py, &ipc).into())
}
// Treat the argument as a Python file-like object.
let _file_like = match PyFileLikeObject::new(file_like, true, false, true) {
Ok(_file_like) => _file_like,
Err(_) => panic!("Unknown argument for `path_url_or_file_like`. Not a file path string or url, and not a file-like object."),
};

let ipc = references_to_ipc(_file_like).unwrap();
Python::with_gil(|py| PyBytes::new(py, &ipc).into())
}

#[pyfunction]
Expand Down

0 comments on commit 2880b06

Please sign in to comment.