From 88cb4195f90ce35504f934f1866eb421d15d2e94 Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 28 Apr 2023 04:08:52 +0000 Subject: [PATCH 01/16] construct the output sequences in parallel --- pgr-bin/src/bin/pgr-query.rs | 64 +++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/pgr-bin/src/bin/pgr-query.rs b/pgr-bin/src/bin/pgr-query.rs index eca0b31..330f036 100644 --- a/pgr-bin/src/bin/pgr-query.rs +++ b/pgr-bin/src/bin/pgr-query.rs @@ -2,6 +2,7 @@ const VERSION_STRING: &str = env!("VERSION_STRING"); use clap::{self, CommandFactory, Parser}; use pgr_bin::{get_fastx_reader, GZFastaReader, SeqIndexDB}; use pgr_db::fasta_io::SeqRec; +use rayon::prelude::*; use rustc_hash::FxHashMap; use std::fs::File; use std::io::{self, BufWriter, Write}; @@ -109,17 +110,16 @@ fn main() -> Result<(), std::io::Error> { let _ = seq_index_db.load_from_fastx(args.pgr_db_prefix, args.w, args.k, args.r, args.min_span); } else { - - #[cfg(feature = "with_agc")] { - let stderr = io::stderr(); - let mut handle = stderr.lock(); - let _ = handle.write_all(b"Read the input as a AGC backed index database files.\n"); - let _ = seq_index_db.load_from_agc_index(args.pgr_db_prefix); + #[cfg(feature = "with_agc")] + { + let stderr = io::stderr(); + let mut handle = stderr.lock(); + let _ = handle.write_all(b"Read the input as a AGC backed index database files.\n"); + let _ = seq_index_db.load_from_agc_index(args.pgr_db_prefix); } #[cfg(not(feature = "with_agc"))] panic!("This command is compiled with only frg file support, please specify `--frg-file"); - } let prefix = Path::new(&args.output_prefix); @@ -318,12 +318,15 @@ fn main() -> Result<(), std::io::Error> { .collect::>(); let mut fasta_out = None; - let mut fasta_buf: BufWriter; + let fasta_buf: BufWriter; if !args.only_summary { let ext = format!("{:03}.fa", idx); fasta_buf = BufWriter::new(File::create(prefix.with_extension(ext)).unwrap()); - fasta_out = Some(&mut fasta_buf); + fasta_out = Some(fasta_buf); }; + + let mut sub_seq_range_for_fasta = Vec::<(u32, u32, u32, u32, String)>::new(); + aln_range.into_iter().for_each(|(sid, rgns)| { let (ctg, src, _ctg_len) = seq_index_db.seq_info.as_ref().unwrap().get(&sid).unwrap(); @@ -376,22 +379,39 @@ fn main() -> Result<(), std::io::Error> { target_seq_name ); } + sub_seq_range_for_fasta.push(( + sid, + b, + e, + orientation, + target_seq_name.clone(), + )); //println!("DBG: {}", seq_id); - if let Some(fasta_out) = fasta_out.as_mut() { - let target_seq = seq_index_db - .get_sub_seq_by_id(sid, b as usize, e as usize) - .unwrap(); - let target_seq = if orientation == 1 { - pgr_db::fasta_io::reverse_complement(&target_seq) - } else { - target_seq - }; - let _ = writeln!(fasta_out, ">{}", target_seq_name); - let _ = - writeln!(fasta_out, "{}", String::from_utf8_lossy(&target_seq)); - }; }); }); + if let Some(fasta_out) = fasta_out.as_mut() { + sub_seq_range_for_fasta + .par_iter() + .map(|(sid, b, e, orientation, target_seq_name)| { + let target_seq = seq_index_db + .get_sub_seq_by_id(*sid, *b as usize, *e as usize) + .unwrap(); + let target_seq = if *orientation == 1 { + pgr_db::fasta_io::reverse_complement(&target_seq) + } else { + target_seq + }; + (target_seq_name.into(), target_seq) + }) + .collect::)>>() + .into_iter() + .for_each(|(target_seq_name, target_seq)| { + writeln!(fasta_out, ">{}", target_seq_name) + .expect("can't write the query output fasta file\n"); + writeln!(fasta_out, "{}", String::from_utf8_lossy(&target_seq)) + .expect("can't write the query output 
fasta file\n"); + }); + }; }; }); Ok(()) From 4945f6a03fba458b70769cc70e1675f63553effc Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 28 Apr 2023 04:38:02 +0000 Subject: [PATCH 02/16] query in parallel --- pgr-bin/src/bin/pgr-query.rs | 106 +++++++++++++++++------------------ 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/pgr-bin/src/bin/pgr-query.rs b/pgr-bin/src/bin/pgr-query.rs index 330f036..58d450f 100644 --- a/pgr-bin/src/bin/pgr-query.rs +++ b/pgr-bin/src/bin/pgr-query.rs @@ -123,54 +123,9 @@ fn main() -> Result<(), std::io::Error> { } let prefix = Path::new(&args.output_prefix); - let mut hit_file = if args.bed_summary { - BufWriter::new(File::create(prefix.with_extension("hit.bed")).unwrap()) - } else { - BufWriter::new(File::create(prefix.with_extension("hit")).unwrap()) - }; - if args.bed_summary { - writeln!( - hit_file, - "#{}", - [ - "target", - "bgn", - "end", - "query", - "color", - "orientation", - "q_len", - "aln_anchor_count", - "q_idx", - "src", - "ctg_bgn", - "ctg_end", - ] - .join("\t") - )?; - } else { - writeln!( - hit_file, - "#{}", - [ - "out_seq_name", - "ctg_bgn", - "ctg_end", - "color", - "q_name", - "orientation", - "idx", - "q_idx", - "query_bgn", - "query_end", - "q_len", - "aln_anchor_count", - ] - .join("\t") - )?; - }; + query_seqs - .into_iter() + .into_par_iter() .enumerate() .for_each(|(idx, seq_rec)| { let q_name = String::from_utf8_lossy(&seq_rec.id); @@ -326,7 +281,52 @@ fn main() -> Result<(), std::io::Error> { }; let mut sub_seq_range_for_fasta = Vec::<(u32, u32, u32, u32, String)>::new(); - + let mut hit_file = if args.bed_summary { + BufWriter::new(File::create(prefix.with_extension(format!("{:03}.hit.bed",idx ))).unwrap()) + } else { + BufWriter::new(File::create(prefix.with_extension(format!("{:03}.hit",idx ))).unwrap()) + }; + if args.bed_summary { + writeln!( + hit_file, + "#{}", + [ + "target", + "bgn", + "end", + "query", + "color", + "orientation", + "q_len", + "aln_anchor_count", + "q_idx", + "src", + "ctg_bgn", + "ctg_end", + ] + .join("\t") + ).expect("writing bed summary fail\n"); + } else { + writeln!( + hit_file, + "#{}", + [ + "out_seq_name", + "ctg_bgn", + "ctg_end", + "color", + "q_name", + "orientation", + "idx", + "q_idx", + "query_bgn", + "query_end", + "q_len", + "aln_anchor_count", + ] + .join("\t") + ).expect("writing bed summary fail\n"); + }; aln_range.into_iter().for_each(|(sid, rgns)| { let (ctg, src, _ctg_len) = seq_index_db.seq_info.as_ref().unwrap().get(&sid).unwrap(); @@ -343,9 +343,9 @@ fn main() -> Result<(), std::io::Error> { } else { format!("{}::{}_{}_{}_{}", base, ctg, b, e, orientation) }; - + if args.bed_summary { - let _ = writeln!( + writeln!( hit_file, "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", target_seq_name, @@ -360,9 +360,9 @@ fn main() -> Result<(), std::io::Error> { src, q_bgn, q_end, - ); + ).expect("writing hit summary fail\n"); } else { - let _ = writeln!( + writeln!( hit_file, "{:03}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", idx, @@ -377,7 +377,7 @@ fn main() -> Result<(), std::io::Error> { e, orientation, target_seq_name - ); + ).expect("writing hit summary fail\n"); } sub_seq_range_for_fasta.push(( sid, From a7707e9de404053a9d6fc9e7cc0659ce9d18b67b Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 28 Apr 2023 15:47:02 +0000 Subject: [PATCH 03/16] allow configuring the number of CPU used in parallel just in case --- pgr-bin/src/bin/pgr-query.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pgr-bin/src/bin/pgr-query.rs b/pgr-bin/src/bin/pgr-query.rs 
index 58d450f..97f510c 100644 --- a/pgr-bin/src/bin/pgr-query.rs +++ b/pgr-bin/src/bin/pgr-query.rs @@ -72,12 +72,18 @@ struct CmdOptions { /// output summaries in the bed format #[clap(long, default_value_t = false)] bed_summary: bool, + + /// number of threads used in parallel (more memory usage), default to "0" using all CPUs available or the number set by RAYON_NUM_THREADS + #[clap(long, default_value_t = 0)] + number_of_thread: usize, } fn main() -> Result<(), std::io::Error> { CmdOptions::command().version(VERSION_STRING).get_matches(); let args = CmdOptions::parse(); + rayon::ThreadPoolBuilder::new().num_threads(args.number_of_thread).build_global().unwrap(); + let mut query_seqs: Vec = vec![]; let mut add_seqs = |seq_iter: &mut dyn Iterator>| { seq_iter.into_iter().for_each(|r| { From 84e75d60c8a954e910dae2c533557a8b77ed2733 Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Tue, 2 May 2023 04:43:09 +0000 Subject: [PATCH 04/16] remove the frag_map usage which is obsoleted --- pgr-bin/src/bin/pgr-compare-cov2.rs | 32 +++++++++++++++++------ pgr-bin/src/lib.rs | 8 +++--- pgr-db/src/seq_db.rs | 39 +++++++++++++++++++++-------- 3 files changed, 56 insertions(+), 23 deletions(-) diff --git a/pgr-bin/src/bin/pgr-compare-cov2.rs b/pgr-bin/src/bin/pgr-compare-cov2.rs index 8afdb64..11d3833 100644 --- a/pgr-bin/src/bin/pgr-compare-cov2.rs +++ b/pgr-bin/src/bin/pgr-compare-cov2.rs @@ -4,6 +4,7 @@ const VERSION_STRING: &str = env!("VERSION_STRING"); use clap::{self, CommandFactory, Parser}; use pgr_bin::{pair_shmmrs, sequence_to_shmmrs, SeqIndexDB}; +use pgr_db::seq_db::{ShmmrPair, get_shmmr_matches_from_mmap_file}; use rayon::prelude::*; use rustc_hash::FxHashSet; use std::{ @@ -179,7 +180,24 @@ fn generate_bed_graph_from_sdb(args: &CmdOptions, input_type: &str) { } }); - let frag_map = seq_index_db.get_shmmr_map_internal().unwrap(); + //let frag_map = seq_index_db.get_shmmr_map_internal().unwrap(); + + let get_shmmr_matches = |smps:ShmmrPair| { + #[cfg(feature = "with_agc")] + if input_type == "AGC" { + get_shmmr_matches_from_mmap_file(&seq_index_db.agc_db.as_ref().unwrap().frag_location_map, + smps, &seq_index_db.agc_db.as_ref().unwrap().frag_map_file) + } else { + get_shmmr_matches_from_mmap_file(&seq_index_db.frg_db.as_ref().unwrap().frag_location_map, + smps, &seq_index_db.frg_db.as_ref().unwrap().frag_map_file) + } + #[cfg(not(feature = "with_agc"))] + { + get_shmmr_matches_from_mmap_file(&seq_index_db.frg_db.as_ref().unwrap().frag_location_map, + smps, &seq_index_db.frg_db.as_ref().unwrap().frag_map_file) + } + + }; let mut output_bedgraph_file0 = BufWriter::new( File::create(Path::new(&prefix).with_extension("0.bedgraph")) .expect("can't create the output file"), @@ -217,7 +235,8 @@ fn generate_bed_graph_from_sdb(args: &CmdOptions, input_type: &str) { } else { (s1, s0, p0, p1, 1_u8) }; - let (c0, c1) = if let Some(hits) = frag_map.get(&(k.0, k.1)) { + let (c0, c1) = { + let hits = get_shmmr_matches((k.0, k.1)); let mut c0 = 0_usize; let mut c1 = 0_usize; hits.iter().for_each(|v| { @@ -230,8 +249,6 @@ fn generate_bed_graph_from_sdb(args: &CmdOptions, input_type: &str) { } }); (c0, c1) - } else { - (0, 0) }; assert!(c0 > 0); let r = c1 as f32 / c0 as f32; @@ -273,7 +290,8 @@ fn generate_bed_graph_from_sdb(args: &CmdOptions, input_type: &str) { } else { (s1, s0, p0, p1, 1_u8) }; - let (c0, c1) = if let Some(hits) = frag_map.get(&(k.0, k.1)) { + let (c0, c1) = { + let hits = get_shmmr_matches((k.0, k.1)); let mut c0 = 0_usize; let mut c1 = 0_usize; hits.iter().for_each(|v| { @@ -286,9 
+304,7 @@ fn generate_bed_graph_from_sdb(args: &CmdOptions, input_type: &str) { } }); (c0, c1) - } else { - (0, 0) - }; + }; assert!(c1 > 0); let r = c0 as f32 / c1 as f32; (k.2, k.3, r, c1, c0) diff --git a/pgr-bin/src/lib.rs b/pgr-bin/src/lib.rs index 4abb7c1..fa47a01 100644 --- a/pgr-bin/src/lib.rs +++ b/pgr-bin/src/lib.rs @@ -8,10 +8,10 @@ use pgr_db::graph_utils::{AdjList, ShmmrGraphNode}; pub use pgr_db::seq_db::pair_shmmrs; use pgr_db::seq_db::{self, raw_query_fragment, raw_query_fragment_from_mmap_midx, GetSeq}; pub use pgr_db::shmmrutils::{sequence_to_shmmrs, ShmmrSpec}; -use pgr_db::{aln, frag_file_io}; +use pgr_db::{aln, frag_file_io::CompactSeqFragFileStorage}; #[cfg(feature = "with_agc")] -use pgr_db::agc_io; +use pgr_db::agc_io::{self, AGCSeqDB}; use rayon::prelude::*; use rustc_hash::{FxHashMap, FxHashSet}; @@ -51,8 +51,8 @@ pub struct SeqIndexDB { pub seq_db: Option, #[cfg(feature = "with_agc")] /// Rust internal: store the agc file and the index - pub agc_db: Option, - pub frg_db: Option, + pub agc_db: Option, + pub frg_db: Option, /// a dictionary maps (ctg_name, source) -> (id, len) #[allow(clippy::type_complexity)] pub seq_index: Option), (u32, u32)>>, diff --git a/pgr-db/src/seq_db.rs b/pgr-db/src/seq_db.rs index 31f66e7..1dd7004 100644 --- a/pgr-db/src/seq_db.rs +++ b/pgr-db/src/seq_db.rs @@ -815,11 +815,15 @@ impl CompactSeqDB { let mut sdx_file = BufWriter::new( File::create(file_prefix.clone() + ".sdx").expect("sdx file creating fail\n"), ); - sdx_file.write_all("SDX:0.5".as_bytes()).expect("sdx file writing error"); + sdx_file + .write_all("SDX:0.5".as_bytes()) + .expect("sdx file writing error"); let mut frg_file = BufWriter::new(File::create(file_prefix + ".frg").expect("frg file creating fail\n")); - frg_file.write_all("FRG:0.5".as_bytes()).expect("frg file writing error"); + frg_file + .write_all("FRG:0.5".as_bytes()) + .expect("frg file writing error"); let config = config::standard(); let chunk_size = chunk_size.unwrap_or(256_usize); @@ -857,9 +861,13 @@ impl CompactSeqDB { offset += l; frg_file.write_all(v).expect("frag file writing error\n"); }); - - bincode::encode_into_std_write((chunk_size, frag_addr_offset, &self.seqs), &mut sdx_file, config) - .expect("sdx file writing error\n"); + + bincode::encode_into_std_write( + (chunk_size, frag_addr_offset, &self.seqs), + &mut sdx_file, + config, + ) + .expect("sdx file writing error\n"); //bincode::encode_into_std_write(compressed_frags, &mut frg_file, config) // .expect(" frag file writing error"); } @@ -1243,17 +1251,26 @@ pub fn raw_query_fragment_from_mmap_midx( } }) .map(|(s0, s1, p0, p1, orientation)| { - if let Some(&(start, vec_len)) = frag_map_location.get(&(s0, s1)) { - let m = get_fragment_signatures_from_mmap_file(&frag_map_mmap_file, start, vec_len); - ((s0, s1), (p0, p1, orientation), m) - } else { - ((s0, s1), (p0, p1, orientation), vec![]) - } + let m = + get_shmmr_matches_from_mmap_file(frag_map_location, (s0, s1), frag_map_mmap_file); + ((s0, s1), (p0, p1, orientation), m) }) .collect::>(); query_results } +pub fn get_shmmr_matches_from_mmap_file( + frag_map_location: &ShmmrToIndexFileLocation, + (s0, s1): ShmmrPair, + frag_map_mmap_file: &Mmap, +) -> Vec<(u32, u32, u32, u32, u8)> { + if let Some(&(start, vec_len)) = frag_map_location.get(&(s0, s1)) { + get_fragment_signatures_from_mmap_file(&frag_map_mmap_file, start, vec_len) + } else { + vec![] + } +} + pub fn get_match_positions_with_fragment( shmmr_map: &ShmmrToFrags, frag: &Vec, From b721df9c2b44f08d413379bf5b136088909d15b4 Mon Sep 17 
00:00:00 2001 From: Jason Chin Date: Thu, 4 May 2023 23:17:07 +0000 Subject: [PATCH 05/16] move the library in pgr-bin to pgr-db as the module "ext" (extension for bin and python) --- pgr-bin/src/bin/pgr-compare-cov.rs | 6 ++--- pgr-bin/src/bin/pgr-compare-cov2.rs | 36 +++++++++++++++---------- pgr-bin/src/bin/pgr-fetch-seqs.rs | 4 +-- pgr-bin/src/bin/pgr-make-frgdb.rs | 2 +- pgr-bin/src/bin/pgr-pbundle-decomp.rs | 4 +-- pgr-bin/src/bin/pgr-query.rs | 30 ++++++++++++++------- pgr-bin/src/lib.rs => pgr-db/src/ext.rs | 23 +++++++++------- pgr-db/src/lib.rs | 15 +++++++---- 8 files changed, 73 insertions(+), 47 deletions(-) rename pgr-bin/src/lib.rs => pgr-db/src/ext.rs (98%) diff --git a/pgr-bin/src/bin/pgr-compare-cov.rs b/pgr-bin/src/bin/pgr-compare-cov.rs index 14391a9..5c660de 100644 --- a/pgr-bin/src/bin/pgr-compare-cov.rs +++ b/pgr-bin/src/bin/pgr-compare-cov.rs @@ -3,14 +3,14 @@ const VERSION_STRING: &str = env!("VERSION_STRING"); //use std::path::PathBuf; use clap::{self, CommandFactory, Parser}; -use pgr_bin::{pair_shmmrs, sequence_to_shmmrs, SeqIndexDB, ShmmrSpec}; +use pgr_db::ext::{pair_shmmrs, sequence_to_shmmrs, SeqIndexDB, ShmmrSpec}; +use rayon::prelude::*; use rustc_hash::FxHashSet; use std::{ fs::File, io::{BufRead, BufReader, BufWriter, Write}, path::Path, }; -use rayon::prelude::*; /// Compare SHIMMER pair count in two input sequence files #[derive(Parser, Debug)] @@ -320,7 +320,7 @@ fn generate_bed_graph_from_sdb(args: &CmdOptions, input_type: &str) { } else { panic!("input type has to be specified AGC or FRG backends") }; - + #[cfg(not(feature = "with_agc"))] if input_type == "FRG" { let _ = seq_index_db.load_from_frg_index(args.frg_idx_prefix.as_ref().unwrap().clone()); diff --git a/pgr-bin/src/bin/pgr-compare-cov2.rs b/pgr-bin/src/bin/pgr-compare-cov2.rs index 11d3833..58c5549 100644 --- a/pgr-bin/src/bin/pgr-compare-cov2.rs +++ b/pgr-bin/src/bin/pgr-compare-cov2.rs @@ -3,8 +3,8 @@ const VERSION_STRING: &str = env!("VERSION_STRING"); //use std::path::PathBuf; use clap::{self, CommandFactory, Parser}; -use pgr_bin::{pair_shmmrs, sequence_to_shmmrs, SeqIndexDB}; -use pgr_db::seq_db::{ShmmrPair, get_shmmr_matches_from_mmap_file}; +use pgr_db::ext::{pair_shmmrs, sequence_to_shmmrs, SeqIndexDB}; +use pgr_db::seq_db::{get_shmmr_matches_from_mmap_file, ShmmrPair}; use rayon::prelude::*; use rustc_hash::FxHashSet; use std::{ @@ -182,21 +182,29 @@ fn generate_bed_graph_from_sdb(args: &CmdOptions, input_type: &str) { //let frag_map = seq_index_db.get_shmmr_map_internal().unwrap(); - let get_shmmr_matches = |smps:ShmmrPair| { - #[cfg(feature = "with_agc")] + let get_shmmr_matches = |smps: ShmmrPair| { + #[cfg(feature = "with_agc")] if input_type == "AGC" { - get_shmmr_matches_from_mmap_file(&seq_index_db.agc_db.as_ref().unwrap().frag_location_map, - smps, &seq_index_db.agc_db.as_ref().unwrap().frag_map_file) + get_shmmr_matches_from_mmap_file( + &seq_index_db.agc_db.as_ref().unwrap().frag_location_map, + smps, + &seq_index_db.agc_db.as_ref().unwrap().frag_map_file, + ) } else { - get_shmmr_matches_from_mmap_file(&seq_index_db.frg_db.as_ref().unwrap().frag_location_map, - smps, &seq_index_db.frg_db.as_ref().unwrap().frag_map_file) + get_shmmr_matches_from_mmap_file( + &seq_index_db.frg_db.as_ref().unwrap().frag_location_map, + smps, + &seq_index_db.frg_db.as_ref().unwrap().frag_map_file, + ) } #[cfg(not(feature = "with_agc"))] { - get_shmmr_matches_from_mmap_file(&seq_index_db.frg_db.as_ref().unwrap().frag_location_map, - smps, 
&seq_index_db.frg_db.as_ref().unwrap().frag_map_file) + get_shmmr_matches_from_mmap_file( + &seq_index_db.frg_db.as_ref().unwrap().frag_location_map, + smps, + &seq_index_db.frg_db.as_ref().unwrap().frag_map_file, + ) } - }; let mut output_bedgraph_file0 = BufWriter::new( File::create(Path::new(&prefix).with_extension("0.bedgraph")) @@ -236,7 +244,7 @@ fn generate_bed_graph_from_sdb(args: &CmdOptions, input_type: &str) { (s1, s0, p0, p1, 1_u8) }; let (c0, c1) = { - let hits = get_shmmr_matches((k.0, k.1)); + let hits = get_shmmr_matches((k.0, k.1)); let mut c0 = 0_usize; let mut c1 = 0_usize; hits.iter().for_each(|v| { @@ -291,7 +299,7 @@ fn generate_bed_graph_from_sdb(args: &CmdOptions, input_type: &str) { (s1, s0, p0, p1, 1_u8) }; let (c0, c1) = { - let hits = get_shmmr_matches((k.0, k.1)); + let hits = get_shmmr_matches((k.0, k.1)); let mut c0 = 0_usize; let mut c1 = 0_usize; hits.iter().for_each(|v| { @@ -304,7 +312,7 @@ fn generate_bed_graph_from_sdb(args: &CmdOptions, input_type: &str) { } }); (c0, c1) - }; + }; assert!(c1 > 0); let r = c0 as f32 / c1 as f32; (k.2, k.3, r, c1, c0) diff --git a/pgr-bin/src/bin/pgr-fetch-seqs.rs b/pgr-bin/src/bin/pgr-fetch-seqs.rs index 07ab792..72299d2 100644 --- a/pgr-bin/src/bin/pgr-fetch-seqs.rs +++ b/pgr-bin/src/bin/pgr-fetch-seqs.rs @@ -1,6 +1,6 @@ const VERSION_STRING: &str = env!("VERSION_STRING"); use clap::{self, CommandFactory, Parser}; -use pgr_bin::SeqIndexDB; +use pgr_db::ext::SeqIndexDB; use pgr_db::fasta_io; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Write}; @@ -88,7 +88,7 @@ fn main() -> Result<(), std::io::Error> { } else { Box::new(io::stdout()) }; - + region_file.lines().into_iter().for_each(|line| { let line = line.expect("fail to get a line in the region file"); let fields = line.split('\t').collect::>(); diff --git a/pgr-bin/src/bin/pgr-make-frgdb.rs b/pgr-bin/src/bin/pgr-make-frgdb.rs index 028ad4c..11fcdb9 100644 --- a/pgr-bin/src/bin/pgr-make-frgdb.rs +++ b/pgr-bin/src/bin/pgr-make-frgdb.rs @@ -3,7 +3,7 @@ const VERSION_STRING: &str = env!("VERSION_STRING"); //use std::path::PathBuf; use clap::{self, CommandFactory, Parser}; -use pgr_bin::SeqIndexDB; +use pgr_db::ext::SeqIndexDB; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::Path; diff --git a/pgr-bin/src/bin/pgr-pbundle-decomp.rs b/pgr-bin/src/bin/pgr-pbundle-decomp.rs index 587bafc..99e1892 100644 --- a/pgr-bin/src/bin/pgr-pbundle-decomp.rs +++ b/pgr-bin/src/bin/pgr-pbundle-decomp.rs @@ -1,7 +1,7 @@ const VERSION_STRING: &str = env!("VERSION_STRING"); use bincode::config; use clap::{self, CommandFactory, Parser}; -use pgr_bin::{ +use pgr_db::ext::{ get_principal_bundle_decomposition, PrincipalBundlesWithId, SeqIndexDB, VertexToBundleIdMap, }; use rustc_hash::{FxHashMap, FxHashSet}; @@ -200,7 +200,7 @@ fn main() -> Result<(), std::io::Error> { usize, usize, PrincipalBundlesWithId, - pgr_bin::VertexToBundleIdMap, + VertexToBundleIdMap, ), config::Configuration, >(&s[..], config) diff --git a/pgr-bin/src/bin/pgr-query.rs b/pgr-bin/src/bin/pgr-query.rs index 97f510c..b920960 100644 --- a/pgr-bin/src/bin/pgr-query.rs +++ b/pgr-bin/src/bin/pgr-query.rs @@ -1,6 +1,6 @@ const VERSION_STRING: &str = env!("VERSION_STRING"); use clap::{self, CommandFactory, Parser}; -use pgr_bin::{get_fastx_reader, GZFastaReader, SeqIndexDB}; +use pgr_db::ext::{get_fastx_reader, GZFastaReader, SeqIndexDB}; use pgr_db::fasta_io::SeqRec; use rayon::prelude::*; use rustc_hash::FxHashMap; @@ -82,7 +82,10 @@ fn main() -> Result<(), std::io::Error> { 
CmdOptions::command().version(VERSION_STRING).get_matches(); let args = CmdOptions::parse(); - rayon::ThreadPoolBuilder::new().num_threads(args.number_of_thread).build_global().unwrap(); + rayon::ThreadPoolBuilder::new() + .num_threads(args.number_of_thread) + .build_global() + .unwrap(); let mut query_seqs: Vec = vec![]; let mut add_seqs = |seq_iter: &mut dyn Iterator>| { @@ -129,7 +132,6 @@ fn main() -> Result<(), std::io::Error> { } let prefix = Path::new(&args.output_prefix); - query_seqs .into_par_iter() .enumerate() @@ -288,9 +290,13 @@ fn main() -> Result<(), std::io::Error> { let mut sub_seq_range_for_fasta = Vec::<(u32, u32, u32, u32, String)>::new(); let mut hit_file = if args.bed_summary { - BufWriter::new(File::create(prefix.with_extension(format!("{:03}.hit.bed",idx ))).unwrap()) + BufWriter::new( + File::create(prefix.with_extension(format!("{:03}.hit.bed", idx))).unwrap(), + ) } else { - BufWriter::new(File::create(prefix.with_extension(format!("{:03}.hit",idx ))).unwrap()) + BufWriter::new( + File::create(prefix.with_extension(format!("{:03}.hit", idx))).unwrap(), + ) }; if args.bed_summary { writeln!( @@ -311,7 +317,8 @@ fn main() -> Result<(), std::io::Error> { "ctg_end", ] .join("\t") - ).expect("writing bed summary fail\n"); + ) + .expect("writing bed summary fail\n"); } else { writeln!( hit_file, @@ -331,7 +338,8 @@ fn main() -> Result<(), std::io::Error> { "aln_anchor_count", ] .join("\t") - ).expect("writing bed summary fail\n"); + ) + .expect("writing bed summary fail\n"); }; aln_range.into_iter().for_each(|(sid, rgns)| { let (ctg, src, _ctg_len) = @@ -349,7 +357,7 @@ fn main() -> Result<(), std::io::Error> { } else { format!("{}::{}_{}_{}_{}", base, ctg, b, e, orientation) }; - + if args.bed_summary { writeln!( hit_file, @@ -366,7 +374,8 @@ fn main() -> Result<(), std::io::Error> { src, q_bgn, q_end, - ).expect("writing hit summary fail\n"); + ) + .expect("writing hit summary fail\n"); } else { writeln!( hit_file, @@ -383,7 +392,8 @@ fn main() -> Result<(), std::io::Error> { e, orientation, target_seq_name - ).expect("writing hit summary fail\n"); + ) + .expect("writing hit summary fail\n"); } sub_seq_range_for_fasta.push(( sid, diff --git a/pgr-bin/src/lib.rs b/pgr-db/src/ext.rs similarity index 98% rename from pgr-bin/src/lib.rs rename to pgr-db/src/ext.rs index fa47a01..f87969f 100644 --- a/pgr-bin/src/lib.rs +++ b/pgr-db/src/ext.rs @@ -3,15 +3,16 @@ use flate2::bufread::MultiGzDecoder; #[cfg(feature = "with_agc")] use memmap2::Mmap; -use pgr_db::fasta_io::FastaReader; -use pgr_db::graph_utils::{AdjList, ShmmrGraphNode}; -pub use pgr_db::seq_db::pair_shmmrs; -use pgr_db::seq_db::{self, raw_query_fragment, raw_query_fragment_from_mmap_midx, GetSeq}; -pub use pgr_db::shmmrutils::{sequence_to_shmmrs, ShmmrSpec}; -use pgr_db::{aln, frag_file_io::CompactSeqFragFileStorage}; +use crate::fasta_io::FastaReader; +use crate::frag_file_io; +use crate::graph_utils::{AdjList, ShmmrGraphNode}; +pub use crate::seq_db::pair_shmmrs; +use crate::seq_db::{self, raw_query_fragment, raw_query_fragment_from_mmap_midx, GetSeq}; +pub use crate::shmmrutils::{sequence_to_shmmrs, ShmmrSpec}; +use crate::{aln, frag_file_io::CompactSeqFragFileStorage}; #[cfg(feature = "with_agc")] -use pgr_db::agc_io::{self, AGCSeqDB}; +use crate::agc_io::{self, AGCSeqDB}; use rayon::prelude::*; use rustc_hash::{FxHashMap, FxHashSet}; @@ -129,7 +130,7 @@ impl SeqIndexDB { } pub fn load_from_frg_index(&mut self, prefix: String) -> Result<(), std::io::Error> { - let mut frag_db = 
pgr_db::frag_file_io::CompactSeqFragFileStorage::new(prefix); + let mut frag_db = frag_file_io::CompactSeqFragFileStorage::new(prefix); let seq_index = frag_db.seq_index.into_iter().map(|(k, v)| (k, v)).collect(); @@ -640,7 +641,6 @@ impl SeqIndexDB { (principal_bundles_with_id, vertex_to_bundle_id_direction_pos) } - pub fn generate_mapg_gfa( &self, min_count: usize, @@ -979,7 +979,10 @@ pub fn get_principal_bundle_decomposition( let (ctg_name, source, _) = data; let source = source.clone().unwrap(); let seq = seq_db.get_seq(source, ctg_name.clone()).unwrap(); - (*sid, seq_db.get_smps(seq, &seq_db.shmmr_spec.clone().unwrap())) + ( + *sid, + seq_db.get_smps(seq, &seq_db.shmmr_spec.clone().unwrap()), + ) }) .collect(); diff --git a/pgr-db/src/lib.rs b/pgr-db/src/lib.rs index dd24a6f..73a0d11 100644 --- a/pgr-db/src/lib.rs +++ b/pgr-db/src/lib.rs @@ -12,6 +12,7 @@ pub mod graph_utils; pub mod kmer_filter; pub mod seq_db; //pub mod seqs2variants; +pub mod ext; pub mod shmmrutils; #[cfg(test)] @@ -114,7 +115,8 @@ mod tests { let m = match_reads(&base_frg, &frg, true, 0.1, 0, 0, 32); if let Some(m) = m { let deltas: Vec = m.deltas.unwrap(); - let aln_segs = deltas_to_aln_segs(&deltas, m.end0 as usize, m.end1 as usize, &base_frg, &frg); + let aln_segs = + deltas_to_aln_segs(&deltas, m.end0 as usize, m.end1 as usize, &base_frg, &frg); let re_seq = reconstruct_seq_from_aln_segs(&base_frg, &aln_segs); if frg != re_seq || true { println!("{} {}", String::from_utf8_lossy(&base_frg), base_frg.len()); @@ -143,7 +145,8 @@ mod tests { let m = match_reads(&base_frg, &frg, true, 0.1, 0, 0, 32); if let Some(m) = m { let deltas: Vec = m.deltas.unwrap(); - let aln_segs = deltas_to_aln_segs(&deltas, m.end0 as usize, m.end1 as usize, &base_frg, &frg); + let aln_segs = + deltas_to_aln_segs(&deltas, m.end0 as usize, m.end1 as usize, &base_frg, &frg); let re_seq = reconstruct_seq_from_aln_segs(&base_frg, &aln_segs); if frg != re_seq || true { println!("{} {}", String::from_utf8_lossy(&base_frg), base_frg.len()); @@ -361,9 +364,10 @@ mod tests { #[test] fn test_open_compact_seq_db_storage() { - use seq_db::GetSeq; use crate::frag_file_io::CompactSeqFragFileStorage; - let seq_storage = CompactSeqFragFileStorage::new("test/test_data/test_seqs_frag".to_string()); + use seq_db::GetSeq; + let seq_storage = + CompactSeqFragFileStorage::new("test/test_data/test_seqs_frag".to_string()); let seq = seq_storage.get_seq_by_id(0); println!("{}", String::from_utf8_lossy(&seq[..])); let seq = seq_storage.get_sub_seq_by_id(0, 100, 200); @@ -374,7 +378,8 @@ mod tests { fn test_seq_db_storage_get_sub_read() { use crate::frag_file_io::CompactSeqFragFileStorage; use seq_db::GetSeq; - let seq_storage = CompactSeqFragFileStorage::new("test/test_data/test_seqs_frag".to_string()); + let seq_storage = + CompactSeqFragFileStorage::new("test/test_data/test_seqs_frag".to_string()); let sid = 0; let seq = seq_storage.get_seq_by_id(sid); From f33d58d8aab591b168ad518e2d44ed06a65514a4 Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 01:06:13 +0000 Subject: [PATCH 06/16] remove the reduant code using the shared pgr_db::ext module --- pgr-tk/src/lib.rs | 676 ++++++---------------------------------------- 1 file changed, 88 insertions(+), 588 deletions(-) diff --git a/pgr-tk/src/lib.rs b/pgr-tk/src/lib.rs index f43d1e5..f7fa2f7 100644 --- a/pgr-tk/src/lib.rs +++ b/pgr-tk/src/lib.rs @@ -1,7 +1,6 @@ // src/lib.rs pub const VERSION_STRING: &'static str = env!("VERSION_STRING"); #[cfg(feature = "with_agc")] -use memmap2::Mmap; use 
pgr_db::aln::{self, HitPair}; use pgr_db::graph_utils::{AdjList, ShmmrGraphNode}; use pgr_db::seq_db::{self, raw_query_fragment}; @@ -11,7 +10,7 @@ use pgr_db::shmmrutils::{sequence_to_shmmrs, DeltaPoint, ShmmrSpec}; #[cfg(feature = "with_agc")] use pgr_db::agc_io; -use pgr_db::{fasta_io, frag_file_io}; +use pgr_db::fasta_io; use pyo3::exceptions; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; @@ -20,23 +19,8 @@ use pyo3::wrap_pyfunction; use pyo3::Python; use rayon::prelude::*; use rustc_hash::{FxHashMap, FxHashSet}; -use seq_db::GetSeq; -use std::collections::HashMap; -use std::fs::File; -#[cfg(feature = "with_agc")] -use std::io::{BufRead, BufReader}; -use std::io::{BufWriter, Write}; -use std::iter::FromIterator; -#[derive(Clone, Copy, PartialEq)] -enum Backend { - #[cfg(feature = "with_agc")] - AGC, - FRG, - FASTX, - MEMORY, - UNKNOWN, -} +use pgr_db::ext::Backend; /// Get the revision (git-hashtag) of the build #[pyfunction] @@ -73,25 +57,8 @@ pub fn pgr_lib_version() -> PyResult { /// #[pyclass] struct SeqIndexDB { - /// Rust internal: store the specification of the shmmr_spec - pub shmmr_spec: Option, - /// Rust internal: store the sequences - pub seq_db: Option, - - #[cfg(feature = "with_agc")] - /// Rust internal: store the agc file and the index - pub agc_db: Option, - - pub frg_db: Option, - /// a dictionary maps (ctg_name, source) -> (id, len) - #[pyo3(get)] - pub seq_index: Option), (u32, u32)>>, - /// a dictionary maps id -> (ctg_name, source, len) - #[pyo3(get)] - pub seq_info: Option, u32)>>, - - pub backend: Backend, - + /// Rust internal: + db_internal: pgr_db::ext::SeqIndexDB, pub principal_bundles: Option<(usize, usize, Vec>)>, } @@ -101,14 +68,16 @@ impl SeqIndexDB { #[new] pub fn new() -> Self { SeqIndexDB { - seq_db: None, - frg_db: None, - #[cfg(feature = "with_agc")] - agc_db: None, - shmmr_spec: None, - seq_index: None, - seq_info: None, - backend: Backend::UNKNOWN, + db_internal: pgr_db::ext::SeqIndexDB { + seq_db: None, + frg_db: None, + #[cfg(feature = "with_agc")] + agc_db: None, + shmmr_spec: None, + seq_index: None, + seq_info: None, + backend: Backend::UNKNOWN, + }, principal_bundles: None, } } @@ -129,69 +98,13 @@ impl SeqIndexDB { #[cfg(feature = "with_agc")] #[pyo3(text_signature = "($self, prefix)")] pub fn load_from_agc_index(&mut self, prefix: String) -> PyResult<()> { - let (shmmr_spec, frag_location_map) = - seq_db::read_mdb_file_to_frag_locations(prefix.to_string() + ".mdb").unwrap(); - - let frag_location_map = - FxHashMap::<(u64, u64), (usize, usize)>::from_iter(frag_location_map); - - let agc_file = agc_io::AGCFile::new(prefix.to_string() + ".agc")?; - - let fmap_file = File::open(prefix.clone() + ".mdb").expect("frag map file open fail"); - let frag_map_file = - unsafe { Mmap::map(&fmap_file).expect("frag map file memory map creation fail") }; - - self.agc_db = Some(agc_io::AGCSeqDB { - agc_file, - frag_location_map, - frag_map_file, - }); - self.backend = Backend::AGC; - self.shmmr_spec = Some(shmmr_spec); - - let mut seq_index = HashMap::<(String, Option), (u32, u32)>::new(); - let mut seq_info = HashMap::, u32)>::new(); - - let midx_file = BufReader::new(File::open(prefix.to_string() + ".midx")?); - midx_file - .lines() - .into_iter() - .try_for_each(|line| -> Result<(), std::io::Error> { - let line = line.unwrap(); - let mut line = line.as_str().split("\t"); - let sid = line.next().unwrap().parse::().unwrap(); - let len = line.next().unwrap().parse::().unwrap(); - let ctg_name = line.next().unwrap().to_string(); - let source = 
line.next().unwrap().to_string(); - seq_index.insert((ctg_name.clone(), Some(source.clone())), (sid, len)); - seq_info.insert(sid, (ctg_name, Some(source), len)); - Ok(()) - })?; - - self.seq_index = Some(seq_index); - self.seq_info = Some(seq_info); + self.db_internal.load_from_agc_index(prefix)?; Ok(()) } #[pyo3(text_signature = "($self, prefix)")] pub fn load_from_frg_index(&mut self, prefix: String) -> PyResult<()> { - let mut frag_db = pgr_db::frag_file_io::CompactSeqFragFileStorage::new(prefix); - - let seq_index = frag_db.seq_index.into_iter().map(|(k, v)| (k, v)).collect(); - - let seq_info = frag_db.seq_info.into_iter().map(|(k, v)| (k, v)).collect(); - - frag_db.seq_index = FxHashMap::<(String, Option), (u32, u32)>::default(); - frag_db.seq_info = FxHashMap::, u32)>::default(); - - let shmmr_spec = frag_db.shmmr_spec.clone(); - - self.frg_db = Some(frag_db); - self.backend = Backend::FRG; - self.shmmr_spec = Some(shmmr_spec); - - self.seq_index = Some(seq_index); - self.seq_info = Some(seq_info); + self.db_internal.load_from_frg_index(prefix)?; Ok(()) } @@ -231,45 +144,19 @@ impl SeqIndexDB { r: u32, min_span: u32, ) -> PyResult<()> { - let spec = ShmmrSpec { - w, - k, - r, - min_span, - sketch: false, - }; - let mut sdb = seq_db::CompactSeqDB::new(spec.clone()); - sdb.load_seqs_from_fastx(filepath)?; - self.shmmr_spec = Some(spec); - let mut seq_index = HashMap::<(String, Option), (u32, u32)>::new(); - let mut seq_info = HashMap::, u32)>::new(); - sdb.seqs.iter().for_each(|v| { - seq_index.insert((v.name.clone(), v.source.clone()), (v.id, v.len as u32)); - seq_info.insert(v.id, (v.name.clone(), v.source.clone(), v.len as u32)); - }); - self.seq_index = Some(seq_index); - self.seq_info = Some(seq_info); - self.seq_db = Some(sdb); - self.backend = Backend::FASTX; + self.db_internal + .load_from_fastx(filepath, w, k, r, min_span)?; Ok(()) } #[pyo3(text_signature = "($self)")] pub fn append_from_fastx(&mut self, filepath: String) -> PyResult<()> { assert!( - self.backend == Backend::FASTX, + self.db_internal.backend == Backend::FASTX, "Only DB created with load_from_fastx() can add data from another fastx file" ); - let sdb = self.seq_db.as_mut().unwrap(); + let sdb = self.db_internal.seq_db.as_mut().unwrap(); sdb.load_seqs_from_fastx(filepath)?; - let mut seq_index = HashMap::<(String, Option), (u32, u32)>::new(); - let mut seq_info = HashMap::, u32)>::new(); - sdb.seqs.iter().for_each(|v| { - seq_index.insert((v.name.clone(), v.source.clone()), (v.id, v.len as u32)); - seq_info.insert(v.id, (v.name.clone(), v.source.clone(), v.len as u32)); - }); - self.seq_index = Some(seq_index); - self.seq_info = Some(seq_info); Ok(()) } @@ -313,36 +200,24 @@ impl SeqIndexDB { r: u32, min_span: u32, ) -> PyResult<()> { - let spec = ShmmrSpec { - w, - k, - r, - min_span, - sketch: false, - }; - self.backend = Backend::MEMORY; - let source = Some(source.unwrap().to_string()); - let mut sdb = seq_db::CompactSeqDB::new(spec.clone()); - let seq_vec = seq_list - .into_iter() - .enumerate() - .map(|(sid, v)| (sid as u32, source.clone(), v.0, v.1)) - .collect::, String, Vec)>>(); - sdb.load_seqs_from_seq_vec(&seq_vec); - - self.shmmr_spec = Some(spec); - let mut seq_index = HashMap::<(String, Option), (u32, u32)>::new(); - let mut seq_info = HashMap::, u32)>::new(); - sdb.seqs.iter().for_each(|v| { - seq_index.insert((v.name.clone(), v.source.clone()), (v.id, v.len as u32)); - seq_info.insert(v.id, (v.name.clone(), v.source.clone(), v.len as u32)); - }); - self.seq_index = Some(seq_index); - 
self.seq_info = Some(seq_info); - self.seq_db = Some(sdb); + self.db_internal + .load_from_seq_list(seq_list, source, w, k, r, min_span)?; + Ok(()) } + /// get a dictionary that maps (ctg_name, source) -> (id, len) + pub fn get_seq_index( + &self, + ) -> PyResult), (u32, u32)>>> { + Ok(self.db_internal.seq_index.clone()) + } + + /// a dictionary that maps id -> (ctg_name, source, len) + pub fn get_seq_info(&self) -> PyResult, u32)>>> { + Ok(self.db_internal.seq_info.clone()) + } + /// use a fragment of sequence to query the database to get all hits /// /// Parameters @@ -369,7 +244,7 @@ impl SeqIndexDB { &self, seq: Vec, ) -> PyResult)>> { - let shmmr_spec = &self.shmmr_spec.as_ref().unwrap(); + let shmmr_spec = &self.db_internal.shmmr_spec.as_ref().unwrap(); let shmmr_to_frags = self.get_shmmr_map_internal().unwrap(); let res: Vec<((u64, u64), (u32, u32, u8), Vec)> = seq_db::raw_query_fragment(shmmr_to_frags, &seq, shmmr_spec); @@ -396,7 +271,7 @@ impl SeqIndexDB { &self, seq: Vec, ) -> PyResult>> { - let shmmr_spec = &self.shmmr_spec.as_ref().unwrap(); + let shmmr_spec = &self.db_internal.shmmr_spec.as_ref().unwrap(); let shmmr_to_frags = self.get_shmmr_map_internal().unwrap(); let res = seq_db::get_match_positions_with_fragment(shmmr_to_frags, &seq, shmmr_spec); Ok(res) @@ -450,23 +325,17 @@ impl SeqIndexDB { max_count_target: Option, max_aln_span: Option, ) -> PyResult)>)>> { - let shmmr_spec = &self.shmmr_spec.as_ref().unwrap(); - if let Some(frag_map) = self.get_shmmr_map_internal() { - let raw_query_hits = raw_query_fragment(frag_map, &seq, shmmr_spec); - let res = aln::query_fragment_to_hps( - raw_query_hits, - &seq, - shmmr_spec, + Ok(self + .db_internal + .query_fragment_to_hps( + seq, penalty, max_count, max_count_query, max_count_target, max_aln_span, - ); - Ok(res) - } else { - Err(PyValueError::new_err("fail to find an index")) - } + ) + .unwrap()) } /// Given a sequence context, this function maps the specific positions in the context @@ -528,7 +397,7 @@ impl SeqIndexDB { max_count_target: Option, max_aln_span: Option, ) -> PyResult> { - let shmmr_spec = &self.shmmr_spec.as_ref().unwrap(); + let shmmr_spec = &self.db_internal.shmmr_spec.as_ref().unwrap(); let mut all_alns = if let Some(frag_map) = self.get_shmmr_map_internal() { let raw_query_hits = raw_query_fragment(frag_map, &seq, shmmr_spec); @@ -577,13 +446,19 @@ impl SeqIndexDB { // fetch the sequence for each match if possible let mut out = vec![]; - if self.seq_info.is_none() { + if self.db_internal.seq_info.is_none() { return Ok(out); }; pos2hits.iter().for_each(|(pos, hits)| { hits.iter() .for_each(|(seq_id, _score, left_match, right_match)| { - let (ctg, src, t_len) = self.seq_info.as_ref().unwrap().get(&seq_id).unwrap(); //TODO, check if seq_info is None + let (ctg, src, t_len) = self + .db_internal + .seq_info + .as_ref() + .unwrap() + .get(&seq_id) + .unwrap(); //TODO, check if seq_info is None let same_orientation = if left_match.0 .2 == left_match.1 .2 { true } else { @@ -730,6 +605,7 @@ impl SeqIndexDB { .for_each(|v| { let sid = v.1; let source = self + .db_internal .seq_info .as_ref() .unwrap() @@ -770,7 +646,7 @@ impl SeqIndexDB { /// (window_size, k_mer_size, reduction_factor, min_space, use_sketch) /// pub fn get_shmmr_spec(&self) -> PyResult> { - if let Some(spec) = self.shmmr_spec.as_ref() { + if let Some(spec) = self.db_internal.shmmr_spec.as_ref() { Ok(Some((spec.w, spec.k, spec.r, spec.min_span, spec.sketch))) } else { Ok(None) @@ -850,42 +726,9 @@ impl SeqIndexDB { bgn: usize, end: usize, ) -> 
PyResult> { - match self.backend { - #[cfg(feature = "with_agc")] - Backend::AGC => Ok(self.agc_db.as_ref().unwrap().agc_file.get_sub_seq( - sample_name, - ctg_name, - bgn, - end, - )), - Backend::MEMORY | Backend::FASTX => { - let &(sid, _) = self - .seq_index - .as_ref() - .unwrap() - .get(&(ctg_name, Some(sample_name))) - .unwrap(); - Ok(self - .seq_db - .as_ref() - .unwrap() - .get_sub_seq_by_id(sid, bgn as u32, end as u32)) - } - Backend::FRG => { - let &(sid, _) = self - .seq_index - .as_ref() - .unwrap() - .get(&(ctg_name, Some(sample_name))) - .unwrap(); - Ok(self - .frg_db - .as_ref() - .unwrap() - .get_sub_seq_by_id(sid, bgn as u32, end as u32)) - } - Backend::UNKNOWN => Err(PyValueError::new_err("no seq db found")), - } + Ok(self + .db_internal + .get_sub_seq(sample_name, ctg_name, bgn, end)?) } /// fetch a contiguous sub-sequence by a sequence id @@ -906,31 +749,7 @@ impl SeqIndexDB { /// a list of bytes representing the sequence #[pyo3(text_signature = "($self, sample_name, ctg_name, bgn, end)")] pub fn get_sub_seq_by_id(&self, sid: u32, bgn: usize, end: usize) -> PyResult> { - match self.backend { - #[cfg(feature = "with_agc")] - Backend::AGC => { - let (ctg_name, sample_name, _) = self.seq_info.as_ref().unwrap().get(&sid).unwrap(); //TODO: handle Option unwrap properly - let ctg_name = ctg_name.clone(); - let sample_name = sample_name.as_ref().unwrap().clone(); - Ok(self.agc_db.as_ref().unwrap().agc_file.get_sub_seq( - sample_name, - ctg_name, - bgn, - end, - )) - } - Backend::MEMORY | Backend::FASTX => Ok(self - .seq_db - .as_ref() - .unwrap() - .get_sub_seq_by_id(sid, bgn as u32, end as u32)), - Backend::FRG => Ok(self - .frg_db - .as_ref() - .unwrap() - .get_sub_seq_by_id(sid, bgn as u32, end as u32)), - Backend::UNKNOWN => Err(PyValueError::new_err("no seq db found")), - } + Ok(self.db_internal.get_sub_seq_by_id(sid, bgn, end)?) } /// fetch a sequence @@ -949,34 +768,7 @@ impl SeqIndexDB { /// a list of bytes representing the sequence #[pyo3(text_signature = "($self, sample_name, ctg_name)")] pub fn get_seq(&self, sample_name: String, ctg_name: String) -> PyResult> { - match self.backend { - #[cfg(feature = "with_agc")] - Backend::AGC => Ok(self - .agc_db - .as_ref() - .unwrap() - .agc_file - .get_seq(sample_name, ctg_name)), - Backend::MEMORY | Backend::FASTX => { - let &(sid, _) = self - .seq_index - .as_ref() - .unwrap() - .get(&(ctg_name, Some(sample_name))) - .unwrap(); - Ok(self.seq_db.as_ref().unwrap().get_seq_by_id(sid)) - } - Backend::FRG => { - let &(sid, _) = self - .seq_index - .as_ref() - .unwrap() - .get(&(ctg_name, Some(sample_name))) - .unwrap(); - Ok(self.frg_db.as_ref().unwrap().get_seq_by_id(sid)) - } - Backend::UNKNOWN => Err(PyValueError::new_err("no seq db found")), - } + Ok(self.db_internal.get_seq(sample_name, ctg_name)?) 
} /// fetch a sequence by the sequence id in the database @@ -995,24 +787,7 @@ impl SeqIndexDB { /// a list of bytes representing the sequence #[pyo3(text_signature = "($self, sample_name, ctg_name)")] pub fn get_seq_by_id(&self, sid: u32) -> PyResult> { - match self.backend { - #[cfg(feature = "with_agc")] - Backend::AGC => { - let (ctg_name, sample_name, _) = self.seq_info.as_ref().unwrap().get(&sid).unwrap(); //TODO: handle Option unwrap properly - let ctg_name = ctg_name.clone(); - let sample_name = sample_name.as_ref().unwrap().clone(); - Ok(self - .agc_db - .as_ref() - .unwrap() - .agc_file - .get_seq(sample_name, ctg_name)) - } - Backend::MEMORY => Ok(self.seq_db.as_ref().unwrap().get_seq_by_id(sid)), - Backend::FASTX => Ok(self.seq_db.as_ref().unwrap().get_seq_by_id(sid)), - Backend::FRG => Ok(self.frg_db.as_ref().unwrap().get_seq_by_id(sid)), - Backend::UNKNOWN => Err(PyValueError::new_err("no seq db found")), - } + Ok(self.db_internal.get_seq_by_id(sid).unwrap()) } /// Get adjacent list of the shimmer graph shimmer_pair -> shimmer_pair @@ -1138,23 +913,9 @@ impl SeqIndexDB { path_len_cutoff: usize, keeps: Option>, ) -> Vec> { - if let Some((m, plc, pb)) = self.principal_bundles.as_ref() { - if *m == min_count && *plc == path_len_cutoff { - return pb.clone(); - } - } - - let pb = if let Some(frag_map) = self.get_shmmr_map_internal() { - let adj_list = seq_db::frag_map_to_adj_list(frag_map, min_count as usize, keeps); - - seq_db::get_principal_bundles_from_adj_list(frag_map, &adj_list, path_len_cutoff) - .0 - .into_iter() - .map(|p| p.into_iter().map(|v| (v.0, v.1, v.2)).collect()) - .collect::>>() - } else { - vec![] - }; + let pb = self + .db_internal + .get_principal_bundles(min_count, path_len_cutoff, keeps); self.principal_bundles = Some((min_count, path_len_cutoff, pb.clone())); pb } @@ -1226,6 +987,7 @@ impl SeqIndexDB { //println!("DBG: # bundles {}", pb.len()); let seqid_seq_list: Vec<(u32, Vec)> = self + .db_internal .seq_info .clone() .unwrap_or_default() @@ -1326,7 +1088,12 @@ impl SeqIndexDB { let seqid_smps: Vec<(u32, Vec<(u64, u64, u32, u32, u8)>)> = sequences .into_iter() - .map(|(sid, seq)| (sid as u32, get_smps(seq, &self.shmmr_spec.clone().unwrap()))) + .map(|(sid, seq)| { + ( + sid as u32, + get_smps(seq, &self.db_internal.shmmr_spec.clone().unwrap()), + ) + }) .collect(); // data for reordering the bundles and for re-ordering them along the sequences @@ -1454,136 +1221,8 @@ impl SeqIndexDB { method: &str, keeps: Option>, ) -> PyResult<()> { - let get_seq_by_id = |sid| -> Vec { - match self.backend { - #[cfg(feature = "with_agc")] - Backend::AGC => { - let (ctg_name, sample_name, _) = - self.seq_info.as_ref().unwrap().get(&sid).unwrap(); //TODO: handle Option unwrap properly - let ctg_name = ctg_name.clone(); - let sample_name = sample_name.as_ref().unwrap().clone(); - self.agc_db - .as_ref() - .unwrap() - .agc_file - .get_seq(sample_name, ctg_name) - } - Backend::MEMORY => self.seq_db.as_ref().unwrap().get_seq_by_id(sid), - Backend::FASTX => self.seq_db.as_ref().unwrap().get_seq_by_id(sid), - Backend::FRG => self.frg_db.as_ref().unwrap().get_seq_by_id(sid), - Backend::UNKNOWN => vec![], - } - }; - - let frag_map = self.get_shmmr_map_internal(); - if frag_map.is_none() { - return Err(PyValueError::new_err("no index found")); - } - let mut overlaps = - FxHashMap::<(ShmmrGraphNode, ShmmrGraphNode), Vec<(u32, u8, u8)>>::default(); - let mut frag_id = FxHashMap::<(u64, u64), usize>::default(); - let mut id = 0_usize; - - let frag_map = frag_map.unwrap(); - - let 
adj_list = if method == "from_fragmap" { - seq_db::frag_map_to_adj_list(frag_map, min_count, keeps) - } else { - let keeps = if let Some(keeps) = keeps { - Some(FxHashSet::::from_iter(keeps)) - } else { - None - }; - - self.seq_info - .as_ref() - .unwrap() - .keys() - .into_iter() - .map(|k| *k) - .collect::>() - .into_par_iter() - .flat_map(|sid| { - let seq = get_seq_by_id(sid); - let mc = if let Some(keeps) = &keeps { - if keeps.contains(&sid) { - 0 - } else { - min_count - } - } else { - min_count - }; - seq_db::generate_smp_adj_list_for_seq( - &seq, - sid, - &frag_map, - self.shmmr_spec.as_ref().unwrap(), - mc, - ) - }) - .collect::() - }; - - adj_list.iter().for_each(|(k, v, w)| { - if v.0 <= w.0 { - let key = (*v, *w); - let val = (*k, v.2, w.2); - overlaps.entry(key).or_insert_with(|| vec![]).push(val); - frag_id.entry((v.0, v.1)).or_insert_with(|| { - let c_id = id; - id += 1; - c_id - }); - frag_id.entry((w.0, w.1)).or_insert_with(|| { - let c_id = id; - id += 1; - c_id - }); - } - }); - - let mut out_file = BufWriter::new(File::create(filepath).unwrap()); - - let kmer_size = self.shmmr_spec.as_ref().unwrap().k; - out_file.write("H\tVN:Z:1.0\tCM:Z:Sparse Genome Graph Generated By pgr-tk\n".as_bytes())?; - (&frag_id) - .into_iter() - .try_for_each(|(smp, id)| -> PyResult<()> { - let hits = frag_map.get(&smp).unwrap(); - let ave_len = - hits.iter().fold(0_u32, |len_sum, &s| len_sum + s.3 - s.2) / hits.len() as u32; - let seg_line = format!( - "S\t{}\t*\tLN:i:{}\tSN:Z:{:016x}_{:016x}\n", - id, - ave_len + kmer_size, - smp.0, - smp.1 - ); - out_file.write(seg_line.as_bytes())?; - Ok(()) - })?; - - overlaps - .into_iter() - .try_for_each(|(op, vs)| -> PyResult<()> { - let o1 = if op.0 .2 == 0 { "+" } else { "-" }; - let o2 = if op.1 .2 == 0 { "+" } else { "-" }; - let id0 = frag_id.get(&(op.0 .0, op.0 .1)).unwrap(); - let id1 = frag_id.get(&(op.1 .0, op.1 .1)).unwrap(); - let overlap_line = format!( - "L\t{}\t{}\t{}\t{}\t{}M\tSC:i:{}\n", - id0, - o1, - id1, - o2, - kmer_size, - vs.len() - ); - out_file.write(overlap_line.as_bytes())?; - Ok(()) - })?; - + self.db_internal + .generate_mapg_gfa(min_count, filepath, method, keeps)?; Ok(()) } @@ -1602,60 +1241,7 @@ impl SeqIndexDB { /// fn write_mapg_idx(&self, filepath: &str) -> Result<(), std::io::Error> { - let mut writer = BufWriter::new(File::create(filepath)?); - - if let Some(shmmr_spec) = self.shmmr_spec.clone() { - writer.write( - format!( - "K\t{}\t{}\t{}\t{}\t{}\n", - shmmr_spec.w, - shmmr_spec.k, - shmmr_spec.r, - shmmr_spec.min_span, - shmmr_spec.sketch - ) - .as_bytes(), - )?; - } - - self.seq_info.as_ref().unwrap().iter().try_for_each( - |(k, v)| -> Result<(), std::io::Error> { - let line = format!( - "C\t{}\t{}\t{}\t{}\n", - k, - v.0, - v.1.clone().unwrap_or("NA".to_string()), - v.2 - ); - writer.write(line.as_bytes())?; - Ok(()) - }, - )?; - - let frag_map = self.get_shmmr_map_internal(); - if frag_map.is_none() { - return Err(std::io::Error::new( - std::io::ErrorKind::Other, - "fail to load index", - )); - }; - let frag_map = frag_map.unwrap(); - frag_map - .iter() - .try_for_each(|v| -> Result<(), std::io::Error> { - v.1.iter() - .try_for_each(|vv| -> Result<(), std::io::Error> { - writer.write( - format!( - "F\t{:016x}_{:016x}\t{}\t{}\t{}\t{}\t{}\n", - v.0 .0, v.0 .1, vv.0, vv.1, vv.2, vv.3, vv.4 - ) - .as_bytes(), - )?; - Ok(()) - })?; - Ok(()) - })?; + self.db_internal.write_mapg_idx(filepath)?; Ok(()) } @@ -1688,105 +1274,18 @@ impl SeqIndexDB { filepath: &str, keeps: Option>, ) -> PyResult<()> { - let frag_map = 
self.get_shmmr_map_internal(); - if frag_map.is_none() { - return Err(PyValueError::new_err("can't load index")); - }; - let frag_map = frag_map.unwrap(); - let adj_list = seq_db::frag_map_to_adj_list(frag_map, min_count, keeps); - let mut overlaps = - FxHashMap::<(ShmmrGraphNode, ShmmrGraphNode), Vec<(u32, u8, u8)>>::default(); - let mut frag_id = FxHashMap::<(u64, u64), usize>::default(); - let mut id = 0_usize; - let (pb, filtered_adj_list) = - seq_db::get_principal_bundles_from_adj_list(frag_map, &adj_list, path_len_cutoff); - - // TODO: we will remove this redundant conversion in the future - let pb = pb - .into_iter() - .map(|p| p.into_iter().map(|v| (v.0, v.1, v.2)).collect()) - .collect::>>(); - - let vertex_to_bundle_id_direction_pos = self.get_vertex_map_from_principal_bundles(pb); - - filtered_adj_list.iter().for_each(|(k, v, w)| { - if v.0 <= w.0 { - let key = (*v, *w); - let val = (*k, v.2, w.2); - overlaps.entry(key).or_insert_with(|| vec![]).push(val); - frag_id.entry((v.0, v.1)).or_insert_with(|| { - let c_id = id; - id += 1; - c_id - }); - frag_id.entry((w.0, w.1)).or_insert_with(|| { - let c_id = id; - id += 1; - c_id - }); - } - }); - - let mut out_file = BufWriter::new(File::create(filepath).unwrap()); - - let kmer_size = self.shmmr_spec.as_ref().unwrap().k; - out_file.write("H\tVN:Z:1.0\tCM:Z:Sparse Genome Graph Generated By pgr-tk\n".as_bytes())?; - (&frag_id) - .into_iter() - .try_for_each(|(smp, id)| -> PyResult<()> { - let hits = frag_map.get(&smp).unwrap(); - let ave_len = - hits.iter().fold(0_u32, |len_sum, &s| len_sum + s.3 - s.2) / hits.len() as u32; - let seg_line; - if let Some(bundle_id) = vertex_to_bundle_id_direction_pos.get(smp) { - seg_line = format!( - "S\t{}\t*\tLN:i:{}\tSN:Z:{:016x}_{:016x}\tBN:i:{}\tBP:i:{}\n", - id, - ave_len + kmer_size, - smp.0, - smp.1, - bundle_id.0, - bundle_id.2 - ); - } else { - seg_line = format!( - "S\t{}\t*\tLN:i:{}\tSN:Z:{:016x}_{:016x}\n", - id, - ave_len + kmer_size, - smp.0, - smp.1 - ); - } - out_file.write(seg_line.as_bytes())?; - Ok(()) - })?; - - overlaps - .into_iter() - .try_for_each(|(op, vs)| -> PyResult<()> { - let o1 = if op.0 .2 == 0 { "+" } else { "-" }; - let o2 = if op.1 .2 == 0 { "+" } else { "-" }; - let id0 = frag_id.get(&(op.0 .0, op.0 .1)).unwrap(); - let id1 = frag_id.get(&(op.1 .0, op.1 .1)).unwrap(); - let overlap_line = format!( - "L\t{}\t{}\t{}\t{}\t{}M\tSC:i:{}\n", - id0, - o1, - id1, - o2, - kmer_size, - vs.len() - ); - out_file.write(overlap_line.as_bytes())?; - Ok(()) - })?; - + self.db_internal.generate_principal_mapg_gfa( + min_count, + path_len_cutoff, + filepath, + keeps, + )?; Ok(()) } fn write_frag_and_index_files(&self, file_prefix: String) -> () { - if self.seq_db.is_some() { - let internal = self.seq_db.as_ref().unwrap(); + if self.db_internal.seq_db.is_some() { + let internal = self.db_internal.seq_db.as_ref().unwrap(); internal.write_to_frag_files(file_prefix.clone(), None); internal @@ -1804,10 +1303,11 @@ impl SeqIndexDB { min_cov: u32, ) -> PyResult, Vec)>)>> { assert!( - self.backend == Backend::FASTX || self.backend == Backend::MEMORY, + self.db_internal.backend == Backend::FASTX + || self.db_internal.backend == Backend::MEMORY, "Only DB created with load_from_fastx() can add data from another fastx file" ); - let sdb = &self.seq_db.as_ref().unwrap(); + let sdb = &self.db_internal.seq_db.as_ref().unwrap(); let consensus = pgr_db::ec::shmmr_sparse_aln_consensus_with_sdb(sids, sdb, min_cov); match consensus { Ok(seq) => Ok(seq), @@ -1819,11 +1319,11 @@ impl SeqIndexDB { impl 
SeqIndexDB { // depending on the storage type, return the corresponded index fn get_shmmr_map_internal(&self) -> Option<&seq_db::ShmmrToFrags> { - match self.backend { + match self.db_internal.backend { #[cfg(feature = "with_agc")] Backend::AGC => None, - Backend::FASTX => Some(&self.seq_db.as_ref().unwrap().frag_map), - Backend::MEMORY => Some(&self.seq_db.as_ref().unwrap().frag_map), + Backend::FASTX => Some(&self.db_internal.seq_db.as_ref().unwrap().frag_map), + Backend::MEMORY => Some(&self.db_internal.seq_db.as_ref().unwrap().frag_map), Backend::FRG => None, Backend::UNKNOWN => None, } @@ -1836,7 +1336,7 @@ impl SeqIndexDB { /// /// >>> agc_file = AGCFile("/path/to/genomes.agc") /// -#[cfg(feature = "with_agc")] +#[cfg(feature = "with_agc")] #[pyclass(unsendable)] // lock in one thread (see https://github.com/PyO3/pyo3/blob/main/guide/src/class.md) struct AGCFile { /// internal agc_file handle From db674f608f20488fdee66bd22a5c3f2b0affa8d3 Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 01:27:00 +0000 Subject: [PATCH 07/16] bump pyo3 to 0.18.3 --- pgr-tk/Cargo.toml | 2 +- pgr-tk/src/lib.rs | 59 +++++++++++++++++++---------------------------- 2 files changed, 25 insertions(+), 36 deletions(-) diff --git a/pgr-tk/Cargo.toml b/pgr-tk/Cargo.toml index 7a565a5..6edf742 100644 --- a/pgr-tk/Cargo.toml +++ b/pgr-tk/Cargo.toml @@ -10,7 +10,7 @@ name = "pgrtk" crate-type = ["rlib","cdylib"] [dependencies] -pyo3 = { version = "0.14.3", features = ["extension-module"] } +pyo3 = { version = "0.18.3", features = ["extension-module"] } pgr-db = { path = "../pgr-db/", default-features = false } rustc-hash = "1.1.0" diff --git a/pgr-tk/src/lib.rs b/pgr-tk/src/lib.rs index f7fa2f7..c7d066d 100644 --- a/pgr-tk/src/lib.rs +++ b/pgr-tk/src/lib.rs @@ -134,8 +134,7 @@ impl SeqIndexDB { /// None or I/O Error /// None /// - #[pyo3(text_signature = "($self, w, k, r, min_span)")] - #[args(w = "80", k = "56", r = "4", min_span = "64")] + #[pyo3(signature = (filepath, w=80, k=56, r=4, min_span=64))] pub fn load_from_fastx( &mut self, filepath: String, @@ -189,8 +188,7 @@ impl SeqIndexDB { /// None or I/O Error /// None /// - #[pyo3(text_signature = "($self, seq_list, source, w, k, r, min_span)")] - #[args(source = "\"Memory\"", w = "80", k = "56", r = "4", min_span = "8")] + #[pyo3(signature = (seq_list, source="Memory", w=80, k=56, r=4, min_span=8))] pub fn load_from_seq_list( &mut self, seq_list: Vec<(String, Vec)>, @@ -207,6 +205,7 @@ impl SeqIndexDB { } /// get a dictionary that maps (ctg_name, source) -> (id, len) + #[getter] pub fn get_seq_index( &self, ) -> PyResult), (u32, u32)>>> { @@ -214,6 +213,7 @@ impl SeqIndexDB { } /// a dictionary that maps id -> (ctg_name, source, len) + #[getter] pub fn get_seq_info(&self) -> PyResult, u32)>>> { Ok(self.db_internal.seq_info.clone()) } @@ -583,8 +583,7 @@ impl SeqIndexDB { /// list /// a list of the tuple (source_name : string, count : int) /// - #[pyo3(text_signature = "($self, shmmr_pair, max_unique_count)")] - #[args(max_unique_count = "1")] + #[pyo3(signature = (shmmr_pair, max_unique_count))] pub fn get_shmmr_pair_source_count( &self, shmmr_pair: (u64, u64), @@ -802,7 +801,7 @@ impl SeqIndexDB { /// list /// list of pairs of shimmer pairs ((h00, h01, orientation0),(h10, h11, orientation1)) /// - #[args(keeps = "None")] + #[pyo3(signature = (min_count, keeps=None))] pub fn get_smp_adj_list( &self, min_count: usize, @@ -906,7 +905,7 @@ impl SeqIndexDB { /// list /// list of paths, each path is a list of nodes /// - #[args(keeps = "None")] + 
#[pyo3(signature = (min_count, path_len_cutoff, keeps=None))] pub fn get_principal_bundles( &mut self, min_count: usize, @@ -970,7 +969,7 @@ impl SeqIndexDB { /// the elements of the list are ((hash0:u64, hash1:u64, pos0:u32, pos0:u32, direction:0), /// (principal_bundle_id, direction, order_in_the_bundle)) /// - #[args(keeps = "None")] + #[pyo3(signature = (min_count, path_len_cutoff, keeps=None))] pub fn get_principal_bundle_decomposition( &mut self, min_count: usize, @@ -1032,12 +1031,12 @@ impl SeqIndexDB { /// (principal_bundle_id, direction, order_in_the_bundle)) /// /// - #[args(keeps = "None")] + #[pyo3(signature = (min_count, path_len_cutoff, sequence, keeps=None))] pub fn get_principal_bundle_projection( &mut self, min_count: usize, path_len_cutoff: usize, - sequences: Vec<(u32, Vec)>, + sequence: Vec<(u32, Vec)>, keeps: Option>, ) -> ( Vec<(usize, usize, Vec<(u64, u64, u8)>)>, @@ -1048,7 +1047,7 @@ impl SeqIndexDB { ) { let pb = self.get_principal_bundles(min_count, path_len_cutoff, keeps); //println!("DBG: # bundles {}", pb.len()); - self._get_principal_bundle_projection_internal(pb, sequences) + self._get_principal_bundle_projection_internal(pb, sequence) } fn _get_principal_bundle_projection_internal( @@ -1213,7 +1212,7 @@ impl SeqIndexDB { /// None /// The data is written into the file at filepath /// - #[args(method = "\"from_fragmap\"", keeps = "None")] + #[pyo3(signature = (min_count, filepath, method="from_fragmap", keeps=None))] pub fn generate_mapg_gfa( &self, min_count: usize, @@ -1266,7 +1265,7 @@ impl SeqIndexDB { /// None /// The data is written into the file at filepath /// - #[args(keeps = "None")] + #[pyo3(signature = (min_count, path_len_cutoff, filepath, keeps=None))] pub fn generate_principal_mapg_gfa( &self, min_count: usize, @@ -1295,8 +1294,7 @@ impl SeqIndexDB { } /// generate consensus sequence for one sequence in the database - #[args(sid, min_cov)] - #[pyo3(text_signature = "($self, sid, min_cov)")] + #[pyo3(signature = (sids, min_cov))] pub fn shmmr_sparse_aln_consensus( &self, sids: Vec, @@ -1362,7 +1360,7 @@ impl AGCFile { /// ---------- /// filepath: string /// the path to a AGC file - #[args(filepath)] + #[pyo3(signature=(filepath))] #[new] pub fn new(filepath: String) -> PyResult { let agc_file = agc_io::AGCFile::new(filepath)?; @@ -1390,8 +1388,7 @@ impl AGCFile { /// ------- /// list /// a list of bytes representing the sequence - #[args(sample_name, ctg_name, bgn, end)] - #[pyo3(text_signature = "($self, sample_name, ctg_name, bgn, end)")] + #[pyo3(signature = (sample_name, ctg_name, bgn, end))] pub fn get_sub_seq( &self, sample_name: String, @@ -1415,8 +1412,7 @@ impl AGCFile { /// ------- /// list /// a list of bytes representing the sequence - #[args(sample_name, ctg_name)] - #[pyo3(text_signature = "($self, sample_name, ctg_name)")] + #[pyo3(signature = (sample_name, ctg_name))] pub fn get_seq(&self, sample_name: String, ctg_name: String) -> PyResult> { Ok(self.agc_file.get_seq(sample_name, ctg_name)) } @@ -1450,8 +1446,7 @@ impl AGCFile { /// chunk alignment ignoring the gaps. Typically, a number between 0.1 to 0.5 should /// be used. 
/// -#[pyfunction(sp_hits, max_span, penalty)] -#[pyo3(text_signature = "($self, sp_hits, max_span, penalty)")] +#[pyfunction(signature = (sp_hits, max_span, penalty))] pub fn sparse_aln( sp_hits: Vec, max_span: u32, @@ -1491,8 +1486,7 @@ pub fn sparse_aln( /// list of tuple /// a list fo tuple of ``(shmmr0, shmmr1, position0, position1, orientation)`` /// -#[pyfunction(w = "80", k = "56", r = "4", min_span = "16", padding = "false")] -#[pyo3(text_signature = "($self, w, k, r, min_span, padding)")] +#[pyfunction(signature = (seq, w=80, k=56, r=4, min_span=16, padding=false))] fn get_shmmr_pairs_from_seq( seq: Vec, w: u32, @@ -1560,8 +1554,7 @@ fn get_shmmr_pairs_from_seq( /// - ``x``: the matched shimmer positions in sequence 0 /// - ``y``: the matched shimmer positions in sequence 1 /// -#[pyfunction(w = "80", k = "56", r = "4", min_span = "16")] -#[pyo3(text_signature = "($self, seq0, seq1, w, k, r, min_span)")] +#[pyfunction(signature = (seq0, seq1, w = 80, k = 56, r = 4, min_span = 16))] fn get_shmmr_dots( seq0: Vec, seq1: Vec, @@ -1824,8 +1817,7 @@ pub struct AlnMap { /// list /// a list of bytes representing the consensus sequence /// -#[pyfunction(seqs, kmer_size = 33, min_cov = 2)] -#[pyo3(text_signature = "($self, seqs, kmer_size, min_cov)")] +#[pyfunction(signature = (seqs, kmer_size=33, min_cov=2))] pub fn naive_dbg_consensus( seqs: Vec>, kmer_size: usize, @@ -1855,8 +1847,7 @@ pub fn naive_dbg_consensus( /// list /// a list of a set of bytes representing the consensus sequences of all branches in the graph /// -#[pyfunction(seqs, w = 33, k = 33, r = 1, min_span = 0)] -#[pyo3(text_signature = "($self, seqs, w, k, r, min_span)")] +#[pyfunction(signature= (seqs, w = 33, k = 33, r = 1, min_span = 0))] pub fn shmmr_dbg_consensus( seqs: Vec>, w: u32, @@ -1898,8 +1889,7 @@ pub fn shmmr_dbg_consensus( /// list /// a list of a set of bytes representing the consensus sequences of all branches in the graph /// -#[pyfunction(seqs, w = 33, k = 33, r = 1, min_span = 0, min_cov = 2)] -#[pyo3(text_signature = "($self, seqs, w, k, r, min_span, min_cov)")] +#[pyfunction(signature = (seqs, w = 33, k = 33, r = 1, min_span = 0, min_cov = 2))] pub fn guided_shmmr_dbg_consensus( seqs: Vec>, w: u32, @@ -1939,8 +1929,7 @@ pub fn guided_shmmr_dbg_consensus( /// list /// a list of a set of bytes representing the consensus sequences of all branches in the graph /// -#[pyfunction(seqs, w = 33, k = 33, r = 1, min_span = 0, min_cov = 2)] -#[pyo3(text_signature = "($self, seqs, w, k, r, min_span, min_cov)")] +#[pyfunction(signature = (seqs, w = 33, k = 33, r = 1, min_span = 0, min_cov = 2))] pub fn shmmr_sparse_aln_consensus( seqs: Vec>, w: u32, From c0014255885892d80d112a8f1bfa8c64835f5b97 Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 01:35:58 +0000 Subject: [PATCH 08/16] fix types --- pgr-tk/src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pgr-tk/src/lib.rs b/pgr-tk/src/lib.rs index c7d066d..88e40fb 100644 --- a/pgr-tk/src/lib.rs +++ b/pgr-tk/src/lib.rs @@ -28,7 +28,7 @@ pub fn pgr_lib_version() -> PyResult { Ok(VERSION_STRING.to_string()) } -/// A class that stores pangenomics indices and sequences with multiple backend storage options (AGC, fasta file, memory) +/// A class that stores pangenome indices and sequences with multiple backend storage options (AGC, fasta file, memory) /// Large set of genomic sequences, a user should use AGC backend. A binary file provides the command ``pgr-mdb`` /// which can read an AGC to create the index file. 
For example, we can create the index files from an AGC file:: /// @@ -1750,12 +1750,12 @@ pub struct AlnMap { /// a python string of the reference sequence /// /// s1: int -/// a interger id for the target sequnece +/// a integer id for the target sequence /// /// Returns /// ------- /// list -/// a list of ``AlnSegement`` +/// a list of ``AlnSegment`` // #[pyfunction(aln_segs, s0, s1)] // #[pyo3(text_signature = "($self, aln_segs, s0, s1)")] // fn get_aln_map( @@ -1799,7 +1799,7 @@ pub struct AlnMap { // }) // } -/// Perform a navie de Bruijn graph consensus +/// Perform a naive de Bruijn graph consensus /// /// Parameters /// ---------- @@ -1810,7 +1810,7 @@ pub struct AlnMap { /// the size of kmers used for constructing the de Bruijn graph /// /// min_cov : int -/// to keep hyplotype specific consensus, if a kmer has coverage more or equal to min_cov, it will be kept +/// to keep haplotype specific consensus, if a kmer has coverage more or equal to min_cov, it will be kept /// /// Returns /// ------- From ddcd6c37669710c2e2b8f92e40755255ebdf019f Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 01:38:45 +0000 Subject: [PATCH 09/16] remove wrong directive --- pgr-tk/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/pgr-tk/src/lib.rs b/pgr-tk/src/lib.rs index 88e40fb..aefca6b 100644 --- a/pgr-tk/src/lib.rs +++ b/pgr-tk/src/lib.rs @@ -14,7 +14,6 @@ use pgr_db::fasta_io; use pyo3::exceptions; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; -#[cfg(feature = "with_agc")] use pyo3::wrap_pyfunction; use pyo3::Python; use rayon::prelude::*; From b9eda4137e13fd576571de45255e54b22c5444f8 Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 01:42:22 +0000 Subject: [PATCH 10/16] remove wrong compile directive --- pgr-tk/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/pgr-tk/src/lib.rs b/pgr-tk/src/lib.rs index aefca6b..0a5fc59 100644 --- a/pgr-tk/src/lib.rs +++ b/pgr-tk/src/lib.rs @@ -1,6 +1,5 @@ // src/lib.rs pub const VERSION_STRING: &'static str = env!("VERSION_STRING"); -#[cfg(feature = "with_agc")] use pgr_db::aln::{self, HitPair}; use pgr_db::graph_utils::{AdjList, ShmmrGraphNode}; use pgr_db::seq_db::{self, raw_query_fragment}; From b91dfc523fc91d77ac3a8fd18b70371a767e6dae Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 01:43:37 +0000 Subject: [PATCH 11/16] comment out WFA2 build --- build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build.sh b/build.sh index b928fab..aa4c612 100644 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ -pushd WFA2-lib -make all -popd +#pushd WFA2-lib +#make all +#popd rustup default stable cargo build -p pgr-db --release From 18f8820a0a94310137aa411be6bf4e8935b1edb6 Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 02:10:07 +0000 Subject: [PATCH 12/16] enable query from mmap files --- pgr-tk/src/lib.rs | 67 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/pgr-tk/src/lib.rs b/pgr-tk/src/lib.rs index 0a5fc59..d46d4ed 100644 --- a/pgr-tk/src/lib.rs +++ b/pgr-tk/src/lib.rs @@ -242,11 +242,32 @@ impl SeqIndexDB { &self, seq: Vec, ) -> PyResult)>> { - let shmmr_spec = &self.db_internal.shmmr_spec.as_ref().unwrap(); - let shmmr_to_frags = self.get_shmmr_map_internal().unwrap(); - let res: Vec<((u64, u64), (u32, u32, u8), Vec)> = - seq_db::raw_query_fragment(shmmr_to_frags, &seq, shmmr_spec); - Ok(res) + match self.db_internal.backend { + #[cfg(feature = "with_agc")] + Backend::AGC => { + let 
(frag_location_map, frag_map_file) = ( + &self.db_internal.agc_db.as_ref().unwrap().frag_location_map, + &self.db_internal.agc_db.as_ref().unwrap().frag_map_file, + ); + let shmmr_spec = self.db_internal.shmmr_spec.as_ref().unwrap().clone(); + Ok(pgr_db::seq_db::raw_query_fragment_from_mmap_midx(frag_location_map, frag_map_file, &seq, &shmmr_spec)) + }, + Backend::FRG => { + let (frag_location_map, frag_map_file) = ( + &self.db_internal.frg_db.as_ref().unwrap().frag_location_map, + &self.db_internal.frg_db.as_ref().unwrap().frag_map_file, + ); + let shmmr_spec = self.db_internal.shmmr_spec.as_ref().unwrap().clone(); + Ok(pgr_db::seq_db::raw_query_fragment_from_mmap_midx(frag_location_map, frag_map_file, &seq, &shmmr_spec)) + }, + _ => { + let shmmr_spec = &self.db_internal.shmmr_spec.as_ref().unwrap(); + let shmmr_to_frags = self.get_shmmr_map_internal().unwrap(); + let res: Vec<((u64, u64), (u32, u32, u8), Vec)> = + seq_db::raw_query_fragment(shmmr_to_frags, &seq, shmmr_spec); + Ok(res) + } + } } /// use a fragment of sequence to query the database to get all hits and sort it by the data base sequence id @@ -323,17 +344,31 @@ impl SeqIndexDB { max_count_target: Option, max_aln_span: Option, ) -> PyResult)>)>> { - Ok(self - .db_internal - .query_fragment_to_hps( - seq, - penalty, - max_count, - max_count_query, - max_count_target, - max_aln_span, - ) - .unwrap()) + match self.db_internal.backend { + Backend::AGC | Backend::FRG => Ok(self + .db_internal + .query_fragment_to_hps_from_mmap_file( + seq, + penalty, + max_count, + max_count_query, + max_count_target, + max_aln_span, + ) + .unwrap()), + Backend::MEMORY | Backend::FASTX => Ok(self + .db_internal + .query_fragment_to_hps( + seq, + penalty, + max_count, + max_count_query, + max_count_target, + max_aln_span, + ) + .unwrap()), + Backend::UNKNOWN => Ok(vec![]), + } } /// Given a sequence context, this function maps the specific positions in the context From 83d30c9897ca1a25282826b339e8f86cfc3435b8 Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 02:14:21 +0000 Subject: [PATCH 13/16] fix match statement for non-agc build --- pgr-tk/src/lib.rs | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/pgr-tk/src/lib.rs b/pgr-tk/src/lib.rs index d46d4ed..d918178 100644 --- a/pgr-tk/src/lib.rs +++ b/pgr-tk/src/lib.rs @@ -250,23 +250,34 @@ impl SeqIndexDB { &self.db_internal.agc_db.as_ref().unwrap().frag_map_file, ); let shmmr_spec = self.db_internal.shmmr_spec.as_ref().unwrap().clone(); - Ok(pgr_db::seq_db::raw_query_fragment_from_mmap_midx(frag_location_map, frag_map_file, &seq, &shmmr_spec)) - }, + Ok(pgr_db::seq_db::raw_query_fragment_from_mmap_midx( + frag_location_map, + frag_map_file, + &seq, + &shmmr_spec, + )) + } Backend::FRG => { let (frag_location_map, frag_map_file) = ( &self.db_internal.frg_db.as_ref().unwrap().frag_location_map, &self.db_internal.frg_db.as_ref().unwrap().frag_map_file, ); let shmmr_spec = self.db_internal.shmmr_spec.as_ref().unwrap().clone(); - Ok(pgr_db::seq_db::raw_query_fragment_from_mmap_midx(frag_location_map, frag_map_file, &seq, &shmmr_spec)) - }, - _ => { + Ok(pgr_db::seq_db::raw_query_fragment_from_mmap_midx( + frag_location_map, + frag_map_file, + &seq, + &shmmr_spec, + )) + } + Backend::MEMORY | Backend::FASTX => { let shmmr_spec = &self.db_internal.shmmr_spec.as_ref().unwrap(); let shmmr_to_frags = self.get_shmmr_map_internal().unwrap(); let res: Vec<((u64, u64), (u32, u32, u8), Vec)> = seq_db::raw_query_fragment(shmmr_to_frags, &seq, 
shmmr_spec); Ok(res) } + Backend::UNKNOWN => Ok(vec![]), } } @@ -345,7 +356,19 @@ impl SeqIndexDB { max_aln_span: Option, ) -> PyResult)>)>> { match self.db_internal.backend { - Backend::AGC | Backend::FRG => Ok(self + #[cfg(feature = "with_agc")] + Backend::AGC => Ok(self + .db_internal + .query_fragment_to_hps_from_mmap_file( + seq, + penalty, + max_count, + max_count_query, + max_count_target, + max_aln_span, + ) + .unwrap()), + Backend::FRG => Ok(self .db_internal .query_fragment_to_hps_from_mmap_file( seq, From f828d204cec55fffa59dea14f34895569c7f84ae Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 02:58:04 +0000 Subject: [PATCH 14/16] error handling when the frag_map is not in the memory --- pgr-tk/src/lib.rs | 74 ++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/pgr-tk/src/lib.rs b/pgr-tk/src/lib.rs index d918178..08678f4 100644 --- a/pgr-tk/src/lib.rs +++ b/pgr-tk/src/lib.rs @@ -2,7 +2,7 @@ pub const VERSION_STRING: &'static str = env!("VERSION_STRING"); use pgr_db::aln::{self, HitPair}; use pgr_db::graph_utils::{AdjList, ShmmrGraphNode}; -use pgr_db::seq_db::{self, raw_query_fragment}; +use pgr_db::seq_db; //use pgr_db::seqs2variants; use pgr_db::shmmrutils::{sequence_to_shmmrs, DeltaPoint, ShmmrSpec}; @@ -11,7 +11,6 @@ use pgr_db::agc_io; use pgr_db::fasta_io; use pyo3::exceptions; -use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::wrap_pyfunction; use pyo3::Python; @@ -302,9 +301,17 @@ impl SeqIndexDB { seq: Vec, ) -> PyResult>> { let shmmr_spec = &self.db_internal.shmmr_spec.as_ref().unwrap(); - let shmmr_to_frags = self.get_shmmr_map_internal().unwrap(); - let res = seq_db::get_match_positions_with_fragment(shmmr_to_frags, &seq, shmmr_spec); - Ok(res) + match self.db_internal.backend { + Backend::FASTX | Backend::MEMORY => { + let shmmr_to_frags = self.get_shmmr_map_internal().unwrap(); + let res = + seq_db::get_match_positions_with_fragment(shmmr_to_frags, &seq, shmmr_spec); + Ok(res) + } + _ => Err(exceptions::PyException::new_err( + "This method only support FASTX or MEMORY backend.", + )), + } } /// use a fragment of sequence to query the database to get all hits @@ -453,10 +460,9 @@ impl SeqIndexDB { max_count_target: Option, max_aln_span: Option, ) -> PyResult> { - let shmmr_spec = &self.db_internal.shmmr_spec.as_ref().unwrap(); - - let mut all_alns = if let Some(frag_map) = self.get_shmmr_map_internal() { - let raw_query_hits = raw_query_fragment(frag_map, &seq, shmmr_spec); + let shmmr_spec = self.db_internal.shmmr_spec.as_ref().unwrap(); + let mut all_alns = { + let raw_query_hits = self.query_fragment(seq.clone()).unwrap(); aln::query_fragment_to_hps( raw_query_hits, &seq, @@ -467,8 +473,6 @@ impl SeqIndexDB { max_count_target, max_aln_span, ) - } else { - return Err(PyValueError::new_err("fail to find an index")); }; // for reach position, we find the left_match and right_match shimmer pair that sandwiched the @@ -609,15 +613,17 @@ impl SeqIndexDB { /// int /// number of hits #[pyo3(text_signature = "($self, shmmr_pair)")] - pub fn get_shmmr_pair_count(&self, shmmr_pair: (u64, u64)) -> usize { + pub fn get_shmmr_pair_count(&self, shmmr_pair: (u64, u64)) -> PyResult { if let Some(shmmr_to_frags) = self.get_shmmr_map_internal() { if shmmr_to_frags.contains_key(&shmmr_pair) { - shmmr_to_frags.get(&shmmr_pair).unwrap().len() + Ok(shmmr_to_frags.get(&shmmr_pair).unwrap().len()) } else { - 0 + Ok(0) } } else { - 0 + Err(exceptions::PyException::new_err( + "This method only 
support FASTX or MEMORY backend.", + )) } } @@ -644,11 +650,13 @@ impl SeqIndexDB { &self, shmmr_pair: (u64, u64), max_unique_count: Option, - ) -> Vec<(String, usize)> { + ) -> PyResult> { let mut count = FxHashMap::::default(); let shmmr_to_frags = self.get_shmmr_map_internal(); if shmmr_to_frags.is_none() { - return vec![]; + return Err(exceptions::PyException::new_err( + "This method only support FASTX or MEMORY backend.", + )); }; let shmmr_to_frags = shmmr_to_frags.unwrap(); @@ -672,7 +680,8 @@ impl SeqIndexDB { .clone(); *count.entry(source).or_insert(0) += 1; }); - count + + let out = count .into_par_iter() .filter(|(_k, v)| { if let Some(muc) = max_unique_count { @@ -686,9 +695,10 @@ impl SeqIndexDB { } }) .map(|(k, v)| (k, v)) - .collect::>() + .collect::>(); + Ok(out) } else { - vec![] + Ok(vec![]) } } @@ -726,8 +736,13 @@ impl SeqIndexDB { pub fn get_shmmr_map(&self) -> PyResult { // very expansive as the Rust FxHashMap will be converted to Python's dictionary // maybe limit the size that can be converted to avoid OOM - let shmmr_to_frags = self.get_shmmr_map_internal(); - pyo3::Python::with_gil(|py| Ok(shmmr_to_frags.to_object(py))) + if let Some(shmmr_to_frags) = self.get_shmmr_map_internal() { + pyo3::Python::with_gil(|py| Ok(shmmr_to_frags.to_object(py))) + } else { + Err(exceptions::PyException::new_err( + "This method only support FASTX or MEMORY backend.", + )) + } } /// get the ``shmmr_pair`` to ``fragment_id`` map in Python as a list @@ -752,7 +767,9 @@ impl SeqIndexDB { .collect::>(); Ok(py_out) } else { - Err(PyValueError::new_err("index not found")) + Err(exceptions::PyException::new_err( + "This method only support FASTX or MEMORY backend.", + )) } } @@ -862,13 +879,15 @@ impl SeqIndexDB { &self, min_count: usize, keeps: Option>, - ) -> Vec<(u32, (u64, u64, u8), (u64, u64, u8))> { + ) -> PyResult> { let frag_map = self.get_shmmr_map_internal(); if frag_map.is_none() { - vec![] + return Err(exceptions::PyException::new_err( + "This method only support FASTX or MEMORY backend.", + )); } else { let frag_map = frag_map.unwrap(); - seq_db::frag_map_to_adj_list(frag_map, min_count, keeps) + let out = seq_db::frag_map_to_adj_list(frag_map, min_count, keeps) .iter() .map(|adj_pair| { ( @@ -877,7 +896,8 @@ impl SeqIndexDB { (adj_pair.2 .0, adj_pair.2 .1, adj_pair.2 .2), ) }) - .collect() + .collect(); + Ok(out) } } From c66ba52e8c4368956aa5d85de57c3253ba5b9a1c Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 03:08:45 +0000 Subject: [PATCH 15/16] fix typo --- pgr-tk/pgrtk/__init__.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pgr-tk/pgrtk/__init__.py b/pgr-tk/pgrtk/__init__.py index 8409ad1..dcb7e3f 100644 --- a/pgr-tk/pgrtk/__init__.py +++ b/pgr-tk/pgrtk/__init__.py @@ -5,7 +5,7 @@ Example ------- -This shows a simple example to query the pangenomics database:: +This shows a simple example to query the pangenome database:: import pgrtk @@ -128,7 +128,7 @@ def u8_to_string(u8): def query_sdb(seq_index_db, query_seq, - gap_penality_factor=0.25, + gap_penalty_factor=0.25, merge_range_tol=12, max_count=128, max_query_count=128, @@ -144,8 +144,8 @@ def query_sdb(seq_index_db, query_seq, query_seq : list of bytes a list of bytes representing the DNA sequence - gap_penality_factor : float - the gap penalty factor used in sparse dyanmic programming for finding the hits + gap_penalty_factor : float + the gap penalty factor used in sparse dynamic programming for finding the hits merge_range_tol : int a parameter used to merge the 
alignment ranges @@ -180,7 +180,7 @@ def query_sdb(seq_index_db, query_seq, """ r = seq_index_db.query_fragment_to_hps( query_seq, - gap_penality_factor, + gap_penalty_factor, max_count, max_query_count, max_target_count, @@ -333,7 +333,7 @@ def get_variant_calls(aln_segs, ref_bgn, ctg_bgn, rs0, cs0, strand): Parameters ---------- aln_segs : list of tuples - - a list of tuples of "alignment segments" generate by ``pgrtk.pgrtk.get_aln_segements()`` + - a list of tuples of "alignment segments" generate by ``pgrtk.pgrtk.get_aln_segments()`` - the "alignment segments" are a list of ``(ref_loc: SeqLocus, tgt_loc: SeqLocus, align_type: AlnSegType)``. The data structures is defined as following Rust structs:: @@ -552,7 +552,7 @@ def group_smps_by_principle_bundle_id(smps, len_cutoff=2500, merge_length=5000): Parameters ---------- len_cutoff: int - the length cutoff used for filtering small bundle segement + the length cutoff used for filtering small bundle segment merge_length: int the length determining if two bundles should be merged @@ -564,7 +564,7 @@ def group_smps_by_principle_bundle_id(smps, len_cutoff=2500, merge_length=5000): each element of the list SHIMMER is a tuple of `((shimmer0, shimmer1, pos0, pos1, direction), - bundle_id, direction_to_the_bundle, postion_in bundle)` + bundle_id, direction_to_the_bundle, position_in bundle)` """ pbid, pdirection = None, None From 2af0ef359d7ce18dd10ef1f44a43a7e6020e9203 Mon Sep 17 00:00:00 2001 From: Jason Chin Date: Fri, 5 May 2023 03:15:10 +0000 Subject: [PATCH 16/16] bump to v0.5.1 --- pgr-bin/Cargo.toml | 2 +- pgr-db/Cargo.toml | 2 +- pgr-tk/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgr-bin/Cargo.toml b/pgr-bin/Cargo.toml index 07d1d1a..bc3a31c 100644 --- a/pgr-bin/Cargo.toml +++ b/pgr-bin/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pgr-bin" -version = "0.5.0" +version = "0.5.1" edition = "2021" authors = ["Jason Chin "] diff --git a/pgr-db/Cargo.toml b/pgr-db/Cargo.toml index b2244da..ca4ad2a 100644 --- a/pgr-db/Cargo.toml +++ b/pgr-db/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pgr-db" -version = "0.5.0" +version = "0.5.1" edition = "2021" authors = ["Jason Chin "] build = "build.rs" diff --git a/pgr-tk/Cargo.toml b/pgr-tk/Cargo.toml index 6edf742..6a6c0f5 100644 --- a/pgr-tk/Cargo.toml +++ b/pgr-tk/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pgrtk" -version = "0.5.0" +version = "0.5.1" authors = ["Jason Chin "] edition = "2021"
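
The attribute rewrites in PATCH 07/16 track pyo3 0.18's consolidation of argument defaults into `#[pyo3(signature = ...)]`, replacing the removed `#[args(...)]` helper and most `text_signature` attributes, and exposing zero-argument accessors through `#[getter]`. A minimal sketch of the 0.18-style declarations, using an illustrative class and method names rather than the crate's own:

```rust
use pyo3::prelude::*;

/// Illustrative class; not part of pgr-tk.
#[pyclass]
struct Demo {
    min_span: u32,
}

#[pymethods]
impl Demo {
    // pyo3 0.18: defaults live in `signature = (...)`; the old
    // `#[args(...)]` attribute is no longer accepted.
    #[pyo3(signature = (filepath, w = 80, k = 56, r = 4, min_span = 64))]
    fn load(&mut self, filepath: String, w: u32, k: u32, r: u32, min_span: u32) -> PyResult<()> {
        self.min_span = min_span;
        let _ = (filepath, w, k, r); // body elided in this sketch
        Ok(())
    }

    // `#[getter]` exposes a zero-argument method as a Python property,
    // which is the pattern applied to get_seq_index / get_seq_info.
    #[getter]
    fn min_span(&self) -> u32 {
        self.min_span
    }
}
```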
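
PATCH 12/16 through PATCH 14/16 make the query paths dispatch on the storage backend: mmap-backed indices (FRG, and AGC when compiled in) use the file-backed fragment map, while operations that require the in-memory map return a Python exception instead of panicking or silently returning an empty result. A rough sketch of that dispatch shape, with hypothetical stand-in types in place of the crate's internals:

```rust
use pyo3::exceptions::PyException;
use pyo3::prelude::*;

// Hypothetical stand-ins for the crate's backend enum and fragment map;
// the real types live in pgr_db and are not reproduced here.
enum Backend {
    Memory,
    Fastx,
    Frg,
    Unknown,
}

struct Index {
    backend: Backend,
    frag_map: Option<Vec<u32>>, // placeholder for the in-memory frag map
}

impl Index {
    fn query(&self, seq: &[u8]) -> PyResult<Vec<u32>> {
        match self.backend {
            // In-memory backends read the frag map directly.
            Backend::Memory | Backend::Fastx => self
                .frag_map
                .as_ref()
                .cloned()
                .ok_or_else(|| PyException::new_err("fragment map is not loaded")),
            // mmap-backed indices take the file-backed query path instead of
            // touching the in-memory map.
            Backend::Frg => Ok(query_from_mmap(seq)),
            // Anything else surfaces as a catchable Python exception.
            Backend::Unknown => Err(PyException::new_err(
                "this method only supports FASTX or MEMORY backends",
            )),
        }
    }
}

fn query_from_mmap(_seq: &[u8]) -> Vec<u32> {
    Vec::new() // placeholder for the mmap-backed lookup
}
```

Returning `PyResult` from every branch keeps backend mismatches visible to Python callers as exceptions, which is the error-handling direction PATCH 14/16 applies to the in-memory-only methods.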