From e59cd97c31475f215002b9f10268e2ba160a6442 Mon Sep 17 00:00:00 2001 From: Joseph Kuo Date: Thu, 6 Jun 2024 13:45:22 +0200 Subject: [PATCH] modify write_FASTA for both GSequence and GSequences --- docs/source/examples_bed.rst | 2 +- genomkit/regions/gregions_intervaltree.py | 1 - genomkit/sequences/gsequence.py | 5 +++ genomkit/sequences/gsequences.py | 2 +- genomkit/sequences/io.py | 42 ++++++++++++++--------- 5 files changed, 33 insertions(+), 19 deletions(-) diff --git a/docs/source/examples_bed.rst b/docs/source/examples_bed.rst index 00904db..f356b83 100644 --- a/docs/source/examples_bed.rst +++ b/docs/source/examples_bed.rst @@ -17,7 +17,7 @@ For example, you have a BED file for all the exons from hg38. Now you want to ge exons = GRegions(name="exons", load="hg38_exons.bed") exon_seqs = exons.get_GSequences(FASTA_file=FASTA_hg38) - exon_seqs.write_fasta(filename="hg38_exons.fasta") + exon_seqs.write_FASTA(filename="hg38_exons.fasta") Get TSSs (Transcription Start Sites) and TTSs (Transcription Termination Sites) of genes in a BED file diff --git a/genomkit/regions/gregions_intervaltree.py b/genomkit/regions/gregions_intervaltree.py index 6b1fab3..0c6ed25 100644 --- a/genomkit/regions/gregions_intervaltree.py +++ b/genomkit/regions/gregions_intervaltree.py @@ -1,6 +1,5 @@ import random from genomkit import GRegion -import copy import numpy as np from .io import load_BED_intervaltree import os diff --git a/genomkit/sequences/gsequence.py b/genomkit/sequences/gsequence.py index 02c422d..5bd74f7 100644 --- a/genomkit/sequences/gsequence.py +++ b/genomkit/sequences/gsequence.py @@ -1,4 +1,5 @@ from collections import Counter +from .io import write_FASTA class GSequence: @@ -116,3 +117,7 @@ def slice_sequence(self, start, end): seq = GSequence(sequence=self.sequence[start:end], name=self.name, data=self.data) return seq + + def write_FASTA(self, filename: str, data: bool = False, + gz: bool = False): + write_FASTA(input_object=self, filename=filename, data=data, gz=gz) diff --git a/genomkit/sequences/gsequences.py b/genomkit/sequences/gsequences.py index 3faefe0..293d75d 100644 --- a/genomkit/sequences/gsequences.py +++ b/genomkit/sequences/gsequences.py @@ -131,7 +131,7 @@ def get_sequence(self, name, start, end): def write_FASTA(self, filename: str, data: bool = False, gz: bool = False): - write_FASTA(seqs=self, filename=filename, data=data, gz=gz) + write_FASTA(input_object=self, filename=filename, data=data, gz=gz) def write_FASTQ(self, filename: str, data: bool = False, gz: bool = True): diff --git a/genomkit/sequences/io.py b/genomkit/sequences/io.py index ffa1534..bd6de20 100644 --- a/genomkit/sequences/io.py +++ b/genomkit/sequences/io.py @@ -105,25 +105,35 @@ def load_FASTQ_from_file(file): return res -def write_FASTA(seqs, filename: str, data: bool = False, +def write_FASTA(input_object, filename: str, data: bool = False, gz: bool = False): + from genomkit import GSequence, GSequences if gz: - with gzip.open(filename, "wt") as fasta_file: - write_fasta_content(seqs, fasta_file, data) + if isinstance(input_object, GSequence): + with gzip.open(filename, "wt") as fasta_file: + write_fasta_content(input_object, fasta_file, data) + elif isinstance(input_object, GSequences): + with gzip.open(filename, "wt") as fasta_file: + for seq in input_object.elements: + write_fasta_content(seq, fasta_file, data) else: - with open(filename, "w") as fasta_file: - write_fasta_content(seqs, fasta_file, data) - - -def write_fasta_content(seqs, fasta_file, data: bool): - for seq in seqs.elements: - if data: - fasta_file.write(">" + seq.name + " " + - " ".join(seq.data) + "\n") - else: - fasta_file.write(f">{seq.name}\n") - for i in range(0, len(seq.sequence), 80): - fasta_file.write(f"{seq.sequence[i:i+80]}\n") + if isinstance(input_object, GSequence): + with open(filename, "w") as fasta_file: + write_fasta_content(input_object, fasta_file, data) + elif isinstance(input_object, GSequences): + with open(filename, "w") as fasta_file: + for seq in input_object.elements: + write_fasta_content(seq, fasta_file, data) + + +def write_fasta_content(seq, fasta_file, data: bool): + if data: + fasta_file.write(">" + seq.name + " " + + " ".join(seq.data) + "\n") + else: + fasta_file.write(f">{seq.name}\n") + for i in range(0, len(seq.sequence), 80): + fasta_file.write(f"{seq.sequence[i:i+80]}\n") def write_FASTQ(seqs, filename: str, data: bool = False, gz: bool = True):