Skip to content

Commit

Permalink
modify write_FASTA for both GSequence and GSequences
Browse files Browse the repository at this point in the history
  • Loading branch information
Joseph Kuo committed Jun 6, 2024
1 parent 8b5ca23 commit e59cd97
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 19 deletions.
2 changes: 1 addition & 1 deletion docs/source/examples_bed.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ For example, you have a BED file for all the exons from hg38. Now you want to ge
exons = GRegions(name="exons", load="hg38_exons.bed")
exon_seqs = exons.get_GSequences(FASTA_file=FASTA_hg38)
exon_seqs.write_fasta(filename="hg38_exons.fasta")
exon_seqs.write_FASTA(filename="hg38_exons.fasta")
Get TSSs (Transcription Start Sites) and TTSs (Transcription Termination Sites) of genes in a BED file
Expand Down
1 change: 0 additions & 1 deletion genomkit/regions/gregions_intervaltree.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import random
from genomkit import GRegion
import copy
import numpy as np
from .io import load_BED_intervaltree
import os
Expand Down
5 changes: 5 additions & 0 deletions genomkit/sequences/gsequence.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import Counter
from .io import write_FASTA


class GSequence:
Expand Down Expand Up @@ -116,3 +117,7 @@ def slice_sequence(self, start, end):
seq = GSequence(sequence=self.sequence[start:end],
name=self.name, data=self.data)
return seq

def write_FASTA(
    self,
    filename: str,
    data: bool = False,
    gz: bool = False,
):
    """Write this sequence to a FASTA file.

    Thin wrapper that hands the instance to the module-level
    ``write_FASTA`` helper imported from ``.io``.

    :param filename: Path of the FASTA file to write.
    :param data: Forwarded to the helper; when true, the items of the
        sequence's ``data`` attribute are appended to the header line.
    :param gz: Forwarded to the helper; when true, the file is written
        through ``gzip`` in text mode.
    """
    # Inside the method body the bare name resolves to the module-level
    # function, not to this method (which is only reachable via ``self``).
    write_FASTA(
        input_object=self,
        filename=filename,
        data=data,
        gz=gz,
    )
2 changes: 1 addition & 1 deletion genomkit/sequences/gsequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def get_sequence(self, name, start, end):

def write_FASTA(self, filename: str, data: bool = False,
                gz: bool = False):
    """Write every sequence in this collection to a FASTA file.

    Delegates to the module-level ``write_FASTA`` helper, passing the
    collection through the ``input_object`` keyword. The helper's
    signature no longer has a ``seqs`` parameter, so the former
    ``seqs=self`` call is dropped rather than kept alongside this one.

    :param filename: Path of the FASTA file to write.
    :param data: Forwarded to the helper; when true, each sequence's
        ``data`` items are appended to its header line.
    :param gz: Forwarded to the helper; when true, the file is written
        through ``gzip`` in text mode.
    """
    write_FASTA(input_object=self, filename=filename, data=data, gz=gz)

def write_FASTQ(self, filename: str, data: bool = False,
gz: bool = True):
Expand Down
42 changes: 26 additions & 16 deletions genomkit/sequences/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,25 +105,35 @@ def load_FASTQ_from_file(file):
return res


def write_FASTA(input_object, filename: str, data: bool = False,
                gz: bool = False):
    """Write a ``GSequence`` or a ``GSequences`` collection as FASTA.

    :param input_object: A single ``GSequence`` or a ``GSequences``
        collection; for a collection every item of ``elements`` is
        written in order.
    :param filename: Path of the output file.
    :param data: When true, each record's ``data`` items are appended
        to its header line (see ``write_fasta_content``).
    :param gz: When true, the file is written with ``gzip.open`` in
        text mode; otherwise with the built-in ``open``.
    :raises TypeError: If ``input_object`` is neither a ``GSequence``
        nor a ``GSequences``. Previously such input was ignored
        silently and no file was written.
    """
    # Kept as a function-level import, as in the original code
    # (presumably to avoid a circular import with the package root --
    # TODO confirm).
    from genomkit import GSequence, GSequences

    # Normalise both supported inputs to one iterable of records so the
    # open/write logic below exists only once.
    if isinstance(input_object, GSequence):
        records = [input_object]
    elif isinstance(input_object, GSequences):
        records = input_object.elements
    else:
        raise TypeError(
            "write_FASTA expects a GSequence or GSequences, got "
            f"{type(input_object).__name__}"
        )

    # The type check runs before the file is opened, so unsupported
    # input never creates or truncates the target file.
    handle = gzip.open(filename, "wt") if gz else open(filename, "w")
    with handle as fasta_file:
        for seq in records:
            write_fasta_content(seq, fasta_file, data)


def write_fasta_content(seq, fasta_file, data: bool):
    """Write one sequence record to an open FASTA text handle.

    The header line is ``>name``. When ``data`` is true, a space and
    the space-joined items of ``seq.data`` follow the name. The
    sequence is then written in lines of at most 80 characters.

    :param seq: Object exposing ``name``, ``sequence`` and ``data``.
    :param fasta_file: Writable text-mode file object.
    :param data: Whether to append ``seq.data`` to the header line.
    """
    if data:
        # Coerce every item to str so non-string metadata (numbers,
        # etc.) no longer makes str.join raise, and treat a None data
        # attribute like an empty list. Output is unchanged for inputs
        # that already worked.
        items = seq.data if seq.data is not None else ()
        extras = " ".join(str(item) for item in items)
        fasta_file.write(f">{seq.name} {extras}\n")
    else:
        fasta_file.write(f">{seq.name}\n")

    residues = seq.sequence
    # Wrap the sequence at 80 characters per line; an empty sequence
    # produces no sequence lines at all.
    fasta_file.writelines(
        f"{residues[start:start + 80]}\n"
        for start in range(0, len(residues), 80)
    )


def write_FASTQ(seqs, filename: str, data: bool = False, gz: bool = True):
Expand Down

0 comments on commit e59cd97

Please sign in to comment.