Skip to content

Commit

Permalink
update examples
Browse files Browse the repository at this point in the history
  • Loading branch information
chaochungkuo committed Feb 20, 2024
1 parent faab017 commit c0cbcbf
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 70 deletions.
24 changes: 23 additions & 1 deletion docs/source/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,26 @@ Extract the genes by their biotypes from a GTF file
- snRNA
- miRNA

.. code-block:: python
.. code-block:: python
from genomkit import GAnnotation
gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
target_biotypes = ["protein_coding", "lncRNA", "snRNA", "miRNA"]
for biotype in target_biotypes:
genes = gtf.get_regions(element_type="gene",
attribute="gene_type", value=biotype)
genes.write(filename="hg38_genes_"+biotype+".bed")
Get the sequences in FASTA format from a BED file
-------------------------------------------------

For example, you have a BED file for all the exons from hg38. Now you want to get their sequences in FASTA format with care of the orientation of the strands.

.. code-block:: python
from genomkit import GRegions
exons = GRegions(name="exons", load="hg38_exons.bed")
exon_seqs = exons.get_GSequences(FASTA_file=FASTA_hg38)
exon_seqs.write_fasta(filename="hg38_exons.fasta")
116 changes: 73 additions & 43 deletions genomkit/regions/gregions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
from .io import load_BED
import os
import sys


###########################################################################
Expand Down Expand Up @@ -691,100 +692,104 @@ def subtract(self, regions, whole_region: bool = False,
finished = 0
last_region = self.elements[-1]

for region in self.elements:
for r in self.elements:
if not finished:
if region != last_region:
if r != last_region:
# handling of duplicates within self
if region.sequence == regions.elements[counts].sequence:
if region.start == regions.elements[counts].start:
if region.end == regions.elements[counts].end:
if r.sequence == regions.elements[counts].sequence:
if r.start == regions.elements[counts].start:
if r.end == regions.elements[counts].end:
# cur y |-----|
# region |-----|
counts = counts + 1
if counts == len(regions.elements):
finished = 1

elif region.end < regions.elements[counts].end:
elif r.end < regions.elements[counts].end:
# cur y |------|
# region |---|
small_self.add(region)
small_self.add(r)
else:
# cur y |----|
# region |------|
loop = 1
while regions.elements[counts].end < region.end and loop:
while regions.elements[counts].end < r.end and loop:
if counts < len(regions.elements) - 1:
counts = counts + 1
if regions.elements[counts].sequence > region.sequence:
small_self.add(region)
if regions.elements[counts].sequence > r.sequence:
small_self.add(r)
loop = 0
else:
# still the same chromosome as current region from self
if regions.elements[counts].end == region.end:
if regions.elements[counts].end == r.end:
# cur y |-----|
# region |-----|
counts = counts + 1
if counts == len(regions.elements):
loop = 0
finished = 1
elif regions.elements[counts].end > region.end:
elif regions.elements[counts].end > r.end:
# cur y |----|
# region |---|
small_self.add(region)
small_self.add(r)
else:
# reached the last region in y and this is not exactly mapping
small_self.add(region)
small_self.add(r)
loop = 0
elif region.start < regions.elements[counts].start:
elif r.start < regions.elements[counts].start:
# cur y |---| | |----| | |---| | |---|
# region |---| | |---| | |-----| | |------|
small_self.add(region)
small_self.add(r)

else:
# cur y |---| | |---| | |-----| | |----|
# region |---| | |---| | |--| | |--|
loop = 1
while regions.elements[counts].start < region.start and loop:
while regions.elements[counts].start < r.start and loop:
if counts < len(regions.elements) - 1:
counts = counts + 1
if regions.elements[counts].sequence > region.sequence:
small_self.add(region)
if regions.elements[counts].sequence > r.sequence:
small_self.add(r)
loop = 0
else:
if regions.elements[counts].end == region.end:
if regions.elements[counts].end == r.end:
# cur y |-----|
# region |-----|
counts = counts + 1
if counts == len(regions.elements):
loop = 0
finished = 1
elif regions.elements[counts].start > region.start:
elif regions.elements[counts].start > r.start:
# cur y |---| | |----| | |---| | |---|
# region |---| | |---| | |-----| | |------|
small_self.add(region)
small_self.add(r)
else:
# reached the last region in y and this is not exactly mapping
small_self.add(region)
small_self.add(r)
loop = 0
finished = 1

elif region.sequence > regions.elements[counts].sequence:
elif r.sequence > regions.elements[counts].sequence:
counts = counts + 1
if counts == len(regions.elements):
finished = 1
else:
# region.sequence < target.sequences[counts].sequence:
small_self.add(region)
# region.sequence <
# target.sequences[counts].sequence:
small_self.add(r)
else:
# finished -> reached end of y, simply add all remaining regions of self that are not equal
# to the last subtracted or a region that has already been added
if region != last_region:
small_self.add(region)
last_region = region
# finished -> reached end of y, simply add all
# remaining regions of self that are not equal
# to the last subtracted or a region that has
# already been added
if r != last_region:
small_self.add(r)
last_region = r
return small_self

elif len(self.elements) == 1:
# GRS only contains 1 region, only check if this matches exactly with any region within y
# GRS only contains 1 region, only check if this matches
# exactly with any region within y
for target_region in regions.elements:
if target_region == self.elements[0]:
return GRegions("small_self") # return empty GRS
Expand Down Expand Up @@ -846,29 +851,35 @@ def subtract(self, regions, whole_region: bool = False,
name=s.name, orientation=s.orientation, data=s.data)
res.add(s1)
s = s2
if j < last_j: j = j + 1
if j < last_j:
j = j + 1
cont_overlap = True
continue
else:
s1 = GRegion(sequence=s.sequence, start=s.start, end=b[j].start,
name=s.name, orientation=s.orientation, data=s.data)
s1 = GRegion(sequence=s.sequence, start=s.start,
end=b[j].start, name=s.name,
orientation=s.orientation,
data=s.data)
res.add(s1)
try:
s = next(iter_a)
j = pre_inter
cont_overlap = False
continue
except:
except StopIteration:
cont_loop = False

elif s.end > b[j].end:

# ------
# ------
s2 = GRegion(sequence=s.sequence, start=b[j].end, end=s.end,
name=s.name, orientation=s.orientation, data=s.data)
s2 = GRegion(sequence=s.sequence, start=b[j].end,
end=s.end, name=s.name,
orientation=s.orientation,
data=s.data)
s = s2
if j < last_j: j = j + 1
if j < last_j:
j = j + 1
cont_overlap = True
continue
else:
Expand All @@ -878,14 +889,14 @@ def subtract(self, regions, whole_region: bool = False,
try:
s = next(iter_a)
j = pre_inter
except:
except StopIteration:
cont_loop = False

if j == last_j:
try:
s = next(iter_a)
j = pre_inter
except:
except StopIteration:
cont_loop = False
else:
j = j + 1
Expand All @@ -897,15 +908,15 @@ def subtract(self, regions, whole_region: bool = False,
s = next(iter_a)
j = pre_inter
cont_overlap = False
except:
except StopIteration:
cont_loop = False

elif s > b[j]:
if j == last_j:
res.add(s)
try:
s = next(iter_a)
except:
except StopIteration:
cont_loop = False
else:
j = j + 1
Expand All @@ -914,3 +925,22 @@ def subtract(self, regions, whole_region: bool = False,
self.elements = res.elements
else:
return res

def get_GSequences(self, FASTA_file):
from genomkit import GSequences
if os.path.exists(FASTA_file):
fasta = GSequences(load=FASTA_file)
else:
print(FASTA_file + " is not found.")
sys.exit()
res = GSequences(name=self.name)
for region in self.elements:
seq = fasta.get_sequence(name=region.sequence,
start=region.start,
end=region.end)
seq.name = region.name
seq.data = region.data
if region.orientation == "-":
seq.reverse_complement()
res.add(seq)
return res
14 changes: 14 additions & 0 deletions genomkit/sequences/gsequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,17 @@ def count_table(self):
:rtype: dict
"""
return Counter(self.sequence)

def slice_sequence(self, start, end):
"""Return the sequence by the given start and end positions.
:param start: Start position
:type start: int
:param end: End position
:type end: int
:return: Sequence
:rtype: str
"""
seq = GSequence(sequence=self.sequence[start:end],
name=self.name, data=self.data)
return seq
27 changes: 27 additions & 0 deletions genomkit/sequences/gsequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,30 @@ def count_table(self):
for key, count in count_dict.items():
result[key] = result.get(key, 0) + count
return result

def get_sequence(self, name, start, end):
"""Return the sequence according to the given name, start and end.
:param name: Sequence name
:type name: str
:param start: Start position
:type start: int
:param end: End position
:type end: int
:return: GSequence
:rtype: GSequence
"""
for seq in self.elements:
if seq.name == name:
return seq.slice_sequence(start, end)

def write_fasta(self, filename: str, data: bool = False):
with open(filename, "w") as fasta_file:
for seq in self.elements:
if data:
fasta_file.write(">" + seq.name + " " +
" ".join(seq.data) + "\n")
else:
fasta_file.write(f">{seq.name}\n")
for i in range(0, len(seq.sequence), 80):
fasta_file.write(f"{seq.sequence[i:i+80]}\n")
59 changes: 33 additions & 26 deletions genomkit/sequences/io.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from tqdm import tqdm


###########################################################################
# IO functions
###########################################################################
Expand All @@ -14,32 +17,36 @@ def load_FASTA_from_file(file):
res = GSequences()
current_sequence_id = None
current_sequence = ""
for line in file:
line = line.strip()
if line.startswith("#"):
continue
elif line.startswith(">"):
# If there was a previously stored sequence, store it
if current_sequence_id is not None:
infos = current_sequence_id.split()
name = infos[0]
data = infos[1:]
res.add(GSequence(sequence=current_sequence,
name=name, data=data))
# Extract the sequence ID
current_sequence_id = line[1:]
# Start a new sequence
current_sequence = ""
else: # Sequence line
# Append the sequence line to the current sequence
current_sequence += line
# Store the last sequence
if current_sequence_id is not None:
infos = current_sequence_id.split()
name = infos[0]
data = infos[1:]
res.add(GSequence(sequence=current_sequence,
name=name, data=data))
total_lines = sum(1 for line in file if not line.startswith("#"))
file.seek(0) # Reset file pointer to the beginning
with tqdm(total=total_lines) as pbar:
for line in file:
line = line.strip()
if line.startswith("#"):
continue
elif line.startswith(">"):
# If there was a previously stored sequence, store it
if current_sequence_id is not None:
infos = current_sequence_id.split()
name = infos[0]
data = infos[1:]
res.add(GSequence(sequence=current_sequence,
name=name, data=data))
# Extract the sequence ID
current_sequence_id = line[1:]
# Start a new sequence
current_sequence = ""
else: # Sequence line
# Append the sequence line to the current sequence
current_sequence += line
pbar.update(1)
# Store the last sequence
if current_sequence_id is not None:
infos = current_sequence_id.split()
name = infos[0]
data = infos[1:]
res.add(GSequence(sequence=current_sequence,
name=name, data=data))
return res


Expand Down

0 comments on commit c0cbcbf

Please sign in to comment.