update examples

chaochungkuo · Feb 20, 2024 · c0cbcbf · c0cbcbf
1 parent faab017
commit c0cbcbf
Show file tree

Hide file tree

Showing 5 changed files with 170 additions and 70 deletions.
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -55,4 +55,26 @@ Extract the genes by their biotypes from a GTF file
 - snRNA
 - miRNA
 
-.. code-block:: python
+.. code-block:: python
+
+    from genomkit import GAnnotation
+
+    gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
+    target_biotypes = ["protein_coding", "lncRNA", "snRNA", "miRNA"]
+    for biotype in target_biotypes:
+        genes = gtf.get_regions(element_type="gene",
+                                attribute="gene_type", value=biotype)
+        genes.write(filename="hg38_genes_"+biotype+".bed")
+
+Get the sequences in FASTA format from a BED file
+-------------------------------------------------
+
+For example, you have a BED file for all the exons from hg38. Now you want to get their sequences in FASTA format with care of the orientation of the strands.
+
+.. code-block:: python
+
+    from genomkit import GRegions
+
+    exons = GRegions(name="exons", load="hg38_exons.bed")
+    exon_seqs = exons.get_GSequences(FASTA_file=FASTA_hg38)
+    exon_seqs.write_fasta(filename="hg38_exons.fasta")
diff --git a/genomkit/regions/gregions.py b/genomkit/regions/gregions.py
@@ -4,6 +4,7 @@
 import numpy as np
 from .io import load_BED
 import os
+import sys
 
 
 ###########################################################################
@@ -691,100 +692,104 @@ def subtract(self, regions, whole_region: bool = False,
                 finished = 0
                 last_region = self.elements[-1]
 
-                for region in self.elements:
+                for r in self.elements:
                     if not finished:
-                        if region != last_region:
+                        if r != last_region:
                             # handling of duplicates within self
-                            if region.sequence == regions.elements[counts].sequence:
-                                if region.start == regions.elements[counts].start:
-                                    if region.end == regions.elements[counts].end:
+                            if r.sequence == regions.elements[counts].sequence:
+                                if r.start == regions.elements[counts].start:
+                                    if r.end == regions.elements[counts].end:
                                         # cur y  |-----|
                                         # region |-----|
                                         counts = counts + 1
                                         if counts == len(regions.elements):
                                             finished = 1
 
-                                    elif region.end < regions.elements[counts].end:
+                                    elif r.end < regions.elements[counts].end:
                                         # cur y    |------|
                                         # region   |---|
-                                        small_self.add(region)
+                                        small_self.add(r)
                                     else:
                                         # cur y   |----|
                                         # region  |------|
                                         loop = 1
-                                        while regions.elements[counts].end < region.end and loop:
+                                        while regions.elements[counts].end < r.end and loop:
                                             if counts < len(regions.elements) - 1:
                                                 counts = counts + 1
-                                                if regions.elements[counts].sequence > region.sequence:
-                                                    small_self.add(region)
+                                                if regions.elements[counts].sequence > r.sequence:
+                                                    small_self.add(r)
                                                     loop = 0
                                                 else:
                                                     # still the same chromosome as current region from self
-                                                    if regions.elements[counts].end == region.end:
+                                                    if regions.elements[counts].end == r.end:
                                                         # cur y  |-----|
                                                         # region |-----|
                                                         counts = counts + 1
                                                         if counts == len(regions.elements):
                                                             loop = 0
                                                             finished = 1
-                                                    elif regions.elements[counts].end > region.end:
+                                                    elif regions.elements[counts].end > r.end:
                                                         # cur y   |----|
                                                         # region  |---|
-                                                        small_self.add(region)
+                                                        small_self.add(r)
                                             else:
                                                 # reached the last region in y and this is not exactly mapping
-                                                small_self.add(region)
+                                                small_self.add(r)
                                                 loop = 0
-                                elif region.start < regions.elements[counts].start:
+                                elif r.start < regions.elements[counts].start:
                                     # cur y       |---| |        |----| |     |---|     |    |---|
                                     # region    |---|   |  |---|        |   |-----|     |   |------|
-                                    small_self.add(region)
+                                    small_self.add(r)
 
                                 else:
                                     # cur y     |---|        |  |---|      | |-----| |   |----|
                                     # region          |---|  |     |---|   |   |--|  |     |--|
                                     loop = 1
-                                    while regions.elements[counts].start < region.start and loop:
+                                    while regions.elements[counts].start < r.start and loop:
                                         if counts < len(regions.elements) - 1:
                                             counts = counts + 1
-                                            if regions.elements[counts].sequence > region.sequence:
-                                                small_self.add(region)
+                                            if regions.elements[counts].sequence > r.sequence:
+                                                small_self.add(r)
                                                 loop = 0
                                             else:
-                                                if regions.elements[counts].end == region.end:
+                                                if regions.elements[counts].end == r.end:
                                                     # cur y    |-----|
                                                     # region   |-----|
                                                     counts = counts + 1
                                                     if counts == len(regions.elements):
                                                         loop = 0
                                                         finished = 1
-                                                elif regions.elements[counts].start > region.start:
+                                                elif regions.elements[counts].start > r.start:
                                                     # cur y      |---| |        |----| |     |---|     |    |---|
                                                     # region   |---|   |  |---|        |   |-----|     |   |------|
-                                                    small_self.add(region)
+                                                    small_self.add(r)
                                         else:
                                             # reached the last region in y and this is not exactly mapping
-                                            small_self.add(region)
+                                            small_self.add(r)
                                             loop = 0
                                             finished = 1
 
-                            elif region.sequence > regions.elements[counts].sequence:
+                            elif r.sequence > regions.elements[counts].sequence:
                                 counts = counts + 1
                                 if counts == len(regions.elements):
                                     finished = 1
                             else:
-                                # region.sequence < target.sequences[counts].sequence:
-                                small_self.add(region)
+                                # region.sequence < 
+                                # target.sequences[counts].sequence:
+                                small_self.add(r)
                     else:
-                        # finished -> reached end of y, simply add all remaining regions of self that are not equal
-                        # to the last subtracted or a region that has already been added
-                        if region != last_region:
-                            small_self.add(region)
-                    last_region = region
+                        # finished -> reached end of y, simply add all
+                        # remaining regions of self that are not equal
+                        # to the last subtracted or a region that has
+                        # already been added
+                        if r != last_region:
+                            small_self.add(r)
+                    last_region = r
                 return small_self
 
             elif len(self.elements) == 1:
-                # GRS only contains 1 region, only check if this matches exactly with any region within y
+                # GRS only contains 1 region, only check if this matches
+                # exactly with any region within y
                 for target_region in regions.elements:
                     if target_region == self.elements[0]:
                         return GRegions("small_self")  # return empty GRS
@@ -846,29 +851,35 @@ def subtract(self, regions, whole_region: bool = False,
                                          name=s.name, orientation=s.orientation, data=s.data)
                             res.add(s1)
                             s = s2
-                            if j < last_j: j = j + 1
+                            if j < last_j:
+                                j = j + 1
                             cont_overlap = True
                             continue
                         else:
-                            s1 = GRegion(sequence=s.sequence, start=s.start, end=b[j].start,
-                                         name=s.name, orientation=s.orientation, data=s.data)
+                            s1 = GRegion(sequence=s.sequence, start=s.start,
+                                         end=b[j].start, name=s.name,
+                                         orientation=s.orientation,
+                                         data=s.data)
                             res.add(s1)
                             try:
                                 s = next(iter_a)
                                 j = pre_inter
                                 cont_overlap = False
                                 continue
-                            except:
+                            except StopIteration:
                                 cont_loop = False
 
                     elif s.end > b[j].end:
 
                         #     ------
                         # ------
-                        s2 = GRegion(sequence=s.sequence, start=b[j].end, end=s.end,
-                                     name=s.name, orientation=s.orientation, data=s.data)
+                        s2 = GRegion(sequence=s.sequence, start=b[j].end,
+                                     end=s.end, name=s.name,
+                                     orientation=s.orientation,
+                                     data=s.data)
                         s = s2
-                        if j < last_j: j = j + 1
+                        if j < last_j:
+                            j = j + 1
                         cont_overlap = True
                         continue
                     else:
@@ -878,14 +889,14 @@ def subtract(self, regions, whole_region: bool = False,
                         try:
                             s = next(iter_a)
                             j = pre_inter
-                        except:
+                        except StopIteration:
                             cont_loop = False
 
                     if j == last_j:
                         try:
                             s = next(iter_a)
                             j = pre_inter
-                        except:
+                        except StopIteration:
                             cont_loop = False
                     else:
                         j = j + 1
@@ -897,15 +908,15 @@ def subtract(self, regions, whole_region: bool = False,
                         s = next(iter_a)
                         j = pre_inter
                         cont_overlap = False
-                    except:
+                    except StopIteration:
                         cont_loop = False
 
                 elif s > b[j]:
                     if j == last_j:
                         res.add(s)
                         try:
                             s = next(iter_a)
-                        except:
+                        except StopIteration:
                             cont_loop = False
                     else:
                         j = j + 1
@@ -914,3 +925,22 @@ def subtract(self, regions, whole_region: bool = False,
                 self.elements = res.elements
             else:
                 return res
+
+    def get_GSequences(self, FASTA_file):
+        from genomkit import GSequences
+        if os.path.exists(FASTA_file):
+            fasta = GSequences(load=FASTA_file)
+        else:
+            print(FASTA_file + " is not found.")
+            sys.exit()
+        res = GSequences(name=self.name)
+        for region in self.elements:
+            seq = fasta.get_sequence(name=region.sequence,
+                                     start=region.start,
+                                     end=region.end)
+            seq.name = region.name
+            seq.data = region.data
+            if region.orientation == "-":
+                seq.reverse_complement()
+            res.add(seq)
+        return res
diff --git a/genomkit/sequences/gsequence.py b/genomkit/sequences/gsequence.py
@@ -102,3 +102,17 @@ def count_table(self):
         :rtype: dict
         """
         return Counter(self.sequence)
+
+    def slice_sequence(self, start, end):
+        """Return the sequence by the given start and end positions.
+
+        :param start: Start position
+        :type start: int
+        :param end: End position
+        :type end: int
+        :return: Sequence
+        :rtype: str
+        """
+        seq = GSequence(sequence=self.sequence[start:end],
+                        name=self.name, data=self.data)
+        return seq
diff --git a/genomkit/sequences/gsequences.py b/genomkit/sequences/gsequences.py
@@ -111,3 +111,30 @@ def count_table(self):
             for key, count in count_dict.items():
                 result[key] = result.get(key, 0) + count
         return result
+
+    def get_sequence(self, name, start, end):
+        """Return the sequence according to the given name, start and end.
+
+        :param name: Sequence name
+        :type name: str
+        :param start: Start position
+        :type start: int
+        :param end: End position
+        :type end: int
+        :return: GSequence
+        :rtype: GSequence
+        """
+        for seq in self.elements:
+            if seq.name == name:
+                return seq.slice_sequence(start, end)
+
+    def write_fasta(self, filename: str, data: bool = False):
+        with open(filename, "w") as fasta_file:
+            for seq in self.elements:
+                if data:
+                    fasta_file.write(">" + seq.name + " " +
+                                     " ".join(seq.data) + "\n")
+                else:
+                    fasta_file.write(f">{seq.name}\n")
+                for i in range(0, len(seq.sequence), 80):
+                    fasta_file.write(f"{seq.sequence[i:i+80]}\n")
diff --git a/genomkit/sequences/io.py b/genomkit/sequences/io.py
@@ -1,3 +1,6 @@
+from tqdm import tqdm
+
+
 ###########################################################################
 # IO functions
 ###########################################################################
@@ -14,32 +17,36 @@ def load_FASTA_from_file(file):
     res = GSequences()
     current_sequence_id = None
     current_sequence = ""
-    for line in file:
-        line = line.strip()
-        if line.startswith("#"):
-            continue
-        elif line.startswith(">"):
-            # If there was a previously stored sequence, store it
-            if current_sequence_id is not None:
-                infos = current_sequence_id.split()
-                name = infos[0]
-                data = infos[1:]
-                res.add(GSequence(sequence=current_sequence,
-                                  name=name, data=data))
-            # Extract the sequence ID
-            current_sequence_id = line[1:]
-            # Start a new sequence
-            current_sequence = ""
-        else:  # Sequence line
-            # Append the sequence line to the current sequence
-            current_sequence += line
-    # Store the last sequence
-    if current_sequence_id is not None:
-        infos = current_sequence_id.split()
-        name = infos[0]
-        data = infos[1:]
-        res.add(GSequence(sequence=current_sequence,
-                          name=name, data=data))
+    total_lines = sum(1 for line in file if not line.startswith("#"))
+    file.seek(0)  # Reset file pointer to the beginning
+    with tqdm(total=total_lines) as pbar:
+        for line in file:
+            line = line.strip()
+            if line.startswith("#"):
+                continue
+            elif line.startswith(">"):
+                # If there was a previously stored sequence, store it
+                if current_sequence_id is not None:
+                    infos = current_sequence_id.split()
+                    name = infos[0]
+                    data = infos[1:]
+                    res.add(GSequence(sequence=current_sequence,
+                                      name=name, data=data))
+                # Extract the sequence ID
+                current_sequence_id = line[1:]
+                # Start a new sequence
+                current_sequence = ""
+            else:  # Sequence line
+                # Append the sequence line to the current sequence
+                current_sequence += line
+            pbar.update(1)
+        # Store the last sequence
+        if current_sequence_id is not None:
+            infos = current_sequence_id.split()
+            name = infos[0]
+            data = infos[1:]
+            res.add(GSequence(sequence=current_sequence,
+                              name=name, data=data))
     return res