Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Biopython update #157

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
fail-fast: false
matrix:
os: [macOS-latest, ubuntu-latest]
python-version: [3.7, 3.8]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
env:
CI_OS: ${{ matrix.os }}
PYVER: ${{ matrix.python-version }}
Expand Down
2 changes: 1 addition & 1 deletion devtools/conda-envs/test_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dependencies:
- tqdm
# Chimera reimplementation
- biotite
- biopython<=1.77
- biopython
# mmligner
- mmligner
# theseus
Expand Down
2 changes: 1 addition & 1 deletion devtools/conda-envs/user_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies:
- tqdm
# Chimera reimplementation
- biotite
- biopython<=1.77
- biopython
# mmligner
- mmligner
# theseus
Expand Down
37 changes: 29 additions & 8 deletions opencadd/structure/superposition/sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,27 @@ def sequence_alignment(seq1: str, seq2: str, matrix: str, gap: int, local: bool
return alignment[0]


def find_gap_character(seq):
"""
Finds the gap character of an alignment sequence

Args:
seq: str
aligned sequence

Returns:
gap character if found: str
None else
"""
if '-' in seq:
return '-'
elif '=' in seq:
return '='
else:
return ''
#raise ValueError("No standard gap character found!")


def fasta2select(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't the gap characters be removed explicitly now that BioPython doesn't recognize them?

From BioPython Documentation:

Another case where the alphabet was used was in declaring the gap character, by default - in the various Biopython sequence and alignment parsers. If you are using a different character, you will need to pass this to the Seq object .replace() method explicitly now:

# Old style
from Bio.Alphabet import generic_dna, Gapped
from Bio.Seq import Seq

my_dna = Seq("ACGT=TT", Gapped(generic_dna, "="))
print(my_dna.ungap())
# New style
from Bio.Seq import Seq

my_dna = Seq("ACGT=TT")
print(my_dna.replace("=", ""))

fastafilename,
ref_resids=None,
Expand Down Expand Up @@ -112,9 +133,8 @@ def fasta2select(
dictionary with 'reference' and 'mobile' selection string

"""
protein_gapped = Bio.Alphabet.Gapped(Bio.Alphabet.IUPAC.protein)
with open(fastafilename) as fasta:
alignment = Bio.AlignIO.read(fasta, "fasta", alphabet=protein_gapped)
alignment = Bio.AlignIO.read(fasta, "fasta")

nseq = len(alignment)
if nseq != 2:
Expand All @@ -127,7 +147,7 @@ def fasta2select(
if orig_resids[iseq] is None:
# build default: assume consecutive numbering of all
# residues in the alignment
GAP = a.seq.alphabet.gap_char
GAP = find_gap_character(a.seq)
length = len(a.seq) - a.seq.count(GAP)
orig_resids[iseq] = np.arange(1, length + 1)
else:
Expand All @@ -136,7 +156,7 @@ def fasta2select(
if orig_segids[iseq] is None:
# build default: assume consecutive numbering of all
# residues in the alignment
GAP = a.seq.alphabet.gap_char
GAP = find_gap_character(a.seq)
length = len(a.seq) - a.seq.count(GAP)
orig_segids[iseq] = np.full(length, "")
else:
Expand Down Expand Up @@ -175,7 +195,8 @@ def resid_factory(alignment, seq2resids):
t = np.zeros((nseq, alignment.get_alignment_length()), dtype=int)
s = np.zeros((nseq, alignment.get_alignment_length()), dtype=object)
for iseq, a in enumerate(alignment):
GAP = a.seq.alphabet.gap_char
print(a.seq)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Leftover from debugging?

GAP = find_gap_character(a.seq)
indices = np.cumsum(np.where(np.array(list(a.seq)) == GAP, 0, 1)) - 1
t[iseq, :] = seq2resids[iseq][0][indices]
s[iseq, :] = seq2resids[iseq][1][indices]
Expand All @@ -190,8 +211,8 @@ def resid(nseq, ipos, t=t, s=s):
res_list = [] # collect individual selection string

# should be the same for both seqs
GAP = alignment[0].seq.alphabet.gap_char
if GAP != alignment[1].seq.alphabet.gap_char:
GAP = find_gap_character(a.seq)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this (Line 214) be GAP = find_gap_character(alignment[0].seq) instead?
a is a loop variable from above, and is simply equal to the last value of the loop iteration, which seems quite random. Also, the original lines were:

GAP = alignment[0].seq.alphabet.gap_char
if GAP != alignment[1].seq.alphabet.gap_char:

and the second line has changed to

if GAP != find_gap_character(alignment[1].seq):

so it looks like the first line must also be:

GAP = find_gap_character(alignment[0].seq)

instead of:

GAP = find_gap_character(a.seq)

if GAP != find_gap_character(alignment[1].seq):
raise ValueError("Different gap characters in sequence 'target' and 'mobile'.")
for ipos in range(alignment.get_alignment_length()):
aligned = list(alignment[:, ipos])
Expand All @@ -204,4 +225,4 @@ def resid(nseq, ipos, t=t, s=s):

ref_selection = " or ".join(sel[0])
target_selection = " or ".join(sel[1])
return {"reference": ref_selection, "mobile": target_selection}
return {"reference": ref_selection, "mobile": target_selection}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add end-of-file newline

Loading