-
Notifications
You must be signed in to change notification settings - Fork 94
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #675 from gregcaporaso/pyqi-lives-on
pyqi/travis fix
- Loading branch information
Showing
5 changed files
with
205 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2011-2013, The BIOM Format Development Team. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
|
||
from __future__ import division | ||
|
||
import click | ||
|
||
from biom.cli import cli | ||
from biom.cli.util import write_biom_table | ||
from biom.parse import parse_uc | ||
from biom.exception import TableException | ||
|
||
|
||
@cli.command('from-uc')
@click.option('-i', '--input-fp', required=True,
              type=click.Path(exists=True, dir_okay=False),
              help='The input uc filepath.')
@click.option('-o', '--output-fp', default=None,
              type=click.Path(writable=True),
              help='The output BIOM filepath', required=False)
@click.option('--rep-set-fp', type=click.Path(exists=True, dir_okay=False),
              help="Fasta file containing representative sequences with "
                   "where sequences are labeled with OTU identifiers, and "
                   "description fields contain original sequence identifiers. "
                   "This output is created, for example, by vsearch with the "
                   "--relabel_sha1 --relabel_keep options.",
              required=False)
def from_uc(input_fp, output_fp, rep_set_fp):
    """Create a BIOM table from a vsearch/uclust/usearch uc file.

    Example usage:

    Simple BIOM creation:

    $ biom from-uc -i in.uc -o out.biom

    BIOM creation with OTU re-naming:

    $ biom from-uc -i in.uc -o out.biom --rep-set-fp rep-set.fna
    """
    # NOTE: click renders the docstring above as this command's help text, so
    # implementation notes are kept in comments rather than in the docstring.
    #
    # input_fp   -- path to the uc file to convert.
    # output_fp  -- path the HDF5 BIOM table is written to.
    # rep_set_fp -- optional path to a fasta file used to rename observations;
    #               None when the option is not supplied.
    #
    # The inputs are opened with context managers so that the handles are
    # closed even when parsing raises. The legacy 'U' mode flag is not used:
    # it is deprecated and rejected by recent Python 3 releases, and text mode
    # on Python 3 already applies universal-newline handling.
    with open(input_fp) as input_f:
        if rep_set_fp is None:
            table = _from_uc(input_f)
        else:
            with open(rep_set_fp) as rep_set_f:
                table = _from_uc(input_f, rep_set_f)
    write_biom_table(table, 'hdf5', output_fp)
|
||
|
||
def _id_map_from_fasta(fasta_lines): | ||
result = {} | ||
for line in fasta_lines: | ||
if line.startswith('>'): | ||
try: | ||
obs_id, seq_id = line.split()[:2] | ||
except ValueError: | ||
raise ValueError('Sequence identifiers in fasta file ' | ||
'must contain at least two space-' | ||
'separated fields.') | ||
result[seq_id] = obs_id[1:] | ||
else: | ||
pass | ||
return result | ||
|
||
|
||
def _from_uc(input_f, rep_set_f=None):
    """Build a table from uc-formatted input, optionally renaming OTUs.

    Parameters
    ----------
    input_f : iterable of str
        Lines of the uc file; handed unchanged to ``parse_uc``.
    rep_set_f : iterable of str, optional
        Lines of a representative-sequence fasta file whose headers hold an
        OTU identifier followed by the original sequence identifier. When
        given, observation ids in the table are replaced by the matching OTU
        identifiers.

    Returns
    -------
    The table produced by ``parse_uc``, with its observation ids updated in
    place when ``rep_set_f`` is provided.

    Raises
    ------
    ValueError
        If ``rep_set_f`` is malformed (see ``_id_map_from_fasta``), or if the
        id update is rejected with a ``TableException``.
    """
    table = parse_uc(input_f)

    # Nothing to rename without a representative set.
    if rep_set_f is None:
        return table

    obs_id_map = _id_map_from_fasta(rep_set_f)
    try:
        # NOTE(review): strict=True is presumably what makes update_ids raise
        # when an observation id has no entry in the map -- confirm against
        # the Table.update_ids documentation.
        table.update_ids(obs_id_map, axis='observation', strict=True,
                         inplace=True)
    except TableException:
        # The input here is a uc file, not a BIOM file, so the message names
        # it as such to avoid sending users to the wrong file.
        raise ValueError('Not all sequence identifiers in the input uc '
                         'file are present in description fields in the '
                         'representative sequence fasta file.')

    return table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
#!/usr/bin/env python | ||
|
||
# ----------------------------------------------------------------------------- | ||
# Copyright (c) 2011-2015, The BIOM Format Development Team. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
# ----------------------------------------------------------------------------- | ||
|
||
import tempfile | ||
from unittest import TestCase, main | ||
|
||
import numpy as np | ||
|
||
import biom | ||
from biom.cli.uc_processor import _from_uc | ||
|
||
class TestUcProcessor(TestCase):
    """Exercise ``_from_uc`` against small in-memory uc and fasta fixtures."""

    def setUp(self):
        """Expose the function under test and the fixtures as line lists."""
        self.cmd = _from_uc
        # Each module-level fixture string is split on newlines so that it is
        # handed to the function as a list of lines.
        self.uc_minimal = uc_minimal.split('\n')
        self.uc = uc.split('\n')
        self.rep_set = rep_set.split('\n')
        self.rep_set_no_mapping = rep_set_no_mapping.split('\n')
        self.rep_set_missing_id = rep_set_missing_id.split('\n')

    def test_basic(self):
        """Minimal uc input without a representative set."""
        want = biom.Table(np.array([[1.0]]),
                          observation_ids=['f2_1539'],
                          sample_ids=['f2'])
        got = self.cmd(self.uc_minimal)
        self.assertEqual(got, want)

    def test_basic_w_mapping(self):
        """Minimal uc input with observations renamed via the rep set."""
        want = biom.Table(np.array([[1.0]]),
                          observation_ids=['otu1'],
                          sample_ids=['f2'])
        got = self.cmd(self.uc_minimal, self.rep_set)
        self.assertEqual(got, want)

    def test_rep_set_no_mapping(self):
        """Fasta headers with a single field are rejected."""
        with self.assertRaises(ValueError):
            self.cmd(self.uc_minimal, self.rep_set_no_mapping)

    def test_rep_set_missing_id(self):
        """Fasta descriptions that do not cover the uc ids are rejected."""
        with self.assertRaises(ValueError):
            self.cmd(self.uc_minimal, self.rep_set_missing_id)

    def test_uc(self):
        """Two seeds plus one hit, without a representative set."""
        counts = np.array([[1.0, 1.0], [0.0, 1.0]])
        want = biom.Table(counts,
                          observation_ids=['f2_1539', 'f3_1540'],
                          sample_ids=['f2', 'f3'])
        got = self.cmd(self.uc)
        self.assertEqual(got, want)

    def test_uc_w_mapping(self):
        """Two seeds plus one hit, with observations renamed."""
        counts = np.array([[1.0, 1.0], [0.0, 1.0]])
        want = biom.Table(counts,
                          observation_ids=['otu1', 'otu2'],
                          sample_ids=['f2', 'f3'])
        got = self.cmd(self.uc, self.rep_set)
        self.assertEqual(got, want)
|
||
# Minimal uc fixture: the uclust header comments followed by a single
# S (NewSeed) record whose query label is f2_1539.
uc_minimal = """# uclust --input /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T/UclustExactMatchFilterrW47Ju.fasta --id 0.97 --tmpdir /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T --w 8 --stepwords 8 --usersort --maxaccepts 1 --stable_sort --maxrejects 8 --uc dn-otus/uclust_picked_otus/seqs_clusters.uc | ||
# version=1.2.22 | ||
# Tab-separated fields: | ||
# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel | ||
# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit | ||
# For C and D types, PctId is average id with seed. | ||
# QueryStart and SeedStart are zero-based relative to start of sequence. | ||
# If minus strand, SeedStart is relative to reverse-complemented seed. | ||
S 0 133 * * * * * f2_1539 * | ||
""" | ||
|
||
# Larger uc fixture: two S (NewSeed) records (f2_1539 and f3_1540) and one
# H (Hit) record with query label f3_42 and target label f2_1539.
uc = """# uclust --input /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T/UclustExactMatchFilterrW47Ju.fasta --id 0.97 --tmpdir /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T --w 8 --stepwords 8 --usersort --maxaccepts 1 --stable_sort --maxrejects 8 --uc dn-otus/uclust_picked_otus/seqs_clusters.uc | ||
# version=1.2.22 | ||
# Tab-separated fields: | ||
# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel | ||
# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit | ||
# For C and D types, PctId is average id with seed. | ||
# QueryStart and SeedStart are zero-based relative to start of sequence. | ||
# If minus strand, SeedStart is relative to reverse-complemented seed. | ||
S 0 133 * * * * * f2_1539 * | ||
S 0 133 * * * * * f3_1540 * | ||
H 0 141 100.0 + 0 0 133M8D f3_42 f2_1539 | ||
""" | ||
|
||
# Representative-set fasta fixture: each header is an OTU label followed by
# the original sequence identifier used in the uc fixtures.
rep_set = """>otu1 f2_1539 | ||
ACGT | ||
>otu2 f3_1540 | ||
ACCT | ||
""" | ||
|
||
# Fasta fixture whose headers carry only an OTU label (no second field);
# used by the test that expects a ValueError.
rep_set_no_mapping = """>otu1 | ||
ACGT | ||
>otu2 | ||
ACCT | ||
""" | ||
|
||
# Fasta fixture whose description field (f99_1539) matches none of the
# sequence identifiers in the uc fixtures; used by the test that expects a
# ValueError.
rep_set_missing_id = """>otu1 f99_1539 | ||
ACGT | ||
>otu2 f99_1539 | ||
ACCT | ||
""" | ||
|
||
# Run the unittest entry point when this module is executed as a script.
if __name__ == '__main__': | ||
main() |