Merge pull request #675 from gregcaporaso/pyqi-lives-on

pyqi/travis fix
biocore · Oct 21, 2015 · 5f3405f · 5f3405f
2 parents 2c0f8d9 + 748806f
commit 5f3405f
Show file tree

Hide file tree

Showing 5 changed files with 205 additions and 4 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -18,6 +18,7 @@ install:
   - if [ ${USE_H5PY} ]; then conda install --yes -n env_name h5py>=2.2.0; fi
   - if [ ${PYTHON_VERSION} = "2.7" ]; then conda install --yes -n env_name Sphinx=1.2.2; fi
   - source activate env_name
+  - if [ ${PYTHON_VERSION} = "2.7" ]; then pip install pyqi; fi
   - pip install coveralls
   - pip install -e . --no-deps
 script:

diff --git a/biom/cli/__init__.py b/biom/cli/__init__.py
@@ -28,3 +28,4 @@ def cli():
 import_module('biom.cli.table_normalizer')
 import_module('biom.cli.table_head')
 import_module('biom.cli.table_validator')
+import_module('biom.cli.uc_processor')
diff --git a/biom/cli/uc_processor.py b/biom/cli/uc_processor.py
@@ -0,0 +1,85 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2011-2013, The BIOM Format Development Team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from __future__ import division
+
+import click
+
+from biom.cli import cli
+from biom.cli.util import write_biom_table
+from biom.parse import parse_uc
+from biom.exception import TableException
+
+
+@cli.command('from-uc')
+@click.option('-i', '--input-fp', required=True,
+              type=click.Path(exists=True, dir_okay=False),
+              help='The input uc filepath.')
+@click.option('-o', '--output-fp', default=None,
+              type=click.Path(writable=True),
+              help='The output BIOM filepath', required=False)
+@click.option('--rep-set-fp', type=click.Path(exists=True, dir_okay=False),
+              help="Fasta file containing representative sequences "
+                   "where sequences are labeled with OTU identifiers, and "
+                   "description fields contain original sequence identifiers. "
+                   "This output is created, for example, by vsearch with the "
+                   "--relabel_sha1 --relabel_keep options.",
+              required=False)
+def from_uc(input_fp, output_fp, rep_set_fp):
+    """Create a BIOM table from a vsearch/uclust/usearch uc file.
+
+    Example usage:
+
+    Simple BIOM creation:
+
+    $ biom from-uc -i in.uc -o out.biom
+
+    BIOM creation with OTU re-naming:
+
+    $ biom from-uc -i in.uc -o out.biom --rep-set-fp rep-set.fna
+
+    """
+    # The docstring above doubles as the command's --help text, so the
+    # parameter notes live here instead:
+    #   input_fp   -- path of the uc file to convert (must exist).
+    #   output_fp  -- path the table is written to, in hdf5 format.
+    #   rep_set_fp -- optional fasta file used to rename observations.
+    #
+    # Open the inputs with context managers so both handles are released as
+    # soon as the table has been built, including when parsing raises.
+    # NOTE(review): the 'U' (universal newlines) mode is kept unchanged for
+    # Python 2; it is deprecated on Python 3 and rejected from 3.11 on.
+    with open(input_fp, 'U') as input_f:
+        if rep_set_fp is None:
+            table = _from_uc(input_f, None)
+        else:
+            with open(rep_set_fp, 'U') as rep_set_f:
+                table = _from_uc(input_f, rep_set_f)
+    write_biom_table(table, 'hdf5', output_fp)
+
+
+def _id_map_from_fasta(fasta_lines):
+    """Map original sequence ids to OTU ids using fasta header lines.
+
+    Only lines starting with ``>`` are inspected. The first
+    whitespace-separated field (minus the leading ``>``) is the OTU id and
+    the second field is the original sequence id; any further fields are
+    ignored.
+
+    Parameters
+    ----------
+    fasta_lines : iterable of str
+        Lines of a fasta file.
+
+    Returns
+    -------
+    dict
+        ``{original_sequence_id: otu_id}``.
+
+    Raises
+    ------
+    ValueError
+        If a header line has fewer than two fields.
+    """
+    seq_to_otu = {}
+    for record in fasta_lines:
+        # Sequence data and blank lines carry no identifiers.
+        if not record.startswith('>'):
+            continue
+        try:
+            otu_label, original_id = record.split()[:2]
+        except ValueError:
+            raise ValueError('Sequence identifiers in fasta file '
+                             'must contain at least two space-'
+                             'separated fields.')
+        # Drop the leading '>' from the OTU label.
+        seq_to_otu[original_id] = otu_label[1:]
+    return seq_to_otu
+
+
+def _from_uc(input_f, rep_set_f=None):
+    """Build a table from uc lines, optionally renaming observations.
+
+    Parameters
+    ----------
+    input_f : iterable of str
+        Lines of the uc file; handed directly to ``parse_uc``.
+    rep_set_f : iterable of str, optional
+        Lines of a representative-sequence fasta file whose headers read
+        ``>OTU_ID ORIGINAL_SEQUENCE_ID``. When given, the table's
+        observation ids are replaced by the matching OTU ids.
+
+    Returns
+    -------
+    The table produced by ``parse_uc``; its observation ids are updated in
+    place when ``rep_set_f`` is provided.
+
+    Raises
+    ------
+    ValueError
+        If a fasta header has fewer than two fields, or if the strict id
+        update raises ``TableException`` (presumably because some
+        observation id has no entry in the fasta-derived map).
+    """
+    table = parse_uc(input_f)
+    if rep_set_f is None:
+        return table
+
+    obs_id_map = _id_map_from_fasta(rep_set_f)
+    try:
+        table.update_ids(obs_id_map, axis='observation', strict=True,
+                         inplace=True)
+    except TableException:
+        # The input here is a uc file, not a BIOM file; say so in the error.
+        raise ValueError('Not all sequence identifiers in the input uc '
+                         'file are present in description fields in the '
+                         'representative sequence fasta file.')
+    return table
diff --git a/setup.py b/setup.py
@@ -10,6 +10,7 @@
 # ----------------------------------------------------------------------------
 
 import os
+import sys
 from setuptools import setup, find_packages
 from setuptools.extension import Extension
 
@@ -78,6 +79,13 @@
     from Cython.Build import cythonize
     extensions = cythonize(extensions)
 
+# Runtime dependencies needed on every supported interpreter.
+install_requires = [
+    "click",
+    "numpy >= 1.3.0",
+    "future >= 0.14.3",
+    "scipy >= 0.13.0",
+]
+# HACK: pyqi is no longer used by this project, but it must stay installed
+# for backward-compatibility with QIIME 1.9.x, so it is added on Python 2.
+if sys.version_info[0] < 3:
+    install_requires.append("pyqi")
+
 setup(name='biom-format',
       version=__version__,
       description='Biological Observation Matrix (BIOM) format',
@@ -93,10 +101,7 @@
       include_package_data=True,
       ext_modules=extensions,
       include_dirs=[np.get_include()],
-      install_requires=["click",
-                        "numpy >= 1.3.0",
-                        "future >= 0.14.3",
-                        "scipy >= 0.13.0"],
+      install_requires=install_requires,
       extras_require={'test': ["nose >= 0.10.1", "flake8"],
                       'hdf5': ["h5py >= 2.2.0"]
                       },

diff --git a/tests/test_cli/test_uc_processor.py b/tests/test_cli/test_uc_processor.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+
+# -----------------------------------------------------------------------------
+# Copyright (c) 2011-2015, The BIOM Format Development Team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# -----------------------------------------------------------------------------
+
+import tempfile
+from unittest import TestCase, main
+
+import numpy as np
+
+import biom
+from biom.cli.uc_processor import _from_uc
+
+class TestUcProcessor(TestCase):
+    """Checks ``_from_uc`` against small in-memory uc and fasta fixtures."""
+
+    def setUp(self):
+        """Expose the converter and the fixtures as lists of lines."""
+        self.cmd = _from_uc
+        # The fixtures are module-level strings; the converter is fed the
+        # individual lines.
+        self.uc_minimal = uc_minimal.split('\n')
+        self.uc = uc.split('\n')
+        self.rep_set = rep_set.split('\n')
+        self.rep_set_no_mapping = rep_set_no_mapping.split('\n')
+        self.rep_set_missing_id = rep_set_missing_id.split('\n')
+
+    def test_basic(self):
+        """A single seed record gives a 1x1 table keyed by the seed id."""
+        expected = biom.Table(np.array([[1.0]]),
+                              observation_ids=['f2_1539'],
+                              sample_ids=['f2'])
+        self.assertEqual(self.cmd(self.uc_minimal), expected)
+
+    def test_basic_w_mapping(self):
+        """With a rep set, the seed id is replaced by its OTU id."""
+        expected = biom.Table(np.array([[1.0]]),
+                              observation_ids=['otu1'],
+                              sample_ids=['f2'])
+        self.assertEqual(self.cmd(self.uc_minimal, self.rep_set), expected)
+
+    def test_rep_set_no_mapping(self):
+        """Fasta headers without a description field are rejected."""
+        with self.assertRaises(ValueError):
+            self.cmd(self.uc_minimal, self.rep_set_no_mapping)
+
+    def test_rep_set_missing_id(self):
+        """A rep set that lacks the uc sequence ids is rejected."""
+        with self.assertRaises(ValueError):
+            self.cmd(self.uc_minimal, self.rep_set_missing_id)
+
+    def test_uc(self):
+        """Two seeds plus one hit give a 2x2 table."""
+        expected = biom.Table(np.array([[1.0, 1.0], [0.0, 1.0]]),
+                              observation_ids=['f2_1539', 'f3_1540'],
+                              sample_ids=['f2', 'f3'])
+        self.assertEqual(self.cmd(self.uc), expected)
+
+    def test_uc_w_mapping(self):
+        """Two seeds plus one hit, with observations renamed to OTU ids."""
+        expected = biom.Table(np.array([[1.0, 1.0], [0.0, 1.0]]),
+                              observation_ids=['otu1', 'otu2'],
+                              sample_ids=['f2', 'f3'])
+        self.assertEqual(self.cmd(self.uc, self.rep_set), expected)
+
+# Minimal uc fixture: the uclust header comments followed by a single seed
+# (S) record for sequence f2_1539. Record fields are tab-separated.
+uc_minimal = """# uclust --input /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T/UclustExactMatchFilterrW47Ju.fasta --id 0.97 --tmpdir /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T --w 8 --stepwords 8 --usersort --maxaccepts 1 --stable_sort --maxrejects 8 --uc dn-otus/uclust_picked_otus/seqs_clusters.uc
+# version=1.2.22
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+S	0	133	*	*	*	*	*	f2_1539	*
+"""
+
+# Larger uc fixture: two seed (S) records (f2_1539, f3_1540) and one hit (H)
+# record in which f3_42 matches the f2_1539 seed.
+uc = """# uclust --input /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T/UclustExactMatchFilterrW47Ju.fasta --id 0.97 --tmpdir /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T --w 8 --stepwords 8 --usersort --maxaccepts 1 --stable_sort --maxrejects 8 --uc dn-otus/uclust_picked_otus/seqs_clusters.uc
+# version=1.2.22
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+S	0	133	*	*	*	*	*	f2_1539	*
+S	0	133	*	*	*	*	*	f3_1540	*
+H	0	141	100.0	+	0	0	133M8D	f3_42	f2_1539
+"""
+
+# Representative-sequence fasta whose headers read ">OTU_ID SEQUENCE_ID";
+# the sequence ids match the seed records of the uc fixtures.
+rep_set = """>otu1 f2_1539
+ACGT
+>otu2 f3_1540
+ACCT
+"""
+
+# Headers carry only an OTU id and no description field, so no
+# sequence-id -> OTU-id mapping can be derived from them.
+rep_set_no_mapping = """>otu1
+ACGT
+>otu2
+ACCT
+"""
+
+# Headers are well-formed, but their description id (f99_1539) does not
+# appear in either uc fixture.
+rep_set_missing_id = """>otu1 f99_1539
+ACGT
+>otu2 f99_1539
+ACCT
+"""
+
+# Allow running this test module directly as a script.
+if __name__ == '__main__':
+    main()