Skip to content

Commit

Permalink
Merge pull request #675 from gregcaporaso/pyqi-lives-on
Browse files Browse the repository at this point in the history
pyqi/travis fix
  • Loading branch information
ebolyen committed Oct 21, 2015
2 parents 2c0f8d9 + 748806f commit 5f3405f
Show file tree
Hide file tree
Showing 5 changed files with 205 additions and 4 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ install:
- if [ ${USE_H5PY} ]; then conda install --yes -n env_name h5py>=2.2.0; fi
- if [ ${PYTHON_VERSION} = "2.7" ]; then conda install --yes -n env_name Sphinx=1.2.2; fi
- source activate env_name
- if [ ${PYTHON_VERSION} = "2.7" ]; then pip install pyqi; fi
- pip install coveralls
- pip install -e . --no-deps
script:
Expand Down
1 change: 1 addition & 0 deletions biom/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@ def cli():
import_module('biom.cli.table_normalizer')
import_module('biom.cli.table_head')
import_module('biom.cli.table_validator')
import_module('biom.cli.uc_processor')
85 changes: 85 additions & 0 deletions biom/cli/uc_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2011-2013, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import division

import click

from biom.cli import cli
from biom.cli.util import write_biom_table
from biom.parse import parse_uc
from biom.exception import TableException


@cli.command('from-uc')
@click.option('-i', '--input-fp', required=True,
              type=click.Path(exists=True, dir_okay=False),
              help='The input uc filepath.')
@click.option('-o', '--output-fp', default=None,
              type=click.Path(writable=True),
              help='The output BIOM filepath', required=False)
@click.option('--rep-set-fp', type=click.Path(exists=True, dir_okay=False),
              help="Fasta file containing representative sequences with "
                   "where sequences are labeled with OTU identifiers, and "
                   "description fields contain original sequence identifiers. "
                   "This output is created, for example, by vsearch with the "
                   "--relabel_sha1 --relabel_keep options.",
              required=False)
def from_uc(input_fp, output_fp, rep_set_fp):
    """Create a BIOM table from a vsearch/uclust/usearch uc file.

    Example usage:

    Simple BIOM creation:

    $ biom from-uc -i in.uc -o out.biom

    BIOM creation with OTU re-naming:

    $ biom from-uc -i in.uc -o out.biom --rep-set-fp rep-set.fna

    """
    # Local import: only needed here to choose a portable open() mode.
    import sys

    # Python 2 needs 'U' for universal-newline handling. Python 3 text mode
    # already translates newlines, and 'U' was removed from open() in 3.11.
    mode = 'U' if sys.version_info[0] < 3 else 'r'

    # try/finally (rather than ExitStack) keeps this Python 2 compatible while
    # guaranteeing both handles are closed even if parsing fails.
    input_f = open(input_fp, mode)
    try:
        rep_set_f = None
        if rep_set_fp is not None:
            rep_set_f = open(rep_set_fp, mode)
        try:
            table = _from_uc(input_f, rep_set_f)
        finally:
            if rep_set_f is not None:
                rep_set_f.close()
    finally:
        input_f.close()

    write_biom_table(table, 'hdf5', output_fp)


def _id_map_from_fasta(fasta_lines):
result = {}
for line in fasta_lines:
if line.startswith('>'):
try:
obs_id, seq_id = line.split()[:2]
except ValueError:
raise ValueError('Sequence identifiers in fasta file '
'must contain at least two space-'
'separated fields.')
result[seq_id] = obs_id[1:]
else:
pass
return result


def _from_uc(input_f, rep_set_f=None):
    """Build a table from uc lines, optionally renaming observations.

    Parameters
    ----------
    input_f : iterable of str
        Lines of the uc file, handed to ``parse_uc``.
    rep_set_f : iterable of str, optional
        Lines of a representative-sequence fasta file. When given, the
        observation ids of the parsed table are replaced using the mapping
        produced by ``_id_map_from_fasta``.

    Returns
    -------
    The table returned by ``parse_uc``, with observation ids updated in
    place when ``rep_set_f`` is provided.

    Raises
    ------
    ValueError
        If the strict id update raises ``TableException``.
    """
    table = parse_uc(input_f)

    # Nothing to rename without a representative set.
    if rep_set_f is None:
        return table

    seq_to_otu = _id_map_from_fasta(rep_set_f)
    try:
        table.update_ids(seq_to_otu, axis='observation', inplace=True,
                         strict=True)
    except TableException:
        raise ValueError('Not all sequence identifiers in the input BIOM '
                         'file are present in description fields in the '
                         'representative sequence fasta file.')
    return table
13 changes: 9 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# ----------------------------------------------------------------------------

import os
import sys
from setuptools import setup, find_packages
from setuptools.extension import Extension

Expand Down Expand Up @@ -78,6 +79,13 @@
from Cython.Build import cythonize
extensions = cythonize(extensions)

# Runtime dependencies required on every Python version.
install_requires = [
    "click",
    "numpy >= 1.3.0",
    "future >= 0.14.3",
    "scipy >= 0.13.0",
]

# HACK: for backward-compatibility with QIIME 1.9.x, pyqi must be installed
# on Python 2. pyqi is not used anymore in this project.
if sys.version_info[0] < 3:
    install_requires += ["pyqi"]

setup(name='biom-format',
version=__version__,
description='Biological Observation Matrix (BIOM) format',
Expand All @@ -93,10 +101,7 @@
include_package_data=True,
ext_modules=extensions,
include_dirs=[np.get_include()],
install_requires=["click",
"numpy >= 1.3.0",
"future >= 0.14.3",
"scipy >= 0.13.0"],
install_requires=install_requires,
extras_require={'test': ["nose >= 0.10.1", "flake8"],
'hdf5': ["h5py >= 2.2.0"]
},
Expand Down
109 changes: 109 additions & 0 deletions tests/test_cli/test_uc_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env python

# -----------------------------------------------------------------------------
# Copyright (c) 2011-2015, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------

import tempfile
from unittest import TestCase, main

import numpy as np

import biom
from biom.cli.uc_processor import _from_uc

class TestUcProcessor(TestCase):
    # Exercises _from_uc with and without a representative-set fasta, using
    # the module-level uc / rep_set fixture strings split into lines.

    def setUp(self):
        """Set up data for use in unit tests."""
        self.cmd = _from_uc
        # uc inputs
        self.uc_minimal = uc_minimal.split('\n')
        self.uc = uc.split('\n')
        # representative-set inputs
        self.rep_set = rep_set.split('\n')
        self.rep_set_no_mapping = rep_set_no_mapping.split('\n')
        self.rep_set_missing_id = rep_set_missing_id.split('\n')

    def test_basic(self):
        counts = np.array([[1.0]])
        exp = biom.Table(counts,
                         observation_ids=['f2_1539'],
                         sample_ids=['f2'])
        observed = self.cmd(self.uc_minimal)
        self.assertEqual(observed, exp)

    def test_basic_w_mapping(self):
        counts = np.array([[1.0]])
        exp = biom.Table(counts,
                         observation_ids=['otu1'],
                         sample_ids=['f2'])
        observed = self.cmd(self.uc_minimal, self.rep_set)
        self.assertEqual(observed, exp)

    def test_rep_set_no_mapping(self):
        # Fasta headers with a single field cannot provide a mapping.
        with self.assertRaises(ValueError):
            self.cmd(self.uc_minimal, self.rep_set_no_mapping)

    def test_rep_set_missing_id(self):
        # Fasta description ids that do not cover the uc sequence ids.
        with self.assertRaises(ValueError):
            self.cmd(self.uc_minimal, self.rep_set_missing_id)

    def test_uc(self):
        counts = np.array([[1.0, 1.0], [0.0, 1.0]])
        exp = biom.Table(counts,
                         observation_ids=['f2_1539', 'f3_1540'],
                         sample_ids=['f2', 'f3'])
        observed = self.cmd(self.uc)
        self.assertEqual(observed, exp)

    def test_uc_w_mapping(self):
        counts = np.array([[1.0, 1.0], [0.0, 1.0]])
        exp = biom.Table(counts,
                         observation_ids=['otu1', 'otu2'],
                         sample_ids=['f2', 'f3'])
        observed = self.cmd(self.uc, self.rep_set)
        self.assertEqual(observed, exp)

# uc fixture with the uclust comment header and a single seed (S) record for
# f2_1539.
# NOTE(review): the header describes records as tab-separated; confirm the
# field separators inside these literals are tabs in the checked-in file.
uc_minimal = """# uclust --input /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T/UclustExactMatchFilterrW47Ju.fasta --id 0.97 --tmpdir /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T --w 8 --stepwords 8 --usersort --maxaccepts 1 --stable_sort --maxrejects 8 --uc dn-otus/uclust_picked_otus/seqs_clusters.uc
# version=1.2.22
# Tab-separated fields:
# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
# For C and D types, PctId is average id with seed.
# QueryStart and SeedStart are zero-based relative to start of sequence.
# If minus strand, SeedStart is relative to reverse-complemented seed.
S 0 133 * * * * * f2_1539 *
"""

# uc fixture with two seed (S) records (f2_1539, f3_1540) and one hit (H)
# record whose query label is f3_42 and whose target label is f2_1539.
uc = """# uclust --input /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T/UclustExactMatchFilterrW47Ju.fasta --id 0.97 --tmpdir /var/folders/xq/0kh93ng53bs6zzk091w_bbsr0000gn/T --w 8 --stepwords 8 --usersort --maxaccepts 1 --stable_sort --maxrejects 8 --uc dn-otus/uclust_picked_otus/seqs_clusters.uc
# version=1.2.22
# Tab-separated fields:
# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
# For C and D types, PctId is average id with seed.
# QueryStart and SeedStart are zero-based relative to start of sequence.
# If minus strand, SeedStart is relative to reverse-complemented seed.
S 0 133 * * * * * f2_1539 *
S 0 133 * * * * * f3_1540 *
H 0 141 100.0 + 0 0 133M8D f3_42 f2_1539
"""

# Representative-set fasta whose headers pair an OTU id (otu1, otu2) with the
# seed sequence ids used in the uc fixtures (f2_1539, f3_1540).
rep_set = """>otu1 f2_1539
ACGT
>otu2 f3_1540
ACCT
"""

# Representative-set fasta whose headers have only one field, so no
# sequence-id mapping can be derived; the tests expect a ValueError.
rep_set_no_mapping = """>otu1
ACGT
>otu2
ACCT
"""

# Representative-set fasta whose description ids (f99_1539) do not match the
# sequence ids in the uc fixtures; the tests expect a ValueError.
rep_set_missing_id = """>otu1 f99_1539
ACGT
>otu2 f99_1539
ACCT
"""

if __name__ == '__main__':
main()

0 comments on commit 5f3405f

Please sign in to comment.