From 0f470cd69e69a48643fa671f1ac3a1642102760e Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 16 Nov 2020 12:23:18 -0800 Subject: [PATCH] Normalize metadata on parse (#853) * TST: verify bytes -> utf-8 on parse * Force general parser to return utf-8 if presented with bytes * REL/DOC: note bugfix, demark version changes for release --- ChangeLog.md | 9 +++++++++ biom/__init__.py | 4 ++-- biom/err.py | 2 +- biom/exception.py | 4 ++-- biom/parse.py | 4 ++-- biom/table.py | 6 ++++-- biom/tests/test_table.py | 15 ++++++++++++++- biom/util.py | 6 +++--- doc/conf.py | 4 ++-- setup.py | 6 +++--- 10 files changed, 42 insertions(+), 18 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index a2b72ffc5..afc3d0f32 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,15 @@ BIOM-Format ChangeLog ===================== +biom 2.1.10 +----------- + +Bug fix, released on 16 November 2020. + +Bug fixes: + +* During deployment testing for QIIME 2 2020.11, it was observed that certain combinations of hdf5 or h5py dependencies can result in metadata strings parsing as ASCII rather than UTF-8. Parsing of BIOM-Format 2.1.0 files now normalizes metadata strings as UTF-8; see [PR #853](https://github.com/biocore/biom-format/pull/853). + biom 2.1.9 ---------- diff --git a/biom/__init__.py b/biom/__init__.py index e61570a13..7217299d4 100755 --- a/biom/__init__.py +++ b/biom/__init__.py @@ -41,7 +41,7 @@ """ # ---------------------------------------------------------------------------- -# Copyright (c) 2011-2017, The BIOM Format Development Team. +# Copyright (c) 2011-2020, The BIOM Format Development Team. # # Distributed under the terms of the Modified BSD License. 
# @@ -53,7 +53,7 @@ from .util import __format_version__, __version__ __author__ = "Daniel McDonald" -__copyright__ = "Copyright 2011-2017, The BIOM Format Development Team" +__copyright__ = "Copyright 2011-2020, The BIOM Format Development Team" __credits__ = ["Daniel McDonald", "Jai Ram Rideout", "Greg Caporaso", "Jose Clemente", "Justin Kuczynski", "Antonio Gonzalez", "Yoshiki Vazquez Baeza", "Jose Navas", "Adam Robbins-Pianka", diff --git a/biom/err.py b/biom/err.py index 7cd676c4b..0cbcf0af9 100644 --- a/biom/err.py +++ b/biom/err.py @@ -51,7 +51,7 @@ """ # ----------------------------------------------------------------------------- -# Copyright (c) 2011-2017, The BIOM Format Development Team. +# Copyright (c) 2011-2020, The BIOM Format Development Team. # # Distributed under the terms of the Modified BSD License. # diff --git a/biom/exception.py b/biom/exception.py index 0cbcb5fdd..0969ac44f 100644 --- a/biom/exception.py +++ b/biom/exception.py @@ -2,7 +2,7 @@ """Define BIOM exceptions""" # ----------------------------------------------------------------------------- -# Copyright (c) 2011-2017, The BIOM Format Development Team. +# Copyright (c) 2011-2020, The BIOM Format Development Team. # # Distributed under the terms of the Modified BSD License. # @@ -10,7 +10,7 @@ # ----------------------------------------------------------------------------- __author__ = "Daniel McDonald" -__copyright__ = "Copyright 2011-2017, The BIOM Format Development Team" +__copyright__ = "Copyright 2011-2020, The BIOM Format Development Team" __credits__ = ["Daniel McDonald", "Jai Ram Rideout", "Greg Caporaso", "Jose Clemente", "Justin Kuczynski"] __license__ = "BSD" diff --git a/biom/parse.py b/biom/parse.py index 67f869666..ad29f02c5 100644 --- a/biom/parse.py +++ b/biom/parse.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # ---------------------------------------------------------------------------- -# Copyright (c) 2011-2017, The BIOM Format Development Team. 
+# Copyright (c) 2011-2020, The BIOM Format Development Team. # # Distributed under the terms of the Modified BSD License. # @@ -23,7 +23,7 @@ __author__ = "Justin Kuczynski" -__copyright__ = "Copyright 2011-2017, The BIOM Format Development Team" +__copyright__ = "Copyright 2011-2020, The BIOM Format Development Team" __credits__ = ["Justin Kuczynski", "Daniel McDonald", "Greg Caporaso", "Jose Carlos Clemente Litran", "Adam Robbins-Pianka", "Jose Antonio Navas Molina"] diff --git a/biom/table.py b/biom/table.py index 264bbcc7e..33a4d7871 100644 --- a/biom/table.py +++ b/biom/table.py @@ -165,7 +165,7 @@ """ # ----------------------------------------------------------------------------- -# Copyright (c) 2011-2017, The BIOM Format Development Team. +# Copyright (c) 2011-2020, The BIOM Format Development Team. # # Distributed under the terms of the Modified BSD License. # @@ -212,7 +212,7 @@ __author__ = "Daniel McDonald" -__copyright__ = "Copyright 2011-2017, The BIOM Format Development Team" +__copyright__ = "Copyright 2011-2020, The BIOM Format Development Team" __credits__ = ["Daniel McDonald", "Jai Ram Rideout", "Greg Caporaso", "Jose Clemente", "Justin Kuczynski", "Adam Robbins-Pianka", "Joshua Shorenstein", "Jose Antonio Navas Molina", @@ -257,6 +257,8 @@ def _identify_bad_value(dtype, fields): def general_parser(x): + if isinstance(x, bytes): + x = x.decode('utf8') return x diff --git a/biom/tests/test_table.py b/biom/tests/test_table.py index 7c61222dd..dd0f41595 100644 --- a/biom/tests/test_table.py +++ b/biom/tests/test_table.py @@ -29,7 +29,7 @@ list_dict_to_sparse, dict_to_sparse, coo_arrays_to_sparse, list_list_to_sparse, nparray_to_sparse, list_sparse_to_sparse, - _identify_bad_value) + _identify_bad_value, general_parser) from biom.parse import parse_biom_table from biom.err import errstate @@ -615,6 +615,19 @@ def test_max_whole(self): obs = self.simple_derived.max('whole') npt.assert_equal(obs, exp) + def test_general_parser(self): + test_and_exp = 
[(b'foo', 'foo'), + ('foo', 'foo'), + (b'', ''), + ('', ''), + (b'10', '10'), + ('10', '10'), + (b'3.14', '3.14'), + ('3.14', '3.14')] + for test, exp in test_and_exp: + obs = general_parser(test) + self.assertEqual(obs, exp) + @npt.dec.skipif(HAVE_H5PY is False, msg='H5PY is not installed') def test_from_hdf5_non_hdf5_file_or_group(self): with self.assertRaises(ValueError): diff --git a/biom/util.py b/biom/util.py index 889f407e8..6f2695fbb 100644 --- a/biom/util.py +++ b/biom/util.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # ---------------------------------------------------------------------------- -# Copyright (c) 2011-2017, The BIOM Format Development Team. +# Copyright (c) 2011-2020, The BIOM Format Development Team. # # Distributed under the terms of the Modified BSD License. # @@ -37,7 +37,7 @@ from numpy import mean, median, min, max __author__ = "Daniel McDonald" -__copyright__ = "Copyright 2011-2017, The BIOM Format Development Team" +__copyright__ = "Copyright 2011-2020, The BIOM Format Development Team" __credits__ = ["Daniel McDonald", "Jai Ram Rideout", "Greg Caporaso", "Jose Clemente", "Justin Kuczynski", "Jorge Cañardo Alastuey"] __license__ = "BSD" @@ -45,7 +45,7 @@ __maintainer__ = "Daniel McDonald" __email__ = "daniel.mcdonald@colorado.edu" __format_version__ = (2, 1) -__version__ = "2.1.9" +__version__ = "2.1.10" def generate_subsamples(table, n, axis='sample', by_id=False): diff --git a/doc/conf.py b/doc/conf.py index 903a5de62..0a096d538 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -66,8 +66,8 @@ # built documents. # # The full version, including alpha/beta/rc tags. -version = "2.1.9" -release = "2.1.9" +version = "2.1.10" +release = "2.1.10" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/setup.py b/setup.py index a7a9bc1f4..3e22097f6 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # ---------------------------------------------------------------------------- -# Copyright (c) 2011-2017, The BIOM Format Development Team. +# Copyright (c) 2011-2020, The BIOM Format Development Team. # # Distributed under the terms of the Modified BSD License. # @@ -29,11 +29,11 @@ pass __author__ = "Daniel McDonald" -__copyright__ = "Copyright 2011-2017, The BIOM Format Development Team" +__copyright__ = "Copyright 2011-2020, The BIOM Format Development Team" __credits__ = ["Greg Caporaso", "Daniel McDonald", "Jose Clemente", "Jai Ram Rideout", "Jorge Cañardo Alastuey", "Michael Hall"] __license__ = "BSD" -__version__ = "2.1.9" +__version__ = "2.1.10" __maintainer__ = "Daniel McDonald" __email__ = "mcdonadt@colorado.edu"