diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..1c1218d --- /dev/null +++ b/.flake8 @@ -0,0 +1,13 @@ +[flake8] +ignore = E129,E133,E203,E221,E241,E251,E303,E266,H106,H904,W291 +max-line-length = 100 +max-complexity = 15 +hang-closing = true +exclude = + .eggs + .tox + build + dist + docs/conf.py + tests/* + **/_data/migrations/* diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 9f1819a..a2ed63d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -4,42 +4,42 @@ on: push: jobs: - # cqa: - # runs-on: ubuntu-latest - - # steps: - # - uses: actions/checkout@v3 - - # - name: Set up Python - # uses: actions/setup-python@v4 - # with: - # python-version: "3.10" - # cache: pip - # cache-dependency-path: '**/setup.cfg' - - # - name: Install test dependencies - # run: | - # python -m pip install --upgrade pip - # pip install --use-deprecated=legacy-resolver -e .[dev] - - # - name: Lint with flake8 - # run: | - # # stop the build if there are Python syntax errors or undefined names - # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - # - name: Format check with isort - # run: | - # isort --check src - - # - name: Format check with black - # run: | - # black --check src - - # - name: Security check with bandit - # run: | - # bandit -ll -r src + cqa: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + cache: pip + cache-dependency-path: '**/setup.cfg' + + - name: Install test dependencies + run: | + python -m pip install --upgrade pip + pip install --use-deprecated=legacy-resolver -e .[dev] + + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Format check with isort + run: | + isort --check src + + - name: Format check with black + run: | + black --check src + + - name: Security check with bandit + run: | + bandit -ll -r src test: runs-on: ubuntu-latest @@ -71,7 +71,7 @@ jobs: - name: Test with pytest run: | make test - + deploy: needs: # - cqa diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..384b910 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,23 @@ +repos: +- repo: local + hooks: + - id: flake8 + name: flake8 + entry: flake8 + language: system + types: [python] + - id: pyright + name: pyright + entry: pyright + language: system + types: [python] + - id: isort + name: isort + entry: isort + language: system + types: [python] + - id: black + name: black + entry: black + language: system + types: [python] diff --git a/Makefile b/Makefile index 0e53256..8464cfa 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ .SUFFIXES: -SHELL:=bash -e -o pipefail -O globstar +SHELL:=bash -e -o pipefail # -O globstar SELF:=$(firstword $(MAKEFILE_LIST)) @@ -47,6 +47,7 @@ ${VE_DIR}: .PHONY: develop develop: pip install -e .[dev] + pre-commit install #=> install: install package .PHONY: install @@ -91,6 +92,18 @@ test-%: tox: tox +#=> cqa: execute code quality tests +cqa: + flake8 src --show-source --statistics + pyright + isort --check src --profile black + black --check src + bandit -ll -r src + +#=> reformat: reformat code +.PHONY: reformat +reformat: + pre-commit ############################################################################ #= UTILITY TARGETS @@ -144,13 +157,13 @@ distclean: cleanest ## ## Copyright 2023 Source Code Committers -## +## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. ## You may obtain a copy of the License at -## +## ## http://www.apache.org/licenses/LICENSE-2.0 -## +## ## Unless required by applicable law or agreed to in writing, software ## distributed under the License is distributed on an "AS IS" BASIS, ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/README.md b/README.md index 36065db..95e6b38 100644 --- a/README.md +++ b/README.md @@ -124,11 +124,11 @@ tested. Patches to get other systems working would be welcomed. $ pip install seqrepo $ sudo mkdir -p /usr/local/share/seqrepo $ sudo chown $USER /usr/local/share/seqrepo - $ seqrepo pull -i 2018-11-26 - $ seqrepo show-status -i 2018-11-26 + $ seqrepo pull -i 2018-11-26 + $ seqrepo show-status -i 2018-11-26 seqrepo 0.2.3.post3.dev8+nb8298bd62283 root directory: /usr/local/share/seqrepo/2018-11-26, 7.9 GB - backends: fastadir (schema 1), seqaliasdb (schema 1) + backends: fastadir (schema 1), seqaliasdb (schema 1) sequences: 773587 sequences, 93051609959 residues, 192 files aliases: 5579572 aliases, 5480085 current, 26 namespaces, 773587 sequences @@ -187,6 +187,15 @@ Here's how to get started developing: source venv/bin/activate seqrepo --version +Code reformatting: + + make reformat + +Install pre-commit hook: + + # included in `make devready`, not necessary for new installations + pre-commit install + ## Building a docker image Docker images are available at https://hub.docker.com/r/biocommons/seqrepo. diff --git a/pyproject.toml b/pyproject.toml index 58a4b93..e6cc30c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,11 +124,6 @@ line-length = 100 profile = "black" src_paths = ["src", "tests"] -# [tool.flake8] -# flake8 does not support configuration in pyproject.toml -# https://github.com/PyCQA/flake8/issues/234#issuecomment-812800832 -# The config in setup.cfg - [tool.pyright] include = ["src", "tests"] diff --git a/src/biocommons/seqrepo/_internal/logging_support.py b/src/biocommons/seqrepo/_internal/logging_support.py index 9dc7466..6c0f276 100644 --- a/src/biocommons/seqrepo/_internal/logging_support.py +++ b/src/biocommons/seqrepo/_internal/logging_support.py @@ -1,6 +1,3 @@ -import logging - - class DuplicateFilter: """ Filters away duplicate log messages. diff --git a/src/biocommons/seqrepo/_versionwarning.py b/src/biocommons/seqrepo/_versionwarning.py index f147926..53b5bc6 100644 --- a/src/biocommons/seqrepo/_versionwarning.py +++ b/src/biocommons/seqrepo/_versionwarning.py @@ -1,20 +1,13 @@ -"""emits a warning when imported under Python < 3.6 - -This module may be used by other biocommons packages - -""" +"""emits a warning when imported under Python < 3.9""" import logging import sys __all__ = [] -version_warning = ( - "biocommons packages are tested and supported only on Python >= 3.6" - " (https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6)" -) +version_warning = "This package is tested and supported only on Python >= 3.9." _logger = logging.getLogger(__package__) -if sys.version_info < (3, 6): +if sys.version_info < (3, 9): _logger.warning(version_warning) diff --git a/src/biocommons/seqrepo/cli.py b/src/biocommons/seqrepo/cli.py index b505aa8..37484a2 100644 --- a/src/biocommons/seqrepo/cli.py +++ b/src/biocommons/seqrepo/cli.py @@ -20,7 +20,6 @@ import itertools import logging import os -import pprint import re import shutil import stat @@ -30,7 +29,6 @@ import bioutils.assemblies import bioutils.seqfetcher -import six import tqdm from . import SeqRepo, __version__ @@ -62,7 +60,7 @@ def _get_remote_instances(opts): ] _logger.debug("Executing `" + " ".join(rsync_cmd) + "`") lines = subprocess.check_output(rsync_cmd).decode().splitlines()[1:] - dirs = (m.group(1) for m in (line_re.match(l) for l in lines) if m) + dirs = (m.group(1) for m in (line_re.match(line) for line in lines) if m) return sorted(list(filter(instance_name_new_re.match, dirs))) @@ -81,12 +79,14 @@ def _latest_instance_path(opts): def parse_arguments(): + epilog = ( + f"seqrepo {__version__}" + "See https://github.com/biocommons/biocommons.seqrepo for more information" + ) top_p = argparse.ArgumentParser( description=__doc__.split("\n\n")[0], formatter_class=argparse.ArgumentDefaultsHelpFormatter, - epilog="seqrepo " - + __version__ - + ". See https://github.com/biocommons/biocommons.seqrepo for more information", + epilog=epilog, ) top_p.add_argument("--dry-run", "-n", default=False, action="store_true") top_p.add_argument("--remote-host", default="dl.biocommons.org", help="rsync server host") @@ -352,12 +352,12 @@ def add_assembly_names(opts): ] if not_in_seqrepo: _logger.warning( - "Assembly {an} references {n} accessions not in SeqRepo instance {opts.instance_name} (e.g., {acs})".format( + "Assembly {an} references {n} accessions not in SeqRepo instance " + "{opts.instance_name} (e.g., {acs})".format( an=assy_name, n=len(not_in_seqrepo), opts=opts, acs=", ".join(not_in_seqrepo[:5] + ["..."]), - seqrepo_dir=seqrepo_dir, ) ) if not opts.partial_load: @@ -432,8 +432,8 @@ def _rec_iterator(): "{ns}:{a}".format(ns=ns, a=a) for ns, aliases in sorted(nsad.items()) for a in aliases ] print(">" + " ".join(aliases)) - for l in _wrap_lines(srec["seq"], 100): - print(l) + for line in _wrap_lines(srec["seq"], 100): + print(line) def export_aliases(opts): @@ -477,7 +477,7 @@ def init(opts): seqrepo_dir = os.path.join(opts.root_directory, opts.instance_name) if os.path.exists(seqrepo_dir) and len(os.listdir(seqrepo_dir)) > 0: raise IOError("{seqrepo_dir} exists and is not empty".format(seqrepo_dir=seqrepo_dir)) - sr = SeqRepo(seqrepo_dir, writeable=True) # flake8: noqa + sr = SeqRepo(seqrepo_dir, writeable=True) # noqa: F841 def list_local_instances(opts): @@ -586,14 +586,12 @@ def show_status(opts): ) ) print( - "sequences: {ss[n_sequences]} sequences, {ss[tot_length]} residues, {ss[n_files]} files".format( - ss=sr.sequences.stats() - ) + "sequences: {ss[n_sequences]} sequences, {ss[tot_length]} residues, " + "{ss[n_files]} files".format(ss=sr.sequences.stats()) ) print( - "aliases: {sa[n_aliases]} aliases, {sa[n_current]} current, {sa[n_namespaces]} namespaces, {sa[n_sequences]} sequences".format( - sa=sr.aliases.stats() - ) + "aliases: {sa[n_aliases]} aliases, {sa[n_current]} current, " + "{sa[n_namespaces]} namespaces, {sa[n_sequences]} sequences".format(sa=sr.aliases.stats()) ) return sr @@ -679,7 +677,7 @@ def _drop_write(p): def start_shell(opts): seqrepo_dir = os.path.join(opts.root_directory, opts.instance_name) - sr = SeqRepo(seqrepo_dir) + sr = SeqRepo(seqrepo_dir) # noqa: 682 import IPython IPython.embed( diff --git a/src/biocommons/seqrepo/config.py b/src/biocommons/seqrepo/config.py index d10c582..208e474 100644 --- a/src/biocommons/seqrepo/config.py +++ b/src/biocommons/seqrepo/config.py @@ -18,5 +18,6 @@ def parse_caching_env_var(env_name: str, env_default: str) -> Optional[int]: SEQREPO_LRU_CACHE_MAXSIZE = parse_caching_env_var("SEQREPO_LRU_CACHE_MAXSIZE", "1000000") -# Using a default value here of -1 to differentiate not setting this env var and an explicit None (unbounded cache) +# Using a default value here of -1 to differentiate not setting this env var and an +# explicit None (unbounded cache) SEQREPO_FD_CACHE_MAXSIZE = parse_caching_env_var("SEQREPO_FD_CACHE_MAXSIZE", "-1") diff --git a/src/biocommons/seqrepo/dataproxy.py b/src/biocommons/seqrepo/dataproxy.py index 5222716..c5411d4 100644 --- a/src/biocommons/seqrepo/dataproxy.py +++ b/src/biocommons/seqrepo/dataproxy.py @@ -10,7 +10,6 @@ import logging import os from abc import ABC, abstractmethod -from collections.abc import Sequence from urllib.parse import urlparse import requests @@ -149,7 +148,7 @@ def _get_sequence(self, identifier, start=None, end=None): url = self.base_url + f"sequence/{identifier}" params = {"start": start, "end": end} _logger.info(f"Fetching {url} {params if (start or end) else ''}") - resp = requests.get(url, params=params) + resp = requests.get(url, params=params, timeout=60) if resp.status_code == 404: raise KeyError(identifier) resp.raise_for_status() @@ -158,7 +157,7 @@ def _get_sequence(self, identifier, start=None, end=None): def _get_metadata(self, identifier): url = self.base_url + f"metadata/{identifier}" _logger.info("Fetching %s", url) - resp = requests.get(url) + resp = requests.get(url, timeout=60) if resp.status_code == 404: raise KeyError(identifier) resp.raise_for_status() diff --git a/src/biocommons/seqrepo/fastadir/__init__.py b/src/biocommons/seqrepo/fastadir/__init__.py index d0c58aa..0d2db26 100644 --- a/src/biocommons/seqrepo/fastadir/__init__.py +++ b/src/biocommons/seqrepo/fastadir/__init__.py @@ -1 +1 @@ -from .fastadir import FastaDir +from .fastadir import FastaDir # noqa: F401 diff --git a/src/biocommons/seqrepo/fastadir/fabgz.py b/src/biocommons/seqrepo/fastadir/fabgz.py index 7376e64..f56df55 100644 --- a/src/biocommons/seqrepo/fastadir/fabgz.py +++ b/src/biocommons/seqrepo/fastadir/fabgz.py @@ -53,9 +53,8 @@ def _find_bgzip(): raise RuntimeError("Didn't find version string in bgzip executable ({exe})".format(exe=exe)) except missing_file_exception: raise RuntimeError( - "{exe} doesn't exist; you need to install htslib and tabix (See https://github.com/biocommons/biocommons.seqrepo#requirements)".format( - exe=exe - ) + "{exe} doesn't exist; you need to install htslib and tabix " + "(See https://github.com/biocommons/biocommons.seqrepo#requirements)".format(exe=exe) ) except Exception: raise RuntimeError("Unknown error while executing {exe}".format(exe=exe)) @@ -74,7 +73,8 @@ class FabgzReader(object): """ Class that implements ContextManager and wraps a FabgzReader. The FabgzReader is returned when acquired in a contextmanager with statement. - """ + """ + def __init__(self, filename): self.lock = threading.Lock() self._fh = FastaFile(filename) @@ -141,8 +141,8 @@ def wrap_lines(seq, line_width): if seq_id not in self._added: self._fh.write(">" + seq_id + "\n") - for l in wrap_lines(seq, line_width): - self._fh.write(l + "\n") + for line in wrap_lines(seq, line_width): + self._fh.write(line + "\n") self._added.add(seq_id) _logger.debug("added seq_id {i}; length {l}".format(i=seq_id, l=len(seq))) return seq_id diff --git a/src/biocommons/seqrepo/fastadir/fastadir.py b/src/biocommons/seqrepo/fastadir/fastadir.py index ab1aa36..00c66cc 100644 --- a/src/biocommons/seqrepo/fastadir/fastadir.py +++ b/src/biocommons/seqrepo/fastadir/fastadir.py @@ -1,5 +1,3 @@ -import contextlib -import threading import datetime import functools import importlib.resources @@ -83,7 +81,7 @@ def __init__(self, root_dir, writeable=False, check_same_thread=True, fd_cache_s ) if fd_cache_size == 0: - _logger.info(f"File descriptor caching disabled") + _logger.info("File descriptor caching disabled") else: _logger.warning(f"File descriptor caching enabled (size={fd_cache_size})") diff --git a/src/biocommons/seqrepo/fastaiter/__init__.py b/src/biocommons/seqrepo/fastaiter/__init__.py index 7d7b810..b838a5e 100644 --- a/src/biocommons/seqrepo/fastaiter/__init__.py +++ b/src/biocommons/seqrepo/fastaiter/__init__.py @@ -1 +1 @@ -from .fastaiter import FastaIter +from .fastaiter import FastaIter # noqa: F401 diff --git a/src/biocommons/seqrepo/fastaiter/fastaiter.py b/src/biocommons/seqrepo/fastaiter/fastaiter.py index d50f671..7ebcba7 100644 --- a/src/biocommons/seqrepo/fastaiter/fastaiter.py +++ b/src/biocommons/seqrepo/fastaiter/fastaiter.py @@ -8,14 +8,14 @@ def FastaIter(handle): for line in handle: if line.startswith(">"): if header is not None: # not the first record - yield header, "".join(seq_lines) + yield header, "".join(seq_lines) # noqa: F821 seq_lines = list() header = line[1:].rstrip() else: if header is not None: # not the first record - seq_lines.append(line.strip()) + seq_lines.append(line.strip()) # noqa: F821 if header is not None: - yield header, "".join(seq_lines) + yield header, "".join(seq_lines) # noqa: F821 else: # no FASTA records in file return diff --git a/src/biocommons/seqrepo/seqaliasdb/__init__.py b/src/biocommons/seqrepo/seqaliasdb/__init__.py index 008c939..06b1794 100644 --- a/src/biocommons/seqrepo/seqaliasdb/__init__.py +++ b/src/biocommons/seqrepo/seqaliasdb/__init__.py @@ -1 +1 @@ -from .seqaliasdb import SeqAliasDB +from .seqaliasdb import SeqAliasDB # noqa: F401 diff --git a/src/biocommons/seqrepo/seqaliasdb/seqaliasdb.py b/src/biocommons/seqrepo/seqaliasdb/seqaliasdb.py index ffc001d..c592ba7 100644 --- a/src/biocommons/seqrepo/seqaliasdb/seqaliasdb.py +++ b/src/biocommons/seqrepo/seqaliasdb/seqaliasdb.py @@ -1,11 +1,9 @@ -import itertools import logging import sqlite3 import pkg_resources import yoyo -from .._internal.logging_support import DuplicateFilter from .._internal.translate import translate_alias_records, translate_api2db _logger = logging.getLogger(__name__) @@ -39,7 +37,8 @@ def __init__( if translate_ncbi_namespace is not None: _logger.warning( - "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + "translate_ncbi_namespace is obsolete; translation is now automatic; " + "this flag will be removed" ) if self._writeable: @@ -87,7 +86,8 @@ def fetch_aliases(self, seq_id, current_only=True, translate_ncbi_namespace=None ) if translate_ncbi_namespace is not None: _logger.warning( - "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + "translate_ncbi_namespace is obsolete; translation is now automatic; " + "this flag will be removed" ) return [dict(r) for r in self.find_aliases(seq_id=seq_id, current_only=current_only)] @@ -119,7 +119,8 @@ def eq_or_like(s): if translate_ncbi_namespace is not None: _logger.warning( - "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + "translate_ncbi_namespace is obsolete; translation is now automatic; " + "this flag will be removed" ) if namespace is not None: @@ -139,7 +140,7 @@ def eq_or_like(s): cols = ["seqalias_id", "seq_id", "alias", "added", "is_current"] cols += ["namespace"] - sql = "select {cols} from seqalias".format(cols=", ".join(cols)) + sql = "select {cols} from seqalias".format(cols=", ".join(cols)) # nosec if clauses: sql += " where " + " and ".join("(" + c + ")" for c in clauses) sql += " order by seq_id, namespace, alias" diff --git a/src/biocommons/seqrepo/seqrepo.py b/src/biocommons/seqrepo/seqrepo.py index a5482b6..b9c385a 100644 --- a/src/biocommons/seqrepo/seqrepo.py +++ b/src/biocommons/seqrepo/seqrepo.py @@ -7,7 +7,7 @@ import bioutils.digests from bioutils.digests import seq_seqhash as sha512t24u -from .config import SEQREPO_LRU_CACHE_MAXSIZE, SEQREPO_FD_CACHE_MAXSIZE +from .config import SEQREPO_FD_CACHE_MAXSIZE, SEQREPO_LRU_CACHE_MAXSIZE from .fastadir import FastaDir from .seqaliasdb import SeqAliasDB @@ -100,7 +100,7 @@ def __init__( translate_ncbi_namespace=None, check_same_thread=False, use_sequenceproxy=True, - fd_cache_size=0 + fd_cache_size=0, ): self._root_dir = root_dir self._upcase = upcase @@ -123,7 +123,9 @@ def __init__( self._seq_path, writeable=self._writeable, check_same_thread=self._check_same_thread, - fd_cache_size=SEQREPO_FD_CACHE_MAXSIZE if SEQREPO_FD_CACHE_MAXSIZE != -1 else fd_cache_size + fd_cache_size=( + SEQREPO_FD_CACHE_MAXSIZE if SEQREPO_FD_CACHE_MAXSIZE != -1 else fd_cache_size + ), ) self.aliases = SeqAliasDB( self._db_path, @@ -133,7 +135,8 @@ def __init__( if translate_ncbi_namespace is not None: _logger.warn( - "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + "translate_ncbi_namespace is obsolete; translation is now automatic; " + "this flag will be removed" ) def __contains__(self, nsa): @@ -210,7 +213,7 @@ def store(self, seq, nsaliases): try: seqhash = sha512t24u(seq) - except Exception as e: + except Exception: import pprint _logger.critical("Exception raised for " + pprint.pformat(nsaliases)) @@ -253,8 +256,8 @@ def store(self, seq, nsaliases): n_aliases_added += len(upd_tuples) if ( self._pending_sequences > ct_n_seqs - or self._pending_aliases > ct_n_aliases - or self._pending_sequences_len > ct_n_residues + or self._pending_aliases > ct_n_aliases # noqa: W503 + or self._pending_sequences_len > ct_n_residues # noqa: W503 ): # pragma: no cover _logger.info( "Hit commit thresholds ({self._pending_sequences} sequences, " @@ -279,7 +282,8 @@ def translate_alias( if translate_ncbi_namespace is not None: _logger.warn( - "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + "translate_ncbi_namespace is obsolete; translation is now automatic; " + "this flag will be removed" ) seq_id = self._get_unique_seqid(alias=alias, namespace=namespace) aliases = self.aliases.find_aliases(seq_id=seq_id) @@ -297,7 +301,8 @@ def translate_identifier( """ if translate_ncbi_namespace is not None: _logger.warn( - "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + "translate_ncbi_namespace is obsolete; translation is now automatic; " + "this flag will be removed" ) namespace, alias = (