From f60d88dc366c81baec994690a89290801d2f6cdd Mon Sep 17 00:00:00 2001
From: Benjamin Adams
Date: Tue, 25 Jan 2022 11:29:43 -0500
Subject: [PATCH] Merge in CF 1.8 implementation (#892)

* Add bio_taxa CDL example

* Adds handling for taxon names and identifiers

Adds support for CF 1.8 Section 6.1.2: Taxon Names and Identifiers.
Fetches the LSID for the appropriate "biological_taxon_lsid" variable
from http://www.lsid.info after some basic validation. If the LSID
refers to a WoRMS or ITIS database entry, validates against those
databases; otherwise, skips validation for other data sources.

* Adds tests for well-formed WoRMS data

Adds unit tests for well-formed WoRMS data by mocking the HTTP
response. Updates the test requirements file due to the mocked
requests.

* Handle unset standard_name, fix standard_name search

Properly handles a missing standard name when attempting to look for
taxonomy-related variables. Changes "biological_taxon_identifier" to
"biological_taxon_lsid" for the purposes of excluding coordinate
variables used for taxonomy; the former term was not a valid CF
standard name.

* Add skip condition, tests for nodata/fillvalue LSID variable
* Add passing example for ITIS.gov TSN data sources
* Remove unused import, comments, and fix typos
* Add tests for ITIS failure case from non-matching taxon name vs TSN reference
* Remove pyworms dependency and refactor WoRMS calls
* Remove bad import
* Fix test netCDF file name/references
* Revert "Fix test netCDF file name/references"

This reverts commit 8ff9e92f33894c1e30f1beb5faec097eae36e73a.

* Add taxonomy_example.cdl to test data
* Fix message append, message return, remove unused LSID class
* Tests for invalid and malformed LSIDs
* Fix status code handling, error message formatting
* Add warnings import
* Test on py3k
* Fix typo
* Add .coveragerc, coverage files added to .gitignore
* Add coverage/codecov GitHub Action workflow
* Add codecov to test requirements
* Add pytest-cov to requirements
* Set continue-on-error to true to pass along coverage to codecov
* Update README to include codecov badge
* Re-enable flake8
* Apply black code formatting
* Remove extraneous code
* Undefined variable/function fixes
* Use continue to avoid fall-through if taxa not found
* Remove dead variables and code sections
* Fix util date test to use previously unused variable for testing
* Add test for climatology invalid bounds size
* Add simple test for IOOS 1.2 point featureType
* Add Dataset import for type hint in cf/util.py
* Fix tests for CF 1.8
* (Re?) add shapely to requirements
* Fix various accidental reverts to CF 1.8 tests
* Add setitem/getitem methods to MockVariable
* Fix messages/tests for polygon geometry
* Unpin NetCDF version in GitHub Actions default test CI steps
* Restore ncgen -4 call
* Add pyrightconfig.json to .gitignore
* Use ncgen -4 in resources.py for CDL generation in unit tests
* Refactor to split CF version checks into separate modules

Refactors the CF checker classes to reside in separate modules for test
coverage and organization purposes, and updates other modules, such as
the tests, to use the new module paths. `cf.py` is now the import point
for the various versions of the checkers, so the versioned modules need
not be imported directly. A new module, `cf_base`, avoids circular
import issues in `cf.py`.
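For orientation, the resulting module layout looks like this (file roles
summarized from the diff below):

    compliance_checker/cf/
        cf_base.py  <- CFBaseCheck/CFNCCheck shared logic; imports nothing
                       from the versioned modules, avoiding circular imports
        cf_1_6.py   <- CF1_6Check
        cf_1_7.py   <- CF1_7Check
        cf_1_8.py   <- CF1_8Check (new in this patch)
        cf.py       <- aggregation point; downstream code imports the
                       versioned checkers from here

The taxon/LSID handling introduced earlier in this series follows roughly
the flow sketched below. The helper name and control flow here are
hypothetical, not the patch's exact implementation; only the lsid.info
resolution and the WoRMS/ITIS cross-checks come from the commit
descriptions above:

    import re

    import requests

    # LSIDs have the form urn:lsid:<authority>:<namespace>:<object_id>
    LSID_PATTERN = re.compile(r"^urn:lsid:([^:]+):([^:]+):([^:]+)$")

    def validate_taxon_lsid(lsid, taxon_name):  # hypothetical helper
        """Return an error message, or None if the LSID checks out."""
        match = LSID_PATTERN.match(lsid)
        if match is None:
            return "Taxon id {} is not a well-formed LSID".format(lsid)
        # resolve the LSID against the lsid.info service
        response = requests.get("http://www.lsid.info/{}".format(lsid))
        if response.status_code != 200:
            return "Error occurred attempting to resolve LSID {}".format(lsid)
        authority = match.group(1)
        if authority == "marinespecies.org":
            # WoRMS entry: cross-check the record's scientific name
            # against taxon_name (omitted in this sketch)
            pass
        elif authority == "itis.gov":
            # ITIS entry: cross-check the TSN record against taxon_name
            pass
        # any other authority: skip database validation
        return None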
* Add CF 1.8 to supported standards in README.md table * Add CF 1.8 checker to setup.py * Fix unbound exception variables, index var Co-authored-by: Ben Hall Co-authored-by: Filipe Fernandes --- .coveragerc | 4 + .github/workflows/codecov.yml | 55 + .github/workflows/default-tests.yml | 4 +- .gitignore | 7 + .pre-commit-config.yaml | 18 +- README.md | 2 + compliance_checker/base.py | 14 + compliance_checker/cf/cf.py | 5332 +---------------- compliance_checker/cf/cf_1_6.py | 3238 ++++++++++ compliance_checker/cf/cf_1_7.py | 875 +++ compliance_checker/cf/cf_1_8.py | 893 +++ compliance_checker/cf/cf_base.py | 1281 ++++ compliance_checker/cf/util.py | 48 +- compliance_checker/cfutil.py | 2 +- compliance_checker/ioos.py | 38 +- compliance_checker/suite.py | 19 +- compliance_checker/tests/conftest.py | 2 +- .../tests/data/examples/bio_taxa.cdl | 22 + .../tests/data/line_geometry.cdl | 52 + .../tests/data/polygon_geometry.cdl | 61 + .../tests/data/taxonomy_example.cdl | 28 + compliance_checker/tests/helpers.py | 9 + compliance_checker/tests/resources.py | 6 +- compliance_checker/tests/test_cf.py | 310 +- .../tests/test_cf_integration.py | 8 +- compliance_checker/tests/test_ioos_profile.py | 33 +- compliance_checker/tests/test_suite.py | 6 +- compliance_checker/tests/test_util.py | 2 +- compliance_checker/util.py | 4 +- requirements.txt | 3 +- setup.py | 5 +- test_requirements.txt | 3 + 32 files changed, 6978 insertions(+), 5406 deletions(-) create mode 100644 .coveragerc create mode 100644 .github/workflows/codecov.yml create mode 100644 compliance_checker/cf/cf_1_6.py create mode 100644 compliance_checker/cf/cf_1_7.py create mode 100644 compliance_checker/cf/cf_1_8.py create mode 100644 compliance_checker/cf/cf_base.py create mode 100644 compliance_checker/tests/data/examples/bio_taxa.cdl create mode 100644 compliance_checker/tests/data/line_geometry.cdl create mode 100644 compliance_checker/tests/data/polygon_geometry.cdl create mode 100644 compliance_checker/tests/data/taxonomy_example.cdl diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..9635e934 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,4 @@ +[run] +source=compliance_checker +branch=True +omit=compliance_checker/tests/* diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml new file mode 100644 index 00000000..21a106e3 --- /dev/null +++ b/.github/workflows/codecov.yml @@ -0,0 +1,55 @@ +name: Code coverage report + +on: + pull_request: + push: + branches: + - master + - develop + +jobs: + run: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ["3.10"] + os: [ubuntu-latest] + fail-fast: false + + steps: + - uses: actions/checkout@v2 + + - name: Setup Conda + uses: s-weigand/setup-conda@v1 + with: + activate-conda: false + conda-channels: conda-forge + + - name: Python ${{ matrix.python-version }} + shell: bash -l {0} + run: | + conda create --name TEST python=${{ matrix.python-version }} pip --file requirements.txt --file test_requirements.txt --strict-channel-priority + source activate TEST + pip install -e . --no-deps --force-reinstall + + - name: Conda Info + shell: bash -l {0} + run: | + source activate TEST + conda info --all + conda list + + - name: Run tests with coverage + shell: bash -l {0} + run: | + source activate TEST + pytest --cov=compliance_checker --cov-report=xml compliance_checker/tests + # pass this step even if there are individual test failures, we are + # interested in the overall level of coverage and other checks can + # report on test failures. 
+      continue-on-error: true
+
+    - name: Upload to codecov
+      uses: codecov/codecov-action@v2
+      with:
+        files: coverage.xml
diff --git a/.github/workflows/default-tests.yml b/.github/workflows/default-tests.yml
index df0dc0bf..1a8abf5b 100644
--- a/.github/workflows/default-tests.yml
+++ b/.github/workflows/default-tests.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9"]
+        python-version: ["3.7", "3.8", "3.9", "3.10"]
         os: [windows-latest, ubuntu-latest, macos-latest]
         fail-fast: false
@@ -26,7 +26,7 @@
     - name: Python ${{ matrix.python-version }}
       shell: bash -l {0}
       run: |
-        conda create --name TEST python=${{ matrix.python-version }} pip "libnetcdf<4.8.0" --file requirements.txt --file test_requirements.txt --strict-channel-priority
+        conda create --name TEST python=${{ matrix.python-version }} pip --file requirements.txt --file test_requirements.txt --strict-channel-priority
         source activate TEST
         pip install -e . --no-deps --force-reinstall
diff --git a/.gitignore b/.gitignore
index 82c06aad..bffc4bdf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -89,6 +89,7 @@ ENV/
 # IDE project settings
 .spyderproject
 .vscode
+pyrightconfig.json

 # Rope project settings
 .ropeproject
@@ -104,3 +105,9 @@ ENV/
 conda-requirements.txt
 compliance_checker/_version.py
+.venv
+activate
+
+# coverage output
+coverage/
+coverage.xml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6bfb20c7..0d4addf5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.0.1
+  rev: v4.1.0
   hooks:
   - id: trailing-whitespace
     exclude: compliance_checker/tests/data
@@ -14,15 +14,15 @@ repos:
   - id: file-contents-sorter
     files: test_requirements.txt

-# - repo: https://gitlab.com/pycqa/flake8
-#   rev: 3.9.2
-#   hooks:
-#   - id: flake8
-#     exclude: docs/source/conf.py
-#     args: [--max-line-length=200, "--ignore=E203,E501,W503", "--select=select=C,E,F,W,B,B950"]
+- repo: https://gitlab.com/pycqa/flake8
+  rev: 3.9.2
+  hooks:
+  - id: flake8
+    exclude: docs/source/conf.py
+    args: [--max-line-length=200, "--ignore=E203,E501,W503", "--select=C,E,F,W,B,B950"]

 - repo: https://github.com/pre-commit/mirrors-isort
-  rev: v5.9.3
+  rev: v5.10.1
   hooks:
   - id: isort
     additional_dependencies: [toml]
@@ -34,7 +34,7 @@ repos:
   - id: seed-isort-config

 - repo: https://github.com/psf/black
-  rev: 21.9b0
+  rev: 21.12b0
   hooks:
   - id: black
     language_version: python3
diff --git a/README.md b/README.md
index a29001bb..ab2a0e3f 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 [![Build Status](https://travis-ci.org/ioos/compliance-checker.svg)](https://travis-ci.org/ioos/compliance-checker)
 [![Build status](https://ci.appveyor.com/api/projects/status/lcc9co38pi6o45ho/branch/master?svg=true)](https://ci.appveyor.com/project/ocefpaf/compliance-checker/branch/master)
+[![codecov](https://codecov.io/gh/ioos/compliance-checker/branch/master/graph/badge.svg)](https://codecov.io/gh/ioos/compliance-checker)

 The IOOS Compliance Checker is a python based tool for data providers
 to check for completeness and community standard compliance of local or remote
@@ -22,6 +23,7 @@ It currently supports the following sources and standards:

 | Standard | Source | .nc/OPeNDAP/.cdl | SOS |
 | ---------------------------------------------------------------------------------------------------- | ----------- | ------ | ------------------------------- |
 | [ACDD (1.1, 1.3)](http://wiki.esipfed.org/index.php/Attribute_Convention_for_Data_Discovery_1-3) | Built-in | X | - |
+| [CF (1.8)](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html) | Built-in | X | - |
 | [CF (1.7)](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/cf-conventions.html) | Built-in | X | - |
 | [CF (1.6)](http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html) | Built-in | X | - |
 | IOOS SOS | Built-in | - | GetCapabilities, DescribeSensor |
diff --git a/compliance_checker/base.py b/compliance_checker/base.py
index 0ccff250..8992b2ee 100644
--- a/compliance_checker/base.py
+++ b/compliance_checker/base.py
@@ -359,6 +359,20 @@ def assert_true(self, test, message):

         return test

+    def add_failure(self, message):
+        """
+        Adds a failure along with a message
+        :rtype: None
+        """
+        self.assert_true(False, message)
+
+    def add_pass(self):
+        """
+        Adds a pass condition
+        :rtype: None
+        """
+        self.assert_true(True, None)
+

 def std_check_in(base_context, name, allowed_vals):
     """
diff --git a/compliance_checker/cf/cf.py b/compliance_checker/cf/cf.py
index 3fc3cf14..83d7a9c7 100644
--- a/compliance_checker/cf/cf.py
+++ b/compliance_checker/cf/cf.py
@@ -34,5331 +34,7 @@
     prime_meridian_names17,
 )

-
-logger = logging.getLogger(__name__)
-
-
-def print_exceptions(f):
-    @wraps(f)
-    def wrapper(*args, **kwargs):
-        try:
-            return f(*args, **kwargs)
-        except Exception as e:
-            from traceback import print_exc
-
-            print_exc()
-
-    return wrapper
-
-
-# helper to see if we should do DSG tests
-def is_likely_dsg(func):
-    @wraps(func)
-    def _dec(s, ds):
-        if hasattr(ds, "featureType"):
-            return func(s, ds)
-
-        # @TODO: skips if we have formalized skips
-        return None
-
-    return _dec
-
-
-class CFBaseCheck(BaseCheck):
-    """
-    CF Convention Checker Base
-    """
-
-    def __init__(self, options=None):
-        # The compliance checker can be run on multiple datasets in a single
-        # instantiation, so caching values has to be done by the unique
-        # identifier for each dataset loaded.
- - # Each default dict is a key, value mapping from the dataset object to - # a list of variables - super(CFBaseCheck, self).__init__(options) - self._coord_vars = defaultdict(list) - self._ancillary_vars = defaultdict(list) - self._clim_vars = defaultdict(list) - self._metadata_vars = defaultdict(list) - self._boundary_vars = defaultdict(list) - self._geophysical_vars = defaultdict(list) - self._aux_coords = defaultdict(list) - - self._std_names = util.StandardNameTable() - - self.section_titles = { # dict of section headers shared by grouped checks - "2.2": "§2.2 Data Types", - "2.3": "§2.3 Naming Conventions", - "2.4": "§2.4 Dimensions", - "2.5": "§2.5 Variables", - "2.6": "§2.6 Attributes", - "3.1": "§3.1 Units", - "3.2": "§3.2 Long Name", - "3.3": "§3.3 Standard Name", - "3.4": "§3.4 Ancillary Data", - "3.5": "§3.5 Flags", - "4": "§4 Coordinate Types", - "4.1": "§4.1 Latitude Coordinate", - "4.2": "§4.2 Longitude Coordinate", - "4.3": "§4.3 Vertical Coordinate", - "4.4": "§4.4 Time Coordinate", - "4.5": "§4.5 Discrete Axis", - "5": "§5 Coordinate Systems", - "5.1": "§5.1 Independent Latitude, Longitude, Vertical, and Time Axes", - "5.2": "§5.2 2-D Latitude, Longitude, Coordinate Variables", - "5.3": "§5.3 Reduced Horizontal Grid", - "5.4": "§5.4 Timeseries of Station Data", - "5.5": "§5.5 Trajectories", - "5.6": "§5.6 Horizontal Coordinate Reference Systems, Grid Mappings, Projections", - "5.7": "§5.7 Scalar Coordinate Variables", - "6.1": "§6.1 Labels", - "6.2": "§6.2 Alternative Coordinates", - "7.1": "§7.1 Cell Boundaries", - "7.2": "§7.2 Cell Measures", - "7.3": "§7.3 Cell Methods", - "7.4": "§7.4 Climatological Statistics", - "8.1": "§8.1 Packed Data", - "8.2": "§8.2 Compression by Gathering", - "9.1": "§9.1 Features and feature types", - "9.2": "§9.2 Collections, instances, and elements", - "9.3": "§9.3 Representations of Collections of features in data variables", - "9.4": "§9.4 The featureType attribute", - "9.5": "§9.5 Coordinates and metadata", - "9.6": "§9.6 Missing Data", - } - - ################################################################################ - # Helper Methods - var classifications, etc - ################################################################################ - - def setup(self, ds): - """ - Initialize various special variable types within the class. - Mutates a number of instance variables. - - :param netCDF4.Dataset ds: An open netCDF dataset - """ - self.coord_vars = self._find_coord_vars(ds) - self._find_aux_coord_vars(ds) - self._find_ancillary_vars(ds) - self._find_clim_vars(ds) - self._find_boundary_vars(ds) - self._find_metadata_vars(ds) - self._find_cf_standard_name_table(ds) - self._find_geophysical_vars(ds) - coord_containing_vars = ds.get_variables_by_attributes( - coordinates=lambda val: isinstance(val, str) - ) - - # coordinate data variables - - # Excerpt from "§1.3 Overview" on coordinate data - # There are two methods used to identify variables that contain - # coordinate data. The first is to use the NUG-defined "coordinate - # variables." The use of coordinate variables is required for all - # dimensions that correspond to one dimensional space or time - # coordinates . In cases where coordinate variables are not applicable, - # the variables containing coordinate data are identified by the - # coordinates attribute. 
- - # first read in variables referred to in coordinates which exist - # in the dataset - self.coord_data_vars = set() - for var in coord_containing_vars: - for coord_var_name in var.coordinates.strip().split(" "): - if coord_var_name in ds.variables: - self.coord_data_vars.add(coord_var_name) - # then add in the NUG coordinate variables -- single dimension with - # dimension name the same as coordinates - self.coord_data_vars.update(self.coord_vars) - - def check_grid_mapping(self, ds): - """ - 5.6 When the coordinate variables for a horizontal grid are not - longitude and latitude, it is required that the true latitude and - longitude coordinates be supplied via the coordinates attribute. If in - addition it is desired to describe the mapping between the given - coordinate variables and the true latitude and longitude coordinates, - the attribute grid_mapping may be used to supply this description. - - This attribute is attached to data variables so that variables with - different mappings may be present in a single file. The attribute takes - a string value which is the name of another variable in the file that - provides the description of the mapping via a collection of attached - attributes. This variable is called a grid mapping variable and is of - arbitrary type since it contains no data. Its purpose is to act as a - container for the attributes that define the mapping. - - The one attribute that all grid mapping variables must have is - grid_mapping_name which takes a string value that contains the mapping's - name. The other attributes that define a specific mapping depend on the - value of grid_mapping_name. The valid values of grid_mapping_name along - with the attributes that provide specific map parameter values are - described in Appendix F, Grid Mappings. - - When the coordinate variables for a horizontal grid are longitude and - latitude, a grid mapping variable with grid_mapping_name of - latitude_longitude may be used to specify the ellipsoid and prime - meridian. - - - In order to make use of a grid mapping to directly calculate latitude - and longitude values it is necessary to associate the coordinate - variables with the independent variables of the mapping. This is done by - assigning a standard_name to the coordinate variable. The appropriate - values of the standard_name depend on the grid mapping and are given in - Appendix F, Grid Mappings. 
- - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - - ret_val = OrderedDict() - grid_mapping_variables = cfutil.get_grid_mapping_variables(ds) - - # Check the grid_mapping attribute to be a non-empty string and that its reference exists - for variable in ds.get_variables_by_attributes( - grid_mapping=lambda x: x is not None - ): - grid_mapping = getattr(variable, "grid_mapping", None) - defines_grid_mapping = self.get_test_ctx( - BaseCheck.HIGH, self.section_titles["5.6"], variable.name - ) - defines_grid_mapping.assert_true( - (isinstance(grid_mapping, str) and grid_mapping), - "{}'s grid_mapping attribute must be a " - + "space-separated non-empty string".format(variable.name), - ) - if isinstance(grid_mapping, str): - # TODO (badams): refactor functionality to split functionality - # into requisite classes - if ":" in grid_mapping and self._cc_spec_version >= "1.7": - colon_count = grid_mapping.count(":") - re_all = regex.findall( - r"(\w+):\s*((?:\w+\s+)*(?:\w+)(?![\w:]))", grid_mapping - ) - if colon_count != len(re_all): - defines_grid_mapping.out_of += 1 - defines_grid_mapping.messages.append( - "Could not consume entire grid_mapping expression, please check for well-formedness" - ) - else: - for grid_var_name, coord_var_str in re_all: - defines_grid_mapping.assert_true( - grid_var_name in ds.variables, - "grid mapping variable {} must exist in this dataset".format( - grid_var_name - ), - ) - for ref_var in coord_var_str.split(): - defines_grid_mapping.assert_true( - ref_var in ds.variables, - "Coordinate-related variable {} referenced by grid_mapping variable {} must exist in this dataset".format( - ref_var, grid_var_name - ), - ) - - else: - for grid_var_name in grid_mapping.split(): - defines_grid_mapping.assert_true( - grid_var_name in ds.variables, - "grid mapping variable {} must exist in this dataset".format( - grid_var_name - ), - ) - ret_val[variable.name] = defines_grid_mapping.to_result() - - # Check the grid mapping variables themselves - for grid_var_name in grid_mapping_variables: - valid_grid_mapping = self.get_test_ctx( - BaseCheck.HIGH, self.section_titles["5.6"], grid_var_name - ) - grid_var = ds.variables[grid_var_name] - - grid_mapping_name = getattr(grid_var, "grid_mapping_name", None) - - # Grid mapping name must be in appendix F - valid_grid_mapping.assert_true( - grid_mapping_name in self.grid_mapping_dict, - "{} is not a valid grid_mapping_name.".format(grid_mapping_name) - + " See Appendix F for valid grid mappings", - ) - - # The self.grid_mapping_dict has a values of: - # - required attributes - # - optional attributes (can't check) - # - required standard_names defined - # - at least one of these attributes must be defined - - # We can't do any of the other grid mapping checks if it's not a valid grid mapping name - if grid_mapping_name not in self.grid_mapping_dict: - ret_val[grid_mapping_name] = valid_grid_mapping.to_result() - continue - - grid_mapping = self.grid_mapping_dict[grid_mapping_name] - required_attrs = grid_mapping[0] - # Make sure all the required attributes are defined - for req in required_attrs: - valid_grid_mapping.assert_true( - hasattr(grid_var, req), - "{} is a required attribute for grid mapping {}".format( - req, grid_mapping_name - ), - ) - - # Make sure that exactly one of the exclusive attributes exist - if len(grid_mapping) == 4: - at_least_attr = grid_mapping[3] - number_found = 0 - for attr in at_least_attr: - if hasattr(grid_var, attr): - number_found += 1 - 
-                valid_grid_mapping.assert_true(
-                    number_found == 1,
-                    "grid mapping {}".format(grid_mapping_name)
-                    + " must define exactly one of these attributes: "
-                    + "{}".format(" or ".join(at_least_attr)),
-                )
-
-            # Make sure that exactly one variable is defined for each of the required standard_names
-            expected_std_names = grid_mapping[2]
-            for expected_std_name in expected_std_names:
-                found_vars = ds.get_variables_by_attributes(
-                    standard_name=expected_std_name
-                )
-                valid_grid_mapping.assert_true(
-                    len(found_vars) == 1,
-                    "grid mapping {} requires exactly ".format(grid_mapping_name)
-                    + "one variable with standard_name "
-                    + "{} to be defined".format(expected_std_name),
-                )
-
-            ret_val[grid_var_name] = valid_grid_mapping.to_result()
-
-        return ret_val
-
-    def check_conventions_version(self, ds):
-        """
-        CF §2.6.1 requires the NUG-defined global attribute Conventions to
-        contain the string value "CF-<version_number>"; check that the
-        Conventions attribute contains the appropriate string.
-
-        :param netCDF4.Dataset ds: An open netCDF dataset
-        :rtype: compliance_checker.base.Result
-        """
-
-        valid = False
-        reasoning = []
-        correct_version_string = "{}-{}".format(
-            self._cc_spec, self._cc_spec_version
-        ).upper()
-        if hasattr(ds, "Conventions"):
-            conventions = regex.split(r",|\s+", getattr(ds, "Conventions", ""))
-            for convention in conventions:
-                if convention == correct_version_string:
-                    valid = True
-                    break
-            else:
-                reasoning = [
-                    "§2.6.1 Conventions global attribute does not contain "
-                    '"{}"'.format(correct_version_string)
-                ]
-        else:
-            valid = False
-            reasoning = ["§2.6.1 Conventions field is not present"]
-        return Result(
-            BaseCheck.MEDIUM, valid, self.section_titles["2.6"], msgs=reasoning
-        )
-
-    def _check_dimensionless_vertical_coordinates(
-        self,
-        ds,
-        deprecated_units,
-        version_specific_check,
-        version_specific_dimless_vertical_coord_dict,
-    ):
-        """
-        Check the validity of dimensionless coordinates under CF
-
-        :param netCDF4.Dataset ds: An open netCDF dataset
-        :param list deprecated_units: list of string names of deprecated units
-        :param function version_specific_check: version-specific implementation to check dimensionless vertical coord
-        :param dict version_specific_dimless_coord_dict: version-specific dict of dimensionless vertical coords and computed standard names
-        :return: List of results
-        """
-        ret_val = []
-
-        z_variables = cfutil.get_z_variables(ds)
-
-        # call version-specific implementation
-        for name in z_variables:
-            version_specific_check(
-                ds,
-                name,
-                deprecated_units,
-                ret_val,
-                version_specific_dimless_vertical_coord_dict,
-            )
-
-        return ret_val
-
-    def _check_formula_terms(self, ds, coord, dimless_coords_dict):
-        """
-        Checks that a dimensionless vertical coordinate contains valid formula_terms
-
-        - formula_terms is a non-empty string
-        - formula_terms matches the expected "term: variable" pattern
-        - every variable defined in formula_terms exists
-
-        :param netCDF4.Dataset ds: An open netCDF dataset
-        :rtype: compliance_checker.base.Result
-        """
-        variable = ds.variables[coord]
-        standard_name = getattr(variable, "standard_name", None)
-        formula_terms = getattr(variable, "formula_terms", None)
-        valid_formula_terms = TestCtx(BaseCheck.HIGH, self.section_titles["4.3"])
-
-        valid_formula_terms.assert_true(
-            isinstance(formula_terms, str) and formula_terms,
-            "§4.3.2: {}'s formula_terms is a required attribute and must be a non-empty string"
-            "".format(coord),
-        )
-        # We can't check any more
-        if not formula_terms:
-            return valid_formula_terms.to_result()
-
-        # check that the formula_terms are well formed and are present.
-        # The pattern for formula terms is always "component: variable_name";
-        # the regex grouping always has component names in even positions and
-        # the corresponding variable names in odd positions.
-        matches = regex.findall(
-            r"([A-Za-z][A-Za-z0-9_]*: )([A-Za-z][A-Za-z0-9_]*)", variable.formula_terms
-        )
-        terms = set(m[0][:-2] for m in matches)
-        # get the variables named in the formula terms and check if any
-        # are not present in the dataset
-        missing_vars = sorted(set(m[1] for m in matches) - set(ds.variables))
-        missing_fmt = "The following variable(s) referenced in {}:formula_terms are not present in the dataset: {}"
-        valid_formula_terms.assert_true(
-            len(missing_vars) == 0, missing_fmt.format(coord, ", ".join(missing_vars))
-        )
-        # try to reconstruct formula_terms by adding a space between the regex
-        # matches. If it doesn't exactly match the original, the formatting
-        # of the attribute is incorrect
-        reconstructed_formula = " ".join(m[0] + m[1] for m in matches)
-        valid_formula_terms.assert_true(
-            reconstructed_formula == formula_terms,
-            "Attribute formula_terms is not well-formed",
-        )
-
-        valid_formula_terms.assert_true(
-            standard_name in dimless_coords_dict,
-            "unknown standard_name '{}' for dimensionless vertical coordinate {}"
-            "".format(standard_name, coord),
-        )
-        if standard_name not in dimless_coords_dict:
-            return valid_formula_terms.to_result()
-
-        valid_formula_terms.assert_true(
-            no_missing_terms(standard_name, terms, dimless_coords_dict),
-            "{}'s formula_terms are invalid for {}, please see appendix D of CF 1.6"
-            "".format(coord, standard_name),
-        )
-
-        return valid_formula_terms.to_result()
-
-    def _check_grid_mapping_attr_condition(self, attr, attr_name, ret_val):
-        """
-        Evaluate a condition (or series of conditions) for a particular
-        attribute. Designed to be overloaded in subclass implementations.
-
-        :param attr: attribute to test the condition for
-        :param str attr_name: name of the attribute
-        :param list ret_val: list of results to append to
-        :rtype None
-        :return None
-        """
-        raise NotImplementedError
-
-    def _dims_in_order(self, dimension_order):
-        """
-        :param list dimension_order: A list of axes
-        :rtype: bool
-        :return: Returns True if the dimensions are in order U*, T, Z, Y, X,
-                 False otherwise
-        """
-        regx = regex.compile(r"^[^TZYX]*T?Z?Y?X?$")
-        dimension_string = "".join(dimension_order)
-        return regx.match(dimension_string) is not None
-
-    def _parent_var_attr_type_check(self, attr_name, var, ctx):
-        """
-        Checks that an attribute has an equivalent value to a parent variable.
-        Takes an attribute name, variable, and test context on which to operate.
- :param str attr_name: The name of the attribute to be checked - :param netCDF4.Variable var: The variable against which to be checked - :param compliance_checker.base.TestCtx ctx: The associated test context to modify - :rtype None - :return None - """ - attr_val = var.getncattr(attr_name) - - if isinstance(attr_val, (str, bytes)): - type_match = (var.dtype is str) or (var.dtype.kind == "S") - val_type = type(attr_val) - else: - val_type = attr_val.dtype.type - type_match = val_type == var.dtype.type - - ctx.assert_true( - type_match, - "Attribute '{}' (type: {}) and parent variable '{}' (type: {}) " - "must have equivalent datatypes".format( - attr_name, val_type, var.name, var.dtype.type - ), - ) - - def _find_aux_coord_vars(self, ds, refresh=False): - """ - Returns a list of auxiliary coordinate variables - - An auxiliary coordinate variable is any netCDF variable that contains - coordinate data, but is not a coordinate variable (in the sense of the term - defined by CF). - - :param netCDF4.Dataset ds: An open netCDF dataset - :param bool refresh: if refresh is set to True, the cache is - invalidated. - :rtype: list - :return: List of variable names (str) that are defined to be auxiliary - coordinate variables. - """ - if self._aux_coords.get(ds, None) and refresh is False: - return self._aux_coords[ds] - - self._aux_coords[ds] = cfutil.get_auxiliary_coordinate_variables(ds) - return self._aux_coords[ds] - - def _find_boundary_vars(self, ds, refresh=False): - """ - Returns dictionary of boundary variables mapping the variable instance - to the name of the variable acting as a boundary variable. - - :param netCDF4.Dataset ds: An open netCDF dataset - :param bool refresh: if refresh is set to True, the cache is - invalidated. - :rtype: list - :return: A list containing strings with boundary variable names. - """ - if self._boundary_vars.get(ds, None) and refresh is False: - return self._boundary_vars[ds] - - self._boundary_vars[ds] = cfutil.get_cell_boundary_variables(ds) - - return self._boundary_vars[ds] - - def _find_ancillary_vars(self, ds, refresh=False): - """ - Returns a list of variable names that are defined as ancillary - variables in the dataset ds. - - An ancillary variable generally is a metadata container and referenced - from other variables via a string reference in an attribute. - - - via ancillary_variables (3.4) - - "grid mapping var" (5.6) - - TODO: more? - - The result is cached by the passed in dataset object inside of this - checker. Pass refresh=True to redo the cached value. - - :param netCDF4.Dataset ds: An open netCDF dataset - :param bool refresh: if refresh is set to True, the cache is - invalidated. - :rtype: list - :return: List of variable names (str) that are defined as ancillary - variables in the dataset ds. 
- """ - - # Used the cached version if it exists and is not empty - if self._ancillary_vars.get(ds, None) and refresh is False: - return self._ancillary_vars[ds] - - # Invalidate the cache at all costs - self._ancillary_vars[ds] = [] - - for name, var in ds.variables.items(): - if hasattr(var, "ancillary_variables"): - for anc_name in var.ancillary_variables.split(" "): - if anc_name in ds.variables: - self._ancillary_vars[ds].append(anc_name) - - if hasattr(var, "grid_mapping"): - gm_name = var.grid_mapping - if gm_name in ds.variables: - self._ancillary_vars[ds].append(gm_name) - - return self._ancillary_vars[ds] - - def _find_clim_vars(self, ds, refresh=False): - """ - Returns a list of variables that are likely to be climatology variables based on CF §7.4 - - :param netCDF4.Dataset ds: An open netCDF dataset - :param bool refresh: if refresh is set to True, the cache is - invalidated. - :rtype: list - :return: A list containing strings with geophysical variable - names. - """ - - if self._clim_vars.get(ds, None) and refresh is False: - return self._clim_vars[ds] - - climatology_variable = cfutil.get_climatology_variable(ds) - if climatology_variable: - self._clim_vars[ds].append(climatology_variable) - - return self._clim_vars[ds] - - def _find_cf_standard_name_table(self, ds): - """ - Parse out the `standard_name_vocabulary` attribute and download that - version of the cf standard name table. If the standard name table has - already been downloaded, use the cached version. Modifies `_std_names` - attribute to store standard names. Returns True if the file exists and - False if it fails to download. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: bool - """ - # Get the standard name vocab - standard_name_vocabulary = getattr(ds, "standard_name_vocabulary", "") - - # Try to parse this attribute to get version - version = None - try: - if "cf standard name table" in standard_name_vocabulary.lower(): - version = [ - s.strip("(").strip(")").strip("v").strip(",") - for s in standard_name_vocabulary.split() - ] - # This assumes that table version number won't start with 0. - version = [ - s - for s in version - if s.isdigit() and len(s) <= 2 and not s.startswith("0") - ] - if len(version) > 1: - return False - else: - try: - version = version[0] - except IndexError: - warn( - "Cannot extract CF standard name version number " - "from standard_name_vocabulary string" - ) - return False - else: - # Can't parse the attribute, use the packaged version - return False - # usually raised from .lower() with an incompatible (non-string) - # data type - except AttributeError: - warn( - "Cannot convert standard name table to lowercase. This can " - "occur if a non-string standard_name_vocabulary global " - "attribute is supplied" - ) - return False - - if version.startswith("v"): # i.e 'v34' -> '34' drop the v - version = version[1:] - - # If the packaged version is what we're after, then we're good - if version == self._std_names._version: - print( - "Using packaged standard name table v{0}".format(version), - file=sys.stderr, - ) - return False - - # Try to download the version specified - try: - data_directory = util.create_cached_data_dir() - location = os.path.join( - data_directory, "cf-standard-name-table-test-{0}.xml".format(version) - ) - # Did we already download this before? 
- if not os.path.isfile(location): - util.download_cf_standard_name_table(version, location) - print( - "Using downloaded standard name table v{0}".format(version), - file=sys.stderr, - ) - else: - print( - "Using cached standard name table v{0} from {1}".format( - version, location - ), - file=sys.stderr, - ) - - self._std_names = util.StandardNameTable(location) - return True - except Exception as e: - # There was an error downloading the CF table. That's ok, we'll just use the packaged version - warn( - "Problem fetching standard name table:\n{0}\n" - "Using packaged v{1}".format(e, self._std_names._version) - ) - return False - - def _find_coord_vars(self, ds, refresh=False): - """ - Returns a list of variable names that identify as coordinate variables. - - The result is cached by the passed in dataset object inside of this - checker. Pass refresh=True to redo the cached value. - - :param netCDF4.Dataset ds: An open netCDF dataset - :param bool refresh: if refresh is set to True, the cache is - invalidated. - :rtype: list - :return: A list of variables names (str) that are defined as coordinate - variables in the dataset ds. - """ - if ds in self._coord_vars and refresh is False: - return self._coord_vars[ds] - - self._coord_vars[ds] = cfutil.get_coordinate_variables(ds) - - return self._coord_vars[ds] - - def _find_geophysical_vars(self, ds, refresh=False): - """ - Returns a list of geophysical variables. Modifies - `self._geophysical_vars` - - :param netCDF4.Dataset ds: An open netCDF dataset - :param bool refresh: if refresh is set to True, the cache is - invalidated. - :rtype: list - :return: A list containing strings with geophysical variable - names. - """ - if self._geophysical_vars.get(ds, None) and refresh is False: - return self._geophysical_vars[ds] - - self._geophysical_vars[ds] = cfutil.get_geophysical_variables(ds) - - return self._geophysical_vars[ds] - - def _find_metadata_vars(self, ds, refresh=False): - """ - Returns a list of netCDF variable instances for those that are likely metadata variables - - :param netCDF4.Dataset ds: An open netCDF dataset - :param bool refresh: if refresh is set to True, the cache is - invalidated. - :rtype: list - :return: List of variable names (str) that are likely metadata - variable candidates. - - """ - if self._metadata_vars.get(ds, None) and refresh is False: - return self._metadata_vars[ds] - - self._metadata_vars[ds] = [] - for name, var in ds.variables.items(): - - if name in self._find_ancillary_vars(ds) or name in self._find_coord_vars( - ds - ): - continue - - if name in ( - "platform_name", - "station_name", - "instrument_name", - "station_id", - "platform_id", - "surface_altitude", - ): - self._metadata_vars[ds].append(name) - - elif getattr(var, "cf_role", "") != "": - self._metadata_vars[ds].append(name) - - elif ( - getattr(var, "standard_name", None) is None and len(var.dimensions) == 0 - ): - self._metadata_vars[ds].append(name) - - return self._metadata_vars[ds] - - def _get_coord_axis_map(self, ds): - """ - Returns a dictionary mapping each coordinate to a letter identifier - describing the _kind_ of coordinate. - - :param netCDF4.Dataset ds: An open netCDF dataset - - :rtype: dict - :return: A dictionary with variable names mapped to axis abbreviations, - i.e. {'longitude': 'X', ... 
'pressure': 'Z'} - """ - expected = ["T", "Z", "Y", "X"] - coord_vars = self._find_coord_vars(ds) - coord_axis_map = {} - - # L - Unlimited Coordinates - # T - Time coordinates - # Z - Depth/Altitude Coordinate - # Y - Y-Coordinate (latitude) - # X - X-Coordinate (longitude) - # A - Auxiliary Coordinate - # I - Instance Coordinate - - time_variables = cfutil.get_time_variables(ds) - lat_variables = cfutil.get_latitude_variables(ds) - lon_variables = cfutil.get_longitude_variables(ds) - z_variables = cfutil.get_z_variables(ds) - - for coord_name in coord_vars: - coord_var = ds.variables[coord_name] - axis = getattr(coord_var, "axis", None) - standard_name = getattr(coord_var, "standard_name", None) - - # Unlimited dimensions must come first - if ds.dimensions[coord_name].isunlimited(): - coord_axis_map[coord_name] = "L" - # axis takes precedence over standard_name - elif axis in expected: - coord_axis_map[coord_name] = axis - elif standard_name == "time": - coord_axis_map[coord_name] = "T" - elif standard_name == "longitude": - coord_axis_map[coord_name] = "X" - elif standard_name == "latitude": - coord_axis_map[coord_name] = "Y" - elif standard_name in ["height", "depth", "altitude"]: - coord_axis_map[coord_name] = "Z" - elif cfutil.is_compression_coordinate(ds, coord_name): - coord_axis_map[coord_name] = "C" - elif coord_name in time_variables: - coord_axis_map[coord_name] = "T" - elif coord_name in z_variables: - coord_axis_map[coord_name] = "Z" - elif coord_name in lat_variables: - coord_axis_map[coord_name] = "Y" - elif coord_name in lon_variables: - coord_axis_map[coord_name] = "X" - else: - # mark the coordinate variable as unknown - coord_axis_map[coord_name] = "U" - - for dimension in self._get_instance_dimensions(ds): - if dimension not in coord_axis_map: - coord_axis_map[dimension] = "I" - - # Dimensions of auxiliary coordinate variables will be marked with A. - # This is useful to help determine if the dimensions are used like a - # mapping from grid coordinates to physical lat/lon - for coord_name in self._find_aux_coord_vars(ds): - coord_var = ds.variables[coord_name] - # Skip label auxiliary coordinates - if hasattr(coord_var.dtype, "char") and coord_var.dtype.char == "S": - continue - elif coord_var.dtype == str: - continue - for dimension in coord_var.dimensions: - if dimension not in coord_axis_map: - coord_axis_map[dimension] = "A" - - # If a dimension does not have a coordinate variable mark it as unknown - # 'U' - for dimension in ds.dimensions: - if dimension not in coord_axis_map: - coord_axis_map[dimension] = "U" - - return coord_axis_map - - def _get_coord_vars(self, ds): - coord_vars = [] - for name, var in ds.variables.items(): - if (name,) == var.dimensions: - coord_vars.append(name) - return coord_vars - - def _get_dimension_order(self, ds, name, coord_axis_map): - """ - Returns a list of strings corresponding to the named axis of the dimensions for a variable. 
- - Example:: - self._get_dimension_order(ds, 'temperature', coord_axis_map) - --> ['T', 'Y', 'X'] - - :param netCDF4.Dataset ds: An open netCDF dataset - :param str name: Name of the variable - :param dict coord_axis_map: A dictionary mapping each coordinate variable and dimension to a named axis - - :rtype: list - :return: A list of strings corresponding to the named axis of the dimensions for a variable - """ - - retval = [] - variable = ds.variables[name] - for dim in variable.dimensions: - retval.append(coord_axis_map[dim]) - return retval - - def _get_instance_dimensions(self, ds): - """ - Returns a list of dimensions marked as instance dimensions - - :param netCDF4.Dataset ds: An open netCDF dataset - - :rtype: list - :returns: A list of variable dimensions - """ - ret_val = [] - for variable in ds.get_variables_by_attributes( - cf_role=lambda x: isinstance(x, str) - ): - if variable.ndim > 0: - ret_val.append(variable.dimensions[0]) - return ret_val - - def _get_pretty_dimension_order(self, ds, name): - """ - Returns a comma separated string of the dimensions for a specified - variable - - :param netCDF4.Dataset ds: An open netCDF dataset - :param str name: A string with a valid NetCDF variable name for the - dataset - :rtype: str - :return: A comma separated string of the variable's dimensions - """ - dim_names = [] - for dim in ds.variables[name].dimensions: - dim_name = dim - if ds.dimensions[dim].isunlimited(): - dim_name += " (Unlimited)" - dim_names.append(dim_name) - return ", ".join(dim_names) - - def _get_pretty_dimension_order_with_type(self, ds, name, dim_types): - """ - Returns a comma separated string of the dimensions for a specified - variable of format "DIMENSIONS_NAME (DIMENSION_TYPE[, unlimited])" - - :param netCDF4.Dataset ds: An open netCDF dataset - :param str name: A string with a valid NetCDF variable name for the - dataset - :param list dim_types: A list of strings returned by - _get_dimension_order for the same "name" - :rtype: str - :return: A comma separated string of the variable's dimensions - """ - dim_names = [] - for dim, dim_type in zip(ds.variables[name].dimensions, dim_types): - dim_name = "{} ({}".format(dim, dim_type) - if ds.dimensions[dim].isunlimited(): - dim_name += ", unlimited)" - else: - dim_name += ")" - dim_names.append(dim_name) - return ", ".join(dim_names) - - def _is_station_var(self, var): - """ - Returns True if the NetCDF variable is associated with a station, False - otherwise. - - :param netCDF4.Variable var: a variable in an existing NetCDF dataset - :rtype: bool - :return: Status of whether variable appears to be associated with a - station - """ - - if getattr(var, "standard_name", None) in ( - "platform_name", - "station_name", - "instrument_name", - ): - return True - return False - - def _split_standard_name(self, standard_name): - """ - Returns a tuple of the standard_name and standard_name modifier - - Nones are used to represent the absence of a modifier or standard_name - - :rtype: tuple - :return: 2-tuple of standard_name and modifier as strings - """ - - if isinstance(standard_name, str) and " " in standard_name: - return standard_name.split(" ", 1) - # if this isn't a string, then it doesn't make sense to split - # -- treat value as standard name with no modifier - else: - return standard_name, None - - def check_appendix_a(self, ds): - """ - Validates a CF dataset against the contents of its Appendix A table for - attribute types and locations. 
Returns a list of results with the - outcomes of the Appendix A validation results against the existing - attributes in the docstring. - - :param netCDF4.Variable var: a variable in an existing NetCDF dataset - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: A list of results corresponding to the results returned - """ - # if 'enable_appendix_a_checks' isn't specified in the checks, - # don't do anything on this check - results = [] - if "enable_appendix_a_checks" not in self.options: - return results - possible_global_atts = set(ds.ncattrs()).intersection(self.appendix_a.keys()) - attr_location_ident = { - "G": "global attributes", - "C": "coordinate data", - "D": "non-coordinate data", - } - - def att_loc_print_helper(att_letter): - """ - Returns a string corresponding to attr_location ident in - human-readable form. E.g. an input of 'G' will return - "global attributes (G)" - - :param str att_letter: An attribute letter corresponding to the - "Use" column in CF Appendix A - :rtype: str - :return: A string with a human-readable name followed by the input - letter specified - """ - - return "{} ({})".format( - attr_location_ident.get(att_letter, "other"), att_letter - ) - - def _att_loc_msg(att_loc): - """ - Helper method for formatting an error message when an attribute - appears in the improper location corresponding to the "Use" column - in CF Appendix A. - - :param set att_loc: A set with the possible valid locations of the - attribute corresponding to the "Use" column - in CF Appendix A - :rtype: str - :return: A human-readable string with the possible valid locations - of the attribute - """ - att_loc_len = len(att_loc) - # this is a fallback in case an empty att_loc is passed - # it generally should not occur - valid_loc = "no locations in the dataset" - loc_sort = sorted(att_loc) - if att_loc_len == 1: - valid_loc = att_loc_print_helper(loc_sort[0]) - elif att_loc_len == 2: - valid_loc = "{} and {}".format( - att_loc_print_helper(loc_sort[0]), att_loc_print_helper(loc_sort[1]) - ) - # shouldn't be reached under normal circumstances, as any attribute - # should be either G, C, or D but if another - # category is added, this will be useful. - else: - valid_loc = ", ".join(loc_sort[:-1]) + ", and {}".format( - att_loc_print_helper(loc_sort[-1]) - ) - return "This attribute may only appear in {}.".format(valid_loc) - - for global_att_name in possible_global_atts: - global_att = ds.getncattr(global_att_name) - att_dict = self.appendix_a[global_att_name] - att_loc = att_dict["attr_loc"] - valid_loc_warn = _att_loc_msg(att_loc) - if att_dict["cf_section"] is not None: - subsection_test = ".".join(att_dict["cf_section"].split(".")[:2]) - - section_loc = self.section_titles.get( - subsection_test, att_dict["cf_section"] - ) - else: - section_loc = None - test_ctx = TestCtx(BaseCheck.HIGH, section_loc) - - test_ctx.out_of += 1 - if "G" not in att_loc: - test_ctx.messages.append( - '[Appendix A] Attribute "{}" should not be present in global (G) ' - "attributes. 
{}".format(global_att_name, valid_loc_warn) - ) - else: - result = self._handle_dtype_check(global_att, global_att_name, att_dict) - if not result[0]: - test_ctx.messages.append(result[1]) - else: - test_ctx.score += 1 - results.append(test_ctx.to_result()) - - noncoord_vars = set(ds.variables) - set(self.coord_data_vars) - for var_set, coord_letter in ( - (self.coord_data_vars, "C"), - (noncoord_vars, "D"), - ): - for var_name in var_set: - var = ds.variables[var_name] - possible_attrs = set(var.ncattrs()).intersection(self.appendix_a.keys()) - for att_name in possible_attrs: - att_dict = self.appendix_a[att_name] - if att_dict["cf_section"] is not None: - subsection_test = ".".join( - att_dict["cf_section"].split(".")[:2] - ) - - section_loc = self.section_titles.get( - subsection_test, att_dict["cf_section"] - ) - else: - section_loc = None - test_ctx = TestCtx(BaseCheck.HIGH, section_loc, variable=var_name) - att_loc = att_dict["attr_loc"] - valid_loc_warn = _att_loc_msg(att_loc) - att = var.getncattr(att_name) - test_ctx.out_of += 1 - if coord_letter not in att_loc: - test_ctx.messages.append( - '[Appendix A] Attribute "{}" should not be present in {} ' - 'variable "{}". {}'.format( - att_name, - att_loc_print_helper(coord_letter), - var_name, - valid_loc_warn, - ) - ) - else: - result = self._handle_dtype_check(att, att_name, att_dict, var) - if not result[0]: - test_ctx.messages.append(result[1]) - else: - test_ctx.score += 1 - results.append(test_ctx.to_result()) - - return results - - def _check_attr_type(self, attr_name, attr_type, attribute, variable=None): - """ - Check if an attribute `attr` is of the type `attr_type`. Upon getting - a data type of 'D', the attr must have the same data type as the - variable it is assigned to. - - Attributes designated type 'S' must be of type `str`. 'N' require - numeric types, and 'D' requires the attribute type match the type - of the variable it is assigned to. - - :param str attr_name: name of attr being checked (to format message) - :param str attr_type: the correct type of the attribute - :param attribute: attribute to check - :param variable: if given, type should match attr - :rtype tuple - :return A two-tuple that contains pass/fail status as a boolean and - a message string (or None if unset) as the second element. 
- """ - - if attr_type == "S": - if not isinstance(attribute, str): - return [False, "{} must be a string".format(attr_name)] - else: - # if it's not a string, it should have a numpy dtype - underlying_dtype = getattr(attribute, "dtype", None) - - # TODO check for np.nan separately - if underlying_dtype is None: - return [False, "{} must be a numeric type".format(attr_name)] - - # both D and N should be some kind of numeric value - is_numeric = np.issubdtype(underlying_dtype, np.number) - if attr_type == "N": - if not is_numeric: - return [False, "{} must be a numeric type".format(attr_name)] - elif attr_type == "D": - # TODO: handle edge case where variable is unset here - temp_ctx = TestCtx() - self._parent_var_attr_type_check(attr_name, variable, temp_ctx) - var_dtype = getattr(variable, "dtype", None) - if temp_ctx.messages: - return ( - False, - "{} must be numeric and must be equivalent to {} dtype".format( - attr_name, var_dtype - ), - ) - else: - # If we reached here, we fell off with an unrecognized type - return ( - False, - "{} has unrecognized type '{}'".format(attr_name, attr_type), - ) - # pass if all other possible failure conditions have been evaluated - return (True, None) - - def _handle_dtype_check(self, attribute, attr_name, attr_dict, variable=None): - """ - Helper function for Appendix A checks. - - :param attribute: The value of the attribute being checked - :param str attr_name: The name of the attribute being processed - :param dict attr_dict: The dict entry with type and attribute location - information corresponding to this attribute - :param variable: if given, the variable whose type to check against - :rtype: tuple - :return: A two-tuple that contains pass/fail status as a boolean and - a message string (or None if unset) as the second element. - """ - attr_type = attr_dict["Type"] - if variable is None and "G" not in attr_dict["attr_loc"]: - raise ValueError( - "Non-global attributes must be associated with a " " variable" - ) - attr_str = ( - "Global attribute {}".format(attr_name) - if "G" in attr_dict["attr_loc"] and variable is None - else "Attribute {} in variable {}".format(attr_name, variable.name) - ) - - # check the type - return_value = self._check_attr_type(attr_name, attr_type, attribute, variable) - - # if the second element is a string, format it - if isinstance(return_value[1], str): - return_value[1] = return_value[1].format(attr_str) - - # convert to tuple for immutability and return - return tuple(return_value) - - -class CFNCCheck(BaseNCCheck, CFBaseCheck): - """Inherits from both BaseNCCheck and CFBaseCheck to support - checking netCDF datasets. 
Must inherit in this order, or certain - attributes from BaseNCCheck (like supported_ds) will not be passed to - CFNCCheck.""" - - pass - - -appendix_a_base = { - "Conventions": {"Type": "S", "attr_loc": {"G"}, "cf_section": None}, - "_FillValue": {"Type": "D", "attr_loc": {"D", "C"}, "cf_section": None}, - "add_offset": {"Type": "N", "attr_loc": {"D"}, "cf_section": "8.1"}, - "ancillary_variables": {"Type": "S", "attr_loc": {"D"}, "cf_section": "3.4"}, - "axis": {"Type": "S", "attr_loc": {"C"}, "cf_section": "4"}, - "bounds": {"Type": "S", "attr_loc": {"C"}, "cf_section": "7.1"}, - "calendar": {"Type": "S", "attr_loc": {"C"}, "cf_section": "4.4.1"}, - "cell_measures": {"Type": "S", "attr_loc": {"D"}, "cf_section": "7.2"}, - "cell_methods": {"Type": "S", "attr_loc": {"D"}, "cf_section": "7.3"}, - # cf_role type is "C" in document, which does not correspond - # to types used, replaced with "S" - "cf_role": {"Type": "S", "attr_loc": {"C"}, "cf_section": "9.5"}, - "climatology": {"Type": "S", "attr_loc": {"C"}, "cf_section": "7.4"}, - # comment was removed in this implementation - "compress": {"Type": "S", "attr_loc": {"C"}, "cf_section": "8.2"}, - "coordinates": {"Type": "S", "attr_loc": {"D"}, "cf_section": "5"}, - # featureType type is "C" in document, which does not - # correspond to types used, replaced with "S" - "featureType": {"Type": "S", "attr_loc": {"G"}, "cf_section": "9.4"}, - "flag_masks": {"Type": "D", "attr_loc": {"D"}, "cf_section": "3.5"}, - "flag_meanings": {"Type": "S", "attr_loc": {"D"}, "cf_section": "3.5"}, - "flag_values": {"Type": "D", "attr_loc": {"D"}, "cf_section": "3.5"}, - "formula_terms": {"Type": "S", "attr_loc": {"C"}, "cf_section": "4.3.2"}, - "grid_mapping": {"Type": "S", "attr_loc": {"D"}, "cf_section": "5.6"}, - "history": {"Type": "S", "attr_loc": {"G"}, "cf_section": None}, - #'instance_dimension': {'Type': 'N', 'attr_loc': {'D'}, 'cf_section': '9.3'}, - "institution": {"Type": "S", "attr_loc": {"G", "D"}, "cf_section": "2.6.2"}, - "leap_month": {"Type": "N", "attr_loc": {"C"}, "cf_section": "4.4.1"}, - "leap_year": {"Type": "N", "attr_loc": {"C"}, "cf_section": "4.4.1"}, - "long_name": {"Type": "S", "attr_loc": {"D", "C"}, "cf_section": "3.2"}, - "missing_value": {"Type": "D", "attr_loc": {"D", "C"}, "cf_section": "2.5.1"}, - "month_lengths": {"Type": "N", "attr_loc": {"C"}, "cf_section": "4.4.1"}, - "positive": {"Type": "S", "attr_loc": {"C"}, "cf_section": None}, - "references": {"Type": "S", "attr_loc": {"G", "D"}, "cf_section": "2.6.2"}, - #'sample_dimension': {'Type': 'N', 'attr_loc': {'D'}, 'cf_section': '9.3'}, - "scale_factor": {"Type": "N", "attr_loc": {"D"}, "cf_section": "8.1"}, - "source": {"Type": "S", "attr_loc": {"G", "D"}, "cf_section": "2.6.2"}, - "standard_error_multiplier": {"Type": "N", "attr_loc": {"D"}, "cf_section": None}, - "standard_name": {"Type": "S", "attr_loc": {"D", "C"}, "cf_section": "3.3"}, - "title": {"Type": "S", "attr_loc": {"G"}, "cf_section": None}, - "units": {"Type": "S", "attr_loc": {"D", "C"}, "cf_section": "3.1"}, - "valid_max": {"Type": "N", "attr_loc": {"D", "C"}, "cf_section": None}, - "valid_min": {"Type": "N", "attr_loc": {"D", "C"}, "cf_section": None}, - "valid_range": {"Type": "N", "attr_loc": {"D", "C"}, "cf_section": None}, -} - - -class CF1_6Check(CFNCCheck): - """CF-1.6-specific implementation of CFBaseCheck; supports checking - netCDF datasets. 
- These checks are translated documents: - http://cf-pcmdi.llnl.gov/documents/cf-conventions/1.6/cf-conventions.html - http://cf-pcmdi.llnl.gov/conformance/requirements-and-recommendations/1.6/""" - - register_checker = True - _cc_spec = "cf" - _cc_spec_version = "1.6" - _cc_description = "Climate and Forecast Conventions (CF)" - _cc_url = "http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html" - _cc_display_headers = {3: "Errors", 2: "Warnings", 1: "Info"} - appendix_a = appendix_a_base - - def __init__(self, options=None): # initialize with parent methods and data - super(CF1_6Check, self).__init__(options) - - self.cell_methods = cell_methods16 - self.grid_mapping_dict = grid_mapping_dict16 - self.grid_mapping_attr_types = grid_mapping_attr_types16 - - ############################################################################### - # Chapter 2: NetCDF Files and Components - ############################################################################### - - def check_data_types(self, ds): - """ - Checks the data type of all netCDF variables to ensure they are valid - data types under CF. - - CF §2.2 The netCDF data types char, byte, short, int, float or real, and - double are all acceptable - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: compliance_checker.base.Result - """ - fails = [] - total = len(ds.variables) - - for k, v in ds.variables.items(): - if ( - v.dtype is not str - and v.dtype.kind != "S" - and all( - v.dtype.type != t - for t in ( - np.character, - np.dtype("|S1"), - np.dtype("b"), - np.dtype("i2"), - np.dtype("i4"), - np.float32, - np.double, - ) - ) - ): - fails.append( - "The variable {} failed because the datatype is {}".format( - k, v.datatype - ) - ) - return Result( - BaseCheck.HIGH, - (total - len(fails), total), - self.section_titles["2.2"], - msgs=fails, - ) - - def check_child_attr_data_types(self, ds): - """ - For any variables which contain any of the following attributes: - - valid_min/valid_max - - valid_range - - scale_factor - - add_offset - - _FillValue - the data type of the attribute must match the type of its parent variable as specified in the - NetCDF User Guide (NUG) https://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html, - referenced in the CF Conventions in Section 2.5.2 - (http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/cf-conventions.html#missing-data) - - :param netCDF4.Dataset ds: open netCDF dataset object - :rtype: compliance_checker.base.Result - """ - - ctx = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.5"]) - special_attrs = { - "actual_range", - "valid_min", - "valid_max", - "valid_range", - "_FillValue", - } - - for var_name, var in ds.variables.items(): - for att_name in special_attrs.intersection(var.ncattrs()): - self._parent_var_attr_type_check(att_name, var, ctx) - return ctx.to_result() - - # TODO: consider renaming to avoid confusion with non-underscore - # primary function version - def _check_add_offset_scale_factor_type(self, variable, attr_name): - """ - Reusable function for checking both add_offset and scale_factor. 
- """ - - msgs = [] - error_msg = ( - f"Variable {variable.name} and {attr_name} must be equivalent " - f"data types or {variable.name} must be of type byte, short, or int " - f"and {attr_name} must be float or double" - ) - - att = getattr(variable, attr_name, None) - if not (isinstance(att, (np.number, float))): # can't compare dtypes - val = False - - else: - val = ( - att.dtype == variable.dtype - ) or ( # will short-circuit or if first condition is true - isinstance(att.dtype, (np.float, np.double, float)) - and isinstance(variable.dtype, (np.byte, np.short, np.int, int)) - ) - if not val: - msgs.append(error_msg) - - return Result(BaseCheck.MEDIUM, val, self.section_titles["8.1"], - msgs) - - def check_add_offset_scale_factor_type(self, ds): - """ - If a variable has the attributes add_offset and scale_factor, - check that the variables and attributes are of the same type - OR that the variable is of type byte, short or int and the - attributes are of type float or double. - """ - - results = [] - add_offset_vars = ds.get_variables_by_attributes( - add_offset=lambda x: x is not None - ) - scale_factor_vars = ds.get_variables_by_attributes( - scale_factor=lambda x: x is not None - ) - - for _att_vars_tup in ( - ("add_offset", add_offset_vars), - ("scale_factor", scale_factor_vars), - ): - results.extend( - list( - map( - lambda var: self._check_add_offset_scale_factor_type( - var, _att_vars_tup[0] - ), - _att_vars_tup[1], - ) - ) - ) - - return results - - def check_naming_conventions(self, ds): - """ - Checks the variable names to ensure they are valid CF variable names under CF. - - CF §2.3 Variable, dimension and attribute names should begin with a letter - and be composed of letters, digits, and underscores. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: compliance_checker.base.Result - """ - ret_val = [] - variable_naming = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.3"]) - dimension_naming = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.3"]) - attribute_naming = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.3"]) - - ignore_attributes = [ - "_FillValue", - "DODS", - "_ChunkSizes", - "_Coordinate", - "_Unsigned", - ] - - rname = regex.compile("^[A-Za-z][A-Za-z0-9_]*$") - - for name, variable in ds.variables.items(): - variable_naming.assert_true( - rname.match(name) is not None, - "variable {} should begin with a letter and be composed of " - "letters, digits, and underscores".format(name), - ) - - # Keep track of all the attributes, we'll need to check them - for attr in variable.ncattrs(): - if attr in ignore_attributes: - continue - # Special attributes made by THREDDS - if attr.startswith("DODS"): - continue - # Ignore model produced attributes - if attr.startswith("_Coordinate"): - continue - attribute_naming.assert_true( - rname.match(attr) is not None, - "attribute {}:{} should begin with a letter and be composed of " - "letters, digits, and underscores".format(name, attr), - ) - - ret_val.append(variable_naming.to_result()) - - for dimension in ds.dimensions: - dimension_naming.assert_true( - rname.match(dimension) is not None, - "dimension {} should begin with a latter and be composed of " - "letters, digits, and underscores".format(dimension), - ) - ret_val.append(dimension_naming.to_result()) - - for global_attr in ds.ncattrs(): - # Special attributes made by THREDDS - if global_attr.startswith("DODS"): - continue - if global_attr.startswith("EXTRA_DIMENSION"): - continue - attribute_naming.assert_true( - rname.match(global_attr) is not None, 
- "global attribute {} should begin with a letter and be composed of " - "letters, digits, and underscores".format(global_attr), - ) - ret_val.append(attribute_naming.to_result()) - - return ret_val - - def check_names_unique(self, ds): - """ - Checks the variable names for uniqueness regardless of case. - - CF §2.3 names should not be distinguished purely by case, i.e., if case - is disregarded, no two names should be the same. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: compliance_checker.base.Result - """ - fails = [] - total = len(ds.variables) - names = defaultdict(int) - - for k in ds.variables: - names[k.lower()] += 1 - - fails = [ - "Variables are not case sensitive. Duplicate variables named: %s" % k - for k, v in names.items() - if v > 1 - ] - return Result( - BaseCheck.MEDIUM, - (total - len(fails), total), - self.section_titles["2.3"], - msgs=fails, - ) - - def check_dimension_names(self, ds): - """ - Checks variables contain no duplicate dimension names. - - CF §2.4 A variable may have any number of dimensions, including zero, - and the dimensions must all have different names. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: compliance_checker.base.Result - """ - fails = [] - total = len(ds.variables) - - for k, v in ds.variables.items(): - dims = defaultdict(int) - for d in v.dimensions: - dims[d] += 1 - - for dimension, count in dims.items(): - if count > 1: - fails.append( - "%s has two or more dimensions named %s" % (k, dimension) - ) - - return Result( - BaseCheck.HIGH, - (total - len(fails), total), - self.section_titles["2.4"], - msgs=fails, - ) - - def check_dimension_order(self, ds): - """ - Checks each variable's dimension order to ensure that the order is - consistent and in order under CF §2.4 - - CF §2.4 If any or all of the dimensions of a variable have the - interpretations of "date or time" (T), "height or depth" (Z), - "latitude" (Y), or "longitude" (X) then we recommend, those dimensions - to appear in the relative order T, then Z, then Y, then X in the CDL - definition corresponding to the file. All other dimensions should, - whenever possible, be placed to the left of the spatiotemporal - dimensions. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: compliance_checker.base.Result - """ - valid_dimension_order = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.4"]) - # Build a map from coordinate variable to axis - coord_axis_map = self._get_coord_axis_map(ds) - - # Check each variable's dimension order, excluding climatology and - # bounds variables - any_clim = cfutil.get_climatology_variable(ds) - any_bounds = cfutil.get_cell_boundary_variables(ds) - for name, variable in ds.variables.items(): - # Skip bounds/climatology variables, as they should implicitly - # have the same order except for the bounds specific dimension. - # This is tested later in the respective checks - if name in any_bounds or name == any_clim: - continue - - # Skip strings/labels - if hasattr(variable.dtype, "char") and variable.dtype.char == "S": - continue - elif variable.dtype == str: - continue - - if variable.dimensions: - dimension_order = self._get_dimension_order(ds, name, coord_axis_map) - valid_dimension_order.assert_true( - self._dims_in_order(dimension_order), - "{}'s spatio-temporal dimensions are not in the " - "recommended order T, Z, Y, X and/or further dimensions " - "are not located left of T, Z, Y, X. 
The dimensions (and " - "their guessed types) are {} (with U: other/unknown; L: " - "unlimited).".format( - name, - self._get_pretty_dimension_order_with_type( - ds, name, dimension_order - ), - ), - ) - return valid_dimension_order.to_result() - - def check_fill_value_outside_valid_range(self, ds): - """ - Checks each variable's _FillValue to ensure that it's in valid_range or - between valid_min and valid_max according to CF §2.5.1 - - CF §2.5.1 The _FillValue should be outside the range specified by - valid_range (if used) for a variable. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of Results - """ - valid_fill_range = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.5"]) - - for name, variable in ds.variables.items(): - # If the variable doesn't have a defined _FillValue don't check it. - - if not hasattr(variable, "_FillValue"): - continue - - fill_value = variable._FillValue - - attrs = variable.ncattrs() - - if "valid_range" in attrs: - if isinstance(variable.valid_range, str): - m = "§2.5.1 Fill Values should be outside the range specified by valid_range" # subsection message - valid_fill_range.assert_true( - False, - "{};\n\t{}:valid_range must be a numeric type not a string".format( - m, name - ), - ) - continue - rmin, rmax = variable.valid_range - spec_by = "valid_range" - - elif "valid_min" in attrs and "valid_max" in attrs: - if isinstance(variable.valid_min, str): - valid_fill_range.assert_true( - False, - "{}:valid_min must be a numeric type not a string".format(name), - ) - if isinstance(variable.valid_max, str): - valid_fill_range.assert_true( - False, - "{}:valid_max must be a numeric type not a string".format(name), - ) - if isinstance(variable.valid_min, str) or isinstance( - variable.valid_max, str - ): - continue - rmin = variable.valid_min - rmax = variable.valid_max - spec_by = "valid_min/valid_max" - else: - continue - - if np.isnan(fill_value): - valid = True - else: - valid = fill_value < rmin or fill_value > rmax - - valid_fill_range.assert_true( - valid, - "{}:_FillValue ({}) should be outside the range specified by {} ({}, {})" - "".format(name, fill_value, spec_by, rmin, rmax), - ) - - return valid_fill_range.to_result() - - def check_convention_globals(self, ds): - """ - Check the common global attributes are strings if they exist. - - CF §2.6.2 title/history global attributes, must be strings. Do not need - to exist. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of Results - """ - attrs = ["title", "history"] - - valid_globals = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.6"]) - - for attr in attrs: - dataset_attr = getattr(ds, attr, None) - is_string = isinstance(dataset_attr, str) - valid_globals.assert_true( - is_string and len(dataset_attr), - "§2.6.2 global attribute {} should exist and be a non-empty string" # subsection message - "".format(attr), - ) - return valid_globals.to_result() - - def check_convention_possibly_var_attrs(self, ds): - """ - Check variable and global attributes are strings for recommended attributes under CF §2.6.2 - - CF §2.6.2 institution, source, references, and comment, either global - or assigned to individual variables. When an attribute appears both - globally and as a variable attribute, the variable's version has - precedence. Must be strings. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of Results - """ - # The attrs are optional and only needs to be a string and non-empty if it - # exists. 
- attrs = ["institution", "source", "references", "comment"] - - valid_attributes = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.6"]) - - attr_bin = set() - # If the attribute is defined for any variable, check it and mark in - # the set that we've seen it at least once. - for name, variable in ds.variables.items(): - for attribute in variable.ncattrs(): - varattr = getattr(variable, attribute) - if attribute in attrs: - is_string = isinstance(varattr, str) - valid_attributes.assert_true( - is_string and len(varattr) > 0, - "§2.6.2 {}:{} should be a non-empty string" - "".format(name, attribute), - ) - attr_bin.add(attribute) - - # Check all the global attributes too and mark if we've seen them - for attribute in ds.ncattrs(): - dsattr = getattr(ds, attribute) - if attribute in attrs: - is_string = isinstance(dsattr, str) - valid_attributes.assert_true( - is_string and len(dsattr) > 0, - "§2.6.2 {} global attribute should be a non-empty string" - "".format(attribute), - ) - attr_bin.add(attribute) - return valid_attributes.to_result() - - ############################################################################### - # Chapter 3: Description of the Data - ############################################################################### - - def check_units(self, ds): - """ - Check the units attribute for all variables to ensure they are CF - compliant under CF §3.1 - - CF §3.1 The units attribute is required for all variables that represent dimensional quantities - (except for boundary variables defined in Section 7.1, "Cell Boundaries" and climatology variables - defined in Section 7.4, "Climatological Statistics"). - - Units are not required for dimensionless quantities. A variable with no units attribute is assumed - to be dimensionless. However, a units attribute specifying a dimensionless unit may optionally be - included. - - - units required - - type must be recognized by udunits - - if standard name specified, must be consistent with standard name table, must also be consistent with a - specified cell_methods attribute if present - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - - ret_val = [] - - coordinate_variables = self._find_coord_vars(ds) - auxiliary_coordinates = self._find_aux_coord_vars(ds) - geophysical_variables = self._find_geophysical_vars(ds) - forecast_variables = cfutil.get_forecast_metadata_variables(ds) - - unit_required_variables = set( - coordinate_variables - + auxiliary_coordinates - + geophysical_variables - + forecast_variables - ) - - for name in unit_required_variables: - # For reduced horizontal grids, the compression index variable does - # not require units. 
- if cfutil.is_compression_coordinate(ds, name): - continue - - variable = ds.variables[name] - - # Skip instance coordinate variables - if getattr(variable, "cf_role", None) is not None: - continue - - # Skip labels - if ( - hasattr(variable.dtype, "char") and variable.dtype.char == "S" - ) or variable.dtype == str: - continue - - standard_name = getattr(variable, "standard_name", None) - standard_name, standard_name_modifier = self._split_standard_name( - standard_name - ) - - units = getattr(variable, "units", None) - - valid_units = self._check_valid_cf_units(ds, name) - ret_val.append(valid_units) - - units_attr_is_string = TestCtx(BaseCheck.MEDIUM, self.section_titles["3.1"]) - - # side effects, but better than teasing out the individual result - if units_attr_is_string.assert_true( - isinstance(units, str), - "units ({}) attribute of '{}' must be a string compatible with UDUNITS".format( - units, variable.name - ), - ): - valid_udunits = self._check_valid_udunits(ds, name) - ret_val.append(valid_udunits) - ret_val.append(units_attr_is_string.to_result()) - - if isinstance(standard_name, str): - valid_standard_units = self._check_valid_standard_units(ds, name) - ret_val.append(valid_standard_units) - - return ret_val - - def _check_valid_cf_units(self, ds, variable_name): - """ - Checks that the variable contains units attribute, the attribute is a - string and the value is not deprecated by CF - - :param netCDF4.Dataset ds: An open netCDF dataset - :param str variable_name: Name of the variable to be checked - :rtype: - :return: List of results - """ - - # This list is straight from section 3 - deprecated = ["level", "layer", "sigma_level"] - variable = ds.variables[variable_name] - - units = getattr(variable, "units", None) - standard_name_full = getattr(variable, "standard_name", None) - standard_name, standard_name_modifier = self._split_standard_name( - standard_name_full - ) - std_name_units_dimensionless = cfutil.is_dimensionless_standard_name( - self._std_names._root, standard_name - ) - # Is this even in the database? also, if there is no standard_name, - # there's no way to know if it is dimensionless. 
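The dimensionless guard assembled just below composes several exemptions; restated as a plain predicate (a sketch only, with std_name_units_dimensionless standing in for the standard-name-table lookup performed above):

    def needs_units(is_string_dtype, std_name_units_dimensionless, standard_name):
        # units are only required when no dimensionless exemption applies
        return not (
            is_string_dtype
            or std_name_units_dimensionless
            or standard_name is None
        )

    assert needs_units(False, False, "sea_water_temperature")
    assert not needs_units(True, False, None)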
- should_be_dimensionless = ( - variable.dtype is str - or (hasattr(variable.dtype, "char") and variable.dtype.char == "S") - or std_name_units_dimensionless - or standard_name is None - ) - - # 1) Units must exist - valid_units = TestCtx(BaseCheck.HIGH, self.section_titles["3.1"]) - valid_units.assert_true( - should_be_dimensionless or units is not None, - "units attribute is required for {} when variable is not a dimensionless quantity".format( - variable_name - ), - ) - - # Don't bother checking the rest - if units is None and not should_be_dimensionless: - return valid_units.to_result() - # 2) units attribute must be a string - valid_units.assert_true( - should_be_dimensionless or isinstance(units, str), - "units attribute for {} needs to be a string".format(variable_name), - ) - - # 3) units are not deprecated - valid_units.assert_true( - units not in deprecated, - 'units for {}, "{}" are deprecated by CF 1.6'.format(variable_name, units), - ) - - return valid_units.to_result() - - def _check_valid_udunits(self, ds, variable_name): - """ - Checks that the variable's units are contained in UDUnits - - :param netCDF4.Dataset ds: An open netCDF dataset - :param str variable_name: Name of the variable to be checked - """ - variable = ds.variables[variable_name] - - units = getattr(variable, "units", None) - standard_name = getattr(variable, "standard_name", None) - standard_name, standard_name_modifier = self._split_standard_name(standard_name) - std_name_units_dimensionless = cfutil.is_dimensionless_standard_name( - self._std_names._root, standard_name - ) - - # If the variable is supposed to be dimensionless, it automatically passes - should_be_dimensionless = ( - variable.dtype is str - or (hasattr(variable.dtype, "char") and variable.dtype.char == "S") - or std_name_units_dimensionless - ) - - valid_udunits = TestCtx(BaseCheck.HIGH, self.section_titles["3.1"]) - are_udunits = units is not None and util.units_known(units) - valid_udunits.assert_true( - should_be_dimensionless or are_udunits, - 'units for {}, "{}" are not recognized by UDUNITS'.format( - variable_name, units - ), - ) - return valid_udunits.to_result() - - def _check_valid_standard_units(self, ds, variable_name): - """ - Checks that the variable's units are appropriate for the standard name - according to the CF standard name table and coordinate sections in CF - 1.6 - - :param netCDF4.Dataset ds: An open netCDF dataset - :param str variable_name: Name of the variable to be checked - """ - variable = ds.variables[variable_name] - units = getattr(variable, "units", None) - standard_name = getattr(variable, "standard_name", None) - - valid_standard_units = TestCtx(BaseCheck.HIGH, self.section_titles["3.1"]) - - # If the variable is supposed to be dimensionless, it automatically passes - std_name_units_dimensionless = cfutil.is_dimensionless_standard_name( - self._std_names._root, standard_name - ) - - standard_name, standard_name_modifier = self._split_standard_name(standard_name) - - standard_entry = self._std_names.get(standard_name, None) - if standard_entry is not None: - canonical_units = standard_entry.canonical_units - else: - # Any unit comparisons with None returns False - canonical_units = None - - # Other standard_name modifiers have the same units as the - # unmodified standard name or are not checked for units. 
- - if standard_name_modifier == "number_of_observations": - canonical_units = "1" - - # This section represents the different cases where simple udunits - # comparison isn't comprehensive enough to determine if the units are - # appropriate under CF - - # UDUnits accepts "s" as a unit of time, but CF requires the - # "<units> since <reference time>" form - if standard_name == "time": - valid_standard_units.assert_true( - util.units_convertible(units, "seconds since 1970-01-01"), - "time units must be in the form '<units> since <reference time>', " - "not {}".format(units), - ) - - # UDunits can't tell the difference between east and north facing coordinates - elif standard_name == "latitude": - # degrees is allowed if using a transformed grid - allowed_units = cfutil.VALID_LAT_UNITS | {"degrees"} - valid_standard_units.assert_true( - units.lower() in allowed_units, - 'variables defining latitude ("{}") must use degrees_north ' - "or degrees if defining a transformed grid. Currently " - "{}".format(variable_name, units), - ) - # UDunits can't tell the difference between east and north facing coordinates - elif standard_name == "longitude": - # degrees is allowed if using a transformed grid - allowed_units = cfutil.VALID_LON_UNITS | {"degrees"} - valid_standard_units.assert_true( - units.lower() in allowed_units, - 'variables defining longitude ("{}") must use degrees_east ' - "or degrees if defining a transformed grid. Currently " - "{}".format(variable_name, units), - ) - # Standard Name table agrees the unit should be dimensionless - elif std_name_units_dimensionless: - valid_standard_units.assert_true(True, "") - - elif canonical_units is not None: - valid_standard_units.assert_true( - util.units_convertible(canonical_units, units), - "units for variable {} must be convertible to {}; " - "currently they are {}".format(variable_name, canonical_units, units), - ) - - return valid_standard_units.to_result() - - def check_standard_name(self, ds): - """ - Check a variable's standard_name attribute to ensure that it meets CF - compliance. - - CF §3.3 A standard name is associated with a variable via the attribute - standard_name which takes a string value comprised of a standard name - optionally followed by one or more blanks and a standard name modifier - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - - coord_vars = self._find_coord_vars(ds) - aux_coord_vars = self._find_aux_coord_vars(ds) - axis_vars = cfutil.get_axis_variables(ds) - flag_vars = cfutil.get_flag_variables(ds) - geophysical_vars = self._find_geophysical_vars(ds) - - variables_requiring_standard_names = ( - coord_vars + aux_coord_vars + axis_vars + flag_vars + geophysical_vars - ) - for name in set(variables_requiring_standard_names): - # Compression indices used in reduced horizontal grids or - # compression schemes do not require attributes other than compress - if cfutil.is_compression_coordinate(ds, name): - continue - - ncvar = ds.variables[name] - - # §9 doesn't explicitly allow instance variables as coordinates but - # it's loosely implied. Just in case, skip it. - if hasattr(ncvar, "cf_role"): - continue - - # Unfortunately, §6.1 allows for string types to be listed as - # coordinates.
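The modifier handling that follows relies on splitting a standard_name attribute into the name proper and an optional modifier; a minimal sketch of that split (the real helper is _split_standard_name on the base class and may differ in detail):

    def split_standard_name(value):
        # "air_temperature standard_error" -> ("air_temperature", "standard_error")
        if not isinstance(value, str) or " " not in value:
            return value, None
        return tuple(value.split(" ", 1))

    assert split_standard_name("air_temperature standard_error") == (
        "air_temperature",
        "standard_error",
    )
    assert split_standard_name("air_temperature") == ("air_temperature", None)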
- if hasattr(ncvar.dtype, "char") and ncvar.dtype.char == "S": - continue - elif ncvar.dtype == str: - continue - - standard_name = getattr(ncvar, "standard_name", None) - standard_name, standard_name_modifier = self._split_standard_name( - standard_name - ) - long_name = getattr(ncvar, "long_name", None) - long_or_std_name = TestCtx(BaseCheck.HIGH, self.section_titles["3.3"]) - if long_name is not None: - long_name_present = True - long_or_std_name.assert_true( - isinstance(long_name, str), - "Attribute long_name for variable {} must be a string".format(name), - ) - else: - long_name_present = False - # §1.3 The long_name and standard_name attributes are used to - # describe the content of each variable. For backwards - # compatibility with COARDS neither is required, but use of at - # least one of them is strongly recommended. - - # If standard_name is not defined but long_name is, don't continue - # the check for this variable - if standard_name is not None: - standard_name_present = True - valid_std_name = TestCtx(BaseCheck.HIGH, self.section_titles["3.3"]) - valid_std_name.assert_true( - isinstance(standard_name, str), - "Attribute standard_name for variable {} must be a string".format( - name - ), - ) - if isinstance(standard_name, str): - valid_std_name.assert_true( - standard_name in self._std_names, - "standard_name {} is not defined in Standard Name Table v{}".format( - standard_name or "undefined", self._std_names._version - ), - ) - - ret_val.append(valid_std_name.to_result()) - - # 2) optional - if modifiers, should be in table - if standard_name_modifier is not None: - valid_modifier = TestCtx(BaseCheck.HIGH, self.section_titles["3.3"]) - allowed = [ - "detection_minimum", - "number_of_observations", - "standard_error", - "status_flag", - ] - valid_modifier.assert_true( - standard_name_modifier in allowed, - "standard_name modifier {} for variable {} is not a valid modifier " - "according to appendix C".format(standard_name_modifier, name), - ) - - ret_val.append(valid_modifier.to_result()) - else: - standard_name_present = False - - long_or_std_name.assert_true( - long_name_present or standard_name_present, - "Attribute long_name or/and standard_name is highly recommended for variable {}".format( - name - ), - ) - ret_val.append(long_or_std_name.to_result()) - return ret_val - - def check_ancillary_variables(self, ds): - """ - Checks the ancillary_variable attribute for all variables to ensure - they are CF compliant. - - CF §3.4 It is a string attribute whose value is a blank separated list - of variable names. The nature of the relationship between variables - associated via ancillary_variables must be determined by other - attributes. The variables listed by the ancillary_variables attribute - will often have the standard name of the variable which points to them - including a modifier (Appendix C, Standard Name Modifiers) to indicate - the relationship. 
- - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - - for ncvar in ds.get_variables_by_attributes( - ancillary_variables=lambda x: x is not None - ): - name = ncvar.name - valid_ancillary = TestCtx(BaseCheck.HIGH, self.section_titles["3.4"]) - ancillary_variables = ncvar.ancillary_variables - - valid_ancillary.assert_true( - isinstance(ancillary_variables, str), - "ancillary_variables attribute defined by {} " - "should be string".format(name), - ) - - # Can't perform the second check if it's not a string - if not isinstance(ancillary_variables, str): - ret_val.append(valid_ancillary.to_result()) - continue - - for ancillary_variable in ancillary_variables.split(): - valid_ancillary.assert_true( - ancillary_variable in ds.variables, - "{} is not a variable in this dataset".format(ancillary_variable), - ) - - ret_val.append(valid_ancillary.to_result()) - - return ret_val - - def check_flags(self, ds): - """ - Check the flag_values, flag_masks and flag_meanings attributes for - variables to ensure they are CF compliant. - - CF §3.5 The attributes flag_values, flag_masks and flag_meanings are - intended to make variables that contain flag values self describing. - Status codes and Boolean (binary) condition flags may be expressed with - different combinations of flag_values and flag_masks attribute - definitions. - - The flag_values and flag_meanings attributes describe a status flag - consisting of mutually exclusive coded values. - - The flag_meanings attribute is a string whose value is a blank - separated list of descriptive words or phrases, one for each flag - value. Each word or phrase should consist of characters from the - alphanumeric set and the following five: '_', '-', '.', '+', '@'. - - The flag_masks and flag_meanings attributes describe a number of - independent Boolean conditions using bit field notation by setting - unique bits in each flag_masks value. - - The flag_masks, flag_values and flag_meanings attributes, used - together, describe a blend of independent Boolean conditions and - enumerated status codes. A flagged condition is identified by a bitwise - AND of the variable value and each flag_masks value; a result that - matches the flag_values value indicates a true condition. 
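The bitwise semantics described above are compact in code; a small numpy sketch with made-up mask/value pairs:

    import numpy as np

    flag_masks = np.array([0b0011, 0b1100], dtype=np.int8)
    flag_values = np.array([0b0001, 0b0100], dtype=np.int8)

    sample = np.int8(0b0101)
    # a condition holds when (sample & mask) == value; note that each value
    # must itself be consistent with its mask (value & mask == value)
    conditions = [bool((sample & m) == v) for m, v in zip(flag_masks, flag_values)]
    assert conditions == [True, True]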
- - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - - for name in cfutil.get_flag_variables(ds): - variable = ds.variables[name] - flag_values = getattr(variable, "flag_values", None) - flag_masks = getattr(variable, "flag_masks", None) - - valid_flags_var = TestCtx(BaseCheck.HIGH, self.section_titles["3.5"]) - # Check that the variable defines mask or values - valid_flags_var.assert_true( - flag_values is not None or flag_masks is not None, - "{} does not define either flag_masks or flag_values".format(name), - ) - ret_val.append(valid_flags_var.to_result()) - - valid_meanings = self._check_flag_meanings(ds, name) - ret_val.append(valid_meanings) - - # check flag_values - if flag_values is not None: - valid_values = self._check_flag_values(ds, name) - ret_val.append(valid_values) - - # check flag_masks - if flag_masks is not None: - valid_masks = self._check_flag_masks(ds, name) - ret_val.append(valid_masks) - - if flag_values is not None and flag_masks is not None: - allv = [ - value & mask == value - for value, mask in zip(flag_values, flag_masks) - ] - - allvr = Result(BaseCheck.MEDIUM, all(allv), self.section_titles["3.5"]) - if not allvr.value: - allvr.msgs = [ - "flag masks and flag values for '{}' combined don't equal the flag values".format( - name - ) - ] - - ret_val.append(allvr) - - return ret_val - - def _check_flag_values(self, ds, name): - """ - Checks a variable's flag_values attribute for compliance under CF - - - flag_values exists as an array - - unique elements in flag_values - - flag_values is the same dtype as the variable - - flag_values is the same length as flag_meanings - - :param netCDF4.Dataset ds: An open netCDF dataset - :param str name: Name of variable to check - :rtype: compliance_checker.base.Result - """ - variable = ds.variables[name] - - flag_values = getattr(variable, "flag_values", None) - flag_meanings = getattr(variable, "flag_meanings", None) - valid_values = TestCtx(BaseCheck.HIGH, self.section_titles["3.5"]) - - # flag_values must be a list of values, not a string or anything else - valid_values.assert_true( - isinstance(flag_values, np.ndarray), - "{}'s flag_values must be an array of values not {}".format( - name, type(flag_values) - ), - ) - - # We can't perform any more checks - if not isinstance(flag_values, np.ndarray): - return valid_values.to_result() - - # the flag values must be independent, no repeating values - flag_set = set(flag_values) - valid_values.assert_true( - len(flag_set) == len(flag_values), - "{}'s flag_values must be independent and can not be repeated".format(name), - ) - - # the data type for flag_values should be the same as the variable - valid_values.assert_true( - variable.dtype.type == flag_values.dtype.type, - "flag_values ({}) must be the same data type as {} ({})" - "".format(flag_values.dtype.type, name, variable.dtype.type), - ) - - if isinstance(flag_meanings, str): - flag_meanings = flag_meanings.split() - valid_values.assert_true( - len(flag_meanings) == len(flag_values), - "{}'s flag_meanings and flag_values should have the same number ".format( - name - ) - + "of elements.", - ) - - return valid_values.to_result() - - def _check_flag_masks(self, ds, name): - """ - Check a variable's flag_masks attribute for compliance under CF - - - flag_masks exists as an array - - flag_masks is the same dtype as the variable - - variable's dtype can support bit-field - - flag_masks is the same length as flag_meanings - - :param netCDF4.Dataset ds: An open netCDF
dataset - :param str name: Variable name - :rtype: compliance_checker.base.Result - """ - variable = ds.variables[name] - - flag_masks = variable.flag_masks - flag_meanings = getattr(variable, "flag_meanings", None) - - valid_masks = TestCtx(BaseCheck.HIGH, self.section_titles["3.5"]) - - valid_masks.assert_true( - isinstance(flag_masks, np.ndarray), - "{}'s flag_masks must be an array of values not {}".format( - name, type(flag_masks).__name__ - ), - ) - - if not isinstance(flag_masks, np.ndarray): - return valid_masks.to_result() - - valid_masks.assert_true( - variable.dtype.type == flag_masks.dtype.type, - "flag_masks ({}) must be the same data type as {} ({})" - "".format(flag_masks.dtype.type, name, variable.dtype.type), - ) - - type_ok = ( - np.issubdtype(variable.dtype, np.integer) - or np.issubdtype(variable.dtype, "S") - or np.issubdtype(variable.dtype, "b") - ) - - valid_masks.assert_true( - type_ok, - "{}'s data type must be capable of bit-field expression".format(name), - ) - - if isinstance(flag_meanings, str): - flag_meanings = flag_meanings.split() - valid_masks.assert_true( - len(flag_meanings) == len(flag_masks), - "{}'s flag_meanings and flag_masks should have the same number ".format( - name - ) - + "of elements.", - ) - - return valid_masks.to_result() - - def _check_flag_meanings(self, ds, name): - """ - Check a variable's flag_meanings attribute for compliance under CF - - - flag_meanings exists - - flag_meanings is a string - - flag_meanings elements are valid strings - - :param netCDF4.Dataset ds: An open netCDF dataset - :param str name: Variable name - :rtype: compliance_checker.base.Result - """ - variable = ds.variables[name] - flag_meanings = getattr(variable, "flag_meanings", None) - valid_meanings = TestCtx(BaseCheck.HIGH, self.section_titles["3.5"]) - - valid_meanings.assert_true( - flag_meanings is not None, - "{}'s flag_meanings attribute is required for flag variables".format(name), - ) - - valid_meanings.assert_true( - isinstance(flag_meanings, str), - "{}'s flag_meanings attribute must be a string".format(name), - ) - - # We can't perform any additional checks if it's not a string - if not isinstance(flag_meanings, str): - return valid_meanings.to_result() - - valid_meanings.assert_true( - len(flag_meanings) > 0, "{}'s flag_meanings can't be empty".format(name) - ) - - flag_regx = regex.compile(r"^[0-9A-Za-z_\-.+@]+$") - meanings = flag_meanings.split() - for meaning in meanings: - if flag_regx.match(meaning) is None: - valid_meanings.assert_true( - False, - "{}'s flag_meanings attribute defined an illegal flag meaning ".format( - name - ) - + "{}".format(meaning), - ) - return valid_meanings.to_result() - - ############################################################################### - # Chapter 4: Coordinate Types - ############################################################################### - - def check_coordinate_types(self, ds): - """ - Check the axis attribute of coordinate variables - - CF §4 The attribute axis may be attached to a coordinate variable and - given one of the values X, Y, Z or T which stand for a longitude, - latitude, vertical, or time axis respectively. Alternatively the - standard_name attribute may be used for direct identification.
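As a concrete instance of the §4 rule above, a time coordinate carrying an axis attribute (a sketch using netCDF4 with a diskless in-memory file; names are illustrative):

    import netCDF4

    ds = netCDF4.Dataset("axis_demo.nc", "w", diskless=True)
    ds.createDimension("time", 4)
    time = ds.createVariable("time", "f8", ("time",))
    time.units = "hours since 2000-01-01"
    time.axis = "T"  # must be one of T, X, Y, Z
    ds.close()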
- - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - - for variable in ds.get_variables_by_attributes(axis=lambda x: x is not None): - name = variable.name - # Coordinate compressions should not be checked as a valid - # coordinate, which they are not. They are a mechanism to project - # an array of indices onto a 2-d grid containing valid coordinates. - if cfutil.is_compression_coordinate(ds, name): - continue - - variable = ds.variables[name] - # Even though it's not allowed in CF 1.6, it is allowed in CF 1.7 - # and we see people do it, often. - if hasattr(variable, "cf_role"): - continue - - # §6.1 allows for labels to be referenced as auxiliary coordinate - # variables, which should not be checked like the rest of the - # coordinates. - if hasattr(variable.dtype, "char") and variable.dtype.char == "S": - continue - elif variable.dtype == str: - continue - - axis = getattr(variable, "axis", None) - - if axis is not None: - valid_axis = self._check_axis(ds, name) - ret_val.append(valid_axis) - - return ret_val - - def _check_axis(self, ds, name): - """ - Checks that the axis attribute is a string and an allowed value, namely - one of 'T', 'X', 'Y', or 'Z'. - - :param netCDF4.Dataset ds: An open netCDF dataset - :param str name: Name of the variable - :rtype: compliance_checker.base.Result - """ - allowed_axis = ["T", "X", "Y", "Z"] - variable = ds.variables[name] - axis = variable.axis - - valid_axis = TestCtx(BaseCheck.HIGH, self.section_titles["4"]) - axis_is_string = isinstance(axis, str) - valid_axis.assert_true( - axis_is_string and len(axis) > 0, - "{}'s axis attribute must be a non-empty string".format(name), - ) - - # If axis isn't a string we can't continue any checks - if not axis_is_string or len(axis) == 0: - return valid_axis.to_result() - - valid_axis.assert_true( - axis in allowed_axis, - "{}'s axis attribute must be T, X, Y, or Z, ".format(name) - + "currently {}".format(axis), - ) - - return valid_axis.to_result() - - def check_latitude(self, ds): - """ - Check variable(s) that define latitude and are defined correctly according to CF. - - CF §4.1 Variables representing latitude must always explicitly include - the units attribute; there is no default value. The recommended unit - of latitude is degrees_north. Also acceptable are degree_north, - degree_N, degrees_N, degreeN, and degreesN. - - Optionally, the latitude type may be indicated additionally by - providing the standard_name attribute with the value latitude, and/or - the axis attribute with the value Y.
- - - Four checks per latitude variable - - (H) latitude has units attribute - - (M) latitude has an allowed units attribute - - (L) latitude uses degrees_north (if not in rotated pole) - - (M) latitude defines either standard_name or axis - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - - allowed_lat_units = [ - "degrees_north", - "degree_north", - "degree_n", - "degrees_n", - "degreen", - "degreesn", - ] - - # Determine the grid mappings in this dataset - grid_mapping = [] - grid_mapping_variables = cfutil.get_grid_mapping_variables(ds) - for name in grid_mapping_variables: - variable = ds.variables[name] - grid_mapping_name = getattr(variable, "grid_mapping_name", None) - if grid_mapping_name: - grid_mapping.append(grid_mapping_name) - - latitude_variables = cfutil.get_latitude_variables(ds) - for latitude in latitude_variables: - variable = ds.variables[latitude] - units = getattr(variable, "units", None) - units_is_string = isinstance(units, str) - standard_name = getattr(variable, "standard_name", None) - axis = getattr(variable, "axis", None) - - # Check that latitude defines units - valid_latitude = TestCtx(BaseCheck.HIGH, self.section_titles["4.1"]) - valid_latitude.assert_true( - units is not None, - "latitude variable '{}' must define units".format(latitude), - ) - ret_val.append(valid_latitude.to_result()) - - # Check that latitude uses allowed units - allowed_units = TestCtx(BaseCheck.MEDIUM, self.section_titles["4.1"]) - if standard_name == "grid_latitude": - e_n_units = cfutil.VALID_LAT_UNITS | cfutil.VALID_LON_UNITS - # check that the units aren't in east and north degrees units, - # but are convertible to angular units - allowed_units.assert_true( - units not in e_n_units and Unit(units) == Unit("degree"), - "Grid latitude variable '{}' should use degree equivalent units without east or north components. " - "Current units are {}".format(latitude, units), - ) - else: - allowed_units.assert_true( - units_is_string and units.lower() in allowed_lat_units, - "latitude variable '{}' should define valid units for latitude" - "".format(latitude), - ) - ret_val.append(allowed_units.to_result()) - - # Check that latitude uses degrees_north - if standard_name == "latitude" and units != "degrees_north": - # This is only a recommendation and we won't penalize but we - # will include a recommended action. - msg = ( - "CF recommends latitude variable '{}' to use units degrees_north" - "".format(latitude) - ) - recommended_units = Result( - BaseCheck.LOW, (1, 1), self.section_titles["4.1"], [msg] - ) - ret_val.append(recommended_units) - - y_variables = ds.get_variables_by_attributes(axis="Y") - # Check that latitude defines either standard_name or axis - definition = TestCtx(BaseCheck.MEDIUM, self.section_titles["4.1"]) - definition.assert_true( - standard_name == "latitude" or axis == "Y" or y_variables != [], - "latitude variable '{}' should define standard_name='latitude' or axis='Y'" - "".format(latitude), - ) - ret_val.append(definition.to_result()) - - return ret_val - - def check_longitude(self, ds): - """ - Check variable(s) that define longitude and are defined correctly according to CF. - - CF §4.2 Variables representing longitude must always explicitly include - the units attribute; there is no default value. The recommended unit - of longitude is degrees_east. Also acceptable are degree_east, - degree_E, degrees_E, degreeE, and degreesE. 
- - Optionally, the longitude type may be indicated additionally by - providing the standard_name attribute with the value longitude, and/or - the axis attribute with the value X. - - - Four checks per longitude variable - - (H) longitude has units attribute - - (M) longitude has an allowed units attribute - - (L) longitude uses degrees_east (if not in rotated pole) - - (M) longitude defines either standard_name or axis - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - - # TODO we already have a check_latitude... I'm sure we can make DRYer - - ret_val = [] - allowed_lon_units = [ - "degrees_east", - "degree_east", - "degree_e", - "degrees_e", - "degreee", - "degreese", - ] - - # Determine the grid mappings in this dataset - grid_mapping = [] - grid_mapping_variables = cfutil.get_grid_mapping_variables(ds) - for name in grid_mapping_variables: - variable = ds.variables[name] - grid_mapping_name = getattr(variable, "grid_mapping_name", None) - if grid_mapping_name: - grid_mapping.append(grid_mapping_name) - - longitude_variables = cfutil.get_longitude_variables(ds) - for longitude in longitude_variables: - variable = ds.variables[longitude] - units = getattr(variable, "units", None) - units_is_string = isinstance(units, str) - standard_name = getattr(variable, "standard_name", None) - axis = getattr(variable, "axis", None) - - # NOTE see docstring--should below be 4.1 or 4.2? - # Check that longitude defines units - valid_longitude = TestCtx(BaseCheck.HIGH, self.section_titles["4.2"]) - valid_longitude.assert_true( - units is not None, - "longitude variable '{}' must define units".format(longitude), - ) - ret_val.append(valid_longitude.to_result()) - - # Check that longitude uses allowed units - allowed_units = TestCtx(BaseCheck.MEDIUM, self.section_titles["4.2"]) - if standard_name == "grid_longitude": - e_n_units = cfutil.VALID_LAT_UNITS | cfutil.VALID_LON_UNITS - # check that the units aren't in east and north degrees units, - # but are convertible to angular units - allowed_units.assert_true( - units not in e_n_units and Unit(units) == Unit("degree"), - "Grid longitude variable '{}' should use degree equivalent units without east or north components. " - "Current units are {}".format(longitude, units), - ) - else: - allowed_units.assert_true( - units_is_string and units.lower() in allowed_lon_units, - "longitude variable '{}' should define valid units for longitude" - "".format(longitude), - ) - ret_val.append(allowed_units.to_result()) - - # Check that longitude uses degrees_east - if standard_name == "longitude" and units != "degrees_east": - # This is only a recommendation and we won't penalize but we - # will include a recommended action. 
- msg = ( - "CF recommends longitude variable '{}' to use units degrees_east" - "".format(longitude) - ) - recommended_units = Result( - BaseCheck.LOW, (1, 1), self.section_titles["4.2"], [msg] - ) - ret_val.append(recommended_units) - - x_variables = ds.get_variables_by_attributes(axis="X") - # Check that longitude defines either standard_name or axis - definition = TestCtx(BaseCheck.MEDIUM, self.section_titles["4.2"]) - definition.assert_true( - standard_name == "longitude" or axis == "X" or x_variables != [], - "longitude variable '{}' should define standard_name='longitude' or axis='X'" - "".format(longitude), - ) - ret_val.append(definition.to_result()) - - return ret_val - - def check_dimensional_vertical_coordinate( - self, ds, dimless_vertical_coordinates=dimless_vertical_coordinates_1_6 - ): - """ - Check units for variables defining vertical position are valid under - CF. - - CF §4.3.1 The units attribute for dimensional coordinates will be a string - formatted as per the udunits.dat file. - - The acceptable units for vertical (depth or height) coordinate variables - are: - - units of pressure as listed in the file udunits.dat. For vertical axes - the most commonly used of these include bar, millibar, - decibar, atmosphere (atm), pascal (Pa), and hPa. - - units of length as listed in the file udunits.dat. For vertical axes - the most commonly used of these include meter (metre, m), and - kilometer (km). - - other units listed in the file udunits.dat that may under certain - circumstances reference vertical position such as units of density or - temperature. - - Plural forms are also acceptable. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - z_variables = cfutil.get_z_variables(ds) - # dimless_standard_names = [name for name, regx in dimless_vertical_coordinates] - for name in z_variables: - variable = ds.variables[name] - standard_name = getattr(variable, "standard_name", None) - units = getattr(variable, "units", None) - positive = getattr(variable, "positive", None) - # Skip the variable if it's dimensionless - if ( - hasattr(variable, "formula_terms") - or standard_name in dimless_vertical_coordinates - ): - continue - - valid_vertical_coord = TestCtx(BaseCheck.HIGH, self.section_titles["4.3"]) - valid_vertical_coord.assert_true( - isinstance(units, str) and units, - "§4.3.1 {}'s units must be defined for vertical coordinates, " - "there is no default".format(name), - ) - - if not util.units_convertible("bar", units): - valid_vertical_coord.assert_true( - positive in ("up", "down"), - "{}: vertical coordinates not defining pressure must include " - "a positive attribute that is either 'up' or 'down'".format(name), - ) - - # _check_valid_standard_units, part of the Chapter 3 checks, - # already verifies that this coordinate has valid units - - ret_val.append(valid_vertical_coord.to_result()) - - return ret_val - - def _check_dimensionless_vertical_coordinate_1_6( - self, ds, vname, deprecated_units, ret_val, dim_vert_coords_dict - ): - """ - Check that a dimensionless vertical coordinate variable is valid under - CF-1.6.
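For reference, the shape of a dimensionless vertical coordinate the 1.6 checks below expect (a netCDF4 sketch; the eta and depth variables would normally also exist in the file, and the term names come from Appendix D):

    import netCDF4

    ds = netCDF4.Dataset("sigma_demo.nc", "w", diskless=True)
    ds.createDimension("sigma", 5)
    sigma = ds.createVariable("sigma", "f8", ("sigma",))
    # no units: identified as a dimensionless vertical coordinate by name
    sigma.standard_name = "ocean_sigma_coordinate"
    sigma.positive = "up"
    # maps the Appendix D definition terms to variables in this file
    sigma.formula_terms = "sigma: sigma eta: eta depth: depth"
    ds.close()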
- - :param netCDF4.Dataset ds: open netCDF4 dataset - :param str name: variable name - :param list ret_val: array to append Results to - :rtype None - """ - variable = ds.variables[vname] - standard_name = getattr(variable, "standard_name", None) - units = getattr(variable, "units", None) - formula_terms = getattr(variable, "formula_terms", None) - # Skip the variable if it's dimensional - if formula_terms is None and standard_name not in dim_vert_coords_dict: - return - - is_not_deprecated = TestCtx(BaseCheck.LOW, self.section_titles["4.3"]) - - is_not_deprecated.assert_true( - units not in deprecated_units, - "§4.3.2: units are deprecated by CF in variable {}: {}" - "".format(vname, units), - ) - - # check the vertical coordinates - ret_val.append(is_not_deprecated.to_result()) - ret_val.append(self._check_formula_terms(ds, vname, dim_vert_coords_dict)) - - def check_dimensionless_vertical_coordinates(self, ds): - """ - Check the validity of dimensionless coordinates under CF - - CF §4.3.2 The units attribute is not required for dimensionless - coordinates. - - The standard_name attribute associates a coordinate with its definition - from Appendix D, Dimensionless Vertical Coordinates. The definition - provides a mapping between the dimensionless coordinate values and - dimensional values that can positively and uniquely indicate the - location of the data. - - A new attribute, formula_terms, is used to associate terms in the - definitions with variables in a netCDF file. To maintain backwards - compatibility with COARDS the use of these attributes is not required, - but is strongly recommended. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - - z_variables = cfutil.get_z_variables(ds) - deprecated_units = ["level", "layer", "sigma_level"] - - ret_val.extend( - self._check_dimensionless_vertical_coordinates( - ds, - deprecated_units, - self._check_dimensionless_vertical_coordinate_1_6, - dimless_vertical_coordinates_1_6, - ) - ) - - return ret_val - - def check_time_coordinate(self, ds): - """ - Check variables defining time are valid under CF - - CF §4.4 Variables representing time must always explicitly include the - units attribute; there is no default value. - - The units attribute takes a string value formatted as per the - recommendations in the Udunits package. - - The acceptable units for time are listed in the udunits.dat file. The - most commonly used of these strings (and their abbreviations) includes - day (d), hour (hr, h), minute (min) and second (sec, s). Plural forms - are also acceptable. The reference time string (appearing after the - identifier since) may include date alone; date and time; or date, time, - and time zone. The reference time is required. A reference time in year - 0 has a special meaning (see Section 7.4, "Climatological Statistics"). - - Recommend that the unit year be used with caution. It is not a calendar - year. For similar reasons the unit month should also be used with - caution. - - A time coordinate is identifiable from its units string alone. - Optionally, the time coordinate may be indicated additionally by - providing the standard_name attribute with an appropriate value, and/or - the axis attribute with the value T. 
- - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - - ret_val = [] - for name in cfutil.get_time_variables(ds): - variable = ds.variables[name] - # Has units - has_units = hasattr(variable, "units") - if not has_units: - result = Result( - BaseCheck.HIGH, - False, - self.section_titles["4.4"], - ["%s does not have units" % name], - ) - ret_val.append(result) - continue - # Correct and identifiable units - result = Result(BaseCheck.HIGH, True, self.section_titles["4.4"]) - ret_val.append(result) - correct_units = util.units_temporal(variable.units) - reasoning = None - if not correct_units: - reasoning = ["%s does not have correct time units" % name] - result = Result( - BaseCheck.HIGH, correct_units, self.section_titles["4.4"], reasoning - ) - ret_val.append(result) - - return ret_val - - def check_calendar(self, ds): - """ - Check the calendar attribute for variables defining time and ensure it - is a valid calendar prescribed by CF. - - CF §4.4.1 In order to calculate a new date and time given a base date, base - time and a time increment one must know what calendar to use. - - The values currently defined for calendar are: - - gregorian or standard - - proleptic_gregorian - - noleap or 365_day - - all_leap or 366_day - - 360_day - - julian - - none - - The calendar attribute may be set to none in climate experiments that - simulate a fixed time of year. - The time of year is indicated by the date in the reference time of the - units attribute. - - If none of the calendars defined above applies, a non-standard calendar - can be defined. The lengths of each month are explicitly defined with - the month_lengths attribute of the time axis. - - If leap years are included, then two other attributes of the time axis - should also be defined: - - leap_year, leap_month - - The calendar attribute is not required when a non-standard calendar is - being used. It is sufficient to define the calendar using the - month_lengths attribute, along with leap_year, and leap_month as - appropriate. However, the calendar attribute is allowed to take - non-standard values and in that case defining the non-standard calendar - using the appropriate attributes is required. 
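A passing configuration for the calendar rules above, as a netCDF4 sketch (values illustrative):

    import netCDF4

    ds = netCDF4.Dataset("calendar_demo.nc", "w", diskless=True)
    ds.createDimension("time", 2)
    time = ds.createVariable("time", "f8", ("time",))
    time.units = "days since 1850-01-01"
    time.calendar = "noleap"  # any entry from valid_calendars below passes
    ds.close()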
- - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - valid_calendars = [ - "gregorian", - "standard", - "proleptic_gregorian", - "noleap", - "365_day", - "all_leap", - "366_day", - "360_day", - "julian", - "none", - ] - - ret_val = [] - - # if has a calendar, check that it is within the valid values - # otherwise no calendar is valid - for time_var in ds.get_variables_by_attributes( - calendar=lambda c: c is not None - ): - reasoning = None - valid_calendar = time_var.calendar in valid_calendars - - if not valid_calendar: - reasoning = [ - "§4.4.1 Variable %s should have a valid calendar: '%s' is not a valid calendar" - % (time_var.name, time_var.calendar) - ] - - # passes if the calendar is valid, otherwise notify of invalid - # calendar - - result = Result( - BaseCheck.LOW, valid_calendar, self.section_titles["4.4"], reasoning - ) - ret_val.append(result) - - return ret_val - - ############################################################################### - # Chapter 5: Coordinate Systems - ############################################################################### - - def check_aux_coordinates(self, ds): - """ - Chapter 5 paragraph 3 - - The dimensions of an auxiliary coordinate variable must be a subset of - the dimensions of the variable with which the coordinate is associated, - with two exceptions. First, string-valued coordinates (Section 6.1, - "Labels") have a dimension for maximum string length. Second, in the - ragged array representations of data (Chapter 9, Discrete Sampling - Geometries), special methods are needed to connect the data and - coordinates. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - - ret_val = [] - - # for contiguous ragged array/indexed ragged array representations, - # coordinates are not required to adhere to the same principles; - # these representations can be identified by two attributes: - - # required for contiguous - count_vars = ds.get_variables_by_attributes( - sample_dimension=lambda x: x is not None - ) - - # required for indexed - index_vars = ds.get_variables_by_attributes( - instance_dimension=lambda x: x is not None - ) - - # if these attributes exist, we don't need to test - # the coordinates - if count_vars or index_vars: - return ret_val - - geophysical_variables = self._find_geophysical_vars(ds) - for name in geophysical_variables: - variable = ds.variables[name] - coordinates = getattr(variable, "coordinates", None) - # We use a set so we can assert - dim_set = set(variable.dimensions) - # No auxiliary coordinates, no check - if not isinstance(coordinates, str) or coordinates == "": - continue - - valid_aux_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5"]) - - for aux_coord in coordinates.split(): - valid_aux_coords.assert_true( - aux_coord in ds.variables, - "{}'s auxiliary coordinate specified by the coordinates attribute, {}, " - "is not a variable in this dataset" - "".format(name, aux_coord), - ) - if aux_coord not in ds.variables: - continue - - # §6.1 Allows for "labels" to be referenced as coordinates - if ( - hasattr(ds.variables[aux_coord].dtype, "char") - and ds.variables[aux_coord].dtype.char == "S" - ): - continue - elif ds.variables[aux_coord].dtype == str: - continue - - aux_coord_dims = set(ds.variables[aux_coord].dimensions) - valid_aux_coords.assert_true( - aux_coord_dims.issubset(dim_set), - "dimensions for auxiliary coordinate variable {} ({}) " - "are not a subset of dimensions for variable {}
({})" - "".format( - aux_coord, ", ".join(aux_coord_dims), name, ", ".join(dim_set) - ), - ) - ret_val.append(valid_aux_coords.to_result()) - return ret_val - - def check_duplicate_axis(self, ds): - """ - Checks that no variable contains two coordinates defining the same - axis. - - Chapter 5 paragraph 6 - - If an axis attribute is attached to an auxiliary coordinate variable, - it can be used by applications in the same way the `axis` attribute - attached to a coordinate variable is used. However, it is not - permissible for a [geophysical variable] to have both a coordinate - variable and an auxiliary coordinate variable, or more than one of - either type of variable, having an `axis` attribute with any given - value e.g. there must be no more than one axis attribute for X for any - [geophysical variable]. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: compliance_checker.base.Result - :return: List of results - """ - - ret_val = [] - geophysical_variables = self._find_geophysical_vars(ds) - for name in geophysical_variables: - no_duplicates = TestCtx(BaseCheck.HIGH, self.section_titles["5"]) - axis_map = cfutil.get_axis_map(ds, name) - axes = [] - # For every coordinate associated with this variable, keep track of - # which coordinates define an axis and assert that there are no - # duplicate axis attributes defined in the set of associated - # coordinates. axis_map includes coordinates that don't actually have - # an axis attribute, so we need to ignore those here. - for axis, coords in axis_map.items(): - coords = [c for c in coords if hasattr(ds.variables[c], "axis")] - no_duplicates.assert_true( - len(coords) <= 1, - "'{}' has duplicate axis {} defined by [{}]".format( - name, axis, ", ".join(sorted(coords)) - ), - ) - - ret_val.append(no_duplicates.to_result()) - - return ret_val - - def check_multi_dimensional_coords(self, ds): - """ - Checks that no multidimensional coordinate shares a name with its - dimensions. - - Chapter 5 paragraph 4 - - We recommend that the name of a [multidimensional coordinate] should - not match the name of any of its dimensions. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - - # This can only apply to auxiliary coordinate variables - for coord in self._find_aux_coord_vars(ds): - variable = ds.variables[coord] - if variable.ndim < 2: - continue - not_matching = TestCtx(BaseCheck.MEDIUM, self.section_titles["5"]) - - not_matching.assert_true( - coord not in variable.dimensions, - "{} shares the same name as one of its dimensions" "".format(coord), - ) - ret_val.append(not_matching.to_result()) - - return ret_val - - # NOTE ********** - # IS THIS EVEN NEEDED ANYMORE? - # *************** - def check_grid_coordinates(self, ds): - # def _check_grid_coordinates(self, ds): - """ - 5.6 When the coordinate variables for a horizontal grid are not - longitude and latitude, it is required that the true latitude and - longitude coordinates be supplied via the coordinates attribute. 
- - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - latitudes = cfutil.get_true_latitude_variables(ds) - longitudes = cfutil.get_true_longitude_variables(ds) - - check_features = [ - "2d-regular-grid", - "2d-static-grid", - "3d-regular-grid", - "3d-static-grid", - "mapped-grid", - "reduced-grid", - ] - - # This one is tricky because there's a very subtle difference between - # latitude as defined in Chapter 4 and "true" latitude as defined in - # chapter 5. - - # For each geophysical variable that defines a grid, assert it is - # associated with a true latitude or longitude coordinate. - - for variable in self._find_geophysical_vars(ds): - # We use a set so we can do set-wise comparisons with coordinate - # dimensions - dimensions = set(ds.variables[variable].dimensions) - # If it's not a grid, skip it - if cfutil.guess_feature_type(ds, variable) not in check_features: - continue - has_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5.6"]) - - # axis_map is a defaultdict(list) mapping the axis to a list of - # coordinate names. For example: - # {'X': ['lon'], 'Y':['lat'], 'Z':['lev']} - # The mapping comes from the dimensions of the variable and the - # contents of the `coordinates` attribute only. - axis_map = cfutil.get_axis_map(ds, variable) - - msg = ( - '{}\'s coordinate variable "{}" is not one of the variables identifying true ' - + "latitude/longitude and its dimensions are not a subset of {}'s dimensions" - ) - - alt = ( - "{} has no coordinate associated with a variable identified as true latitude/longitude; " - + "its coordinate variable should also share a subset of {}'s dimensions" - ) - - # Make sure we can find latitude and its dimensions are a subset - _lat = None - found_lat = False - for lat in axis_map["Y"]: - _lat = lat - is_subset_dims = set(ds.variables[lat].dimensions).issubset(dimensions) - - if is_subset_dims and lat in latitudes: - found_lat = True - break - if _lat: - has_coords.assert_true(found_lat, msg.format(variable, _lat, variable)) - else: - has_coords.assert_true(found_lat, alt.format(variable, variable)) - - # Make sure we can find longitude and its dimensions are a subset - _lon = None - found_lon = False - for lon in axis_map["X"]: - _lon = lon - is_subset_dims = set(ds.variables[lon].dimensions).issubset(dimensions) - - if is_subset_dims and lon in longitudes: - found_lon = True - break - if _lon: - has_coords.assert_true(found_lon, msg.format(variable, _lon, variable)) - else: - has_coords.assert_true(found_lon, alt.format(variable, variable)) - - ret_val.append(has_coords.to_result()) - return ret_val
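A minimal sketch of the dataset layout the preceding check accepts: a geophysical variable on a curvilinear (y, x) grid whose coordinates attribute names 2-D true latitude/longitude auxiliary coordinates. All names and sizes here are illustrative, not taken from the checker's test data.

    # Hypothetical example; requires the netCDF4 package.
    from netCDF4 import Dataset

    nc = Dataset("grid_demo.nc", "w", diskless=True)
    nc.createDimension("y", 4)
    nc.createDimension("x", 5)
    lat = nc.createVariable("lat", "f8", ("y", "x"))
    lat.standard_name = "latitude"
    lat.units = "degrees_north"
    lon = nc.createVariable("lon", "f8", ("y", "x"))
    lon.standard_name = "longitude"
    lon.units = "degrees_east"
    sst = nc.createVariable("sst", "f8", ("y", "x"))
    sst.coordinates = "lat lon"  # ties the grid to its true latitude/longitude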
- - def check_reduced_horizontal_grid(self, ds): - """ - 5.3 A "reduced" longitude-latitude grid is one in which the points are - arranged along constant latitude lines with the number of points on a - latitude line decreasing toward the poles. - - Recommend that this type of gridded data be stored using the compression - scheme described in Section 8.2, "Compression by Gathering". The - compressed latitude and longitude auxiliary coordinate variables are - identified by the coordinates attribute. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - # Create a set of coordinate variables defining `compress` - lats = set(cfutil.get_latitude_variables(ds)) - lons = set(cfutil.get_longitude_variables(ds)) - - for name in self._find_geophysical_vars(ds): - coords = getattr(ds.variables[name], "coordinates", None) - axis_map = cfutil.get_axis_map(ds, name) - # If this variable has no coordinate that defines compression - if "C" not in axis_map: - continue - - valid_rgrid = TestCtx(BaseCheck.HIGH, self.section_titles["5.3"]) - # Make sure reduced grid features define coordinates - valid_rgrid.assert_true( - isinstance(coords, str) and coords, - "reduced grid feature {} must define coordinates attribute" - "".format(name), - ) - # We can't check anything else if there are no defined coordinates - if not (isinstance(coords, str) and coords): - continue - - coord_set = set(coords.split()) - - # Make sure it's associated with valid lat and valid lon - valid_rgrid.assert_true( - len(coord_set.intersection(lons)) > 0, - "{} must be associated with a valid longitude coordinate".format(name), - ) - valid_rgrid.assert_true( - len(coord_set.intersection(lats)) > 0, - "{} must be associated with a valid latitude coordinate".format(name), - ) - valid_rgrid.assert_true( - len(axis_map["C"]) == 1, - "{} cannot be associated with more than one compressed coordinate: " - "({})".format(name, ", ".join(axis_map["C"])), - ) - - for compressed_coord in axis_map["C"]: - coord = ds.variables[compressed_coord] - compress = getattr(coord, "compress", None) - valid_rgrid.assert_true( - isinstance(compress, str) and compress, - "compress attribute for compression coordinate {} must be a non-empty string" - "".format(compressed_coord), - ) - if not isinstance(compress, str): - continue - for dim in compress.split(): - valid_rgrid.assert_true( - dim in ds.dimensions, - "dimension {} referenced by {}:compress must exist" - "".format(dim, compressed_coord), - ) - ret_val.append(valid_rgrid.to_result()) - - return ret_val - - def _check_grid_mapping_attr_condition(self, attr, attr_name): - """ - Evaluate a condition (or series of conditions) for a particular - attribute. Implementation for CF-1.6.
- - :param attr: attribute to teset condition for - :param str attr_name: name of the attribute - :rtype tuple - :return two-tuple of (bool, str) - """ - - if attr_name == "latitude_of_projection_origin": - return self._evaluate_latitude_of_projection_origin(attr) - - elif attr_name == "longitude_of_projection_origin": - return self._evaluate_longitude_of_projection_origin(attr) - - elif attr_name == "longitude_of_central_meridian": - return self._evaluate_longitude_of_central_meridian(attr) - - elif attr_name == "longitude_of_prime_meridian": - return self._evaluate_longitude_of_prime_meridian(attr) - - elif attr_name == "scale_factor_at_central_meridian": - return self._evaluate_scale_factor_at_central_meridian(attr) - - elif attr_name == "scale_factor_at_projection_origin": - return self._evaluate_scale_factor_at_projection_origin(attr) - - elif attr_name == "standard_parallel": - return self._evaluate_standard_parallel(attr) - - elif attr_name == "straight_vertical_longitude_from_pole": - return self._evaluate_straight_vertical_longitude_from_pole(attr) - - else: - raise NotImplementedError( - "Evaluation for {} not yet implemented".format(attr_name) - ) - - def _evaluate_latitude_of_projection_origin(self, val): - """ - Evaluate the condition for `latitude_of_projection_origin` attribute. - Return result. Value must be -90 <= x <= 90. - - :param val: value to be tested - :rtype tuple - :return two-tuple (bool, msg) - """ - - return ( - (val >= -90.0) and (val <= 90.0), - "latitude_of_projection_origin must satisfy (-90 <= x <= 90)", - ) - - def _evaluate_longitude_of_projection_origin(self, val): - """ - Evaluate the condition for `longitude_of_projection_origin` attribute. - Return result. - - :param val: value to be tested - :rtype tuple - :return two-tuple (bool, msg) - """ - - return ( - (val >= -180.0) and (val <= 180.0), - "longitude_of_projection_origin must satisfy (-180 <= x <= 180)", - ) - - def _evaluate_longitude_of_central_meridian(self, val): - """ - Evaluate the condition for `longitude_of_central_meridian` attribute. - Return result. - - :param val: value to be tested - :rtype tuple - :return two-tuple (bool, msg) - """ - - return ( - (val >= -180.0) and (val <= 180.0), - "longitude_of_central_meridian must satisfy (-180 <= x <= 180)", - ) - - def _evaluate_longitude_of_prime_meridian(self, val): - """ - Evaluate the condition for `longitude_of_prime_meridian` attribute. - Return result. - - :param val: value to be tested - :rtype tuple - :return two-tuple (bool, msg) - """ - - return ( - (val >= -180.0) and (val <= 180.0), - "longitude_of_prime_meridian must satisfy (-180 <= x <= 180)", - ) - - def _evaluate_scale_factor_at_central_meridian(self, val): - """ - Evaluate the condition for `scale_factor_at_central_meridian` attribute. - Return result. - - :param val: value to be tested - :rtype tuple - :return two-tuple (bool, msg) - """ - - return (val > 0.0, "scale_factor_at_central_meridian must be > 0.0") - - def _evaluate_scale_factor_at_projection_origin(self, val): - """ - Evaluate the condition for `scale_factor_at_projection_origin` attribute. - Return result. - - :param val: value to be tested - :rtype tuple - :return two-tuple (bool, msg) - """ - - return (val > 0.0, "scale_factor_at_projection_origin must be > 0.0") - - def _evaluate_standard_parallel(self, val): - """ - Evaluate the condition for `standard_parallel` attribute. Return result. 
- - :param val: value to be tested - :rtype tuple - :return two-tuple (bool, msg) - """ - - return ( - (val >= -90.0) and (val <= 90), - "standard_parallel must satisfy (-90 <= x <= 90)", - ) - - def _evaluate_straight_vertical_longitude_from_pole(self, val): - """ - Evaluate the condition for `straight_vertical_longitude_from_pole` - attribute. Return result. - - :param val: value to be tested - :rtype tuple - :return two-tuple (bool, msg) - """ - - return ( - (val >= -180.0) and (val <= 180), - "straight_vertical_longitude_from_pole must satisfy (-180 <= x <= 180)", - ) - - ############################################################################### - # Chapter 6: Labels and Alternative Coordinates - ############################################################################### - - def check_geographic_region(self, ds): - """ - 6.1.1 When data is representative of geographic regions which can be identified by names but which have complex - boundaries that cannot practically be specified using longitude and latitude boundary coordinates, a labeled - axis should be used to identify the regions. - - Recommend that the names be chosen from the list of standardized region names whenever possible. To indicate - that the label values are standardized the variable that contains the labels must be given the standard_name - attribute with the value region. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - region_list = ( - [ # TODO maybe move this (and other info like it) into a config file? - "africa", - "antarctica", - "arabian_sea", - "aral_sea", - "arctic_ocean", - "asia", - "atlantic_ocean", - "australia", - "baltic_sea", - "barents_opening", - "barents_sea", - "beaufort_sea", - "bellingshausen_sea", - "bering_sea", - "bering_strait", - "black_sea", - "canadian_archipelago", - "caribbean_sea", - "caspian_sea", - "central_america", - "chukchi_sea", - "contiguous_united_states", - "denmark_strait", - "drake_passage", - "east_china_sea", - "english_channel", - "eurasia", - "europe", - "faroe_scotland_channel", - "florida_bahamas_strait", - "fram_strait", - "global", - "global_land", - "global_ocean", - "great_lakes", - "greenland", - "gulf_of_alaska", - "gulf_of_mexico", - "hudson_bay", - "iceland_faroe_channel", - "indian_ocean", - "indonesian_throughflow", - "indo_pacific_ocean", - "irish_sea", - "lake_baykal", - "lake_chad", - "lake_malawi", - "lake_tanganyika", - "lake_victoria", - "mediterranean_sea", - "mozambique_channel", - "north_america", - "north_sea", - "norwegian_sea", - "pacific_equatorial_undercurrent", - "pacific_ocean", - "persian_gulf", - "red_sea", - "ross_sea", - "sea_of_japan", - "sea_of_okhotsk", - "south_america", - "south_china_sea", - "southern_ocean", - "taiwan_luzon_straits", - "weddell_sea", - "windward_passage", - "yellow_sea", - ] - ) - - for var in ds.get_variables_by_attributes(standard_name="region"): - valid_region = TestCtx(BaseCheck.MEDIUM, self.section_titles["6.1"]) - region = var[:] - if np.ma.isMA(region): - region = region.data - valid_region.assert_true( - "".join(region.astype(str)).lower() in region_list, - "6.1.1 '{}' specified by '{}' is not a valid region".format( - "".join(region.astype(str)), var.name - ), - ) - ret_val.append(valid_region.to_result()) - return ret_val - - ############################################################################### - # Chapter 7: Data Representative of Cells - ############################################################################### 
- - def check_cell_boundaries(self, ds): - """ - Checks the dimensions of cell boundary variables to ensure they are CF compliant. - - 7.1 To represent cells we add the attribute bounds to the appropriate coordinate variable(s). The value of bounds - is the name of the variable that contains the vertices of the cell boundaries. We refer to this type of variable as - a "boundary variable." A boundary variable will have one more dimension than its associated coordinate or auxiliary - coordinate variable. The additional dimension should be the most rapidly varying one, and its size is the maximum - number of cell vertices. - - Applications that process cell boundary data often need to determine whether or not adjacent cells share an - edge. In order to facilitate this type of processing the following restrictions are placed on the data in boundary - variables: - - Bounds for 1-D coordinate variables - - For a coordinate variable such as lat(lat) with associated boundary variable latbnd(x,2), the interval endpoints - must be ordered consistently with the associated coordinate, e.g., for an increasing coordinate, lat(1) > lat(0) - implies latbnd(i,1) >= latbnd(i,0) for all i - - If adjacent intervals are contiguous, the shared endpoint must be represented identically in each instance where - it occurs in the boundary variable. For example, if the intervals that contain grid points lat(i) and lat(i+1) are - contiguous, then latbnd(i+1,0) = latbnd(i,1). - - Bounds for 2-D coordinate variables with 4-sided cells - - In the case where the horizontal grid is described by two-dimensional auxiliary coordinate variables in latitude - lat(n,m) and longitude lon(n,m), and the associated cells are four-sided, then the boundary variables are given - in the form latbnd(n,m,4) and lonbnd(n,m,4), where the trailing index runs over the four vertices of the cells. - - Bounds for multi-dimensional coordinate variables with p-sided cells - - In all other cases, the bounds should be dimensioned (...,n,p), where (...,n) are the dimensions of the auxiliary - coordinate variables, and p the number of vertices of the cells. The vertices must be traversed anticlockwise in the - lon-lat plane as viewed from above. The starting vertex is not specified. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - - # Note that test does not check monotonicity - ret_val = [] - reasoning = [] - for variable_name, boundary_variable_name in cfutil.get_cell_boundary_map( - ds - ).items(): - variable = ds.variables[variable_name] - valid = True - reasoning = [] - if boundary_variable_name not in ds.variables: - valid = False - reasoning.append( - "Boundary variable {} referenced by {} not ".format( - boundary_variable_name, variable.name - ) - + "found in dataset variables" - ) - else: - boundary_variable = ds.variables[boundary_variable_name] - # The number of dimensions in the bounds variable should always be - # the number of dimensions in the referring variable + 1 - if boundary_variable.ndim < 2: - valid = False - reasoning.append( - "Boundary variable {} specified by {}".format( - boundary_variable.name, variable.name - ) - + " should have at least two dimensions to enclose the base " - + "case of a one-dimensional variable" - ) - if boundary_variable.ndim != variable.ndim + 1: - valid = False - reasoning.append( - "The number of dimensions of the variable %s is %s, but the " - "number of dimensions of the boundary variable %s is %s. The boundary variable " - "should have %s dimensions" - % ( - variable.name, - variable.ndim, - boundary_variable.name, - boundary_variable.ndim, - variable.ndim + 1, - ) - ) - if variable.dimensions[:] != boundary_variable.dimensions[: variable.ndim]: - valid = False - reasoning.append( - "Boundary variable coordinates (for {}) are in improper order: {}. Bounds-specific dimensions should be last" - "".format(variable.name, boundary_variable.dimensions) - ) - - # ensure the final dimension holds at least p vertices, enough to form - # a simplex/closed cell given the preceding auxiliary coordinate dimensions - if ( - ds.dimensions[boundary_variable.dimensions[-1]].size - < len(boundary_variable.dimensions[:-1]) + 1 - ): - valid = False - reasoning.append( - "Final dimension of boundary variable {} (for {}) must have at least {} elements to form a simplex/closed cell with previous dimensions {}.".format( - boundary_variable.name, - variable.name, - len(variable.dimensions) + 1, - boundary_variable.dimensions[:-1], - ) - ) - result = Result( - BaseCheck.MEDIUM, valid, self.section_titles["7.1"], reasoning - ) - ret_val.append(result) - return ret_val
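A short numeric illustration of the 1-D bounds rules the method above enforces descriptions of (endpoints ordered consistently with the coordinate, shared endpoints for contiguous cells); the arrays are invented for the example.

    import numpy as np

    lat = np.array([0.5, 1.5, 2.5])                            # coordinate lat(lat)
    lat_bnds = np.array([[0.0, 1.0], [1.0, 2.0], [2.0, 3.0]])  # bounds lat_bnds(lat, 2)
    # increasing coordinate => interval endpoints ordered the same way
    assert (lat_bnds[:, 1] >= lat_bnds[:, 0]).all()
    # contiguous intervals share endpoints: lat_bnds[i+1, 0] == lat_bnds[i, 1]
    assert (lat_bnds[1:, 0] == lat_bnds[:-1, 1]).all()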
- - def check_cell_measures(self, ds): - """ - 7.2 To indicate extra information about the spatial properties of a - variable's grid cells, a cell_measures attribute may be defined for a - variable. This is a string attribute comprising a list of - blank-separated pairs of words of the form "measure: name". "area" and - "volume" are the only defined measures. - - The "name" is the name of the variable containing the measure values, - which we refer to as a "measure variable". The dimensions of the - measure variable should be the same as or a subset of the dimensions of - the variable to which they are related, but their order is not - restricted. - - The variable must have a units attribute and may have other attributes - such as a standard_name. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - variables = ds.get_variables_by_attributes( - cell_measures=lambda c: c is not None - ) - for var in variables: - reasoning = []  # reset messages for each variable's Result - search_str = r"^(?:area|volume): (\w+)$" - search_res = regex.search(search_str, var.cell_measures) - if not search_res: - valid = False - reasoning.append( - "The cell_measures attribute for variable {} " - "is formatted incorrectly. It should take the" - " form of either 'area: cell_var' or " - "'volume: cell_var' where cell_var is the " - "variable describing the cell measures".format(var.name) - ) - else: - valid = True - cell_meas_var_name = search_res.groups()[0] - # TODO: cache previous results - if cell_meas_var_name not in ds.variables: - valid = False - reasoning.append( - "Cell measure variable {} referred to by " - "{} is not present in dataset variables".format( - cell_meas_var_name, var.name - ) - ) - else: - cell_meas_var = ds.variables[cell_meas_var_name] - if not hasattr(cell_meas_var, "units"): - valid = False - reasoning.append( - "Cell measure variable {} is required " - "to have units attribute defined.".format( - cell_meas_var_name - ) - ) - if not set(cell_meas_var.dimensions).issubset(var.dimensions): - valid = False - reasoning.append( - "Cell measure variable {} must have " - "dimensions which are a subset of " - "those defined in variable {}.".format( - cell_meas_var_name, var.name - ) - ) - - result = Result( - BaseCheck.MEDIUM, valid, (self.section_titles["7.2"]), reasoning - ) - ret_val.append(result) - - return ret_val
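A minimal sketch of the attribute layout this check looks for: a data variable whose cell_measures attribute takes the "area: <var>" form, with the measure variable present, carrying units, and defined on a subset of the data variable's dimensions. Variable names here are illustrative.

    from netCDF4 import Dataset

    nc = Dataset("cell_measures_demo.nc", "w", diskless=True)
    nc.createDimension("y", 3)
    nc.createDimension("x", 3)
    cell_area = nc.createVariable("cell_area", "f8", ("y", "x"))
    cell_area.units = "m2"
    tas = nc.createVariable("tas", "f8", ("y", "x"))
    tas.cell_measures = "area: cell_area"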
- - def check_cell_methods(self, ds): - """ - 7.3 To describe the characteristic of a field that is represented by cell values, we define the cell_methods attribute - of the variable. This is a string attribute comprising a list of blank-separated words of the form "name: method". Each - "name: method" pair indicates that for an axis identified by name, the cell values representing the field have been - determined or derived by the specified method. - - name can be a dimension of the variable, a scalar coordinate variable, a valid standard name, or the word "area" - - values of method should be selected from the list in Appendix E, Cell Methods, which includes point, sum, mean, maximum, - minimum, mid_range, standard_deviation, variance, mode, and median. Case is not significant in the method name. Some - methods (e.g., variance) imply a change of units of the variable, as is indicated in Appendix E, Cell Methods. - - Because the default interpretation for an intensive quantity differs from that of an extensive quantity and because this - distinction may not be understood by some users of the data, it is recommended that every data variable include for each - of its dimensions and each of its scalar coordinate variables the cell_methods information of interest (unless this - information would not be meaningful). It is especially recommended that cell_methods be explicitly specified for each - spatio-temporal dimension and each spatio-temporal scalar coordinate variable. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - - ret_val = [] - psep = regex.compile( - r"(?P<vars>\w+: )+(?P<method>\w+) ?(?P<where>where (?P<wtypevar>\w+) " - r"?(?P<over>over (?P<otypevar>\w+))?| ?)(?:\((?P<paren_contents>[^)]*)\))?" - ) - - for var in ds.get_variables_by_attributes(cell_methods=lambda x: x is not None): - if not getattr(var, "cell_methods", ""): - continue - - method = getattr(var, "cell_methods", "") - - valid_attribute = TestCtx( - BaseCheck.HIGH, self.section_titles["7.3"] - ) # changed from 7.1 to 7.3 - valid_attribute.assert_true( - regex.match(psep, method) is not None, - '"{}" is not a valid format for cell_methods attribute of "{}"' - "".format(method, var.name), - ) - ret_val.append(valid_attribute.to_result()) - - valid_cell_names = TestCtx(BaseCheck.MEDIUM, self.section_titles["7.3"]) - - # check that the name is valid - for match in regex.finditer(psep, method): - # it is possible to have "var1: var2: ... varn: ...", so handle - # that case - for var_raw_str in match.captures("vars"): - # strip off the ': ' at the end of each match - var_str = var_raw_str[:-2] - if ( - var_str in var.dimensions - or var_str == "area" - or var_str in getattr(var, "coordinates", "") - ): - - valid = True - else: - valid = False - - valid_cell_names.assert_true( - valid, - "{}'s cell_methods name component {} does not match a dimension, " - "area or auxiliary coordinate".format(var.name, var_str), - ) - - ret_val.append(valid_cell_names.to_result()) - - # Checks if the method value of the 'name: method' pair is acceptable - valid_cell_methods = TestCtx(BaseCheck.MEDIUM, self.section_titles["7.3"]) - - for match in regex.finditer(psep, method): - # CF section 7.3 - "Case is not significant in the method name." - valid_cell_methods.assert_true( - match.group("method").lower() in self.cell_methods, - "{}:cell_methods contains an invalid method: {}" - "".format(var.name, match.group("method")), - ) - - ret_val.append(valid_cell_methods.to_result()) - - for match in regex.finditer(psep, method): - if match.group("paren_contents") is not None: - # split along spaces followed by words with a colon - # not sure what to do if a comment contains a colon! - ret_val.append( - self._check_cell_methods_paren_info( - match.group("paren_contents"), var - ).to_result() - ) - - return ret_val
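To see how the psep pattern above decomposes a cell_methods string, a small standalone sketch (the stdlib re module is used here for illustration; the checker itself uses the third-party regex module, whose match/group API is compatible for these calls, and the sample string is invented):

    import re

    psep = re.compile(
        r"(?P<vars>\w+: )+(?P<method>\w+) ?(?P<where>where (?P<wtypevar>\w+) "
        r"?(?P<over>over (?P<otypevar>\w+))?| ?)(?:\((?P<paren_contents>[^)]*)\))?"
    )
    m = psep.match("time: mean (interval: 1 hr)")
    print(m.group("method"))          # -> mean
    print(m.group("paren_contents"))  # -> interval: 1 hr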
- - def _check_cell_methods_paren_info(self, paren_contents, var): - """ - Checks that the spacing and/or comment info contained inside the - parentheses in cell_methods is well-formed - """ - valid_info = TestCtx(BaseCheck.MEDIUM, self.section_titles["7.3"]) - # if there are no colons, this is a simple comment - # TODO: are empty comments considered valid? - if ":" not in paren_contents: - valid_info.out_of += 1 - valid_info.score += 1 - return valid_info - # otherwise, split into k/v pairs - kv_pair_pat = r"(\S+:)\s+(.*(?=\s+\w+:)|[^:]+$)\s*" - # otherwise, we must split further with intervals coming - # first, followed by non-standard comments - # we need the count of the matches, and re.findall() only returns - # groups if they are present and we wish to see if the entire match - # object concatenated together is the same as the original string - pmatches = [m for m in regex.finditer(kv_pair_pat, paren_contents)] - for i, pmatch in enumerate(pmatches): - keyword, val = pmatch.groups() - if keyword == "interval:": - valid_info.out_of += 2 - interval_matches = regex.match( - r"^\s*(?P<interval_number>\S+)\s+(?P<interval_units>\S+)\s*$", val - ) - # attempt to get the number for the interval - if not interval_matches: - valid_info.messages.append( - '§7.3.3 {}:cell_methods contains an interval specification that does not parse: "{}". Should be in format "interval: <number> <units>"'.format( - var.name, val - ) - ) - else: - try: - float(interval_matches.group("interval_number")) - except ValueError: - valid_info.messages.append( - '§7.3.3 {}:cell_methods contains an interval value that does not parse as a numeric value: "{}".'.format( - var.name, interval_matches.group("interval_number") - ) - ) - else: - valid_info.score += 1 - - # then the units - try: - Unit(interval_matches.group("interval_units")) - except ValueError: - valid_info.messages.append( - '§7.3.3 {}:cell_methods interval units "{}" is not parsable by UDUNITS.'.format( - var.name, interval_matches.group("interval_units") - ) - ) - else: - valid_info.score += 1 - elif keyword == "comment:": - # comments can't really be invalid, except - # if they come first or aren't last, and - # maybe if they contain colons embedded in the - # comment string - valid_info.out_of += 1 - if len(pmatches) == 1: - valid_info.messages.append( - "§7.3.3 If there is no standardized information, the keyword comment: should be omitted for variable {}".format( - var.name - ) - ) - # otherwise check that the comment is the last - # item in the parentheses - elif i != len(pmatches) - 1: - valid_info.messages.append( - '§7.3.3 The non-standard "comment:" element must come after any standard elements in cell_methods for variable {}'.format( - var.name - ) - ) - else: - valid_info.score += 1 - else: - valid_info.out_of += 1 - valid_info.messages.append( - '§7.3.3 Invalid cell_methods keyword "{}" for variable {}. Must be one of [interval, comment]'.format( - keyword, var.name - ) - ) - - # Ensure concatenated reconstructed matches are the same as the - # original string. If they're not, there's likely a formatting error - valid_info.assert_true( - "".join(m.group(0) for m in pmatches) == paren_contents, - "§7.3.3 Parenthetical content inside {}:cell_methods is not well formed: {}".format( - var.name, paren_contents - ), - ) - - return valid_info
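A quick illustration of how the kv_pair_pat split above behaves on a parenthetical containing both a standardized interval and a free-form comment (standalone, using the stdlib re module; the input string is invented):

    import re

    kv_pair_pat = r"(\S+:)\s+(.*(?=\s+\w+:)|[^:]+$)\s*"
    paren_contents = "interval: 1 hr comment: sampled instantaneously"
    pairs = [m.groups() for m in re.finditer(kv_pair_pat, paren_contents)]
    print(pairs)  # -> [('interval:', '1 hr'), ('comment:', 'sampled instantaneously')]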
- - def check_climatological_statistics(self, ds): - """ - 7.4 A climatological time coordinate variable does not have a bounds attribute. Instead, it has a climatology - attribute, which names a variable with dimensions (n,2), n being the dimension of the climatological time axis. - Using the units and calendar of the time coordinate variable, element (i,0) of the climatology variable specifies - the beginning of the first subinterval and element (i,1) the end of the last subinterval used to evaluate the - climatological statistics with index i in the time dimension. The time coordinates should be values that are - representative of the climatological time intervals, such that an application which does not recognise climatological - time will nonetheless be able to make a reasonable interpretation. - - A climatological axis may use different statistical methods to measure variation among years, within years, and within - days. The methods which can be specified are those listed in Appendix E, Cell Methods and each entry in the cell_methods - attribute may also contain non-standardised information in parentheses after the method.
The value of the cell_methods - attribute must be in one of the following forms: - - time: method1 within years time: method2 over years - - time: method1 within days time: method2 over days - - time: method1 within days time: method2 over days time: method3 over years - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - - reasoning = [] - ret_val = [] - total_climate_count = 0 - valid_climate_count = 0 - all_clim_coord_var_names = [] - - methods = [ - "point", # TODO change to appendix import once cf1.7 merged - "sum", - "mean", - "maximum", - "minimum", - "mid_range", - "standard_deviation", - "variance", - "mode", - "median", - ] - - # find any climatology axis variables; any variables which contain climatological stats will use - # these variables as coordinates - clim_time_coord_vars = ds.get_variables_by_attributes( - climatology=lambda s: s is not None - ) - - # first, to determine whether or not we have a valid climatological time - # coordinate variable, we need to make sure it has the attribute "climatology", - # but not the attribute "bounds" - for clim_coord_var in clim_time_coord_vars: - if hasattr(clim_coord_var, "bounds"): - reasoning.append( - "Variable {} has a climatology attribute and cannot also have a bounds attribute.".format( - clim_coord_var.name - ) - ) - result = Result( - BaseCheck.MEDIUM, False, (self.section_titles["7.4"]), reasoning - ) - ret_val.append(result) - return ret_val - - # make sure the climatology variable referenced actually exists - elif clim_coord_var.climatology not in ds.variables: - reasoning.append( - "Variable {} referenced in {}'s climatology attribute does not exist".format( - clim_coord_var.climatology, clim_coord_var.name - ) - ) - result = Result( - BaseCheck.MEDIUM, False, (self.section_titles["7.4"]), reasoning - ) - ret_val.append(result) - return ret_val - - # check that coordinate bounds are in the proper order. - # make sure last elements are boundary variable specific dimensions - if ( - clim_coord_var.dimensions[:] - != ds.variables[clim_coord_var.climatology].dimensions[ - : clim_coord_var.ndim - ] - ): - reasoning.append( - "Climatology variable coordinates are in improper order: {}. Bounds-specific dimensions should be last".format( - ds.variables[clim_coord_var.climatology].dimensions - ) - ) - result = Result( - BaseCheck.MEDIUM, False, (self.section_titles["7.4"]), reasoning - ) - ret_val.append(result) - return ret_val - - elif ( - ds.dimensions[ - ds.variables[clim_coord_var.climatology].dimensions[-1] - ].size - != 2 - ): - reasoning.append( - "Climatology dimension {} should only contain two elements".format( - ds.variables[clim_coord_var.climatology].dimensions[-1] - ) - ) - - # passed all these checks, so we can add this clim_coord_var to our total list - all_clim_coord_var_names.append(clim_coord_var.name) - - # for any variables which use a climatology time coordinate variable as a coordinate, - # if they have a cell_methods attribute, it must comply with the form: - # time: method1 within years time: method2 over years - # time: method1 within days time: method2 over days - # time: method1 within days time: method2 over days time: method3 over years - # optionally followed by parentheses for explaining additional - # info, e.g. - # "time: method1 within years time: method2 over years (sidereal years)" - - meth_regex = "(?:{})".format( - "|".join(methods) - ) # "or" comparison for the methods - re_string = ( - r"^time: {0} within (years|days)" # regex string to test - r" time: {0} over \1(?<=days)(?: time: {0} over years)?"
- r"(?: \([^)]+\))?$".format(meth_regex) - ) - - # find any variables with a valid climatological cell_methods - for cell_method_var in ds.get_variables_by_attributes( - cell_methods=lambda s: s is not None - ): - if any( - [dim in all_clim_coord_var_names for dim in cell_method_var.dimensions] - ): - total_climate_count += 1 - if not regex.search(re_string, cell_method_var.cell_methods): - reasoning.append( - 'The "time: method within years/days over years/days" format is not correct in variable {}.'.format( - cell_method_var.name - ) - ) - else: - valid_climate_count += 1 - - result = Result( - BaseCheck.MEDIUM, - (valid_climate_count, total_climate_count), - (self.section_titles["7.4"]), - reasoning, - ) - ret_val.append(result) - - return ret_val - - ############################################################################### - # Chapter 8: Reduction of Dataset Size - ############################################################################### - - def check_packed_data(self, ds): - """ - 8.1 Simple packing may be achieved through the use of the optional NUG defined attributes scale_factor and - add_offset. After the data values of a variable have been read, they are to be multiplied by the scale_factor, - and have add_offset added to them. - - The units of a variable should be representative of the unpacked data. - - If the scale_factor and add_offset attributes are of the same data type as the associated variable, the unpacked - data is assumed to be of the same data type as the packed data. However, if the scale_factor and add_offset - attributes are of a different data type from the variable (containing the packed data) then the unpacked data - should match the type of these attributes, which must both be of type float or both be of type double. An additional - restriction in this case is that the variable containing the packed data must be of type byte, short or int. It is - not advised to unpack an int into a float as there is a potential precision loss. - - When data to be packed contains missing values the attributes that indicate missing values (_FillValue, valid_min, - valid_max, valid_range) must be of the same data type as the packed data. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - for name, var in ds.variables.items(): - - add_offset = getattr(var, "add_offset", None) - scale_factor = getattr(var, "scale_factor", None) - if not (add_offset or scale_factor): - continue - - valid = True - reasoning = [] - - # if only one of these attributes is defined, assume they - # are the same type (value doesn't matter here) - if not add_offset: - add_offset = scale_factor - if not scale_factor: - scale_factor = add_offset - - if type(add_offset) != type(scale_factor): - valid = False - reasoning.append( - "Attributes add_offset and scale_factor have different data type." - ) - elif type(scale_factor) != var.dtype.type: - # Check both attributes are type float or double - if not isinstance(scale_factor, (float, np.floating)): - valid = False - reasoning.append( - "Attributes add_offset and scale_factor are not of type float or double." 
- ) - else: - # Check variable type is byte, short or int - if var.dtype.type not in [ - np.int8, - np.int16, - np.int32, - np.int64, - ]: - valid = False - reasoning.append("Variable is not of type byte, short, or int.") - - result = Result( - BaseCheck.MEDIUM, valid, self.section_titles["8.1"], reasoning - ) - ret_val.append(result) - reasoning = [] - - valid = True - # test further with _FillValue , valid_min , valid_max , valid_range - if hasattr(var, "_FillValue"): - if var._FillValue.dtype.type != var.dtype.type: - valid = False - reasoning.append( - "Type of %s:_FillValue attribute (%s) does not match variable type (%s)" - % (name, var._FillValue.dtype.name, var.dtype.name) - ) - if hasattr(var, "valid_min"): - if var.valid_min.dtype.type != var.dtype.type: - valid = False - reasoning.append( - "Type of %s:valid_min attribute (%s) does not match variable type (%s)" - % (name, var.valid_min.dtype.name, var.dtype.name) - ) - if hasattr(var, "valid_max"): - if var.valid_max.dtype.type != var.dtype.type: - valid = False - reasoning.append( - "Type of %s:valid_max attribute (%s) does not match variable type (%s)" - % (name, var.valid_max.dtype.name, var.dtype.name) - ) - if hasattr(var, "valid_range"): - if var.valid_range.dtype.type != var.dtype.type: - valid = False - reasoning.append( - "Type of %s:valid_range attribute (%s) does not match variable type (%s)" - % (name, var.valid_range.dtype.name, var.dtype.name) - ) - - result = Result( - BaseCheck.MEDIUM, valid, self.section_titles["8.1"], reasoning - ) - ret_val.append(result) - - return ret_val
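A small numeric sketch of the §8.1 unpacking rule described in the docstring above (packed integers become floats via unpacked = packed * scale_factor + add_offset); the values are invented:

    import numpy as np

    packed = np.array([0, 100, 200], dtype=np.int16)  # packed (short) data
    scale_factor = np.float32(0.01)                   # attribute type float
    add_offset = np.float32(20.0)                     # => unpacked data is float
    unpacked = packed * scale_factor + add_offset
    print(unpacked)  # -> [20. 21. 22.]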
- - def check_compression_gathering(self, ds): - """ - 8.2 To save space in the netCDF file, it may be desirable to eliminate - points from data arrays that are invariably missing. Such a compression - can operate over one or more adjacent axes. It is accomplished by - storing a "list" variable whose values are the indices of the points - that are kept. The list variable is identified by the presence of a - compress attribute, whose value is a blank-separated list of the - dimensions that were compressed. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - for compress_var in ds.get_variables_by_attributes( - compress=lambda s: s is not None - ): - valid = True - reasoning = [] - # put the dimensions referenced by the compress attribute into a set - compress_set = set(compress_var.compress.split(" ")) - if compress_var.ndim != 1: - valid = False - reasoning.append( - "Compression variable {} may only have one dimension".format( - compress_var.name - ) - ) - # ensure compression variable is a proper index, and thus is a - # signed or unsigned integer type of some sort - if (compress_var.dtype is str) or ( - compress_var.dtype.kind not in {"i", "u"} - ): - valid = False - reasoning.append( - "Compression variable {} must be an integer type to form a proper array index".format( - compress_var.name - ) - ) - # make sure all the dimensions referenced by the compress - # attribute exist in the dataset - if not compress_set.issubset(ds.dimensions): - not_in_dims = sorted(compress_set.difference(ds.dimensions)) - valid = False - reasoning.append( - "The following dimensions referenced by the compress attribute of variable {} do not exist: {}".format( - compress_var.name, not_in_dims - ) - ) - - result = Result( - BaseCheck.MEDIUM, valid, self.section_titles["8.2"], reasoning - ) - ret_val.append(result) - - return ret_val - - ############################################################################### - # Chapter 9: Discrete Sampling Geometries - ############################################################################### - - def check_feature_type(self, ds): - """ - Check the global attribute featureType for valid CF featureTypes - - 9.4 A global attribute, featureType, is required for all Discrete Geometry representations except the orthogonal - multidimensional array representation, for which it is highly recommended. - - The value assigned to the featureType attribute is case-insensitive. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: compliance_checker.base.Result - """ - # Due to case insensitive requirement, we list the possible featuretypes - # in lower case and check using the .lower() method - feature_list = [ - "point", - "timeseries", - "trajectory", - "profile", - "timeseriesprofile", - "trajectoryprofile", - ] - - feature_type = getattr(ds, "featureType", None) - valid_feature_type = TestCtx( - BaseCheck.HIGH, "§9.1 Dataset contains a valid featureType" - ) - valid_feature_type.assert_true( - feature_type is None or feature_type.lower() in feature_list, - "{} is not a valid CF featureType. It must be one of {}" - "".format(feature_type, ", ".join(feature_list)), - ) - return valid_feature_type.to_result()
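Since the featureType comparison above is case-insensitive, a one-line normalization is all the check needs; for instance (the candidate values are invented):

    feature_list = ["point", "timeseries", "trajectory", "profile", "timeseriesprofile", "trajectoryprofile"]
    for candidate in ("timeSeries", "TRAJECTORY", "profile"):
        assert candidate.lower() in feature_list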
- - def check_cf_role(self, ds): - """ - Check variables defining cf_role for legal cf_role values. - - §9.5 The only acceptable values of cf_role for Discrete Geometry CF - data sets are timeseries_id, profile_id, and trajectory_id - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: compliance_checker.base.Result - """ - valid_roles = ["timeseries_id", "profile_id", "trajectory_id"] - variable_count = 0 - # create the context once, outside the loop, so per-variable results - # accumulate and no NameError occurs when no cf_role variables exist - valid_cf_role = TestCtx(BaseCheck.HIGH, self.section_titles["9.5"]) - for variable in ds.get_variables_by_attributes(cf_role=lambda x: x is not None): - variable_count += 1 - cf_role = variable.cf_role - valid_cf_role.assert_true( - cf_role in valid_roles, - "{} is not a valid cf_role value. It must be one of {}" - "".format(cf_role, ", ".join(valid_roles)), - ) - if variable_count > 0: - m = ( - "§9.5 The only acceptable values of cf_role for Discrete Geometry CF" - + " data sets are timeseries_id, profile_id, and trajectory_id" - ) - valid_cf_role.assert_true(variable_count < 3, m) - return valid_cf_role.to_result() - - def check_variable_features(self, ds): - """ - Checks the variable feature types match the dataset featureType attribute. - If more than one unique feature type is found, report this as an error. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - feature_types_found = defaultdict(list) - ret_val = [] - feature_list = { - "point", - "timeseries", - "trajectory", - "profile", - "timeseriesprofile", - "trajectoryprofile", - } - # Don't bother checking if it's not a legal featureType - # or if the featureType attribute doesn't exist - feature_type = getattr(ds, "featureType", "") - if feature_type is not None and feature_type.lower() not in feature_list: - return [] - - _feature = feature_type.lower() - - for name in self._find_geophysical_vars(ds): - variable_feature = cfutil.guess_feature_type(ds, name) - # If we can't figure it out, don't check it.
- if variable_feature is None: - continue - feature_types_found[variable_feature].append(name) - matching_feature = TestCtx(BaseCheck.MEDIUM, self.section_titles["9.1"]) - matching_feature.assert_true( - variable_feature.lower() == _feature, - "{} is not a {}, it is detected as a {}" - "".format(name, _feature, variable_feature), - ) - ret_val.append(matching_feature.to_result()) - - # create explanation of all of the different featureTypes - # found in the dataset - feature_description = ", ".join( - [ - "{} ({})".format(ftr, ", ".join(vrs)) - for ftr, vrs in feature_types_found.items() - ] - ) - all_same_features = TestCtx(BaseCheck.HIGH, self.section_titles["9.1"]) - all_same_features.assert_true( - len(feature_types_found) < 2, - "Different feature types discovered in this dataset: {}" - "".format(feature_description), - ) - ret_val.append(all_same_features.to_result()) - - return ret_val - - def check_hints(self, ds): - """ - Checks for potentially mislabeled metadata and makes suggestions for how to correct - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - - ret_val.extend(self._check_hint_bounds(ds)) - - return ret_val - - def _check_hint_bounds(self, ds): - """ - Checks for variables ending with _bounds, if they are not cell methods, - make the recommendation - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - boundary_variables = cfutil.get_cell_boundary_variables(ds) - for name in ds.variables: - if name.endswith("_bounds") and name not in boundary_variables: - msg = ( - "{} might be a cell boundary variable but there are no variables that define it " - "as a boundary using the `bounds` attribute.".format(name) - ) - result = Result(BaseCheck.LOW, True, self.section_titles["7.1"], [msg]) - ret_val.append(result) - - return ret_val - - -class CF1_7Check(CF1_6Check): - """Implementation for CF v1.7. Inherits from CF1_6Check as most of the - checks are the same.""" - - # things that are specific to 1.7 - _cc_spec_version = "1.7" - _cc_url = "http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/cf-conventions.html" - - appendix_a = appendix_a_base.copy() - appendix_a.update( - { - "actual_range": { - "Type": "N", - "attr_loc": {"D", "C"}, - "cf_section": "2.5.1", - }, - "comment": { - "Type": "S", - "attr_loc": {"G", "D", "C"}, - "cf_section": "2.6.2", - }, - "external_variables": { - "Type": "S", - "attr_loc": {"G"}, - "cf_section": "2.6.3", - }, - "actual_range": { - "Type": "N", - "attr_loc": {"D", "C"}, - "cf_section": "2.5.1", - }, - "scale_factor": {"Type": "N", "attr_loc": {"D", "C"}, "cf_section": "8.1"}, - } - ) - - def __init__(self, options=None): - super(CF1_7Check, self).__init__(options) - - self.cell_methods = cell_methods17 - self.grid_mapping_dict = grid_mapping_dict17 - self.grid_mapping_attr_types = grid_mapping_attr_types17 - - def check_actual_range(self, ds): - """Check the actual_range attribute of variables. As stated in - section 2.5.1 of version 1.7, this convention defines a two-element - vector attribute designed to describe the actual minimum and actual - maximum values of variables containing numeric data. 
Conditions: - - the fist value of the two-element vector must be equal to the - minimum of the data, and the second element equal to the maximum - - if the data is packed, the elements of actual_range should have - the same data type as the *unpacked* data - - if valid_range is specified, both elements of actual_range should - be within valid_range - - If a variable does not have an actual_range attribute, let it pass; - including this attribute is only suggested. However, if the user is - specifying the actual_range, the Result will be considered - high-priority.""" - - ret_val = [] - - for name, variable in ds.variables.items(): - msgs = [] - score = 0 - out_of = 0 - - if not hasattr(variable, "actual_range"): - continue # having this attr is only suggested, no Result needed - else: - - out_of += 1 - try: - if ( - len(variable.actual_range) != 2 - ): # TODO is the attr also a numpy array? if so, .size - msgs.append( - "actual_range of '{}' must be 2 elements".format(name) - ) - ret_val.append( - Result( # putting result into list - BaseCheck.HIGH, - (score, out_of), - self.section_titles["2.5"], - msgs, - ) - ) - continue # no need to keep checking if already completely wrong - else: - score += 1 - except TypeError: # in case it's just a single number - msgs.append("actual_range of '{}' must be 2 elements".format(name)) - ret_val.append( - Result( # putting result into list - BaseCheck.HIGH, - (score, out_of), - self.section_titles["2.5"], - msgs, - ) - ) - continue - - # check equality to existing min/max values - # NOTE this is a data check - # If every value is masked, a data check of actual_range isn't - # appropriate, so skip. - if not (hasattr(variable[:], "mask") and variable[:].mask.all()): - # if min/max values aren't close to actual_range bounds, - # fail. - out_of += 1 - if not np.isclose( - variable.actual_range[0], variable[:].min() - ) or not np.isclose(variable.actual_range[1], variable[:].max()): - msgs.append( - "actual_range elements of '{}' inconsistent with its min/max values".format( - name - ) - ) - else: - score += 1 - - # check that the actual range is within the valid range - if hasattr(variable, "valid_range"): # check within valid_range - out_of += 1 - if (variable.actual_range[0] < variable.valid_range[0]) or ( - variable.actual_range[1] > variable.valid_range[1] - ): - msgs.append( - '"{}"\'s actual_range must be within valid_range'.format( - name - ) - ) - else: - score += 1 - - # check the elements of the actual range have the appropriate - # relationship to the valid_min and valid_max - if hasattr(variable, "valid_min"): - out_of += 1 - if variable.actual_range[0] < variable.valid_min: - msgs.append( - '"{}"\'s actual_range first element must be >= valid_min ({})'.format( - name, variable.valid_min - ) - ) - else: - score += 1 - if hasattr(variable, "valid_max"): - out_of += 1 - if variable.actual_range[1] > variable.valid_max: - msgs.append( - '"{}"\'s actual_range second element must be <= valid_max ({})'.format( - name, variable.valid_max - ) - ) - else: - score += 1 - - ret_val.append( - Result( # putting result into list - BaseCheck.HIGH, (score, out_of), self.section_titles["2.5"], msgs - ) - ) - return ret_val - - def check_cell_boundaries(self, ds): - """ - Checks the dimensions of cell boundary variables to ensure they are CF compliant - per section 7.1. - - This method extends the CF1_6Check method; please see the original method for the - complete doc string. 
- - If any variable contains both a formula_terms attribute *and* a bounding variable, - that bounds variable must also have a formula_terms attribute. - - :param netCDF4.Dataset ds: An open netCDF dataset - :returns list: List of results - """ - - # Note that test does not check monotonicity - ret_val = [] - reasoning = [] - for variable_name, boundary_variable_name in cfutil.get_cell_boundary_map( - ds - ).items(): - variable = ds.variables[variable_name] - valid = True - reasoning = [] - if boundary_variable_name not in ds.variables: - valid = False - reasoning.append( - "Boundary variable {} referenced by {} not ".format( - boundary_variable_name, variable.name - ) - + "found in dataset variables" - ) - else: - boundary_variable = ds.variables[boundary_variable_name] - # The number of dimensions in the bounds variable should always be - # the number of dimensions in the referring variable + 1 - if boundary_variable.ndim < 2: - valid = False - reasoning.append( - "Boundary variable {} specified by {}".format( - boundary_variable.name, variable.name - ) - + " should have at least two dimensions to enclose the base " - + "case of a one dimensionsal variable" - ) - if boundary_variable.ndim != variable.ndim + 1: - valid = False - reasoning.append( - "The number of dimensions of the variable %s is %s, but the " - "number of dimensions of the boundary variable %s is %s. The boundary variable " - "should have %s dimensions" - % ( - variable.name, - variable.ndim, - boundary_variable.name, - boundary_variable.ndim, - variable.ndim + 1, - ) - ) - if variable.dimensions[:] != boundary_variable.dimensions[: variable.ndim]: - valid = False - reasoning.append( - "Boundary variable coordinates (for {}) are in improper order: {}. Bounds-specific dimensions should be last" - "".format(variable.name, boundary_variable.dimensions) - ) - - # ensure p vertices form a valid simplex given previous a...n - # previous auxiliary coordinates - if ( - ds.dimensions[boundary_variable.dimensions[-1]].size - < len(boundary_variable.dimensions[:-1]) + 1 - ): - valid = False - reasoning.append( - "Dimension {} of boundary variable (for {}) must have at least {} elements to form a simplex/closed cell with previous dimensions {}.".format( - boundary_variable.name, - variable.name, - len(variable.dimensions) + 1, - boundary_variable.dimensions[:-1], - ) - ) - - # check if formula_terms is present in the var; if so, - # the bounds variable must also have a formula_terms attr - if hasattr(variable, "formula_terms"): - if not hasattr(boundary_variable, "formula_terms"): - valid = False - reasoning.append( - "'{}' has 'formula_terms' attr, bounds variable '{}' must also have 'formula_terms'".format( - variable_name, boundary_variable_name - ) - ) - - result = Result( - BaseCheck.MEDIUM, valid, self.section_titles["7.1"], reasoning - ) - ret_val.append(result) - return ret_val - - def check_cell_measures(self, ds): - """ - A method to over-ride the CF1_6Check method. In CF 1.7, it is specified - that variable referenced by cell_measures must be in the dataset OR - referenced by the global attribute "external_variables", which represent - all the variables used in the dataset but not found in the dataset. - - 7.2 To indicate extra information about the spatial properties of a - variable's grid cells, a cell_measures attribute may be defined for a - variable. This is a string attribute comprising a list of - blank-separated pairs of words of the form "measure: name". "area" and - "volume" are the only defined measures. 
- - The "name" is the name of the variable containing the measure values, - which we refer to as a "measure variable". The dimensions of the - measure variable should be the same as or a subset of the dimensions of - the variable to which they are related, but their order is not - restricted. - - The variable must have a units attribute and may have other attributes - such as a standard_name. - - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - reasoning = [] - variables = ds.get_variables_by_attributes( - cell_measures=lambda c: c is not None - ) - for var in variables: - search_str = r"^(?:area|volume): (\w+)$" - search_res = regex.search(search_str, var.cell_measures) - if not search_res: - valid = False - reasoning.append( - "The cell_measures attribute for variable {} " - "is formatted incorrectly. It should take the" - " form of either 'area: cell_var' or " - "'volume: cell_var' where cell_var is the " - "variable describing the cell measures".format(var.name) - ) - else: - valid = True - cell_meas_var_name = search_res.groups()[0] - # TODO: cache previous results - - # if the dataset has external_variables, get it - try: - external_variables = ds.getncattr("external_variables") - except AttributeError: - external_variables = [] - if cell_meas_var_name not in ds.variables: - if cell_meas_var_name not in external_variables: - valid = False - reasoning.append( - "Cell measure variable {} referred to by {} is not present in dataset variables".format( - cell_meas_var_name, var.name - ) - ) - else: - valid = True - - # make Result - result = Result( - BaseCheck.MEDIUM, valid, (self.section_titles["7.2"]), reasoning - ) - ret_val.append(result) - continue # can't test anything on an external var - - else: - cell_meas_var = ds.variables[cell_meas_var_name] - if not hasattr(cell_meas_var, "units"): - valid = False - reasoning.append( - "Cell measure variable {} is required " - "to have units attribute defined.".format( - cell_meas_var_name - ) - ) - if not set(cell_meas_var.dimensions).issubset(var.dimensions): - valid = False - reasoning.append( - "Cell measure variable {} must have " - "dimensions which are a subset of " - "those defined in variable {}.".format( - cell_meas_var_name, var.name - ) - ) - - result = Result( - BaseCheck.MEDIUM, valid, (self.section_titles["7.2"]), reasoning - ) - ret_val.append(result) - - return ret_val - - def _check_grid_mapping_attr_condition(self, attr, attr_name): - """ - Evaluate a condition (or series of conditions) for a particular - attribute. Implementation for CF-1.7. 
-
-        :param attr: attribute to test condition for
-        :param str attr_name: name of the attribute
-        :rtype tuple
-        :return two-tuple of (bool, str)
-        """
-
-        if attr_name == "geographic_crs_name":
-            return self._evaluate_geographic_crs_name(attr)
-
-        elif attr_name == "geoid_name":
-            return self._evaluate_geoid_name(attr)
-
-        elif attr_name == "geopotential_datum_name":
-            return self._evaluate_geopotential_datum_name(attr)
-
-        elif attr_name == "horizontal_datum_name":
-            return self._evaluate_horizontal_datum_name(attr)
-
-        elif attr_name == "prime_meridian_name":
-            return self._evaluate_prime_meridian_name(attr)
-
-        elif attr_name == "projected_crs_name":
-            return self._evaluate_projected_crs_name(attr)
-
-        elif attr_name == "reference_ellipsoid_name":
-            return self._evaluate_reference_ellipsoid_name(attr)
-
-        elif attr_name == "towgs84":
-            return self._evaluate_towgs84(attr)
-
-        else:  # invoke method from 1.6, as these names are all still valid
-            return super(CF1_7Check, self)._check_grid_mapping_attr_condition(
-                attr, attr_name
-            )
-
-    def _check_gmattr_existence_condition_geoid_name_geoptl_datum_name(self, var):
-        """
-        Check to see if both geoid_name and geopotential_datum_name exist as attributes
-        for `var`. They should not.
-
-        :param netCDF4.Variable var
-        :rtype tuple
-        :return two-tuple (bool, str)
-        """
-
-        msg = "Both geoid_name and geopotential_datum_name cannot exist"
-
-        if ("geoid_name" in var.ncattrs()) and (
-            "geopotential_datum_name" in var.ncattrs()
-        ):
-            return (False, msg)
-
-        else:
-            return (True, msg)
-
-    def _check_gmattr_existence_condition_ell_pmerid_hdatum(self, var):
-        """
-        If one of reference_ellipsoid_name, prime_meridian_name, or
-        horizontal_datum_name are defined as grid_mapping attributes,
-        they must all be defined.
-
-        :param netCDF4.Variable var
-        :rtype tuple
-        :return two-tuple (bool, str)
-        """
-
-        msg = (
-            "If any of reference_ellipsoid_name, prime_meridian_name, "
-            "or horizontal_datum_name are defined, all must be defined."
-        )
-
-        _ncattrs = set(var.ncattrs())
-
-        if any(
-            [
-                x in _ncattrs
-                for x in [
-                    "reference_ellipsoid_name",
-                    "prime_meridian_name",
-                    "horizontal_datum_name",
-                ]
-            ]
-        ) and (
-            not set(
-                [
-                    "reference_ellipsoid_name",
-                    "prime_meridian_name",
-                    "horizontal_datum_name",
-                ]
-            ).issubset(_ncattrs)
-        ):
-            return (False, msg)
-
-        else:
-            return (True, msg)
-
-    def _get_projdb_conn(self):
-        """
-        Return a SQLite Connection to the PROJ database.
-
-        Returns:
-            sqlite3.Connection
-        """
-
-        proj_db_path = os.path.join(pyproj.datadir.get_data_dir(), "proj.db")
-        return sqlite3.connect(proj_db_path)
-
-    def _exec_query_str_with_params(self, qstr, argtuple):
-        """
-        Execute a query string in a database connection with the given argument
-        tuple. Return a result set.
-
-        :param str qstr: desired query to be executed
-        :param tuple argtuple: tuple of arguments to be supplied to query
-        :rtype set
-        """
-
-        conn = self._get_projdb_conn()
-        return conn.execute(qstr, argtuple)
-
-    def _evaluate_geographic_crs_name(self, val):
-        """
-        Evaluate the condition for the geographic_crs_name attribute.
-
-        :param val: value to be tested
-        :rtype tuple
-        :return two-tuple of (bool, str)
-        """
-
-        query_str = (
-            "SELECT 1 FROM geodetic_crs WHERE name = ? "
-            "UNION ALL "  # need union in case contained in other tables
-            "SELECT 1 FROM alias_name WHERE alt_name = ? "
-            "AND table_name = 'geodetic_crs' LIMIT 1"
-        )
-
-        # try to find the value in the database
-        res_set = self._exec_query_str_with_params(query_str, (val, val))
-
-        # does it exist? if so, the query returns at least one row
-        return (
-            len(res_set.fetchall()) > 0,
-            "geographic_crs_name must correspond to a valid OGC WKT GEOGCS name",
-        )
-
-    def _evaluate_geoid_name(self, val):
-        """
-        Evaluate the condition for the geoid_name attribute.
-
-        :param val: value to be tested
-        :rtype tuple
-        :return two-tuple of (bool, str)
-        """
-
-        query_str = (
-            "SELECT 1 FROM vertical_datum WHERE name = ? "
-            "UNION ALL "
-            "SELECT 1 FROM alias_name WHERE alt_name = ? "
-            "AND table_name = 'vertical_datum' LIMIT 1"
-        )
-
-        # try to find the value in the database
-        res_set = self._exec_query_str_with_params(query_str, (val, val))
-
-        return (
-            len(res_set.fetchall()) > 0,
-            "geoid_name must correspond to a valid OGC WKT VERT_DATUM name",
-        )
-
-    def _evaluate_geopotential_datum_name(self, val):
-        """
-        Evaluate the condition for the geopotential_datum_name attribute.
-
-        :param val: value to be tested
-        :rtype tuple
-        :return two-tuple of (bool, str)
-        """
-
-        query_str = (
-            "SELECT 1 FROM vertical_datum WHERE name = ? "
-            "UNION ALL "
-            "SELECT 1 FROM alias_name WHERE alt_name = ? "
-            "AND table_name = 'vertical_datum' LIMIT 1"
-        )
-
-        # try to find the value in the database
-        res_set = self._exec_query_str_with_params(query_str, (val, val))
-
-        return (
-            len(res_set.fetchall()) > 0,
-            "geopotential_datum_name must correspond to a valid OGC WKT VERT_DATUM name",
-        )
-
-    def _evaluate_horizontal_datum_name(self, val):
-        """
-        Evaluate the condition for the horizontal_datum_name attribute.
-
-        :param val: value to be tested
-        :rtype tuple
-        :return two-tuple of (bool, str)
-        """
-
-        return (
-            val in horizontal_datum_names17,
-            (
-                "{} must be a valid Horizontal Datum Name; "
-                "see https://github.com/cf-convention/cf-conventions/wiki/Mapping-from-CF-Grid-Mapping-Attributes-to-CRS-WKT-Elements."
-            ),
-        )
-
-    def _evaluate_prime_meridian_name(self, val):
-        """
-        Evaluate the condition for the prime_meridian_name attribute.
-
-        :param val: value to be tested
-        :rtype tuple
-        :return two-tuple of (bool, str)
-        """
-
-        return (
-            val in prime_meridian_names17,
-            (
-                "{} must be a valid Prime Meridian name; "
-                "see https://github.com/cf-convention/cf-conventions/wiki/csv/prime_meridian.csv."
-            ),
-        )
-
-    def _evaluate_projected_crs_name(self, val):
-        """
-        Evaluate the condition for the projected_crs_name attribute.
-
-        :param val: value to be tested
-        :rtype tuple
-        :return two-tuple of (bool, str)
-        """
-
-        query_str = (
-            "SELECT 1 FROM projected_crs WHERE name = ? "
-            "UNION ALL "
-            "SELECT 1 FROM alias_name WHERE alt_name = ? "
-            "AND table_name = 'projected_crs' LIMIT 1"
-        )
-
-        # try to find the value in the database
-        res_set = self._exec_query_str_with_params(query_str, (val, val))
-
-        return (
-            len(res_set.fetchall()) > 0,
-            "projected_crs_name must correspond to a valid OGC WKT PROJCS name",
-        )
-
-    def _evaluate_reference_ellipsoid_name(self, val):
-        """
-        Evaluate the condition for the reference_ellipsoid_name attribute.
-
-        :param val: value to be tested
-        :rtype tuple
-        :return two-tuple of (bool, str)
-        """
-
-        return (
-            val in ellipsoid_names17,
-            (
-                "{} must be a valid Ellipsoid Name; "
-                "see https://github.com/cf-convention/cf-conventions/wiki/csv/ellipsoid.csv."
-            ),
-        )
-
-    def _evaluate_towgs84(self, val):
-        """
-        Evaluate the condition for the towgs84 attribute.
-
-        :param val: value to be tested
-        :rtype tuple
-        :return two-tuple of (bool, str)
-        """
-
-        msg = (
-            "towgs84 must be an array of length 3, 6, or 7 of double-precision"
-            " and correspond to an OGC WKT TOWGS84 node"
-        )
-
-        # if not numpy type, return false
-        if not getattr(val, "dtype", None):
-            return (False, msg)
-
-        # must be double-precision array
-        elif val.dtype != np.float64:
-            return (False, msg)
-
-        # must be of length 3, 6, or 7
-        elif not val.shape:  # single value
-            return (False, msg)
-
-        elif not (val.size in (3, 6, 7)):
-            return (False, msg)
-
-        else:
-            return (True, msg)
-
-    def check_grid_mapping(self, ds):
-        __doc__ = super(CF1_7Check, self).check_grid_mapping.__doc__
-        prev_return = super(CF1_7Check, self).check_grid_mapping(ds)
-        ret_val = []
-        grid_mapping_variables = cfutil.get_grid_mapping_variables(ds)
-        for var_name in sorted(grid_mapping_variables):
-            var = ds.variables[var_name]
-            test_ctx = self.get_test_ctx(
-                BaseCheck.HIGH, self.section_titles["5.6"], var.name
-            )
-
-            # TODO: check cases where crs_wkt provides part of a necessary
-            # grid_mapping attribute, or where a grid_mapping attribute
-            # overrides what has been provided in crs_wkt.
-            # attempt to parse crs_wkt if it is present
-            if "crs_wkt" in var.ncattrs():
-                crs_wkt = var.crs_wkt
-                if not isinstance(crs_wkt, str):
-                    test_ctx.messages.append("crs_wkt attribute must be a string")
-                    test_ctx.out_of += 1
-                else:
-                    try:
-                        pyproj.CRS.from_wkt(crs_wkt)
-                    except pyproj.exceptions.CRSError as crs_error:
-                        test_ctx.messages.append(
-                            "Cannot parse crs_wkt attribute to CRS using Proj4. Proj4 error: {}".format(
-                                str(crs_error)
-                            )
-                        )
-                    else:
-                        test_ctx.score += 1
-                    test_ctx.out_of += 1
-
-            # existence_conditions
-            exist_cond_1 = (
-                self._check_gmattr_existence_condition_geoid_name_geoptl_datum_name(var)
-            )
-            test_ctx.assert_true(exist_cond_1[0], exist_cond_1[1])
-            exist_cond_2 = self._check_gmattr_existence_condition_ell_pmerid_hdatum(var)
-            test_ctx.assert_true(exist_cond_2[0], exist_cond_2[1])
-
-            # handle vertical datum related grid_mapping attributes
-            vert_datum_attrs = {}
-            possible_vert_datum_attrs = {"geoid_name", "geopotential_datum_name"}
-            vert_datum_attrs = possible_vert_datum_attrs.intersection(var.ncattrs())
-            len_vdatum_name_attrs = len(vert_datum_attrs)
-            # check that geoid_name and geopotential_datum_name are not both
-            # present in the grid_mapping variable
-            if len_vdatum_name_attrs == 2:
-                test_ctx.out_of += 1
-                test_ctx.messages.append(
-                    "Cannot have both 'geoid_name' and "
-                    "'geopotential_datum_name' attributes in "
-                    "grid mapping variable '{}'".format(var.name)
-                )
-            elif len_vdatum_name_attrs == 1:
-                # should be one or zero attrs
-                proj_db_path = os.path.join(pyproj.datadir.get_data_dir(), "proj.db")
-                try:
-                    with sqlite3.connect(proj_db_path) as conn:
-                        v_datum_attr = next(iter(vert_datum_attrs))
-                        v_datum_value = getattr(var, v_datum_attr)
-                        v_datum_str_valid = self._process_v_datum_str(
-                            v_datum_value, conn
-                        )
-
-                        invalid_msg = (
-                            "Vertical datum value '{}' for "
-                            "attribute '{}' in grid mapping "
-                            "variable '{}' is not valid".format(
-                                v_datum_value, v_datum_attr, var.name
-                            )
-                        )
-                        test_ctx.assert_true(v_datum_str_valid, invalid_msg)
-                except sqlite3.Error as e:
-                    # if we hit an error, skip the check
-                    warn(
-                        "Error occurred while trying to query "
-                        "Proj4 SQLite database at {}: {}".format(proj_db_path, str(e))
-                    )
-            prev_return[var.name] = test_ctx.to_result()
-
-        return prev_return
-
-    def _process_v_datum_str(self, v_datum_str, conn):
-        vdatum_query = """SELECT 1 FROM alias_name WHERE
-                    table_name = 'vertical_datum' AND
-                    alt_name = ?
-                    UNION ALL
-                    SELECT 1 FROM vertical_datum WHERE
-                    name = ?
-                    LIMIT 1"""
-        res_set = conn.execute(vdatum_query, (v_datum_str, v_datum_str))
-        return len(res_set.fetchall()) > 0
-
-    def _check_dimensionless_vertical_coordinate_1_7(
-        self, ds, vname, deprecated_units, ret_val, dim_vert_coords_dict
-    ):
-        """
-        Check that a dimensionless vertical coordinate variable is valid under
-        CF-1.7.
-
-        :param netCDF4.Dataset ds: open netCDF4 dataset
-        :param str name: variable name
-        :param list ret_val: array to append Results to
-        :rtype None
-        """
-        variable = ds.variables[vname]
-        standard_name = getattr(variable, "standard_name", None)
-        units = getattr(variable, "units", None)
-        formula_terms = getattr(variable, "formula_terms", None)
-        # Skip the variable if it's dimensional
-        if formula_terms is None and standard_name not in dim_vert_coords_dict:
-            return
-
-        # assert that the computed_standard_name maps to the standard_name correctly
-        correct_computed_std_name_ctx = TestCtx(
-            BaseCheck.MEDIUM, self.section_titles["4.3"]
-        )
-        _comp_std_name = dim_vert_coords_dict[standard_name][1]
-        correct_computed_std_name_ctx.assert_true(
-            getattr(variable, "computed_standard_name", None) in _comp_std_name,
-            "§4.3.3 The standard_name of `{}` must map to the correct computed_standard_name, `{}`".format(
-                vname, sorted(_comp_std_name)
-            ),
-        )
-        ret_val.append(correct_computed_std_name_ctx.to_result())
-
-    def check_dimensionless_vertical_coordinates(self, ds):
-        """
-        Check the validity of dimensionless coordinates under CF
-
-        CF §4.3.2 The units attribute is not required for dimensionless
-        coordinates.
-
-        The standard_name attribute associates a coordinate with its definition
-        from Appendix D, Dimensionless Vertical Coordinates. The definition
-        provides a mapping between the dimensionless coordinate values and
-        dimensional values that can positively and uniquely indicate the
-        location of the data.
-
-        A new attribute, formula_terms, is used to associate terms in the
-        definitions with variables in a netCDF file. To maintain backwards
-        compatibility with COARDS the use of these attributes is not required,
-        but is strongly recommended.
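A sketch of the computed_standard_name lookup performed above, using a made-up slice of the Appendix D mapping (the real table lives in `compliance_checker.cf.appendix_d`; only the shape of the entries is assumed here):

```python
# (standard_name -> (formula-terms info, allowed computed_standard_names))
dim_vert_coords_dict = {
    "ocean_sigma_coordinate": (
        None,  # placeholder; the check above only uses index [1]
        {"altitude", "height_above_geopotential_datum"},
    ),
}

computed = "altitude"  # hypothetical computed_standard_name attribute value
_comp_std_name = dim_vert_coords_dict["ocean_sigma_coordinate"][1]
print(computed in _comp_std_name)  # True -> the §4.3.3 assertion passes
```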
- - :param netCDF4.Dataset ds: An open netCDF dataset - :rtype: list - :return: List of results - """ - ret_val = [] - - z_variables = cfutil.get_z_variables(ds) - deprecated_units = ["level", "layer", "sigma_level"] - - # compose this function to use the results from the CF-1.6 check - # and then extend it using a CF-1.7 addition - ret_val.extend( - self._check_dimensionless_vertical_coordinates( - ds, - deprecated_units, - self._check_dimensionless_vertical_coordinate_1_6, - dimless_vertical_coordinates_1_7, - ) - ) - - ret_val.extend( - self._check_dimensionless_vertical_coordinates( - ds, - deprecated_units, - self._check_dimensionless_vertical_coordinate_1_7, - dimless_vertical_coordinates_1_7, - ) - ) - - return ret_val - - -class CFNCCheck(BaseNCCheck, CFBaseCheck): - @classmethod - def beliefs(cls): # @TODO - return {} +# Version specific checkers organized in other modules +from compliance_checker.cf.cf_1_6 import CF1_6Check +from compliance_checker.cf.cf_1_7 import CF1_7Check +from compliance_checker.cf.cf_1_8 import CF1_8Check diff --git a/compliance_checker/cf/cf_1_6.py b/compliance_checker/cf/cf_1_6.py new file mode 100644 index 00000000..bfe93679 --- /dev/null +++ b/compliance_checker/cf/cf_1_6.py @@ -0,0 +1,3238 @@ +import logging +import os +import sys + +from collections import OrderedDict, defaultdict +from functools import wraps +from warnings import warn + +import numpy as np +import regex + +from cf_units import Unit + +from compliance_checker import cfutil +from compliance_checker.base import BaseCheck, BaseNCCheck, Result, TestCtx +from compliance_checker.cf import util +from compliance_checker.cf.appendix_d import (dimless_vertical_coordinates_1_6, + no_missing_terms) +from compliance_checker.cf.appendix_e import cell_methods16 +from compliance_checker.cf.appendix_f import (grid_mapping_attr_types16, + grid_mapping_dict16) + +from compliance_checker.cf.cf_base import CFNCCheck, appendix_a_base +logger = logging.getLogger(__name__) + +class CF1_6Check(CFNCCheck): + """CF-1.6-specific implementation of CFBaseCheck; supports checking + netCDF datasets. + These checks are translated documents: + http://cf-pcmdi.llnl.gov/documents/cf-conventions/1.6/cf-conventions.html + http://cf-pcmdi.llnl.gov/conformance/requirements-and-recommendations/1.6/""" + + register_checker = True + _cc_spec = "cf" + _cc_spec_version = "1.6" + _cc_description = "Climate and Forecast Conventions (CF)" + _cc_url = "http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html" + _cc_display_headers = {3: "Errors", 2: "Warnings", 1: "Info"} + appendix_a = appendix_a_base + + def __init__(self, options=None): # initialize with parent methods and data + super(CF1_6Check, self).__init__(options) + + self.cell_methods = cell_methods16 + self.grid_mapping_dict = grid_mapping_dict16 + self.grid_mapping_attr_types = grid_mapping_attr_types16 + + ############################################################################### + # Chapter 2: NetCDF Files and Components + ############################################################################### + + def check_data_types(self, ds): + """ + Checks the data type of all netCDF variables to ensure they are valid + data types under CF. 
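For orientation, a sketch of how the re-exports above are consumed; the import path follows the new module layout shown in this diff:

```python
# cf.py now only re-exports the version-specific checkers, so downstream
# code keeps importing from one place instead of the versioned modules
from compliance_checker.cf.cf import CF1_6Check, CF1_7Check, CF1_8Check

checker = CF1_6Check()
print(checker._cc_spec, checker._cc_spec_version)  # -> cf 1.6
```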
+
+        CF §2.2 The netCDF data types char, byte, short, int, float or real, and
+        double are all acceptable
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: compliance_checker.base.Result
+        """
+        fails = []
+        total = len(ds.variables)
+
+        for k, v in ds.variables.items():
+            if (
+                v.dtype is not str
+                and v.dtype.kind != "S"
+                and all(
+                    v.dtype.type != t
+                    for t in (
+                        np.character,
+                        np.dtype("|S1"),
+                        np.dtype("b"),
+                        np.dtype("i2"),
+                        np.dtype("i4"),
+                        np.float32,
+                        np.double,
+                    )
+                )
+            ):
+                fails.append(
+                    "The variable {} failed because the datatype is {}".format(
+                        k, v.datatype
+                    )
+                )
+        return Result(
+            BaseCheck.HIGH,
+            (total - len(fails), total),
+            self.section_titles["2.2"],
+            msgs=fails,
+        )
+
+    def check_child_attr_data_types(self, ds):
+        """
+        For any variables which contain any of the following attributes:
+            - valid_min/valid_max
+            - valid_range
+            - scale_factor
+            - add_offset
+            - _FillValue
+        the data type of the attribute must match the type of its parent variable as specified in the
+        NetCDF User Guide (NUG) https://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html,
+        referenced in the CF Conventions in Section 2.5.2
+        (http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/cf-conventions.html#missing-data)
+
+        :param netCDF4.Dataset ds: open netCDF dataset object
+        :rtype: compliance_checker.base.Result
+        """
+
+        ctx = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.5"])
+        special_attrs = {
+            "actual_range",
+            "valid_min",
+            "valid_max",
+            "valid_range",
+            "_FillValue",
+        }
+
+        for var_name, var in ds.variables.items():
+            for att_name in special_attrs.intersection(var.ncattrs()):
+                self._parent_var_attr_type_check(att_name, var, ctx)
+        return ctx.to_result()
+
+    # TODO: consider renaming to avoid confusion with non-underscore
+    #       primary function version
+    def _check_add_offset_scale_factor_type(self, variable, attr_name):
+        """
+        Reusable function for checking both add_offset and scale_factor.
+        """
+
+        msgs = []
+        error_msg = (
+            f"Variable {variable.name} and {attr_name} must be equivalent "
+            f"data types or {variable.name} must be of type byte, short, or int "
+            f"and {attr_name} must be float or double"
+        )
+
+        att = getattr(variable, attr_name, None)
+        if not (isinstance(att, (np.number, float))):  # can't compare dtypes
+            val = False
+
+        else:
+            # compare dtype kinds rather than isinstance on dtype objects,
+            # which never matches; np.asarray also handles plain Python floats
+            att_dtype = np.asarray(att).dtype
+            val = (
+                att_dtype == variable.dtype
+            ) or (  # will short-circuit or if first condition is true
+                np.issubdtype(att_dtype, np.floating)
+                and np.issubdtype(variable.dtype, np.integer)
+            )
+        if not val:
+            msgs.append(error_msg)
+
+        return Result(BaseCheck.MEDIUM, val, self.section_titles["8.1"], msgs)
+
+    def check_add_offset_scale_factor_type(self, ds):
+        """
+        If a variable has the attributes add_offset and scale_factor,
+        check that the variables and attributes are of the same type
+        OR that the variable is of type byte, short or int and the
+        attributes are of type float or double.
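The byte/short/int plus float/double pairing that §8.1 permits looks like this in practice; a small sketch with hypothetical packed data:

```python
import numpy as np

packed = np.array([100, 200, 300], dtype=np.int16)  # byte/short/int variable
scale_factor = np.float32(0.01)                     # float/double attributes
add_offset = np.float32(20.0)

# This combination passes the check; equal dtypes would also pass
assert np.issubdtype(packed.dtype, np.integer)
assert np.issubdtype(np.asarray(scale_factor).dtype, np.floating)

print(packed * scale_factor + add_offset)  # [21. 22. 23.]
```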
+        """
+
+        results = []
+        add_offset_vars = ds.get_variables_by_attributes(
+            add_offset=lambda x: x is not None
+        )
+        scale_factor_vars = ds.get_variables_by_attributes(
+            scale_factor=lambda x: x is not None
+        )
+
+        for _att_vars_tup in (
+            ("add_offset", add_offset_vars),
+            ("scale_factor", scale_factor_vars),
+        ):
+            results.extend(
+                list(
+                    map(
+                        lambda var: self._check_add_offset_scale_factor_type(
+                            var, _att_vars_tup[0]
+                        ),
+                        _att_vars_tup[1],
+                    )
+                )
+            )
+
+        return results
+
+    def check_naming_conventions(self, ds):
+        """
+        Checks the variable names to ensure they are valid CF variable names under CF.
+
+        CF §2.3 Variable, dimension and attribute names should begin with a letter
+        and be composed of letters, digits, and underscores.
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: compliance_checker.base.Result
+        """
+        ret_val = []
+        variable_naming = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.3"])
+        dimension_naming = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.3"])
+        attribute_naming = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.3"])
+
+        ignore_attributes = [
+            "_FillValue",
+            "DODS",
+            "_ChunkSizes",
+            "_Coordinate",
+            "_Unsigned",
+        ]
+
+        rname = regex.compile("^[A-Za-z][A-Za-z0-9_]*$")
+
+        for name, variable in ds.variables.items():
+            variable_naming.assert_true(
+                rname.match(name) is not None,
+                "variable {} should begin with a letter and be composed of "
+                "letters, digits, and underscores".format(name),
+            )
+
+            # Keep track of all the attributes, we'll need to check them
+            for attr in variable.ncattrs():
+                if attr in ignore_attributes:
+                    continue
+                # Special attributes made by THREDDS
+                if attr.startswith("DODS"):
+                    continue
+                # Ignore model produced attributes
+                if attr.startswith("_Coordinate"):
+                    continue
+                attribute_naming.assert_true(
+                    rname.match(attr) is not None,
+                    "attribute {}:{} should begin with a letter and be composed of "
+                    "letters, digits, and underscores".format(name, attr),
+                )
+
+        ret_val.append(variable_naming.to_result())
+
+        for dimension in ds.dimensions:
+            dimension_naming.assert_true(
+                rname.match(dimension) is not None,
+                "dimension {} should begin with a letter and be composed of "
+                "letters, digits, and underscores".format(dimension),
+            )
+        ret_val.append(dimension_naming.to_result())
+
+        for global_attr in ds.ncattrs():
+            # Special attributes made by THREDDS
+            if global_attr.startswith("DODS"):
+                continue
+            if global_attr.startswith("EXTRA_DIMENSION"):
+                continue
+            attribute_naming.assert_true(
+                rname.match(global_attr) is not None,
+                "global attribute {} should begin with a letter and be composed of "
+                "letters, digits, and underscores".format(global_attr),
+            )
+        ret_val.append(attribute_naming.to_result())
+
+        return ret_val
+
+    def check_names_unique(self, ds):
+        """
+        Checks the variable names for uniqueness regardless of case.
+
+        CF §2.3 names should not be distinguished purely by case, i.e., if case
+        is disregarded, no two names should be the same.
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: compliance_checker.base.Result
+        """
+        fails = []
+        total = len(ds.variables)
+        names = defaultdict(int)
+
+        for k in ds.variables:
+            names[k.lower()] += 1
+
+        fails = [
+            "Variables are not case sensitive. 
Duplicate variables named: %s" % k + for k, v in names.items() + if v > 1 + ] + return Result( + BaseCheck.MEDIUM, + (total - len(fails), total), + self.section_titles["2.3"], + msgs=fails, + ) + + def check_dimension_names(self, ds): + """ + Checks variables contain no duplicate dimension names. + + CF §2.4 A variable may have any number of dimensions, including zero, + and the dimensions must all have different names. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: compliance_checker.base.Result + """ + fails = [] + total = len(ds.variables) + + for k, v in ds.variables.items(): + dims = defaultdict(int) + for d in v.dimensions: + dims[d] += 1 + + for dimension, count in dims.items(): + if count > 1: + fails.append( + "%s has two or more dimensions named %s" % (k, dimension) + ) + + return Result( + BaseCheck.HIGH, + (total - len(fails), total), + self.section_titles["2.4"], + msgs=fails, + ) + + def check_dimension_order(self, ds): + """ + Checks each variable's dimension order to ensure that the order is + consistent and in order under CF §2.4 + + CF §2.4 If any or all of the dimensions of a variable have the + interpretations of "date or time" (T), "height or depth" (Z), + "latitude" (Y), or "longitude" (X) then we recommend, those dimensions + to appear in the relative order T, then Z, then Y, then X in the CDL + definition corresponding to the file. All other dimensions should, + whenever possible, be placed to the left of the spatiotemporal + dimensions. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: compliance_checker.base.Result + """ + valid_dimension_order = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.4"]) + # Build a map from coordinate variable to axis + coord_axis_map = self._get_coord_axis_map(ds) + + # Check each variable's dimension order, excluding climatology and + # bounds variables + any_clim = cfutil.get_climatology_variable(ds) + any_bounds = cfutil.get_cell_boundary_variables(ds) + for name, variable in ds.variables.items(): + # Skip bounds/climatology variables, as they should implicitly + # have the same order except for the bounds specific dimension. + # This is tested later in the respective checks + if name in any_bounds or name == any_clim: + continue + + # Skip strings/labels + if hasattr(variable.dtype, "char") and variable.dtype.char == "S": + continue + elif variable.dtype == str: + continue + + if variable.dimensions: + dimension_order = self._get_dimension_order(ds, name, coord_axis_map) + valid_dimension_order.assert_true( + self._dims_in_order(dimension_order), + "{}'s spatio-temporal dimensions are not in the " + "recommended order T, Z, Y, X and/or further dimensions " + "are not located left of T, Z, Y, X. The dimensions (and " + "their guessed types) are {} (with U: other/unknown; L: " + "unlimited).".format( + name, + self._get_pretty_dimension_order_with_type( + ds, name, dimension_order + ), + ), + ) + return valid_dimension_order.to_result() + + def check_fill_value_outside_valid_range(self, ds): + """ + Checks each variable's _FillValue to ensure that it's in valid_range or + between valid_min and valid_max according to CF §2.5.1 + + CF §2.5.1 The _FillValue should be outside the range specified by + valid_range (if used) for a variable. 
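A condensed sketch of the §2.5.1 comparison that follows, with hypothetical attribute values:

```python
import numpy as np

valid_range = np.array([0.0, 50.0])  # or valid_min/valid_max
fill_value = -999.0

rmin, rmax = valid_range
valid = np.isnan(fill_value) or fill_value < rmin or fill_value > rmax
print(valid)  # True; a fill value of, say, 25.0 would be flagged
```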
+ + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of Results + """ + valid_fill_range = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.5"]) + + for name, variable in ds.variables.items(): + # If the variable doesn't have a defined _FillValue don't check it. + + if not hasattr(variable, "_FillValue"): + continue + + fill_value = variable._FillValue + + attrs = variable.ncattrs() + + if "valid_range" in attrs: + if isinstance(variable.valid_range, str): + m = "§2.5.1 Fill Values should be outside the range specified by valid_range" # subsection message + valid_fill_range.assert_true( + False, + "{};\n\t{}:valid_range must be a numeric type not a string".format( + m, name + ), + ) + continue + rmin, rmax = variable.valid_range + spec_by = "valid_range" + + elif "valid_min" in attrs and "valid_max" in attrs: + if isinstance(variable.valid_min, str): + valid_fill_range.assert_true( + False, + "{}:valid_min must be a numeric type not a string".format(name), + ) + if isinstance(variable.valid_max, str): + valid_fill_range.assert_true( + False, + "{}:valid_max must be a numeric type not a string".format(name), + ) + if isinstance(variable.valid_min, str) or isinstance( + variable.valid_max, str + ): + continue + rmin = variable.valid_min + rmax = variable.valid_max + spec_by = "valid_min/valid_max" + else: + continue + + if np.isnan(fill_value): + valid = True + else: + valid = fill_value < rmin or fill_value > rmax + + valid_fill_range.assert_true( + valid, + "{}:_FillValue ({}) should be outside the range specified by {} ({}, {})" + "".format(name, fill_value, spec_by, rmin, rmax), + ) + + return valid_fill_range.to_result() + + def check_convention_globals(self, ds): + """ + Check the common global attributes are strings if they exist. + + CF §2.6.2 title/history global attributes, must be strings. Do not need + to exist. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of Results + """ + attrs = ["title", "history"] + + valid_globals = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.6"]) + + for attr in attrs: + dataset_attr = getattr(ds, attr, None) + is_string = isinstance(dataset_attr, str) + valid_globals.assert_true( + is_string and len(dataset_attr), + "§2.6.2 global attribute {} should exist and be a non-empty string" # subsection message + "".format(attr), + ) + return valid_globals.to_result() + + def check_convention_possibly_var_attrs(self, ds): + """ + Check variable and global attributes are strings for recommended attributes under CF §2.6.2 + + CF §2.6.2 institution, source, references, and comment, either global + or assigned to individual variables. When an attribute appears both + globally and as a variable attribute, the variable's version has + precedence. Must be strings. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of Results + """ + # The attrs are optional and only needs to be a string and non-empty if it + # exists. + attrs = ["institution", "source", "references", "comment"] + + valid_attributes = TestCtx(BaseCheck.MEDIUM, self.section_titles["2.6"]) + + attr_bin = set() + # If the attribute is defined for any variable, check it and mark in + # the set that we've seen it at least once. 
+ for name, variable in ds.variables.items(): + for attribute in variable.ncattrs(): + varattr = getattr(variable, attribute) + if attribute in attrs: + is_string = isinstance(varattr, str) + valid_attributes.assert_true( + is_string and len(varattr) > 0, + "§2.6.2 {}:{} should be a non-empty string" + "".format(name, attribute), + ) + attr_bin.add(attribute) + + # Check all the global attributes too and mark if we've seen them + for attribute in ds.ncattrs(): + dsattr = getattr(ds, attribute) + if attribute in attrs: + is_string = isinstance(dsattr, str) + valid_attributes.assert_true( + is_string and len(dsattr) > 0, + "§2.6.2 {} global attribute should be a non-empty string" + "".format(attribute), + ) + attr_bin.add(attribute) + return valid_attributes.to_result() + + ############################################################################### + # Chapter 3: Description of the Data + ############################################################################### + + def check_units(self, ds): + """ + Check the units attribute for all variables to ensure they are CF + compliant under CF §3.1 + + CF §3.1 The units attribute is required for all variables that represent dimensional quantities + (except for boundary variables defined in Section 7.1, "Cell Boundaries" and climatology variables + defined in Section 7.4, "Climatological Statistics"). + + Units are not required for dimensionless quantities. A variable with no units attribute is assumed + to be dimensionless. However, a units attribute specifying a dimensionless unit may optionally be + included. + + - units required + - type must be recognized by udunits + - if standard name specified, must be consistent with standard name table, must also be consistent with a + specified cell_methods attribute if present + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + + ret_val = [] + + coordinate_variables = self._find_coord_vars(ds) + auxiliary_coordinates = self._find_aux_coord_vars(ds) + geophysical_variables = self._find_geophysical_vars(ds) + forecast_variables = cfutil.get_forecast_metadata_variables(ds) + + unit_required_variables = set( + coordinate_variables + + auxiliary_coordinates + + geophysical_variables + + forecast_variables + ) + + for name in unit_required_variables: + # For reduced horizontal grids, the compression index variable does + # not require units. 
+ if cfutil.is_compression_coordinate(ds, name): + continue + + variable = ds.variables[name] + + # Skip instance coordinate variables + if getattr(variable, "cf_role", None) is not None: + continue + + # Skip labels + if ( + hasattr(variable.dtype, "char") and variable.dtype.char == "S" + ) or variable.dtype == str: + continue + + standard_name = getattr(variable, "standard_name", None) + standard_name, standard_name_modifier = self._split_standard_name( + standard_name + ) + + units = getattr(variable, "units", None) + + valid_units = self._check_valid_cf_units(ds, name) + ret_val.append(valid_units) + + units_attr_is_string = TestCtx(BaseCheck.MEDIUM, self.section_titles["3.1"]) + + # side effects, but better than teasing out the individual result + if units_attr_is_string.assert_true( + isinstance(units, str), + "units ({}) attribute of '{}' must be a string compatible with UDUNITS".format( + units, variable.name + ), + ): + valid_udunits = self._check_valid_udunits(ds, name) + ret_val.append(valid_udunits) + ret_val.append(units_attr_is_string.to_result()) + + if isinstance(standard_name, str): + valid_standard_units = self._check_valid_standard_units(ds, name) + ret_val.append(valid_standard_units) + + return ret_val + + def _check_valid_cf_units(self, ds, variable_name): + """ + Checks that the variable contains units attribute, the attribute is a + string and the value is not deprecated by CF + + :param netCDF4.Dataset ds: An open netCDF dataset + :param str variable_name: Name of the variable to be checked + :rtype: + :return: List of results + """ + + # This list is straight from section 3 + deprecated = ["level", "layer", "sigma_level"] + variable = ds.variables[variable_name] + + units = getattr(variable, "units", None) + standard_name_full = getattr(variable, "standard_name", None) + standard_name, standard_name_modifier = self._split_standard_name( + standard_name_full + ) + std_name_units_dimensionless = cfutil.is_dimensionless_standard_name( + self._std_names._root, standard_name + ) + # Is this even in the database? also, if there is no standard_name, + # there's no way to know if it is dimensionless. 
+ should_be_dimensionless = ( + variable.dtype is str + or (hasattr(variable.dtype, "char") and variable.dtype.char == "S") + or std_name_units_dimensionless + or standard_name is None + ) + + # 1) Units must exist + valid_units = TestCtx(BaseCheck.HIGH, self.section_titles["3.1"]) + valid_units.assert_true( + should_be_dimensionless or units is not None, + "units attribute is required for {} when variable is not a dimensionless quantity".format( + variable_name + ), + ) + + # Don't bother checking the rest + if units is None and not should_be_dimensionless: + return valid_units.to_result() + # 2) units attribute must be a string + valid_units.assert_true( + should_be_dimensionless or isinstance(units, str), + "units attribute for {} needs to be a string".format(variable_name), + ) + + # 3) units are not deprecated + valid_units.assert_true( + units not in deprecated, + 'units for {}, "{}" are deprecated by CF 1.6'.format(variable_name, units), + ) + + return valid_units.to_result() + + def _check_valid_udunits(self, ds, variable_name): + """ + Checks that the variable's units are contained in UDUnits + + :param netCDF4.Dataset ds: An open netCDF dataset + :param str variable_name: Name of the variable to be checked + """ + variable = ds.variables[variable_name] + + units = getattr(variable, "units", None) + standard_name = getattr(variable, "standard_name", None) + standard_name, standard_name_modifier = self._split_standard_name(standard_name) + std_name_units_dimensionless = cfutil.is_dimensionless_standard_name( + self._std_names._root, standard_name + ) + + # If the variable is supposed to be dimensionless, it automatically passes + should_be_dimensionless = ( + variable.dtype is str + or (hasattr(variable.dtype, "char") and variable.dtype.char == "S") + or std_name_units_dimensionless + ) + + valid_udunits = TestCtx(BaseCheck.HIGH, self.section_titles["3.1"]) + are_udunits = units is not None and util.units_known(units) + valid_udunits.assert_true( + should_be_dimensionless or are_udunits, + 'units for {}, "{}" are not recognized by UDUNITS'.format( + variable_name, units + ), + ) + return valid_udunits.to_result() + + def _check_valid_standard_units(self, ds, variable_name): + """ + Checks that the variable's units are appropriate for the standard name + according to the CF standard name table and coordinate sections in CF + 1.6 + + :param netCDF4.Dataset ds: An open netCDF dataset + :param str variable_name: Name of the variable to be checked + """ + variable = ds.variables[variable_name] + units = getattr(variable, "units", None) + standard_name = getattr(variable, "standard_name", None) + + valid_standard_units = TestCtx(BaseCheck.HIGH, self.section_titles["3.1"]) + + # If the variable is supposed to be dimensionless, it automatically passes + std_name_units_dimensionless = cfutil.is_dimensionless_standard_name( + self._std_names._root, standard_name + ) + + standard_name, standard_name_modifier = self._split_standard_name(standard_name) + + standard_entry = self._std_names.get(standard_name, None) + if standard_entry is not None: + canonical_units = standard_entry.canonical_units + else: + # Any unit comparisons with None returns False + canonical_units = None + + # Other standard_name modifiers have the same units as the + # unmodified standard name or are not checked for units. 
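`util.units_known` and `util.units_convertible` defer to cf_units; roughly, assuming cf_units is installed:

```python
from cf_units import Unit

print(Unit("hPa").is_convertible("bar"))  # True, both pressures
print(Unit("m").is_convertible("km"))     # True, both lengths
print(Unit("m").is_convertible("degC"))   # False, incompatible quantities
```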
+
+        if standard_name_modifier == "number_of_observations":
+            canonical_units = "1"
+
+        # This section represents the different cases where simple udunits
+        # comparison isn't comprehensive enough to determine if the units are
+        # appropriate under CF
+
+        # UDUnits accepts "s" as a unit of time but it should be <unit> since <epoch>
+        if standard_name == "time":
+            valid_standard_units.assert_true(
+                util.units_convertible(units, "seconds since 1970-01-01"),
+                "time must be in a valid units format <unit> since <epoch> "
+                "not {}".format(units),
+            )
+
+        # UDunits can't tell the difference between east and north facing coordinates
+        elif standard_name == "latitude":
+            # degrees is allowed if using a transformed grid
+            allowed_units = cfutil.VALID_LAT_UNITS | {"degrees"}
+            valid_standard_units.assert_true(
+                units.lower() in allowed_units,
+                'variables defining latitude ("{}") must use degrees_north '
+                "or degrees if defining a transformed grid. Currently "
+                "{}".format(variable_name, units),
+            )
+        # UDunits can't tell the difference between east and north facing coordinates
+        elif standard_name == "longitude":
+            # degrees is allowed if using a transformed grid
+            allowed_units = cfutil.VALID_LON_UNITS | {"degrees"}
+            valid_standard_units.assert_true(
+                units.lower() in allowed_units,
+                'variables defining longitude ("{}") must use degrees_east '
+                "or degrees if defining a transformed grid. Currently "
+                "{}".format(variable_name, units),
+            )
+        # Standard Name table agrees the unit should be dimensionless
+        elif std_name_units_dimensionless:
+            valid_standard_units.assert_true(True, "")
+
+        elif canonical_units is not None:
+            valid_standard_units.assert_true(
+                util.units_convertible(canonical_units, units),
+                "units for variable {} must be convertible to {} "
+                "currently they are {}".format(variable_name, canonical_units, units),
+            )
+
+        return valid_standard_units.to_result()
+
+    def check_standard_name(self, ds):
+        """
+        Check a variable's standard_name attribute to ensure that it meets CF
+        compliance.
+
+        CF §3.3 A standard name is associated with a variable via the attribute
+        standard_name which takes a string value comprised of a standard name
+        optionally followed by one or more blanks and a standard name modifier
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: list
+        :return: List of results
+        """
+        ret_val = []
+
+        coord_vars = self._find_coord_vars(ds)
+        aux_coord_vars = self._find_aux_coord_vars(ds)
+        axis_vars = cfutil.get_axis_variables(ds)
+        flag_vars = cfutil.get_flag_variables(ds)
+        geophysical_vars = self._find_geophysical_vars(ds)
+
+        variables_requiring_standard_names = (
+            coord_vars + aux_coord_vars + axis_vars + flag_vars + geophysical_vars
+        )
+        for name in set(variables_requiring_standard_names):
+            # Compression indices used in reduced horizontal grids or
+            # compression schemes do not require attributes other than compress
+            if cfutil.is_compression_coordinate(ds, name):
+                continue
+
+            ncvar = ds.variables[name]
+
+            # §9 doesn't explicitly allow instance variables as coordinates but
+            # it's loosely implied. Just in case, skip it.
+            if hasattr(ncvar, "cf_role"):
+                continue
+
+            # Unfortunately, §6.1 allows for string types to be listed as
+            # coordinates.
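A simplified sketch of what `_split_standard_name` does with a modified standard name (the attribute value here is hypothetical):

```python
attr = "sea_water_temperature standard_error"

standard_name, _, modifier = attr.partition(" ")
print(standard_name)     # checked against the standard name table
print(modifier or None)  # checked against the Appendix C modifiers
```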
+ if hasattr(ncvar.dtype, "char") and ncvar.dtype.char == "S": + continue + elif ncvar.dtype == str: + continue + + standard_name = getattr(ncvar, "standard_name", None) + standard_name, standard_name_modifier = self._split_standard_name( + standard_name + ) + long_name = getattr(ncvar, "long_name", None) + long_or_std_name = TestCtx(BaseCheck.HIGH, self.section_titles["3.3"]) + if long_name is not None: + long_name_present = True + long_or_std_name.assert_true( + isinstance(long_name, str), + "Attribute long_name for variable {} must be a string".format(name), + ) + else: + long_name_present = False + # §1.3 The long_name and standard_name attributes are used to + # describe the content of each variable. For backwards + # compatibility with COARDS neither is required, but use of at + # least one of them is strongly recommended. + + # If standard_name is not defined but long_name is, don't continue + # the check for this variable + if standard_name is not None: + standard_name_present = True + valid_std_name = TestCtx(BaseCheck.HIGH, self.section_titles["3.3"]) + valid_std_name.assert_true( + isinstance(standard_name, str), + "Attribute standard_name for variable {} must be a string".format( + name + ), + ) + if isinstance(standard_name, str): + valid_std_name.assert_true( + standard_name in self._std_names, + "standard_name {} is not defined in Standard Name Table v{}".format( + standard_name or "undefined", self._std_names._version + ), + ) + + ret_val.append(valid_std_name.to_result()) + + # 2) optional - if modifiers, should be in table + if standard_name_modifier is not None: + valid_modifier = TestCtx(BaseCheck.HIGH, self.section_titles["3.3"]) + allowed = [ + "detection_minimum", + "number_of_observations", + "standard_error", + "status_flag", + ] + valid_modifier.assert_true( + standard_name_modifier in allowed, + "standard_name modifier {} for variable {} is not a valid modifier " + "according to appendix C".format(standard_name_modifier, name), + ) + + ret_val.append(valid_modifier.to_result()) + else: + standard_name_present = False + + long_or_std_name.assert_true( + long_name_present or standard_name_present, + "Attribute long_name or/and standard_name is highly recommended for variable {}".format( + name + ), + ) + ret_val.append(long_or_std_name.to_result()) + return ret_val + + def check_ancillary_variables(self, ds): + """ + Checks the ancillary_variable attribute for all variables to ensure + they are CF compliant. + + CF §3.4 It is a string attribute whose value is a blank separated list + of variable names. The nature of the relationship between variables + associated via ancillary_variables must be determined by other + attributes. The variables listed by the ancillary_variables attribute + will often have the standard name of the variable which points to them + including a modifier (Appendix C, Standard Name Modifiers) to indicate + the relationship. 
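A sketch of the membership test the loop below performs, with hypothetical names:

```python
ancillary_variables = "temp_qc temp_stderr"  # blank-separated attribute
dataset_variables = {"temp", "temp_qc"}      # names present in the dataset

for ancillary_variable in ancillary_variables.split():
    # temp_stderr would fail: not a variable in this dataset
    print(ancillary_variable, ancillary_variable in dataset_variables)
```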
+ + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + ret_val = [] + + for ncvar in ds.get_variables_by_attributes( + ancillary_variables=lambda x: x is not None + ): + name = ncvar.name + valid_ancillary = TestCtx(BaseCheck.HIGH, self.section_titles["3.4"]) + ancillary_variables = ncvar.ancillary_variables + + valid_ancillary.assert_true( + isinstance(ancillary_variables, str), + "ancillary_variables attribute defined by {} " + "should be string".format(name), + ) + + # Can't perform the second check if it's not a string + if not isinstance(ancillary_variables, str): + ret_val.append(valid_ancillary.to_result()) + continue + + for ancillary_variable in ancillary_variables.split(): + valid_ancillary.assert_true( + ancillary_variable in ds.variables, + "{} is not a variable in this dataset".format(ancillary_variable), + ) + + ret_val.append(valid_ancillary.to_result()) + + return ret_val + + def check_flags(self, ds): + """ + Check the flag_values, flag_masks and flag_meanings attributes for + variables to ensure they are CF compliant. + + CF §3.5 The attributes flag_values, flag_masks and flag_meanings are + intended to make variables that contain flag values self describing. + Status codes and Boolean (binary) condition flags may be expressed with + different combinations of flag_values and flag_masks attribute + definitions. + + The flag_values and flag_meanings attributes describe a status flag + consisting of mutually exclusive coded values. + + The flag_meanings attribute is a string whose value is a blank + separated list of descriptive words or phrases, one for each flag + value. Each word or phrase should consist of characters from the + alphanumeric set and the following five: '_', '-', '.', '+', '@'. + + The flag_masks and flag_meanings attributes describe a number of + independent Boolean conditions using bit field notation by setting + unique bits in each flag_masks value. + + The flag_masks, flag_values and flag_meanings attributes, used + together, describe a blend of independent Boolean conditions and + enumerated status codes. A flagged condition is identified by a bitwise + AND of the variable value and each flag_masks value; a result that + matches the flag_values value indicates a true condition. 
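The mask/value blend described above can be sanity-checked in a few lines; the flag definitions here are invented:

```python
import numpy as np

flag_masks = np.array([0b0011, 0b0011, 0b0100], dtype=np.int8)
flag_values = np.array([0b0001, 0b0010, 0b0100], dtype=np.int8)

sample = np.int8(0b0101)  # hypothetical datum
for mask, value in zip(flag_masks, flag_values):
    # bitwise AND with the mask, then compare against the flag value
    if sample & mask == value:
        print("condition {:#06b} is set".format(value))
```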
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: list
+        :return: List of results
+        """
+        ret_val = []
+
+        for name in cfutil.get_flag_variables(ds):
+            variable = ds.variables[name]
+            flag_values = getattr(variable, "flag_values", None)
+            flag_masks = getattr(variable, "flag_masks", None)
+
+            valid_flags_var = TestCtx(BaseCheck.HIGH, self.section_titles["3.5"])
+            # Check that the variable defines mask or values
+            valid_flags_var.assert_true(
+                flag_values is not None or flag_masks is not None,
+                "{} does not define either flag_masks or flag_values".format(name),
+            )
+            ret_val.append(valid_flags_var.to_result())
+
+            valid_meanings = self._check_flag_meanings(ds, name)
+            ret_val.append(valid_meanings)
+
+            # check flag_values
+            if flag_values is not None:
+                valid_values = self._check_flag_values(ds, name)
+                ret_val.append(valid_values)
+
+            # check flag_masks
+            if flag_masks is not None:
+                valid_masks = self._check_flag_masks(ds, name)
+                ret_val.append(valid_masks)
+
+            if flag_values is not None and flag_masks is not None:
+                # a flagged condition is true when value & mask == value
+                allv = [
+                    value & mask == value
+                    for value, mask in zip(flag_values, flag_masks)
+                ]
+
+                allvr = Result(BaseCheck.MEDIUM, all(allv), self.section_titles["3.5"])
+                if not allvr.value:
+                    allvr.msgs = [
+                        "flag masks and flag values for '{}' combined don't equal flag value".format(
+                            name
+                        )
+                    ]
+
+                ret_val.append(allvr)
+
+        return ret_val
+
+    def _check_flag_values(self, ds, name):
+        """
+        Checks a variable's flag_values attribute for compliance under CF
+
+        - flag_values exists as an array
+        - unique elements in flag_values
+        - flag_values is the same dtype as the variable
+        - flag_values is the same length as flag_meanings
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :param str name: Name of variable to check
+        :rtype: compliance_checker.base.Result
+        """
+        variable = ds.variables[name]
+
+        flag_values = getattr(variable, "flag_values", None)
+        flag_meanings = getattr(variable, "flag_meanings", None)
+        valid_values = TestCtx(BaseCheck.HIGH, self.section_titles["3.5"])
+
+        # flag_values must be a list of values, not a string or anything else
+        valid_values.assert_true(
+            isinstance(flag_values, np.ndarray),
+            "{}'s flag_values must be an array of values not {}".format(
+                name, type(flag_values)
+            ),
+        )
+
+        # We can't perform any more checks
+        if not isinstance(flag_values, np.ndarray):
+            return valid_values.to_result()
+
+        # the flag values must be independent, no repeating values
+        flag_set = set(flag_values)
+        valid_values.assert_true(
+            len(flag_set) == len(flag_values),
+            "{}'s flag_values must be independent and can not be repeated".format(name),
+        )
+
+        # the data type for flag_values should be the same as the variable
+        valid_values.assert_true(
+            variable.dtype.type == flag_values.dtype.type,
+            "flag_values ({}) must be the same data type as {} ({})"
+            "".format(flag_values.dtype.type, name, variable.dtype.type),
+        )
+
+        if isinstance(flag_meanings, str):
+            flag_meanings = flag_meanings.split()
+            valid_values.assert_true(
+                len(flag_meanings) == len(flag_values),
+                "{}'s flag_meanings and flag_values should have the same number ".format(
+                    name
+                )
+                + "of elements.",
+            )
+
+        return valid_values.to_result()
+
+    def _check_flag_masks(self, ds, name):
+        """
+        Check a variable's flag_masks attribute for compliance under CF
+
+        - flag_masks exists as an array
+        - flag_masks is the same dtype as the variable
+        - variable's dtype can support bit-field
+        - flag_masks is the same length as flag_meanings
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :param str name: Variable name
+        :rtype: compliance_checker.base.Result
+        """
+        variable = ds.variables[name]
+
+        flag_masks = variable.flag_masks
+        # flag_meanings lives on the variable, not the dataset
+        flag_meanings = getattr(variable, "flag_meanings", None)
+
+        valid_masks = TestCtx(BaseCheck.HIGH, self.section_titles["3.5"])
+
+        valid_masks.assert_true(
+            isinstance(flag_masks, np.ndarray),
+            "{}'s flag_masks must be an array of values not {}".format(
+                name, type(flag_masks).__name__
+            ),
+        )
+
+        if not isinstance(flag_masks, np.ndarray):
+            return valid_masks.to_result()
+
+        valid_masks.assert_true(
+            variable.dtype.type == flag_masks.dtype.type,
+            "flag_masks ({}) must be the same data type as {} ({})"
+            "".format(flag_masks.dtype.type, name, variable.dtype.type),
+        )
+
+        type_ok = (
+            np.issubdtype(variable.dtype, np.integer)
+            or np.issubdtype(variable.dtype, "S")
+            or np.issubdtype(variable.dtype, "b")
+        )
+
+        valid_masks.assert_true(
+            type_ok,
+            "{}'s data type must be capable of bit-field expression".format(name),
+        )
+
+        if isinstance(flag_meanings, str):
+            flag_meanings = flag_meanings.split()
+            valid_masks.assert_true(
+                len(flag_meanings) == len(flag_masks),
+                "{} flag_meanings and flag_masks should have the same number ".format(
+                    name
+                )
+                + "of elements.",
+            )
+
+        return valid_masks.to_result()
+
+    def _check_flag_meanings(self, ds, name):
+        """
+        Check a variable's flag_meanings attribute for compliance under CF
+
+        - flag_meanings exists
+        - flag_meanings is a string
+        - flag_meanings elements are valid strings
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :param str name: Variable name
+        :rtype: compliance_checker.base.Result
+        """
+        variable = ds.variables[name]
+        flag_meanings = getattr(variable, "flag_meanings", None)
+        valid_meanings = TestCtx(BaseCheck.HIGH, self.section_titles["3.5"])
+
+        valid_meanings.assert_true(
+            flag_meanings is not None,
+            "{}'s flag_meanings attribute is required for flag variables".format(name),
+        )
+
+        valid_meanings.assert_true(
+            isinstance(flag_meanings, str),
+            "{}'s flag_meanings attribute must be a string".format(name),
+        )
+
+        # We can't perform any additional checks if it's not a string
+        if not isinstance(flag_meanings, str):
+            return valid_meanings.to_result()
+
+        valid_meanings.assert_true(
+            len(flag_meanings) > 0, "{}'s flag_meanings can't be empty".format(name)
+        )
+
+        flag_regx = regex.compile(r"^[0-9A-Za-z_\-.+@]+$")
+        meanings = flag_meanings.split()
+        for meaning in meanings:
+            if flag_regx.match(meaning) is None:
+                valid_meanings.assert_true(
+                    False,
+                    "{}'s flag_meanings attribute defined an illegal flag meaning ".format(
+                        name
+                    )
+                    + "{}".format(meaning),
+                )
+        return valid_meanings.to_result()
+
+    ###############################################################################
+    # Chapter 4: Coordinate Types
+    ###############################################################################
+
+    def check_coordinate_types(self, ds):
+        """
+        Check the axis attribute of coordinate variables
+
+        CF §4 The attribute axis may be attached to a coordinate variable and
+        given one of the values X, Y, Z or T which stand for a longitude,
+        latitude, vertical, or time axis respectively. Alternatively the
+        standard_name attribute may be used for direct identification.
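A minimal end-to-end sketch of a coordinate variable that would satisfy `_check_axis`; the file is created in-memory and is purely illustrative:

```python
import netCDF4

with netCDF4.Dataset("axis_demo.nc", "w", diskless=True) as nc:
    nc.createDimension("depth", 3)
    depth = nc.createVariable("depth", "f4", ("depth",))
    depth.units = "m"
    depth.positive = "down"
    depth.axis = "Z"  # a non-empty string, one of T/X/Y/Z
    depth[:] = [0.0, 10.0, 20.0]
```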
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: list
+        :return: List of results
+        """
+        ret_val = []
+
+        for variable in ds.get_variables_by_attributes(axis=lambda x: x is not None):
+            name = variable.name
+            # Coordinate compressions should not be checked as a valid
+            # coordinate, which they are not. They are a mechanism to project
+            # an array of indices onto a 2-d grid containing valid coordinates.
+            if cfutil.is_compression_coordinate(ds, name):
+                continue
+
+            variable = ds.variables[name]
+            # Even though it's not allowed in CF 1.6, it is allowed in CF 1.7
+            # and we see people do it, often.
+            if hasattr(variable, "cf_role"):
+                continue
+
+            # §6.1 allows for labels to be referenced as auxiliary coordinate
+            # variables, which should not be checked like the rest of the
+            # coordinates.
+            if hasattr(variable.dtype, "char") and variable.dtype.char == "S":
+                continue
+            elif variable.dtype == str:
+                continue
+
+            axis = getattr(variable, "axis", None)
+
+            if axis is not None:
+                valid_axis = self._check_axis(ds, name)
+                ret_val.append(valid_axis)
+
+        return ret_val
+
+    def _check_axis(self, ds, name):
+        """
+        Checks that the axis attribute is a string and an allowed value, namely
+        one of 'T', 'X', 'Y', or 'Z'.
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :param str name: Name of the variable
+        :rtype: compliance_checker.base.Result
+        """
+        allowed_axis = ["T", "X", "Y", "Z"]
+        variable = ds.variables[name]
+        axis = variable.axis
+
+        valid_axis = TestCtx(BaseCheck.HIGH, self.section_titles["4"])
+        # NB: a bare bool, not a one-element tuple, which is always truthy
+        axis_is_string = isinstance(axis, str)
+        valid_axis.assert_true(
+            axis_is_string and len(axis) > 0,
+            "{}'s axis attribute must be a non-empty string".format(name),
+        )
+
+        # If axis isn't a string we can't continue any checks
+        if not axis_is_string or len(axis) == 0:
+            return valid_axis.to_result()
+
+        valid_axis.assert_true(
+            axis in allowed_axis,
+            "{}'s axis attribute must be T, X, Y, or Z, ".format(name)
+            + "currently {}".format(axis),
+        )
+
+        return valid_axis.to_result()
+
+    def check_latitude(self, ds):
+        """
+        Check variable(s) that define latitude and are defined correctly according to CF.
+
+        CF §4.1 Variables representing latitude must always explicitly include
+        the units attribute; there is no default value. The recommended unit
+        of latitude is degrees_north. Also acceptable are degree_north,
+        degree_N, degrees_N, degreeN, and degreesN.
+
+        Optionally, the latitude type may be indicated additionally by
+        providing the standard_name attribute with the value latitude, and/or
+        the axis attribute with the value Y.
+ + - Four checks per latitude variable + - (H) latitude has units attribute + - (M) latitude has an allowed units attribute + - (L) latitude uses degrees_north (if not in rotated pole) + - (M) latitude defines either standard_name or axis + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + ret_val = [] + + allowed_lat_units = [ + "degrees_north", + "degree_north", + "degree_n", + "degrees_n", + "degreen", + "degreesn", + ] + + # Determine the grid mappings in this dataset + grid_mapping = [] + grid_mapping_variables = cfutil.get_grid_mapping_variables(ds) + for name in grid_mapping_variables: + variable = ds.variables[name] + grid_mapping_name = getattr(variable, "grid_mapping_name", None) + if grid_mapping_name: + grid_mapping.append(grid_mapping_name) + + latitude_variables = cfutil.get_latitude_variables(ds) + for latitude in latitude_variables: + variable = ds.variables[latitude] + units = getattr(variable, "units", None) + units_is_string = isinstance(units, str) + standard_name = getattr(variable, "standard_name", None) + axis = getattr(variable, "axis", None) + + # Check that latitude defines units + valid_latitude = TestCtx(BaseCheck.HIGH, self.section_titles["4.1"]) + valid_latitude.assert_true( + units is not None, + "latitude variable '{}' must define units".format(latitude), + ) + ret_val.append(valid_latitude.to_result()) + + # Check that latitude uses allowed units + allowed_units = TestCtx(BaseCheck.MEDIUM, self.section_titles["4.1"]) + if standard_name == "grid_latitude": + e_n_units = cfutil.VALID_LAT_UNITS | cfutil.VALID_LON_UNITS + # check that the units aren't in east and north degrees units, + # but are convertible to angular units + allowed_units.assert_true( + units not in e_n_units and Unit(units) == Unit("degree"), + "Grid latitude variable '{}' should use degree equivalent units without east or north components. " + "Current units are {}".format(latitude, units), + ) + else: + allowed_units.assert_true( + units_is_string and units.lower() in allowed_lat_units, + "latitude variable '{}' should define valid units for latitude" + "".format(latitude), + ) + ret_val.append(allowed_units.to_result()) + + # Check that latitude uses degrees_north + if standard_name == "latitude" and units != "degrees_north": + # This is only a recommendation and we won't penalize but we + # will include a recommended action. + msg = ( + "CF recommends latitude variable '{}' to use units degrees_north" + "".format(latitude) + ) + recommended_units = Result( + BaseCheck.LOW, (1, 1), self.section_titles["4.1"], [msg] + ) + ret_val.append(recommended_units) + + y_variables = ds.get_variables_by_attributes(axis="Y") + # Check that latitude defines either standard_name or axis + definition = TestCtx(BaseCheck.MEDIUM, self.section_titles["4.1"]) + definition.assert_true( + standard_name == "latitude" or axis == "Y" or y_variables != [], + "latitude variable '{}' should define standard_name='latitude' or axis='Y'" + "".format(latitude), + ) + ret_val.append(definition.to_result()) + + return ret_val + + def check_longitude(self, ds): + """ + Check variable(s) that define longitude and are defined correctly according to CF. + + CF §4.2 Variables representing longitude must always explicitly include + the units attribute; there is no default value. The recommended unit + of longitude is degrees_east. Also acceptable are degree_east, + degree_E, degrees_E, degreeE, and degreesE. 
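The allowed spellings reduce to a simple case-insensitive membership test; a sketch for latitude (longitude is analogous with the east variants):

```python
allowed_lat_units = {"degrees_north", "degree_north", "degree_n",
                     "degrees_n", "degreen", "degreesn"}

print("Degrees_North".lower() in allowed_lat_units)  # True
print("degrees" in allowed_lat_units)  # False; only valid for grid_latitude
```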
+ + Optionally, the longitude type may be indicated additionally by + providing the standard_name attribute with the value longitude, and/or + the axis attribute with the value X. + + - Four checks per longitude variable + - (H) longitude has units attribute + - (M) longitude has an allowed units attribute + - (L) longitude uses degrees_east (if not in rotated pole) + - (M) longitude defines either standard_name or axis + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + + # TODO we already have a check_latitude... I'm sure we can make DRYer + + ret_val = [] + allowed_lon_units = [ + "degrees_east", + "degree_east", + "degree_e", + "degrees_e", + "degreee", + "degreese", + ] + + # Determine the grid mappings in this dataset + grid_mapping = [] + grid_mapping_variables = cfutil.get_grid_mapping_variables(ds) + for name in grid_mapping_variables: + variable = ds.variables[name] + grid_mapping_name = getattr(variable, "grid_mapping_name", None) + if grid_mapping_name: + grid_mapping.append(grid_mapping_name) + + longitude_variables = cfutil.get_longitude_variables(ds) + for longitude in longitude_variables: + variable = ds.variables[longitude] + units = getattr(variable, "units", None) + units_is_string = isinstance(units, str) + standard_name = getattr(variable, "standard_name", None) + axis = getattr(variable, "axis", None) + + # NOTE see docstring--should below be 4.1 or 4.2? + # Check that longitude defines units + valid_longitude = TestCtx(BaseCheck.HIGH, self.section_titles["4.2"]) + valid_longitude.assert_true( + units is not None, + "longitude variable '{}' must define units".format(longitude), + ) + ret_val.append(valid_longitude.to_result()) + + # Check that longitude uses allowed units + allowed_units = TestCtx(BaseCheck.MEDIUM, self.section_titles["4.2"]) + if standard_name == "grid_longitude": + e_n_units = cfutil.VALID_LAT_UNITS | cfutil.VALID_LON_UNITS + # check that the units aren't in east and north degrees units, + # but are convertible to angular units + allowed_units.assert_true( + units not in e_n_units and Unit(units) == Unit("degree"), + "Grid longitude variable '{}' should use degree equivalent units without east or north components. " + "Current units are {}".format(longitude, units), + ) + else: + allowed_units.assert_true( + units_is_string and units.lower() in allowed_lon_units, + "longitude variable '{}' should define valid units for longitude" + "".format(longitude), + ) + ret_val.append(allowed_units.to_result()) + + # Check that longitude uses degrees_east + if standard_name == "longitude" and units != "degrees_east": + # This is only a recommendation and we won't penalize but we + # will include a recommended action. 
+ msg = (
+ "CF recommends longitude variable '{}' to use units degrees_east"
+ "".format(longitude)
+ )
+ recommended_units = Result(
+ BaseCheck.LOW, (1, 1), self.section_titles["4.2"], [msg]
+ )
+ ret_val.append(recommended_units)
+
+ x_variables = ds.get_variables_by_attributes(axis="X")
+ # Check that longitude defines either standard_name or axis
+ definition = TestCtx(BaseCheck.MEDIUM, self.section_titles["4.2"])
+ definition.assert_true(
+ standard_name == "longitude" or axis == "X" or x_variables != [],
+ "longitude variable '{}' should define standard_name='longitude' or axis='X'"
+ "".format(longitude),
+ )
+ ret_val.append(definition.to_result())
+
+ return ret_val
+
+ def check_dimensional_vertical_coordinate(
+ self, ds, dimless_vertical_coordinates=dimless_vertical_coordinates_1_6
+ ):
+ """
+ Check units for variables defining vertical position are valid under
+ CF.
+
+ CF §4.3.1 The units attribute for dimensional coordinates will be a string
+ formatted as per the udunits.dat file.
+
+ The acceptable units for vertical (depth or height) coordinate variables
+ are:
+ - units of pressure as listed in the file udunits.dat. For vertical axes
+ the most commonly used of these include bar, millibar,
+ decibar, atmosphere (atm), pascal (Pa), and hPa.
+ - units of length as listed in the file udunits.dat. For vertical axes
+ the most commonly used of these include meter (metre, m), and
+ kilometer (km).
+ - other units listed in the file udunits.dat that may under certain
+ circumstances reference vertical position such as units of density or
+ temperature.
+
+ Plural forms are also acceptable.
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: list
+ :return: List of results
+ """
+ ret_val = []
+ z_variables = cfutil.get_z_variables(ds)
+ # dimless_standard_names = [name for name, regx in dimless_vertical_coordinates]
+ for name in z_variables:
+ variable = ds.variables[name]
+ standard_name = getattr(variable, "standard_name", None)
+ units = getattr(variable, "units", None)
+ positive = getattr(variable, "positive", None)
+ # Skip the variable if it's dimensionless
+ if (
+ hasattr(variable, "formula_terms")
+ or standard_name in dimless_vertical_coordinates
+ ):
+ continue
+
+ valid_vertical_coord = TestCtx(BaseCheck.HIGH, self.section_titles["4.3"])
+ valid_vertical_coord.assert_true(
+ isinstance(units, str) and units,
+ "§4.3.1 {}'s units must be defined for vertical coordinates, "
+ "there is no default".format(name),
+ )
+
+ if not util.units_convertible("bar", units):
+ valid_vertical_coord.assert_true(
+ positive in ("up", "down"),
+ "{}: vertical coordinates not defining pressure must include "
+ "a positive attribute that is either 'up' or 'down'".format(name),
+ )
+
+ # _check_valid_standard_units, part of the Chapter 3 checks,
+ # already verifies that this coordinate has valid units
+
+ ret_val.append(valid_vertical_coord.to_result())
+
+ return ret_val
+
+ def _check_dimensionless_vertical_coordinate_1_6(
+ self, ds, vname, deprecated_units, ret_val, dim_vert_coords_dict
+ ):
+ """
+ Check that a dimensionless vertical coordinate variable is valid under
+ CF-1.6.
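+
+ As a hedged illustration (variable names are hypothetical, not from
+ the checker), a dimensionless vertical coordinate typically carries a
+ standard_name from Appendix D plus a formula_terms mapping:
+
+ lev = ds.createVariable("lev", "f8", ("lev",))
+ lev.standard_name = "atmosphere_sigma_coordinate"
+ lev.formula_terms = "sigma: lev ps: PS ptop: PTOP"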
+
+ :param netCDF4.Dataset ds: open netCDF4 dataset
+ :param str vname: name of the variable to check
+ :param list ret_val: array to append Results to
+ :rtype: None
+ """
+ variable = ds.variables[vname]
+ standard_name = getattr(variable, "standard_name", None)
+ units = getattr(variable, "units", None)
+ formula_terms = getattr(variable, "formula_terms", None)
+ # Skip the variable if it's dimensional
+ if formula_terms is None and standard_name not in dim_vert_coords_dict:
+ return
+
+ is_not_deprecated = TestCtx(BaseCheck.LOW, self.section_titles["4.3"])
+
+ is_not_deprecated.assert_true(
+ units not in deprecated_units,
+ "§4.3.2: units are deprecated by CF in variable {}: {}"
+ "".format(vname, units),
+ )
+
+ # check the vertical coordinates
+ ret_val.append(is_not_deprecated.to_result())
+ ret_val.append(self._check_formula_terms(ds, vname, dim_vert_coords_dict))
+
+ def check_dimensionless_vertical_coordinates(self, ds):
+ """
+ Check the validity of dimensionless coordinates under CF
+
+ CF §4.3.2 The units attribute is not required for dimensionless
+ coordinates.
+
+ The standard_name attribute associates a coordinate with its definition
+ from Appendix D, Dimensionless Vertical Coordinates. The definition
+ provides a mapping between the dimensionless coordinate values and
+ dimensional values that can positively and uniquely indicate the
+ location of the data.
+
+ A new attribute, formula_terms, is used to associate terms in the
+ definitions with variables in a netCDF file. To maintain backwards
+ compatibility with COARDS the use of these attributes is not required,
+ but is strongly recommended.
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: list
+ :return: List of results
+ """
+ ret_val = []
+
+ deprecated_units = ["level", "layer", "sigma_level"]
+
+ ret_val.extend(
+ self._check_dimensionless_vertical_coordinates(
+ ds,
+ deprecated_units,
+ self._check_dimensionless_vertical_coordinate_1_6,
+ dimless_vertical_coordinates_1_6,
+ )
+ )
+
+ return ret_val
+
+ def check_time_coordinate(self, ds):
+ """
+ Check variables defining time are valid under CF
+
+ CF §4.4 Variables representing time must always explicitly include the
+ units attribute; there is no default value.
+
+ The units attribute takes a string value formatted as per the
+ recommendations in the Udunits package.
+
+ The acceptable units for time are listed in the udunits.dat file. The
+ most commonly used of these strings (and their abbreviations) include
+ day (d), hour (hr, h), minute (min) and second (sec, s). Plural forms
+ are also acceptable. The reference time string (appearing after the
+ identifier since) may include date alone; date and time; or date, time,
+ and time zone. The reference time is required. A reference time in year
+ 0 has a special meaning (see Section 7.4, "Climatological Statistics").
+
+ Recommend that the unit year be used with caution. It is not a calendar
+ year. For similar reasons the unit month should also be used with
+ caution.
+
+ A time coordinate is identifiable from its units string alone.
+ Optionally, the time coordinate may be indicated additionally by
+ providing the standard_name attribute with an appropriate value, and/or
+ the axis attribute with the value T.
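+
+ For example (a sketch; the variable name is hypothetical), a
+ conforming time coordinate could be declared as:
+
+ time = ds.createVariable("time", "f8", ("time",))
+ time.units = "seconds since 1992-10-8 15:15:42.5 -6:00"
+ time.standard_name = "time"
+ time.axis = "T"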
+ + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + + ret_val = [] + for name in cfutil.get_time_variables(ds): + variable = ds.variables[name] + # Has units + has_units = hasattr(variable, "units") + if not has_units: + result = Result( + BaseCheck.HIGH, + False, + self.section_titles["4.4"], + ["%s does not have units" % name], + ) + ret_val.append(result) + continue + # Correct and identifiable units + result = Result(BaseCheck.HIGH, True, self.section_titles["4.4"]) + ret_val.append(result) + correct_units = util.units_temporal(variable.units) + reasoning = None + if not correct_units: + reasoning = ["%s does not have correct time units" % name] + result = Result( + BaseCheck.HIGH, correct_units, self.section_titles["4.4"], reasoning + ) + ret_val.append(result) + + return ret_val + + def check_calendar(self, ds): + """ + Check the calendar attribute for variables defining time and ensure it + is a valid calendar prescribed by CF. + + CF §4.4.1 In order to calculate a new date and time given a base date, base + time and a time increment one must know what calendar to use. + + The values currently defined for calendar are: + - gregorian or standard + - proleptic_gregorian + - noleap or 365_day + - all_leap or 366_day + - 360_day + - julian + - none + + The calendar attribute may be set to none in climate experiments that + simulate a fixed time of year. + The time of year is indicated by the date in the reference time of the + units attribute. + + If none of the calendars defined above applies, a non-standard calendar + can be defined. The lengths of each month are explicitly defined with + the month_lengths attribute of the time axis. + + If leap years are included, then two other attributes of the time axis + should also be defined: + + leap_year, leap_month + + The calendar attribute is not required when a non-standard calendar is + being used. It is sufficient to define the calendar using the + month_lengths attribute, along with leap_year, and leap_month as + appropriate. However, the calendar attribute is allowed to take + non-standard values and in that case defining the non-standard calendar + using the appropriate attributes is required. 
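+
+ As an illustrative sketch (values are hypothetical), a non-standard
+ calendar could be declared through its month lengths rather than a
+ calendar name:
+
+ time.month_lengths = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+ # leap_year and leap_month would accompany month_lengths if leap
+ # years are included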
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: list
+ :return: List of results
+ """
+ valid_calendars = [
+ "gregorian",
+ "standard",
+ "proleptic_gregorian",
+ "noleap",
+ "365_day",
+ "all_leap",
+ "366_day",
+ "360_day",
+ "julian",
+ "none",
+ ]
+
+ ret_val = []
+
+ # if the variable has a calendar attribute, check that it is within
+ # the valid values; variables without a calendar attribute are skipped
+ for time_var in ds.get_variables_by_attributes(
+ calendar=lambda c: c is not None
+ ):
+ reasoning = None
+ valid_calendar = time_var.calendar in valid_calendars
+
+ if not valid_calendar:
+ reasoning = [
+ "§4.4.1 Variable %s should have a valid calendar: '%s' is not a valid calendar"
+ % (time_var.name, time_var.calendar)
+ ]
+
+ # passes if the calendar is valid, otherwise notify of invalid
+ # calendar
+
+ result = Result(
+ BaseCheck.LOW, valid_calendar, self.section_titles["4.4"], reasoning
+ )
+ ret_val.append(result)
+
+ return ret_val
+
+ ###############################################################################
+ # Chapter 5: Coordinate Systems
+ ###############################################################################
+
+ def check_aux_coordinates(self, ds):
+ """
+ Chapter 5 paragraph 3
+
+ The dimensions of an auxiliary coordinate variable must be a subset of
+ the dimensions of the variable with which the coordinate is associated,
+ with two exceptions. First, string-valued coordinates (Section 6.1,
+ "Labels") have a dimension for maximum string length. Second, in the
+ ragged array representations of data (Chapter 9, Discrete Sampling
+ Geometries), special methods are needed to connect the data and
+ coordinates.
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: list
+ :return: List of results
+ """
+
+ ret_val = []
+
+ # for contiguous ragged array/indexed ragged array representations,
+ # coordinates are not required to adhere to the same principles;
+ # these representations can be identified by two attributes:
+
+ # required for contiguous
+ count_vars = ds.get_variables_by_attributes(
+ sample_dimension=lambda x: x is not None
+ )
+
+ # required for indexed
+ index_vars = ds.get_variables_by_attributes(
+ instance_dimension=lambda x: x is not None
+ )
+
+ # if these attributes exist, we don't need to test
+ # the coordinates
+ if count_vars or index_vars:
+ return ret_val
+
+ geophysical_variables = self._find_geophysical_vars(ds)
+ for name in geophysical_variables:
+ variable = ds.variables[name]
+ coordinates = getattr(variable, "coordinates", None)
+ # We use a set so we can do set-wise comparisons
+ dim_set = set(variable.dimensions)
+ # No auxiliary coordinates, no check
+ if not isinstance(coordinates, str) or coordinates == "":
+ continue
+
+ valid_aux_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5"])
+
+ for aux_coord in coordinates.split():
+ valid_aux_coords.assert_true(
+ aux_coord in ds.variables,
+ "{}'s auxiliary coordinate specified by the coordinates attribute, {}, "
+ "is not a variable in this dataset"
+ "".format(name, aux_coord),
+ )
+ if aux_coord not in ds.variables:
+ continue
+
+ # §6.1 Allows for "labels" to be referenced as coordinates
+ if (
+ hasattr(ds.variables[aux_coord].dtype, "char")
+ and ds.variables[aux_coord].dtype.char == "S"
+ ):
+ continue
+ elif ds.variables[aux_coord].dtype == str:
+ continue
+
+ aux_coord_dims = set(ds.variables[aux_coord].dimensions)
+ valid_aux_coords.assert_true(
+ aux_coord_dims.issubset(dim_set),
+ "dimensions for auxiliary coordinate variable {} ({}) "
+ "are not a subset of dimensions for variable {} 
({})" + "".format( + aux_coord, ", ".join(aux_coord_dims), name, ", ".join(dim_set) + ), + ) + ret_val.append(valid_aux_coords.to_result()) + return ret_val + + def check_duplicate_axis(self, ds): + """ + Checks that no variable contains two coordinates defining the same + axis. + + Chapter 5 paragraph 6 + + If an axis attribute is attached to an auxiliary coordinate variable, + it can be used by applications in the same way the `axis` attribute + attached to a coordinate variable is used. However, it is not + permissible for a [geophysical variable] to have both a coordinate + variable and an auxiliary coordinate variable, or more than one of + either type of variable, having an `axis` attribute with any given + value e.g. there must be no more than one axis attribute for X for any + [geophysical variable]. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: compliance_checker.base.Result + :return: List of results + """ + + ret_val = [] + geophysical_variables = self._find_geophysical_vars(ds) + for name in geophysical_variables: + no_duplicates = TestCtx(BaseCheck.HIGH, self.section_titles["5"]) + axis_map = cfutil.get_axis_map(ds, name) + # For every coordinate associated with this variable, keep track of + # which coordinates define an axis and assert that there are no + # duplicate axis attributes defined in the set of associated + # coordinates. axis_map includes coordinates that don't actually have + # an axis attribute, so we need to ignore those here. + for axis, coords in axis_map.items(): + coords = [c for c in coords if hasattr(ds.variables[c], "axis")] + no_duplicates.assert_true( + len(coords) <= 1, + "'{}' has duplicate axis {} defined by [{}]".format( + name, axis, ", ".join(sorted(coords)) + ), + ) + + ret_val.append(no_duplicates.to_result()) + + return ret_val + + def check_multi_dimensional_coords(self, ds): + """ + Checks that no multidimensional coordinate shares a name with its + dimensions. + + Chapter 5 paragraph 4 + + We recommend that the name of a [multidimensional coordinate] should + not match the name of any of its dimensions. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + ret_val = [] + + # This can only apply to auxiliary coordinate variables + for coord in self._find_aux_coord_vars(ds): + variable = ds.variables[coord] + if variable.ndim < 2: + continue + not_matching = TestCtx(BaseCheck.MEDIUM, self.section_titles["5"]) + + not_matching.assert_true( + coord not in variable.dimensions, + "{} shares the same name as one of its dimensions" "".format(coord), + ) + ret_val.append(not_matching.to_result()) + + return ret_val + + # NOTE ********** + # IS THIS EVEN NEEDED ANYMORE? + # *************** + def check_grid_coordinates(self, ds): + # def _check_grid_coordinates(self, ds): + """ + 5.6 When the coordinate variables for a horizontal grid are not + longitude and latitude, it is required that the true latitude and + longitude coordinates be supplied via the coordinates attribute. 
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: list
+ :return: List of results
+ """
+ ret_val = []
+ latitudes = cfutil.get_true_latitude_variables(ds)
+ longitudes = cfutil.get_true_longitude_variables(ds)
+
+ check_features = [
+ "2d-regular-grid",
+ "2d-static-grid",
+ "3d-regular-grid",
+ "3d-static-grid",
+ "mapped-grid",
+ "reduced-grid",
+ ]
+
+ # This one is tricky because there's a very subtle difference between
+ # latitude as defined in Chapter 4 and "true" latitude as defined in
+ # chapter 5.
+
+ # For each geophysical variable that defines a grid, assert it is
+ # associated with a true latitude or longitude coordinate.
+
+ for variable in self._find_geophysical_vars(ds):
+ # We use a set so we can do set-wise comparisons with coordinate
+ # dimensions
+ dimensions = set(ds.variables[variable].dimensions)
+ # If it's not a grid, skip it
+ if cfutil.guess_feature_type(ds, variable) not in check_features:
+ continue
+ has_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5.6"])
+
+ # axis_map is a defaultdict(list) mapping the axis to a list of
+ # coordinate names. For example:
+ # {'X': ['lon'], 'Y':['lat'], 'Z':['lev']}
+ # The mapping comes from the dimensions of the variable and the
+ # contents of the `coordinates` attribute only.
+ axis_map = cfutil.get_axis_map(ds, variable)
+
+ msg = (
+ '{}\'s coordinate variable "{}" is not one of the variables identifying true '
+ + "latitude/longitude and its dimensions are not a subset of {}'s dimensions"
+ )
+
+ alt = (
+ "{} has no coordinate associated with a variable identified as true latitude/longitude; "
+ + "its coordinate variable should also share a subset of {}'s dimensions"
+ )
+
+ # Make sure we can find latitude and its dimensions are a subset
+ _lat = None
+ found_lat = False
+ for lat in axis_map["Y"]:
+ _lat = lat
+ is_subset_dims = set(ds.variables[lat].dimensions).issubset(dimensions)
+
+ if is_subset_dims and lat in latitudes:
+ found_lat = True
+ break
+ if _lat:
+ has_coords.assert_true(found_lat, msg.format(variable, _lat, variable))
+ else:
+ has_coords.assert_true(found_lat, alt.format(variable, variable))
+
+ # Make sure we can find longitude and its dimensions are a subset
+ _lon = None
+ found_lon = False
+ for lon in axis_map["X"]:
+ _lon = lon
+ is_subset_dims = set(ds.variables[lon].dimensions).issubset(dimensions)
+
+ if is_subset_dims and lon in longitudes:
+ found_lon = True
+ break
+ if _lon:
+ has_coords.assert_true(found_lon, msg.format(variable, _lon, variable))
+ else:
+ has_coords.assert_true(found_lon, alt.format(variable, variable))
+
+ ret_val.append(has_coords.to_result())
+ return ret_val
+
+ def check_reduced_horizontal_grid(self, ds):
+ """
+ 5.3 A "reduced" longitude-latitude grid is one in which the points are
+ arranged along constant latitude lines with the number of points on a
+ latitude line decreasing toward the poles.
+
+ Recommend that this type of gridded data be stored using the compression
+ scheme described in Section 8.2, "Compression by Gathering". The
+ compressed latitude and longitude auxiliary coordinate variables are
+ identified by the coordinates attribute.
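+
+ A minimal sketch of the gathering scheme (names are hypothetical):
+
+ rgrid = ds.createVariable("rgrid", "i4", ("rgrid",))
+ rgrid.compress = "lat lon"  # dimensions that were gathered
+ pco2 = ds.createVariable("PCO2", "f4", ("time", "rgrid"))
+ pco2.coordinates = "latitude longitude"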
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: list
+ :return: List of results
+ """
+ ret_val = []
+ # Gather the latitude and longitude coordinate variables so we can
+ # verify that compressed features are associated with them
+ lats = set(cfutil.get_latitude_variables(ds))
+ lons = set(cfutil.get_longitude_variables(ds))
+
+ for name in self._find_geophysical_vars(ds):
+ coords = getattr(ds.variables[name], "coordinates", None)
+ axis_map = cfutil.get_axis_map(ds, name)
+ # If this variable has no coordinate that defines compression
+ if "C" not in axis_map:
+ continue
+
+ valid_rgrid = TestCtx(BaseCheck.HIGH, self.section_titles["5.3"])
+ # Make sure reduced grid features define coordinates
+ valid_rgrid.assert_true(
+ isinstance(coords, str) and coords,
+ "reduced grid feature {} must define coordinates attribute"
+ "".format(name),
+ )
+ # We can't check anything else if there are no defined coordinates
+ if not (isinstance(coords, str) and coords):
+ continue
+
+ coord_set = set(coords.split())
+
+ # Make sure it's associated with valid lat and valid lon
+ valid_rgrid.assert_true(
+ len(coord_set.intersection(lons)) > 0,
+ "{} must be associated with a valid longitude coordinate".format(name),
+ )
+ valid_rgrid.assert_true(
+ len(coord_set.intersection(lats)) > 0,
+ "{} must be associated with a valid latitude coordinate".format(name),
+ )
+ valid_rgrid.assert_true(
+ len(axis_map["C"]) == 1,
+ "{} cannot be associated with more than one compressed coordinate: "
+ "({})".format(name, ", ".join(axis_map["C"])),
+ )
+
+ for compressed_coord in axis_map["C"]:
+ coord = ds.variables[compressed_coord]
+ compress = getattr(coord, "compress", None)
+ valid_rgrid.assert_true(
+ isinstance(compress, str) and compress,
+ "compress attribute for compression coordinate {} must be a non-empty string"
+ "".format(compressed_coord),
+ )
+ if not isinstance(compress, str):
+ continue
+ for dim in compress.split():
+ valid_rgrid.assert_true(
+ dim in ds.dimensions,
+ "dimension {} referenced by {}:compress must exist"
+ "".format(dim, compressed_coord),
+ )
+ ret_val.append(valid_rgrid.to_result())
+
+ return ret_val
+
+ def _check_grid_mapping_attr_condition(self, attr, attr_name):
+ """
+ Evaluate a condition (or series of conditions) for a particular
+ attribute. Implementation for CF-1.6.
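+
+ Hypothetical usage sketch:
+
+ ok, msg = self._check_grid_mapping_attr_condition(30.0, "standard_parallel")
+ # ok -> True; msg -> "standard_parallel must satisfy (-90 <= x <= 90)"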
+
+ :param attr: attribute to test condition for
+ :param str attr_name: name of the attribute
+ :rtype: tuple
+ :return: two-tuple of (bool, str)
+ """
+
+ if attr_name == "latitude_of_projection_origin":
+ return self._evaluate_latitude_of_projection_origin(attr)
+
+ elif attr_name == "longitude_of_projection_origin":
+ return self._evaluate_longitude_of_projection_origin(attr)
+
+ elif attr_name == "longitude_of_central_meridian":
+ return self._evaluate_longitude_of_central_meridian(attr)
+
+ elif attr_name == "longitude_of_prime_meridian":
+ return self._evaluate_longitude_of_prime_meridian(attr)
+
+ elif attr_name == "scale_factor_at_central_meridian":
+ return self._evaluate_scale_factor_at_central_meridian(attr)
+
+ elif attr_name == "scale_factor_at_projection_origin":
+ return self._evaluate_scale_factor_at_projection_origin(attr)
+
+ elif attr_name == "standard_parallel":
+ return self._evaluate_standard_parallel(attr)
+
+ elif attr_name == "straight_vertical_longitude_from_pole":
+ return self._evaluate_straight_vertical_longitude_from_pole(attr)
+
+ else:
+ raise NotImplementedError(
+ "Evaluation for {} not yet implemented".format(attr_name)
+ )
+
+ def _evaluate_latitude_of_projection_origin(self, val):
+ """
+ Evaluate the condition for `latitude_of_projection_origin` attribute.
+ Return result. Value must be -90 <= x <= 90.
+
+ :param val: value to be tested
+ :rtype: tuple
+ :return: two-tuple (bool, msg)
+ """
+
+ return (
+ (val >= -90.0) and (val <= 90.0),
+ "latitude_of_projection_origin must satisfy (-90 <= x <= 90)",
+ )
+
+ def _evaluate_longitude_of_projection_origin(self, val):
+ """
+ Evaluate the condition for `longitude_of_projection_origin` attribute.
+ Return result.
+
+ :param val: value to be tested
+ :rtype: tuple
+ :return: two-tuple (bool, msg)
+ """
+
+ return (
+ (val >= -180.0) and (val <= 180.0),
+ "longitude_of_projection_origin must satisfy (-180 <= x <= 180)",
+ )
+
+ def _evaluate_longitude_of_central_meridian(self, val):
+ """
+ Evaluate the condition for `longitude_of_central_meridian` attribute.
+ Return result.
+
+ :param val: value to be tested
+ :rtype: tuple
+ :return: two-tuple (bool, msg)
+ """
+
+ return (
+ (val >= -180.0) and (val <= 180.0),
+ "longitude_of_central_meridian must satisfy (-180 <= x <= 180)",
+ )
+
+ def _evaluate_longitude_of_prime_meridian(self, val):
+ """
+ Evaluate the condition for `longitude_of_prime_meridian` attribute.
+ Return result.
+
+ :param val: value to be tested
+ :rtype: tuple
+ :return: two-tuple (bool, msg)
+ """
+
+ return (
+ (val >= -180.0) and (val <= 180.0),
+ "longitude_of_prime_meridian must satisfy (-180 <= x <= 180)",
+ )
+
+ def _evaluate_scale_factor_at_central_meridian(self, val):
+ """
+ Evaluate the condition for `scale_factor_at_central_meridian` attribute.
+ Return result.
+
+ :param val: value to be tested
+ :rtype: tuple
+ :return: two-tuple (bool, msg)
+ """
+
+ return (val > 0.0, "scale_factor_at_central_meridian must be > 0.0")
+
+ def _evaluate_scale_factor_at_projection_origin(self, val):
+ """
+ Evaluate the condition for `scale_factor_at_projection_origin` attribute.
+ Return result.
+
+ :param val: value to be tested
+ :rtype: tuple
+ :return: two-tuple (bool, msg)
+ """
+
+ return (val > 0.0, "scale_factor_at_projection_origin must be > 0.0")
+
+ def _evaluate_standard_parallel(self, val):
+ """
+ Evaluate the condition for `standard_parallel` attribute. Return result.
+ + :param val: value to be tested + :rtype tuple + :return two-tuple (bool, msg) + """ + + return ( + (val >= -90.0) and (val <= 90), + "standard_parallel must satisfy (-90 <= x <= 90)", + ) + + def _evaluate_straight_vertical_longitude_from_pole(self, val): + """ + Evaluate the condition for `straight_vertical_longitude_from_pole` + attribute. Return result. + + :param val: value to be tested + :rtype tuple + :return two-tuple (bool, msg) + """ + + return ( + (val >= -180.0) and (val <= 180), + "straight_vertical_longitude_from_pole must satisfy (-180 <= x <= 180)", + ) + + ############################################################################### + # Chapter 6: Labels and Alternative Coordinates + ############################################################################### + + def check_geographic_region(self, ds): + """ + 6.1.1 When data is representative of geographic regions which can be identified by names but which have complex + boundaries that cannot practically be specified using longitude and latitude boundary coordinates, a labeled + axis should be used to identify the regions. + + Recommend that the names be chosen from the list of standardized region names whenever possible. To indicate + that the label values are standardized the variable that contains the labels must be given the standard_name + attribute with the value region. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + ret_val = [] + region_list = ( + [ # TODO maybe move this (and other info like it) into a config file? + "africa", + "antarctica", + "arabian_sea", + "aral_sea", + "arctic_ocean", + "asia", + "atlantic_ocean", + "australia", + "baltic_sea", + "barents_opening", + "barents_sea", + "beaufort_sea", + "bellingshausen_sea", + "bering_sea", + "bering_strait", + "black_sea", + "canadian_archipelago", + "caribbean_sea", + "caspian_sea", + "central_america", + "chukchi_sea", + "contiguous_united_states", + "denmark_strait", + "drake_passage", + "east_china_sea", + "english_channel", + "eurasia", + "europe", + "faroe_scotland_channel", + "florida_bahamas_strait", + "fram_strait", + "global", + "global_land", + "global_ocean", + "great_lakes", + "greenland", + "gulf_of_alaska", + "gulf_of_mexico", + "hudson_bay", + "iceland_faroe_channel", + "indian_ocean", + "indonesian_throughflow", + "indo_pacific_ocean", + "irish_sea", + "lake_baykal", + "lake_chad", + "lake_malawi", + "lake_tanganyika", + "lake_victoria", + "mediterranean_sea", + "mozambique_channel", + "north_america", + "north_sea", + "norwegian_sea", + "pacific_equatorial_undercurrent", + "pacific_ocean", + "persian_gulf", + "red_sea", + "ross_sea", + "sea_of_japan", + "sea_of_okhotsk", + "south_america", + "south_china_sea", + "southern_ocean", + "taiwan_luzon_straits", + "weddell_sea", + "windward_passage", + "yellow_sea", + ] + ) + + for var in ds.get_variables_by_attributes(standard_name="region"): + valid_region = TestCtx(BaseCheck.MEDIUM, self.section_titles["6.1"]) + region = var[:] + if np.ma.isMA(region): + region = region.data + valid_region.assert_true( + "".join(region.astype(str)).lower() in region_list, + "6.1.1 '{}' specified by '{}' is not a valid region".format( + "".join(region.astype(str)), var.name + ), + ) + ret_val.append(valid_region.to_result()) + return ret_val + + ############################################################################### + # Chapter 7: Data Representative of Cells + ############################################################################### 
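+
+ # Illustrative sketch for the cell-bounds checks below (names are
+ # hypothetical): a 1-D coordinate lat(lat) paired with a boundary
+ # variable carrying one extra, trailing dimension satisfies §7.1:
+ #
+ # lat = ds.createVariable("lat", "f8", ("lat",))
+ # lat.bounds = "lat_bnds"
+ # lat_bnds = ds.createVariable("lat_bnds", "f8", ("lat", "nv"))  # nv size 2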
+
+ def check_cell_boundaries(self, ds):
+ """
+ Checks the dimensions of cell boundary variables to ensure they are CF compliant.
+
+ 7.1 To represent cells we add the attribute bounds to the appropriate coordinate variable(s). The value of bounds
+ is the name of the variable that contains the vertices of the cell boundaries. We refer to this type of variable as
+ a "boundary variable." A boundary variable will have one more dimension than its associated coordinate or auxiliary
+ coordinate variable. The additional dimension should be the most rapidly varying one, and its size is the maximum
+ number of cell vertices.
+
+ Applications that process cell boundary data often need to determine whether or not adjacent cells share an
+ edge. In order to facilitate this type of processing the following restrictions are placed on the data in boundary
+ variables:
+
+ Bounds for 1-D coordinate variables
+
+ For a coordinate variable such as lat(lat) with associated boundary variable latbnd(x,2), the interval endpoints
+ must be ordered consistently with the associated coordinate, e.g., for an increasing coordinate, lat(1) > lat(0)
+ implies latbnd(i,1) >= latbnd(i,0) for all i
+
+ If adjacent intervals are contiguous, the shared endpoint must be represented identically in each instance where
+ it occurs in the boundary variable. For example, if the intervals that contain grid points lat(i) and lat(i+1) are
+ contiguous, then latbnd(i+1,0) = latbnd(i,1).
+
+ Bounds for 2-D coordinate variables with 4-sided cells
+
+ In the case where the horizontal grid is described by two-dimensional auxiliary coordinate variables in latitude
+ lat(n,m) and longitude lon(n,m), and the associated cells are four-sided, then the boundary variables are given
+ in the form latbnd(n,m,4) and lonbnd(n,m,4), where the trailing index runs over the four vertices of the cells.
+
+ Bounds for multi-dimensional coordinate variables with p-sided cells
+
+ In all other cases, the bounds should be dimensioned (...,n,p), where (...,n) are the dimensions of the auxiliary
+ coordinate variables, and p the number of vertices of the cells. The vertices must be traversed anticlockwise in the
+ lon-lat plane as viewed from above. The starting vertex is not specified.
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: list
+ :return: List of results
+ """
+
+ # Note that this test does not check monotonicity
+ ret_val = []
+ reasoning = []
+ for variable_name, boundary_variable_name in cfutil.get_cell_boundary_map(
+ ds
+ ).items():
+ variable = ds.variables[variable_name]
+ valid = True
+ reasoning = []
+ if boundary_variable_name not in ds.variables:
+ valid = False
+ reasoning.append(
+ "Boundary variable {} referenced by {} not ".format(
+ boundary_variable_name, variable.name
+ )
+ + "found in dataset variables"
+ )
+ else:
+ boundary_variable = ds.variables[boundary_variable_name]
+ # The number of dimensions in the bounds variable should always be
+ # the number of dimensions in the referring variable + 1
+ if boundary_variable.ndim < 2:
+ valid = False
+ reasoning.append(
+ "Boundary variable {} specified by {}".format(
+ boundary_variable.name, variable.name
+ )
+ + " should have at least two dimensions to enclose the base "
+ + "case of a one-dimensional variable"
+ )
+ if boundary_variable.ndim != variable.ndim + 1:
+ valid = False
+ reasoning.append(
+ "The number of dimensions of the variable %s is %s, but the "
+ "number of dimensions of the boundary variable %s is %s. 
The boundary variable " + "should have %s dimensions" + % ( + variable.name, + variable.ndim, + boundary_variable.name, + boundary_variable.ndim, + variable.ndim + 1, + ) + ) + if variable.dimensions[:] != boundary_variable.dimensions[: variable.ndim]: + valid = False + reasoning.append( + "Boundary variable coordinates (for {}) are in improper order: {}. Bounds-specific dimensions should be last" + "".format(variable.name, boundary_variable.dimensions) + ) + + # ensure p vertices form a valid simplex given previous a...n + # previous auxiliary coordinates + if ( + ds.dimensions[boundary_variable.dimensions[-1]].size + < len(boundary_variable.dimensions[:-1]) + 1 + ): + valid = False + reasoning.append( + "Dimension {} of boundary variable (for {}) must have at least {} elements to form a simplex/closed cell with previous dimensions {}.".format( + boundary_variable.name, + variable.name, + len(variable.dimensions) + 1, + boundary_variable.dimensions[:-1], + ) + ) + result = Result( + BaseCheck.MEDIUM, valid, self.section_titles["7.1"], reasoning + ) + ret_val.append(result) + return ret_val + + def check_cell_measures(self, ds): + """ + 7.2 To indicate extra information about the spatial properties of a + variable's grid cells, a cell_measures attribute may be defined for a + variable. This is a string attribute comprising a list of + blank-separated pairs of words of the form "measure: name". "area" and + "volume" are the only defined measures. + + The "name" is the name of the variable containing the measure values, + which we refer to as a "measure variable". The dimensions of the + measure variable should be the same as or a subset of the dimensions of + the variable to which they are related, but their order is not + restricted. + + The variable must have a units attribute and may have other attributes + such as a standard_name. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + ret_val = [] + reasoning = [] + variables = ds.get_variables_by_attributes( + cell_measures=lambda c: c is not None + ) + for var in variables: + search_str = r"^(?:area|volume): (\w+)$" + search_res = regex.search(search_str, var.cell_measures) + if not search_res: + valid = False + reasoning.append( + "The cell_measures attribute for variable {} " + "is formatted incorrectly. 
It should take the"
+ " form of either 'area: cell_var' or "
+ "'volume: cell_var' where cell_var is the "
+ "variable describing the cell measures".format(var.name)
+ )
+ else:
+ valid = True
+ cell_meas_var_name = search_res.groups()[0]
+ # TODO: cache previous results
+ if cell_meas_var_name not in ds.variables:
+ valid = False
+ reasoning.append(
+ "Cell measure variable {} referred to by "
+ "{} is not present in dataset variables".format(
+ cell_meas_var_name, var.name
+ )
+ )
+ else:
+ cell_meas_var = ds.variables[cell_meas_var_name]
+ if not hasattr(cell_meas_var, "units"):
+ valid = False
+ reasoning.append(
+ "Cell measure variable {} is required "
+ "to have units attribute defined.".format(
+ cell_meas_var_name
+ )
+ )
+ if not set(cell_meas_var.dimensions).issubset(var.dimensions):
+ valid = False
+ reasoning.append(
+ "Cell measure variable {} must have "
+ "dimensions which are a subset of "
+ "those defined in variable {}.".format(
+ cell_meas_var_name, var.name
+ )
+ )
+
+ result = Result(
+ BaseCheck.MEDIUM, valid, (self.section_titles["7.2"]), reasoning
+ )
+ ret_val.append(result)
+
+ return ret_val
+
+ def check_cell_methods(self, ds):
+ """
+ 7.3 To describe the characteristic of a field that is represented by cell values, we define the cell_methods attribute
+ of the variable. This is a string attribute comprising a list of blank-separated words of the form "name: method". Each
+ "name: method" pair indicates that for an axis identified by name, the cell values representing the field have been
+ determined or derived by the specified method.
+
+ name can be a dimension of the variable, a scalar coordinate variable, a valid standard name, or the word "area"
+
+ values of method should be selected from the list in Appendix E, Cell Methods, which includes point, sum, mean, maximum,
+ minimum, mid_range, standard_deviation, variance, mode, and median. Case is not significant in the method name. Some
+ methods (e.g., variance) imply a change of units of the variable, as is indicated in Appendix E, Cell Methods.
+
+ Because the default interpretation for an intensive quantity differs from that of an extensive quantity and because this
+ distinction may not be understood by some users of the data, it is recommended that every data variable include for each
+ of its dimensions and each of its scalar coordinate variables the cell_methods information of interest (unless this
+ information would not be meaningful). It is especially recommended that cell_methods be explicitly specified for each
+ spatio-temporal dimension and each spatio-temporal scalar coordinate variable.
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: list
+ :return: List of results
+ """
+
+ ret_val = []
+ psep = regex.compile(
+ r"(?P<vars>\w+: )+(?P<method>\w+) ?(?P<where>where (?P<wtypevar>\w+) "
+ r"?(?P<over>over (?P<otypevar>\w+))?| ?)(?:\((?P<paren_contents>[^)]*)\))?"
+ )
+
+ for var in ds.get_variables_by_attributes(cell_methods=lambda x: x is not None):
+ if not getattr(var, "cell_methods", ""):
+ continue
+
+ method = getattr(var, "cell_methods", "")
+
+ valid_attribute = TestCtx(
+ BaseCheck.HIGH, self.section_titles["7.3"]
+ ) # changed from 7.1 to 7.3
+ valid_attribute.assert_true(
+ regex.match(psep, method) is not None,
+ '"{}" is not a valid format for cell_methods attribute of "{}"'
+ "".format(method, var.name),
+ )
+ ret_val.append(valid_attribute.to_result())
+
+ valid_cell_names = TestCtx(BaseCheck.MEDIUM, self.section_titles["7.3"])
+
+ # check that the name is valid
+ for match in regex.finditer(psep, method):
+ # it is possible to have "var1: var2: ... varn: ...", so handle
+ # that case
+ for var_raw_str in match.captures("vars"):
+ # strip off the ': ' at the end of each match
+ var_str = var_raw_str[:-2]
+ if (
+ var_str in var.dimensions
+ or var_str == "area"
+ or var_str in getattr(var, "coordinates", "")
+ ):
+
+ valid = True
+ else:
+ valid = False
+
+ valid_cell_names.assert_true(
+ valid,
+ "{}'s cell_methods name component {} does not match a dimension, "
+ "area or auxiliary coordinate".format(var.name, var_str),
+ )
+
+ ret_val.append(valid_cell_names.to_result())
+
+ # Checks if the method value of the 'name: method' pair is acceptable
+ valid_cell_methods = TestCtx(BaseCheck.MEDIUM, self.section_titles["7.3"])
+
+ for match in regex.finditer(psep, method):
+ # CF section 7.3 - "Case is not significant in the method name."
+ valid_cell_methods.assert_true(
+ match.group("method").lower() in self.cell_methods,
+ "{}:cell_methods contains an invalid method: {}"
+ "".format(var.name, match.group("method")),
+ )
+
+ ret_val.append(valid_cell_methods.to_result())
+
+ for match in regex.finditer(psep, method):
+ if match.group("paren_contents") is not None:
+ # split along spaces followed by words with a colon
+ # not sure what to do if a comment contains a colon!
+ ret_val.append(
+ self._check_cell_methods_paren_info(
+ match.group("paren_contents"), var
+ ).to_result()
+ )
+
+ return ret_val
+
+ def _check_cell_methods_paren_info(self, paren_contents, var):
+ """
+ Checks that the spacing and/or comment info contained inside the
+ parentheses in cell_methods is well-formed
+ """
+ valid_info = TestCtx(BaseCheck.MEDIUM, self.section_titles["7.3"])
+ # if there are no colons, this is a simple comment
+ # TODO: are empty comments considered valid?
+ if ":" not in paren_contents:
+ valid_info.out_of += 1
+ valid_info.score += 1
+ return valid_info
+ # otherwise, split into k/v pairs
+ kv_pair_pat = r"(\S+:)\s+(.*(?=\s+\w+:)|[^:]+$)\s*"
+ # otherwise, we must split further with intervals coming
+ # first, followed by non-standard comments
+ # we need the count of the matches, and re.findall() only returns
+ # groups if they are present and we wish to see if the entire match
+ # object concatenated together is the same as the original string
+ pmatches = [m for m in regex.finditer(kv_pair_pat, paren_contents)]
+ for i, pmatch in enumerate(pmatches):
+ keyword, val = pmatch.groups()
+ if keyword == "interval:":
+ valid_info.out_of += 2
+ interval_matches = regex.match(
+ r"^\s*(?P<interval_number>\S+)\s+(?P<interval_units>\S+)\s*$", val
+ )
+ # attempt to get the number for the interval
+ if not interval_matches:
+ valid_info.messages.append(
+ '§7.3.3 {}:cell_methods contains an interval specification that does not parse: "{}". 
Should be in format "interval: <number> <units>"'.format(
+ var.name, val
+ )
+ )
+ else:
+ try:
+ float(interval_matches.group("interval_number"))
+ except ValueError:
+ valid_info.messages.append(
+ '§7.3.3 {}:cell_methods contains an interval value that does not parse as a numeric value: "{}".'.format(
+ var.name, interval_matches.group("interval_number")
+ )
+ )
+ else:
+ valid_info.score += 1
+
+ # then the units
+ try:
+ Unit(interval_matches.group("interval_units"))
+ except ValueError:
+ valid_info.messages.append(
+ '§7.3.3 {}:cell_methods interval units "{}" is not parsable by UDUNITS.'.format(
+ var.name, interval_matches.group("interval_units")
+ )
+ )
+ else:
+ valid_info.score += 1
+ elif keyword == "comment:":
+ # comments can't really be invalid, except
+ # if they come first or aren't last, and
+ # maybe if they contain colons embedded in the
+ # comment string
+ valid_info.out_of += 1
+ if len(pmatches) == 1:
+ valid_info.messages.append(
+ "§7.3.3 If there is no standardized information, the keyword comment: should be omitted for variable {}".format(
+ var.name
+ )
+ )
+ # otherwise check that the comment is the last
+ # item in the parentheses
+ elif i != len(pmatches) - 1:
+ valid_info.messages.append(
+ '§7.3.3 The non-standard "comment:" element must come after any standard elements in cell_methods for variable {}'.format(
+ var.name
+ )
+ )
+ #
+ else:
+ valid_info.score += 1
+ else:
+ valid_info.out_of += 1
+ valid_info.messages.append(
+ '§7.3.3 Invalid cell_methods keyword "{}" for variable {}. Must be one of [interval, comment]'.format(
+ keyword, var.name
+ )
+ )
+
+ # Ensure concatenated reconstructed matches are the same as the
+ # original string. If they're not, there's likely a formatting error
+ valid_info.assert_true(
+ "".join(m.group(0) for m in pmatches) == paren_contents,
+ "§7.3.3 Parenthetical content inside {}:cell_methods is not well formed: {}".format(
+ var.name, paren_contents
+ ),
+ )
+
+ return valid_info
+
+ def check_climatological_statistics(self, ds):
+ """
+ 7.4 A climatological time coordinate variable does not have a bounds attribute. Instead, it has a climatology
+ attribute, which names a variable with dimensions (n,2), n being the dimension of the climatological time axis.
+ Using the units and calendar of the time coordinate variable, element (i,0) of the climatology variable specifies
+ the beginning of the first subinterval and element (i,1) the end of the last subinterval used to evaluate the
+ climatological statistics with index i in the time dimension. The time coordinates should be values that are
+ representative of the climatological time intervals, such that an application which does not recognise climatological
+ time will nonetheless be able to make a reasonable interpretation.
+
+ A climatological axis may use different statistical methods to measure variation among years, within years, and within
+ days. The methods which can be specified are those listed in Appendix E, Cell Methods and each entry in the cell_methods
+ attribute may also contain non-standardised information in parentheses after the method. 
The value of the cell_methods
+ attribute must be in one of the following forms:
+ - time: method1 within years time: method2 over years
+ - time: method1 within days time: method2 over days
+ - time: method1 within days time: method2 over days time: method3 over years
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: list
+ :return: List of results
+ """
+
+ reasoning = []
+ ret_val = []
+ total_climate_count = 0
+ valid_climate_count = 0
+ all_clim_coord_var_names = []
+
+ methods = [
+ "point", # TODO change to appendix import once cf1.7 merged
+ "sum",
+ "mean",
+ "maximum",
+ "minimum",
+ "mid_range",
+ "standard_deviation",
+ "variance",
+ "mode",
+ "median",
+ ]
+
+ # find any climatology axis variables; any variables which contain climatological stats will use
+ # these variables as coordinates
+ clim_time_coord_vars = ds.get_variables_by_attributes(
+ climatology=lambda s: s is not None
+ )
+
+ # first, to determine whether or not we have a valid climatological time
+ # coordinate variable, we need to make sure it has the attribute "climatology",
+ # but not the attribute "bounds"
+ for clim_coord_var in clim_time_coord_vars:
+ if hasattr(clim_coord_var, "bounds"):
+ reasoning.append(
+ "Variable {} has a climatology attribute and cannot also have a bounds attribute.".format(
+ clim_coord_var.name
+ )
+ )
+ result = Result(
+ BaseCheck.MEDIUM, False, (self.section_titles["7.4"]), reasoning
+ )
+ ret_val.append(result)
+
+ # make sure the climatology variable referenced actually exists
+ elif clim_coord_var.climatology not in ds.variables:
+ reasoning.append(
+ "Climatology variable {} referenced by {} does not exist".format(
+ clim_coord_var.climatology, clim_coord_var.name
+ )
+ )
+ result = Result(
+ BaseCheck.MEDIUM, False, (self.section_titles["7.4"]), reasoning
+ )
+ ret_val.append(result)
+
+ # check that coordinate bounds are in the proper order.
+ # make sure last elements are boundary variable specific dimensions
+ elif (
+ clim_coord_var.dimensions[:]
+ != ds.variables[clim_coord_var.climatology].dimensions[
+ : clim_coord_var.ndim
+ ]
+ ):
+ total_climate_count += 1
+ reasoning.append(
+ "Climatology variable coordinates are in improper order: {}. Bounds-specific dimensions should be last".format(
+ ds.variables[clim_coord_var.climatology].dimensions
+ )
+ )
+ result = Result(
+ BaseCheck.MEDIUM,
+ (valid_climate_count, total_climate_count),
+ (self.section_titles["7.4"]),
+ reasoning,
+ )
+ ret_val.append(result)
+
+ elif (ds.dimensions[
+ ds.variables[clim_coord_var.climatology].dimensions[-1]
+ ].size != 2):
+ reasoning.append(
+ "Climatology dimension \"{}\" should only contain two elements".format(
+ ds.variables[clim_coord_var.climatology].name
+ )
+ )
+ total_climate_count += 1
+ result = Result(
+ BaseCheck.MEDIUM,
+ (valid_climate_count, total_climate_count),
+ (self.section_titles["7.4"]),
+ reasoning,
+ )
+ ret_val.append(result)
+
+ # passed all these checks, so we can add this clim_coord_var to our total list
+ all_clim_coord_var_names.append(clim_coord_var.name)
+
+ # for any variables which use a climatology time coordinate variable as a coordinate,
+ # if they have a cell_methods attribute, it must comply with the form:
+ # time: method1 within years time: method2 over years
+ # time: method1 within days time: method2 over days
+ # time: method1 within days time: method2 over days time: method3 over years
+ # optionally followed by parentheses for explaining additional
+ # info, e.g. 
+ # "time: method1 within years time: method2 over years (sidereal years)" + + meth_regex = "(?:{})".format( + "|".join(methods) + ) # "or" comparison for the methods + re_string = ( + r"^time: {0} within (years|days)" # regex string to test + r" time: {0} over \1(?<=days)(?: time: {0} over years)?" + r"(?: \([^)]+\))?$".format(meth_regex) + ) + + # find any variables with a valid climatological cell_methods + for cell_method_var in ds.get_variables_by_attributes( + cell_methods=lambda s: s is not None + ): + if any( + [dim in all_clim_coord_var_names for dim in cell_method_var.dimensions] + ): + total_climate_count += 1 + if not regex.search(re_string, cell_method_var.cell_methods): + reasoning.append( + 'The "time: method within years/days over years/days" format is not correct in variable {}.'.format( + cell_method_var.name + ) + ) + else: + valid_climate_count += 1 + + result = Result( + BaseCheck.MEDIUM, + (valid_climate_count, total_climate_count), + (self.section_titles["7.4"]), + reasoning, + ) + ret_val.append(result) + + return ret_val + + ############################################################################### + # Chapter 8: Reduction of Dataset Size + ############################################################################### + + def check_packed_data(self, ds): + """ + 8.1 Simple packing may be achieved through the use of the optional NUG defined attributes scale_factor and + add_offset. After the data values of a variable have been read, they are to be multiplied by the scale_factor, + and have add_offset added to them. + + The units of a variable should be representative of the unpacked data. + + If the scale_factor and add_offset attributes are of the same data type as the associated variable, the unpacked + data is assumed to be of the same data type as the packed data. However, if the scale_factor and add_offset + attributes are of a different data type from the variable (containing the packed data) then the unpacked data + should match the type of these attributes, which must both be of type float or both be of type double. An additional + restriction in this case is that the variable containing the packed data must be of type byte, short or int. It is + not advised to unpack an int into a float as there is a potential precision loss. + + When data to be packed contains missing values the attributes that indicate missing values (_FillValue, valid_min, + valid_max, valid_range) must be of the same data type as the packed data. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + ret_val = [] + for name, var in ds.variables.items(): + + add_offset = getattr(var, "add_offset", None) + scale_factor = getattr(var, "scale_factor", None) + if not (add_offset or scale_factor): + continue + + valid = True + reasoning = [] + + # if only one of these attributes is defined, assume they + # are the same type (value doesn't matter here) + if not add_offset: + add_offset = scale_factor + if not scale_factor: + scale_factor = add_offset + + if type(add_offset) != type(scale_factor): + valid = False + reasoning.append( + "Attributes add_offset and scale_factor have different data type." + ) + elif type(scale_factor) != var.dtype.type: + # Check both attributes are type float or double + if not isinstance(scale_factor, (float, np.floating)): + valid = False + reasoning.append( + "Attributes add_offset and scale_factor are not of type float or double." 
+ )
+ else:
+ # Check variable type is byte, short or int
+ if var.dtype.type not in [
+ np.int8,
+ np.int16,
+ np.int32,
+ np.int64,
+ ]:
+ valid = False
+ reasoning.append("Variable is not of type byte, short, or int.")
+
+ result = Result(
+ BaseCheck.MEDIUM, valid, self.section_titles["8.1"], reasoning
+ )
+ ret_val.append(result)
+ reasoning = []
+
+ valid = True
+ # test further with _FillValue , valid_min , valid_max , valid_range
+ if hasattr(var, "_FillValue"):
+ if var._FillValue.dtype.type != var.dtype.type:
+ valid = False
+ reasoning.append(
+ "Type of %s:_FillValue attribute (%s) does not match variable type (%s)"
+ % (name, var._FillValue.dtype.name, var.dtype.name)
+ )
+ if hasattr(var, "valid_min"):
+ if var.valid_min.dtype.type != var.dtype.type:
+ valid = False
+ reasoning.append(
+ "Type of %s:valid_min attribute (%s) does not match variable type (%s)"
+ % (name, var.valid_min.dtype.name, var.dtype.name)
+ )
+ if hasattr(var, "valid_max"):
+ if var.valid_max.dtype.type != var.dtype.type:
+ valid = False
+ reasoning.append(
+ "Type of %s:valid_max attribute (%s) does not match variable type (%s)"
+ % (name, var.valid_max.dtype.name, var.dtype.name)
+ )
+ if hasattr(var, "valid_range"):
+ if var.valid_range.dtype.type != var.dtype.type:
+ valid = False
+ reasoning.append(
+ "Type of %s:valid_range attribute (%s) does not match variable type (%s)"
+ % (name, var.valid_range.dtype.name, var.dtype.name)
+ )
+
+ result = Result(
+ BaseCheck.MEDIUM, valid, self.section_titles["8.1"], reasoning
+ )
+ ret_val.append(result)
+
+ return ret_val
+
+ def check_compression_gathering(self, ds):
+ """
+ At the current time the netCDF interface does not provide for packing
+ data. However a simple packing may be achieved through the use of the
+ optional NUG defined attributes scale_factor and add_offset . After the
+ data values of a variable have been read, they are to be multiplied by
+ the scale_factor , and have add_offset added to them. If both
+ attributes are present, the data are scaled before the offset is added.
+ When scaled data are written, the application should first subtract the
+ offset and then divide by the scale factor. The units of a variable
+ should be representative of the unpacked data.
+
+ This standard is more restrictive than the NUG with respect to the use
+ of the scale_factor and add_offset attributes; ambiguities and
+ precision problems related to data type conversions are resolved by
+ these restrictions. If the scale_factor and add_offset attributes are
+ of the same data type as the associated variable, the unpacked data is
+ assumed to be of the same data type as the packed data. However, if the
+ scale_factor and add_offset attributes are of a different data type
+ from the variable (containing the packed data) then the unpacked data
+ should match the type of these attributes, which must both be of type
+ float or both be of type double . An additional restriction in this
+ case is that the variable containing the packed data must be of type
+ byte , short or int . It is not advised to unpack an int into a float
+ as there is a potential precision loss.
+
+ When data to be packed contains missing values the attributes that
+ indicate missing values ( _FillValue , valid_min , valid_max ,
+ valid_range ) must be of the same data type as
+ the packed data. 
See Section 2.5.1, “Missing Data” for a discussion of
+ how applications should treat variables that have attributes indicating
+ both missing values and transformations defined by a scale and/or
+ offset.
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: list
+ :return: List of results
+ """
+ ret_val = []
+ for compress_var in ds.get_variables_by_attributes(
+ compress=lambda s: s is not None
+ ):
+ valid = True
+ reasoning = []
+ # put the dimensions referenced by the compress attribute into a set
+ compress_set = set(compress_var.compress.split(" "))
+ if compress_var.ndim != 1:
+ valid = False
+ reasoning.append(
+ "Compression variable {} may only have one dimension".format(
+ compress_var.name
+ )
+ )
+ # ensure compression variable is a proper index, and thus is a
+ # signed or unsigned integer type of some sort
+ if (compress_var.dtype is str) or (
+ compress_var.dtype.kind not in {"i", "u"}
+ ):
+ valid = False
+ reasoning.append(
+ "Compression variable {} must be an integer type to form a proper array index".format(
+ compress_var.name
+ )
+ )
+ # make sure all the dimensions referenced by the compress
+ # attribute exist in the dataset's dimensions
+ if not compress_set.issubset(ds.dimensions):
+ not_in_dims = sorted(compress_set.difference(ds.dimensions))
+ valid = False
+ reasoning.append(
+ "The following dimensions referenced by the compress attribute of variable {} do not exist: {}".format(
+ compress_var.name, not_in_dims
+ )
+ )
+
+ result = Result(
+ BaseCheck.MEDIUM, valid, self.section_titles["8.2"], reasoning
+ )
+ ret_val.append(result)
+
+ return ret_val
+
+ ###############################################################################
+ # Chapter 9: Discrete Sampling Geometries
+ ###############################################################################
+
+ def check_feature_type(self, ds):
+ """
+ Check the global attribute featureType for valid CF featureTypes
+
+ 9.4 A global attribute, featureType, is required for all Discrete Geometry representations except the orthogonal
+ multidimensional array representation, for which it is highly recommended.
+
+ The value assigned to the featureType attribute is case-insensitive.
+
+ :param netCDF4.Dataset ds: An open netCDF dataset
+ :rtype: compliance_checker.base.Result
+ """
+ # Due to case insensitive requirement, we list the possible featuretypes
+ # in lower case and check using the .lower() method
+ feature_list = [
+ "point",
+ "timeseries",
+ "trajectory",
+ "profile",
+ "timeseriesprofile",
+ "trajectoryprofile",
+ ]
+
+ feature_type = getattr(ds, "featureType", None)
+ valid_feature_type = TestCtx(
+ BaseCheck.HIGH, "§9.1 Dataset contains a valid featureType"
+ )
+ valid_feature_type.assert_true(
+ feature_type is None or feature_type.lower() in feature_list,
+ "{} is not a valid CF featureType. It must be one of {}"
+ "".format(feature_type, ", ".join(feature_list)),
+ )
+ return valid_feature_type.to_result()
+
+ def check_cf_role(self, ds):
+ """
+ Check variables defining cf_role for legal cf_role values.
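+
+ For example (hypothetical variable names), a station identifier in a
+ timeseries dataset would declare:
+
+ station = ds.createVariable("station", str, ("station",))
+ station.cf_role = "timeseries_id"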
+ + §9.5 The only acceptable values of cf_role for Discrete Geometry CF + data sets are timeseries_id, profile_id, and trajectory_id + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: compliance_checker.base.Result + """ + valid_roles = ["timeseries_id", "profile_id", "trajectory_id"] + variable_count = 0 + valid_cf_role = TestCtx(BaseCheck.HIGH, self.section_titles["9.5"]) + for variable in ds.get_variables_by_attributes(cf_role=lambda x: x is not None): + variable_count += 1 + cf_role = variable.cf_role + valid_cf_role.assert_true( + cf_role in valid_roles, + "{} is not a valid cf_role value. It must be one of {}" + "".format(cf_role, ", ".join(valid_roles)), + ) + if variable_count > 0: + m = ( + "§9.5 The only acceptable values of cf_role for Discrete Geometry CF" + + " data sets are timeseries_id, profile_id, and trajectory_id" + ) + valid_cf_role.assert_true(variable_count < 3, m) + return valid_cf_role.to_result() + + def check_variable_features(self, ds): + """ + Checks the variable feature types match the dataset featureType attribute. + If more than one unique feature type is found, report this as an error. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + feature_types_found = defaultdict(list) + ret_val = [] + feature_list = { + "point", + "timeseries", + "trajectory", + "profile", + "timeseriesprofile", + "trajectoryprofile", + } + # Don't bother checking if it's not a legal featureType + # if the featureType attribute doesn't exist + feature_type = getattr(ds, "featureType", "") + if feature_type is not None and feature_type.lower() not in feature_list: + return [] + + _feature = feature_type.lower() + + for name in self._find_geophysical_vars(ds): + variable_feature = cfutil.guess_feature_type(ds, name) + # If we can't figure it out, don't check it. 
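+ # (Illustration with hypothetical names: a variable whose coordinates
+ # describe a fixed station sampled over time is typically guessed as
+ # "timeseries" here, and must then match the dataset-level featureType
+ # checked below.)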
+ if variable_feature is None: + continue + feature_types_found[variable_feature].append(name) + matching_feature = TestCtx(BaseCheck.MEDIUM, self.section_titles["9.1"]) + matching_feature.assert_true( + variable_feature.lower() == _feature, + "{} is not a {}, it is detected as a {}" + "".format(name, _feature, variable_feature), + ) + ret_val.append(matching_feature.to_result()) + + # create explanation of all of the different featureTypes + # found in the dataset + feature_description = ", ".join( + [ + "{} ({})".format(ftr, ", ".join(vrs)) + for ftr, vrs in feature_types_found.items() + ] + ) + all_same_features = TestCtx(BaseCheck.HIGH, self.section_titles["9.1"]) + all_same_features.assert_true( + len(feature_types_found) < 2, + "Different feature types discovered in this dataset: {}" + "".format(feature_description), + ) + ret_val.append(all_same_features.to_result()) + + return ret_val + + def check_hints(self, ds): + """ + Checks for potentially mislabeled metadata and makes suggestions for how to correct + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + ret_val = [] + + ret_val.extend(self._check_hint_bounds(ds)) + + return ret_val + + def _check_hint_bounds(self, ds): + """ + Checks for variables ending with _bounds, if they are not cell methods, + make the recommendation + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: List of results + """ + ret_val = [] + boundary_variables = cfutil.get_cell_boundary_variables(ds) + for name in ds.variables: + if name.endswith("_bounds") and name not in boundary_variables: + msg = ( + "{} might be a cell boundary variable but there are no variables that define it " + "as a boundary using the `bounds` attribute.".format(name) + ) + result = Result(BaseCheck.LOW, True, self.section_titles["7.1"], [msg]) + ret_val.append(result) + + return ret_val + diff --git a/compliance_checker/cf/cf_1_7.py b/compliance_checker/cf/cf_1_7.py new file mode 100644 index 00000000..486ff6d0 --- /dev/null +++ b/compliance_checker/cf/cf_1_7.py @@ -0,0 +1,875 @@ +import logging +import os +import sqlite3 +import sys + +from collections import OrderedDict, defaultdict +from functools import wraps +from warnings import warn + +import numpy as np +import pyproj +import regex + +from cf_units import Unit + +from compliance_checker import cfutil +from compliance_checker.base import BaseCheck, BaseNCCheck, Result, TestCtx +from compliance_checker.cf import util +from compliance_checker.cf.appendix_d import ( + dimless_vertical_coordinates_1_7, + no_missing_terms, +) +from compliance_checker.cf.appendix_e import cell_methods17 +from compliance_checker.cf.appendix_f import ( + ellipsoid_names17, + grid_mapping_attr_types17, + grid_mapping_dict17, + horizontal_datum_names17, + prime_meridian_names17, +) +from compliance_checker.cf.cf_base import appendix_a_base +from compliance_checker.cf.cf_1_6 import CF1_6Check + +logger = logging.getLogger(__name__) + +class CF1_7Check(CF1_6Check): + """Implementation for CF v1.7. 
Inherits from CF1_6Check as most of the
+    checks are the same."""
+
+    # things that are specific to 1.7
+    _cc_spec_version = "1.7"
+    _cc_url = "http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/cf-conventions.html"
+
+    appendix_a = appendix_a_base.copy()
+    appendix_a.update(
+        {
+            "actual_range": {
+                "Type": "N",
+                "attr_loc": {"D", "C"},
+                "cf_section": "2.5.1",
+            },
+            "comment": {
+                "Type": "S",
+                "attr_loc": {"G", "D", "C"},
+                "cf_section": "2.6.2",
+            },
+            "external_variables": {
+                "Type": "S",
+                "attr_loc": {"G"},
+                "cf_section": "2.6.3",
+            },
+            "scale_factor": {"Type": "N", "attr_loc": {"D", "C"}, "cf_section": "8.1"},
+        }
+    )
+
+    def __init__(self, options=None):
+        super(CF1_7Check, self).__init__(options)
+
+        self.cell_methods = cell_methods17
+        self.grid_mapping_dict = grid_mapping_dict17
+        self.grid_mapping_attr_types = grid_mapping_attr_types17
+
+    def check_actual_range(self, ds):
+        """Check the actual_range attribute of variables. As stated in
+        section 2.5.1 of version 1.7, this convention defines a two-element
+        vector attribute designed to describe the actual minimum and actual
+        maximum values of variables containing numeric data. Conditions:
+          - the first value of the two-element vector must be equal to the
+            minimum of the data, and the second element equal to the maximum
+          - if the data is packed, the elements of actual_range should have
+            the same data type as the *unpacked* data
+          - if valid_range is specified, both elements of actual_range should
+            be within valid_range
+
+        If a variable does not have an actual_range attribute, let it pass;
+        including this attribute is only suggested. However, if the user is
+        specifying the actual_range, the Result will be considered
+        high-priority."""
+
+        ret_val = []
+
+        for name, variable in ds.variables.items():
+            msgs = []
+            score = 0
+            out_of = 0
+
+            if not hasattr(variable, "actual_range"):
+                continue  # having this attr is only suggested, no Result needed
+            else:
+
+                out_of += 1
+                try:
+                    if (
+                        len(variable.actual_range) != 2
+                    ):  # TODO is the attr also a numpy array? if so, .size
+                        msgs.append(
+                            "actual_range of '{}' must be 2 elements".format(name)
+                        )
+                        ret_val.append(
+                            Result(  # putting result into list
+                                BaseCheck.HIGH,
+                                (score, out_of),
+                                self.section_titles["2.5"],
+                                msgs,
+                            )
+                        )
+                        continue  # no need to keep checking if already completely wrong
+                    else:
+                        score += 1
+                except TypeError:  # in case it's just a single number
+                    msgs.append("actual_range of '{}' must be 2 elements".format(name))
+                    ret_val.append(
+                        Result(  # putting result into list
+                            BaseCheck.HIGH,
+                            (score, out_of),
+                            self.section_titles["2.5"],
+                            msgs,
+                        )
+                    )
+                    continue
+
+                # check equality to existing min/max values
+                # NOTE this is a data check
+                # If every value is masked, a data check of actual_range isn't
+                # appropriate, so skip.
+                if not (hasattr(variable[:], "mask") and variable[:].mask.all()):
+                    # if min/max values aren't close to actual_range bounds,
+                    # fail.
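+                    # Hypothetical illustration (values invented): for data
+                    # [0.0, 2.5, 7.1] a passing attribute is
+                    # actual_range = [0.0, 7.1]; np.isclose is used below so
+                    # tiny floating-point differences do not fail the check.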
+                    out_of += 1
+                    if not np.isclose(
+                        variable.actual_range[0], variable[:].min()
+                    ) or not np.isclose(variable.actual_range[1], variable[:].max()):
+                        msgs.append(
+                            "actual_range elements of '{}' inconsistent with its min/max values".format(
+                                name
+                            )
+                        )
+                    else:
+                        score += 1
+
+                # check that the actual range is within the valid range
+                if hasattr(variable, "valid_range"):  # check within valid_range
+                    out_of += 1
+                    if (variable.actual_range[0] < variable.valid_range[0]) or (
+                        variable.actual_range[1] > variable.valid_range[1]
+                    ):
+                        msgs.append(
+                            '"{}"\'s actual_range must be within valid_range'.format(
+                                name
+                            )
+                        )
+                    else:
+                        score += 1
+
+                # check the elements of the actual range have the appropriate
+                # relationship to the valid_min and valid_max
+                if hasattr(variable, "valid_min"):
+                    out_of += 1
+                    if variable.actual_range[0] < variable.valid_min:
+                        msgs.append(
+                            '"{}"\'s actual_range first element must be >= valid_min ({})'.format(
+                                name, variable.valid_min
+                            )
+                        )
+                    else:
+                        score += 1
+                if hasattr(variable, "valid_max"):
+                    out_of += 1
+                    if variable.actual_range[1] > variable.valid_max:
+                        msgs.append(
+                            '"{}"\'s actual_range second element must be <= valid_max ({})'.format(
+                                name, variable.valid_max
+                            )
+                        )
+                    else:
+                        score += 1
+
+            ret_val.append(
+                Result(  # putting result into list
+                    BaseCheck.HIGH, (score, out_of), self.section_titles["2.5"], msgs
+                )
+            )
+        return ret_val
+
+    def check_cell_boundaries(self, ds):
+        """
+        Checks the dimensions of cell boundary variables to ensure they are CF compliant
+        per section 7.1.
+
+        This method extends the CF1_6Check method; please see the original method for the
+        complete doc string.
+
+        If any variable contains both a formula_terms attribute *and* a bounding variable,
+        that bounds variable must also have a formula_terms attribute.
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :returns list: List of results
+        """
+
+        # Note that this test does not check monotonicity
+        ret_val = []
+        reasoning = []
+        for variable_name, boundary_variable_name in cfutil.get_cell_boundary_map(
+            ds
+        ).items():
+            variable = ds.variables[variable_name]
+            valid = True
+            reasoning = []
+            if boundary_variable_name not in ds.variables:
+                valid = False
+                reasoning.append(
+                    "Boundary variable {} referenced by {} not ".format(
+                        boundary_variable_name, variable.name
+                    )
+                    + "found in dataset variables"
+                )
+            else:
+                boundary_variable = ds.variables[boundary_variable_name]
+            # The number of dimensions in the bounds variable should always be
+            # the number of dimensions in the referring variable + 1
+            if boundary_variable.ndim < 2:
+                valid = False
+                reasoning.append(
+                    "Boundary variable {} specified by {}".format(
+                        boundary_variable.name, variable.name
+                    )
+                    + " should have at least two dimensions to enclose the base "
+                    + "case of a one-dimensional variable"
+                )
+            if boundary_variable.ndim != variable.ndim + 1:
+                valid = False
+                reasoning.append(
+                    "The number of dimensions of the variable %s is %s, but the "
+                    "number of dimensions of the boundary variable %s is %s. The boundary variable "
+                    "should have %s dimensions"
+                    % (
+                        variable.name,
+                        variable.ndim,
+                        boundary_variable.name,
+                        boundary_variable.ndim,
+                        variable.ndim + 1,
+                    )
+                )
+            if variable.dimensions[:] != boundary_variable.dimensions[: variable.ndim]:
+                valid = False
+                reasoning.append(
+                    "Boundary variable coordinates (for {}) are in improper order: {}.
Bounds-specific dimensions should be last"
+                    "".format(variable.name, boundary_variable.dimensions)
+                )
+
+            # ensure the final (bounds-specific) dimension has enough vertices
+            # to form a closed cell/simplex given the preceding auxiliary
+            # coordinate dimensions
+            if (
+                ds.dimensions[boundary_variable.dimensions[-1]].size
+                < len(boundary_variable.dimensions[:-1]) + 1
+            ):
+                valid = False
+                reasoning.append(
+                    "The final dimension of boundary variable {} (for variable {}) must have at least {} elements to form a simplex/closed cell given the previous dimensions {}.".format(
+                        boundary_variable.name,
+                        variable.name,
+                        len(variable.dimensions) + 1,
+                        boundary_variable.dimensions[:-1],
+                    )
+                )
+
+            # check if formula_terms is present in the var; if so,
+            # the bounds variable must also have a formula_terms attr
+            if hasattr(variable, "formula_terms"):
+                if not hasattr(boundary_variable, "formula_terms"):
+                    valid = False
+                    reasoning.append(
+                        "'{}' has 'formula_terms' attr, bounds variable '{}' must also have 'formula_terms'".format(
+                            variable_name, boundary_variable_name
+                        )
+                    )
+
+            result = Result(
+                BaseCheck.MEDIUM, valid, self.section_titles["7.1"], reasoning
+            )
+            ret_val.append(result)
+        return ret_val
+
+    def check_cell_measures(self, ds):
+        """
+        A method to over-ride the CF1_6Check method. In CF 1.7, it is specified
+        that a variable referenced by cell_measures must be in the dataset OR
+        referenced by the global attribute "external_variables", which names
+        variables used in the dataset but not found in the dataset.
+
+        7.2 To indicate extra information about the spatial properties of a
+        variable's grid cells, a cell_measures attribute may be defined for a
+        variable. This is a string attribute comprising a list of
+        blank-separated pairs of words of the form "measure: name". "area" and
+        "volume" are the only defined measures.
+
+        The "name" is the name of the variable containing the measure values,
+        which we refer to as a "measure variable". The dimensions of the
+        measure variable should be the same as or a subset of the dimensions of
+        the variable to which they are related, but their order is not
+        restricted.
+
+        The variable must have a units attribute and may have other attributes
+        such as a standard_name.
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: list
+        :return: List of results
+        """
+        ret_val = []
+        reasoning = []
+        variables = ds.get_variables_by_attributes(
+            cell_measures=lambda c: c is not None
+        )
+        for var in variables:
+            search_str = r"^(?:area|volume): (\w+)$"
+            search_res = regex.search(search_str, var.cell_measures)
+            if not search_res:
+                valid = False
+                reasoning.append(
+                    "The cell_measures attribute for variable {} "
+                    "is formatted incorrectly.
It should take the"
+                    " form of either 'area: cell_var' or "
+                    "'volume: cell_var' where cell_var is the "
+                    "variable describing the cell measures".format(var.name)
+                )
+            else:
+                valid = True
+                cell_meas_var_name = search_res.groups()[0]
+                # TODO: cache previous results
+
+                # if the dataset has external_variables, get it
+                try:
+                    # external_variables is a blank-separated list of variable
+                    # names, so split it before testing membership
+                    external_variables = ds.getncattr("external_variables").split()
+                except AttributeError:
+                    external_variables = []
+                if cell_meas_var_name not in ds.variables:
+                    if cell_meas_var_name not in external_variables:
+                        valid = False
+                        reasoning.append(
+                            "Cell measure variable {} referred to by {} is not present in dataset variables".format(
+                                cell_meas_var_name, var.name
+                            )
+                        )
+                    else:
+                        valid = True
+
+                    # make Result
+                    result = Result(
+                        BaseCheck.MEDIUM, valid, (self.section_titles["7.2"]), reasoning
+                    )
+                    ret_val.append(result)
+                    continue  # can't test anything on an external var
+
+                else:
+                    cell_meas_var = ds.variables[cell_meas_var_name]
+                    if not hasattr(cell_meas_var, "units"):
+                        valid = False
+                        reasoning.append(
+                            "Cell measure variable {} is required "
+                            "to have units attribute defined.".format(
+                                cell_meas_var_name
+                            )
+                        )
+                    if not set(cell_meas_var.dimensions).issubset(var.dimensions):
+                        valid = False
+                        reasoning.append(
+                            "Cell measure variable {} must have "
+                            "dimensions which are a subset of "
+                            "those defined in variable {}.".format(
+                                cell_meas_var_name, var.name
+                            )
+                        )
+
+            result = Result(
+                BaseCheck.MEDIUM, valid, (self.section_titles["7.2"]), reasoning
+            )
+            ret_val.append(result)
+
+        return ret_val
+
+    def _check_grid_mapping_attr_condition(self, attr, attr_name):
+        """
+        Evaluate a condition (or series of conditions) for a particular
+        attribute. Implementation for CF-1.7.
+
+        :param attr: attribute to test condition for
+        :param str attr_name: name of the attribute
+        :rtype tuple
+        :return two-tuple of (bool, str)
+        """
+
+        if attr_name == "geographic_crs_name":
+            return self._evaluate_geographic_crs_name(attr)
+
+        elif attr_name == "geoid_name":
+            return self._evaluate_geoid_name(attr)
+
+        elif attr_name == "geopotential_datum_name":
+            return self._evaluate_geopotential_datum_name(attr)
+
+        elif attr_name == "horizontal_datum_name":
+            return self._evaluate_horizontal_datum_name(attr)
+
+        elif attr_name == "prime_meridian_name":
+            return self._evaluate_prime_meridian_name(attr)
+
+        elif attr_name == "projected_crs_name":
+            return self._evaluate_projected_crs_name(attr)
+
+        elif attr_name == "reference_ellipsoid_name":
+            return self._evaluate_reference_ellipsoid_name(attr)
+
+        elif attr_name == "towgs84":
+            return self._evaluate_towgs84(attr)
+
+        else:  # invoke method from 1.6, as these names are all still valid
+            return super(CF1_7Check, self)._check_grid_mapping_attr_condition(
+                attr, attr_name
+            )
+
+    def _check_gmattr_existence_condition_geoid_name_geoptl_datum_name(self, var):
+        """
+        Check to see if both geoid_name and geopotential_datum_name exist as attributes
+        for `var`. They should not.
+
+        :param netCDF4.Variable var
+        :rtype tuple
+        :return two-tuple (bool, str)
+        """
+
+        msg = "Both geoid_name and geopotential_datum_name cannot exist"
+
+        if ("geoid_name" in var.ncattrs()) and (
+            "geopotential_datum_name" in var.ncattrs()
+        ):
+            return (False, msg)
+
+        else:
+            return (True, msg)
+
+    def _check_gmattr_existence_condition_ell_pmerid_hdatum(self, var):
+        """
+        If any one of reference_ellipsoid_name, prime_meridian_name, or
+        horizontal_datum_name is defined as a grid_mapping attribute,
+        then all of them must be defined.
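+
+        For example (hypothetical attribute set): a grid mapping variable
+        carrying only reference_ellipsoid_name = "WGS 84", with no
+        prime_meridian_name or horizontal_datum_name, fails this condition.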
+
+        :param netCDF4.Variable var
+        :rtype tuple
+        :return two-tuple (bool, str)
+        """
+
+        msg = (
+            "If any of reference_ellipsoid_name, prime_meridian_name, "
+            "or horizontal_datum_name are defined, all must be defined."
+        )
+
+        _ncattrs = set(var.ncattrs())
+
+        if any(
+            [
+                x in _ncattrs
+                for x in [
+                    "reference_ellipsoid_name",
+                    "prime_meridian_name",
+                    "horizontal_datum_name",
+                ]
+            ]
+        ) and (
+            not set(
+                [
+                    "reference_ellipsoid_name",
+                    "prime_meridian_name",
+                    "horizontal_datum_name",
+                ]
+            ).issubset(_ncattrs)
+        ):
+            return (False, msg)
+
+        else:
+            return (True, msg)
+
+    def _get_projdb_conn(self):
+        """
+        Return a SQLite Connection to the PROJ database.
+
+        Returns:
+            sqlite3.Connection
+        """
+
+        proj_db_path = os.path.join(pyproj.datadir.get_data_dir(), "proj.db")
+        return sqlite3.connect(proj_db_path)
+
+    def _exec_query_str_with_params(self, qstr, argtuple):
+        """
+        Execute a query string in a database connection with the given argument
+        tuple. Return a result set.
+
+        :param str qstr: desired query to be executed
+        :param tuple argtuple: tuple of arguments to be supplied to query
+        :rtype set
+        """
+
+        conn = self._get_projdb_conn()
+        return conn.execute(qstr, argtuple)
+
+    def _evaluate_geographic_crs_name(self, val):
+        """
+        Evaluate the condition for the geographic_crs_name attribute.
+
+        :param val: value to be tested
+        :rtype tuple
+        :return two-tuple of (bool, str)
+        """
+
+        query_str = (
+            "SELECT 1 FROM geodetic_crs WHERE name = ? "
+            "UNION ALL "  # need union in case contained in other tables
+            "SELECT 1 FROM alias_name WHERE alt_name = ? "
+            "AND table_name = 'geodetic_crs' LIMIT 1"
+        )
+
+        # try to find the value in the database
+        res_set = self._exec_query_str_with_params(query_str, (val, val))
+
+        # does it exist? if so, at least one row will be returned
+        return (
+            len(res_set.fetchall()) > 0,
+            "geographic_crs_name must correspond to a valid OGC WKT GEOGCS name",
+        )
+
+    def _evaluate_geoid_name(self, val):
+        """
+        Evaluate the condition for the geoid_name attribute.
+
+        :param val: value to be tested
+        :rtype tuple
+        :return two-tuple of (bool, str)
+        """
+
+        query_str = (
+            "SELECT 1 FROM vertical_datum WHERE name = ? "
+            "UNION ALL "
+            "SELECT 1 FROM alias_name WHERE alt_name = ? "
+            "AND table_name = 'vertical_datum' LIMIT 1"
+        )
+
+        # try to find the value in the database
+        res_set = self._exec_query_str_with_params(query_str, (val, val))
+
+        return (
+            len(res_set.fetchall()) > 0,
+            "geoid_name must correspond to a valid OGC WKT VERT_DATUM name",
+        )
+
+    def _evaluate_geopotential_datum_name(self, val):
+        """
+        Evaluate the condition for the geopotential_datum_name attribute.
+
+        :param val: value to be tested
+        :rtype tuple
+        :return two-tuple of (bool, str)
+        """
+
+        query_str = (
+            "SELECT 1 FROM vertical_datum WHERE name = ? "
+            "UNION ALL "
+            "SELECT 1 FROM alias_name WHERE alt_name = ? "
+            "AND table_name = 'vertical_datum' LIMIT 1"
+        )
+
+        # try to find the value in the database
+        res_set = self._exec_query_str_with_params(query_str, (val, val))
+
+        return (
+            len(res_set.fetchall()) > 0,
+            "geopotential_datum_name must correspond to a valid OGC WKT VERT_DATUM name",
+        )
+
+    def _evaluate_horizontal_datum_name(self, val):
+        """
+        Evaluate the condition for the horizontal_datum_name attribute.
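+
+        For example (hypothetical value): a label not present in the
+        Appendix F table, such as "My Custom Datum", fails this condition;
+        the accepted names are exactly those in horizontal_datum_names17.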
+
+        :param val: value to be tested
+        :rtype tuple
+        :return two-tuple of (bool, str)
+        """
+
+        return (
+            val in horizontal_datum_names17,
+            (
+                "{} must be a valid Horizontal Datum Name; "
+                "see https://github.com/cf-convention/cf-conventions/wiki/Mapping-from-CF-Grid-Mapping-Attributes-to-CRS-WKT-Elements."
+            ).format(val),
+        )
+
+    def _evaluate_prime_meridian_name(self, val):
+        """
+        Evaluate the condition for the prime_meridian_name attribute.
+
+        :param val: value to be tested
+        :rtype tuple
+        :return two-tuple of (bool, str)
+        """
+
+        return (
+            val in prime_meridian_names17,
+            (
+                "{} must be a valid Prime Meridian name; "
+                "see https://github.com/cf-convention/cf-conventions/wiki/csv/prime_meridian.csv."
+            ).format(val),
+        )
+
+    def _evaluate_projected_crs_name(self, val):
+        """
+        Evaluate the condition for the projected_crs_name attribute.
+
+        :param val: value to be tested
+        :rtype tuple
+        :return two-tuple of (bool, str)
+        """
+
+        query_str = (
+            "SELECT 1 FROM projected_crs WHERE name = ? "
+            "UNION ALL "
+            "SELECT 1 FROM alias_name WHERE alt_name = ? "
+            "AND table_name = 'projected_crs' LIMIT 1"
+        )
+
+        # try to find the value in the database
+        res_set = self._exec_query_str_with_params(query_str, (val, val))
+
+        return (
+            len(res_set.fetchall()) > 0,
+            "projected_crs_name must correspond to a valid OGC WKT PROJCS name",
+        )
+
+    def _evaluate_reference_ellipsoid_name(self, val):
+        """
+        Evaluate the condition for the reference_ellipsoid_name attribute.
+
+        :param val: value to be tested
+        :rtype tuple
+        :return two-tuple of (bool, str)
+        """
+
+        return (
+            val in ellipsoid_names17,
+            (
+                "{} must be a valid Ellipsoid Name; "
+                "see https://github.com/cf-convention/cf-conventions/wiki/csv/ellipsoid.csv."
+            ).format(val),
+        )
+
+    def _evaluate_towgs84(self, val):
+        """
+        Evaluate the condition for the towgs84 attribute.
+
+        :param val: value to be tested
+        :rtype tuple
+        :return two-tuple of (bool, str)
+        """
+
+        msg = (
+            "towgs84 must be an array of length 3, 6, or 7 of double-precision"
+            " and correspond to an OGC WKT TOWGS84 node"
+        )
+
+        # if not a numpy type, return false
+        if not getattr(val, "dtype", None):
+            return (False, msg)
+
+        # must be double-precision array
+        elif val.dtype != np.float64:
+            return (False, msg)
+
+        # must be of length 3, 6, or 7
+        elif not val.shape:  # single value
+            return (False, msg)
+
+        elif not (val.size in (3, 6, 7)):
+            return (False, msg)
+
+        else:
+            return (True, msg)
+
+    def check_grid_mapping(self, ds):
+        __doc__ = super(CF1_7Check, self).check_grid_mapping.__doc__
+        prev_return = super(CF1_7Check, self).check_grid_mapping(ds)
+        grid_mapping_variables = cfutil.get_grid_mapping_variables(ds)
+        for var_name in sorted(grid_mapping_variables):
+            var = ds.variables[var_name]
+            test_ctx = self.get_test_ctx(
+                BaseCheck.HIGH, self.section_titles["5.6"], var.name
+            )
+
+            # TODO: check cases where crs_wkt provides part of a necessary
+            # grid_mapping attribute, or where a grid_mapping attribute
+            # overrides what has been provided in crs_wkt.
+            # attempt to parse crs_wkt if it is present
+            if "crs_wkt" in var.ncattrs():
+                crs_wkt = var.crs_wkt
+                if not isinstance(crs_wkt, str):
+                    test_ctx.messages.append("crs_wkt attribute must be a string")
+                    test_ctx.out_of += 1
+                else:
+                    try:
+                        pyproj.CRS.from_wkt(crs_wkt)
+                    except pyproj.exceptions.CRSError as crs_error:
+                        test_ctx.messages.append(
+                            "Cannot parse crs_wkt attribute to CRS using Proj4.
Proj4 error: {}".format( + str(crs_error) + ) + ) + else: + test_ctx.score += 1 + test_ctx.out_of += 1 + + # existence_conditions + exist_cond_1 = ( + self._check_gmattr_existence_condition_geoid_name_geoptl_datum_name(var) + ) + test_ctx.assert_true(exist_cond_1[0], exist_cond_1[1]) + exist_cond_2 = self._check_gmattr_existence_condition_ell_pmerid_hdatum(var) + test_ctx.assert_true(exist_cond_2[0], exist_cond_2[1]) + + # handle vertical datum related grid_mapping attributes + vert_datum_attrs = {} + possible_vert_datum_attrs = {"geoid_name", "geopotential_datum_name"} + vert_datum_attrs = possible_vert_datum_attrs.intersection(var.ncattrs()) + len_vdatum_name_attrs = len(vert_datum_attrs) + # check that geoid_name and geopotential_datum_name are not both + # present in the grid_mapping variable + if len_vdatum_name_attrs == 2: + test_ctx.out_of += 1 + test_ctx.messages.append( + "Cannot have both 'geoid_name' and " + "'geopotential_datum_name' attributes in " + "grid mapping variable '{}'".format(var.name) + ) + elif len_vdatum_name_attrs == 1: + # should be one or zero attrs + proj_db_path = os.path.join(pyproj.datadir.get_data_dir(), + "proj.db") + try: + with sqlite3.connect(proj_db_path) as conn: + v_datum_attr = next(iter(vert_datum_attrs)) + v_datum_value = getattr(var, v_datum_attr) + v_datum_str_valid = self._process_v_datum_str( + v_datum_value, conn + ) + + invalid_msg = ( + "Vertical datum value '{}' for " + "attribute '{}' in grid mapping " + "variable '{}' is not valid".format( + v_datum_value, v_datum_attr, var.name + ) + ) + test_ctx.assert_true(v_datum_str_valid, invalid_msg) + except sqlite3.Error as e: + # if we hit an error, skip the check + warn( + "Error occurred while trying to query " + "Proj4 SQLite database at {}: {}".format(proj_db_path, str(e)) + ) + prev_return[var.name] = test_ctx.to_result() + + return prev_return + + def _process_v_datum_str(self, v_datum_str, conn): + vdatum_query = """SELECT 1 FROM alias_name WHERE + table_name = 'vertical_datum' AND + alt_name = ? + UNION ALL + SELECT 1 FROM vertical_datum WHERE + name = ? + LIMIT 1""" + res_set = conn.execute(vdatum_query, (v_datum_str, v_datum_str)) + return len(res_set.fetchall()) > 0 + + def _check_dimensionless_vertical_coordinate_1_7( + self, ds, vname, deprecated_units, ret_val, dim_vert_coords_dict + ): + """ + Check that a dimensionless vertical coordinate variable is valid under + CF-1.7. 
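+
+        For example, a coordinate variable whose standard_name is
+        "ocean_sigma_coordinate" must carry a computed_standard_name listed
+        for that entry in the dimless_vertical_coordinates_1_7 mapping; any
+        other value fails the assertion below.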
+
+        :param netCDF4.Dataset ds: open netCDF4 dataset
+        :param str vname: variable name
+        :param list deprecated_units: list of deprecated unit strings
+        :param list ret_val: list to append Results to
+        :param dict dim_vert_coords_dict: Appendix D mapping of dimensionless
+            vertical coordinate standard names
+        :rtype None
+        """
+        variable = ds.variables[vname]
+        standard_name = getattr(variable, "standard_name", None)
+        formula_terms = getattr(variable, "formula_terms", None)
+        # Skip the variable if it's dimensional
+        if formula_terms is None and standard_name not in dim_vert_coords_dict:
+            return
+
+        # assert that the computed_standard_name maps to the standard_name correctly
+        correct_computed_std_name_ctx = TestCtx(
+            BaseCheck.MEDIUM, self.section_titles["4.3"]
+        )
+        _comp_std_name = dim_vert_coords_dict[standard_name][1]
+        correct_computed_std_name_ctx.assert_true(
+            getattr(variable, "computed_standard_name", None) in _comp_std_name,
+            "§4.3.3 The standard_name of `{}` must map to the correct computed_standard_name, `{}`".format(
+                vname, sorted(_comp_std_name)
+            ),
+        )
+        ret_val.append(correct_computed_std_name_ctx.to_result())
+
+    def check_dimensionless_vertical_coordinates(self, ds):
+        """
+        Check the validity of dimensionless coordinates under CF
+
+        CF §4.3.2 The units attribute is not required for dimensionless
+        coordinates.
+
+        The standard_name attribute associates a coordinate with its definition
+        from Appendix D, Dimensionless Vertical Coordinates. The definition
+        provides a mapping between the dimensionless coordinate values and
+        dimensional values that can positively and uniquely indicate the
+        location of the data.
+
+        A new attribute, formula_terms, is used to associate terms in the
+        definitions with variables in a netCDF file. To maintain backwards
+        compatibility with COARDS the use of these attributes is not required,
+        but is strongly recommended.
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: list
+        :return: List of results
+        """
+        ret_val = []
+
+        deprecated_units = ["level", "layer", "sigma_level"]
+
+        # compose this function to use the results from the CF-1.6 check
+        # and then extend it using a CF-1.7 addition
+        ret_val.extend(
+            self._check_dimensionless_vertical_coordinates(
+                ds,
+                deprecated_units,
+                self._check_dimensionless_vertical_coordinate_1_6,
+                dimless_vertical_coordinates_1_7,
+            )
+        )
+
+        ret_val.extend(
+            self._check_dimensionless_vertical_coordinates(
+                ds,
+                deprecated_units,
+                self._check_dimensionless_vertical_coordinate_1_7,
+                dimless_vertical_coordinates_1_7,
+            )
+        )
+
+        return ret_val
diff --git a/compliance_checker/cf/cf_1_8.py b/compliance_checker/cf/cf_1_8.py
new file mode 100644
index 00000000..9ba36373
--- /dev/null
+++ b/compliance_checker/cf/cf_1_8.py
@@ -0,0 +1,893 @@
+import itertools
+import re
+import warnings
+
+import numpy as np
+import requests
+
+from lxml import etree
+from netCDF4 import Dataset
+from shapely.geometry import (MultiPoint, LineString, MultiLineString, Polygon,
+                              MultiPolygon)
+
+from compliance_checker import MemoizedDataset
+from compliance_checker.base import BaseCheck, BaseNCCheck, Result, TestCtx
+from compliance_checker.cf.cf_1_7 import CF1_7Check
+from compliance_checker.cf.util import reference_attr_variables, string_from_var_type
+
+"""
+What's new in CF-1.8
+--------------------
+2.7. Groups
+    2.7.1. Scope
+    2.7.2. Application of attributes
+
+6.1.2. Taxon Names and Identifiers
+
+7.5. Geometries
+"""
+
+
+class CF1_8Check(CF1_7Check):
+    """Implementation for CF v1.8.
Inherits from CF1_7Check.""" + + # things that are specific to 1.8 + _cc_spec_version = "1.8" + _cc_url = "http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html" + + ROOT_GROUP_ONLY_ATTRS = ["Conventions", "external_variables"] + NON_ROOT_GROUP_OPT = ["title", "history"] + + def __init__(self, options=None): + super(CF1_8Check, self).__init__(options) + self.section_titles.update({"2.7": + "§2.7 Groups", + "6.1.2": + "§6.1.2 Taxon Names and Identifiers", + "7.5": "§7.5 Geometries"}) + + def check_groups(self, ds: MemoizedDataset): + """ + 2.7.2. Application of attributes + + The following attributes are optional for non-root groups. They are allowed in order to + provide additional provenance and description of the subsidiary data. They do not override + attributes from parent groups. + + - title + - history + + If these attributes are present, they may be applied additively to the parent attributes of + the same name. If a file containing groups is modified, the user or application need only + update these attributes in the root group, rather than traversing all groups and updating + all attributes that are found with the same name. In the case of conflicts, the root group + attribute takes precedence over per-group instances of these attributes. + + The following attributes MAY ONLY be used in the root group and SHALL NOT be duplicated or + overridden in child groups: + + - Conventions + - external_variables + + Furthermore, per-variable attributes MUST be attached to the variables to which they refer. + They MAY NOT be attached to a group, even if all variables within that group use the same + attribute and value. + + If attributes are present within groups without being attached to a variable, these + attributes apply to the group where they are defined, and to that group’s descendants, but + not to ancestor or sibling groups. If a group attribute is defined in a parent group, and + one of the child group redefines the same attribute, the definition within the child group + applies for the child and all of its descendants. + """ + + results = [] + + ctx_hi = TestCtx(BaseCheck.HIGH, self.section_titles["2.7"]) + ctx_lo = TestCtx(BaseCheck.LOW, self.section_titles["2.7"]) + + # Make sure `Conventions` & `external_variables` attributes are only present in the + # root group. + for gname in ds.groups: + ginstance = ds.createGroup( + gname + ) # returns existing Group; doesn't create a new one + + for attr in ginstance.ncattrs(): + if attr in CF1_8Check.ROOT_GROUP_ONLY_ATTRS: + + ctx_hi.messages.append( + f'§2.7.2 Attribute "{ attr }" MAY ONLY be used in the root group ' + "and SHALL NOT be duplicated or overridden in child groups." + ) + + results.append(ctx_hi.to_result()) + + elif attr in CF1_8Check.NON_ROOT_GROUP_OPT: + + ctx_lo.messages.append( + f"§2.7.2 Note: attribute '{ attr }' found on non-root group '{ gname }'. " + "This is optional for non-root groups. It is allowed in order to provide additional " + "provenance and description of the subsidiary data. It does not override " + "attributes from parent groups." 
+                    )
+                    results.append(ctx_lo.to_result())
+
+        return results
+
+    def check_taxa(self, ds: Dataset):
+        """
+        6.1.2. Taxon Names and Identifiers
+
+        A taxon is a named level within a biological classification, such as a class, genus and
+        species.
Quantities dependent on taxa have generic standard names containing the phrase
+        "organisms_in_taxon", and the taxa are identified by auxiliary coordinate variables.
+
+        The taxon auxiliary coordinate variables are string-valued. The plain-language name of the
+        taxon MUST be contained in a variable with standard_name of 'biological_taxon_name'. A Life
+        Science Identifier (LSID) may be contained in a variable with standard_name of
+        biological_taxon_lsid. This is a URN with the syntax
+        "urn:lsid:<authority>:<namespace>:<object_id>[:<version>]". This includes the reference
+        classification in the <authority> element and these are restricted by the LSID governance.
+        It is strongly recommended in CF that the authority chosen is World Register of Marine
+        Species (WoRMS) for oceanographic data and Integrated Taxonomic Information System (ITIS)
+        for freshwater and terrestrial data. WoRMS LSIDs are built from the WoRMS AphiaID taxon
+        identifier such as "urn:lsid:marinespecies.org:taxname:104464" for AphiaID 104464. This may
+        be converted to a URL by adding prefixes such as http://www.lsid.info/. ITIS LSIDs are
+        built from the ITIS Taxonomic Serial Number (TSN), such as
+        "urn:lsid:itis.gov:itis_tsn:180543".
+
+        The biological_taxon_name auxiliary coordinate variable included for human readability is
+        MANDATORY. The biological_taxon_lsid auxiliary coordinate variable included for software
+        agent readability is optional, but strongly recommended. If both are present then each
+        biological_taxon_name coordinate must exactly match the name resolved from the
+        biological_taxon_lsid coordinate. If LSIDs are available for some taxa in a dataset then
+        the biological_taxon_lsid auxiliary coordinate variable should be included and missing data
+        given for those taxa that do not have an identifier.
+        """
+        ret_val = []
+        # taxa identification variables
+        taxa_name_variables = ds.get_variables_by_attributes(
+            standard_name="biological_taxon_name")
+        taxa_lsid_variables = ds.get_variables_by_attributes(
+            standard_name="biological_taxon_lsid")
+
+        def match_taxa_standard_names(standard_name_string):
+            """
+            Match variables which are standard_names related to taxa, but
+ """ + return ( + standard_name_string is not None + and "taxon" in standard_name_string + and + # exclude the identifiers we just looked at + standard_name_string + not in {"biological_taxon_lsid", "biological_taxon_name"} + and standard_name_string in self._std_names + ) + + taxa_quantifier_variables = ds.get_variables_by_attributes( + standard_name=match_taxa_standard_names + ) + # If there are no matches, there either are no taxa variables + # or the standard names are not appropriate, which will be picked up + # by the standard_name check + if not taxa_quantifier_variables: + return + + for taxon_quantifier_variable in taxa_quantifier_variables: + valid_taxa = TestCtx(BaseCheck.HIGH, self.section_titles["6.1.2"]) + if not isinstance( + getattr(taxon_quantifier_variable, "coordinates", None), str + ): + valid_taxa.add_failure( + f'{taxon_quantifier_variable.name} must have a string valued "coordinates" attribute' + ) + continue + + coordinate_var_names = taxon_quantifier_variable.coordinates.split(" ") + invalid_coord_vars = set(coordinate_var_names) - ds.variables.keys() + if invalid_coord_vars: + valid_taxa.add_failure( + 'The following values for "coordinates" attributes were not found in the dataset\'s variables ' + f"{invalid_coord_vars}" + ) + + if len(coordinate_var_names) > 2: + valid_taxa.add_failure( + "coordinates attribute for taxon data must either reference one or two variable names" + ) + continue + + coordinate_vars = [ + ds.variables[var_name] for var_name in coordinate_var_names + ] + + coord_var_standard_names = { + var: getattr(var, "standard_name", None) for var in coordinate_vars + } + # if we have no authority, we can't check validity of the name -- assume it's OK + standard_name_set = set(coord_var_standard_names.values()) + if set(coord_var_standard_names.keys()) == {"biological_taxon_name"}: + # TODO: Check for at least binomial nomenclature? + continue + # check against WoRMS or ITIS if applicable + elif standard_name_set == { + "biological_taxon_name", + "biological_taxon_lsid", + }: + inverted_dict = {v: k for k, v in coord_var_standard_names.items()} + taxon_lsid_var = inverted_dict["biological_taxon_lsid"] + taxon_name_var = inverted_dict["biological_taxon_name"] + lsid_messages = self.handle_lsid(taxon_lsid_var, taxon_name_var) + valid_taxa.out_of += 1 + if lsid_messages: + valid_taxa.messages.extend(lsid_messages) + else: + valid_taxa.score += 1 + else: + valid_taxa.add_failure( + f"coordinates attribute for variable {taxon_quantifier_variable} must consist of " + 'variables containing standard names of either just "biological_taxon_name", or "biological_taxon_name" and "biological_taxon_identifier"' + ) + ret_val.append(valid_taxa.to_result()) + + return ret_val + + def handle_lsid(self, taxon_lsid_variable, taxon_name_variable): + """ + Checks if LSID is well formed and present in the LSID database, + and then attempts to delegate to WoRMS or ITIS, the LSID is applicable. + If the LSID does not check the above authorities, it is not + currently checked for correctness. + """ + messages = [] + match_str = ( + r"(?:http://(?:www\.)?lsid.info/)?urn:lsid:" + r"(?P[^:]+):(?P[^:]+):" + r"(?P\w+)(?::(?P\w+))?" + ) + for taxon_lsid, taxon_name in zip( + taxon_lsid_variable[:], taxon_name_variable[:] + ): + # TODO: handle case where LSID is not present. This can happen + # if the species is not present in the database desired. 
+            taxon_name_str = string_from_var_type(taxon_name)
+            lsid_str = string_from_var_type(taxon_lsid)
+            # if nodata/empty string for LSID, skip validity check
+            if lsid_str == "":
+                continue
+            taxon_match = re.fullmatch(match_str, lsid_str)
+            if not taxon_match:
+                messages.append(
+                    "Taxon id must match one of the following forms:\n"
+                    "- urn:lsid:<authority>:<namespace>:<object_id>\n"
+                    "- urn:lsid:<authority>:<namespace>:<object_id>:<version>\n"
+                    "- www.lsid.info/urn:lsid:<authority>:<namespace>:<object_id>\n"
+                    "- www.lsid.info/urn:lsid:<authority>:<namespace>:<object_id>:<version>\n"
+                    "- lsid.info/urn:lsid:<authority>:<namespace>:<object_id>\n"
+                    "- lsid.info/urn:lsid:<authority>:<namespace>:<object_id>:<version>\n"
+                    "- http://lsid.info/urn:lsid:<authority>:<namespace>:<object_id>\n"
+                    "- http://lsid.info/urn:lsid:<authority>:<namespace>:<object_id>:<version>\n"
+                    "- http://www.lsid.info/urn:lsid:<authority>:<namespace>:<object_id>\n"
+                    "- http://www.lsid.info/urn:lsid:<authority>:<namespace>:<object_id>:<version>"
+                )
+                continue
+            if lsid_str.startswith("urn"):
+                lsid_url = f"http://www.lsid.info/{lsid_str}"
+            else:
+                lsid_url = lsid_str
+
+            try:
+                response = requests.get(lsid_url, timeout=10)
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                # 400 error code indicates something is malformed on client's
+                # end
+                if response.status_code == 400:
+                    tree = etree.HTML(response.text)
+                    problem_text = tree.find("./body/p").text
+                    messages.append(
+                        "http://lsid.info returned an error message "
+                        f"for submitted LSID string '{lsid_str}': "
+                        f"{problem_text}"
+                    )
+                else:
+                    messages.append(
+                        "Error occurred attempting to check LSID "
+                        f"'{lsid_str}': {str(e)}"
+                    )
+                continue
+
+            # WoRMS -- marine bio data
+            if (
+                taxon_match["authority"] == "marinespecies.org"
+                and taxon_match["namespace"] == "taxname"
+            ):
+                try:
+                    response = requests.get(
+                        f"http://www.marinespecies.org/rest/AphiaRecordByAphiaID/{taxon_match['object_id']}",
+                        timeout=15,
+                    )
+                    response.raise_for_status()
+                except requests.exceptions.RequestException as e:
+                    messages.append(
+                        f"Aphia ID {taxon_match['object_id']} returned "
+                        f"other error: {str(e)}"
+                    )
+                # record not found in database
+                if response.status_code == 204:
+                    messages.append(
+                        f"Aphia ID {taxon_match['object_id']} "
+                        "not found in WoRMS database"
+                    )
+                # good case, parse JSON
+                elif response.status_code == 200:
+                    valid_name = response.json()["valid_name"]
+                    if valid_name != taxon_name_str:
+                        messages.append(
+                            "Supplied taxon name and WoRMS valid name do not match. "
+                            f"Supplied taxon name is '{taxon_name_str}', WoRMS valid name "
+                            f"is '{valid_name}.'"
+                        )
+                # Misc non-error code. Should not reach here.
+                else:
+                    messages.append(
+                        f"Aphia ID {taxon_match['object_id']} "
+                        "returned an unhandled HTTP status "
+                        f"code {response.status_code}"
+                    )
+                continue
+
+            # ITIS -- freshwater bio data
+            elif (
+                taxon_match["authority"] == "itis.gov"
+                and taxon_match["namespace"] == "itis_tsn"
+            ):
+                itis_url = f"https://www.itis.gov/ITISWebService/jsonservice/getFullRecordFromTSN?tsn={taxon_match['object_id']}"
+                try:
+                    itis_response = requests.get(itis_url, timeout=15)
+                    itis_response.raise_for_status()
+                except requests.exceptions.RequestException as e:
+                    if itis_response.status_code == 404:
+                        messages.append(
+                            f"itis.gov TSN {taxon_match['object_id']} not found."
+                        )
+                        continue
+                    else:
+                        messages.append(
+                            f"itis.gov identifier returned other error: {str(e)}"
+                        )
+                        continue
+                json_contents = itis_response.json()
+                combined_name = json_contents["scientificName"]["combinedName"]
+
+                if taxon_name_str != combined_name:
+                    messages.append(
+                        "Supplied taxon name and ITIS scientific name do not match.
" + f"Supplied taxon name is '{taxon_name_str}', ITIS scientific name " + f"for TSN {taxon_match['object_id']} is '{combined_name}.'" + ) + + else: + warnings.warn( + "Compliance checker only supports checking valid " + "LSID URNs of the form " + "'urn:lsid:marinespecies.org:taxname:' or " + "'urn:lsid:itis.gov:itis_tsn:'. Assuming " + "pass condition" + ) + + return messages + + +class GeometryStorage(object): + """Abstract base class for geometries""" + + def __init__(self, coord_vars, node_count): + self.coord_vars = coord_vars + self.node_count = node_count + self.errors = [] + # geometry is later parsed after sanity checks are run + self.geometry = None + + def _split_mulitpart_geometry(self): + arr_extents_filt = self.part_node_count[self.part_node_count > 0] + splits = np.split(np.vstack(self.coord_vars).T, + arr_extents_filt.cumsum()[:-1]) + return splits + +class PointGeometry(GeometryStorage): + """Class for validating Point/MultiPoint geometries""" + + def check_geometry(self): + super().check_geometry() + # non-multipoint should have exactly one feature + if self.node_count is None: + expected_node_count = 1 + else: + expected_node_count = self.node_count + + if all(len(cv.dimensions) != 0 for cv in self.coord_vars): + same_dim_group = itertools.groupby(self.coord_vars, + lambda x: x.dimensions) + same_dim = (next(same_dim_group, True) and + not next(same_dim_group, False)) + if not same_dim: + self.errors.append("For a point geometry, coordinate " + "variables must be the same length as " + "node_count defined, or must be " + "length 1 if node_count is not set") + return self.errors + +class LineGeometry(GeometryStorage): + """Class for validating Line/MultiLine geometries""" + def __init__(self, coord_vars, node_count, part_node_count): + super().__init__(coord_vars, node_count) + self.part_node_count = part_node_count + if not np.issubdtype(self.node_count.dtype, np.integer): + raise TypeError("For line geometries, node_count must be an integer") + + def check_geometry(self): + geom_errors = [] + same_dim_group = itertools.groupby(self.coord_vars, + lambda x: x.dimensions) + same_dim = (next(same_dim_group, True) and + not next(same_dim_group, False)) + if not same_dim: + raise IndexError("Coordinate variables must be the same length. " + "If node_count is specified, this value must " + "also sum to the length of the coordinate " + "variables.") + # if a multipart + if self.node_count is not None: + same_length = len(self.coord_vars[0]) == self.node_count[:].sum() + if not same_length: + geom_errors.append("Coordinate variables must be the same " + "length. If node_count is specified, this " + "value must also sum to the length of the " + "coordinate variables.") + if self.part_node_count is not None: + if not np.issubdtype(self.part_node_count.dtype, np.integer): + geom_errors.append("when part_node_count is specified, it must " + "be an array of integers") + same_node_count = len(self.coord_vars[0]) == self.node_count[:].sum() + if not same_node_count: + geom_errors.append("The sum of part_node_count must be equal " + "to the value of node_count") + return geom_errors + + +class PolygonGeometry(LineGeometry): + """Class for validating Line/MultiLine geometries""" + # TODO/clarify: Should polygons be simple, i.e. non-self intersecting? 
+ # Presumably + def __init__(self, coord_vars, node_count, part_node_count, + interior_ring): + super().__init__(coord_vars, node_count, part_node_count) + self.part_node_count = part_node_count + self.interior_ring = interior_ring + + def check_polygon_orientation(self, transposed_coords, interior=False): + """ + Checks that the polygon orientation is counter-clockwise if an + exterior ring, otherwise clockwise if an interior ring. Orientation + is indicated by the `interior` boolean variable with False for an + exterior ring and True for an interior ring (hole), defaulting to False. + This function operates piecewise on individual interior/exterior + polygons as well as multipart polygons + :param np.array transposed_coords: A 2-by-n array of x and y coordinates + :param bool interior: A boolean defaulting to False which has False + indicating a counter-clockwise or exterior polygon, and True + indicating a clockwise or interior polygon. + :rtype bool: + :returns: True if the polygon follows the proper orientation, + False if it fails the orientation test. + """ + + try: + polygon = Polygon(transposed_coords.tolist()) + except ValueError: + raise ValueError("Polygon contains too few points to perform orientation test") + + ccw = polygon.exterior.is_ccw + return not ccw if interior else ccw + + def check_geometry(self): + messages = super().check_geometry() + # If any errors occurred within the preliminary checks, they preclude + # running checks against the geometry here. + if messages: + return messages + if self.part_node_count is not None: + extents = np.concatenate([np.array([0]), + self.part_node_count[:].cumsum()]) + if self.interior_ring is not None: + ring_orientation = self.interior_ring[:].astype(bool) + else: + ring_orientation = np.zeros(len(self.part_count), dtype=bool) + current_node_count = self.node_count[:].copy() + node_indexer_len = len(self.part_node_count) + else: + extents = np.concatenate([np.array([0]), + self.node_count[:].cumsum()]) + node_indexer_len = len(self.node_count) + ring_orientation = np.zeros(node_indexer_len, dtype=bool) + # TODO: is it necessary to check whether part_node_count "consumes" + # node_count in the polygon, i.e. 
first (3, 3, 3) will consume + # a node part of 9, follow by next 3 will consume a node part of + # 3 after consuming + for i in range(node_indexer_len): + extent_slice = slice(extents[i], extents[i+1]) + poly_sliced = np.vstack([cv[extent_slice] for cv in + self.coord_vars]).T + pass_orientation = (self.check_polygon_orientation( + poly_sliced, + ring_orientation[i])) + if not pass_orientation: + orient_fix = (("exterior", "counterclockwise") + if not ring_orientation[i] else + ("interior", "clockwise")) + message = (f"An {orient_fix[0]} polygon referred to by " + f"coordinates ({poly_sliced}) must have coordinates " + f"in {orient_fix[1]} order") + messages.append(message) + return messages + + def check_geometry(self, ds: Dataset): + """Runs any necessary checks for geometry well-formedness + :param netCDF4.Dataset ds: An open netCDF dataset + :returns list: List of error messages + + """ + vars_with_geometry = ds.get_variables_by_attributes( + geometry=lambda g: g is not None + ) + results = [] + unique_geometry_var_names = {var.geometry for var in vars_with_geometry} + if unique_geometry_var_names: + geom_valid = TestCtx(BaseCheck.MEDIUM, self.section_titles["7.5"]) + geom_valid.out_of += 1 + for geometry_var_name in unique_geometry_var_names: + if geometry_var_name not in ds.variables: + geom_valid.messages.append( + "Cannot find geometry variable " f"named {geometry_var_name}" + ) + results.append(geom_valid.to_result()) + continue + else: + geometry_var = ds.variables[geometry_var_name] + + geometry_type = getattr(geometry_var, "geometry_type") + try: + node_coord_var_names = geometry_var.node_coordinates + except AttributeError: + geom_valid.messages.append( + "Could not find required attribute " + '"node_coordinates" in geometry ' + f'variable "{geometry_var_name}"' + ) + results.append(geom_valid.to_result()) + continue + if not isinstance(node_coord_var_names, str): + geom_valid.messages.append( + 'Attribute "node_coordinates" in geometry ' + f'variable "{geometry_var_name}" must be ' + "a string" + ) + results.append(geom_valid.to_result()) + continue + split_coord_names = node_coord_var_names.strip().split(" ") + node_coord_vars, not_found_node_vars = [], [] + for coord_var_name in split_coord_names: + try: + node_coord_vars.append(ds.variables[coord_var_name]) + except KeyError: + not_found_node_vars.append(coord_var_name) + # If any variables weren't found, we can't continue + if not_found_node_vars: + geom_valid.messages.append( + "The following referenced node coordinate" + "variables for geometry variable" + f'"{geometry_var_name}" were not found: ' + f"{not_found_node_vars}" + ) + results.append(geom_valid.to_result()) + continue + + node_count = reference_attr_variables( + ds, getattr(geometry_var, "node_count", None) + ) + # multipart lines and polygons only + part_node_count = reference_attr_variables( + ds, getattr(geometry_var, "part_node_count", None) + ) + # polygons with interior geometry only + interior_ring = reference_attr_variables( + ds, getattr(geometry_var, "interior_ring", None) + ) + + if geometry_type == "point": + geometry = PointGeometry(node_coord_vars, node_count) + elif geometry_type == "line": + geometry = LineGeometry(node_coord_vars, node_count, part_node_count) + elif geometry_type == "polygon": + geometry = PolygonGeometry( + node_coord_vars, node_count, part_node_count, interior_ring + ) + else: + geom_valid.messages.append( + f'For geometry variable "{geometry_var_name}' + 'the attribute "geometry_type" must exist' + "and have one of the 
following values:" + '"point", "line", "polygon"' + ) + results.append(geom_valid.to_result()) + continue + # check geometry + geometry.check_geometry() + if not geometry.errors: # geom_valid.messages: + geom_valid.score += 1 + results.append(geom_valid.to_result()) + return results + + +class GeometryStorage(object): + """Abstract base class for geometries""" + + def __init__(self, coord_vars, node_count): + self.coord_vars = coord_vars + self.node_count = node_count + self.errors = [] + # geometry is later parsed after sanity checks are run + self.geometry = None + + def check_geometry(self): + invalid_vars = [] + for coord_var in self.coord_vars: + if not np.issubdtype(coord_var, np.float): + invalid_vars.append(coord_var.name) + # can't continue if the geometry variables are not the correct size + if invalid_vars: + self.errors.append( + "The following geometry variables " + f"have non-numeric contents: {invalid_vars}" + ) + + def _split_mulitpart_geometry(self): + arr_extents_filt = self.part_node_count[self.part_node_count > 0] + splits = np.split(np.vstack(self.coord_vars).T, arr_extents_filt.cumsum()[:-1]) + return splits + + +class PointGeometry(GeometryStorage): + """Class for validating Point/MultiPoint geometries""" + + def check_geometry(self): + super().check_geometry() + + if all(len(cv.dimensions) != 0 for cv in self.coord_vars): + same_dim_group = itertools.groupby(self.coord_vars, lambda x: x.dimensions) + same_dim = next(same_dim_group, True) and not next(same_dim_group, False) + if not same_dim: + self.errors.append( + "For a point geometry, coordinate " + "variables must be the same length as " + "node_count defined, or must be " + "length 1 if node_count is not set" + ) + return self.errors + + +class LineGeometry(GeometryStorage): + """Class for validating Line/MultiLine geometries""" + + def __init__(self, coord_vars, node_count, part_node_count): + super().__init__(coord_vars, node_count) + self.part_node_count = part_node_count + if not np.issubdtype(self.node_count.dtype, np.integer): + raise TypeError("For line geometries, node_count must be an integer") + + def check_geometry(self): + geom_errors = [] + same_dim_group = itertools.groupby(self.coord_vars, lambda x: x.dimensions) + same_dim = next(same_dim_group, True) and not next(same_dim_group, False) + if not same_dim: + raise IndexError( + "Coordinate variables must be the same length. " + "If node_count is specified, this value must " + "also sum to the length of the coordinate " + "variables." + ) + # if a multipart + if self.node_count is not None: + same_length = len(self.coord_vars[0]) == self.node_count[:].sum() + if not same_length: + geom_errors.append( + "Coordinate variables must be the same " + "length. If node_count is specified, this " + "value must also sum to the length of the " + "coordinate variables." + ) + if self.part_node_count is not None: + if not np.issubdtype(self.part_node_count.dtype, np.integer): + geom_errors.append( + "when part_node_count is specified, it must " + "be an array of integers" + ) + same_node_count = len(self.coord_vars[0]) == self.node_count[:].sum() + if not same_node_count: + geom_errors.append( + "The sum of part_node_count must be equal " + "to the value of node_count" + ) + return geom_errors + + +class PolygonGeometry(LineGeometry): + """Class for validating Line/MultiLine geometries""" + + # TODO/clarify: Should polygons be simple, i.e. non-self intersecting? 
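+    # (Hypothetical counter-example for the TODO above: a "bowtie" ring
+    # (0,0) (1,1) (1,0) (0,1) self-intersects; shapely still computes
+    # exterior.is_ccw for it from the signed area, so the orientation
+    # check below would not flag such a polygon as invalid.)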
+    # Presumably
+    def __init__(self, coord_vars, node_count, part_node_count, interior_ring):
+        super().__init__(coord_vars, node_count, part_node_count)
+        self.interior_ring = interior_ring
+
+    def check_polygon_orientation(self, transposed_coords, interior=False):
+        """
+        Checks that the polygon orientation is counter-clockwise if an
+        exterior ring, otherwise clockwise if an interior ring. Orientation
+        is indicated by the `interior` boolean variable with False for an
+        exterior ring and True for an interior ring (hole), defaulting to False.
+        This function operates piecewise on individual interior/exterior
+        polygons as well as multipart polygons
+        :param np.array transposed_coords: An n-by-2 array of x and y coordinates
+        :param bool interior: A boolean defaulting to False which has False
+                              indicating a counter-clockwise or exterior polygon, and True
+                              indicating a clockwise or interior polygon.
+        :rtype bool:
+        :returns: True if the polygon follows the proper orientation,
+                  False if it fails the orientation test.
+        """
+
+        try:
+            polygon = Polygon(transposed_coords.tolist())
+        except ValueError:
+            raise ValueError(
+                "Polygon contains too few points to perform orientation test"
+            )
+
+        ccw = polygon.exterior.is_ccw
+        return not ccw if interior else ccw
+
+    def check_geometry(self):
+        messages = super().check_geometry()
+        # If any errors occurred within the preliminary checks, they preclude
+        # running checks against the geometry here.
+        if messages:
+            return messages
+        if self.part_node_count is not None:
+            extents = np.concatenate([np.array([0]), self.part_node_count[:].cumsum()])
+            if self.interior_ring is not None:
+                ring_orientation = self.interior_ring[:].astype(bool)
+            else:
+                ring_orientation = np.zeros(len(self.part_node_count), dtype=bool)
+            node_indexer_len = len(self.part_node_count)
+        else:
+            extents = np.concatenate([np.array([0]), self.node_count[:].cumsum()])
+            node_indexer_len = len(self.node_count)
+            ring_orientation = np.zeros(node_indexer_len, dtype=bool)
+        # TODO: is it necessary to check whether part_node_count "consumes"
+        # node_count in the polygon, i.e. whether a part_node_count of
+        # (3, 3, 3) consumes a node_count entry of 9 while the next
+        # part_node_count of 3 consumes a node_count entry of 3?
+        for i in range(node_indexer_len):
+            extent_slice = slice(extents[i], extents[i + 1])
+            poly_sliced = np.vstack([cv[extent_slice] for cv in self.coord_vars]).T
+            pass_orientation = self.check_polygon_orientation(
+                poly_sliced, ring_orientation[i]
+            )
+            if not pass_orientation:
+                orient_fix = (
+                    ("exterior", "counterclockwise")
+                    if not ring_orientation[i]
+                    else ("interior", "clockwise")
+                )
+                message = (
+                    f"An {orient_fix[0]} polygon referred to by "
+                    f"coordinates ({poly_sliced}) must have coordinates "
+                    f"in {orient_fix[1]} order"
+                )
+                messages.append(message)
+        return messages
diff --git a/compliance_checker/cf/cf_base.py b/compliance_checker/cf/cf_base.py
new file mode 100644
index 00000000..ab6ebf20
--- /dev/null
+++ b/compliance_checker/cf/cf_base.py
@@ -0,0 +1,1281 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import logging
+import os
+import sys
+
+from collections import OrderedDict, defaultdict
+from warnings import warn
+
+import numpy as np
+import regex
+
+from cf_units import Unit
+
+from compliance_checker import cfutil
+from compliance_checker.base import BaseCheck, BaseNCCheck, Result, TestCtx
+from compliance_checker.cf import util
+from compliance_checker.cf.appendix_d import (
+    dimless_vertical_coordinates_1_6,
+    dimless_vertical_coordinates_1_7,
+    no_missing_terms,
+)
+from compliance_checker.cf.appendix_e import cell_methods16, cell_methods17
+from compliance_checker.cf.appendix_f import (
+    ellipsoid_names17,
+    grid_mapping_attr_types16,
+    grid_mapping_attr_types17,
+    grid_mapping_dict16,
+    grid_mapping_dict17,
+    horizontal_datum_names17,
+    prime_meridian_names17,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class CFBaseCheck(BaseCheck):
+    """
+    CF Convention Checker Base
+    """
+
+    def __init__(self, options=None):
+        # The compliance checker can be run on multiple datasets in a single
+        # instantiation, so caching values has to be done by the unique
+        # identifier for each dataset loaded.
+ + # Each default dict is a key, value mapping from the dataset object to + # a list of variables + super(CFBaseCheck, self).__init__(options) + self._coord_vars = defaultdict(list) + self._ancillary_vars = defaultdict(list) + self._clim_vars = defaultdict(list) + self._metadata_vars = defaultdict(list) + self._boundary_vars = defaultdict(list) + self._geophysical_vars = defaultdict(list) + self._aux_coords = defaultdict(list) + + self._std_names = util.StandardNameTable() + + self.section_titles = { # dict of section headers shared by grouped checks + "2.2": "§2.2 Data Types", + "2.3": "§2.3 Naming Conventions", + "2.4": "§2.4 Dimensions", + "2.5": "§2.5 Variables", + "2.6": "§2.6 Attributes", + "3.1": "§3.1 Units", + "3.2": "§3.2 Long Name", + "3.3": "§3.3 Standard Name", + "3.4": "§3.4 Ancillary Data", + "3.5": "§3.5 Flags", + "4": "§4 Coordinate Types", + "4.1": "§4.1 Latitude Coordinate", + "4.2": "§4.2 Longitude Coordinate", + "4.3": "§4.3 Vertical Coordinate", + "4.4": "§4.4 Time Coordinate", + "4.5": "§4.5 Discrete Axis", + "5": "§5 Coordinate Systems", + "5.1": "§5.1 Independent Latitude, Longitude, Vertical, and Time Axes", + "5.2": "§5.2 2-D Latitude, Longitude, Coordinate Variables", + "5.3": "§5.3 Reduced Horizontal Grid", + "5.4": "§5.4 Timeseries of Station Data", + "5.5": "§5.5 Trajectories", + "5.6": "§5.6 Horizontal Coordinate Reference Systems, Grid Mappings, Projections", + "5.7": "§5.7 Scalar Coordinate Variables", + "6.1": "§6.1 Labels", + "6.2": "§6.2 Alternative Coordinates", + "7.1": "§7.1 Cell Boundaries", + "7.2": "§7.2 Cell Measures", + "7.3": "§7.3 Cell Methods", + "7.4": "§7.4 Climatological Statistics", + "8.1": "§8.1 Packed Data", + "8.2": "§8.2 Compression by Gathering", + "9.1": "§9.1 Features and feature types", + "9.2": "§9.2 Collections, instances, and elements", + "9.3": "§9.3 Representations of Collections of features in data variables", + "9.4": "§9.4 The featureType attribute", + "9.5": "§9.5 Coordinates and metadata", + "9.6": "§9.6 Missing Data", + } + + ################################################################################ + # Helper Methods - var classifications, etc + ################################################################################ + + def setup(self, ds): + """ + Initialize various special variable types within the class. + Mutates a number of instance variables. + + :param netCDF4.Dataset ds: An open netCDF dataset + """ + self.coord_vars = self._find_coord_vars(ds) + self._find_aux_coord_vars(ds) + self._find_ancillary_vars(ds) + self._find_clim_vars(ds) + self._find_boundary_vars(ds) + self._find_metadata_vars(ds) + self._find_cf_standard_name_table(ds) + self._find_geophysical_vars(ds) + coord_containing_vars = ds.get_variables_by_attributes( + coordinates=lambda val: isinstance(val, str) + ) + + # coordinate data variables + + # Excerpt from "§1.3 Overview" on coordinate data + # There are two methods used to identify variables that contain + # coordinate data. The first is to use the NUG-defined "coordinate + # variables." The use of coordinate variables is required for all + # dimensions that correspond to one dimensional space or time + # coordinates . In cases where coordinate variables are not applicable, + # the variables containing coordinate data are identified by the + # coordinates attribute. 
+ + # first read in variables referred to in coordinates which exist + # in the dataset + self.coord_data_vars = set() + for var in coord_containing_vars: + for coord_var_name in var.coordinates.strip().split(" "): + if coord_var_name in ds.variables: + self.coord_data_vars.add(coord_var_name) + # then add in the NUG coordinate variables -- single dimension with + # dimension name the same as coordinates + self.coord_data_vars.update(self.coord_vars) + + def check_grid_mapping(self, ds): + """ + 5.6 When the coordinate variables for a horizontal grid are not + longitude and latitude, it is required that the true latitude and + longitude coordinates be supplied via the coordinates attribute. If in + addition it is desired to describe the mapping between the given + coordinate variables and the true latitude and longitude coordinates, + the attribute grid_mapping may be used to supply this description. + + This attribute is attached to data variables so that variables with + different mappings may be present in a single file. The attribute takes + a string value which is the name of another variable in the file that + provides the description of the mapping via a collection of attached + attributes. This variable is called a grid mapping variable and is of + arbitrary type since it contains no data. Its purpose is to act as a + container for the attributes that define the mapping. + + The one attribute that all grid mapping variables must have is + grid_mapping_name which takes a string value that contains the mapping's + name. The other attributes that define a specific mapping depend on the + value of grid_mapping_name. The valid values of grid_mapping_name along + with the attributes that provide specific map parameter values are + described in Appendix F, Grid Mappings. + + When the coordinate variables for a horizontal grid are longitude and + latitude, a grid mapping variable with grid_mapping_name of + latitude_longitude may be used to specify the ellipsoid and prime + meridian. + + + In order to make use of a grid mapping to directly calculate latitude + and longitude values it is necessary to associate the coordinate + variables with the independent variables of the mapping. This is done by + assigning a standard_name to the coordinate variable. The appropriate + values of the standard_name depend on the grid mapping and are given in + Appendix F, Grid Mappings. 
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: list
+        :return: List of results
+        """
+
+        ret_val = OrderedDict()
+        grid_mapping_variables = cfutil.get_grid_mapping_variables(ds)
+
+        # Check that the grid_mapping attribute is a non-empty string and
+        # that its references exist
+        for variable in ds.get_variables_by_attributes(
+            grid_mapping=lambda x: x is not None
+        ):
+            grid_mapping = getattr(variable, "grid_mapping", None)
+            defines_grid_mapping = self.get_test_ctx(
+                BaseCheck.HIGH, self.section_titles["5.6"], variable.name
+            )
+            defines_grid_mapping.assert_true(
+                (isinstance(grid_mapping, str) and grid_mapping),
+                "{}'s grid_mapping attribute must be a "
+                "space-separated non-empty string".format(variable.name),
+            )
+            if isinstance(grid_mapping, str):
+                # TODO (badams): refactor functionality to split functionality
+                #                into requisite classes
+                if ":" in grid_mapping and self._cc_spec_version >= "1.7":
+                    colon_count = grid_mapping.count(":")
+                    re_all = regex.findall(
+                        r"(\w+):\s*((?:\w+\s+)*(?:\w+)(?![\w:]))", grid_mapping
+                    )
+                    if colon_count != len(re_all):
+                        defines_grid_mapping.out_of += 1
+                        defines_grid_mapping.messages.append(
+                            "Could not consume entire grid_mapping expression, "
+                            "please check for well-formedness"
+                        )
+                    else:
+                        for grid_var_name, coord_var_str in re_all:
+                            defines_grid_mapping.assert_true(
+                                grid_var_name in ds.variables,
+                                "grid mapping variable {} must exist in this dataset".format(
+                                    grid_var_name
+                                ),
+                            )
+                            for ref_var in coord_var_str.split():
+                                defines_grid_mapping.assert_true(
+                                    ref_var in ds.variables,
+                                    "Coordinate-related variable {} referenced by grid_mapping variable {} must exist in this dataset".format(
+                                        ref_var, grid_var_name
+                                    ),
+                                )
+
+                else:
+                    for grid_var_name in grid_mapping.split():
+                        defines_grid_mapping.assert_true(
+                            grid_var_name in ds.variables,
+                            "grid mapping variable {} must exist in this dataset".format(
+                                grid_var_name
+                            ),
+                        )
+            ret_val[variable.name] = defines_grid_mapping.to_result()
+
+        # Check the grid mapping variables themselves
+        for grid_var_name in grid_mapping_variables:
+            valid_grid_mapping = self.get_test_ctx(
+                BaseCheck.HIGH, self.section_titles["5.6"], grid_var_name
+            )
+            grid_var = ds.variables[grid_var_name]
+
+            grid_mapping_name = getattr(grid_var, "grid_mapping_name", None)
+
+            # Grid mapping name must be in appendix F
+            valid_grid_mapping.assert_true(
+                grid_mapping_name in self.grid_mapping_dict,
+                "{} is not a valid grid_mapping_name.".format(grid_mapping_name)
+                + " See Appendix F for valid grid mappings",
+            )
+
+            # The values of self.grid_mapping_dict are:
+            # - required attributes
+            # - optional attributes (can't check)
+            # - required standard_names defined
+            # - at least one of these attributes must be defined
+
+            # We can't do any of the other grid mapping checks if it's not
+            # a valid grid mapping name
+            if grid_mapping_name not in self.grid_mapping_dict:
+                ret_val[grid_mapping_name] = valid_grid_mapping.to_result()
+                continue
+
+            grid_mapping = self.grid_mapping_dict[grid_mapping_name]
+            required_attrs = grid_mapping[0]
+            # Make sure all the required attributes are defined
+            for req in required_attrs:
+                valid_grid_mapping.assert_true(
+                    hasattr(grid_var, req),
+                    "{} is a required attribute for grid mapping {}".format(
+                        req, grid_mapping_name
+                    ),
+                )
+
+            # Make sure that exactly one of the exclusive attributes exists
+            if len(grid_mapping) == 4:
+                at_least_attr = grid_mapping[3]
+                number_found = 0
+                for attr in at_least_attr:
+                    if hasattr(grid_var, attr):
+                        number_found += 1
+                valid_grid_mapping.assert_true(
+                    number_found == 1,
+                    "grid mapping {} ".format(grid_mapping_name)
+                    + "must define exactly one of these attributes: "
+                    + "{}".format(" or ".join(at_least_attr)),
+                )
+
+            # Make sure that exactly one variable is defined for each of the
+            # required standard_names
+            expected_std_names = grid_mapping[2]
+            for expected_std_name in expected_std_names:
+                found_vars = ds.get_variables_by_attributes(
+                    standard_name=expected_std_name
+                )
+                valid_grid_mapping.assert_true(
+                    len(found_vars) == 1,
+                    "grid mapping {} requires exactly ".format(grid_mapping_name)
+                    + "one variable with standard_name "
+                    + "{} to be defined".format(expected_std_name),
+                )
+
+            ret_val[grid_var_name] = valid_grid_mapping.to_result()
+
+        return ret_val
+
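A standalone sketch of the two grid_mapping forms the method above accepts. The extended CF 1.7 syntax pairs each grid mapping variable with the coordinate variables it applies to, and the regex should consume exactly one pair per colon (the variable names here are hypothetical):

import regex

simple = "crs_wgs84"                          # CF 1.6 style: one variable name
extended = "crs_wgs84: lat lon crs_utm: y x"  # CF >= 1.7 extended style

pairs = regex.findall(r"(\w+):\s*((?:\w+\s+)*(?:\w+)(?![\w:]))", extended)
print(pairs)  # [('crs_wgs84', 'lat lon'), ('crs_utm', 'y x')]
assert extended.count(":") == len(pairs)  # expression fully consumed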
+    def check_conventions_version(self, ds):
+        """
+        CF §2.6.1 requires the NUG-defined global attribute Conventions to
+        contain the string value "CF-<version_number>"; check that the
+        Conventions attribute contains the appropriate string.
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: compliance_checker.base.Result
+        """
+
+        valid = False
+        reasoning = []
+        correct_version_string = "{}-{}".format(
+            self._cc_spec, self._cc_spec_version
+        ).upper()
+        if hasattr(ds, "Conventions"):
+            conventions = regex.split(r",|\s+", getattr(ds, "Conventions", ""))
+            for convention in conventions:
+                if convention == correct_version_string:
+                    valid = True
+                    break
+            else:
+                reasoning = [
+                    "§2.6.1 Conventions global attribute does not contain "
+                    '"{}"'.format(correct_version_string)
+                ]
+        else:
+            valid = False
+            reasoning = ["§2.6.1 Conventions field is not present"]
+        return Result(
+            BaseCheck.MEDIUM, valid, self.section_titles["2.6"], msgs=reasoning
+        )
+
+    def _check_dimensionless_vertical_coordinates(
+        self,
+        ds,
+        deprecated_units,
+        version_specific_check,
+        version_specific_dimless_vertical_coord_dict,
+    ):
+        """
+        Check the validity of dimensionless coordinates under CF
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :param list deprecated_units: list of string names of deprecated units
+        :param function version_specific_check: version-specific implementation
+            to check dimensionless vertical coord
+        :param dict version_specific_dimless_vertical_coord_dict:
+            version-specific dict of dimensionless vertical coords and
+            computed standard names
+        :return: List of results
+        """
+        ret_val = []
+
+        z_variables = cfutil.get_z_variables(ds)
+
+        # call version-specific implementation
+        for name in z_variables:
+            version_specific_check(
+                ds,
+                name,
+                deprecated_units,
+                ret_val,
+                version_specific_dimless_vertical_coord_dict,
+            )
+
+        return ret_val
+
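Before the formula_terms check below, a standalone sketch of the component/variable pairing it parses: the attribute alternates "term: variable" tokens, and re-joining the regex matches must reproduce the attribute exactly (the variable names here are hypothetical):

import regex

formula_terms = "sigma: lev eta: zeta depth: bathymetry"
matches = regex.findall(
    r"([A-Za-z][A-Za-z0-9_]*: )([A-Za-z][A-Za-z0-9_]*)", formula_terms
)
terms = set(m[0][:-2] for m in matches)    # {'sigma', 'eta', 'depth'}
referenced_vars = [m[1] for m in matches]  # ['lev', 'zeta', 'bathymetry']
# a malformed attribute (extra spaces, stray tokens) fails this round trip
assert " ".join(m[0] + m[1] for m in matches) == formula_terms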
+    def _check_formula_terms(self, ds, coord, dimless_coords_dict):
+        """
+        Checks a dimensionless vertical coordinate contains valid formula_terms
+
+        - formula_terms is a non-empty string
+        - formula_terms matches the expected regex
+        - every variable defined in formula_terms exists
+
+        :param netCDF4.Dataset ds: An open netCDF dataset
+        :rtype: compliance_checker.base.Result
+        """
+        variable = ds.variables[coord]
+        standard_name = getattr(variable, "standard_name", None)
+        formula_terms = getattr(variable, "formula_terms", None)
+        valid_formula_terms = TestCtx(BaseCheck.HIGH, self.section_titles["4.3"])
+
+        valid_formula_terms.assert_true(
+            isinstance(formula_terms, str) and formula_terms,
+            "§4.3.2: {}'s formula_terms is a required attribute and must be a "
+            "non-empty string".format(coord),
+        )
+        # We can't check any more
+        if not formula_terms:
+            return valid_formula_terms.to_result()
+
+        # check that the formula_terms are well-formed and are present
+        # The pattern for formula terms is always component: variable_name;
+        # the regex grouping always has component names in even positions and
+        # the corresponding variable name in odd positions.
+        matches = regex.findall(
+            r"([A-Za-z][A-Za-z0-9_]*: )([A-Za-z][A-Za-z0-9_]*)", variable.formula_terms
+        )
+        terms = set(m[0][:-2] for m in matches)
+        # get the variables named in the formula terms and check if any
+        # are not present in the dataset
+        missing_vars = sorted(set(m[1] for m in matches) - set(ds.variables))
+        missing_fmt = (
+            "The following variable(s) referenced in {}:formula_terms are "
+            "not present in the dataset: {}"
+        )
+        valid_formula_terms.assert_true(
+            len(missing_vars) == 0, missing_fmt.format(coord, ", ".join(missing_vars))
+        )
+        # try to reconstruct formula_terms by adding space in between the regex
+        # matches.  If it doesn't exactly match the original, the formatting
+        # of the attribute is incorrect
+        reconstructed_formula = " ".join(m[0] + m[1] for m in matches)
+        valid_formula_terms.assert_true(
+            reconstructed_formula == formula_terms,
+            "Attribute formula_terms is not well-formed",
+        )
+
+        valid_formula_terms.assert_true(
+            standard_name in dimless_coords_dict,
+            "unknown standard_name '{}' for dimensionless vertical coordinate {}"
+            "".format(standard_name, coord),
+        )
+        if standard_name not in dimless_coords_dict:
+            return valid_formula_terms.to_result()
+
+        valid_formula_terms.assert_true(
+            no_missing_terms(standard_name, terms, dimless_coords_dict),
+            "{}'s formula_terms are invalid for {}, please see appendix D of "
+            "CF 1.6".format(coord, standard_name),
+        )
+
+        return valid_formula_terms.to_result()
+
+    def _check_grid_mapping_attr_condition(self, attr, attr_name, ret_val):
+        """
+        Evaluate a condition (or series of conditions) for a particular
+        attribute. Designed to be overloaded in subclass implementations.
+
+        :param attr: attribute to test condition for
+        :param str attr_name: name of the attribute
+        :param list ret_val: list of results to append to
+        :rtype: None
+        :return: None
+        """
+        raise NotImplementedError
+
+    def _dims_in_order(self, dimension_order):
+        """
+        :param list dimension_order: A list of axes
+        :rtype: bool
+        :return: Returns True if the dimensions are in order U*, T, Z, Y, X,
+                 False otherwise
+        """
+        regx = regex.compile(r"^[^TZYX]*T?Z?Y?X?$")
+        dimension_string = "".join(dimension_order)
+        return regx.match(dimension_string) is not None
+
+    def _parent_var_attr_type_check(self, attr_name, var, ctx):
+        """
+        Checks that an attribute has a data type equivalent to that of its
+        parent variable. Takes an attribute name, variable, and test context
+        on which to operate.
+ :param str attr_name: The name of the attribute to be checked + :param netCDF4.Variable var: The variable against which to be checked + :param compliance_checker.base.TestCtx ctx: The associated test context to modify + :rtype None + :return None + """ + attr_val = var.getncattr(attr_name) + + if isinstance(attr_val, (str, bytes)): + type_match = (var.dtype is str) or (var.dtype.kind == "S") + val_type = type(attr_val) + else: + val_type = attr_val.dtype.type + type_match = val_type == var.dtype.type + + ctx.assert_true( + type_match, + "Attribute '{}' (type: {}) and parent variable '{}' (type: {}) " + "must have equivalent datatypes".format( + attr_name, val_type, var.name, var.dtype.type + ), + ) + + def _find_aux_coord_vars(self, ds, refresh=False): + """ + Returns a list of auxiliary coordinate variables + + An auxiliary coordinate variable is any netCDF variable that contains + coordinate data, but is not a coordinate variable (in the sense of the term + defined by CF). + + :param netCDF4.Dataset ds: An open netCDF dataset + :param bool refresh: if refresh is set to True, the cache is + invalidated. + :rtype: list + :return: List of variable names (str) that are defined to be auxiliary + coordinate variables. + """ + if self._aux_coords.get(ds, None) and refresh is False: + return self._aux_coords[ds] + + self._aux_coords[ds] = cfutil.get_auxiliary_coordinate_variables(ds) + return self._aux_coords[ds] + + def _find_boundary_vars(self, ds, refresh=False): + """ + Returns dictionary of boundary variables mapping the variable instance + to the name of the variable acting as a boundary variable. + + :param netCDF4.Dataset ds: An open netCDF dataset + :param bool refresh: if refresh is set to True, the cache is + invalidated. + :rtype: list + :return: A list containing strings with boundary variable names. + """ + if self._boundary_vars.get(ds, None) and refresh is False: + return self._boundary_vars[ds] + + self._boundary_vars[ds] = cfutil.get_cell_boundary_variables(ds) + + return self._boundary_vars[ds] + + def _find_ancillary_vars(self, ds, refresh=False): + """ + Returns a list of variable names that are defined as ancillary + variables in the dataset ds. + + An ancillary variable generally is a metadata container and referenced + from other variables via a string reference in an attribute. + + - via ancillary_variables (3.4) + - "grid mapping var" (5.6) + - TODO: more? + + The result is cached by the passed in dataset object inside of this + checker. Pass refresh=True to redo the cached value. + + :param netCDF4.Dataset ds: An open netCDF dataset + :param bool refresh: if refresh is set to True, the cache is + invalidated. + :rtype: list + :return: List of variable names (str) that are defined as ancillary + variables in the dataset ds. 
+ """ + + # Used the cached version if it exists and is not empty + if self._ancillary_vars.get(ds, None) and refresh is False: + return self._ancillary_vars[ds] + + # Invalidate the cache at all costs + self._ancillary_vars[ds] = [] + + for name, var in ds.variables.items(): + if hasattr(var, "ancillary_variables"): + for anc_name in var.ancillary_variables.split(" "): + if anc_name in ds.variables: + self._ancillary_vars[ds].append(anc_name) + + if hasattr(var, "grid_mapping"): + gm_name = var.grid_mapping + if gm_name in ds.variables: + self._ancillary_vars[ds].append(gm_name) + + return self._ancillary_vars[ds] + + def _find_clim_vars(self, ds, refresh=False): + """ + Returns a list of variables that are likely to be climatology variables based on CF §7.4 + + :param netCDF4.Dataset ds: An open netCDF dataset + :param bool refresh: if refresh is set to True, the cache is + invalidated. + :rtype: list + :return: A list containing strings with geophysical variable + names. + """ + + if self._clim_vars.get(ds, None) and refresh is False: + return self._clim_vars[ds] + + climatology_variable = cfutil.get_climatology_variable(ds) + if climatology_variable: + self._clim_vars[ds].append(climatology_variable) + + return self._clim_vars[ds] + + def _find_cf_standard_name_table(self, ds): + """ + Parse out the `standard_name_vocabulary` attribute and download that + version of the cf standard name table. If the standard name table has + already been downloaded, use the cached version. Modifies `_std_names` + attribute to store standard names. Returns True if the file exists and + False if it fails to download. + + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: bool + """ + # Get the standard name vocab + standard_name_vocabulary = getattr(ds, "standard_name_vocabulary", "") + + # Try to parse this attribute to get version + version = None + try: + if "cf standard name table" in standard_name_vocabulary.lower(): + version = [ + s.strip("(").strip(")").strip("v").strip(",") + for s in standard_name_vocabulary.split() + ] + # This assumes that table version number won't start with 0. + version = [ + s + for s in version + if s.isdigit() and len(s) <= 2 and not s.startswith("0") + ] + if len(version) > 1: + return False + else: + try: + version = version[0] + except IndexError: + warn( + "Cannot extract CF standard name version number " + "from standard_name_vocabulary string" + ) + return False + else: + # Can't parse the attribute, use the packaged version + return False + # usually raised from .lower() with an incompatible (non-string) + # data type + except AttributeError: + warn( + "Cannot convert standard name table to lowercase. This can " + "occur if a non-string standard_name_vocabulary global " + "attribute is supplied" + ) + return False + + if version.startswith("v"): # i.e 'v34' -> '34' drop the v + version = version[1:] + + # If the packaged version is what we're after, then we're good + if version == self._std_names._version: + print( + "Using packaged standard name table v{0}".format(version), + file=sys.stderr, + ) + return False + + # Try to download the version specified + try: + data_directory = util.create_cached_data_dir() + location = os.path.join( + data_directory, "cf-standard-name-table-test-{0}.xml".format(version) + ) + # Did we already download this before? 
+ if not os.path.isfile(location): + util.download_cf_standard_name_table(version, location) + print( + "Using downloaded standard name table v{0}".format(version), + file=sys.stderr, + ) + else: + print( + "Using cached standard name table v{0} from {1}".format( + version, location + ), + file=sys.stderr, + ) + + self._std_names = util.StandardNameTable(location) + return True + except Exception as e: + # There was an error downloading the CF table. That's ok, we'll just use the packaged version + warn( + "Problem fetching standard name table:\n{0}\n" + "Using packaged v{1}".format(e, self._std_names._version) + ) + return False + + def _find_coord_vars(self, ds, refresh=False): + """ + Returns a list of variable names that identify as coordinate variables. + + The result is cached by the passed in dataset object inside of this + checker. Pass refresh=True to redo the cached value. + + :param netCDF4.Dataset ds: An open netCDF dataset + :param bool refresh: if refresh is set to True, the cache is + invalidated. + :rtype: list + :return: A list of variables names (str) that are defined as coordinate + variables in the dataset ds. + """ + if ds in self._coord_vars and refresh is False: + return self._coord_vars[ds] + + self._coord_vars[ds] = cfutil.get_coordinate_variables(ds) + + return self._coord_vars[ds] + + def _find_geophysical_vars(self, ds, refresh=False): + """ + Returns a list of geophysical variables. Modifies + `self._geophysical_vars` + + :param netCDF4.Dataset ds: An open netCDF dataset + :param bool refresh: if refresh is set to True, the cache is + invalidated. + :rtype: list + :return: A list containing strings with geophysical variable + names. + """ + if self._geophysical_vars.get(ds, None) and refresh is False: + return self._geophysical_vars[ds] + + self._geophysical_vars[ds] = cfutil.get_geophysical_variables(ds) + + return self._geophysical_vars[ds] + + def _find_metadata_vars(self, ds, refresh=False): + """ + Returns a list of netCDF variable instances for those that are likely metadata variables + + :param netCDF4.Dataset ds: An open netCDF dataset + :param bool refresh: if refresh is set to True, the cache is + invalidated. + :rtype: list + :return: List of variable names (str) that are likely metadata + variable candidates. + + """ + if self._metadata_vars.get(ds, None) and refresh is False: + return self._metadata_vars[ds] + + self._metadata_vars[ds] = [] + for name, var in ds.variables.items(): + + if name in self._find_ancillary_vars(ds) or name in self._find_coord_vars( + ds + ): + continue + + if name in ( + "platform_name", + "station_name", + "instrument_name", + "station_id", + "platform_id", + "surface_altitude", + ): + self._metadata_vars[ds].append(name) + + elif getattr(var, "cf_role", "") != "": + self._metadata_vars[ds].append(name) + + elif ( + getattr(var, "standard_name", None) is None and len(var.dimensions) == 0 + ): + self._metadata_vars[ds].append(name) + + return self._metadata_vars[ds] + + def _get_coord_axis_map(self, ds): + """ + Returns a dictionary mapping each coordinate to a letter identifier + describing the _kind_ of coordinate. + + :param netCDF4.Dataset ds: An open netCDF dataset + + :rtype: dict + :return: A dictionary with variable names mapped to axis abbreviations, + i.e. {'longitude': 'X', ... 
'pressure': 'Z'} + """ + expected = ["T", "Z", "Y", "X"] + coord_vars = self._find_coord_vars(ds) + coord_axis_map = {} + + # L - Unlimited Coordinates + # T - Time coordinates + # Z - Depth/Altitude Coordinate + # Y - Y-Coordinate (latitude) + # X - X-Coordinate (longitude) + # A - Auxiliary Coordinate + # I - Instance Coordinate + + time_variables = cfutil.get_time_variables(ds) + lat_variables = cfutil.get_latitude_variables(ds) + lon_variables = cfutil.get_longitude_variables(ds) + z_variables = cfutil.get_z_variables(ds) + + for coord_name in coord_vars: + coord_var = ds.variables[coord_name] + axis = getattr(coord_var, "axis", None) + standard_name = getattr(coord_var, "standard_name", None) + + # Unlimited dimensions must come first + if ds.dimensions[coord_name].isunlimited(): + coord_axis_map[coord_name] = "L" + # axis takes precedence over standard_name + elif axis in expected: + coord_axis_map[coord_name] = axis + elif standard_name == "time": + coord_axis_map[coord_name] = "T" + elif standard_name == "longitude": + coord_axis_map[coord_name] = "X" + elif standard_name == "latitude": + coord_axis_map[coord_name] = "Y" + elif standard_name in ["height", "depth", "altitude"]: + coord_axis_map[coord_name] = "Z" + elif cfutil.is_compression_coordinate(ds, coord_name): + coord_axis_map[coord_name] = "C" + elif coord_name in time_variables: + coord_axis_map[coord_name] = "T" + elif coord_name in z_variables: + coord_axis_map[coord_name] = "Z" + elif coord_name in lat_variables: + coord_axis_map[coord_name] = "Y" + elif coord_name in lon_variables: + coord_axis_map[coord_name] = "X" + else: + # mark the coordinate variable as unknown + coord_axis_map[coord_name] = "U" + + for dimension in self._get_instance_dimensions(ds): + if dimension not in coord_axis_map: + coord_axis_map[dimension] = "I" + + # Dimensions of auxiliary coordinate variables will be marked with A. + # This is useful to help determine if the dimensions are used like a + # mapping from grid coordinates to physical lat/lon + for coord_name in self._find_aux_coord_vars(ds): + coord_var = ds.variables[coord_name] + # Skip label auxiliary coordinates + if hasattr(coord_var.dtype, "char") and coord_var.dtype.char == "S": + continue + elif coord_var.dtype == str: + continue + for dimension in coord_var.dimensions: + if dimension not in coord_axis_map: + coord_axis_map[dimension] = "A" + + # If a dimension does not have a coordinate variable mark it as unknown + # 'U' + for dimension in ds.dimensions: + if dimension not in coord_axis_map: + coord_axis_map[dimension] = "U" + + return coord_axis_map + + def _get_coord_vars(self, ds): + coord_vars = [] + for name, var in ds.variables.items(): + if (name,) == var.dimensions: + coord_vars.append(name) + return coord_vars + + def _get_dimension_order(self, ds, name, coord_axis_map): + """ + Returns a list of strings corresponding to the named axis of the dimensions for a variable. 
+ + Example:: + self._get_dimension_order(ds, 'temperature', coord_axis_map) + --> ['T', 'Y', 'X'] + + :param netCDF4.Dataset ds: An open netCDF dataset + :param str name: Name of the variable + :param dict coord_axis_map: A dictionary mapping each coordinate variable and dimension to a named axis + + :rtype: list + :return: A list of strings corresponding to the named axis of the dimensions for a variable + """ + + retval = [] + variable = ds.variables[name] + for dim in variable.dimensions: + retval.append(coord_axis_map[dim]) + return retval + + def _get_instance_dimensions(self, ds): + """ + Returns a list of dimensions marked as instance dimensions + + :param netCDF4.Dataset ds: An open netCDF dataset + + :rtype: list + :returns: A list of variable dimensions + """ + ret_val = [] + for variable in ds.get_variables_by_attributes( + cf_role=lambda x: isinstance(x, str) + ): + if variable.ndim > 0: + ret_val.append(variable.dimensions[0]) + return ret_val + + def _get_pretty_dimension_order(self, ds, name): + """ + Returns a comma separated string of the dimensions for a specified + variable + + :param netCDF4.Dataset ds: An open netCDF dataset + :param str name: A string with a valid NetCDF variable name for the + dataset + :rtype: str + :return: A comma separated string of the variable's dimensions + """ + dim_names = [] + for dim in ds.variables[name].dimensions: + dim_name = dim + if ds.dimensions[dim].isunlimited(): + dim_name += " (Unlimited)" + dim_names.append(dim_name) + return ", ".join(dim_names) + + def _get_pretty_dimension_order_with_type(self, ds, name, dim_types): + """ + Returns a comma separated string of the dimensions for a specified + variable of format "DIMENSIONS_NAME (DIMENSION_TYPE[, unlimited])" + + :param netCDF4.Dataset ds: An open netCDF dataset + :param str name: A string with a valid NetCDF variable name for the + dataset + :param list dim_types: A list of strings returned by + _get_dimension_order for the same "name" + :rtype: str + :return: A comma separated string of the variable's dimensions + """ + dim_names = [] + for dim, dim_type in zip(ds.variables[name].dimensions, dim_types): + dim_name = "{} ({}".format(dim, dim_type) + if ds.dimensions[dim].isunlimited(): + dim_name += ", unlimited)" + else: + dim_name += ")" + dim_names.append(dim_name) + return ", ".join(dim_names) + + def _is_station_var(self, var): + """ + Returns True if the NetCDF variable is associated with a station, False + otherwise. + + :param netCDF4.Variable var: a variable in an existing NetCDF dataset + :rtype: bool + :return: Status of whether variable appears to be associated with a + station + """ + + if getattr(var, "standard_name", None) in ( + "platform_name", + "station_name", + "instrument_name", + ): + return True + return False + + def _split_standard_name(self, standard_name): + """ + Returns a tuple of the standard_name and standard_name modifier + + Nones are used to represent the absence of a modifier or standard_name + + :rtype: tuple + :return: 2-tuple of standard_name and modifier as strings + """ + + if isinstance(standard_name, str) and " " in standard_name: + return standard_name.split(" ", 1) + # if this isn't a string, then it doesn't make sense to split + # -- treat value as standard name with no modifier + else: + return standard_name, None + + def check_appendix_a(self, ds): + """ + Validates a CF dataset against the contents of its Appendix A table for + attribute types and locations. 
Returns a list of results with the + outcomes of the Appendix A validation results against the existing + attributes in the docstring. + + :param netCDF4.Variable var: a variable in an existing NetCDF dataset + :param netCDF4.Dataset ds: An open netCDF dataset + :rtype: list + :return: A list of results corresponding to the results returned + """ + # if 'enable_appendix_a_checks' isn't specified in the checks, + # don't do anything on this check + results = [] + if "enable_appendix_a_checks" not in self.options: + return results + possible_global_atts = set(ds.ncattrs()).intersection(self.appendix_a.keys()) + attr_location_ident = { + "G": "global attributes", + "C": "coordinate data", + "D": "non-coordinate data", + } + + def att_loc_print_helper(att_letter): + """ + Returns a string corresponding to attr_location ident in + human-readable form. E.g. an input of 'G' will return + "global attributes (G)" + + :param str att_letter: An attribute letter corresponding to the + "Use" column in CF Appendix A + :rtype: str + :return: A string with a human-readable name followed by the input + letter specified + """ + + return "{} ({})".format( + attr_location_ident.get(att_letter, "other"), att_letter + ) + + def _att_loc_msg(att_loc): + """ + Helper method for formatting an error message when an attribute + appears in the improper location corresponding to the "Use" column + in CF Appendix A. + + :param set att_loc: A set with the possible valid locations of the + attribute corresponding to the "Use" column + in CF Appendix A + :rtype: str + :return: A human-readable string with the possible valid locations + of the attribute + """ + att_loc_len = len(att_loc) + # this is a fallback in case an empty att_loc is passed + # it generally should not occur + valid_loc = "no locations in the dataset" + loc_sort = sorted(att_loc) + if att_loc_len == 1: + valid_loc = att_loc_print_helper(loc_sort[0]) + elif att_loc_len == 2: + valid_loc = "{} and {}".format( + att_loc_print_helper(loc_sort[0]), att_loc_print_helper(loc_sort[1]) + ) + # shouldn't be reached under normal circumstances, as any attribute + # should be either G, C, or D but if another + # category is added, this will be useful. + else: + valid_loc = ", ".join(loc_sort[:-1]) + ", and {}".format( + att_loc_print_helper(loc_sort[-1]) + ) + return "This attribute may only appear in {}.".format(valid_loc) + + for global_att_name in possible_global_atts: + global_att = ds.getncattr(global_att_name) + att_dict = self.appendix_a[global_att_name] + att_loc = att_dict["attr_loc"] + valid_loc_warn = _att_loc_msg(att_loc) + if att_dict["cf_section"] is not None: + subsection_test = ".".join(att_dict["cf_section"].split(".")[:2]) + + section_loc = self.section_titles.get( + subsection_test, att_dict["cf_section"] + ) + else: + section_loc = None + test_ctx = TestCtx(BaseCheck.HIGH, section_loc) + + test_ctx.out_of += 1 + if "G" not in att_loc: + test_ctx.messages.append( + '[Appendix A] Attribute "{}" should not be present in global (G) ' + "attributes. 
{}".format(global_att_name, valid_loc_warn) + ) + else: + result = self._handle_dtype_check(global_att, global_att_name, att_dict) + if not result[0]: + test_ctx.messages.append(result[1]) + else: + test_ctx.score += 1 + results.append(test_ctx.to_result()) + + noncoord_vars = set(ds.variables) - set(self.coord_data_vars) + for var_set, coord_letter in ( + (self.coord_data_vars, "C"), + (noncoord_vars, "D"), + ): + for var_name in var_set: + var = ds.variables[var_name] + possible_attrs = set(var.ncattrs()).intersection(self.appendix_a.keys()) + for att_name in possible_attrs: + att_dict = self.appendix_a[att_name] + if att_dict["cf_section"] is not None: + subsection_test = ".".join( + att_dict["cf_section"].split(".")[:2] + ) + + section_loc = self.section_titles.get( + subsection_test, att_dict["cf_section"] + ) + else: + section_loc = None + test_ctx = TestCtx(BaseCheck.HIGH, section_loc, variable=var_name) + att_loc = att_dict["attr_loc"] + valid_loc_warn = _att_loc_msg(att_loc) + att = var.getncattr(att_name) + test_ctx.out_of += 1 + if coord_letter not in att_loc: + test_ctx.messages.append( + '[Appendix A] Attribute "{}" should not be present in {} ' + 'variable "{}". {}'.format( + att_name, + att_loc_print_helper(coord_letter), + var_name, + valid_loc_warn, + ) + ) + else: + result = self._handle_dtype_check(att, att_name, att_dict, var) + if not result[0]: + test_ctx.messages.append(result[1]) + else: + test_ctx.score += 1 + results.append(test_ctx.to_result()) + + return results + + def _check_attr_type(self, attr_name, attr_type, attribute, variable=None): + """ + Check if an attribute `attr` is of the type `attr_type`. Upon getting + a data type of 'D', the attr must have the same data type as the + variable it is assigned to. + + Attributes designated type 'S' must be of type `str`. 'N' require + numeric types, and 'D' requires the attribute type match the type + of the variable it is assigned to. + + :param str attr_name: name of attr being checked (to format message) + :param str attr_type: the correct type of the attribute + :param attribute: attribute to check + :param variable: if given, type should match attr + :rtype tuple + :return A two-tuple that contains pass/fail status as a boolean and + a message string (or None if unset) as the second element. 
+ """ + + if attr_type == "S": + if not isinstance(attribute, str): + return [False, "{} must be a string".format(attr_name)] + else: + # if it's not a string, it should have a numpy dtype + underlying_dtype = getattr(attribute, "dtype", None) + + # TODO check for np.nan separately + if underlying_dtype is None: + return [False, "{} must be a numeric type".format(attr_name)] + + # both D and N should be some kind of numeric value + is_numeric = np.issubdtype(underlying_dtype, np.number) + if attr_type == "N": + if not is_numeric: + return [False, "{} must be a numeric type".format(attr_name)] + elif attr_type == "D": + # TODO: handle edge case where variable is unset here + temp_ctx = TestCtx() + self._parent_var_attr_type_check(attr_name, variable, temp_ctx) + var_dtype = getattr(variable, "dtype", None) + if temp_ctx.messages: + return ( + False, + "{} must be numeric and must be equivalent to {} dtype".format( + attr_name, var_dtype + ), + ) + else: + # If we reached here, we fell off with an unrecognized type + return ( + False, + "{} has unrecognized type '{}'".format(attr_name, attr_type), + ) + # pass if all other possible failure conditions have been evaluated + return (True, None) + + def _handle_dtype_check(self, attribute, attr_name, attr_dict, variable=None): + """ + Helper function for Appendix A checks. + + :param attribute: The value of the attribute being checked + :param str attr_name: The name of the attribute being processed + :param dict attr_dict: The dict entry with type and attribute location + information corresponding to this attribute + :param variable: if given, the variable whose type to check against + :rtype: tuple + :return: A two-tuple that contains pass/fail status as a boolean and + a message string (or None if unset) as the second element. + """ + attr_type = attr_dict["Type"] + if variable is None and "G" not in attr_dict["attr_loc"]: + raise ValueError( + "Non-global attributes must be associated with a " " variable" + ) + attr_str = ( + "Global attribute {}".format(attr_name) + if "G" in attr_dict["attr_loc"] and variable is None + else "Attribute {} in variable {}".format(attr_name, variable.name) + ) + + # check the type + return_value = self._check_attr_type(attr_name, attr_type, attribute, variable) + + # if the second element is a string, format it + if isinstance(return_value[1], str): + return_value[1] = return_value[1].format(attr_str) + + # convert to tuple for immutability and return + return tuple(return_value) + + +class CFNCCheck(BaseNCCheck, CFBaseCheck): + """Inherits from both BaseNCCheck and CFBaseCheck to support + checking netCDF datasets. 
Must inherit in this order, or certain
+    attributes from BaseNCCheck (like supported_ds) will not be passed to
+    CFNCCheck."""
+
+    @classmethod
+    def beliefs(cls):  # @TODO
+        return {}
+
+
+appendix_a_base = {
+    "Conventions": {"Type": "S", "attr_loc": {"G"}, "cf_section": None},
+    "_FillValue": {"Type": "D", "attr_loc": {"D", "C"}, "cf_section": None},
+    "add_offset": {"Type": "N", "attr_loc": {"D"}, "cf_section": "8.1"},
+    "ancillary_variables": {"Type": "S", "attr_loc": {"D"}, "cf_section": "3.4"},
+    "axis": {"Type": "S", "attr_loc": {"C"}, "cf_section": "4"},
+    "bounds": {"Type": "S", "attr_loc": {"C"}, "cf_section": "7.1"},
+    "calendar": {"Type": "S", "attr_loc": {"C"}, "cf_section": "4.4.1"},
+    "cell_measures": {"Type": "S", "attr_loc": {"D"}, "cf_section": "7.2"},
+    "cell_methods": {"Type": "S", "attr_loc": {"D"}, "cf_section": "7.3"},
+    # cf_role's type is "C" in the CF document, which does not correspond
+    # to the types used here; replaced with "S"
+    "cf_role": {"Type": "S", "attr_loc": {"C"}, "cf_section": "9.5"},
+    "climatology": {"Type": "S", "attr_loc": {"C"}, "cf_section": "7.4"},
+    # the "comment" attribute entry was removed in this implementation
+    "compress": {"Type": "S", "attr_loc": {"C"}, "cf_section": "8.2"},
+    "coordinates": {"Type": "S", "attr_loc": {"D"}, "cf_section": "5"},
+    # featureType's type is "C" in the CF document, which does not
+    # correspond to the types used here; replaced with "S"
+    "featureType": {"Type": "S", "attr_loc": {"G"}, "cf_section": "9.4"},
+    "flag_masks": {"Type": "D", "attr_loc": {"D"}, "cf_section": "3.5"},
+    "flag_meanings": {"Type": "S", "attr_loc": {"D"}, "cf_section": "3.5"},
+    "flag_values": {"Type": "D", "attr_loc": {"D"}, "cf_section": "3.5"},
+    "formula_terms": {"Type": "S", "attr_loc": {"C"}, "cf_section": "4.3.2"},
+    "grid_mapping": {"Type": "S", "attr_loc": {"D"}, "cf_section": "5.6"},
+    "history": {"Type": "S", "attr_loc": {"G"}, "cf_section": None},
+    # 'instance_dimension': {'Type': 'N', 'attr_loc': {'D'}, 'cf_section': '9.3'},
+    "institution": {"Type": "S", "attr_loc": {"G", "D"}, "cf_section": "2.6.2"},
+    "leap_month": {"Type": "N", "attr_loc": {"C"}, "cf_section": "4.4.1"},
+    "leap_year": {"Type": "N", "attr_loc": {"C"}, "cf_section": "4.4.1"},
+    "long_name": {"Type": "S", "attr_loc": {"D", "C"}, "cf_section": "3.2"},
+    "missing_value": {"Type": "D", "attr_loc": {"D", "C"}, "cf_section": "2.5.1"},
+    "month_lengths": {"Type": "N", "attr_loc": {"C"}, "cf_section": "4.4.1"},
+    "positive": {"Type": "S", "attr_loc": {"C"}, "cf_section": None},
+    "references": {"Type": "S", "attr_loc": {"G", "D"}, "cf_section": "2.6.2"},
+    # 'sample_dimension': {'Type': 'N', 'attr_loc': {'D'}, 'cf_section': '9.3'},
+    "scale_factor": {"Type": "N", "attr_loc": {"D"}, "cf_section": "8.1"},
+    "source": {"Type": "S", "attr_loc": {"G", "D"}, "cf_section": "2.6.2"},
+    "standard_error_multiplier": {"Type": "N", "attr_loc": {"D"}, "cf_section": None},
+    "standard_name": {"Type": "S", "attr_loc": {"D", "C"}, "cf_section": "3.3"},
+    "title": {"Type": "S", "attr_loc": {"G"}, "cf_section": None},
+    "units": {"Type": "S", "attr_loc": {"D", "C"}, "cf_section": "3.1"},
+    "valid_max": {"Type": "N", "attr_loc": {"D", "C"}, "cf_section": None},
+    "valid_min": {"Type": "N", "attr_loc": {"D", "C"}, "cf_section": None},
+    "valid_range": {"Type": "N", "attr_loc": {"D", "C"}, "cf_section": None},
+}
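A minimal sketch of consulting the table above, as the Appendix A checks do; for example, units must be a string ("S") and may only appear on data ("D") and coordinate ("C") variables:

entry = appendix_a_base["units"]
assert entry["Type"] == "S"             # must be a string
assert entry["attr_loc"] == {"D", "C"}  # data and coordinate variables only
assert entry["cf_section"] == "3.1"     # reported under §3.1 Units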
diff --git a/compliance_checker/cf/util.py b/compliance_checker/cf/util.py
index e10c8981..4e97a380 100644
--- a/compliance_checker/cf/util.py
+++ b/compliance_checker/cf/util.py
@@ -13,7 +13,7 @@
 from cf_units import Unit
 from lxml import etree
-from netCDF4 import Dimension, Variable
+from netCDF4 import Dataset, Dimension, Variable
 from pkg_resources import resource_filename
@@ -235,6 +235,18 @@ def get_safe(dict_instance, keypath, default=None):
     except Exception:
         return default
 
+
+class VariableReferenceError(Exception):
+    """Exception raised for references to variables missing from a dataset"""
+
+    def __init__(self, name: str, dataset: Dataset = None):
+        self.name = name
+        self.dataset_path = dataset.filepath() if dataset is not None else None
+
+    def __str__(self):
+        return (
+            f"Cannot find variable named {self.name} in dataset "
+            f"{self.dataset_path}"
+        )
+
 
 class NCGraph(object):
     def __init__(
@@ -562,3 +574,37 @@ def is_vertical_coordinate(var_name, var):
     if not is_pressure:
         satisfied |= getattr(var, "positive", "").lower() in ("up", "down")
     return satisfied
+
+
+def string_from_var_type(variable):
+    if isinstance(variable, str):
+        return variable[:]
+    elif variable.dtype.kind == "S":
+        strip_char = variable.fill_value or b"\x00"
+        return variable.tobytes().rstrip(strip_char).decode("utf-8")
+    else:
+        raise TypeError(
+            f"Variable '{variable.name}' has non-string/character "
+            f"dtype {variable.dtype}"
+        )
+
+
+def reference_attr_variables(
+    dataset: Dataset, attributes_string: str, split_by: str = None
+):
+    """
+    Attempts to reference variables named in the string, optionally splitting
+    the string by a delimiter first
+    """
+    if attributes_string is None:
+        return None
+    elif split_by is None:
+        return dataset.variables.get(
+            attributes_string, VariableReferenceError(attributes_string)
+        )
+    else:
+        string_proc = attributes_string.split(split_by)
+        return [
+            dataset.variables.get(var_name, VariableReferenceError(var_name))
+            for var_name in string_proc
+        ]
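A quick sketch of the lookup behavior of reference_attr_variables: names that resolve come back as netCDF variables, and missing names come back as VariableReferenceError instances rather than raising, so a caller can report every bad reference at once. The in-memory dataset here is purely illustrative:

from netCDF4 import Dataset

from compliance_checker.cf.util import (
    VariableReferenceError,
    reference_attr_variables,
)

ds = Dataset("example.nc", "w", diskless=True)
ds.createDimension("t", 1)
ds.createVariable("x", "f8", ("t",))
ds.createVariable("y", "f8", ("t",))

refs = reference_attr_variables(ds, "x y z", split_by=" ")
for ref in refs:
    print(type(ref).__name__, getattr(ref, "name", ref))
# prints Variable x, Variable y, then VariableReferenceError z for the
# missing name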
"trajectory": + elif feature_type == "trajectory": return self._check_feattype_trajectory_cf_role(ds) - elif featType == "trajectoryprofile": - return self._check_feattype_trajprof_cf_role(ds) + elif feature_type == "trajectoryprofile": + return self._check_feattype_trajectoryprof_cf_role(ds) - elif featType == "profile": + elif feature_type == "profile": return self._check_feattype_profile_cf_role(ds) - elif featType == "point": - return good_result # can't do anything + elif feature_type == "point": + return Result( + BaseCheck.MEDIUM, + True, + "CF DSG: featureType=trajectoryProfile") else: return Result( @@ -965,8 +970,9 @@ def check_cf_role_variables(self, ds): "CF DSG: Unknown featureType", [ ( - f"Invalid featureType '{featType}'; please see the " - "IOOS 1.2 Profile and CF-1.7 Conformance documents for valid featureType" + f"Invalid featureType '{feature_type_attr}'; " + "please see the IOOS 1.2 Profile and CF-1.7 " + "Conformance documents for valid featureType" ) ], ) @@ -1106,7 +1112,7 @@ def _check_feattype_trajectoryprof_cf_role(self, ds): trj_prof_msg = ( "Dimension length of non-platform variable with cf_role={cf_role} " "(the '{dim_type}' dimension) is {dim_len}. " - "The IOOS profile restricts trjectory and trajectoryProfile " + "The IOOS profile restricts trajectory and trajectoryProfile " "datasets to a single platform (ie. trajectory) per dataset " "(the profile dimension is permitted to be >= 1)." ) diff --git a/compliance_checker/suite.py b/compliance_checker/suite.py index 066236a1..61c548e0 100644 --- a/compliance_checker/suite.py +++ b/compliance_checker/suite.py @@ -28,7 +28,7 @@ from compliance_checker import MemoizedDataset, __version__, tempnc from compliance_checker.base import BaseCheck, GenericFile, Result, fix_return_value -from compliance_checker.cf.cf import CFBaseCheck +from compliance_checker.cf.cf_base import CFBaseCheck from compliance_checker.protocols import cdl, erddap, netcdf, opendap @@ -677,13 +677,16 @@ def reasoning_routine(self, groups, check, priority_flag=3, _top_level=True): print out the appropriate header string """ - sort_fn = lambda x: x.weight - groups_sorted = sorted(groups, key=sort_fn, reverse=True) + def weight_sort(result): + return result.weight + + groups_sorted = sorted(groups, key=weight_sort, reverse=True) # create dict of the groups -> {level: [reasons]} result = { key: [v for v in valuesiter if v.value[0] != v.value[1]] - for key, valuesiter in itertools.groupby(groups_sorted, key=sort_fn) + for key, valuesiter in itertools.groupby(groups_sorted, + key=weight_sort) } priorities = self.checkers[check]._cc_display_headers @@ -728,7 +731,8 @@ def process_table(res, check): print("{:^{width}}".format(level_name, width=width)) print("-" * width) - data_issues = [process_table(res, check) for res in result[level]] + data_issues = [process_table(res, check) for res in + result[level]] has_printed = False for issue, reasons in data_issues: @@ -738,7 +742,8 @@ def process_table(res, check): print("") # join alphabetized reasons together reason_str = "\n".join( - "* {}".format(r) for r in sorted(reasons, key=lambda x: x[0]) + "* {}".format(r) for r in sorted(reasons, + key=lambda x: x[0]) ) proc_str = "{}\n{}".format(issue, reason_str) print(proc_str) @@ -827,7 +832,7 @@ def check_remote_netcdf(self, ds_str): return MemoizedDataset( urlparse(response.url).path, memory=response.content ) - except OSError as e: + except OSError: # handle case when netCDF C libs weren't compiled with # in-memory support by using tempfile with 
tempnc(response.content) as _nc: diff --git a/compliance_checker/tests/conftest.py b/compliance_checker/tests/conftest.py index d51d3898..348e76d6 100644 --- a/compliance_checker/tests/conftest.py +++ b/compliance_checker/tests/conftest.py @@ -21,7 +21,7 @@ def glob_down(pth, suffix, lvls): def generate_dataset(cdl_path, nc_path): - subprocess.call(["ncgen", "-o", str(nc_path), str(cdl_path)]) + subprocess.call(["ncgen", "-4", "-o", str(nc_path), str(cdl_path)]) def static_files(cdl_stem): diff --git a/compliance_checker/tests/data/examples/bio_taxa.cdl b/compliance_checker/tests/data/examples/bio_taxa.cdl new file mode 100644 index 00000000..298fc967 --- /dev/null +++ b/compliance_checker/tests/data/examples/bio_taxa.cdl @@ -0,0 +1,22 @@ +netcdf { + dimensions: + time = 5 ; + string80 = 80 ; + taxon = 2 ; + variables: + float time(time); + time:standard_name = "time" ; + time:units = "days since 2019-01-01" ; + float abundance(time,taxon) ; + abundance:standard_name = "number_concentration_of_organisms_in_taxon_in_sea_water" ; + abundance:coordinates = "taxon_lsid taxon_name" ; + char taxon_name(taxon,string80) ; + taxon_name:standard_name = "biological_taxon_name" ; + char taxon_lsid(taxon,string80) ; + taxon_lsid:standard_name = "biological_taxon_lsid" ; + data: + time = 1., 2., 3., 4., 5. ; + abundance = 1., 2., 3., 4., 5., 6., 7., 8., 9., 10. ; + taxon_name = "Calanus finmarchicus", "Calanus helgolandicus" ; + taxon_lsid = "urn:lsid:marinespecies.org:taxname:104464", "urn:lsid:marinespecies.org:taxname:104466" ; +} \ No newline at end of file diff --git a/compliance_checker/tests/data/line_geometry.cdl b/compliance_checker/tests/data/line_geometry.cdl new file mode 100644 index 00000000..f10bad88 --- /dev/null +++ b/compliance_checker/tests/data/line_geometry.cdl @@ -0,0 +1,52 @@ +netcdf geometry { +dimensions: + instance = 2 ; + node = 5 ; + time = 4 ; +variables: + int time(time) ; + time:units = "days since 2000-01-01" ; + double lat(instance) ; + lat:units = "degrees_north" ; + lat:standard_name = "latitude" ; + lat:nodes = "y" ; + double lon(instance) ; + lon:units = "degrees_east" ; + lon:standard_name = "longitude" ; + lon:nodes = "x" ; + int datum ; + datum:grid_mapping_name = "latitude_longitude" ; + datum:longitude_of_prime_meridian = 0.0 ; + datum:semi_major_axis = 6378137.0 ; + datum:inverse_flattening = 298.257223563 ; + int geometry_container ; + geometry_container:geometry_type = "line" ; + geometry_container:node_count = "node_count" ; + geometry_container:node_coordinates = "x y" ; + int node_count(instance) ; + double x(node) ; + x:units = "degrees_east" ; + x:standard_name = "longitude" ; + x:axis = "X" ; + double y(node) ; + y:units = "degrees_north" ; + y:standard_name = "latitude" ; + y:axis = "Y" ; + double someData(instance, time) ; + someData:coordinates = "time lat lon" ; + someData:grid_mapping = "datum" ; + someData:geometry = "geometry_container" ; +// global attributes: + :Conventions = "CF-1.8" ; + :featureType = "timeSeries" ; +data: + time = 1, 2, 3, 4 ; + lat = 30, 50 ; + lon = 10, 60 ; + someData = + 1, 2, 3, 4, + 1, 2, 3, 4 ; + node_count = 3, 2 ; + x = 30, 10, 40, 50, 50 ; + y = 10, 30, 40, 60, 50 ; +} diff --git a/compliance_checker/tests/data/polygon_geometry.cdl b/compliance_checker/tests/data/polygon_geometry.cdl new file mode 100644 index 00000000..72ac76df --- /dev/null +++ b/compliance_checker/tests/data/polygon_geometry.cdl @@ -0,0 +1,61 @@ +netcdf polygon_geometry { +dimensions: + node = 12 ; + instance = 2 ; + part = 4 ; + time = 4 ; 
+variables:
+	int time(time) ;
+	time:units = "days since 2000-01-01" ;
+	double x(node) ;
+	x:units = "degrees_east" ;
+	x:standard_name = "longitude" ;
+	x:axis = "X" ;
+	double y(node) ;
+	y:units = "degrees_north" ;
+	y:standard_name = "latitude" ;
+	y:axis = "Y" ;
+	double lat(instance) ;
+	lat:units = "degrees_north" ;
+	lat:standard_name = "latitude" ;
+	lat:nodes = "y" ;
+	double lon(instance) ;
+	lon:units = "degrees_east" ;
+	lon:standard_name = "longitude" ;
+	lon:nodes = "x" ;
+	float geometry_container ;
+	geometry_container:geometry_type = "polygon" ;
+	geometry_container:node_count = "node_count" ;
+	geometry_container:node_coordinates = "x y" ;
+	geometry_container:grid_mapping = "datum" ;
+	geometry_container:coordinates = "lat lon" ;
+	geometry_container:part_node_count = "part_node_count" ;
+	geometry_container:interior_ring = "interior_ring" ;
+	int node_count(instance) ;
+	int part_node_count(part) ;
+	int interior_ring(part) ;
+	float datum ;
+	datum:grid_mapping_name = "latitude_longitude" ;
+	datum:semi_major_axis = 6378137. ;
+	datum:inverse_flattening = 298.257223563 ;
+	datum:longitude_of_prime_meridian = 0. ;
+	double someData(instance, time) ;
+	someData:coordinates = "time lat lon" ;
+	someData:grid_mapping = "datum" ;
+	someData:geometry = "geometry_container" ;
+// global attributes:
+	:Conventions = "CF-1.8" ;
+	:featureType = "timeSeries" ;
+data:
+	time = 1, 2, 3, 4 ;
+	x = 20, 10, 0, 5, 10, 15, 20, 10, 0, 50, 40, 30 ;
+	y = 0, 15, 0, 5, 10, 5, 20, 35, 20, 0, 15, 0 ;
+	lat = 25, 7 ;
+	lon = 10, 40 ;
+	node_count = 9, 3 ;
+	part_node_count = 3, 3, 3, 3 ;
+	interior_ring = 0, 1, 0, 0 ;
+	someData =
+		1, 2, 3, 4,
+		1, 2, 3, 4 ;
+}
diff --git a/compliance_checker/tests/data/taxonomy_example.cdl b/compliance_checker/tests/data/taxonomy_example.cdl
new file mode 100644
index 00000000..83e6670e
--- /dev/null
+++ b/compliance_checker/tests/data/taxonomy_example.cdl
@@ -0,0 +1,28 @@
+netcdf taxonomy_example {
+dimensions:
+	time = 100 ;
+	string80 = 80 ;
+	taxon = 2 ;
+variables:
+	float time(time);
+	time:standard_name = "time" ;
+	time:units = "days since 2019-01-01" ;
+	float abundance(time,taxon) ;
+	// the standard name below, from the example in the CF Conventions
+	// document, does not appear to be a valid standard name
+	//abundance:standard_name = "number_concentration_of_organisms_in_taxon_in_sea_water" ;
+	abundance:standard_name = "number_concentration_of_biological_taxon_in_sea_water" ;
+	// units were also not specified in the example CDL in the CF Conventions
+	// document; include them here
+	abundance:units = "m-3" ;
+	abundance:coordinates = "taxon_lsid taxon_name" ;
+	char taxon_name(taxon,string80) ;
+	taxon_name:standard_name = "biological_taxon_name" ;
+	char taxon_lsid(taxon,string80) ;
+	taxon_lsid:standard_name = "biological_taxon_lsid" ;
+data:
+	//time = // 100 values ;
+	//abundance = // 200 values ;
+	taxon_name = "Calanus finmarchicus", "Calanus helgolandicus" ;
+	taxon_lsid = "urn:lsid:marinespecies.org:taxname:104464", "urn:lsid:marinespecies.org:taxname:104466" ;
+}
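The taxon_lsid entries above follow the LSID layout urn:lsid:&lt;authority&gt;:&lt;namespace&gt;:&lt;object_id&gt;[:&lt;revision&gt;]. A standalone sketch of the kind of structural sanity check performed before querying a resolver such as WoRMS or ITIS (the helper name here is illustrative, not part of the patch):

def split_lsid(lsid: str):
    """Return (authority, namespace, object_id) or raise ValueError."""
    parts = lsid.split(":")
    if len(parts) not in (5, 6) or parts[0] != "urn" or parts[1] != "lsid":
        raise ValueError(f"Malformed LSID: {lsid}")
    return parts[2], parts[3], parts[4]

print(split_lsid("urn:lsid:marinespecies.org:taxname:104464"))
# ('marinespecies.org', 'taxname', '104464')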
+        return self._arr[idx]
+
+
+    def __setitem__(self, idx, val):
+        self._arr[idx] = val
+
+
     def ncattrs(self):
         return [
             att
diff --git a/compliance_checker/tests/resources.py b/compliance_checker/tests/resources.py
index 640f1fe1..dd3df549 100644
--- a/compliance_checker/tests/resources.py
+++ b/compliance_checker/tests/resources.py
@@ -16,10 +16,11 @@ def get_filename(path):

 def generate_dataset(cdl_path, nc_path):
-    subprocess.call(["ncgen", "-o", nc_path, cdl_path])
+    subprocess.call(["ncgen", "-4", "-o", nc_path, cdl_path])


 STATIC_FILES = {
+    # TODO: add defaultdict implementation for default files, etc.
     "bad": get_filename("tests/data/non-comp/bad.cdl"),
     "badname": get_filename("tests/data/non-comp/badname.netcdf"),
     "bad-instance": get_filename("tests/data/bad-instance.cdl"),
@@ -79,6 +80,7 @@ def generate_dataset(cdl_path, nc_path):
     "ioos_gold_1_1": get_filename("tests/data/ioos_1_1.cdl"),
     "kibesillah": get_filename("tests/data/examples/kibesillah.cdl"),
     "l01-met": get_filename("tests/data/examples/l01-met.cdl"),
+    "line_geometry": get_filename("tests/data/line_geometry.cdl"),
     "mapping": get_filename("tests/data/mapping.cdl"),
     "multi-dim-coordinates": get_filename("tests/data/multi-dim-coordinates.cdl"),
     "multi-timeseries-orthogonal": get_filename(
@@ -95,6 +97,7 @@ def generate_dataset(cdl_path, nc_path):
     "ocos": get_filename("tests/data/examples/ocos.cdl"),
     "ooi_glider": get_filename("tests/data/examples/ooi_glider.cdl"),
     "point": get_filename("tests/data/point.cdl"),
+    "polygon_geometry": get_filename("tests/data/polygon_geometry.cdl"),
     "profile-orthogonal": get_filename("tests/data/profile-orthogonal.cdl"),
     "profile-incomplete": get_filename("tests/data/profile-incomplete.cdl"),
     "pr_inundation": get_filename("tests/data/examples/pr_inundation.cdl"),
@@ -111,6 +114,7 @@ def generate_dataset(cdl_path, nc_path):
     "string": get_filename("tests/data/string_type_variable.cdl"),
     "swan": get_filename("tests/data/examples/swan.cdl"),
     "sp041": get_filename("tests/data/examples/sp041.cdl"),
+    "taxonomy_example": get_filename("tests/data/taxonomy_example.cdl"),
     "timeseries": get_filename("tests/data/timeseries.cdl"),
     "timeseries-profile-single-station": get_filename(
         "tests/data/timeseries-profile-single-station.cdl"
diff --git a/compliance_checker/tests/test_cf.py b/compliance_checker/tests/test_cf.py
index 5938d9fc..90786f80 100644
--- a/compliance_checker/tests/test_cf.py
+++ b/compliance_checker/tests/test_cf.py
@@ -11,12 +11,13 @@
 import numpy as np
 import pytest
-from netCDF4 import Dataset
+from netCDF4 import Dataset, stringtoarr

 from compliance_checker import cfutil
-from compliance_checker.cf import (
+from compliance_checker.cf.cf import (
     CF1_6Check,
     CF1_7Check,
+    CF1_8Check,
     dimless_vertical_coordinates_1_6,
     dimless_vertical_coordinates_1_7,
 )
@@ -32,11 +33,11 @@
 )
 from compliance_checker.suite import CheckSuite
 from compliance_checker.tests import BaseTestCase
-from compliance_checker.tests.helpers import (
-    MockRaggedArrayRepr,
-    MockTimeSeries,
-    MockVariable,
-)
+from compliance_checker.tests.helpers import (MockRaggedArrayRepr,
+                                              MockTimeSeries, MockVariable)
+import requests_mock
+import json
+import re

 from compliance_checker.tests.resources import STATIC_FILES

@@ -109,17 +110,6 @@ def test_coord_data_vars(self):
         # present in coord_data_vars
         self.assertEqual(self.cf.coord_data_vars, {"time", "sigma"})

-    def load_dataset(self, nc_dataset):
-        """
-        Return a loaded NC Dataset for the given path
-        """
-        if not isinstance(nc_dataset, str):
-            raise
ValueError("nc_dataset should be a string") - - nc_dataset = Dataset(nc_dataset, "r") - self.addCleanup(nc_dataset.close) - return nc_dataset - # -------------------------------------------------------------------------------- # Compliance Tests # -------------------------------------------------------------------------------- @@ -564,6 +554,17 @@ def test_climatology_cell_methods(self): score, out_of, messages = get_results(results) self.assertEqual(score, out_of) + bad_dim_ds = MockTimeSeries() + bad_dim_ds.createDimension("clim_bounds", 3) + + temp = bad_dim_ds.createVariable("temperature", "f8", ("time",)) + bad_dim_ds.createVariable("clim_bounds", "f8", ("time")) + temp.climatology = "clim_bounds" + results = self.cf.check_climatological_statistics(bad_dim_ds) + assert results[0].value[0] < results[0].value[1] + assert (results[0].msgs[0] == 'Climatology dimension "clim_bounds" ' + "should only contain two elements") + def test_check_ancillary_variables(self): """ Test to ensure that ancillary variables are properly checked @@ -2255,6 +2256,279 @@ def test_check_add_offset_scale_factor_type(self): self.assertFalse(r[0].msgs) +class TestCF1_8(BaseTestCase): + def setUp(self): + self.cf = CF1_8Check() + + def test_point_geometry_simple(self): + dataset = MockTimeSeries() + fake_data = dataset.createVariable("someData", "f8", ("time",)) + fake_data.geometry = "geometry" + x = dataset.createVariable("x", "f8", ()) + y = dataset.createVariable("y", "f8", ()) + geom_var = dataset.createVariable("geometry", "i4", ()) + geom_var.geometry_type = "point" + geom_var.node_coordinates = "x y" + x[:] = 1 + y[:] = 1 + self.cf.check_geometry(dataset) + + def test_point_geometry_multiple(self): + dataset = MockTimeSeries() + dataset.createDimension("point_count", 3) + fake_data = dataset.createVariable("someData", "f8", ("time",)) + fake_data.geometry = "geometry" + x = dataset.createVariable("x", "f8", ("point_count",)) + y = dataset.createVariable("y", "f8", ("point_count",)) + geom_var = dataset.createVariable("geometry", "i4", ()) + geom_var.geometry_type = "point" + geom_var.node_coordinates = "x y" + x[:] = np.array([10, 20, 30]) + y[:] = np.array([30, 35, 21]) + results = self.cf.check_geometry(dataset) + assert results[0].value[0] == results[0].value[1] + dataset.createDimension("point_count_2", 2) + # can't recreate y, even with del issued first + y2 = dataset.createVariable("y2", "f8", ("point_count_2",)) + geom_var.node_coordinates = "x y2" + y2[:] = np.array([30, 35]) + results = self.cf.check_geometry(dataset) + assert results[0].value[0] < results[0].value[1] + + def test_line_geometry(self): + dataset = self.load_dataset(STATIC_FILES["line_geometry"]) + self.cf.check_geometry(dataset) + + def test_polygon_geometry(self): + dataset = self.load_dataset(STATIC_FILES["polygon_geometry"]) + self.cf.check_geometry(dataset) + dataset.variables["interior_ring"] = MockVariable( + dataset.variables["interior_ring"] + ) + # Flip sign indicator for interior rings. 
Should cause failure + flip_ring_bits = (dataset.variables["interior_ring"][:] == 0).astype(int) + dataset.variables["interior_ring"][:] = flip_ring_bits + results = self.cf.check_geometry(dataset) + # There should be messages regarding improper polygon order + assert results[0].value[0] < results[0].value[1] + assert results[0].msgs + + def test_bad_lsid(self): + """ + Tests malformed and nonexistent LSIDs + """ + dataset = MockTimeSeries() + # TODO: handle scalar dimension + dataset.createDimension("taxon", 1) + abundance = dataset.createVariable("abundance", "f8", ("time",)) + abundance.standard_name = ( + "number_concentration_of_biological_taxon_in_sea_water" + ) + abundance.units = "m-3" + abundance.coordinates = "taxon_name taxon_lsid" + taxon_name = dataset.createVariable("taxon_name", str, ("taxon",)) + taxon_name.standard_name = "biological_taxon_name" + taxon_lsid = dataset.createVariable("taxon_lsid", str, ("taxon",)) + taxon_lsid.standard_name = "biological_taxon_lsid" + taxon_name[0] = "Esox lucius" + taxon_lsid[0] = "urn:lsid:itis.gov:itis_tsn:99999999999" + with requests_mock.Mocker() as m: + # bad ID + taxon_lsid[0] = "99999999999" + m.get( + "http://www.lsid.info/urn:lsid:marinespecies.org:taxname:99999999999", + status_code=400, + text="400 Bad Request</head><body><h1>Bad Request</h1><p>Unknown LSID</p></body></html>", + ) + results = self.cf.check_taxa(dataset) + assert len(results) == 1 + messages = results[0].msgs + assert results[0].value[0] < results[0].value[1] + assert len(messages) == 1 + taxon_lsid[ + 0 + ] = "http://www.lsid.info/urn:lsid:marinespecies.org:taxname:99999999999" + results = self.cf.check_taxa(dataset) + assert messages[0].startswith( + "Taxon id must match one of the following forms:" + ) + assert results[0].value[0] < results[0].value[1] + + def test_taxonomy_data_worms_valid(self): + """ + Tests taxonomy data with a mocked pyworms call + """ + with requests_mock.Mocker() as m: + # assume LSID lookups for WoRMS return valid HTTP status code + m.get( + re.compile( + r"^http://www.lsid.info/urn:lsid:marinespecies.org:taxname:\d+$" + ) + ) + response_1 = json.dumps( + { + "AphiaID": 104464, + "url": "http://www.marinespecies.org/aphia.php?p=taxdetails&id=104464", + "scientificname": "Calanus finmarchicus", + "authority": "(Gunnerus, 1770)", + "status": "accepted", + "unacceptreason": None, + "taxonRankID": 220, + "rank": "Species", + "valid_AphiaID": 104464, + "valid_name": "Calanus finmarchicus", + "valid_authority": "(Gunnerus, 1770)", + "parentNameUsageID": 104152, + "kingdom": "Animalia", + "phylum": "Arthropoda", + "class": "Hexanauplia", + "order": "Calanoida", + "family": "Calanidae", + "genus": "Calanus", + "citation": "Walter, T.C.; Boxshall, G. (2021). World of Copepods Database. Calanus finmarchicus (Gunnerus, 1770). 
Accessed through: World Register of Marine Species at: http://www.marinespecies.org/aphia.php?p=taxdetails&id=104464 on 2021-11-11", + "lsid": "urn:lsid:marinespecies.org:taxname:104464", + "isMarine": 1, + "isBrackish": 0, + "isFreshwater": 0, + "isTerrestrial": 0, + "isExtinct": None, + "match_type": "exact", + "modified": "2020-10-06T15:25:25.040Z", + } + ) + m.get( + "http://www.marinespecies.org/rest/AphiaRecordByAphiaID/104464", + text=response_1, + ) + response_2 = json.dumps( + { + "AphiaID": 104466, + "url": "http://www.marinespecies.org/aphia.php?p=taxdetails&id=104466", + "scientificname": "Calanus helgolandicus", + "authority": "(Claus, 1863)", + "status": "accepted", + "unacceptreason": None, + "taxonRankID": 220, + "rank": "Species", + "valid_AphiaID": 104466, + "valid_name": "Calanus helgolandicus", + "valid_authority": "(Claus, 1863)", + "parentNameUsageID": 104152, + "kingdom": "Animalia", + "phylum": "Arthropoda", + "class": "Hexanauplia", + "order": "Calanoida", + "family": "Calanidae", + "genus": "Calanus", + "citation": "Walter, T.C.; Boxshall, G. (2021). World of Copepods Database. Calanus helgolandicus (Claus, 1863). Accessed through: World Register of Marine Species at: http://www.marinespecies.org/aphia.php?p=taxdetails&id=104466 on 2021-11-11", + "lsid": "urn:lsid:marinespecies.org:taxname:104466", + "isMarine": 1, + "isBrackish": 0, + "isFreshwater": 0, + "isTerrestrial": 0, + "isExtinct": None, + "match_type": "exact", + "modified": "2004-12-21T15:54:05Z", + } + ) + m.get( + "http://www.marinespecies.org/rest/AphiaRecordByAphiaID/104466", + text=response_2, + ) + dataset = self.load_dataset(STATIC_FILES["taxonomy_example"]) + + results = self.cf.check_taxa(dataset) + assert len(results) == 1 + assert results[0].value[0] == results[0].value[1] + + def test_taxonomy_data_itis_valid(self): + """ + Tests taxonomy data with a mocked ITIS call + """ + dataset = MockTimeSeries() + # TODO: handle scalar dimension + dataset.createDimension("taxon", 1) + abundance = dataset.createVariable("abundance", "f8", ("time",)) + abundance.standard_name = ( + "number_concentration_of_biological_taxon_in_sea_water" + ) + abundance.units = "m-3" + abundance.coordinates = "taxon_name taxon_lsid" + taxon_name = dataset.createVariable("taxon_name", str, ("taxon",)) + taxon_name.standard_name = "biological_taxon_name" + taxon_lsid = dataset.createVariable("taxon_lsid", str, ("taxon",)) + taxon_lsid.standard_name = "biological_taxon_lsid" + taxon_name[0] = "Esox lucius" + taxon_lsid[0] = "urn:lsid:itis.gov:itis_tsn:162139" + + with requests_mock.Mocker() as m: + m.get(re.compile(r"^http://www.lsid.info/urn:lsid:itis.gov:itis_tsn:\d+$")) + response = r"""{"acceptedNameList":{"acceptedNames":[null],"class":"gov.usgs.itis.itis_service.data.SvcAcceptedNameList","tsn":"162139"},"class":"gov.usgs.itis.itis_service.data.SvcFullRecord","commentList":{"class":"gov.usgs.itis.itis_service.data.SvcTaxonCommentList","comments":[null],"tsn":"162139"},"commonNameList":{"class":"gov.usgs.itis.itis_service.data.SvcCommonNameList","commonNames":[{"class":"gov.usgs.itis.itis_service.data.SvcCommonName","commonName":"northern pike","language":"English","tsn":"162139"},{"class":"gov.usgs.itis.itis_service.data.SvcCommonName","commonName":"grand 
brochet","language":"French","tsn":"162139"}],"tsn":"162139"},"completenessRating":{"class":"gov.usgs.itis.itis_service.data.SvcGlobalSpeciesCompleteness","completeness":"","rankId":220,"tsn":"162139"},"coreMetadata":{"class":"gov.usgs.itis.itis_service.data.SvcCoreMetadata","credRating":"TWG standards met","rankId":220,"taxonCoverage":"","taxonCurrency":"","taxonUsageRating":"valid","tsn":"162139","unacceptReason":""},"credibilityRating":{"class":"gov.usgs.itis.itis_service.data.SvcCredibilityData","credRating":"TWG standards met","tsn":"162139"},"currencyRating":{"class":"gov.usgs.itis.itis_service.data.SvcCurrencyData","rankId":220,"taxonCurrency":"","tsn":"162139"},"dateData":{"class":"gov.usgs.itis.itis_service.data.SvcTaxonDateData","initialTimeStamp":"1996-06-13 14:51:08.0","tsn":"162139","updateDate":"2004-01-22"},"expertList":{"class":"gov.usgs.itis.itis_service.data.SvcTaxonExpertList","experts":[{"class":"gov.usgs.itis.itis_service.data.SvcTaxonExpert","comment":"Research Curator of Fishes, North Carolina State Museum of Natural Sciences, Research Laboratory, 4301 Reedy Creek Rd., Raleigh, NC, 27607, USA","expert":"Wayne C. Starnes","referenceFor":[{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"Esox lucius","refLanguage":null,"referredTsn":"162139"}],"updateDate":"2004-02-23"}],"tsn":"162139"},"geographicDivisionList":{"class":"gov.usgs.itis.itis_service.data.SvcTaxonGeoDivisionList","geoDivisions":[{"class":"gov.usgs.itis.itis_service.data.SvcTaxonGeoDivision","geographicValue":"North America","updateDate":"1998-09-14"}],"tsn":"162139"},"hierarchyUp":{"author":null,"class":"gov.usgs.itis.itis_service.data.SvcHierarchyRecord","parentName":"Esox","parentTsn":"162138","rankName":"Species","taxonName":"Esox lucius","tsn":"162139"},"jurisdictionalOriginList":{"class":"gov.usgs.itis.itis_service.data.SvcTaxonJurisdictionalOriginList","jurisdictionalOrigins":[{"class":"gov.usgs.itis.itis_service.data.SvcTaxonJurisdictionalOrigin","jurisdictionValue":"Alaska","origin":"Native","updateDate":"2004-01-22"},{"class":"gov.usgs.itis.itis_service.data.SvcTaxonJurisdictionalOrigin","jurisdictionValue":"Canada","origin":"Native","updateDate":"2004-01-22"},{"class":"gov.usgs.itis.itis_service.data.SvcTaxonJurisdictionalOrigin","jurisdictionValue":"Continental US","origin":"Native & Introduced","updateDate":"2004-01-22"}],"tsn":"162139"},"kingdom":{"class":"gov.usgs.itis.itis_service.data.SvcKingdomInfo","kingdomId":"5","kingdomName":"Animalia ","tsn":"162139"},"otherSourceList":{"class":"gov.usgs.itis.itis_service.data.SvcTaxonOtherSourceList","otherSources":[{"acquisitionDate":"2003-03-17","class":"gov.usgs.itis.itis_service.data.SvcTaxonOtherSource","referenceFor":[{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"Esox lucius","refLanguage":null,"referredTsn":"162139"}],"source":"Catalog of Fishes, 17-Mar-2003","sourceComment":"http://www.calacademy.org/research/ichthyology/catalog/","sourceType":"website","updateDate":"2004-02-11","version":"13-Mar-03"},{"acquisitionDate":"1996-07-29","class":"gov.usgs.itis.itis_service.data.SvcTaxonOtherSource","referenceFor":[{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"Esox lucius","refLanguage":null,"referredTsn":"162139"}],"source":"NODC Taxonomic 
Code","sourceComment":"","sourceType":"database","updateDate":"2010-01-14","version":"8.0"},{"acquisitionDate":"2003-05-06","class":"gov.usgs.itis.itis_service.data.SvcTaxonOtherSource","referenceFor":[{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"northern pike","refLanguage":"English","referredTsn":"162139"},{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"Esox lucius","refLanguage":null,"referredTsn":"162139"},{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"grand brochet","refLanguage":"French","referredTsn":"162139"}],"source":"<a href=\"http://www.menv.gouv.qc.ca/biodiversite/centre.htm\">CDP","sourceComment":"","sourceType":"database","updateDate":"2003-05-08","version":"1999"}],"tsn":"162139"},"parentTSN":{"class":"gov.usgs.itis.itis_service.data.SvcParentTsn","parentTsn":"162138","tsn":"162139"},"publicationList":{"class":"gov.usgs.itis.itis_service.data.SvcTaxonPublicationList","publications":[{"actualPubDate":"2004-07-01","class":"gov.usgs.itis.itis_service.data.SvcTaxonPublication","isbn":"1-888569-61-1","issn":"0097-0638","listedPubDate":"2004-01-01","pages":"ix + 386","pubComment":"Full author list: Nelson, Joseph S., Edwin J. Crossman, H�ctor Espinosa-P�rez, Lloyd T. Findley, Carter R. Gilbert, Robert N. Lea, and James D. Williams","pubName":"American Fisheries Society Special Publication, no. 29","pubPlace":"Bethesda, Maryland, USA","publisher":"American Fisheries Society","referenceAuthor":"Nelson, Joseph S., Edwin J. Crossman, H. Espinosa-P�rez, L. T. Findley, C. R. Gilbert, et al., eds.","referenceFor":[{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"grand brochet","refLanguage":"French","referredTsn":"162139"},{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"Esox lucius","refLanguage":null,"referredTsn":"162139"},{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"northern pike","refLanguage":"English","referredTsn":"162139"}],"title":"Common and scientific names of fishes from the United States, Canada, and Mexico, Sixth Edition","updateDate":"2021-10-27"},{"actualPubDate":"2003-12-31","class":"gov.usgs.itis.itis_service.data.SvcTaxonPublication","isbn":"","issn":"","listedPubDate":"2003-12-31","pages":"","pubComment":"As-yet (2003) unpublished manuscript from 1998","pubName":"Checklist of Vertebrates of the United States, the U.S. Territories, and Canada","pubPlace":"","publisher":"","referenceAuthor":"Banks, R. C., R. W. McDiarmid, A. L. Gardner, and W. C. Starnes","referenceFor":[{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"Esox lucius","refLanguage":null,"referredTsn":"162139"}],"title":"","updateDate":"2021-08-26"},{"actualPubDate":"1980-01-01","class":"gov.usgs.itis.itis_service.data.SvcTaxonPublication","isbn":"","issn":"0097-0638","listedPubDate":"1980-01-01","pages":"174","pubComment":"","pubName":"American Fisheries Society Special Publication, no. 12","pubPlace":"Bethesda, Maryland, USA","publisher":"American Fisheries Society","referenceAuthor":"Robins, Richard C., Reeve M. Bailey, Carl E. Bond, James R. Brooker, Ernest A. 
Lachner, et al.","referenceFor":[{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"Esox lucius","refLanguage":null,"referredTsn":"162139"},{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"northern pike","refLanguage":"English","referredTsn":"162139"}],"title":"A List of Common and Scientific Names of Fishes from the United States and Canada, Fourth Edition","updateDate":"2021-10-27"},{"actualPubDate":"1991-01-01","class":"gov.usgs.itis.itis_service.data.SvcTaxonPublication","isbn":"0-913235-70-9","issn":"0097-0638","listedPubDate":"1991-01-01","pages":"183","pubComment":"","pubName":"American Fisheries Society Special Publication, no. 20","pubPlace":"Bethesda, Maryland, USA","publisher":"American Fisheries Society","referenceAuthor":"Robins, Richard C., Reeve M. Bailey, Carl E. Bond, James R. Brooker, Ernest A. Lachner, et al.","referenceFor":[{"class":"gov.usgs.itis.itis_service.data.SvcReferenceForElement","name":"Esox lucius","refLanguage":null,"referredTsn":"162139"}],"title":"Common and Scientific Names of Fishes from the United States and Canada, Fifth Edition","updateDate":"2021-10-27"}],"tsn":"162139"},"scientificName":{"author":"Linnaeus, 1758","class":"gov.usgs.itis.itis_service.data.SvcScientificName","combinedName":"Esox lucius","kingdom":null,"tsn":"162139","unitInd1":null,"unitInd2":null,"unitInd3":null,"unitInd4":null,"unitName1":"Esox ","unitName2":"lucius","unitName3":null,"unitName4":null},"synonymList":{"class":"gov.usgs.itis.itis_service.data.SvcSynonymNameList","synonyms":[null],"tsn":"162139"},"taxRank":{"class":"gov.usgs.itis.itis_service.data.SvcTaxonRankInfo","kingdomId":"5","kingdomName":"Animalia ","rankId":"220","rankName":"Species ","tsn":"162139"},"taxonAuthor":{"authorship":"Linnaeus, 1758","class":"gov.usgs.itis.itis_service.data.SvcTaxonAuthorship","tsn":"162139","updateDate":"2004-04-09"},"tsn":"162139","unacceptReason":{"class":"gov.usgs.itis.itis_service.data.SvcUnacceptData","tsn":"162139","unacceptReason":null},"usage":{"class":"gov.usgs.itis.itis_service.data.SvcTaxonUsageData","taxonUsageRating":"valid","tsn":"162139"}}""" + m.get( + "https://www.itis.gov/ITISWebService/jsonservice/getFullRecordFromTSN?tsn=162139", + text=response, + ) + + results = self.cf.check_taxa(dataset) + + assert len(results) == 1 + assert results[0].value[0] == results[0].value[1] + + # try non-matching name + taxon_name[0] = "Morone saxitilis" + results = self.cf.check_taxa(dataset) + result = results[0] + assert result.msgs == [ + "Supplied taxon name and ITIS scientific name do not match. " + "Supplied taxon name is 'Morone saxitilis', ITIS scientific name " + "for TSN 162139 is 'Esox lucius.'" + ] + + def test_taxonomy_skip_lsid(self): + """ + Tests that nodata/unset LSID values are skipped for validation + """ + dataset = MockTimeSeries() + # TODO: handle scalar dimension + dataset.createDimension("taxon", 1) + abundance = dataset.createVariable("abundance", "f8", ("time",)) + abundance.standard_name = ( + "number_concentration_of_biological_taxon_in_sea_water" + ) + abundance.units = "m-3" + abundance.coordinates = "taxon_name taxon_lsid" + taxon_name = dataset.createVariable("taxon_name", str, ("taxon",)) + taxon_name.standard_name = "biological_taxon_name" + taxon_lsid = dataset.createVariable("taxon_lsid", str, ("taxon",)) + taxon_lsid.standard_name = "biological_taxon_lsid" + # This would fail if checked against an LSID or even for binomial + # nomenclature, obviously. 
+ taxon_name[0] = "No check" + results = self.cf.check_taxa(dataset) + assert len(results[0].msgs) == 0 + assert results[0].value[0] == results[0].value[1] + + dataset = MockTimeSeries() + # TODO: handle scalar dimension? + dataset.createDimension("string80", 80) + dataset.createDimension("taxon", 1) + abundance = dataset.createVariable("abundance", "f8", ("time",)) + abundance.standard_name = ( + "number_concentration_of_biological_taxon_in_sea_water" + ) + abundance.units = "m-3" + abundance.coordinates = "taxon_name taxon_lsid" + taxon_name = dataset.createVariable("taxon_name", "S1", ("taxon", "string80")) + taxon_name.standard_name = "biological_taxon_name" + taxon_lsid = dataset.createVariable("taxon_lsid", "S1", ("taxon", "string80")) + taxon_lsid.standard_name = "biological_taxon_lsid" + fake_str = "No check" + taxon_name[0] = stringtoarr(fake_str, 80) + results = self.cf.check_taxa(dataset) + assert len(results[0].msgs) == 0 + assert results[0].value[0] == results[0].value[1] + + class TestCFUtil(BaseTestCase): """ Class to test the cfutil module. diff --git a/compliance_checker/tests/test_cf_integration.py b/compliance_checker/tests/test_cf_integration.py index 2a07f28f..eddc94f0 100644 --- a/compliance_checker/tests/test_cf_integration.py +++ b/compliance_checker/tests/test_cf_integration.py @@ -34,7 +34,7 @@ ), ( "usgs_dem_saipan", - ['§2.6.1 Conventions global attribute does not contain "CF-1.7"'], + ['§2.6.1 Conventions global attribute does not contain "CF-1.8"'], ), ( "l01-met", @@ -97,7 +97,7 @@ "swan", [ "global attribute _CoordSysBuilder should begin with a letter and be composed of letters, digits, and underscores", - '§2.6.1 Conventions global attribute does not contain "CF-1.7"', + '§2.6.1 Conventions global attribute does not contain "CF-1.8"', "units for variable time_offset must be convertible to s currently they are hours since 2013-02-18T00:00:00Z", "units for variable time_run must be convertible to s currently they are hours since 2013-02-18 00:00:00.000 UTC", "lon's axis attribute must be T, X, Y, or Z, currently x", @@ -197,7 +197,7 @@ "vbar's spatio-temporal dimensions are not in the recommended order T, Z, Y, X and/or further dimensions are not located left of T, Z, Y, X. The dimensions (and their guessed types) are ocean_time (T), eta_v (A), xi_v (A) (with U: other/unknown; L: unlimited).", "w's spatio-temporal dimensions are not in the recommended order T, Z, Y, X and/or further dimensions are not located left of T, Z, Y, X. The dimensions (and their guessed types) are ocean_time (T), s_w (Z), eta_rho (A), xi_rho (A) (with U: other/unknown; L: unlimited).", "zeta's spatio-temporal dimensions are not in the recommended order T, Z, Y, X and/or further dimensions are not located left of T, Z, Y, X. 
The dimensions (and their guessed types) are ocean_time (T), eta_rho (A), xi_rho (A) (with U: other/unknown; L: unlimited).", - '§2.6.1 Conventions global attribute does not contain "CF-1.7"', + '§2.6.1 Conventions global attribute does not contain "CF-1.8"', "units (None) attribute of 's_w' must be a string compatible with UDUNITS", "units (None) attribute of 's_rho' must be a string compatible with UDUNITS", "units (None) attribute of 'Cs_w' must be a string compatible with UDUNITS", @@ -313,7 +313,7 @@ def test_fvcom(self, cs, loaded_dataset): " not in messages" ) assert ( - '§2.6.1 Conventions global attribute does not contain "CF-1.7"' + '§2.6.1 Conventions global attribute does not contain "CF-1.8"' ) in messages @pytest.mark.parametrize( diff --git a/compliance_checker/tests/test_ioos_profile.py b/compliance_checker/tests/test_ioos_profile.py index adaafea4..fd234880 100644 --- a/compliance_checker/tests/test_ioos_profile.py +++ b/compliance_checker/tests/test_ioos_profile.py @@ -12,7 +12,8 @@ NamingAuthorityValidator, ) from compliance_checker.tests import BaseTestCase -from compliance_checker.tests.helpers import MockTimeSeries, MockVariable +from compliance_checker.tests.helpers import (MockTimeSeries, MockVariable, + MockNetCDF) from compliance_checker.tests.resources import STATIC_FILES from compliance_checker.tests.test_cf import get_results @@ -761,7 +762,7 @@ def test_check_single_platform(self): # global platform, one platform variable, pass temp = ds.createVariable("temp", "d", ("time")) temp.setncattr("platform", "platform_var") - plat = ds.createVariable("platform_var", np.byte) + ds.createVariable("platform_var", np.byte) result = self.ioos.check_single_platform(ds) self.assertTrue(result.value) self.assertEqual(result.msgs, []) @@ -769,7 +770,7 @@ def test_check_single_platform(self): # two platform variables, fail temp2 = ds.createVariable("temp2", "d", ("time")) temp2.setncattr("platform", "platform_var2") - plat = ds.createVariable("platform_var2", np.byte) + ds.createVariable("platform_var2", np.byte) result = self.ioos.check_single_platform(ds) self.assertFalse(result.value) @@ -777,7 +778,7 @@ def test_check_single_platform(self): ds = MockTimeSeries() # time, lat, lon, depth temp = ds.createVariable("temp", "d", ("time")) temp.setncattr("platform", "platform_var") - plat = ds.createVariable("platform_var", np.byte) + ds.createVariable("platform_var", np.byte) result = self.ioos.check_single_platform(ds) self.assertFalse(result.value) @@ -1002,7 +1003,7 @@ def test_check_feattype_timeseries_cf_role(self): ftype = "timeseries" ds = MockTimeSeries() # time, lat, lon, depth ds.setncattr("featureType", ftype) - temp = ds.createVariable("temp", "d", ("time")) + ds.createVariable("temp", "d", ("time")) # no platform variables or geophys vars with cf_role=timeseries_id, fail result = self.ioos._check_feattype_timeseries_cf_role(ds) @@ -1045,7 +1046,7 @@ def test_check_feattype_timeseries_cf_role(self): ds = MockTimeSeries() # time, lat, lon, depth ds.createDimension("station_dim", 21) ds.setncattr("featureType", "timeseries") - temp = ds.createVariable("temp", "d", ("time")) + ds.createVariable("temp", "d", ("time")) plat = ds.createVariable("station", "|S1", dimensions=("station_dim")) plat.setncattr("cf_role", "timeseries_id") result = self.ioos._check_feattype_timeseries_cf_role(ds) @@ -1056,7 +1057,7 @@ def test_check_feattype_timeseriesprof_cf_role(self): ds = MockTimeSeries() # time, lat, lon, depth ds.setncattr("featureType", ftype) 
ds.createDimension("station_dim", 1) - temp = ds.createVariable("temp", "d", ("time")) + ds.createVariable("temp", "d", ("time")) # no platform variables or geophys vars with cf_role=timeseries_id, fail result = self.ioos._check_feattype_timeseriesprof_cf_role(ds) @@ -1088,7 +1089,7 @@ def test_check_feattype_timeseriesprof_cf_role(self): ds.setncattr("featureType", ftype) ds.createDimension("station_dim", 1) ds.createDimension("profile_dim", 1) - temp = ds.createVariable("temp", "d", ("time")) + ds.createVariable("temp", "d", ("time")) plat = ds.createVariable("station", "|S1", ("station_dim")) plat.setncattr("cf_role", "timeseries_id") pf = ds.createVariable("profile", "|S1", ("profile_dim")) @@ -1101,7 +1102,7 @@ def test_check_feattype_timeseriesprof_cf_role(self): ds.setncattr("featureType", ftype) ds.createDimension("station_dim", 1) ds.createDimension("profile_dim", 21) - temp = ds.createVariable("temp", "d", ("time")) + ds.createVariable("temp", "d", ("time")) plat = ds.createVariable("station", "|S1", ("station_dim")) plat.setncattr("cf_role", "timeseries_id") pf = ds.createVariable("profile", "|S1", ("profile_dim")) @@ -1172,7 +1173,7 @@ def test_check_feattype_trajectoryprof_cf_role(self): ds.setncattr("featureType", "trajectoryprofile") ds.createDimension("trajectory_dim", 1) ds.createDimension("profile_dim", 1) - temp = ds.createVariable("temp", "d", ("time")) + ds.createVariable("temp", "d", ("time")) trj = ds.createVariable("trajectory", "|S1", ("trajectory_dim")) trj.setncattr("cf_role", "trajectory_id") pf = ds.createVariable("profile", "|S1", ("profile_dim")) @@ -1185,7 +1186,7 @@ def test_check_feattype_trajectoryprof_cf_role(self): ds.setncattr("featureType", "trajectoryprofile") ds.createDimension("trajectory_dim", 1) ds.createDimension("profile_dim", 21) - temp = ds.createVariable("temp", "d", ("time")) + ds.createVariable("temp", "d", ("time")) trj = ds.createVariable("trajectory", "|S1", ("trajectory_dim")) trj.setncattr("cf_role", "trajectory_id") pf = ds.createVariable("profile", "|S1", ("profile_dim")) @@ -1222,6 +1223,16 @@ def test_check_feattype_profile_cf_role(self): result = self.ioos._check_feattype_profile_cf_role(ds) self.assertTrue(result.value) + def test_feattype_point(self): + ds = MockNetCDF() + ds.createDimension("obs", 1) + ds.createVariable("lon", "f8", ("obs",)) + ds.createVariable("lat", "f8", ("obs",)) + ds.createVariable("temp", "f8", ("obs",)) + ds.setncattr("featureType", "point") + result = self.ioos.check_cf_role_variables(ds) + assert result.value + def test_check_instrument_make_model_calib_date(self): """ Per the IOOS-1.2 spec, instrument variables should have diff --git a/compliance_checker/tests/test_suite.py b/compliance_checker/tests/test_suite.py index 33d93915..617f885f 100644 --- a/compliance_checker/tests/test_suite.py +++ b/compliance_checker/tests/test_suite.py @@ -72,8 +72,10 @@ def test_unicode_formatting(self): score_list, points, out_of = self.cs.standard_output( ds.filepath(), limit, checker, groups ) - # This asserts that print is able to generate all of the unicode output - self.cs.standard_output_generation(groups, limit, points, out_of, checker) + # This asserts that print is able to generate all of the unicode + # output + self.cs.standard_output_generation(groups, limit, points, out_of, + checker) def test_generate_dataset_netCDF4(self): """ diff --git a/compliance_checker/tests/test_util.py b/compliance_checker/tests/test_util.py index 6d4ea645..ee860a9b 100644 --- a/compliance_checker/tests/test_util.py +++ 
b/compliance_checker/tests/test_util.py @@ -51,4 +51,4 @@ def test_datetime_is_iso(self): self.assertFalse(util.datetime_is_iso(bad_datetime)[0]) bad_date = "09192017" - self.assertFalse(util.datetime_is_iso(bad_datetime)[0]) + self.assertFalse(util.datetime_is_iso(bad_date)[0]) diff --git a/compliance_checker/util.py b/compliance_checker/util.py index a9c02fd9..71279650 100644 --- a/compliance_checker/util.py +++ b/compliance_checker/util.py @@ -11,9 +11,9 @@ def datetime_is_iso(date_str): """Attempts to parse a date formatted in ISO 8601 format""" try: if len(date_str) > 10: - dt = isodate.parse_datetime(date_str) + isodate.parse_datetime(date_str) else: - dt = isodate.parse_date(date_str) + isodate.parse_date(date_str) return True, [] except: # Any error qualifies as not ISO format return False, ["Datetime provided is not in a valid ISO 8601 format"] diff --git a/requirements.txt b/requirements.txt index 8dedc89e..5a591098 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ cftime>=1.1.0 isodate>=0.5.4 Jinja2>=2.7.3 lxml>=3.2.1 -netCDF4>=1.4.0 +netCDF4>=1.5.7 OWSLib>=0.8.3 pendulum>=1.2.4 pygeoif>=0.6 @@ -12,3 +12,4 @@ regex>=2017.07.28 requests>=2.2.1 setuptools>=15.0 validators>=0.14.2 +shapely>=1.7.1 diff --git a/setup.py b/setup.py index 2faa361d..c3ebbced 100644 --- a/setup.py +++ b/setup.py @@ -16,9 +16,11 @@ def pip_requirements(fname="requirements.txt"): if not line or line.startswith("#"): continue reqs.append(line) + return reqs + setup( name="compliance-checker", description="Checks Datasets and SOS endpoints for standards compliance", @@ -56,12 +58,13 @@ def pip_requirements(fname="requirements.txt"): "compliance_checker.suites": [ "cf-1.6 = compliance_checker.cf.cf:CF1_6Check", "cf-1.7 = compliance_checker.cf.cf:CF1_7Check", + "cf-1.8 = compliance_checker.cf.cf:CF1_8Check", "acdd-1.1 = compliance_checker.acdd:ACDD1_1Check", "acdd-1.3 = compliance_checker.acdd:ACDD1_3Check", "ioos_sos = compliance_checker.ioos:IOOSBaseSOSCheck", "ioos-0.1 = compliance_checker.ioos:IOOS0_1Check", "ioos-1.1 = compliance_checker.ioos:IOOS1_1Check", - "ioos-1.2 = compliance_checker.ioos:IOOS1_2Check", + "ioos-1.2 = compliance_checker.ioos:IOOS1_2Check" ], }, package_data={ diff --git a/test_requirements.txt b/test_requirements.txt index 060fd20c..656a55f1 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -4,3 +4,6 @@ httpretty mypy pre-commit pytest>=2.9.0 +requests-mock>=1.7.0 +codecov +pytest-cov
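
Notes on the test techniques exercised by this patch follow; each is a sketch under stated assumptions, not the checker's actual implementation.

The fixture changes in conftest.py and resources.py both add the "-4" flag so ncgen emits NetCDF-4 files, which the CF 1.8 tests need for features such as variable-length string variables. A minimal sketch of that round trip (the paths are hypothetical, and check=True is added here for illustration; the patch itself uses subprocess.call):

import subprocess

from netCDF4 import Dataset


def cdl_to_nc4(cdl_path, nc_path):
    # "-4" requests NetCDF-4 (HDF5-backed) output instead of classic format.
    subprocess.run(["ncgen", "-4", "-o", nc_path, cdl_path], check=True)
    return Dataset(nc_path, "r")


# Hypothetical usage with one of the fixtures added by this patch:
# ds = cdl_to_nc4("compliance_checker/tests/data/line_geometry.cdl",
#                 "/tmp/line_geometry.nc")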
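The __getitem__/__setitem__ methods added to MockVariable in helpers.py let tests rewrite variable data in memory, which test_polygon_geometry uses to flip the interior_ring flags. A self-contained sketch of the same pattern (ArrayBackedMock is a hypothetical stand-in for the real class):

import numpy as np


class ArrayBackedMock:
    # Mirrors the MockVariable additions: snapshot the data once and expose
    # numpy-style indexing so tests can rewrite values in place.
    def __init__(self, arr):
        self._arr = np.asarray(arr)

    def __getitem__(self, idx):
        return self._arr[idx]

    def __setitem__(self, idx, val):
        self._arr[idx] = val


ring = ArrayBackedMock([0, 1, 0, 0])
ring[:] = (ring[:] == 0).astype(int)  # same bit-flip as test_polygon_geometry
print(ring[:])                        # [1 0 1 1]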
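test_polygon_geometry expects check_geometry to flag rings whose winding disagrees with their interior_ring flag, and shapely is newly added to requirements.txt. The checker's actual ring-order logic is not shown in this excerpt, so the following is only a sketch of the CF 1.8 convention (exterior rings anticlockwise, interior rings clockwise) expressed with shapely:

from shapely.geometry import LinearRing


def ring_winding_ok(x_nodes, y_nodes, is_interior):
    # CF 1.8 geometries put exterior ring nodes in anticlockwise order
    # and interior ring (hole) nodes in clockwise order.
    ring = LinearRing(list(zip(x_nodes, y_nodes)))
    return not ring.is_ccw if is_interior else ring.is_ccw


# First ring of the polygon_geometry.cdl fixture: (20,0), (10,15), (0,0).
print(ring_winding_ok([20, 10, 0], [0, 15, 0], is_interior=False))  # True
# The same nodes declared as an interior ring have the wrong winding:
print(ring_winding_ok([20, 10, 0], [0, 15, 0], is_interior=True))   # False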
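test_bad_lsid asserts that malformed identifiers are rejected before any lookup against http://www.lsid.info. The accepted forms are not spelled out in this excerpt, so the pattern below is an assumption inferred from the test data (bare urn:lsid URNs and their lsid.info URL form):

import re

# Assumed pattern, inferred from the test fixtures: a bare LSID URN,
# optionally prefixed by the lsid.info resolver URL.
LSID_PATTERN = re.compile(
    r"^(?:http://(?:www\.)?lsid\.info/)?"
    r"urn:lsid:[^:]+:[^:]+:[^:]+(?::[^:]+)?$"
)

for candidate in (
    "urn:lsid:marinespecies.org:taxname:104464",                # URN form
    "http://www.lsid.info/urn:lsid:itis.gov:itis_tsn:162139",   # URL form
    "99999999999",                                              # bare ID: rejected
):
    print(candidate, bool(LSID_PATTERN.match(candidate)))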
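The WoRMS and ITIS tests never touch the network: requests_mock (added to test_requirements.txt above) intercepts both the lsid.info resolution and the REST record lookup. The same pattern in isolation, with the WoRMS response trimmed to the single field this sketch compares:

import re

import requests
import requests_mock

with requests_mock.Mocker() as m:
    # Any lsid.info resolution for a marinespecies.org taxname succeeds.
    m.get(re.compile(
        r"^http://www\.lsid\.info/urn:lsid:marinespecies\.org:taxname:\d+$"
    ))
    # Trimmed WoRMS record; the real response carries many more fields.
    m.get(
        "http://www.marinespecies.org/rest/AphiaRecordByAphiaID/104464",
        json={"valid_name": "Calanus finmarchicus"},
    )
    requests.get("http://www.lsid.info/urn:lsid:marinespecies.org:taxname:104464")
    record = requests.get(
        "http://www.marinespecies.org/rest/AphiaRecordByAphiaID/104464"
    ).json()
    print(record["valid_name"] == "Calanus finmarchicus")  # True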
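test_taxonomy_skip_lsid exercises both vlen-string and fixed-width char storage for taxon names; the latter relies on netCDF4.stringtoarr, newly imported in test_cf.py. The conversion round trip looks roughly like this:

from netCDF4 import chartostring, stringtoarr

# Pack a Python string into a fixed-width array of single characters,
# as required by char variables dimensioned with string80.
arr = stringtoarr("No check", 80)
print(arr.shape, arr.dtype)  # (80,) |S1
# chartostring reverses the packing; the padding is dropped on the way back.
print(chartostring(arr))     # No check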
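Finally, the util.py cleanup drops the unused dt bindings, and the test fix makes test_datetime_is_iso actually exercise bad_date. For reference, the patched function with illustrative inputs; "09192017" fails because isodate cannot parse it as ISO 8601:

import isodate


def datetime_is_iso(date_str):
    """Attempts to parse a date formatted in ISO 8601 format"""
    try:
        if len(date_str) > 10:
            isodate.parse_datetime(date_str)
        else:
            isodate.parse_date(date_str)
        return True, []
    except:  # Any error qualifies as not ISO format
        return False, ["Datetime provided is not in a valid ISO 8601 format"]


print(datetime_is_iso("2017-09-19T23:06:54Z")[0])  # True
print(datetime_is_iso("09192017")[0])              # False, the case the test now covers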