diff --git a/.gitignore b/.gitignore index dd1caf8..a93c6cf 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,10 @@ notes playpen __pycache__ venv +data +deprecated +legal +unittest +build +backup_of_data +backup_of_unittest diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..6af40e1 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,20 @@ +# Changelog + +## [1.0] - 2021-03-13 +### Added +- Initial release of Validator Toolkit + +## [1.1] - 2021-04-05 +### Fixed +- Found missing line numbers +- Found missing reports +- Allow data type to be a subclass of data type in property constraint +- Handle conflicting namespaces in ontology and data +- Reject pickle file whose Version does not match the Toolkit Version +- Fixed regex to match @type statement in json-ld files + +### Changed +- Toolkit document + +### Added +- Changelog file diff --git a/README.md b/README.md index 2aef661..3790018 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # UCO-Utility-Pre-0.7.0-Validator -## Description - (Alpha Release) +## Description - (Beta Release Version 1.1) The UCO/CASE Validation Toolkit provides the capability to validate JSON-LD data files against a turtle-file based ontology such as the Unified Cyber Ontology (UCO) and Cyber-Investigation Analysis Standard Expression (CASE). 
diff --git a/UCO-CASE Validation Toolkit.pdf b/UCO-CASE Validation Toolkit.pdf deleted file mode 100644 index 59ec290..0000000 Binary files a/UCO-CASE Validation Toolkit.pdf and /dev/null differ diff --git a/UCO-CASE_Validation_Toolkit.pdf b/UCO-CASE_Validation_Toolkit.pdf new file mode 100644 index 0000000..2cd0ed3 Binary files /dev/null and b/UCO-CASE_Validation_Toolkit.pdf differ diff --git a/requirements.txt b/requirements.txt index 542991f..2e3de5c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ ontospy rdflib +lxml diff --git a/src/casedata.py b/src/casedata.py index da343bb..75235ff 100644 --- a/src/casedata.py +++ b/src/casedata.py @@ -15,11 +15,12 @@ import tempfile from ontospy import Ontospy import serializer -import precondition +from precondition import precondition, postcondition +from context import Context -VERSION = '1.0' # Appears in the metadata when serialized +VERSION = '1.1' # Appears in the metadata when serialized -def get_casedata(path, output_filepath=None, verbose=True, **kwargs): +def get_casedata(path, output_filepath=None, verbose=False, **kwargs): ''' If path is a serialized casedata file, deserialize it and return it. If path is file containing valid json-ld, ingest it. @@ -47,8 +48,14 @@ def get_casedata(path, output_filepath=None, verbose=True, **kwargs): # If path is a serialized casedata file, deserialize it and return casedata try: - identifier, _metadata, casedata.__dict__ = serializer.deserialize(path) + identifier, metadata, casedata.__dict__ = serializer.deserialize(path) if identifier == serializer.CASEDATA: + if metadata['version'] != VERSION: + print('{} was serialized with a different version of the toolkit. 
Use this command to reserialize:'.format(path)) + print() + print(' serialize {}'.format(metadata['path'])) + print() + raise Exception('{} was serialized with a different version of the toolkit.'.format(path)) return casedata except serializer.DeserializeError: pass @@ -78,6 +85,7 @@ def __init__(self): self.jsonld_filepath = None # Path to json-ld file self.graph = None # rdflib.Graph of json-ld data self.line_numbers = {} # {node:line_number} where node is a URIRef or a BNode + self.bindings = [] # [(prefix, uri)] from ontospy.namespaces def serialize(self, output_filepath, comment): @@ -113,9 +121,10 @@ def _read_jsonld_file(jsonld_filepath, output_filepath, verbose, **kwargs): Return: dictionary { - 'jsonld_filepath':filepath, # Full path to json-ld file - 'graph':rdflib_graph_obj, # Json-ld data decomponsed to an rdflib.Graph of triples - 'line_numbers':line_numbers_dict # {node:line_number}, where node is a URIRef or a BNode + 'jsonld_filepath':filepath, # Full path to json-ld file + 'graph':rdflib_graph_obj, # Json-ld data decomponsed to an rdflib.Graph of triples + 'line_numbers':line_numbers_dict # {node:line_number}, where node is a URIRef or a BNode + 'bindings':(qualiifer:uri_string) # List of binding tuples, e.g. 
('core', 'http://unifiedcyberontology.org/core') } ''' # Read the jsonld file (could raise exception) @@ -123,7 +132,7 @@ def _read_jsonld_file(jsonld_filepath, output_filepath, verbose, **kwargs): text = infile.read() # Precondition it - preconditioned_text = precondition.precondition(text) + preconditioned_text = precondition(text) # If specified, save text in output_filepath if output_filepath: @@ -142,11 +151,15 @@ def _read_jsonld_file(jsonld_filepath, output_filepath, verbose, **kwargs): temp_filepath = os.path.join(tempdirname, 'preconditioned.json') with open(temp_filepath, 'w') as outfile: outfile.write(preconditioned_text) - graph = Ontospy( + ontospy = Ontospy( uri_or_path=temp_filepath, rdf_format='jsonld', verbose=verbose, - **kwargs).rdflib_graph + **kwargs) + + graph = ontospy.rdflib_graph + context = Context().populate(ontospy.namespaces) + # If ontospy cannot read the file, it prints an error message # and returns an object with a zero-length graph @@ -156,18 +169,12 @@ def _read_jsonld_file(jsonld_filepath, output_filepath, verbose, **kwargs): # Build new graph by remove the embedded line number from ontospy's graph # and remember the line numbers in mapping {Node:line_number} - graph, line_numbers_dict = precondition.postcondition(graph, json.loads(preconditioned_text)['@context']) - - #for s,p,o in graph.triples((None, None, None)): - # print(repr(s)) - # print(repr(p)) - # print(repr(o)) - # print() - #pprint.pprint(line_numbers_dict) + graph, line_numbers_dict = postcondition(graph, context) # Construct and return results return { 'jsonld_filepath':jsonld_filepath, 'graph':graph, + 'bindings':context.bindings, 'line_numbers':line_numbers_dict } diff --git a/src/class_constraints.py b/src/class_constraints.py index f8f3123..bb55a1f 100644 --- a/src/class_constraints.py +++ b/src/class_constraints.py @@ -10,8 +10,8 @@ from rdflib.namespace import OWL, RDF from property_constraints import PropertyConstraints from message import OntologyError, 
UnsupportedFeature -from message import pretty_uri from triples import get_spo_dict +from context import Context @@ -135,12 +135,6 @@ def __init__(self, onto_class_uri): self.onto_class_uri = onto_class_uri self.property_constraints_dict = {} # {property_uri:PropertyConstraints} - def __str__(self): - if self.property_constraints_dict: - return '\n'.join([str(property_constraints) for property_constraints in self.property_constraints_dict.values()]) - else: - return '' - def set_property_constraints(self, property_uri, property_constraints): ''' Attach Contraint object to specified property_uri. @@ -190,20 +184,29 @@ def get_required_properties(self): return required_properties - def describe(self): + def describe(self, context=None): ''' Assemble and return a plain-text description of the class and property constraints in this object. ''' lines = [] + if context is None: + context = Context() for property_constraints in self.property_constraints_dict.values(): lines.append(property_constraints.describe()) for required_property in self.get_required_properties(): lines.append('Class {}: Property {} is required'.format( - pretty_uri(self.onto_class_uri), required_property)) + context.format(self.onto_class_uri), required_property)) for forbidden_property in self.get_forbidden_properties(): lines.append('Class {}: Property {} is forbidden'.format( - pretty_uri(self.onto_class_uri), forbidden_property)) + context.format(self.onto_class_uri), forbidden_property)) if not lines: lines.append('Empty') return '\n'.join(lines) + + def __str__(self): + if self.property_constraints_dict: + return '\n'.join([str(property_constraints) for property_constraints in self.property_constraints_dict.values()]) + else: + return '' + diff --git a/src/context.py b/src/context.py new file mode 100644 index 0000000..853ef7b --- /dev/null +++ b/src/context.py @@ -0,0 +1,323 @@ +# NOTICE +# This software was produced for the U.S. 
Government under contract FA8702-21-C-0001, +# and is subject to the Rights in Data-General Clause 52.227-14, Alt. IV (DEC 2007) +# ©2021 The MITRE Corporation. All Rights Reserved. +''' +This module implements the Context class. + +Definitions: + URI. A Universal Resource Indicator. + URI string. A URI represented as a string, e.g. 'http://www.w3.org/2001/XMLSchema#integer'. + + Namespace. A location of a group of related URIs. + Namespace string. A namespace represented as a string, e.g.'http://www.w3.org/2001/XMLSchema#'. + + Name. A property, class or object identifier. + Name string. A name represented as a string, e.g. 'integer'. + + Qualifier. An abbreviation for a namespace string, e.g. 'xsd' abbreviates 'http://www.w3.org/2001/XMLSchema#'. + Binding. Association of a qualifier with a namespace. + Namespace mapping. A mapping that relates Qualifiers to namespace strings. + Inverse namespace mapping. A mapping from namespace strings to qualifiers. + Qualified name or Qname. A uri represented as a qualifier and name string relative to a namespace mapping. + + Context. A set of namespace mappings. + Context object. An instance of the class implemented in this module. + + Normalized URI Object. An instance of rdflib.term.URIRef that describes a URI string. + Normalized Namespace Obj. An instance of rdflib.Namespace that describes a namespace string. +''' +import rdflib + +class Context: + ''' + This class encapsulates a set of namespaces and provides + functions to manipulate qnames and uris within them. 
+ + Attributes: + self.namespace_mapping = {} # {qualifier_string:namespace_string} + self.inverse_namespace_mapping = {} # {namespace_string:qualifier_string} + self.bindings = [()] # list of [(qualifier, namespace)] in the order they were applied + self.default_qualifiers = set() # [qualifier_string] (CONSTANT) + self.default_namespaces = set() # [namespace_string] (CONSTANT) + + Methods: + bind(qualifier, namespace) Assign qualifier to namespace + populate([(qualifier, namespace)]) Assign qualifiers to namespaces + namespace(qualifier) Convert qualifier to namespace string if possible + qualifier(namespace) Convert namespace string to qualifier if possible + split_qname(identifier) Derive (qualifier string, name string) from identifier if identifier is a qname + split_uri(identifier) Derive (namespace string, name string) from identifier if identifier is a uri + qname(identifier) Express identifier as qname string if possible + uri_string(identifier) Express identifier as uri string if possible + uri_object(identifier) Express identifier as normalized rdflib.term.URIRef instance if possible + format(identifier_or_list) Format identifier or list of identifiers as a pretty string, using qnames where possible + ''' + # Class constants + DEFAULT_BINDINGS = [ + ('rdf','http://www.w3.org/1999/02/22-rdf-syntax-ns#'), + ('rdfs','http://www.w3.org/2000/01/rdf-schema#'), + ('xsd','http://www.w3.org/2001/XMLSchema#'), + ('owl','http://www.w3.org/2002/07/owl#'), + ('sh','http://www.w3.org/ns/shacl#'), + ('olo','http://purl.org/ontology/olo/core#'), + ] + DEFAULT_QUALIFIERS = {t[0] for t in DEFAULT_BINDINGS} + DEFAULT_NAMESPACES = {t[1] for t in DEFAULT_BINDINGS} + + # Creator + def __init__(self): + ''' + Create an empty instance of this class + ''' + self.namespace_mapping = {} # {qualifier_string:namespace_string} + self.inverse_namespace_mapping = {} # {namespace_string:qualifier_string} + self.bindings = [] # [(qualifier_string, namespace_string)] in order applied + 
self.populate(Context.DEFAULT_BINDINGS) + + # Add binding + def bind(self, qualifier, namespace): + ''' + Arguments: + qualifier An object whose str() produces a qualifier, e.g. 'xsd' + namespace An object whose str() produces a namespace string, e.g. 'http://www.w3.org/2001/XMLSchema#' + In particular, this could be a string, rdflib.Namespace or a rdflib.term.URIRef. + + Return: + self, with addtional binding + + Action: + Add binding to this Context + + Note: Last one wins! + If qualifier_string is already in self.namespace_mapping, the new one replaces it. + If namespace_string is already in self.inverse_namespace_mapping, the new one replaces it. + ''' + qualifier_string = str(qualifier) + namespace_string = str(namespace) + self.bindings.append((qualifier, namespace)) + + # If qualifier is already in the forward mapping, clobber existing value + self.namespace_mapping[qualifier_string] = namespace_string + + # Computing the inverse mapping is tricky because of ontospy's namespaces + # Rule 1: If there is no existing reverse mapped qualifier, bind the new qualifier + # Rule 2: If the existing mapped qualifier is the empty string, bind new qualifier + existing_inverse_value = self.inverse_namespace_mapping.get(namespace_string) + if not existing_inverse_value: + self.inverse_namespace_mapping[namespace_string] = qualifier_string + + # Rule 3: If there is any existing mapped qualifier and the new one is the empty string, do not bind it + elif not qualifier_string: + pass + + # Rule 4: If the existing mapped qualifier ends with a digit, bind the new one + elif existing_inverse_value[-1] in '0123456789': + self.inverse_namespace_mapping[namespace_string] = qualifier_string + + # Rule 5: If the existing mapped qualifier does not end with a digit and the new one does, do not bind + elif qualifier_string[-1] in '0123456789': + pass + # Rule 6: If none of the above, clobber existing mapped qualifier with new one (last one wins) + else: + 
self.inverse_namespace_mapping[namespace_string] = qualifier_string + + return self + + + # Add bindings + def populate(self, bindings): + ''' + Arguments: + bindings LIST of (qualifier, namespace), e.g. ontospy.namespaces + + Return: + self, populated + + Action: + Add bindings to the Context. + + Note: Last one wins! + If qualifier is already in self.namespace_mapping, the new one replaces it. + If namespace is already in self.inverse_namespace_mapping, the new one replaces it. + ''' + for qualifier, namespace in bindings: + self.bind(qualifier, namespace) + return self + + + # qualifier -> namespace + def namespace(self, qualifier): + ''' + Arguments: + qualifier An object whose str() may be a qualifier string + + Return: + The corresponding namespace string if qualifier is in this Context + None if not + ''' + return self.namespace_mapping.get(str(qualifier)) + + + # namespace -> qualifier + def qualifier(self, namespace): + ''' + Arguments: + namespace An object whose str() may be namespace string + + Return: + The corresponding qualifier string if namespace is in this Context + None if not + ''' + return self.inverse_namespace_mapping.get(str(namespace)) + + + # identifier -> (qualifier, name) + def split_qname(self, identifier): + ''' + Arguments + identifier An object whose str() which may be a qname + + Return: + (qualifier_string, name_string) if string is a qname with a qualifier in this Context + (str(identifier), None) otherwise + ''' + string = str(identifier) + result = string.split(':', 1) # (stuff before first colon, stuff after first colon) + + # If no colon in string, it's certainly not a qname + if len(result) == 1: + return (string, None) + + # If colon in string and first part in mapping, it's a qname + qualifier_string, name = result + if qualifier_string in self.namespace_mapping: + return (qualifier_string, name) + + # If colon in string but first part not in mapping, it's not a qname + return (string, None) + + + # identifier -> 
(namespace, name) + def split_uri(self, identifier): + ''' + Arguments + identifier An object whose str() may be a uri + + Return: + (namespace_string, name_string) if string is a uri with a namespace string in this Context + (str(identifier), None) otherwise + ''' + string = str(identifier) + + # If string starts with a known namespace, return namespace and name + for namespace_string in self.inverse_namespace_mapping: + if string.startswith(namespace_string): + return (namespace_string, string[len(namespace_string):]) + + # If not, it's not a uri + return (string, None) + + + # identifier -> qname + def qname(self, identifier): + ''' + Arguments + identifier An object whose str() may be a qname, a uri, or something else + + Return: + qname_string If identifier is a qname or a uri that can be converted to a qname in this Context + None If none of the above. + ''' + string = str(identifier) + + # If it's a qname, return the qname string + _qualifier, name = self.split_qname(string) + if name is not None: + return string + + # If it's a uri, convert to qname and return the qname string + namespace, name = self.split_uri(string) + if name is not None: + return '{}:{}'.format(self.inverse_namespace_mapping[namespace], name) + + # If none of the above, return None + return None + + + + # identifier -> uri_string + def uri_string(self, identifier): + ''' + Arguments + identifier An object whose str() may be a qname, a uri, or something else + + Return: + uri_string If identifier is a uri or a qname that can be converted to a uri in this Context + None If none of the above. 
+ ''' + string = str(identifier) + + # If it's a qname, convert to uri and return the uri string + qualifier, name = self.split_qname(string) + if name is not None: + return '{}{}'.format(self.namespace_mapping[qualifier], name) + + # If it's a uri, return the uri string + _namespace, name = self.split_uri(string) + if name is not None: + return string + + # If it's none of the above, return None + return None + + + # identifier -> rdflib.term.URIRef(uri_string) + def uri_object(self, identifier): + ''' + Arguments: + identifier An object whose str() may be a qname, a uri, or something else + + Return: + Normalized rdflib.term.URIRef If identifier is a uri or a qname that can be converted to a uri in this Context + None If none of the above + ''' + # If identifier is a uri or a qname in this namespace, return uri string + uri = self.uri_string(identifier) + if uri: + return rdflib.term.URIRef(uri) + + # If not, return None + return None + + + # identifier(s) -> formatted string + def format(self, identifier_or_list): + ''' + Arguments: + identifier_or_list + Either an object whose str() may be a qname, a uri, or something else, + OR + A LIST of objects whose str() may be a qname, a uri, or something else + + Return: + A "pretty" string representation of the identifier or list of identifiers, + using qnames where possible. 
+ ''' + # Subfunction to prettify a single identifier + def _prettify(identifier): + string = str(identifier) + + # Return qname string if possible + qname = self.qname(string) + if qname is not None: + return '<{}>'.format(qname) + + # Return identifier string if not + return '<{}>'.format(string) + + # Main function to prettify the input argument + if isinstance(identifier_or_list, (list, tuple, set)): + return '[{}]'.format(', '.join([_prettify(identifier) for identifier in identifier_or_list])) + else: + return _prettify(identifier_or_list) # identifier_or_list is just an identifier diff --git a/src/message.py b/src/message.py index 85180f7..0609749 100644 --- a/src/message.py +++ b/src/message.py @@ -4,18 +4,14 @@ # ©2021 The MITRE Corporation. All Rights Reserved. ''' -This module implements some trivial Message classes. -Their purpose is to provide a uniform way of formatting -error messages. +This module implements the Message class and a few trivial subclasses. +Their purpose is to provide a uniform way of formatting error messages. 
''' import traceback -from namespace_manager import namespace_manager -class ErrorMessage: - ''' - Base class for error message objects - ''' - message_type = 'Error' +class Message: + message_type = 'Message' + def __init__(self, message=None, # Message text message_source=None, # Name of module, class or function producing this Error @@ -31,9 +27,9 @@ def __init__(self, self.caller = traceback.extract_stack(limit=2)[0].name # If you add another attribute, be sure to add it to __members() - def describe(self): + def format(self, context): ''' - Build and return a string based on self's attribute values + Build and return a message string based on self's attribute values ''' phrases = [] if self.message_type: @@ -43,19 +39,20 @@ def describe(self): # if self.caller: # REMOVED FOR DEMO # phrases.append('{}()'.format(self.caller)) if self.onto_class_uri: - phrases.append('Class {}'.format(pretty_uri(self.onto_class_uri))) + phrases.append('Class {}'.format(context.format(self.onto_class_uri))) if self.property_uri: - phrases.append('Property {}'.format(pretty_uri(self.property_uri))) - phrases.append('{}.'.format(self.message)) + phrases.append('Property {}'.format(context.format(self.property_uri))) + if self.message: + phrases.append('{}{}.'.format(self.message[0].upper(), self.message[1:])) if self.exc: phrases.append('{}: {}'.format(type(self.exc), self.exc)) return ' '.join(phrases) def __str__(self): - return self.describe() + return '<{}>'.format(self.message_type) def __repr__(self): - return self.describe() + return str(self) def __members(self): ''' @@ -83,6 +80,12 @@ def __hash__(self): +class ErrorMessage(Message): + ''' + Base class for error message objects + ''' + message_type = 'Error' + class DataError(ErrorMessage): message_type = 'Data Error' @@ -100,27 +103,3 @@ class UnsupportedFeature(ErrorMessage): class SoftwareBug(ErrorMessage): message_type = 'Software Bug' - - -def pretty_uri(uri): - ''' - Arguments: - uri An rdflib.term.URIRef object - - 
Return: - "pretty" string representation of uri - ''' - try: - return '<{}>'.format(uri.n3(namespace_manager)) - except Exception: - return '<{}>'.format(uri) - -def pretty_uris(uris): - ''' - Arguments: - uris LIST of rdflib.term.URIRef objects - - Return: - "pretty" string representation of list of uris - ''' - return '[{}]'.format(', '.join([pretty_uri(uri) for uri in uris])) diff --git a/src/ontology.py b/src/ontology.py index fbfdc66..140a103 100644 --- a/src/ontology.py +++ b/src/ontology.py @@ -12,16 +12,16 @@ from copy import deepcopy import datetime import os +import sys from ontospy import Ontospy -from rdflib.namespace import OWL, RDFS +from rdflib.namespace import OWL, RDFS, XSD import serializer from class_constraints import get_class_constraints, ClassConstraints from datatype_constraints import get_datatype_constraints -import namespace_manager +from context import Context from message import OntologyError, UnsupportedFeature -from message import pretty_uri, pretty_uris -VERSION = '1.0' # Appears in the metadata when serialized +VERSION = '1.1' # Appears in the metadata when serialized, compared with serialized value before deserializing def get_ontology(path, verbose=True, **kwargs): ''' @@ -37,22 +37,24 @@ def get_ontology(path, verbose=True, **kwargs): Return: An Ontology object - Side effect: - This function populates namespace_manager.namespace_manager - Raise: Exception if path is not a serialized ontology file or a directory containing turtle files ''' # Start with empty Ontology object ontology = Ontology() - # If path is a serialized ontology file, deserialize it and populate namespace_manager + # If path is a serialized ontology file, deserialize it and set context for error messages if os.path.isfile(path) and serializer.get_identifier(path) == serializer.ONTOLOGY: - _identifer, _metadata, ontology.__dict__ = serializer.deserialize(path) - namespace_manager.populate(ontology.constraints.keys()) - - # If path is a directory containing 
turtle files, build Ontology object AND populate namespace_manager - # Note: _read_turtle_files() populates the namespace_manager so the ontology errors can use it + _identifer, metadata, ontology.__dict__ = serializer.deserialize(path) + if metadata['version'] != VERSION: + print('{} was serialized with a different version of the toolkit. Use this command to reserialize:'.format(path)) + print() + print(' serialize {}'.format(metadata['path'])) + print() + sys.exit(1) + + # If path is a directory containing turtle files, build Ontology + # Note that _read_turtle_files sets the context for error messages elif os.path.isdir(path) and [filename for filename in os.listdir(path) if filename.endswith('.ttl')]: ontology.__dict__ = _read_turtle_files(path, verbose, **kwargs) @@ -60,8 +62,6 @@ def get_ontology(path, verbose=True, **kwargs): else: raise Exception('{} is neither a serialized ontology file nor a directory containing turtle files'.format(path)) - # Populate the namespace from the ontology classes - # Return the ontology object return ontology @@ -76,12 +76,16 @@ class Ontology: constraints {URIRef(onto_class):ClassConstraints|DatatypeConstraints|None} (derived from ontospy.all_classes property_ranges {property_uri:range_uri|None} (derived from ontospy.all_properties) error_messages {ErrorMessage} Set of instances of ErrorMessage or subclass objects + bindings [(prefix, uri)] List of namespace mappings from ontospy.namespaces + ancestor_classes {class_uri:[ancestor_class_uri]} Classes and their ancestor classes ''' def __init__(self): self.turtle_dirpath = None # The path to the directory containing the turtle files self.constraints = {} # {URIRef(onto_class):ClassConstraints|DatatypeConstraints|None} self.property_ranges = {} # {property_uri:range_uri|None} self.error_messages = {} # {ErrorMessage} + self.bindings = [] # [(prefix, uri)] from ontospy.namespaces + self.ancestor_classes = {} # {class_uri:[ancestor_class_uri]} def serialize(self, output_filepath, 
comment): ''' @@ -101,6 +105,7 @@ def serialize(self, output_filepath, comment): 'md5': dir_hash, 'manifest': dir_manifest } + print('Writing serialized ontology to {}'.format(output_filepath)) serializer.serialize(identifier, metadata, self.__dict__, output_filepath) @@ -116,10 +121,12 @@ def _read_turtle_files(turtle_dirpath, verbose=True, **kwargs): Return: dictionary { - 'turtle_dirpath':dirpath, # Path to the directory containing the turtle files - 'property_ranges':value, # {property_uri:range_uri|None} (derived from ontospy.all_properties) - 'constraints':value, # {URIRef(onto_class):constraints} (derived from ontospy.all_classes) - 'error_messages':[errmsg] # List of unique ErrorMessage objects + 'turtle_dirpath':dirpath, # Path to the directory containing the turtle files + 'property_ranges':value, # {property_uri:range_uri|None} (derived from ontospy.all_properties) + 'constraints':value, # {URIRef(onto_class):constraints} (derived from ontospy.all_classes) + 'error_messages':[errmsg], # List of unique ErrorMessage objects + 'bindings':[(qualifier:uri_string)], # List of binding tuples, e.g. ('core','http://unifiedcyberontology.org/core') + 'ancestor_classes' # Classes and their ancestors {class_uri:[ancestor_class_uri]} } where constraints is an instance of ClassConstraints, DatatypeConstraints or None ''' @@ -128,9 +135,7 @@ def _read_turtle_files(turtle_dirpath, verbose=True, **kwargs): # Step 1. Build Ontospy ontospy = Ontospy(uri_or_path=turtle_dirpath, rdf_format='turtle', verbose=verbose, **kwargs) - - # Step 1A. Populate namespace_manager - namespace_manager.populate([onto_class.uri for onto_class in ontospy.all_classes]) + context = Context().populate(ontospy.namespaces) # Step 2. 
Get naive constraints for each class simple_constraints_dict = {} # {URIRef(class):ClassConstraints|DatatypeConstraints|None} @@ -159,24 +164,66 @@ def _read_turtle_files(turtle_dirpath, verbose=True, **kwargs): ontospy_property_ranges[onto_property.uri] = [property_range.uri for property_range in onto_property.ranges] ontospy_property_triples[onto_property.uri] = onto_property.triples - property_ranges, errmsgs = _get_property_ranges(ontospy_property_ranges, ontospy_property_triples) + property_ranges, errmsgs = _get_property_ranges(ontospy_property_ranges, ontospy_property_triples, context) error_messages.update(errmsgs) # Step 5. Check that range constraints are consistent with general property constraints - errmsgs = _check_range_consistency(simple_constraints_dict, property_ranges) + errmsgs = _check_range_consistency(simple_constraints_dict, property_ranges, context) error_messages.update(errmsgs) + # Step 6. Build class-ancestor lookup table + ancestor_classes = _build_ancestor_classes(ontospy.all_classes, context) # {class_uri:[ancestor_class_uri]} # Construct and return dictionary return { 'turtle_dirpath': turtle_dirpath, 'property_ranges': property_ranges, 'constraints': net_constraints, - 'error_messages': sorted(list(error_messages), key=lambda x:(x.onto_class_uri, x.property_uri)) + 'error_messages': sorted(list(error_messages), key=lambda x:(x.onto_class_uri, x.property_uri)), + 'bindings': context.bindings, + 'ancestor_classes': ancestor_classes } +def _build_ancestor_classes(all_onto_classes, context): + ''' + Arguments + all_onto_classes List of ontoClass objects (from ontospy.all_classes) + context Ontology Context object + + Return: + Dictionary {class_uri:[ancestor_class_uri]} of classes and their ancestors + + Note: + These custom hierarchy relations are included in the returned dictionary + * All vocab and vocab1 classes have xsd:string as an ancestor + * xsd:integer has xsd:long as an ancestor + * xsd:decimal has xsd:float as an ancestor + 
''' + # Collect ancestors from ontospy {onto_class:[ancestor_onto_class]} + onto_class_ancestors = {} + for onto_class in all_onto_classes: + onto_class_ancestors[onto_class] = onto_class.ancestors() + + # Convert onto_class_ancestors to {class_uri:[ancestor_class_uris]} + ancestor_classes = {child_class.uri:[ancestor_class.uri for ancestor_class in ancestor_classes] \ + for child_class, ancestor_classes in onto_class_ancestors.items()} + + # Customization: add xsd:string as ancestor of all uris in vocab: or vocab1: namespace + for class_uri, ancestor_class_uris in ancestor_classes.items(): + if context.split_qname(context.qname(class_uri))[0] in ('vocab', 'vocab1'): + ancestor_class_uris.append(XSD.string) + + # Customization: add integer/long and decimal/float relations + ancestor_classes[XSD.integer] = [XSD.long] + ancestor_classes[XSD.decimal] = [XSD.float] + + # Return the result {class_uri:[ancestor_class_uris]} + return ancestor_classes + + + def _inherit_constraints(constraints, parent_child_class_dict): ''' This function belongs in the Ontology class. @@ -293,7 +340,7 @@ def _inherit_constraints(constraints, parent_child_class_dict): -def _get_property_ranges(ontospy_property_ranges, ontospy_property_triples): +def _get_property_ranges(ontospy_property_ranges, ontospy_property_triples, context): ''' This function belongs in the Ontology class. 
It is implemented outside the Ontology class to facilitate unit testing @@ -305,6 +352,7 @@ def _get_property_ranges(ontospy_property_ranges, ontospy_property_triples): Arguments: ontospy_property_ranges {property_uri:[range_uris]} ontospy_property_triples {property_uri:[triples]} + context Ontology Context object Return: property_ranges {property_uri:range_uri|None} @@ -337,7 +385,7 @@ def _get_property_ranges(ontospy_property_ranges, ontospy_property_triples): error_messages.append(OntologyError( message='Property has {} ranges {}'.format( len(range_uris), - pretty_uris(range_uris)), + context.format(range_uris)), property_uri = property_uri)) continue @@ -350,7 +398,7 @@ def _get_property_ranges(ontospy_property_ranges, ontospy_property_triples): -def _check_range_consistency(all_constraints, property_ranges): +def _check_range_consistency(all_constraints, property_ranges, context): ''' This function belongs in the Ontology class. It is implemented outside the Ontology class to facilitate unit testing @@ -361,6 +409,7 @@ def _check_range_consistency(all_constraints, property_ranges): Arguments: all_constraints {URIRef(onto_class):ClassConstraints|DatatypeConstraints|None} (ontology.constraints) property_ranges {property_uri:range_uri|None} (derived from ontospy.all_properties) + context Ontology Context object Return: List of ErrorMessage objects @@ -389,7 +438,7 @@ def _check_range_consistency(all_constraints, property_ranges): # Make sure property is in onto_property_ranges if not property_uri in property_ranges: error_messages.append(OntologyError( - message='property {} missing from ontology property list'.format(pretty_uri(property_uri)), + message='property {} missing from ontology property list'.format(context.format(property_uri)), onto_class_uri=onto_class_uri, property_uri=property_uri)) continue @@ -400,9 +449,9 @@ def _check_range_consistency(all_constraints, property_ranges): # Compare the owl property constraint with ontospy's opinion if range_uri 
and range_uri != property_constraints.value_range: error_messages.append(OntologyError( - message='owl subclass constraint {} does not match explicit property constraint {}'.format( - pretty_uri(property_constraints.value_range), - pretty_uri(range_uri)), + message='owl subclass constraint {} does not match property constraint {}'.format( + context.format(property_constraints.value_range), + context.format(range_uri)), onto_class_uri=onto_class_uri, property_uri=property_uri)) diff --git a/src/precondition.py b/src/precondition.py index 274c769..f995c9e 100644 --- a/src/precondition.py +++ b/src/precondition.py @@ -56,7 +56,7 @@ def postcondition(graph, context): and expand apparent prefixes in Literal objects that should be URIs Arguments: - context {prefix:expanded_location} + context A Context object graph A rdflib_graph object Return: tuple (new_graph, {node:line_number}) @@ -82,11 +82,9 @@ def postcondition(graph, context): # If obj is a literal and has no datatype, it's one of those ambiguous cases # If it has an IRI prefix in the context, expand its prefix and replace with a URIRef if isinstance(obj, rdflib.term.Literal) and not obj.datatype: - value = str(obj) - parts = value.split(':', 1) # does value look like prefix:stuff? - if len(parts) > 1 and parts[0] in context: # yes, and the prefix is in the context! - parts[0] = context[parts[0]] - obj = rdflib.term.URIRef(parts[0] + parts[1]) + uri_object = context.uri_object(str(obj)) # A URIRef or None + if uri_object: + obj = uri_object # If obj is a URIRef (maybe just created above!) 
and has a line number, # remove line number from URIRef object @@ -212,7 +210,7 @@ def embed_line_numbers(text): where n is the (possibly multidigit) line number ''' # This regular expressions matches a @type declaration in the json-ld file - type_matcher = re.compile(r'( *"@type": *")([\w:#/-]+)(",)') + type_matcher = re.compile(r'(\s*"@type":\s*")([\w:#/-]+)(",)') # Do for each line in text lines = text.split('\n') diff --git a/src/property_constraints.py b/src/property_constraints.py index 4be1ffb..c55d08e 100644 --- a/src/property_constraints.py +++ b/src/property_constraints.py @@ -10,8 +10,7 @@ This module implements the PropertyConstraints object ''' from message import OntologyError -from message import pretty_uri - +from context import Context class PropertyConstraints: ''' @@ -49,14 +48,6 @@ def __init__(self, onto_class_uri=None, property_uri=None): self.value_range = None self._qualified = None # True/False/None where None means unset - def __str__(self): - return '<{} {} [{}-{}] {}>'.format( - pretty_uri(self.onto_class_uri) if self.onto_class_uri else None, - pretty_uri(self.property_uri) if self.property_uri else 'DATATYPE', - '?' if self.min_cardinality is None else self.min_cardinality, - '?' if self.max_cardinality is None else self.max_cardinality, - pretty_uri(self.value_range) if self.value_range else '?') - def add_min_cardinality(self, min_cardinality): ''' "Add" specified min_cardinality value if possible. 
@@ -341,7 +332,7 @@ def check_consistency(self): return error_messages - def describe(self): + def describe(self, context=None): ''' Assemble and return a plain-text description of these PropertyConstraints @@ -350,10 +341,12 @@ def describe(self): ''' value = lambda n: 'value' if n == 1 else 'values' phrases = [] + if context is None: + context = Context() if self.onto_class_uri: - phrases.append('Class {}'.format(pretty_uri(self.onto_class_uri))) + phrases.append('Class {}'.format(context.format(self.onto_class_uri))) if self.property_uri: - phrases.append('Property {}'.format(pretty_uri(self.property_uri))) + phrases.append('Property {}'.format(context.format(self.property_uri))) else: phrases.append('Property') @@ -377,8 +370,6 @@ def describe(self): return ' '.join(phrases) - - def _get_ontology_error(self, message): ''' Arguments: @@ -397,6 +388,15 @@ def __members(self): ''' return (self.onto_class_uri, self.property_uri, self.min_cardinality, self.max_cardinality, self.value_range) + + def __str__(self): + return '<{} {} [{}-{}] {}>'.format( + self.onto_class_uri if self.onto_class_uri else None, + self.property_uri if self.property_uri else 'DATATYPE', + '?' if self.min_cardinality is None else self.min_cardinality, + '?' 
if self.max_cardinality is None else self.max_cardinality, + self.value_range if self.value_range else '?') + def __eq__(self, other): ''' Two instances of this class are equal if the __member attributes are equal diff --git a/src/setup.py b/src/setup.py index 4f10bb9..6d4055e 100644 --- a/src/setup.py +++ b/src/setup.py @@ -13,10 +13,10 @@ 'class_constraints', 'datatype_constraints', 'message', - 'namespace_manager', 'ontology', 'precondition', 'property_constraints', + 'context', 'serializer', 'triples', 'validator', diff --git a/src/validate.py b/src/validate.py index 551c94c..674a769 100644 --- a/src/validate.py +++ b/src/validate.py @@ -17,6 +17,7 @@ from ontology import get_ontology from casedata import get_casedata from validator import validate +from context import Context def parse_args(args): ''' @@ -154,19 +155,21 @@ def main(): # Load the ontology file and print the error messages print('ONTOLOGY {}'.format(args.ontology_path)) ontology = get_ontology(args.ontology_path) + context = Context().populate(ontology.bindings) for errmsg in ontology.error_messages: - print(errmsg) + print(errmsg.format(context)) # Do for each data file for filepath in args.data_filepaths: # Validate the data file against the ontology file and print error messages - print('\n\nCASEDATA {}'.format(filepath)) + print('\n\nVALIDATING {}'.format(filepath)) case_data = get_casedata(filepath) + context = Context().populate(case_data.bindings) errmsgs = validate(ontology, case_data) for errmsg in errmsgs: - print(errmsg) + print(errmsg.format(context)) # Done sys.exit(0) diff --git a/src/validator.py b/src/validator.py index d41e233..1c66ccf 100644 --- a/src/validator.py +++ b/src/validator.py @@ -6,18 +6,15 @@ ''' This module is where the ontology and casedata meet. 
''' -import json +import pprint # For debug import rdflib from rdflib.namespace import RDF, XSD from triples import get_spo_dict from class_constraints import ClassConstraints -from message import DataError, UnsupportedFeature, ConstraintError -from message import pretty_uri, pretty_uris -from namespace_manager import namespace_manager +from message import DataError, UnsupportedFeature, ConstraintError from datatype_constraints import DatatypeConstraints from xsd_validator import validate_xsd -namespaces = dict(namespace_manager.namespaces()) # {'prefix':URIRef('http:/blah/blah#')} - +from context import Context def validate(ontology, case_data): @@ -29,25 +26,31 @@ def validate(ontology, case_data): Return: List of ErrorMessage objects sorted by line number ''' + # Set the message context + context = Context().populate(case_data.bindings) + # Convert case_data triples to SPO dictionary {subject:{predicate:{objects}}} spo_dict = get_spo_dict(case_data.graph[::]) # Call validate_case_data error_messages = validate_case_data(spo_dict, case_data.line_numbers, - ontology.constraints, ontology.property_ranges) + ontology.constraints, ontology.property_ranges, ontology.ancestor_classes, context) # Return error messages sorted by line number error_messages.sort(key=lambda x: x.line_number if x.line_number else 0) return error_messages -def validate_case_data(spo_dict, line_numbers, ontology_constraints, ontology_property_ranges): +def validate_case_data(spo_dict, line_numbers, ontology_constraints, ontology_property_ranges, + ontology_ancestor_classes, context): ''' Arguments: spo_dict case_data triples in dictionary {subject:{predicate:{objects}}} line_numbers case_data {subject_uri:line_number} ontology_constraints ontology {onto_class_uri:ClassConstraints|DatatypeConstraints|None} ontology_property_ranges ontology {property_uri:range_uri|None} + ontology_ancestor_classes {class_uri:[ancestor_class_uri]} Classes and their ancestor classes + context Data context object 
Return: List of error messages @@ -61,40 +64,34 @@ def validate_case_data(spo_dict, line_numbers, ontology_constraints, ontology_pr for subject, po_dict in spo_dict.items(): line_number = line_numbers.get(subject) - #print() - #print('subject', subject) - # Get Subject's type, which should be a Class in the ontology # Make sure there's exactly one type. If it isn't, skip this Subject subject_type_uris = po_dict.get(RDF.type) # If subject has no type, error if not subject_type_uris: - #print('Skipping subject {} because it has no type'.format(subject)) + subject_description = subject.__class__.__name__.split('.')[-1] error_messages.append(DataError( - message='{} has no ranges'.format(subject), + message='{} {} has no ranges'.format(subject_description, context.format(subject)), line_number=line_number)) continue # If subject has more than one type, error if len(subject_type_uris) > 1: - #print('Skipping subject {} because it has {} types {}'.format(subject, len(subject_type_uris), subject_type_uris)) error_messages.append(DataError( message='{} has {} types: {}'.format( - pretty_uri(subject), + context.format(subject), len(subject_type_uris), - pretty_uris(subject_type_uris)), + context.format(subject_type_uris)), line_number=line_number)) continue subject_type_uri = list(subject_type_uris)[0] - #print('{} is a {}'.format(subject, subject_type_uri)) # Make sure Subject's type exists in the ontology. 
If it doesn't, skip this Subject # ontology.constraints maps all ontology classes to constraint objects (or None) if not subject_type_uri in ontology_constraints: - #print('Skipping subject {} because type {} not in ontology'.format(subject, subject_type_uri)) error_messages.append(DataError( message='class not in ontology', onto_class_uri = subject_type_uri, @@ -113,13 +110,12 @@ def validate_case_data(spo_dict, line_numbers, ontology_constraints, ontology_pr continue pvt_dict[predicate] = {} for value in objects: - value_type, errmsgs = get_value_type(value, spo_dict) + value_type, errmsgs = get_value_type(value, spo_dict, context) for errmsg in errmsgs: errmsg.onto_class_uri = subject_type_uri errmsg.line_number = line_number error_messages.extend(errmsgs) pvt_dict[predicate][value] = value_type - #print('{} has pvt_dict of length {}'.format(subject, len(pvt_dict))) # Get the constraints object for Subject type's ontology class. @@ -131,7 +127,6 @@ def validate_case_data(spo_dict, line_numbers, ontology_constraints, ontology_pr message='expected ClassConstraints, found {}'.format(type(class_constraints)), onto_class_uri = subject_type_uri, line_number=line_number)) - #print('{} is subject to {}'.format(subject, class_constraints)) # If constraints is ClassConstraints, validate cardinality constraints if isinstance(class_constraints, ClassConstraints): @@ -139,29 +134,33 @@ def validate_case_data(spo_dict, line_numbers, ontology_constraints, ontology_pr for errmsg in errmsgs: errmsg.onto_class_uri = subject_type_uri errmsg.line_number = line_number - error_messages.extend(errmsgs) - #print('Validating cardinality constraints for {} got {} error messages'.format(subject, len(errmsgs))) + error_messages.extend(errmsgs) + #print('Validating cardinality constraints for {} got {} error messages'.format(subject, len(errmsgs))) # Validate property range constraints, with or without ClassConstraints if isinstance(class_constraints, ClassConstraints): - errmsgs = 
validate_range_constraints(pvt_dict, ontology_property_ranges, class_constraints) + errmsgs = validate_range_constraints(pvt_dict, ontology_property_ranges, ontology_ancestor_classes, + context, class_constraints) else: - errmsgs = validate_range_constraints(pvt_dict, ontology_property_ranges) + errmsgs = validate_range_constraints(pvt_dict, ontology_property_ranges, ontology_ancestor_classes, context) for errmsg in errmsgs: errmsg.onto_class_uri = subject_type_uri errmsg.line_number = line_number + error_messages.extend(errmsgs) #print('Validating property range constraints for {} got {} error messages'.format(subject, len(errmsgs))) # Validate the literals for property_uri, vt_dict in pvt_dict.items(): - errmsgs = validate_literals( - literals=[value for value in vt_dict.keys() if isinstance(value, rdflib.term.Literal)], - ontology_constraints=ontology_constraints) - for errmsg in errmsgs: - errmsg.line_number = line_number - errmsg.property_uri = property_uri - errmsg.onto_class_uri = subject_type_uri - #print('Validating literals for {} got {} error messages'.format(subject, len(errmsgs))) + for value, value_type in vt_dict.items(): + if not isinstance(value, rdflib.term.Literal): + continue + errmsgs = validate_literal(value, ontology_constraints.get(value.datatype), context) + for errmsg in errmsgs: + errmsg.line_number = line_number + errmsg.property_uri = property_uri + errmsg.onto_class_uri = subject_type_uri + error_messages.extend(errmsgs) + # #print('Validating literals for {} got {} error messages'.format(subject, len(errmsgs))) # Done! 
Return error messages return error_messages @@ -180,15 +179,17 @@ def validate_cardinality_constraints(pvt_dict, class_constraints): # Start with an empty list of error messages error_messages = [] - # Make sure REQUIRED properties are present - required_properties = class_constraints.get_required_properties() - for required_property in required_properties: - if required_property.n3(namespace_manager) == 'core:id': # This property is exempt + + # Make sure REQUIRED properties are present. Core:id is exempt and is allowed to be absent. + required_property_uris = class_constraints.get_required_properties() + exempt_property_uri = rdflib.term.URIRef('https://unifiedcyberontology.org/ontology/uco/core#id') # uco-core:id + for required_property_uri in required_property_uris: + if required_property_uri == exempt_property_uri: continue - if required_property not in pvt_dict: + if required_property_uri not in pvt_dict: error_messages.append(ConstraintError( message='data is missing required property', - property_uri=required_property)) + property_uri=required_property_uri)) # Make sure properties with cardinality constraints meet the constraints for property_uri, vt_dict in pvt_dict.items(): @@ -211,11 +212,13 @@ def validate_cardinality_constraints(pvt_dict, class_constraints): -def validate_range_constraints(pvt_dict, ontology_property_ranges, class_constraints=None): +def validate_range_constraints(pvt_dict, ontology_property_ranges, ancestor_classes, context, class_constraints=None): ''' Arguments: pvt_dict data {property_uri:{value:type_uri}} ontology_property_ranges ontology.property_ranges, {property_uri:range_uri|None} + ancestor_classes {class_uri:[ancestor_class_uri]} Classes and their ancestor classes + context Data Context object class_constraints A ClassConstraints object or None Return: @@ -241,84 +244,73 @@ def validate_range_constraints(pvt_dict, ontology_property_ranges, class_constra property_range = ontology_property_ranges.get(property_uri) # could 
still be None - # If there's a property range, check that each value's type is in the property range - # TODO: Allow value_type to be an instance of a *subclass* of property_range + # If there's a property range, check that the property range is the same as or an ancestor of the value type if property_range: for value, value_type in vt_dict.items(): - if value_type != property_range: # TODO: if not a subclass of property_range + if not (property_range == value_type or property_range in ancestor_classes.get(value_type, [])): error_messages.append(ConstraintError( message="property's value {} is a {} but must be a {}".format( - value, value_type, property_range), + '' if isinstance(value, rdflib.term.BNode) else value, + context.format(value_type), + context.format(property_range)), property_uri=property_uri)) + # Done! Return error_messages return error_messages -def validate_literals(literals, ontology_constraints): +def validate_literal(literal, constraints, context): ''' Arguments: - literals List of rdflib.term.Literal values - ontology_constraints ontology.constraints, - {onto_class_uri:ClassConstraints|DatatypeConstraints|None} + literal An rdflib.term.Literal value + constraints A ClassConstraints object, DatatypeConstraints object, or None + context Data Context object Return: List of ErrorMessage objects - Blacklisted types: + Non-checkable datatypes If literal has no datatype, it is by default a string, so no need to validate. If literal datatype is XSD.base64Binary, ontospy already converted its value to binary, so we cannot validate it. If the base64 were invalid, ontospy would have reported it. ''' - # Start with empty list of error messages - error_messages = [] - - # Do for each value - for literal in literals: - - # If Literal has no datatype, it is an XSD.string by default. No check needed. 
- if literal.datatype is None: - continue - - # If Literal is a blacklisted type, do not check - if literal.datatype == XSD.base64Binary: - continue + # If Literal has no datatype, it is an XSD.string by default. No check needed. + if literal.datatype is None: + return [] - # If Literal is not a URIRef, UnsupportedFeature - if not isinstance(literal.datatype, rdflib.term.URIRef): - error_messages.append(UnsupportedFeature( - message='Literal has datatype that is not a URIRef: {}'.format(literal))) - continue + # If Literal is listed as non-checkable in this function's doc block, do not check. + if literal.datatype == XSD.base64Binary or str(literal.datatype) in ('xsd:base64Binary', 'xs:base64Binary'): + return [] - # If Literal datatype is an XSD type, do validate XSD - # TODO: Is this really necessary? - if str(literal.datatype).startswith(XSD): - error_messages.extend(validate_xsd(str(literal), literal.datatype)) - continue + # If Literal is not a URIRef, return an error message + if not isinstance(literal.datatype, rdflib.term.URIRef): + return [UnsupportedFeature( + message='Literal {} has datatype that is not a URIRef: {}'.format(literal, literal.datatype))] - # If we're here, Literal datatype is a URIRef and not an XSD type. - # If it's not a Datatype, error. 
- datatype_constraints = ontology_constraints.get(literal.datatype) - if not isinstance(datatype_constraints, DatatypeConstraints): - error_messages.append(UnsupportedFeature( - message='Literal has unrecognized datatype: {}'.format(literal))) - continue + # If Literal datatype is an XSD type, validate it and return list of error messages + if str(literal.datatype).startswith(XSD) or str(literal.datatype).startswith('xsd:') or str(literal.datatype).startswith('xs:'): + return validate_xsd(str(literal), literal.datatype) - # Call datatypeConstraint's validate function and return results - error_messages.extend(datatype_constraints.validate(str(literal))) + # If we're here, Literal datatype is a URIRef and not an XSD type. + # If constraints is not a DatatypeConstraints object, then there's no such datatype in the ontology, return error message + if not isinstance(constraints, DatatypeConstraints): + return [UnsupportedFeature( + message='Literal {} has unrecognized datatype: {}'.format(literal, context.format(literal.datatype)))] - # Done! Return error messages - return error_messages + # Here constraints is a DatatypeConstraint object, so call its validate function and return results + return constraints.validate(str(literal)) -def get_value_type(value, spo_dict): +def get_value_type(value, spo_dict, context): ''' Arguments: value A value (object of a predicate) from the CaseData spo_dict The CaseData triples {subject:{predicate:{objects}}} + context Data Context object Return: tuple containing The value's data type as a rdflib.term.URIRef, possibly None @@ -327,7 +319,7 @@ def get_value_type(value, spo_dict): Cases: 1. If value is a Literal, its type is the literal's datatype 2. If value is a BNode, its type is the BNode's RDF.type - 3. If value is a URIRef, its points to an local object and its type is the local object's RDF.type + 3. 
If value is a URIRef, it points to a local object and its type is the local object's RDF.type (which is None for a broken link) In all cases, if a value or its type cannot be found or is invalid, this function returns None Geek note: @@ -344,26 +336,31 @@ def get_value_type(value, spo_dict): if literal.datatype is None: return XSD.string, [] + # If it's a known qname or uri, return a URIRef with no error + uri_object = context.uri_object(literal.datatype) + if uri_object: + return uri_object, [] + # If literal.datatype is "good", return it with no errors - datatype_string = str(literal.datatype) - if datatype_string.startswith('http://') or datatype_string.startswith('https://'): - return literal.datatype, [] + #datatype_string = str(literal.datatype) + #if datatype_string.startswith('http://') or datatype_string.startswith('https://'): + # return literal.datatype, [] # If it's is of the form something:something, try to expand it - tokens = datatype_string.split(':',1) - if len(tokens) == 2: - prefix, rest = tokens + #tokens = datatype_string.split(':',1) + #if len(tokens) == 2: + # prefix, rest = tokens - # If prefix is known, return URI with expanded prefix and no errors - expanded_prefix = namespaces.get(prefix) - if expanded_prefix: - return rdflib.term.URIRef(expanded_prefix + rest), [] + # # If prefix is known, return URI with expanded prefix and no errors + # expanded_prefix = namespaces.get(prefix) + # if expanded_prefix: + # return rdflib.term.URIRef(expanded_prefix + rest), [] - # If prefix is unknown, return None and error - errmsg = DataError(message='unknown prefix in literal datatype {}'.format(literal.datatype)) - return None, [errmsg] + # # If prefix is unknown, return None and error + # errmsg = DataError(message='unknown prefix in literal datatype {}'.format(literal.datatype)) + # return None, [errmsg] - # If it isn't even of the form something:something, return None and error + # If none of the above, return None with error message errmsg = 
DataError(message='unsupported datatype {}'.format(literal.datatype)) return None, [errmsg] @@ -378,7 +375,7 @@ def get_value_type(value, spo_dict): # If the link is broken, return None with error message if not po_dict: - errmsg = DataError(message='malformed case data: missing link <{}>'.format(link)) + errmsg = DataError(message='missing link <{}>'.format(link)) return None, [errmsg] # Get the BNode or URIRef's types @@ -386,36 +383,20 @@ def get_value_type(value, spo_dict): # If there are zero or multiple types, data is malformed if not datatypes: - errmsg = DataError(message='malformed case data: {} {} has no datatypes'.format(link_description, pretty_uri(link))) + errmsg = DataError(message='{} {} has no datatypes'.format(link_description, context.format(link))) return None, [errmsg] if len(datatypes) > 1: - errmsg = DataError(message='malformed case data: {} <{}> has {} datatypes {}'.format( - link_description, - pretty_uri(link), - len(datatypes), - pretty_uris(datatypes))) + errmsg = DataError( + message='{} <{}> has {} datatypes {}'.format( + link_description, context.format(link), len(datatypes), context.format(datatypes))) return None, [errmsg] # If we're here, datatype has one value. Return it with no error return list(datatypes)[0], [] # Case 4. Unexpected Value type - errmsg = DataError( - message='unrecognized data value {} of type {}, expected a Literal, URIRef or BNode'.format( - value, type(value))) - return None, [errmsg] - - - - -# Json encoder to support sets -class SetEncoder(json.JSONEncoder): - ''' - A Json Encoder class that treats sets as lists. 
- - Usage: json.dumps(spo_dict, indent=2, cls=SetEncoder) - ''' - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return json.JSONEncoder.default(self, obj) + else: + errmsg = DataError( + message='unrecognized data value {} of type {}, expected a Literal, URIRef or BNode'.format( + value, type(value))) + return None, [errmsg] diff --git a/src/xsd_validator.py b/src/xsd_validator.py index eba5c93..af6619c 100644 --- a/src/xsd_validator.py +++ b/src/xsd_validator.py @@ -10,6 +10,7 @@ import re from xml.sax.saxutils import escape from lxml import etree +import rdflib from rdflib.namespace import XSD # Namespace('http://www.w3.org/2001/XMLSchema#') from message import ErrorMessage