From 5918b2c894af67d8dcd7c4202b95293713ff9820 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 2 Dec 2018 16:53:28 -0600 Subject: [PATCH 01/36] move dj.config into settings.py --- datajoint/__init__.py | 46 +++++++++---------------------------------- datajoint/heading.py | 11 ++++++----- datajoint/settings.py | 25 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 42 deletions(-) diff --git a/datajoint/__init__.py b/datajoint/__init__.py index 8958a2746..b9d2d907b 100644 --- a/datajoint/__init__.py +++ b/datajoint/__init__.py @@ -27,43 +27,8 @@ 'DataJointError', 'DuplicateError', 'set_password'] - -class key: - """ - object that allows requesting the primary key in Fetch.__getitem__ - """ - pass - - -# ----------- loads local configuration from file ---------------- -from .settings import Config, LOCALCONFIG, GLOBALCONFIG, logger, log_levels -config = Config() -config_files = (os.path.expanduser(n) for n in (LOCALCONFIG, os.path.join('~', GLOBALCONFIG))) -try: - config_file = next(n for n in config_files if os.path.exists(n)) -except StopIteration: - config.add_history('No config file found, using default settings.') -else: - config.load(config_file) - del config_file - -del config_files - -# override login credentials with environment variables -mapping = {k: v for k, v in zip( - ('database.host', 'database.user', 'database.password', - 'external.aws_access_key_id', 'external.aws_secret_access_key',), - map(os.getenv, ('DJ_HOST', 'DJ_USER', 'DJ_PASS', - 'DJ_AWS_ACCESS_KEY_ID', 'DJ_AWS_SECRET_ACCESS_KEY',))) - if v is not None} -for k in mapping: - config.add_history('Updated login credentials from %s' % k) -config.update(mapping) -del mapping - -logger.setLevel(log_levels[config['loglevel']]) - -# ------------- flatten import hierarchy ------------------------- +# ----------- flatten import hierarchy ---------------- +from .settings import config from .connection import conn, Connection from .table import FreeTable, Table from .user_tables import Manual, Lookup, Imported, Computed, Part @@ -75,6 +40,13 @@ class key: from .errors import DataJointError, DuplicateError +class key: + """ + object that allows requesting the primary key in Fetch.__getitem__ + """ + pass + + def create_virtual_module(module_name, schema_name, create_schema=False, create_tables=False, connection=None): """ Creates a python module with the given name from the name of a schema on the server and diff --git a/datajoint/heading.py b/datajoint/heading.py index f56585c40..d6c765b2d 100644 --- a/datajoint/heading.py +++ b/datajoint/heading.py @@ -3,6 +3,7 @@ from itertools import chain import re import logging +from .settings import config from .errors import DataJointError logger = logging.getLogger(__name__) @@ -185,10 +186,11 @@ def init_from_database(self, conn, database, table_name): # additional attribute properties for attr in attributes: - # process external attributes + # process configurable attributes split_comment = attr['comment'].split(':') attr['is_external'] = len(split_comment) >= 3 and split_comment[1].startswith('external') - if attr['is_external']: + attr['is_attachment'] = len(split_comment) >= 3 and split_comment[1].startswith('attach') + if attr['is_external'] or attr['is_attachment']: attr['comment'] = ':'.join(split_comment[2:]) attr['type'] = split_comment[1] @@ -208,9 +210,8 @@ def init_from_database(self, conn, database, table_name): attr['default'] = 'null' attr['sql_expression'] = None - if not (attr['numeric'] or attr['string'] or attr['is_blob']): - raise 
DataJointError('Unsupported field type {field} in `{database}`.`{table_name}`'.format(
-                    field=attr['type'], database=database, table_name=table_name))
+            attr['is_supported'] = attr['numeric'] or attr['string'] or attr['is_blob'] or attr['is_attachment']
+
             attr.pop('Extra')
 
         # fill out dtype. All floats and non-nullable integers are turned into specific dtypes
diff --git a/datajoint/settings.py b/datajoint/settings.py
index b1269f628..3a984a598 100644
--- a/datajoint/settings.py
+++ b/datajoint/settings.py
@@ -170,3 +170,28 @@ def __setitem__(self, key, value):
             self._conf[key] = value
         else:
             raise DataJointError(u'Validator for {0:s} did not pass'.format(key))
+
+
+# Load configuration from file
+
+config = Config()
+config_files = (os.path.expanduser(n) for n in (LOCALCONFIG, os.path.join('~', GLOBALCONFIG)))
+try:
+    config_file = next(n for n in config_files if os.path.exists(n))
+except StopIteration:
+    config.add_history('No config file found, using default settings.')
+else:
+    config.load(config_file)
+
+# override login credentials with environment variables
+mapping = {k: v for k, v in zip(
+    ('database.host', 'database.user', 'database.password',
+     'external.aws_access_key_id', 'external.aws_secret_access_key',),
+    map(os.getenv, ('DJ_HOST', 'DJ_USER', 'DJ_PASS',
+                    'DJ_AWS_ACCESS_KEY_ID', 'DJ_AWS_SECRET_ACCESS_KEY',)))
+    if v is not None}
+for k in mapping:
+    config.add_history('Updated login credentials from %s' % k)
+config.update(mapping)
+
+logger.setLevel(log_levels[config['loglevel']])

From f3dd5b32e5abcd5b3f488a9379127ece4c97edc8 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Sun, 2 Dec 2018 19:00:13 -0600
Subject: [PATCH 02/36] add properties for heading attributes for supporting configurable blobs and attachments

---
 datajoint/external.py | 2 +-
 datajoint/heading.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/datajoint/external.py b/datajoint/external.py
index 2978681ac..48cc45fc3 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -6,7 +6,7 @@
 from .blob import pack, unpack
 from .table import Table
 from .declare import STORE_HASH_LENGTH, HASH_DATA_TYPE
-from .s3 import Folder as S3Folder
+from .s3 import Folder as S3
 from .utils import safe_write
 
 
diff --git a/datajoint/heading.py b/datajoint/heading.py
index 61e8de35d..916a90288 100644
--- a/datajoint/heading.py
+++ b/datajoint/heading.py
@@ -10,8 +10,8 @@
 default_attribute_properties = dict(    # these default values are set in computed attributes
     name=None, type='expression', in_key=False, nullable=False, default=None, comment='calculated attribute',
-    autoincrement=False, numeric=None, string=None, is_blob=False, is_attachment=False, is_external=False, sql_expression=None,
-    database=None, dtype=object)
+    autoincrement=False, numeric=None, string=None, is_blob=False, is_attachment=False, is_external=False,
+    is_supported=False, sql_expression=None, database=None, dtype=object, config_name=None)
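For illustration, the loading order established in PATCH 01 means that environment variables win over values read from the config file. A minimal sketch of the intended use (the host and user values are placeholders, set before the first import):

    import os
    os.environ['DJ_HOST'] = 'db.example.org'   # hypothetical server
    os.environ['DJ_USER'] = 'alice'            # hypothetical account

    import datajoint as dj    # settings.py now loads the config at import time
    assert dj.config['database.host'] == 'db.example.org'

From 046291ad56f390f321dbee1b4e7adec4dfbac0e3 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Sun, 2 Dec 2018 19:07:10 -0600
Subject: [PATCH 03/36] rename property is_supported to unsupported for heading attributes

---
 datajoint/external.py | 2 +-
 datajoint/heading.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/datajoint/external.py b/datajoint/external.py
index 48cc45fc3..2978681ac 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -6,7 +6,7 @@
 from .blob import 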
pack, unpack from .table import Table from .declare import STORE_HASH_LENGTH, HASH_DATA_TYPE -from .s3 import Folder as S3 +from .s3 import Folder as S3Folder from .utils import safe_write diff --git a/datajoint/heading.py b/datajoint/heading.py index 916a90288..b8dd670e5 100644 --- a/datajoint/heading.py +++ b/datajoint/heading.py @@ -11,7 +11,7 @@ default_attribute_properties = dict( # these default values are set in computed attributes name=None, type='expression', in_key=False, nullable=False, default=None, comment='calculated attribute', autoincrement=False, numeric=None, string=None, is_blob=False, is_attachment=False, is_external=False, - is_supported=False, sql_expression=None, database=None, dtype=object, config_name=None) + unsupported=False, sql_expression=None, database=None, dtype=object, config_name=None) class Attribute(namedtuple('_Attribute', default_attribute_properties)): @@ -210,7 +210,7 @@ def init_from_database(self, conn, database, table_name): attr['default'] = 'null' attr['sql_expression'] = None - attr['is_supported'] = attr['numeric'] or attr['string'] or attr['is_blob'] or attr['is_attachment'] + attr['unsupported'] = not(attr['numeric'] or attr['string'] or attr['is_blob'] or attr['is_attachment']) attr.pop('Extra') From 1f5fe9d05b3aec25bcf28a89f2df46363a4b5db2 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 2 Dec 2018 20:18:56 -0600 Subject: [PATCH 04/36] load configurable fields --- datajoint/heading.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/datajoint/heading.py b/datajoint/heading.py index b8dd670e5..6450d7c93 100644 --- a/datajoint/heading.py +++ b/datajoint/heading.py @@ -3,7 +3,6 @@ from itertools import chain import re import logging -from .settings import config from .errors import DataJointError logger = logging.getLogger(__name__) @@ -11,7 +10,7 @@ default_attribute_properties = dict( # these default values are set in computed attributes name=None, type='expression', in_key=False, nullable=False, default=None, comment='calculated attribute', autoincrement=False, numeric=None, string=None, is_blob=False, is_attachment=False, is_external=False, - unsupported=False, sql_expression=None, database=None, dtype=object, config_name=None) + unsupported=False, sql_expression=None, database=None, dtype=object) class Attribute(namedtuple('_Attribute', default_attribute_properties)): @@ -187,21 +186,33 @@ def init_from_database(self, conn, database, table_name): # additional attribute properties for attr in attributes: # process configurable attributes - split_comment = attr['comment'].split(':') - attr['is_external'] = len(split_comment) >= 3 and split_comment[1].startswith('external') - attr['is_attachment'] = len(split_comment) >= 3 and split_comment[1].startswith('attach') - if attr['is_external'] or attr['is_attachment']: - attr['comment'] = ':'.join(split_comment[2:]) - attr['type'] = split_comment[1] - - attr['nullable'] = (attr['nullable'] == 'YES') attr['in_key'] = (attr['in_key'] == 'PRI') + attr['database'] = database + attr['nullable'] = (attr['nullable'] == 'YES') attr['autoincrement'] = bool(re.search(r'auto_increment', attr['Extra'], flags=re.IGNORECASE)) - attr['type'] = re.sub(r'int\(\d+\)', 'int', attr['type'], count=1) # strip size off integers + attr['type'] = re.sub(r'int\(\d+\)', 'int', attr['type'], count=1) # strip size off integers attr['numeric'] = bool(re.match(r'(tiny|small|medium|big)?int|decimal|double|float', attr['type'])) attr['string'] = 
bool(re.match(r'(var)?char|enum|date|year|time|timestamp', attr['type']))
-            attr['is_blob'] = attr['is_external'] or bool(re.match(r'(tiny|medium|long)?blob', attr['type']))
-            attr['database'] = database
+            attr['is_blob'] = bool(re.match(r'(tiny|medium|long)?blob', attr['type']))
+
+            # recognize configurable fields
+            configurable_field = re.match(
+                r'^:(?P<type>(blob|external|attach)(-\w+)?):(?P<comment>.*)$', attr['comment'])
+            if configurable_field is None:
+                attr['is_external'] = False
+                attr['is_attachment'] = False
+            else:
+                # configurable fields: blob- and attach
+                if attr['in_key']:
+                    raise DataJointError('Configurable store attributes are not allowed in the primary key')
+                attr['comment'] = configurable_field.group('comment')
+                attr['is_external'] = not attr['is_blob']
+                attr['type'] = configurable_field.group('type')
+                attr['is_attachment'] = attr['type'].startswith('attach')
+                attr['is_blob'] = attr['type'].startswith(('blob', 'external'))
+                attr['string'] = False
+                if not attr['is_external']:
+                    print('aha!')
 
         if attr['string'] and attr['default'] is not None and attr['default'] not in sql_literals:
             attr['default'] = '"%s"' % attr['default']

From 33505161f98097a722b1fbb5b31fed26d440acfb Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Sun, 2 Dec 2018 20:57:54 -0600
Subject: [PATCH 05/36] implement declaration of configurable attributes: blob- and attach.

---
 datajoint/declare.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/datajoint/declare.py b/datajoint/declare.py
index 773888253..2ba6a7002 100644
--- a/datajoint/declare.py
+++ b/datajoint/declare.py
@@ -13,6 +13,7 @@
 STORE_HASH_LENGTH = 43
 HASH_DATA_TYPE = 'char(51)'
 MAX_TABLE_NAME_LENGTH = 64
+DEFAULT_PROTOCOL = 'LONGBLOB'   # for a configurable field
 
 logger = logging.getLogger(__name__)
 
@@ -291,17 +292,14 @@ def compile_attribute(line, in_key, foreign_key_sql):
             match['default'] = 'NOT NULL'
     match['comment'] = match['comment'].replace('"', '\\"')   # escape double quotes in comment
 
-    is_external = match['type'].startswith('external')
-    is_attachment = match['type'].startswith('attachment')
-    if not is_external:
-        sql = ('`{name}` {type} {default}' + (' COMMENT "{comment}"' if match['comment'] else '')).format(**match)
-    else:
-        # process externally stored attribute
+    is_configurable = match['type'].startswith(('external', 'blob-', 'attach'))
+    is_external = False
+    if is_configurable:
         if in_key:
-            raise DataJointError('External attributes cannot be primary in:\n%s' % line)
+            raise DataJointError('Configurable attributes cannot be primary in:\n%s' % line)
         store_name = match['type'].split('-')
-        if store_name[0] != 'external':
-            raise DataJointError('External store types must be specified as "external" or "external-"')
+        if store_name[0] not in ('external', 'blob', 'attach'):
+            raise DataJointError('Invalid configurable attribute name in:\n%s' % line)
         store_name = '-'.join(store_name[1:])
         if store_name != '' and not store_name.isidentifier():
             raise DataJointError(
                 'The external store name `{type}` is invalid. Make like a python identifier.'.format(**match))
         if len(store_name) > STORE_NAME_LENGTH:
             raise DataJointError(
                 'The external store name `{type}` is too long. 
Must be <={max_len} characters.'.format( max_len=STORE_NAME_LENGTH, **match)) if not match['default'] in ('DEFAULT NULL', 'NOT NULL'): - raise DataJointError('The only acceptable default value for an external field is null in:\n%s' % line) + raise DataJointError('The default value for a blob or attachment field, if any, can only be NULL in:\n%s' % line) if match['type'] not in config: raise DataJointError('The external store `{type}` is not configured.'.format(**match)) + match['comment'] = ':'.join(('', match['type'], match['comment'])) + protocol = config[match['type']].get('protocol', DEFAULT_PROTOCOL).lower() + is_external = protocol in {'s3', 'file'} + if not is_external: + if re.match(r'(long|medium|tiny)?blob', protocol): + match['type'] = protocol + if not is_external: + sql = ('`{name}` {type} {default}' + (' COMMENT "{comment}"' if match['comment'] else '')).format(**match) + else: # append external configuration name to the end of the comment sql = '`{name}` {hash_type} {default} COMMENT ":{type}:{comment}"'.format( hash_type=HASH_DATA_TYPE, **match) From acc07fb1f99c9617ab125a8cb63b56830d6c3655 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 2 Dec 2018 21:22:05 -0600 Subject: [PATCH 06/36] prepare for saving attachments --- datajoint/external.py | 6 ++---- datajoint/fetch.py | 4 ++-- datajoint/table.py | 2 +- tests/test_external.py | 7 ++++--- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 2978681ac..ef574dce9 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -3,7 +3,6 @@ from .settings import config from .errors import DataJointError from .hash import long_hash -from .blob import pack, unpack from .table import Table from .declare import STORE_HASH_LENGTH, HASH_DATA_TYPE from .s3 import Folder as S3Folder @@ -42,12 +41,11 @@ def definition(self): def table_name(self): return '~external' - def put(self, store, obj): + def put(self, store, blob): """ put an object in external store """ spec = self._get_store_spec(store) - blob = pack(obj) blob_hash = long_hash(blob) + store[len('external-'):] if spec['protocol'] == 'file': folder = os.path.join(spec['location'], self.database) @@ -115,7 +113,7 @@ def get(self, blob_hash): os.makedirs(cache_folder) safe_write(os.path.join(cache_folder, blob_hash), blob) - return unpack(blob) + return blob @property def references(self): diff --git a/datajoint/fetch.py b/datajoint/fetch.py index 21c76aa62..a79345fbc 100644 --- a/datajoint/fetch.py +++ b/datajoint/fetch.py @@ -75,7 +75,7 @@ def __call__(self, *attrs, offset=None, limit=None, order_by=None, as_dict=False for name in heading: if heading[name].is_external: external_table = self._expression.connection.schemas[heading[name].database].external_table - ret[name] = list(map(external_table.get, ret[name])) + ret[name] = list(map(unpack, map(external_table.get, ret[name]))) elif heading[name].is_blob: ret[name] = list(map(partial(unpack, squeeze=squeeze), ret[name])) else: # if list of attributes provided @@ -134,7 +134,7 @@ def __call__(self, *attrs, squeeze=False): raise DataJointError('fetch1 should only be used for relations with exactly one tuple') def get_external(attr, _hash): - return self._expression.connection.schemas[attr.database].external_table.get(_hash) + return unpack(self._expression.connection.schemas[attr.database].external_table.get(_hash)) ret = OrderedDict((name, get_external(heading[name], ret[name])) if heading[name].is_external else (name, unpack(ret[name], squeeze=squeeze) 
if heading[name].is_blob else ret[name])
                               for name in heading.names)
         else:  # fetch some attributes, return as tuple
             attributes = [a for a in attrs if not is_key(a)]
diff --git a/datajoint/table.py b/datajoint/table.py
index beda9003e..e113ffc0d 100644
--- a/datajoint/table.py
+++ b/datajoint/table.py
@@ -220,7 +220,7 @@ def make_placeholder(name, value):
             if ignore_extra_fields and name not in heading:
                 return None
             if heading[name].is_external:
-                placeholder, value = '%s', self.external_table.put(heading[name].type, value)
+                placeholder, value = '%s', self.external_table.put(heading[name].type, pack(value))
             elif heading[name].is_blob:
                 if value is None:
                     placeholder, value = 'NULL', None
                 else:
                     placeholder, value = '%s', pack(value)
diff --git a/tests/test_external.py b/tests/test_external.py
index 2c1583e62..450aab4d5 100644
--- a/tests/test_external.py
+++ b/tests/test_external.py
@@ -2,6 +2,7 @@
 from numpy.testing import assert_array_equal
 from nose.tools import assert_true, assert_equal
 from datajoint.external import ExternalTable
+from datajoint.blob import pack, unpack
 
 from . schema_external import schema
 
@@ -15,13 +16,13 @@ def test_external_put():
     count = 7
     extra = 3
     for i in range(count):
-        hash1 = ext.put('external-raw', input_)
+        hash1 = ext.put('external-raw', pack(input_))
     for i in range(extra):
-        hash2 = ext.put('external-raw', np.random.randn(4, 3, 2))
+        hash2 = ext.put('external-raw', pack(np.random.randn(4, 3, 2)))
 
     fetched_hashes = ext.fetch('hash')
     assert_true(all(hash in fetched_hashes for hash in (hash1, hash2)))
     assert_equal(len(ext), 1 + extra)
 
-    output_ = ext.get(hash1)
+    output_ = unpack(ext.get(hash1))
     assert_array_equal(input_, output_)

From c11bbbf99a20adf2de8fb41a4ca39752ead1dbc7 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Sun, 2 Dec 2018 22:17:47 -0600
Subject: [PATCH 07/36] add attach.py for saving and loading attachments

---
 datajoint/attach.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 datajoint/attach.py

diff --git a/datajoint/attach.py b/datajoint/attach.py
new file mode 100644
index 000000000..d130f4227
--- /dev/null
+++ b/datajoint/attach.py
@@ -0,0 +1,29 @@
+"""
+functionality for attaching files
+"""
+from os import path
+from itertools import count
+
+
+def load(file_path):
+    with open(file_path, mode='rb') as f:  # b is important -> binary
+        contents = f.read()
+    return str.encode(path.basename(file_path)) + b'\0' + contents
+
+
+def save(buffer, save_path='.'):
+    p = buffer.find(b'\0')
+    file_path = path.abspath(path.join(save_path, buffer[:p].decode()))
+
+    if path.isfile(file_path):
+        # generate a new filename
+        split_name = path.splitext(file_path)
+        for n in count():
+            file_path = '%s_%04u%s' % (split_name[0], n, split_name[1])
+            if not path.isfile(file_path):
+                break
+
+    with open(file_path, mode='wb') as f:
+        f.write(buffer[p+1:])
+
+    return file_path
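For orientation before the insert-path changes that follow: load() packs a file into a single buffer as basename + b'\0' + contents, and save() reverses that split, renaming on collision. A minimal round-trip sketch (the paths are illustrative and assumed to exist):

    from datajoint import attach

    buffer = attach.load('/tmp/report.pdf')         # b'report.pdf\x00<contents>'
    restored = attach.save(buffer, '/tmp/restore')  # writes /tmp/restore/report.pdf
    # if report.pdf already exists there, save() writes report_0000.pdf instead

From 390b0b72a15b62e5c86e17848ec7fa16dcc64072 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Sun, 2 Dec 2018 22:36:34 -0600
Subject: [PATCH 08/36] implement inserting attachments

---
 datajoint/table.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/datajoint/table.py b/datajoint/table.py
index e113ffc0d..1bbcf5de9 100644
--- a/datajoint/table.py
+++ b/datajoint/table.py
@@ -10,7 +10,7 @@
 from .settings import config
 from .declare import declare
 from .expression import QueryExpression
-from .blob import pack
+from . 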
import attach, blob from .utils import user_choice from .heading import Heading from .errors import server_error_codes, DataJointError, DuplicateError @@ -219,14 +219,22 @@ def make_placeholder(name, value): """ if ignore_extra_fields and name not in heading: return None - if heading[name].is_external: - placeholder, value = '%s', self.external_table.put(heading[name].type, pack(value)) - elif heading[name].is_blob: + attr = heading[name] + if attr.is_blob: if value is None: placeholder, value = 'NULL', None else: - placeholder, value = '%s', pack(value) - elif heading[name].numeric: + placeholder = '%s' + value = blob.pack(value) + value = self.external_table.put(attr.type, value) if attr.is_external else value + elif attr.is_attachment: + if value is None: + placeholder, value = 'NULL', None + else: + placeholder = '%s' + value = attach.load(value) + value = self.external_table.put(attr.type, value) if attr.is_external else value + elif attr.numeric: if value is None or value == '' or np.isnan(np.float(value)): # nans are turned into NULLs placeholder, value = 'NULL', None else: @@ -550,7 +558,7 @@ def _update(self, attrname, value=None): attr = self.heading[attrname] if attr.is_blob: - value = pack(value) + value = blob.pack(value) placeholder = '%s' elif attr.numeric: if value is None or np.isnan(np.float(value)): # nans are turned into NULLs From 2c04d74df2ec1c6737cc019bb8f6adb0d71a2f37 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 2 Dec 2018 23:21:29 -0600 Subject: [PATCH 09/36] implement fetch of attachments and configurable blobs --- datajoint/fetch.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/datajoint/fetch.py b/datajoint/fetch.py index a79345fbc..b7f1c12ce 100644 --- a/datajoint/fetch.py +++ b/datajoint/fetch.py @@ -1,7 +1,7 @@ from collections import OrderedDict from functools import partial import numpy as np -from .blob import unpack +from . import blob, attach from .errors import DataJointError import warnings @@ -24,6 +24,23 @@ def to_dicts(recarray): yield dict(zip(recarray.dtype.names, rec.tolist())) +def _get(connection, attr, data, squeeze): + """ + :param connection: + :param attr: an attribute from the heading + :param data: literal value fetched from the table + :param squeeze: if True squeeze blobs + :return: unpacked data + """ + if attr.is_external: + data = connection.schemas[attr.database].external_table.get(data) + if attr.is_blob: + return blob.unpack(data, squeeze=squeeze) + if attr.is_attachment: + return attach.save(data) + return data + + class Fetch: """ A fetch object that handles retrieving elements from the table expression. 
@@ -61,23 +78,18 @@ def __call__(self, *attrs, offset=None, limit=None, order_by=None, as_dict=False 'Consider setting a limit explicitly.') limit = 2 * len(self._expression) + get = partial(_get, self._expression.connection, squeeze=squeeze) if not attrs: # fetch all attributes cur = self._expression.cursor(as_dict=as_dict, limit=limit, offset=offset, order_by=order_by) heading = self._expression.heading if as_dict: - ret = [OrderedDict((name, unpack(d[name], squeeze=squeeze) if heading[name].is_blob else d[name]) - for name in heading.names) - for d in cur] + ret = [OrderedDict((name, get(heading[name], d[name])) for name in heading.names) for d in cur] else: ret = list(cur.fetchall()) ret = np.array(ret, dtype=heading.as_dtype) for name in heading: - if heading[name].is_external: - external_table = self._expression.connection.schemas[heading[name].database].external_table - ret[name] = list(map(unpack, map(external_table.get, ret[name]))) - elif heading[name].is_blob: - ret[name] = list(map(partial(unpack, squeeze=squeeze), ret[name])) + ret[name] = list(map(partial(get, heading[name]), ret[name])) else: # if list of attributes provided attributes = [a for a in attrs if not is_key(a)] result = self._expression.proj(*attributes).fetch( @@ -109,6 +121,7 @@ class Fetch1: def __init__(self, relation): self._expression = relation + def __call__(self, *attrs, squeeze=False): """ Fetches the expression results from the database when the expression is known to yield only one entry. @@ -132,12 +145,7 @@ def __call__(self, *attrs, squeeze=False): ret = cur.fetchone() if not ret or cur.fetchone(): raise DataJointError('fetch1 should only be used for relations with exactly one tuple') - - def get_external(attr, _hash): - return unpack(self._expression.connection.schemas[attr.database].external_table.get(_hash)) - - ret = OrderedDict((name, get_external(heading[name], ret[name])) if heading[name].is_external - else (name, unpack(ret[name], squeeze=squeeze) if heading[name].is_blob else ret[name]) + ret = OrderedDict((name, _get(self._expression.connection, heading[name], ret[name], squeeze=squeeze)) for name in heading.names) else: # fetch some attributes, return as tuple attributes = [a for a in attrs if not is_key(a)] From 43e9c769bf0d91e4a3dbc93ee170733bbd462c5b Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 2 Dec 2018 23:50:51 -0600 Subject: [PATCH 10/36] fix issue #467 --- datajoint/table.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/datajoint/table.py b/datajoint/table.py index 1bbcf5de9..a0d304846 100644 --- a/datajoint/table.py +++ b/datajoint/table.py @@ -214,33 +214,27 @@ def make_placeholder(name, value): For a given attribute `name` with `value`, return its processed value or value placeholder as a string to be included in the query and the value, if any, to be submitted for processing by mysql API. 
- :param name: - :param value: + :param name: name of attribute to be inserted + :param value: value of attribute to be inserted """ if ignore_extra_fields and name not in heading: return None attr = heading[name] - if attr.is_blob: - if value is None: - placeholder, value = 'NULL', None - else: - placeholder = '%s' + if value is None: + placeholder, value = 'NULL', None + else: + placeholder = '%s' + if attr.is_blob: value = blob.pack(value) value = self.external_table.put(attr.type, value) if attr.is_external else value - elif attr.is_attachment: - if value is None: - placeholder, value = 'NULL', None - else: - placeholder = '%s' + elif attr.is_attachment: value = attach.load(value) value = self.external_table.put(attr.type, value) if attr.is_external else value - elif attr.numeric: - if value is None or value == '' or np.isnan(np.float(value)): # nans are turned into NULLs - placeholder, value = 'NULL', None - else: - placeholder, value = '%s', (str(int(value) if isinstance(value, bool) else value)) - else: - placeholder = '%s' + elif attr.numeric: + if value == '' or np.isnan(np.float(value)): # nans are turned into NULLs + placeholder, value = 'NULL', None + else: + value = str(int(value) if isinstance(value, bool) else value) return name, placeholder, value def check_fields(fields): From 51336ff06e268fadb782f1bd9d7d7160366bdbaa Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 3 Dec 2018 13:33:23 -0600 Subject: [PATCH 11/36] further cleanup of __init__.py --- datajoint/__init__.py | 20 ++++++++++---------- datajoint/version.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/datajoint/__init__.py b/datajoint/__init__.py index 33e303778..3352982e8 100644 --- a/datajoint/__init__.py +++ b/datajoint/__init__.py @@ -15,26 +15,26 @@ """ __author__ = "Dimitri Yatsenko, Edgar Y. Walker, and Fabian Sinz at Baylor College of Medicine" -__date__ = "Nov 15, 2018" +__date__ = "Dec 4, 2018" __all__ = ['__author__', '__version__', - 'config', 'conn', 'kill', 'Table', - 'Connection', 'Heading', 'FreeTable', 'Not', 'schema', + 'config', 'conn', 'Connection', + 'schema', 'create_virtual_module', + 'Table', 'FreeTable', 'Manual', 'Lookup', 'Imported', 'Computed', 'Part', - 'AndList', 'ERD', 'U', 'key', - 'DataJointError', 'DuplicateError', - 'set_password', 'create_virtual_module'] + 'Not', 'AndList', 'U', 'ERD', + 'set_password', 'kill', + 'DataJointError', 'DuplicateError', 'key'] # ------------- flatten import hierarchy ------------------------- from .version import __version__ from .settings import config from .connection import conn, Connection -from .table import FreeTable, Table -from .user_tables import Manual, Lookup, Imported, Computed, Part -from .expression import Not, AndList, U -from .heading import Heading from .schema import Schema as schema from .schema import create_virtual_module +from .table import Table, FreeTable +from .user_tables import Manual, Lookup, Imported, Computed, Part +from .expression import Not, AndList, U from .erd import ERD from .admin import set_password, kill from .errors import DataJointError, DuplicateError diff --git a/datajoint/version.py b/datajoint/version.py index fee46bd8c..ea370a8e5 100644 --- a/datajoint/version.py +++ b/datajoint/version.py @@ -1 +1 @@ -__version__ = "0.11.1" +__version__ = "0.12.0" From cee588eea40a5a6da2acde644a62d56b352d0d5c Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 3 Dec 2018 13:37:16 -0600 Subject: [PATCH 12/36] Use DEFAULT instead of NULL when the insert value is None. 
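For illustration (the table is hypothetical), a None value or an omitted attribute now falls back to the column's declared default instead of forcing an explicit NULL:

    import datajoint as dj
    schema = dj.schema('test_defaults')

    @schema
    class Session(dj.Manual):
        definition = """
        session : int
        ---
        operator = "anon" : varchar(30)
        rating = null     : float
        """

    # `operator` is omitted and takes its declared default;
    # `rating` is None and now compiles to the DEFAULT placeholder (NULL here)
    Session.insert1(dict(session=1, rating=None))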
--- datajoint/external.py | 8 ++++---- datajoint/table.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index ef574dce9..b7cd381b2 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -5,7 +5,7 @@ from .hash import long_hash from .table import Table from .declare import STORE_HASH_LENGTH, HASH_DATA_TYPE -from .s3 import Folder as S3Folder +from . import s3 from .utils import safe_write @@ -57,7 +57,7 @@ def put(self, store, blob): os.makedirs(folder) safe_write(full_path, blob) elif spec['protocol'] == 's3': - S3Folder(database=self.database, **spec).put(blob_hash, blob) + s3.Folder(database=self.database, **spec).put(blob_hash, blob) else: raise DataJointError('Unknown external storage protocol {protocol} for {store}'.format( store=store, protocol=spec['protocol'])) @@ -102,7 +102,7 @@ def get(self, blob_hash): raise DataJointError('Lost access to external blob %s.' % full_path) from None elif spec['protocol'] == 's3': try: - blob = S3Folder(database=self.database, **spec).get(blob_hash) + blob = s3.Folder(database=self.database, **spec).get(blob_hash) except TypeError: raise DataJointError('External store {store} configuration is incomplete.'.format(store=store)) else: @@ -169,7 +169,7 @@ def clean_store(self, store, display_progress=True): os.remove(os.path.join(folder, f)) elif spec['protocol'] == 's3': try: - S3Folder(database=self.database, **spec).clean(self.fetch('hash')) + s3.Folder(database=self.database, **spec).clean(self.fetch('hash')) except TypeError: raise DataJointError('External store {store} configuration is incomplete.'.format(store=store)) diff --git a/datajoint/table.py b/datajoint/table.py index a0d304846..b870491e4 100644 --- a/datajoint/table.py +++ b/datajoint/table.py @@ -221,7 +221,7 @@ def make_placeholder(name, value): return None attr = heading[name] if value is None: - placeholder, value = 'NULL', None + placeholder, value = 'DEFAULT', None else: placeholder = '%s' if attr.is_blob: @@ -232,7 +232,7 @@ def make_placeholder(name, value): value = self.external_table.put(attr.type, value) if attr.is_external else value elif attr.numeric: if value == '' or np.isnan(np.float(value)): # nans are turned into NULLs - placeholder, value = 'NULL', None + placeholder, value = 'DEFAULT', None else: value = str(int(value) if isinstance(value, bool) else value) return name, placeholder, value From c04f974421f50f419bf884516821969f6258993c Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 3 Dec 2018 13:41:13 -0600 Subject: [PATCH 13/36] slight refactor of Table.insert --- datajoint/table.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/datajoint/table.py b/datajoint/table.py index b870491e4..929086a9b 100644 --- a/datajoint/table.py +++ b/datajoint/table.py @@ -220,7 +220,7 @@ def make_placeholder(name, value): if ignore_extra_fields and name not in heading: return None attr = heading[name] - if value is None: + if value is None or (attr.is_numeric and value == '' or np.isnan(np.float(value))): placeholder, value = 'DEFAULT', None else: placeholder = '%s' @@ -231,10 +231,7 @@ def make_placeholder(name, value): value = attach.load(value) value = self.external_table.put(attr.type, value) if attr.is_external else value elif attr.numeric: - if value == '' or np.isnan(np.float(value)): # nans are turned into NULLs - placeholder, value = 'DEFAULT', None - else: - value = str(int(value) if isinstance(value, bool) else value) + value = str(int(value) if 
isinstance(value, bool) else value)
             return name, placeholder, value
 
         def check_fields(fields):

From 9de4782ed5db693f2111634427867101ca4deb01 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 3 Dec 2018 15:36:40 -0600
Subject: [PATCH 14/36] fix for error introduced in previous commit

---
 datajoint/table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datajoint/table.py b/datajoint/table.py
index 929086a9b..5d2858d5b 100644
--- a/datajoint/table.py
+++ b/datajoint/table.py
@@ -220,7 +220,7 @@ def make_placeholder(name, value):
             if ignore_extra_fields and name not in heading:
                 return None
             attr = heading[name]
-            if value is None or (attr.is_numeric and value == '' or np.isnan(np.float(value))):
+            if value is None or (attr.numeric and (value == '' or np.isnan(np.float(value)))):
                 placeholder, value = 'DEFAULT', None
             else:
                 placeholder = '%s'

From f26567428f5d9bb1ad451354fbc51787dc26cfce Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Fri, 7 Dec 2018 15:44:15 -0600
Subject: [PATCH 15/36] implement external file folding

---
 datajoint/external.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/datajoint/external.py b/datajoint/external.py
index b7cd381b2..340cb55f3 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -8,6 +8,8 @@
 from . import s3
 from .utils import safe_write
 
+DEFAULT_FOLDING = (2, 2)
+
 
 class ExternalTable(Table):
     """
@@ -46,7 +48,7 @@ def put(self, store, blob):
         """
         put an object in external store
         """
         spec = self._get_store_spec(store)
-        blob_hash = long_hash(blob) + store[len('external-'):]
+        blob_hash = long_hash(blob) + ''.join(store.split('-')[1:])
         if spec['protocol'] == 'file':
             folder = os.path.join(spec['location'], self.database)
             full_path = os.path.join(folder, blob_hash)
@@ -161,22 +163,24 @@ def clean_store(self, store, display_progress=True):
         """
         spec = self._get_store_spec(store)
         progress = tqdm if display_progress else lambda x: x
+        in_use = set(self.fetch('hash'))
         if spec['protocol'] == 'file':
-            folder = os.path.join(spec['location'], self.database)
-            delete_list = set(os.listdir(folder)).difference(self.fetch('hash'))
-            print('Deleting %d unused items from %s' % (len(delete_list), folder), flush=True)
-            for f in progress(delete_list):
-                os.remove(os.path.join(folder, f))
+            for folder, _, files in progress(os.walk(os.path.join(spec['location'], self.database))):
+                for f in files:
+                    if f not in in_use:
+                        filename = os.path.join(folder, f)
+                        os.remove(filename)
         elif spec['protocol'] == 's3':
             try:
-                s3.Folder(database=self.database, **spec).clean(self.fetch('hash'))
+                s3.Folder(database=self.database, **spec).clean(in_use)
             except TypeError:
                 raise DataJointError('External store {store} configuration is incomplete.'.format(store=store))
 
     @staticmethod
     def _get_store_spec(store):
+        store = '-' + store.lstrip('-')
         try:
-            spec = config[store]
+            spec = config['stores'][store]
         except KeyError:
             raise DataJointError('Storage {store} is requested but not configured'.format(store=store)) from None
         if 'protocol' not in spec:
             raise DataJointError('Storage {store} config is missing the protocol field'.format(store=store))
         if spec['protocol'] not in {'file', 's3'}:
             raise DataJointError(
                 'Unknown external storage protocol "{protocol}" in "{store}"'.format(store=store, **spec))
+        if spec['protocol'] == 'file':
+            spec['folding'] = spec.get('folding', DEFAULT_FOLDING)
         return spec
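The folding spec distributes hashed filenames across nested subfolders so that no single directory accumulates an unbounded number of files. A standalone sketch of the idea (the helper below is illustrative only, not part of this commit):

    from os import path

    def fold(name, folds=(2, 2)):
        # use the first 2 and the next 2 characters of the hash as folder names
        parts, rest = [], name
        for n in folds:
            parts.append(rest[:n])
            rest = rest[n:]
        return path.join(*parts, name)

    fold('f9a8c3d41b')   # -> 'f9/a8/f9a8c3d41b'

From 50f17ce50a49959f803fce0c4dd27a510f68c667 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Tue, 11 Dec 2018 10:06:44 -0600
Subject: [PATCH 16/36] remove the `keys` property 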
from `fetch` (a warning was displayed in several previous releases) --- datajoint/fetch.py | 4 ++-- tests/test_fetch.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datajoint/fetch.py b/datajoint/fetch.py index 8318d64e3..a90a2e677 100644 --- a/datajoint/fetch.py +++ b/datajoint/fetch.py @@ -24,7 +24,7 @@ def is_key(attr): def to_dicts(recarray): """convert record array to a dictionaries""" for rec in recarray: - yield dict(zip(recarray.dtype.names, rec.tolist())) + yield OrderedDict(zip(recarray.dtype.names, rec.tolist())) def _get(connection, attr, data, squeeze): @@ -46,7 +46,7 @@ def _get(connection, attr, data, squeeze): def _flatten_attribute_list(primary_key, attr): for a in attr: - if re.match(r'^\s*KEY\s+(ASC\s*)?$', a): + if re.match(r'^\s*KEY(\s+ASC)?\s*$', a): yield from primary_key elif re.match(r'^\s*KEY\s+DESC\s*$', a): yield from (q + ' DESC' for q in primary_key) diff --git a/tests/test_fetch.py b/tests/test_fetch.py index 719ac0543..3bf607772 100644 --- a/tests/test_fetch.py +++ b/tests/test_fetch.py @@ -118,13 +118,13 @@ def test_iter(self): assert_true(row['name'] == tname and row['language'] == tlang, 'Values are not the same') def test_keys(self): - """test key iterator""" + """test key fetch""" languages = schema.Language.contents languages.sort(key=itemgetter(0), reverse=True) languages.sort(key=itemgetter(1), reverse=False) cur = self.lang.fetch('name', 'language', order_by=('language', 'name DESC')) - cur2 = list(self.lang.fetch.keys(order_by=['language', 'name DESC'])) + cur2 = list(self.lang.fetch("KEY", order_by=['language', 'name DESC'])) for c, c2 in zip(zip(*cur), cur2): assert_true(c == tuple(c2.values()), 'Values are not the same') From f141aa77b5e423c33f7867b4ce178e8a8fce97c1 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Tue, 11 Dec 2018 13:24:35 -0600 Subject: [PATCH 17/36] add `dj.get_schema_names()` --- datajoint/__init__.py | 4 ++-- datajoint/schema.py | 6 ++++++ tests/test_fetch.py | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/datajoint/__init__.py b/datajoint/__init__.py index 3352982e8..02881ac81 100644 --- a/datajoint/__init__.py +++ b/datajoint/__init__.py @@ -18,7 +18,7 @@ __date__ = "Dec 4, 2018" __all__ = ['__author__', '__version__', 'config', 'conn', 'Connection', - 'schema', 'create_virtual_module', + 'schema', 'create_virtual_module', 'get_schema_names', 'Table', 'FreeTable', 'Manual', 'Lookup', 'Imported', 'Computed', 'Part', 'Not', 'AndList', 'U', 'ERD', @@ -31,7 +31,7 @@ from .settings import config from .connection import conn, Connection from .schema import Schema as schema -from .schema import create_virtual_module +from .schema import create_virtual_module, get_schema_names from .table import Table, FreeTable from .user_tables import Manual, Lookup, Imported, Computed, Part from .expression import Not, AndList, U diff --git a/datajoint/schema.py b/datajoint/schema.py index 5f50c9d9a..d7fc14462 100644 --- a/datajoint/schema.py +++ b/datajoint/schema.py @@ -254,3 +254,9 @@ def create_virtual_module(module_name, schema_name, create_schema=False, create_ _schema.spawn_missing_classes(context=module.__dict__) module.__dict__['schema'] = _schema return module + + +def get_schema_names(connection=None): + if connection is None: + connection = conn() + return [r[0] for r in connection.query('SHOW SCHEMAS').fetchall() if r[0] not in {'information_schema'}] \ No newline at end of file diff --git a/tests/test_fetch.py b/tests/test_fetch.py index 3bf607772..d701afd08 100644 --- 
a/tests/test_fetch.py
+++ b/tests/test_fetch.py
@@ -81,6 +81,7 @@ def test_head_tail():
     query = schema.User * schema.Language
     n = 5
     frame = query.head(n, format='frame')
+    assert_true(isinstance(frame, pandas.DataFrame))
     array = query.head(n, format='array')
     assert_equal(array.size, n)
     assert_equal(len(frame), n)

From 6310c7de824611b91fe70fcee692af2dea4fbdcd Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Thu, 13 Dec 2018 12:00:38 -0600
Subject: [PATCH 18/36] stylistic improvements

---
 datajoint/declare.py | 10 +++++-----
 datajoint/schema.py  |  8 +++++++-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/datajoint/declare.py b/datajoint/declare.py
index 2992e2c2a..ec6bef1e5 100644
--- a/datajoint/declare.py
+++ b/datajoint/declare.py
@@ -198,7 +198,6 @@ def declare(full_table_name, definition, context):
     :param definition: DataJoint table definition
     :param context: dictionary of objects that might be referred to in the table.
     """
-
     table_name = full_table_name.strip('`').split('.')[1]
     if len(table_name) > MAX_TABLE_NAME_LENGTH:
         raise DataJointError(
@@ -272,10 +271,11 @@ def compile_attribute(line, in_key, foreign_key_sql):
         match['default'] = ''
     match = {k: v.strip() for k, v in match.items()}
     match['nullable'] = match['default'].lower() == 'null'
-    accepted_datatype = r'^(time|date|year|enum|(var)?char|float|real|double|decimal|numeric|' \
-                        r'(tiny|small|medium|big)?int|bool|' \
-                        r'(tiny|small|medium|long)?blob|external|attach)'
-    if re.match(accepted_datatype, match['type']) is None:
+    accepted_datatype = (
+        r'time|date|year|enum|(var)?char|float|real|double|decimal|numeric|'
+        r'(tiny|small|medium|big)?int|bool|'
+        r'(tiny|small|medium|long)?blob|external|attach')
+    if re.match(accepted_datatype, match['type'], re.I) is None:
         raise DataJointError('DataJoint does not support datatype "{type}"'.format(**match))
 
     literals = ['CURRENT_TIMESTAMP']   # not to be enclosed in quotes
diff --git a/datajoint/schema.py b/datajoint/schema.py
index d7fc14462..5a080da39 100644
--- a/datajoint/schema.py
+++ b/datajoint/schema.py
@@ -257,6 +257,12 @@ def create_virtual_module(module_name, schema_name, create_schema=False, create_
 
 
 def get_schema_names(connection=None):
+    """
+    :param connection: a dj.Connection object
+    :return: a generator of all accessible schemas on the server
+    """
     if connection is None:
         connection = conn()
-    return [r[0] for r in connection.query('SHOW SCHEMAS').fetchall() if r[0] not in {'information_schema'}]
\ No newline at end of file
+    for r in connection.query('SHOW SCHEMAS'):
+        if r[0] not in {'information_schema'}:
+            yield r[0]
\ No newline at end of file
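A usage sketch for the helpers touched above (the schema names are hypothetical; get_schema_names() now yields lazily):

    import datajoint as dj

    for name in dj.get_schema_names():    # all schemas visible to this account
        print(name)

    # any schema found this way can be wrapped as a virtual module:
    lab = dj.create_virtual_module('lab', 'my_lab_schema')

From 173bf1dfb9f1d80a7dd4600426fde463327e6bfd Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 14 Jan 2019 15:19:45 -0600
Subject: [PATCH 19/36] complete implementation of attachments and configurable blobs with path folding

---
 datajoint/attach.py          |  6 ++--
 datajoint/declare.py         | 36 ++++++++++++------------
 datajoint/external.py        | 54 ++++++++++++++++--------------------
 datajoint/fetch.py           | 15 ++++++----
 datajoint/heading.py         |  4 +--
 datajoint/settings.py        | 24 +++++++++++++++-
 tests/test_external_class.py |  4 +--
 7 files changed, 82 insertions(+), 61 deletions(-)

diff --git a/datajoint/attach.py b/datajoint/attach.py
index d130f4227..21f178023 100644
--- a/datajoint/attach.py
+++ b/datajoint/attach.py
@@ -5,10 +5,10 @@
 from itertools import count
 
 
-def load(file_path):
-    with open(file_path, mode='rb') as f:  # b is important -> binary
+def load(local_path):
+    with open(local_path, mode='rb') as f:  # 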
b is important -> binary contents = f.read() - return str.encode(path.basename(file_path)) + b'\0' + contents + return str.encode(path.basename(local_path)) + b'\0' + contents def save(buffer, save_path='.'): diff --git a/datajoint/declare.py b/datajoint/declare.py index ec6bef1e5..982198732 100644 --- a/datajoint/declare.py +++ b/datajoint/declare.py @@ -13,7 +13,6 @@ STORE_HASH_LENGTH = 43 HASH_DATA_TYPE = 'char(51)' MAX_TABLE_NAME_LENGTH = 64 -DEFAULT_PROTOCOL = 'LONGBLOB' # for a configurable field logger = logging.getLogger(__name__) @@ -271,13 +270,13 @@ def compile_attribute(line, in_key, foreign_key_sql): match['default'] = '' match = {k: v.strip() for k, v in match.items()} match['nullable'] = match['default'].lower() == 'null' + blob_datatype = r'(tiny|small|medium|long)?blob' accepted_datatype = ( r'time|date|year|enum|(var)?char|float|real|double|decimal|numeric|' - r'(tiny|small|medium|big)?int|bool|' - r'(tiny|small|medium|long)?blob|external|attach') + r'(tiny|small|medium|big)?int|bool|external|attach|' + blob_datatype) if re.match(accepted_datatype, match['type'], re.I) is None: raise DataJointError('DataJoint does not support datatype "{type}"'.format(**match)) - + is_blob = bool(re.match(blob_datatype, match['type'], re.I)) literals = ['CURRENT_TIMESTAMP'] # not to be enclosed in quotes if match['nullable']: if in_key: @@ -297,33 +296,36 @@ def compile_attribute(line, in_key, foreign_key_sql): if is_configurable: if in_key: raise DataJointError('Configurable attributes cannot be primary in:\n%s' % line) + match['comment'] = ':{type}:{comment}'.format(**match) # insert configurable type into comment store_name = match['type'].split('-') if store_name[0] not in ('external', 'blob', 'attach'): - raise DataJointError('Invalid configurable attribute name in:\n%s' % line) + raise DataJointError('Invalid attribute type in:\n%s' % line) store_name = '-'.join(store_name[1:]) - if store_name != '' and not store_name.isidentifier(): + if store_name and not store_name.isidentifier(): raise DataJointError( 'The external store name `{type}` is invalid. Make like a python identifier.'.format(**match)) if len(store_name) > STORE_NAME_LENGTH: raise DataJointError( 'The external store name `{type}` is too long. 
Must be <={max_len} characters.'.format( max_len=STORE_NAME_LENGTH, **match)) - if not match['default'] in ('DEFAULT NULL', 'NOT NULL'): - raise DataJointError('The default value for a blob or attachment field, if any, can only be NULL in:\n%s' % line) - if match['type'] not in config: - raise DataJointError('The external store `{type}` is not configured.'.format(**match)) - match['comment'] = ':'.join(('', match['type'], match['comment'])) - protocol = config[match['type']].get('protocol', DEFAULT_PROTOCOL).lower() - is_external = protocol in {'s3', 'file'} + spec = config.get_store_spec(store_name) + is_external = spec['protocol'] in {'s3', 'file'} if not is_external: - if re.match(r'(long|medium|tiny)?blob', protocol): - match['type'] = protocol + is_blob = re.match(blob_datatype, spec['protocol'], re.I) + if not is_blob: + raise DataJointError('Invalid protocol {protocol} in external store in:\n{line}'.format( + line=line, **spec)) + match['type'] = spec['protocol'] + + if (is_external or is_blob) and match['default'] not in ('DEFAULT NULL', 'NOT NULL'): + raise DataJointError( + 'The default value for a blob or attachment can only be NULL in:\n%s' % line) if not is_external: sql = ('`{name}` {type} {default}' + (' COMMENT "{comment}"' if match['comment'] else '')).format(**match) else: - # append external configuration name to the end of the comment - sql = '`{name}` {hash_type} {default} COMMENT ":{type}:{comment}"'.format( + # add hash field with a dependency on the ~external table + sql = '`{name}` {hash_type} {default} COMMENT "{comment}"'.format( hash_type=HASH_DATA_TYPE, **match) foreign_key_sql.append( "FOREIGN KEY (`{name}`) REFERENCES {{external_table}} (`hash`) " diff --git a/datajoint/external.py b/datajoint/external.py index 340cb55f3..e36808630 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -8,7 +8,12 @@ from . import s3 from .utils import safe_write -DEFAULT_FOLDING = (2, 2) + +def subfold(name, folds): + """ + subfolding for external storage: e.g. 
subfold('abcdefg', (2, 3)) --> ['ab','cde'] + """ + return (name[:folds[0]], *subfold(name[folds[0]:], folds[1:])) if folds else () class ExternalTable(Table): @@ -47,10 +52,11 @@ def put(self, store, blob): """ put an object in external store """ - spec = self._get_store_spec(store) - blob_hash = long_hash(blob) + ''.join(store.split('-')[1:]) + store = ''.join(store.split('-')[1:]) + spec = config.get_store_spec(store) + blob_hash = long_hash(blob) + store if spec['protocol'] == 'file': - folder = os.path.join(spec['location'], self.database) + folder = os.path.join(spec['location'], self.database, *subfold(blob_hash, spec['subfolding'])) full_path = os.path.join(folder, blob_hash) if not os.path.isfile(full_path): try: @@ -59,9 +65,10 @@ def put(self, store, blob): os.makedirs(folder) safe_write(full_path, blob) elif spec['protocol'] == 's3': - s3.Folder(database=self.database, **spec).put(blob_hash, blob) + s3.Folder(database=self.database, **spec).put( + '/'.join((*subfold(blob_hash, spec['subfolding']), blob_hash)), blob) else: - raise DataJointError('Unknown external storage protocol {protocol} for {store}'.format( + raise DataJointError('Unknown external storage protocol {protocol} in store "-{store}"'.format( store=store, protocol=spec['protocol'])) # insert tracking info @@ -80,12 +87,10 @@ def get(self, blob_hash): """ if blob_hash is None: return None - store = blob_hash[STORE_HASH_LENGTH:] - store = 'external' + ('-' if store else '') + store - - cache_folder = config.get('cache', None) + # attempt to get object from cache blob = None + cache_folder = config.get('cache', None) if cache_folder: try: with open(os.path.join(cache_folder, blob_hash), 'rb') as f: @@ -93,10 +98,13 @@ def get(self, blob_hash): except FileNotFoundError: pass + # attempt to get object from store if blob is None: - spec = self._get_store_spec(store) + store = blob_hash[STORE_HASH_LENGTH:] + spec = config.get_store_spec(store) if spec['protocol'] == 'file': - full_path = os.path.join(spec['location'], self.database, blob_hash) + full_path = os.path.join( + spec['location'], self.database, *subfold(blob_hash, spec['subfolding']), blob_hash) try: with open(full_path, 'rb') as f: blob = f.read() @@ -104,7 +112,8 @@ def get(self, blob_hash): raise DataJointError('Lost access to external blob %s.' % full_path) from None elif spec['protocol'] == 's3': try: - blob = s3.Folder(database=self.database, **spec).get(blob_hash) + blob = s3.Folder(database=self.database, **spec).get( + '/'.join((*subfold(blob_hash, spec['subfolding']), blob_hash))) except TypeError: raise DataJointError('External store {store} configuration is incomplete.'.format(store=store)) else: @@ -161,14 +170,14 @@ def clean_store(self, store, display_progress=True): Clean unused data in an external storage repository from unused blobs. This must be performed after delete_garbage during low-usage periods to reduce risks of data loss. 
""" - spec = self._get_store_spec(store) + spec = config.get_store_spec(store) progress = tqdm if display_progress else lambda x: x in_use = set(self.fetch('hash')) if spec['protocol'] == 'file': for folder, _, files in progress(os.walk(os.path.join(spec['location'], self.database))): for f in files: if f not in in_use: - filename = os.join(folder, f) + filename = os.path.join(folder, f) os.remove(filename) elif spec['protocol'] == 's3': try: @@ -176,18 +185,3 @@ def clean_store(self, store, display_progress=True): except TypeError: raise DataJointError('External store {store} configuration is incomplete.'.format(store=store)) - @staticmethod - def _get_store_spec(store): - store = '-' + store.lstrip('-') - try: - spec = config['stores'][store] - except KeyError: - raise DataJointError('Storage {store} is requested but not configured'.format(store=store)) from None - if 'protocol' not in spec: - raise DataJointError('Storage {store} config is missing the protocol field'.format(store=store)) - if spec['protocol'] not in {'file', 's3'}: - raise DataJointError( - 'Unknown external storage protocol "{protocol}" in "{store}"'.format(store=store, **spec)) - if spec['protocol'] == 'file': - spec['folding'] = spec.get('folding', DEFAULT_FOLDING) - return spec diff --git a/datajoint/fetch.py b/datajoint/fetch.py index a90a2e677..ea40613d0 100644 --- a/datajoint/fetch.py +++ b/datajoint/fetch.py @@ -44,11 +44,16 @@ def _get(connection, attr, data, squeeze): return data -def _flatten_attribute_list(primary_key, attr): - for a in attr: - if re.match(r'^\s*KEY(\s+ASC)?\s*$', a): +def _flatten_attribute_list(primary_key, attrs): + """ + :param primary_key: list of attributes in primary key + :param attrs: list of attribute names, which may include "KEY", "KEY DESC" or "KEY ASC" + :return: generator of attributes where "KEY" is replaces with its component attributes + """ + for a in attrs: + if re.match(r'^\s*KEY(\s+[aA][Ss][Cc])?\s*$', a): yield from primary_key - elif re.match(r'^\s*KEY\s+DESC\s*$', a): + elif re.match(r'^\s*KEY\s+[Dd][Ee][Ss][Cc]\s*$', a): yield from (q + ' DESC' for q in primary_key) else: yield a @@ -57,7 +62,7 @@ def _flatten_attribute_list(primary_key, attr): class Fetch: """ A fetch object that handles retrieving elements from the table expression. - :param relation: the table expression to fetch from + :param expression: the table expression to fetch from. 
""" def __init__(self, expression): diff --git a/datajoint/heading.py b/datajoint/heading.py index 6450d7c93..138cbf1b1 100644 --- a/datajoint/heading.py +++ b/datajoint/heading.py @@ -211,8 +211,6 @@ def init_from_database(self, conn, database, table_name): attr['is_attachment'] = attr['type'].startswith('attach') attr['is_blob'] = attr['type'].startswith(('blob', 'external')) attr['string'] = False - if not attr['is_external']: - print('aha!') if attr['string'] and attr['default'] is not None and attr['default'] not in sql_literals: attr['default'] = '"%s"' % attr['default'] @@ -237,7 +235,7 @@ def init_from_database(self, conn, database, table_name): t = re.sub(r' unsigned$', '', t) # remove unsigned assert (t, is_unsigned) in numeric_types, 'dtype not found for type %s' % t attr['dtype'] = numeric_types[(t, is_unsigned)] - self.attributes = OrderedDict([(q['name'], Attribute(**q)) for q in attributes]) + self.attributes = OrderedDict(((q['name'], Attribute(**q)) for q in attributes)) # Read and tabulate secondary indexes keys = defaultdict(dict) diff --git a/datajoint/settings.py b/datajoint/settings.py index cb08bf4c5..45103525f 100644 --- a/datajoint/settings.py +++ b/datajoint/settings.py @@ -14,6 +14,9 @@ LOCALCONFIG = 'dj_local_conf.json' GLOBALCONFIG = '.datajoint_config.json' +DEFAULT_SUBFOLDING = (2, 2) # subfolding for external storage in filesystem. 2, 2 means that file abcdef is stored as /ab/cd/abcdef +DEFAULT_STORE_PROTOCOL = 'LONGBLOB' + validators = collections.defaultdict(lambda: lambda value: True) validators['database.port'] = lambda a: isinstance(a, int) @@ -124,6 +127,26 @@ def save_global(self, verbose=False): """ self.save(os.path.expanduser(os.path.join('~', GLOBALCONFIG)), verbose) + def get_store_spec(self, store): + """ + find configuration of blob and attachment stores + """ + # check new style + try: + spec = self['stores']['-' + store] + except KeyError: + # check old style + try: + spec = self['external' + ('-' + store if store else '')] + except KeyError: + raise DataJointError('Storage {store} is requested but not configured'.format(store=store)) + else: + spec['subfolding'] = spec.get('subfolding', ()) # old style external fields + else: + spec['subfolding'] = spec.get('subfolding', DEFAULT_SUBFOLDING) + spec['protocol'] = spec.get('protocol', DEFAULT_STORE_PROTOCOL) + return spec + @contextmanager def __call__(self, **kwargs): """ @@ -174,7 +197,6 @@ def __setitem__(self, key, value): # Load configuration from file - config = Config() config_files = (os.path.expanduser(n) for n in (LOCALCONFIG, os.path.join('~', GLOBALCONFIG))) try: diff --git a/tests/test_external_class.py b/tests/test_external_class.py index e1f9b9c97..61ed12111 100644 --- a/tests/test_external_class.py +++ b/tests/test_external_class.py @@ -37,14 +37,14 @@ def test_populate(): image = modu.Image() image.populate() remaining, total = image.progress() - image.external_table.clean_store('external-raw') + image.external_table.clean_store('raw') assert_true(total == len(modu.Dimension() * modu.Seed()) and remaining == 0) for img, neg, dimensions in zip(*(image * modu.Dimension()).fetch('img', 'neg', 'dimensions')): assert_list_equal(list(img.shape), list(dimensions)) assert_almost_equal(img, -neg) image.delete() image.external_table.delete_garbage() - image.external_table.clean_store('external-raw') + image.external_table.clean_store('raw') @raises(dj.DataJointError) From 173bf1dfb9f1d80a7dd4600426fde463327e6bfd Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 14 Jan 2019 
From 173bf1dfb9f1d80a7dd4600426fde463327e6bfd Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 14 Jan 2019 15:46:56 -0600
Subject: [PATCH 20/36] add test for configurable blobs

---
 datajoint/heading.py                |  2 +-
 tests/schema_external.py            | 19 ++++----
 tests/schema_legacy_external.py     | 74 +++++++++++++++++++++++++++++
 tests/test_external_class.py        |  2 +-
 tests/test_legacy_external.py       | 28 +++++++++++
 tests/test_legacy_external_class.py | 63 ++++++++++++++++++++++++
 6 files changed, 178 insertions(+), 10 deletions(-)
 create mode 100644 tests/schema_legacy_external.py
 create mode 100644 tests/test_legacy_external.py
 create mode 100644 tests/test_legacy_external_class.py

diff --git a/datajoint/heading.py b/datajoint/heading.py
index 138cbf1b1..6d2bf730c 100644
--- a/datajoint/heading.py
+++ b/datajoint/heading.py
@@ -197,7 +197,7 @@ def init_from_database(self, conn, database, table_name):
             # recognize configurable fields
             configurable_field = re.match(
-                r'^:(?P<type>(blob|external|attach)(-\w+)?):(?P<comment>.*)$', attr['comment'])
+                r'^:(?P<type>(blob|external|attach)(-\w*)?):(?P<comment>.*)$', attr['comment'])
diff --git a/tests/schema_external.py b/tests/schema_external.py
index cf9a0cc0c..93f6a5323 100644
--- a/tests/schema_external.py
+++ b/tests/schema_external.py
@@ -11,19 +11,22 @@
 schema = dj.schema(PREFIX + '_extern', connection=dj.conn(**CONN_INFO))


-dj.config['external'] = {
+dj.config['stores'] = {
+    '-': {
         'protocol': 'file',
-        'location': 'dj-store/external'}
+        'location': 'dj-store/external'
+    },

-dj.config['external-raw'] = {
+    '-raw': {
         'protocol': 'file',
-        'location': 'dj-store/raw'}
+        'location': 'dj-store/raw'},

-dj.config['external-compute'] = {
+    '-compute': {
         'protocol': 's3',
         'location': '/datajoint-projects/test',
         'user': 'djtest',
         'token': '2e05709792545ce'}
+}

 dj.config['cache'] = tempfile.mkdtemp('dj-cache')

@@ -33,7 +36,7 @@ class Simple(dj.Manual):
     definition = """
     simple  : int
     ---
-    item  : external-raw
+    item  : blob-
     """

@@ -64,8 +67,8 @@ class Image(dj.Computed):
     -> Seed
     -> Dimension
     ----
-    img : external-raw  # objects are stored as specified by dj.config['external-raw']
-    neg : external      # objects are stored as specified by dj.config['external']
+    img : blob-raw  # objects are stored as specified by dj.config['stores']['-raw']
+    neg : blob-     # objects are stored as specified by dj.config['stores']['-']
     """

     def make(self, key):
diff --git a/tests/schema_legacy_external.py b/tests/schema_legacy_external.py
new file mode 100644
index 000000000..a6969f6d4
--- /dev/null
+++ b/tests/schema_legacy_external.py
@@ -0,0 +1,74 @@
+"""
+a schema for testing external attributes using legacy syntax pre-version 0.12.0
+"""
+
+import tempfile
+import datajoint as dj
+
+from . import PREFIX, CONN_INFO
+import numpy as np
+
+schema = dj.schema(PREFIX + '_legacy_extern', connection=dj.conn(**CONN_INFO))
+
+
+dj.config['external'] = {
+    'protocol': 'file',
+    'location': 'dj-legacy/external'}
+
+dj.config['external-raw'] = {
+    'protocol': 'file',
+    'location': 'dj-legacy/raw'}
+
+dj.config['external-compute'] = {
+    'protocol': 's3',
+    'location': '/datajoint-legacy/test',
+    'user': 'djtest',
+    'token': '2e05709792545ce'}
+
+dj.config['cache'] = tempfile.mkdtemp('dj-legacy-cache')
+
+
+@schema
+class Simple(dj.Manual):
+    definition = """
+    simple  : int
+    ---
+    item  : external-raw
+    """
+
+
+@schema
+class Seed(dj.Lookup):
+    definition = """
+    seed : int
+    """
+    contents = zip(range(4))
+
+
+@schema
+class Dimension(dj.Lookup):
+    definition = """
+    dim : int
+    ---
+    dimensions : blob
+    """
+    contents = (
+        [0, [100, 50]],
+        [1, [3, 4, 8, 6]])
+
+
+@schema
+class Image(dj.Computed):
+    definition = """
+    # table for storing
+    -> Seed
+    -> Dimension
+    ----
+    img : external-raw  # objects are stored as specified by dj.config['external-raw']
+    neg : external      # objects are stored as specified by dj.config['external']
+    """
+
+    def make(self, key):
+        np.random.seed(key['seed'])
+        img = np.random.rand(*(Dimension() & key).fetch1('dimensions'))
+        self.insert1(dict(key, img=img, neg=-img.astype(np.float32)))
diff --git a/tests/test_external_class.py b/tests/test_external_class.py
index 61ed12111..26bb1b0b7 100644
--- a/tests/test_external_class.py
+++ b/tests/test_external_class.py
@@ -5,7 +5,7 @@


 def test_heading():
-    heading = modu.Simple().heading
+    heading = modu.Simple.heading
     assert_true('item' in heading)
     assert_true(heading['item'].is_external)

diff --git a/tests/test_legacy_external.py b/tests/test_legacy_external.py
new file mode 100644
index 000000000..d5ed915b9
--- /dev/null
+++ b/tests/test_legacy_external.py
@@ -0,0 +1,28 @@
+import numpy as np
+from numpy.testing import assert_array_equal
+from nose.tools import assert_true, assert_equal
+from datajoint.external import ExternalTable
+from datajoint.blob import pack, unpack
+
+from . schema_legacy_external import schema
+
+
+def test_external_put():
+    """
+    external storage put and get and remove
+    """
+    ext = ExternalTable(schema.connection, schema.database)
+    input_ = np.random.randn(3, 7, 8)
+    count = 7
+    extra = 3
+    for i in range(count):
+        hash1 = ext.put('external-raw', pack(input_))
+    for i in range(extra):
+        hash2 = ext.put('external-raw', pack(np.random.randn(4, 3, 2)))
+
+    fetched_hashes = ext.fetch('hash')
+    assert_true(all(hash in fetched_hashes for hash in (hash1, hash2)))
+    assert_equal(len(ext), 1 + extra)
+
+    output_ = unpack(ext.get(hash1))
+    assert_array_equal(input_, output_)
diff --git a/tests/test_legacy_external_class.py b/tests/test_legacy_external_class.py
new file mode 100644
index 000000000..1b5b7ef03
--- /dev/null
+++ b/tests/test_legacy_external_class.py
@@ -0,0 +1,63 @@
+from nose.tools import assert_true, assert_list_equal, raises
+from numpy.testing import assert_almost_equal
+import datajoint as dj
+from . import schema_legacy_external as modu
+
+
+def test_heading():
+    heading = modu.Simple().heading
+    assert_true('item' in heading)
+    assert_true(heading['item'].is_external)
+
+
+def test_insert_and_fetch():
+    original_list = [1, 3, 8]
+    modu.Simple().insert1(dict(simple=1, item=original_list))
+    # test fetch
+    q = (modu.Simple() & {'simple': 1}).fetch('item')[0]
+    assert_list_equal(list(q), original_list)
+    # test fetch1 as a tuple
+    q = (modu.Simple() & {'simple': 1}).fetch1('item')
+    assert_list_equal(list(q), original_list)
+    # test fetch1 as a dict
+    q = (modu.Simple() & {'simple': 1}).fetch1()
+    assert_list_equal(list(q['item']), original_list)
+    # test without cache
+    previous_cache = dj.config['cache']
+    dj.config['cache'] = None
+    q = (modu.Simple() & {'simple': 1}).fetch1()
+    assert_list_equal(list(q['item']), original_list)
+    # test with cache
+    dj.config['cache'] = previous_cache
+    q = (modu.Simple() & {'simple': 1}).fetch1()
+    assert_list_equal(list(q['item']), original_list)
+
+
+def test_populate():
+    image = modu.Image()
+    image.populate()
+    remaining, total = image.progress()
+    image.external_table.clean_store('raw')
+    assert_true(total == len(modu.Dimension() * modu.Seed()) and remaining == 0)
+    for img, neg, dimensions in zip(*(image * modu.Dimension()).fetch('img', 'neg', 'dimensions')):
+        assert_list_equal(list(img.shape), list(dimensions))
+        assert_almost_equal(img, -neg)
+    image.delete()
+    image.external_table.delete_garbage()
+    image.external_table.clean_store('raw')
+
+
+@raises(dj.DataJointError)
+def test_drop():
+    image = modu.Image()
+    image.populate()
+    image.external_table.drop()
+
+
+@raises(dj.DataJointError)
+def test_delete():
+    image = modu.Image()
+    image.external_table.delete()

From 61c2ce778ce6298134e2aee5dbdd88ffe5c9baae Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 14 Jan 2019 15:52:28 -0600
Subject: [PATCH 21/36] drop support of Python 3.4

---
 .travis.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index d5157187e..5444a5f69 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,7 +3,6 @@ language: python
 env:
   - DJ_TEST_HOST="127.0.0.1" DJ_TEST_USER="datajoint" DJ_TEST_PASSWORD="datajoint" DJ_HOST="127.0.0.1" DJ_USER="datajoint" DJ_PASS="datajoint" BOTO_CONFIG="/tmp/bogusvalue"
 python:
-  - "3.4"
   - "3.5"
   - "3.6"
 matrix:

From aa6a2ce90f8ab61b8c3a3dbc7bd9e5e032c162e5 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 14 Jan 2019 17:16:35 -0600
Subject: [PATCH 22/36] add test for attachment methods

---
 tests/test_attach.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 tests/test_attach.py

diff --git a/tests/test_attach.py b/tests/test_attach.py
new file mode 100644
index 000000000..5e98c700a
--- /dev/null
+++ b/tests/test_attach.py
@@ -0,0 +1,19 @@
+from nose.tools import assert_true, assert_not_equal
+import tempfile
+import filecmp
+import shutil
+from datajoint import attach
+import os
+
+def test_attach():
+    """
+    test
+    """
+    attach_file = 'schema.py'
+    folder = tempfile.mkdtemp()
+    source_file = os.path.join(folder, attach_file)
+    shutil.copy(attach_file, source_file)
+    buffer = attach.load(source_file)
+    download_file = attach.save(buffer, folder)
+    assert_true(filecmp.cmp(download_file, source_file))
+    assert_not_equal(os.path.basename(attach_file), os.path.basename(download_file))
\ No newline at end of file
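The attachment round-trip exercised by this new test relies on a simple buffer
layout: the file's base name, a null byte, then the raw contents. A standalone
sketch of that encoding, mirroring the load/save pair shown in later patches
but with the filename-collision handling omitted (helper names here are local
to the sketch):

    import os
    import tempfile

    def load(local_path):
        # encode as: basename + b'\0' + file contents
        with open(local_path, mode='rb') as f:
            contents = f.read()
        return str.encode(os.path.basename(local_path)) + b'\0' + contents

    def save(buffer, save_path='.'):
        # split at the first null byte to recover the name and the payload
        p = buffer.find(b'\0')
        file_path = os.path.join(save_path, buffer[:p].decode())
        with open(file_path, mode='wb') as f:
            f.write(buffer[p + 1:])
        return file_path

    src_folder = tempfile.mkdtemp()
    src = os.path.join(src_folder, 'attachment.dat')
    with open(src, 'wb') as f:
        f.write(os.urandom(100))
    dst = save(load(src), tempfile.mkdtemp())
    assert open(src, 'rb').read() == open(dst, 'rb').read()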
From f49cf22575b26c11d24a67ae638175ecc0a43420 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 14 Jan 2019 20:56:06 -0600
Subject: [PATCH 23/36] fix test_attach

---
 .travis.yml              |  1 +
 datajoint/__init__.py    |  2 +-
 tests/schema_external.py | 18 +++++++++---------
 tests/test_attach.py     | 14 +++++++-------
 4 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 5444a5f69..d5157187e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,6 +3,7 @@ language: python
 env:
   - DJ_TEST_HOST="127.0.0.1" DJ_TEST_USER="datajoint" DJ_TEST_PASSWORD="datajoint" DJ_HOST="127.0.0.1" DJ_USER="datajoint" DJ_PASS="datajoint" BOTO_CONFIG="/tmp/bogusvalue"
 python:
+  - "3.4"
   - "3.5"
   - "3.6"
 matrix:
diff --git a/datajoint/__init__.py b/datajoint/__init__.py
index 02881ac81..6a1d27886 100644
--- a/datajoint/__init__.py
+++ b/datajoint/__init__.py
@@ -15,7 +15,7 @@
 """

 __author__ = "Dimitri Yatsenko, Edgar Y. Walker, and Fabian Sinz at Baylor College of Medicine"
-__date__ = "Dec 4, 2018"
+__date__ = "January 14, 2018"

diff --git a/tests/schema_external.py b/tests/schema_external.py
index 93f6a5323..c0efc132d 100644
--- a/tests/schema_external.py
+++ b/tests/schema_external.py
@@ -13,22 +13,22 @@
 dj.config['stores'] = {
     '-': {
-    'protocol': 'file',
-    'location': 'dj-store/external'
+        'protocol': 'file',
+        'location': 'dj-store/external'
     },

     '-raw': {
-    'protocol': 'file',
-    'location': 'dj-store/raw'},
+        'protocol': 'file',
+        'location': 'dj-store/raw'},

     '-compute': {
-    'protocol': 's3',
-    'location': '/datajoint-projects/test',
-    'user': 'djtest',
-    'token': '2e05709792545ce'}
+        'protocol': 's3',
+        'location': '/datajoint-projects/test',
+        'user': 'djtest',
+        'token': '2e05709792545ce'}
 }

-dj.config['cache'] = tempfile.mkdtemp('dj-cache')
+dj.config['cache'] = tempfile.mkdtemp()
diff --git a/tests/test_attach.py b/tests/test_attach.py
index 5e98c700a..7b6c863ca 100644
--- a/tests/test_attach.py
+++ b/tests/test_attach.py
@@ -1,19 +1,19 @@
 from nose.tools import assert_true, assert_not_equal
 import tempfile
 import filecmp
-import shutil
 from datajoint import attach
 import os

+
 def test_attach():
     """
-    test
+    test attaching files and writing attached files
     """
-    attach_file = 'schema.py'
     folder = tempfile.mkdtemp()
-    source_file = os.path.join(folder, attach_file)
-    shutil.copy(attach_file, source_file)
-    buffer = attach.load(source_file)
+    attach_file = os.path.join(folder, 'attachment.dat')
+    with open(attach_file, 'wb') as f:
+        f.write(os.urandom(3000))
+    buffer = attach.load(attach_file)
     download_file = attach.save(buffer, folder)
-    assert_true(filecmp.cmp(download_file, source_file))
+    assert_true(filecmp.cmp(download_file, attach_file))
     assert_not_equal(os.path.basename(attach_file), os.path.basename(download_file))
\ No newline at end of file

From eea3e20199e52a4b97e0d817d23bcc52132b1c1b Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 14 Jan 2019 21:22:19 -0600
Subject: [PATCH 24/36] fix 3.4 compatibility

---
 datajoint/external.py |  4 ++--
 tests/test_attach.py  | 10 +++++++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/datajoint/external.py b/datajoint/external.py
index e36808630..89f356208 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -103,8 +103,8 @@ def get(self, blob_hash):
         store = blob_hash[STORE_HASH_LENGTH:]
         spec = config.get_store_spec(store)
         if spec['protocol'] == 'file':
-            full_path = os.path.join(
-                spec['location'], self.database, *subfold(blob_hash, spec['subfolding']), blob_hash)
+            subfolders = subfold(blob_hash, spec['subfolding'])
+            full_path = os.path.join(spec['location'], self.database, *subfolders, blob_hash)
             try:
                 with open(full_path, 'rb') as f:
                     blob = f.read()
diff --git a/tests/test_attach.py b/tests/test_attach.py
index 7b6c863ca..2d69d1c61 100644
--- a/tests/test_attach.py
+++ b/tests/test_attach.py
@@ -1,4 +1,4 @@
-from nose.tools import assert_true, assert_not_equal
+from nose.tools import assert_true, assert_equal, assert_not_equal
 import tempfile
 import filecmp
 from datajoint import attach
@@ -11,9 +11,13 @@ def test_attach():
     """
     folder = tempfile.mkdtemp()
     attach_file = os.path.join(folder, 'attachment.dat')
+    data = os.urandom(3000)
     with open(attach_file, 'wb') as f:
-        f.write(os.urandom(3000))
+        f.write(data)
     buffer = attach.load(attach_file)
     download_file = attach.save(buffer, folder)
     assert_true(filecmp.cmp(download_file, attach_file))
-    assert_not_equal(os.path.basename(attach_file), os.path.basename(download_file))
\ No newline at end of file
+    assert_not_equal(os.path.basename(attach_file), os.path.basename(download_file))
+    with open(download_file, 'rb') as f:
+        attachment_data = f.read()
+    assert_equal(data, attachment_data)

From 7ee6134576aabef7d00c721ffba9ff5960b65d23 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 14 Jan 2019 22:03:51 -0600
Subject: [PATCH 25/36] Python 3.4 compatibility

---
 datajoint/external.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datajoint/external.py b/datajoint/external.py
index 89f356208..2fe481913 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -103,8 +103,8 @@ def get(self, blob_hash):
         store = blob_hash[STORE_HASH_LENGTH:]
         spec = config.get_store_spec(store)
         if spec['protocol'] == 'file':
-            subfolders = subfold(blob_hash, spec['subfolding'])
-            full_path = os.path.join(spec['location'], self.database, *subfolders, blob_hash)
+            subfolders = os.path.join(*subfold(blob_hash, spec['subfolding']))
+            full_path = os.path.join(spec['location'], self.database, subfolders, blob_hash)
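The subfold helper that these compatibility fixes keep rewriting splits a hash
into nested folder names so that a large store does not pile every blob into a
single directory. A short runnable sketch of the behavior, using the
tuple-concatenation form that the next patch settles on (the hash value is
made up for illustration):

    import os

    def subfold(name, folds):
        """e.g. subfold('abcdefg', (2, 3)) --> ('ab', 'cde')"""
        return (name[:folds[0]],) + subfold(name[folds[0]:], folds[1:]) if folds else ()

    blob_hash = '8a1f5c3e9b'
    assert subfold(blob_hash, (2, 2)) == ('8a', '1f')

    # Python 3.4 rejects a positional argument after *-unpacking in a call
    # (PEP 448 relaxed this in 3.5), hence the two-step join used above:
    subfolders = os.path.join(*subfold(blob_hash, (2, 2)))
    full_path = os.path.join('dj-store', 'my_schema', subfolders, blob_hash)
    assert full_path == os.path.join('dj-store', 'my_schema', '8a', '1f', blob_hash)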
From 6701abb07dc9175979b0d8c69a331edb2f56919a Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 14 Jan 2019 22:24:28 -0600
Subject: [PATCH 26/36] fix Python 3.4 compatibility

---
 datajoint/external.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datajoint/external.py b/datajoint/external.py
index 2fe481913..2392548b5 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -13,7 +13,7 @@ def subfold(name, folds):
     """
     subfolding for external storage: e.g. subfold('abcdefg', (2, 3)) --> ['ab','cde']
     """
-    return (name[:folds[0]], *subfold(name[folds[0]:], folds[1:])) if folds else ()
+    return (name[:folds[0]],) + subfold(name[folds[0]:], folds[1:]) if folds else ()

From 346f47f16da58f4339dae1e4902dd240df0841b0 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 14 Jan 2019 22:44:52 -0600
Subject: [PATCH 27/36] fix Python 3.4 compatibility

---
 datajoint/external.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datajoint/external.py b/datajoint/external.py
index 2392548b5..977877d38 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -65,8 +65,8 @@ def put(self, store, blob):
                 os.makedirs(folder)
             safe_write(full_path, blob)
         elif spec['protocol'] == 's3':
-            s3.Folder(database=self.database, **spec).put(
-                '/'.join((*subfold(blob_hash, spec['subfolding']), blob_hash)), blob)
+            subfolder = '/'.join(subfold(blob_hash, spec['subfolding']))
+            s3.Folder(database=self.database, **spec).put('/'.join(subfolder, blob))
         else:
             raise DataJointError('Unknown external storage protocol {protocol} in store "-{store}"'.format(
                 store=store, protocol=spec['protocol']))

From b2087aa0f9139d842d5680abd0a8984c4e34023b Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Mon, 14 Jan 2019 22:59:41 -0600
Subject: [PATCH 28/36] fix Python 3.4 compatibility

---
 datajoint/external.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datajoint/external.py b/datajoint/external.py
index 977877d38..a7cb3571e 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -112,8 +112,8 @@ def get(self, blob_hash):
                 raise DataJointError('Lost access to external blob %s.' % full_path) from None
         elif spec['protocol'] == 's3':
             try:
-                blob = s3.Folder(database=self.database, **spec).get(
-                    '/'.join((*subfold(blob_hash, spec['subfolding']), blob_hash)))
+                subfolder = '/'.join(subfold(blob_hash, spec['subfolding']))
+                blob = s3.Folder(database=self.database, **spec).get('/'.join((subfolder, blob_hash)))
             except TypeError:
                 raise DataJointError('External store {store} configuration is incomplete.'.format(store=store))
         else:

From 0c491e270959b7baa3315de44a28905861338eb7 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Tue, 15 Jan 2019 10:39:57 -0600
Subject: [PATCH 29/36] improve error message

---
 datajoint/table.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/datajoint/table.py b/datajoint/table.py
index a90b35724..b98dd9303 100644
--- a/datajoint/table.py
+++ b/datajoint/table.py
@@ -170,7 +170,8 @@ def insert(self, rows, replace=False, skip_duplicates=False, ignore_extra_fields
         # prohibit direct inserts into auto-populated tables
         if not (allow_direct_insert or getattr(self, '_allow_insert', True)):  # _allow_insert is only present in AutoPopulate
             raise DataJointError(
-                'Auto-populate tables can only be inserted into from their make methods during populate calls. (see allow_direct_insert)')
+                'Auto-populate tables can only be inserted into from their make methods during populate calls.' \
+                ' To override, use the allow_direct_insert argument.')

         heading = self.heading
         if inspect.isclass(rows) and issubclass(rows, QueryExpression):  # instantiate if a class
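The guard above means an auto-populated table raises unless insert is reached
through its make method during populate. A hedged usage sketch of the escape
hatch named in the new error message (the table, its parent Session, and the
field names are placeholders, not part of the patch):

    import datajoint as dj

    class Stats(dj.Computed):   # hypothetical auto-populated table
        definition = """
        -> Session
        ---
        stat_mean : float
        """

        def make(self, key):
            # allowed: populate() sets _allow_insert while calling make()
            self.insert1(dict(key, stat_mean=0.0))

    # outside populate(), a direct insert must opt in explicitly:
    #   Stats.insert1(row, allow_direct_insert=True)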
From 7e51e4f525aed08b0e2cf7812a31fb3ae4a09c20 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Tue, 15 Jan 2019 10:39:57 -0600
Subject: [PATCH 30/36] improve error message

---
 datajoint/attach.py      |  7 ++-----
 datajoint/external.py    |  1 -
 datajoint/fetch.py       | 22 ++++++++++++++--------
 datajoint/table.py       |  3 ++-
 tests/schema_external.py | 20 +++++++++++++++++++-
 tests/test_attach.py     | 38 ++++++++++++++++++++++++++++++++++++++
 tests/test_external.py   |  3 ++-
 7 files changed, 77 insertions(+), 17 deletions(-)

diff --git a/datajoint/attach.py b/datajoint/attach.py
index 21f178023..202aa902d 100644
--- a/datajoint/attach.py
+++ b/datajoint/attach.py
@@ -17,11 +17,8 @@ def save(buffer, save_path='.'):

     if path.isfile(file_path):
         # generate a new filename
-        split_name = path.splitext(file_path)
-        for n in count():
-            file_path = '%s_%04u%s' % (split_name[0], n, split_name[1])
-            if not path.isfile(file_path):
-                break
+        file, ext = path.splitext(file_path)
+        file_path = next(f for f in ('%s_%04x%s' % (file, n, ext) for n in count()) if not path.isfile(f))

     with open(file_path, mode='wb') as f:
         f.write(buffer[p+1:])
diff --git a/datajoint/external.py b/datajoint/external.py
index a7cb3571e..87a3d09c7 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -184,4 +184,3 @@ def clean_store(self, store, display_progress=True):
             s3.Folder(database=self.database, **spec).clean(in_use)
         except TypeError:
             raise DataJointError('External store {store} configuration is incomplete.'.format(store=store))
-
diff --git a/datajoint/fetch.py b/datajoint/fetch.py
index ea40613d0..8a8ae527f 100644
--- a/datajoint/fetch.py
+++ b/datajoint/fetch.py
@@ -27,12 +27,13 @@ def to_dicts(recarray):
         yield OrderedDict(zip(recarray.dtype.names, rec.tolist()))


-def _get(connection, attr, data, squeeze):
+def _get(connection, attr, data, squeeze, download_path):
     """
     :param connection:
     :param attr: an attribute from the heading
     :param data: literal value fetched from the table
     :param squeeze: if True squeeze blobs
+    :param download_path: for fetches that download data, e.g. attachments
     :return: unpacked data
     """
     if attr.is_external:
@@ -40,7 +41,7 @@ def _get(connection, attr, data, squeeze):
     if attr.is_blob:
         return blob.unpack(data, squeeze=squeeze)
     if attr.is_attachment:
-        return attach.save(data)
+        return attach.save(data, download_path)
     return data
@@ -69,7 +70,8 @@ class Fetch:
     def __init__(self, expression):
         self._expression = expression

-    def __call__(self, *attrs, offset=None, limit=None, order_by=None, format=None, as_dict=False, squeeze=False):
+    def __call__(self, *attrs, offset=None, limit=None, order_by=None, format=None, as_dict=False,
+                 squeeze=False, download_path='.'):
         """
         Fetches the expression results from the database into an np.array or list of dictionaries and
         unpacks blob attributes.
@@ -86,6 +88,7 @@ def __call__(self, *attrs, offset=None, limit=None, order_by=None, format=None,
             "frame": output pandas.DataFrame. .
         :param as_dict: returns a list of dictionaries instead of a record array
         :param squeeze: if True, remove extra dimensions from arrays
+        :param download_path: for fetches that download data, e.g. attachments
         :return: the contents of the relation in the form of a structured numpy.array or a dict list
         """
@@ -119,7 +122,7 @@ def __call__(self, *attrs, offset=None, limit=None, order_by=None, format=None,
                                'Consider setting a limit explicitly.')
                 limit = 2 * len(self._expression)

-        get = partial(_get, self._expression.connection, squeeze=squeeze)
+        get = partial(_get, self._expression.connection, squeeze=squeeze, download_path=download_path)
         if not attrs:  # fetch all attributes as a numpy.record_array or pandas.DataFrame
             cur = self._expression.cursor(as_dict=as_dict, limit=limit, offset=offset, order_by=order_by)
@@ -136,7 +139,8 @@ def __call__(self, *attrs, offset=None, limit=None, order_by=None, format=None,
         else:  # if list of attributes provided
             attributes = [a for a in attrs if not is_key(a)]
             result = self._expression.proj(*attributes).fetch(
-                offset=offset, limit=limit, order_by=order_by, as_dict=False, squeeze=squeeze)
+                offset=offset, limit=limit, order_by=order_by,
+                as_dict=False, squeeze=squeeze, download_path=download_path)
             return_values = [
                 list(to_dicts(result[self._expression.primary_key])) if is_key(attribute) else result[attribute]
@@ -155,7 +159,7 @@ class Fetch1:
     def __init__(self, relation):
         self._expression = relation

-    def __call__(self, *attrs, squeeze=False):
+    def __call__(self, *attrs, squeeze=False, download_path='.'):
         """
         Fetches the expression results from the database when the expression is known to yield only one entry.
@@ -168,6 +172,7 @@ def __call__(self, *attrs, squeeze=False):
         :params *attrs: attributes to return when expanding into a tuple.
                 If empty, the return result is a dict
         :param squeeze: When true, remove extra dimensions from arrays in attributes
+        :param download_path: for fetches that download data, e.g. attachments
         :return: the one tuple in the relation in the form of a dict
         """
@@ -178,11 +183,12 @@ def __call__(self, *attrs, squeeze=False):
             ret = cur.fetchone()
             if not ret or cur.fetchone():
                 raise DataJointError('fetch1 should only be used for relations with exactly one tuple')
-            ret = OrderedDict((name, _get(self._expression.connection, heading[name], ret[name], squeeze=squeeze))
+            ret = OrderedDict((name, _get(self._expression.connection, heading[name], ret[name],
+                                          squeeze=squeeze, download_path=download_path))
                               for name in heading.names)
         else:  # fetch some attributes, return as tuple
             attributes = [a for a in attrs if not is_key(a)]
-            result = self._expression.proj(*attributes).fetch(squeeze=squeeze)
+            result = self._expression.proj(*attributes).fetch(squeeze=squeeze, download_path=download_path)
             if len(result) != 1:
                 raise DataJointError('fetch1 should only return one tuple. %d tuples were found' % len(result))
             return_values = tuple(
diff --git a/datajoint/table.py b/datajoint/table.py
index a90b35724..b98dd9303 100644
--- a/datajoint/table.py
+++ b/datajoint/table.py
@@ -170,7 +170,8 @@ def insert(self, rows, replace=False, skip_duplicates=False, ignore_extra_fields
         # prohibit direct inserts into auto-populated tables
         if not (allow_direct_insert or getattr(self, '_allow_insert', True)):  # _allow_insert is only present in AutoPopulate
             raise DataJointError(
-                'Auto-populate tables can only be inserted into from their make methods during populate calls. (see allow_direct_insert)')
+                'Auto-populate tables can only be inserted into from their make methods during populate calls.' \
+                ' To override, use the allow_direct_insert argument.')

         heading = self.heading
         if inspect.isclass(rows) and issubclass(rows, QueryExpression):  # instantiate if a class
diff --git a/tests/schema_external.py b/tests/schema_external.py
index c0efc132d..c06c2b3cf 100644
--- a/tests/schema_external.py
+++ b/tests/schema_external.py
@@ -14,7 +14,12 @@
 dj.config['stores'] = {
     '-': {
         'protocol': 'file',
-        'location': 'dj-store/external'
+        'location': 'dj-store/external',
+        'folding': (1, 1)
+    },
+
+    '-b': {
+        'protocol': 'longblob'
     },

     '-raw': {
@@ -26,6 +31,8 @@
         'location': '/datajoint-projects/test',
         'user': 'djtest',
         'token': '2e05709792545ce'}
+
+
 }

 dj.config['cache'] = tempfile.mkdtemp()
@@ -75,3 +82,14 @@ def make(self, key):
         np.random.seed(key['seed'])
         img = np.random.rand(*(Dimension() & key).fetch1('dimensions'))
         self.insert1(dict(key, img=img, neg=-img.astype(np.float32)))
+
+
+@schema
+class Attach(dj.Manual):
+    definition = """
+    # table for storing attachments
+    attach : int
+    ----
+    img : attach-raw   # attachments are stored as specified by dj.config['stores']['-raw']
+    txt : attach-b     # attachments are stored as specified by dj.config['stores']['-b']
+    """
diff --git a/tests/test_attach.py b/tests/test_attach.py
index 2d69d1c61..0288ef334 100644
--- a/tests/test_attach.py
+++ b/tests/test_attach.py
@@ -4,20 +4,58 @@
 from datajoint import attach
 import os

+from .schema_external import Attach
+

 def test_attach():
     """
     test attaching files and writing attached files
     """
+    # create a mock file
     folder = tempfile.mkdtemp()
     attach_file = os.path.join(folder, 'attachment.dat')
     data = os.urandom(3000)
     with open(attach_file, 'wb') as f:
         f.write(data)
+    # load as an attachment buffer
     buffer = attach.load(attach_file)
+    # save from an attachment buffer
     download_file = attach.save(buffer, folder)
     assert_true(filecmp.cmp(download_file, attach_file))
     assert_not_equal(os.path.basename(attach_file), os.path.basename(download_file))
     with open(download_file, 'rb') as f:
         attachment_data = f.read()
     assert_equal(data, attachment_data)
+
+
+def test_attach_attributes():
+    """
+    test saving files in attachments
+    """
+    # create a mock file
+    source_folder = tempfile.mkdtemp()
+    attach1 = os.path.join(source_folder, 'attach1.img')
+    data1 = os.urandom(100)
+    with open(attach1, 'wb') as f:
+        f.write(data1)
+    attach2 = os.path.join(source_folder, 'attach2.txt')
+    data2 = os.urandom(200)
+    with open(attach2, 'wb') as f:
+        f.write(data2)
+
+    Attach().insert1(dict(attach=0, img=attach1, txt=attach2))
+
+    download_folder = tempfile.mkdtemp()
+    path1, path2 = Attach.fetch1('img', 'txt', download_path=download_folder)
+
+    assert_not_equal(path1, path2)
+    assert_equal(os.path.split(path1)[0], download_folder)
+    with open(path1, 'rb') as f:
+        check1 = f.read()
+    with open(path2, 'rb') as f:
+        check2 = f.read()
+    assert_equal(data1, check1)
+    assert_equal(data2, check2)
diff --git a/tests/test_external.py b/tests/test_external.py
index 450aab4d5..b7e75998d 100644
--- a/tests/test_external.py
+++ b/tests/test_external.py
@@ -12,6 +12,7 @@ def test_external_put():
     external storage put and get and remove
     """
     ext = ExternalTable(schema.connection, schema.database)
+    initial_length = len(ext)
     input_ = np.random.randn(3, 7, 8)
     count = 7
     extra = 3
@@ -22,7 +23,7 @@ def test_external_put():

     fetched_hashes = ext.fetch('hash')
     assert_true(all(hash in fetched_hashes for hash in (hash1, hash2)))
-    assert_equal(len(ext), 1 + extra)
+    assert_equal(len(ext), initial_length + 1 + extra)

     output_ = unpack(ext.get(hash1))
     assert_array_equal(input_, output_)
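With download_path threaded through Fetch and Fetch1 above, attachment
attributes are materialized into a chosen directory at fetch time. A short
usage sketch against the Attach table defined in this patch (it assumes a
configured connection and a populated table, as in test_attach_attributes):

    import os
    import tempfile
    from tests.schema_external import Attach

    download_folder = tempfile.mkdtemp()
    # attachment attributes come back as paths to files written under download_path
    img_path, txt_path = Attach.fetch1('img', 'txt', download_path=download_folder)
    assert os.path.dirname(img_path) == download_folder

    # when download_path is omitted, files are saved into the current directory ('.')
    img_paths = (Attach & 'attach=0').fetch('img')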
From 484e92680ee94b25097b67eb4cb4785ae2067831 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Wed, 16 Jan 2019 12:47:01 -0600
Subject: [PATCH 31/36] bugfix in S3 store

---
 datajoint/attach.py   | 8 +++++---
 datajoint/external.py | 4 ++--
 datajoint/fetch.py    | 7 ++-----
 datajoint/heading.py  | 2 +-
 datajoint/table.py    | 3 +--
 5 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/datajoint/attach.py b/datajoint/attach.py
index 202aa902d..d137aa3d2 100644
--- a/datajoint/attach.py
+++ b/datajoint/attach.py
@@ -2,25 +2,27 @@
 functionality for attaching files
 """
 from os import path
-from itertools import count
+from itertools import count, chain


 def load(local_path):
+    """ make an attachment from a local file """
     with open(local_path, mode='rb') as f:  # b is important -> binary
         contents = f.read()
     return str.encode(path.basename(local_path)) + b'\0' + contents


 def save(buffer, save_path='.'):
+    """ save attachment from memory buffer into the save_path """
     p = buffer.find(b'\0')
     file_path = path.abspath(path.join(save_path, buffer[:p].decode()))

     if path.isfile(file_path):
         # generate a new filename
         file, ext = path.splitext(file_path)
-        file_path = next(f for f in ('%s_%04x%s' % (file, n, ext) for n in count()) if not path.isfile(f))
+        file_path = next(f for f in ('%s_%04x%s' % (file, n, ext) for n in count())
+                         if not path.isfile(f))

     with open(file_path, mode='wb') as f:
         f.write(buffer[p+1:])
-
     return file_path
diff --git a/datajoint/external.py b/datajoint/external.py
index 87a3d09c7..d802a26b3 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -65,8 +65,8 @@ def put(self, store, blob):
                 os.makedirs(folder)
             safe_write(full_path, blob)
         elif spec['protocol'] == 's3':
-            subfolder = '/'.join(subfold(blob_hash, spec['subfolding']))
-            s3.Folder(database=self.database, **spec).put('/'.join(subfolder, blob))
+            folder = '/'.join(subfold(blob_hash, spec['subfolding']))
+            s3.Folder(database=self.database, **spec).put('/'.join((folder, blob_hash)), blob)
         else:
             raise DataJointError('Unknown external storage protocol {protocol} in store "-{store}"'.format(
                 store=store, protocol=spec['protocol']))
diff --git a/datajoint/fetch.py b/datajoint/fetch.py
index 8a8ae527f..69ac335f3 100644
--- a/datajoint/fetch.py
+++ b/datajoint/fetch.py
@@ -38,11 +38,8 @@ def _get(connection, attr, data, squeeze, download_path):
     """
     if attr.is_external:
         data = connection.schemas[attr.database].external_table.get(data)
-    if attr.is_blob:
-        return blob.unpack(data, squeeze=squeeze)
-    if attr.is_attachment:
-        return attach.save(data, download_path)
-    return data
+    return (blob.unpack(data, squeeze=squeeze) if attr.is_blob else
+            attach.save(data, download_path) if attr.is_attachment else data)
diff --git a/datajoint/heading.py b/datajoint/heading.py
index 6d2bf730c..f16dfb269 100644
--- a/datajoint/heading.py
+++ b/datajoint/heading.py
@@ -75,7 +75,7 @@ def blobs(self):

     @property
     def non_blobs(self):
-        return [k for k, v in self.attributes.items() if not v.is_blob]
+        return [k for k, v in self.attributes.items() if not v.is_blob and not v.is_attachment]

     @property
     def expressions(self):
diff --git a/datajoint/table.py b/datajoint/table.py
index b98dd9303..3d7cec6de 100644
--- a/datajoint/table.py
+++ b/datajoint/table.py
@@ -450,8 +450,7 @@ def size_on_disk(self):
         return ret['Data_length'] + ret['Index_length']

     def show_definition(self):
-        logger.warning('show_definition is deprecated. Use describe instead.')
-        return self.describe()
+        raise AttributeError('show_definition is deprecated. Use the describe method instead.')

     def describe(self, context=None, printout=True):
         """

From 1a83fe6036b76f17e49904103576191e5854629c Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Wed, 16 Jan 2019 12:47:01 -0600
Subject: [PATCH 32/36] bugfix in S3 store

---
 datajoint/attach.py   | 6 ++++--
 datajoint/external.py | 6 +++---
 datajoint/fetch.py    | 7 ++-----
 datajoint/heading.py  | 2 +-
 datajoint/table.py    | 3 +--
 5 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/datajoint/attach.py b/datajoint/attach.py
index 202aa902d..f341a8e0a 100644
--- a/datajoint/attach.py
+++ b/datajoint/attach.py
@@ -6,21 +6,23 @@


 def load(local_path):
+    """ make an attachment from a local file """
     with open(local_path, mode='rb') as f:  # b is important -> binary
         contents = f.read()
     return str.encode(path.basename(local_path)) + b'\0' + contents


 def save(buffer, save_path='.'):
+    """ save attachment from memory buffer into the save_path """
     p = buffer.find(b'\0')
     file_path = path.abspath(path.join(save_path, buffer[:p].decode()))

     if path.isfile(file_path):
         # generate a new filename
         file, ext = path.splitext(file_path)
-        file_path = next(f for f in ('%s_%04x%s' % (file, n, ext) for n in count()) if not path.isfile(f))
+        file_path = next(f for f in ('%s_%04x%s' % (file, n, ext) for n in count())
+                         if not path.isfile(f))

     with open(file_path, mode='wb') as f:
         f.write(buffer[p+1:])
-
     return file_path
diff --git a/datajoint/external.py b/datajoint/external.py
index 87a3d09c7..7cf049cc6 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -13,7 +13,7 @@ def subfold(name, folds):
     """
     subfolding for external storage: e.g. subfold('abcdefg', (2, 3)) --> ['ab','cde']
     """
-    return (name[:folds[0]],) + subfold(name[folds[0]:], folds[1:]) if folds else ()
+    return (name[:folds[0]].lower(),) + subfold(name[folds[0]:], folds[1:]) if folds else ()
@@ -65,8 +65,8 @@ def put(self, store, blob):
                 os.makedirs(folder)
             safe_write(full_path, blob)
         elif spec['protocol'] == 's3':
-            subfolder = '/'.join(subfold(blob_hash, spec['subfolding']))
-            s3.Folder(database=self.database, **spec).put('/'.join(subfolder, blob))
+            folder = '/'.join(subfold(blob_hash, spec['subfolding']))
+            s3.Folder(database=self.database, **spec).put('/'.join((folder, blob_hash)), blob)
diff --git a/datajoint/fetch.py b/datajoint/fetch.py
index 8a8ae527f..69ac335f3 100644
--- a/datajoint/fetch.py
+++ b/datajoint/fetch.py
@@ -38,11 +38,8 @@ def _get(connection, attr, data, squeeze, download_path):
     """
     if attr.is_external:
         data = connection.schemas[attr.database].external_table.get(data)
-    if attr.is_blob:
-        return blob.unpack(data, squeeze=squeeze)
-    if attr.is_attachment:
-        return attach.save(data, download_path)
-    return data
+    return (blob.unpack(data, squeeze=squeeze) if attr.is_blob else
+            attach.save(data, download_path) if attr.is_attachment else data)
diff --git a/datajoint/heading.py b/datajoint/heading.py
index 6d2bf730c..f16dfb269 100644
--- a/datajoint/heading.py
+++ b/datajoint/heading.py
@@ -75,7 +75,7 @@ def blobs(self):

     @property
     def non_blobs(self):
-        return [k for k, v in self.attributes.items() if not v.is_blob]
+        return [k for k, v in self.attributes.items() if not v.is_blob and not v.is_attachment]

     @property
def expressions(self): diff --git a/datajoint/table.py b/datajoint/table.py index b98dd9303..3d7cec6de 100644 --- a/datajoint/table.py +++ b/datajoint/table.py @@ -450,8 +450,7 @@ def size_on_disk(self): return ret['Data_length'] + ret['Index_length'] def show_definition(self): - logger.warning('show_definition is deprecated. Use describe instead.') - return self.describe() + raise AttributeError('show_definition is deprecated. Use the describe method instead.') def describe(self, context=None, printout=True): """ From 0826d94b866d971dde81e3a856afce20399a9360 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Wed, 6 Feb 2019 08:21:51 -0600 Subject: [PATCH 33/36] implement external storage cleanup with subfolding --- datajoint/erd.py | 4 ++-- datajoint/external.py | 29 +++++++++++++++++++++-------- datajoint/s3.py | 31 ++++++++++++++++++++++++++----- datajoint/schema.py | 4 +++- tests/test_erd.py | 1 + 5 files changed, 53 insertions(+), 16 deletions(-) diff --git a/datajoint/erd.py b/datajoint/erd.py index 677ef2267..0566daf6c 100644 --- a/datajoint/erd.py +++ b/datajoint/erd.py @@ -50,8 +50,8 @@ class ERD: """ def __init__(self, *args, **kwargs): - warnings.warn('ERD functionality depends on matplotlib and pygraphviz. Please install both of these ' - 'libraries to enable the ERD feature.') + warnings.warn('ERD functionality depends on matplotlib, networkx, and pygraphviz. ' + 'Please install both of these libraries to enable the ERD feature.') else: class ERD(nx.DiGraph): """ diff --git a/datajoint/external.py b/datajoint/external.py index 7cf049cc6..1449e6443 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -1,5 +1,5 @@ import os -from tqdm import tqdm +import itertools from .settings import config from .errors import DataJointError from .hash import long_hash @@ -165,22 +165,35 @@ def delete_garbage(self): for ref in self.references) or "TRUE") print('Deleted %d items' % self.connection.query("SELECT ROW_COUNT()").fetchone()[0]) - def clean_store(self, store, display_progress=True): + def clean_store(self, store, verbose=True): """ Clean unused data in an external storage repository from unused blobs. This must be performed after delete_garbage during low-usage periods to reduce risks of data loss. 
""" spec = config.get_store_spec(store) - progress = tqdm if display_progress else lambda x: x - in_use = set(self.fetch('hash')) + in_use = set(x for x in (self & '`hash` LIKE "%%{store}"'.format(store=store)).fetch('hash')) if spec['protocol'] == 'file': - for folder, _, files in progress(os.walk(os.path.join(spec['location'], self.database))): - for f in files: - if f not in in_use: + count = itertools.count() + print('Deleting...') + deleted_folders = set() + for folder, dirs, files in os.walk(os.path.join(spec['location'], self.database), topdown=False): + if dirs and files: + raise DataJointError('Invalid repository with files in non-terminal folder %s' % folder) + dirs = set(d for d in dirs if os.path.join(folder, d) not in deleted_folders) + if not dirs: + files_not_in_use = [f for f in files if f not in in_use] + for f in files_not_in_use: filename = os.path.join(folder, f) + next(count) + if verbose: + print(filename) os.remove(filename) + if len(files_not_in_use) == len(files): + os.rmdir(folder) + deleted_folders.add(folder) + print('Deleted %d objects' % next(count)) elif spec['protocol'] == 's3': try: - s3.Folder(database=self.database, **spec).clean(in_use) + failed_deletes = s3.Folder(database=self.database, **spec).clean(in_use, verbose=verbose) except TypeError: raise DataJointError('External store {store} configuration is incomplete.'.format(store=store)) diff --git a/datajoint/s3.py b/datajoint/s3.py index ff56d6459..d591566ec 100644 --- a/datajoint/s3.py +++ b/datajoint/s3.py @@ -30,13 +30,34 @@ def get(self, blob_hash): except minio.error.NoSuchKey: return None - def clean(self, exclude, max_count=None): + def clean(self, exclude, max_count=None, verbose=False): """ Delete all objects except for those in the exclude :param exclude: a list of blob_hashes to skip. 
        :param max_count: maximum number of objects to delete
+        :param verbose: If True, print deleted objects
-        :return: generator of objects that failed to delete
+        :return: list of objects that failed to delete
         """
-        return self.client.remove_objects(self.bucket, itertools.islice(
-            (x.object_name for x in self.client.list_objects(self.bucket, self.remote_path + '/')
-             if x not in exclude), max_count))
+        count = itertools.count()
+        if verbose:
+            def out(name):
+                next(count)
+                print(name)
+                return name
+        else:
+            def out(name):
+                next(count)
+                return name
+
+        if verbose:
+            print('Deleting...')
+
+        names = (out(x.object_name)
+                 for x in self.client.list_objects(self.bucket, self.remote_path + '/', recursive=True)
+                 if x.object_name.split('/')[-1] not in exclude)
+
+        failed_deletes = list(
+            self.client.remove_objects(self.bucket, itertools.islice(names, max_count)))
+
+        print('Deleted: %i S3 objects' % next(count))
+        return failed_deletes
diff --git a/datajoint/schema.py b/datajoint/schema.py
index 5a080da39..849919151 100644
--- a/datajoint/schema.py
+++ b/datajoint/schema.py
@@ -228,7 +228,7 @@ def jobs(self):
         return self._jobs

     @property
-    def external_table(self):
+    def external(self):
         """
         schema.external provides a view of the external hash table for the schema
         :return: external table
@@ -237,6 +237,8 @@
             self._external = ExternalTable(self.connection, self.database)
         return self._external

+    external_table = external  # for backward compatibility to pre-0.12.0
+

 def create_virtual_module(module_name, schema_name, create_schema=False, create_tables=False, connection=None):
     """
diff --git a/tests/test_erd.py b/tests/test_erd.py
index 4617852d8..f952efedf 100644
--- a/tests/test_erd.py
+++ b/tests/test_erd.py
@@ -34,6 +34,7 @@ def test_dependencies():

     @staticmethod
     def test_erd():
+        assert_true(dj.erd.erd_active, 'Failed to import networkx and pydot')
         erd = dj.ERD(schema, context=namespace)
         graph = erd._make_graph()
         assert_true(set(cls.__name__ for cls in (A, B, D, E, L)).issubset(graph.nodes()))

From fb140299db6140415bfe1f41ec715c2e4a5decd2 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Thu, 7 Feb 2019 15:30:34 -0600
Subject: [PATCH 34/36] fix error message and release date

---
 datajoint/__init__.py | 7 +++----
 datajoint/declare.py  | 6 ++----
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/datajoint/__init__.py b/datajoint/__init__.py
index 6a1d27886..9e98ba31b 100644
--- a/datajoint/__init__.py
+++ b/datajoint/__init__.py
@@ -14,8 +14,8 @@
 http://dx.doi.org/10.1101/031658
 """

-__author__ = "Dimitri Yatsenko, Edgar Y. Walker, and Fabian Sinz at Baylor College of Medicine"
-__date__ = "January 14, 2018"
+__author__ = "DataJoint Contributors"
+__date__ = "February 7, 2019"

 __all__ = ['__author__', '__version__', 'config', 'conn', 'Connection',
            'schema', 'create_virtual_module', 'get_schema_names',
@@ -26,7 +26,6 @@
            'DataJointError', 'DuplicateError', 'key']

-# ------------- flatten import hierarchy -------------------------
 from .version import __version__
 from .settings import config
 from .connection import conn, Connection
@@ -38,4 +37,4 @@
 from .erd import ERD
 from .admin import set_password, kill
 from .errors import DataJointError, DuplicateError
-from .fetch import key
\ No newline at end of file
+from .fetch import key
diff --git a/datajoint/declare.py b/datajoint/declare.py
index 982198732..64b81c09d 100644
--- a/datajoint/declare.py
+++ b/datajoint/declare.py
@@ -285,12 +285,10 @@ def compile_attribute(line, in_key, foreign_key_sql):
     else:
         if match['default']:
             quote = match['default'].upper() not in literals and match['default'][0] not in '"\''
-            match['default'] = ('NOT NULL DEFAULT ' +
-                                ('"%s"' if quote else "%s") % match['default'])
+            match['default'] = 'NOT NULL DEFAULT ' + ('"%s"' if quote else "%s") % match['default']
         else:
             match['default'] = 'NOT NULL'
     match['comment'] = match['comment'].replace('"', '\\"')  # escape double quotes in comment
-
     is_configurable = match['type'].startswith(('external', 'blob-', 'attach'))
     is_external = False
     if is_configurable:
@@ -299,7 +297,7 @@ def compile_attribute(line, in_key, foreign_key_sql):
         match['comment'] = ':{type}:{comment}'.format(**match)  # insert configurable type into comment
         store_name = match['type'].split('-')
         if store_name[0] not in ('external', 'blob', 'attach'):
-            raise DataJointError('Invalid attribute type in:\n%s' % line)
+            raise DataJointError('Configurable types must be in the form blob-<store> or attach-<store> in:\n%s' % line)
         store_name = '-'.join(store_name[1:])
         if store_name and not store_name.isidentifier():
             raise DataJointError(

From 90cf69788cac738d2b0d83b91314ad61887fcbf9 Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Thu, 7 Feb 2019 18:03:49 -0600
Subject: [PATCH 35/36] improve warning messages

---
 datajoint/erd.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/datajoint/erd.py b/datajoint/erd.py
index 0566daf6c..6e830d26b 100644
--- a/datajoint/erd.py
+++ b/datajoint/erd.py
@@ -50,7 +50,7 @@ class ERD:
         """
         def __init__(self, *args, **kwargs):
-            warnings.warn('ERD functionality depends on matplotlib, networkx, and pygraphviz. '
+            warnings.warn('ERD functionality depends on matplotlib and pygraphviz. '
                           'Please install both of these libraries to enable the ERD feature.')
 else:
     class ERD(nx.DiGraph):
         """
@@ -228,7 +228,6 @@ def _make_graph(self):
         return graph

     def make_dot(self):
-        import networkx as nx
         graph = self._make_graph()
         graph.nodes()

From afeadb17190436cf268327783c28ecad772e808a Mon Sep 17 00:00:00 2001
From: Dimitri Yatsenko
Date: Fri, 8 Feb 2019 10:30:33 -0600
Subject: [PATCH 36/36] change version to 0.12.dev

---
 datajoint/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datajoint/version.py b/datajoint/version.py
index ea370a8e5..92cebb540 100644
--- a/datajoint/version.py
+++ b/datajoint/version.py
@@ -1 +1 @@
-__version__ = "0.12.0"
+__version__ = "0.12.dev"
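Taken together, the storage-cleanup patches define a two-step maintenance flow
for external storage: delete_garbage removes unreferenced hashes from the
external table, and clean_store (backed by Folder.clean for S3) then removes
the now-orphaned objects from the store itself. A hedged usage sketch, assuming
a configured connection; the schema name and store name are illustrative:

    import datajoint as dj

    schema = dj.schema('my_schema')   # hypothetical existing schema

    # step 1: drop hash-table rows that no table references any longer
    schema.external.delete_garbage()

    # step 2: during a low-usage period, remove the orphaned files/objects;
    # verbose=True prints each deleted object, as implemented in patch 33
    schema.external.clean_store('raw', verbose=True)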