Skip to content

Commit

Permalink
Add dataset_id_field to SchemingDCATXLSHarvester for identifier resolution of attached resources
Browse files Browse the repository at this point in the history
  • Loading branch information
mjanez committed Oct 23, 2024
1 parent 9935d21 commit dcb88d3
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 5 deletions.
1 change: 0 additions & 1 deletion ckanext/schemingdcat/harvesters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1527,7 +1527,6 @@ def _set_package_dict_default_values(self, package_dict, harvest_object, context
elif isinstance(value, dict):
package_dict[key] = {k: v for k, v in value.items()}


# Fallback: Using schemingdcat config defaults if no default values are set
package_dict = self._apply_package_defaults_from_config(package_dict, DATASET_DEFAULT_FIELDS)

Expand Down
19 changes: 15 additions & 4 deletions ckanext/schemingdcat/harvesters/xls.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def _clean_table_datadictionaries(self, data, prefix_colnames='datadictionary_',
def _add_distributions_and_datadictionaries_to_datasets(self, table_datasets, table_distributions_grouped, table_datadictionaries_grouped, identifier_field='identifier', alternate_identifier_field='alternate_identifier', inspire_id_field='inspire_id', datadictionary_id_field="id"):
"""
Add distributions (CKAN resources) and datadictionaries to each dataset object.
Args:
table_datasets (list): List of dataset objects.
table_distributions_grouped (dict): Dictionary of distributions grouped by dataset identifier.
Expand All @@ -357,18 +357,23 @@ def _add_distributions_and_datadictionaries_to_datasets(self, table_datasets, ta
alternate_identifier_field (str, optional): Field name for the alternate identifier. Defaults to 'alternate_identifier'.
inspire_id_field (str, optional): Field name for the inspire id. Defaults to 'inspire_id'.
datadictionary_id_field (str, optional): Field name for the datadictionary id. Defaults to 'id'.
Returns:
list: List of dataset objects with distributions (CKAN resources) and datadictionaries added.
Notes:
If 'dataset_id_field' is specified in the Harvester configuration, it will be used as the primary identifier field.
Otherwise, the function will fall back to using 'identifier_field', 'alternate_identifier_field', and 'inspire_id_field' in that order.
"""
try:
dataset_id_field = self.config.get('dataset_id_field', None)
return [
{
**d,
'resources': [
{**dr, 'datadictionaries': table_datadictionaries_grouped.get(dr[datadictionary_id_field], []) if table_datadictionaries_grouped else []}
for dr in table_distributions_grouped.get(
d.get(identifier_field) or d.get(alternate_identifier_field) or d.get(inspire_id_field), []
d.get(dataset_id_field) if dataset_id_field else d.get(identifier_field) or d.get(alternate_identifier_field) or d.get(inspire_id_field), []
)
]
}
Expand Down Expand Up @@ -669,7 +674,13 @@ def validate_config(self, config):

config = json.dumps({**config_obj, mapping_name: field_mapping})

return config
# Check if dataset_id_field exists and is a string
if 'dataset_id_field' in config_obj:
dataset_id_field = config_obj['datadictionary_sheet']
if not isinstance(dataset_id_field, basestring):
raise ValueError('dataset_id_field must be a string')

return config

def modify_package_dict(self, package_dict, harvest_object):
'''
Expand Down

0 comments on commit dcb88d3

Please sign in to comment.