diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 32fc0d5e..3271637d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,6 +32,7 @@ jobs: runs-on: ubuntu-latest container: image: ${{ matrix.ckan-image }} + options: --user root services: solr: image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9 @@ -63,7 +64,7 @@ jobs: - name: Install requirements (2.9) run: | pip install -U pytest-rerunfailures - if: ${{ matrix.ckan-version == '2.9' }} + if: ${{ matrix.ckan-version == '2.9' }} - name: Setup extension (CKAN >= 2.9) run: | ckan -c test.ini db init diff --git a/README.rst b/README.rst index d3508ac0..266630f4 100644 --- a/README.rst +++ b/README.rst @@ -232,7 +232,7 @@ For example, in case you want to retain changes made by the users to the fields Command line interface ====================== -The ``ckan harvester`` command provides utilities to manage harvest operations from the command line. +The ``ckan harvester`` command provides utilities to manage harvest operations from the command line. Please refer to the help message of each command for more details:: @@ -329,6 +329,9 @@ field. The currently supported configuration options are: * api_key: If the remote CKAN instance has restricted access to the API, you can provide a CKAN API key, which will be sent in any request. +* user_agent: Set a custom user agent string on gathering and fetching, + to handle servers that whitelist or blacklist specific values. + * read_only: Create harvested packages in read-only mode. Only the user who performed the harvest (the one defined in the previous setting or the 'harvest' sysadmin) will be able to edit and administer the packages diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index aba454dd..878f20c5 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -35,6 +35,11 @@ def _get_search_api_offset(self): def _get_content(self, url): headers = {} + + user_agent = self.config.get('user_agent') + if user_agent: + headers['User-Agent'] = str(user_agent) + api_key = self.config.get('api_key') if api_key: headers['Authorization'] = api_key