diff --git a/desktop/conf.dist/hue.ini b/desktop/conf.dist/hue.ini index d45e1400ce2..d9deabfd889 100644 --- a/desktop/conf.dist/hue.ini +++ b/desktop/conf.dist/hue.ini @@ -935,8 +935,6 @@ tls=no # Settings for the Google Cloud lib # ------------------------------------------------------------------------ -# Maximum number of keys with specific directory prefix that can be deleted in a single bulk operation in GS. -## gs_bulk_delete_dir_keys_max_limit=100 [[gc_accounts]] [[[default]]] @@ -983,6 +981,9 @@ tls=no ## Autocreate the user home directory in the remote home storage path. # autocreate_user_dir=true +## Enable integration with Google Storage for RAZ +# is_raz_gs_enabled=false + ########################################################################### # Settings to configure the snippets available in the Notebook ########################################################################### diff --git a/desktop/conf/pseudo-distributed.ini.tmpl b/desktop/conf/pseudo-distributed.ini.tmpl index 2a902fc5199..7f3d40716b3 100644 --- a/desktop/conf/pseudo-distributed.ini.tmpl +++ b/desktop/conf/pseudo-distributed.ini.tmpl @@ -918,8 +918,6 @@ # Settings for the Google Cloud lib # ------------------------------------------------------------------------ - # Maximum number of keys with specific directory prefix that can be deleted in a single bulk operation in GS. - ## gs_bulk_delete_dir_keys_max_limit=100 [[gc_accounts]] [[[default]]] @@ -966,6 +964,9 @@ ## Autocreate the user home directory in the remote home storage path. # autocreate_user_dir=true + ## Enable integration with Google Storage for RAZ + # is_raz_gs_enabled=false + ########################################################################### # Settings to configure the snippets available in the Notebook ########################################################################### diff --git a/desktop/core/ext-py3/boto-2.49.0/boto/gs/connection.py b/desktop/core/ext-py3/boto-2.49.0/boto/gs/connection.py index 9a2e4a2bbb6..0fc13639e4b 100755 --- a/desktop/core/ext-py3/boto-2.49.0/boto/gs/connection.py +++ b/desktop/core/ext-py3/boto-2.49.0/boto/gs/connection.py @@ -39,12 +39,12 @@ def __init__(self, gs_access_key_id=None, gs_secret_access_key=None, proxy_user=None, proxy_pass=None, host=DefaultHost, debug=0, https_connection_factory=None, calling_format=SubdomainCallingFormat(), path='/', - suppress_consec_slashes=True): + suppress_consec_slashes=True, anon=False): super(GSConnection, self).__init__(gs_access_key_id, gs_secret_access_key, is_secure, port, proxy, proxy_port, proxy_user, proxy_pass, host, debug, https_connection_factory, calling_format, path, "google", Bucket, - suppress_consec_slashes=suppress_consec_slashes) + suppress_consec_slashes=suppress_consec_slashes, anon=anon) def create_bucket(self, bucket_name, headers=None, location=Location.DEFAULT, policy=None, diff --git a/desktop/core/src/desktop/conf.py b/desktop/core/src/desktop/conf.py index e966f3e28a1..2c0c5aa1ce4 100644 --- a/desktop/core/src/desktop/conf.py +++ b/desktop/core/src/desktop/conf.py @@ -2348,6 +2348,12 @@ def handle_raz_api_auth(): type=coerce_bool, default=True, ), + IS_RAZ_GS_ENABLED=Config( + help=_('Enable integration with Google Storage for RAZ'), + key='is_raz_gs_enabled', + default=False, + type=coerce_bool + ) ) ) @@ -2683,13 +2689,6 @@ def get_ldap_bind_password(ldap_config): PERMISSION_ACTION_GS = "gs_access" -GS_BULK_DELETE_DIR_KEYS_MAX_LIMIT = Config( - help=_('Maximum number of keys with specific directory prefix that can be deleted in a single bulk operation in GS.'), - key='gs_bulk_delete_dir_keys_max_limit', - default=100, - type=coerce_zero_or_positive_integer -) - GC_ACCOUNTS = UnspecifiedConfigSection( 'gc_accounts', help=_('One entry for each GC account'), @@ -2711,12 +2710,20 @@ def is_cm_managed(): def is_gs_enabled(): from desktop.lib.idbroker import conf as conf_idbroker # Circular dependencies desktop.conf -> idbroker.conf -> desktop.conf + return ('default' in list(GC_ACCOUNTS.keys()) and GC_ACCOUNTS['default'].JSON_CREDENTIALS.get()) or \ - conf_idbroker.is_idbroker_enabled('gs') + conf_idbroker.is_idbroker_enabled('gs') or \ + is_raz_gs() def has_gs_access(user): from desktop.auth.backend import is_admin - return user.is_authenticated and user.is_active and (is_admin(user) or user.has_hue_permission(action="gs_access", app="filebrowser")) + return user.is_authenticated and user.is_active and ( + is_admin(user) or user.has_hue_permission(action="gs_access", app="filebrowser") or is_raz_gs()) + +def is_raz_gs(): + from desktop.conf import RAZ # Must be imported dynamically in order to have proper value + + return (RAZ.IS_ENABLED.get() and RAZ.IS_RAZ_GS_ENABLED.get()) def get_ozone_conf_dir_default(): diff --git a/desktop/core/src/desktop/lib/fs/gc/client.py b/desktop/core/src/desktop/lib/fs/gc/client.py index 021a9d5c341..f9ce8c59971 100644 --- a/desktop/core/src/desktop/lib/fs/gc/client.py +++ b/desktop/core/src/desktop/lib/fs/gc/client.py @@ -32,6 +32,7 @@ from desktop.lib.idbroker import conf as conf_idbroker from desktop.lib.idbroker.client import IDBroker from desktop.lib.fs.gc.gs import GSFileSystem +from desktop.lib.fs.gc.gsconnection import RazGSConnection def get_credential_provider(config, user): @@ -40,13 +41,19 @@ def get_credential_provider(config, user): def _make_client(identifier, user): - config = conf.GC_ACCOUNTS[identifier] if identifier in list(conf.GC_ACCOUNTS.keys()) else None - client = Client.from_config(config, get_credential_provider(config, user)) + if conf.is_raz_gs(): + gs_client_connection = RazGSConnection(username=user) # Note: Remaining GS configuration is fully skipped + gs_client_expiration = None + else: + config = conf.GC_ACCOUNTS[identifier] if identifier in list(conf.GC_ACCOUNTS.keys()) else None + gs_client_builder = Client.from_config(config, get_credential_provider(config, user)) + + gs_client_connection = gs_client_builder.get_gs_connection() + gs_client_expiration = gs_client_builder.expiration return GSFileSystem( - client.get_s3_connection(), - client.expiration, - headers={"x-goog-project-id": client.project}, + gs_client_connection, + gs_client_expiration, ) # It would be nice if the connection was lazy loaded @@ -62,7 +69,7 @@ def from_config(cls, config, credential_provider): credentials = credential_provider.get_credentials() return Client(json_credentials=credentials.get('JsonCredentials'), expiration=credentials.get('Expiration')) - def get_s3_connection(self): + def get_gs_connection(self): return HueGSConnection(provider=HueProvider('google', json_credentials=self.json_credentials)) diff --git a/desktop/core/src/desktop/lib/fs/gc/gs.py b/desktop/core/src/desktop/lib/fs/gc/gs.py index b58db45d703..8cf3cc948c6 100644 --- a/desktop/core/src/desktop/lib/fs/gc/gs.py +++ b/desktop/core/src/desktop/lib/fs/gc/gs.py @@ -27,7 +27,7 @@ from boto.s3.prefix import Prefix from django.utils.translation import gettext as _ -from desktop.conf import PERMISSION_ACTION_GS, GS_BULK_DELETE_DIR_KEYS_MAX_LIMIT +from desktop.conf import PERMISSION_ACTION_GS, is_raz_gs from desktop.lib.fs.gc import GS_ROOT, abspath, parse_uri, translate_gs_error, normpath, join as gs_join from desktop.lib.fs.gc.gsstat import GSStat from desktop.lib.fs.gc.gsfile import open as gsfile_open @@ -135,7 +135,18 @@ def parent_path(path): parent_dir = abspath(bucket_path, key_path) return parent_dir - + + def create_home_dir(self, home_path): + # When GS raz is enabled, try to create user home dir for REMOTE_STORAGE_HOME path + if is_raz_gs(): + LOG.debug('Attempting to create user directory for path: %s' % home_path) + try: + self.mkdir(home_path) + except Exception as e: + LOG.exception('Failed to create user home directory for path %s with error: %s' % (home_path, str(e))) + else: + LOG.info('Create home directory is not available for GS filesystem') + @translate_gs_error def stats(self, path): """Get file or directory stats for a GS path. @@ -299,15 +310,8 @@ def rmtree(self, path, skipTrash=True): else: # key.bucket.delete_keys() call is not supported from GS side # So, try deleting the all keys with directory prefix one by one - - # TODO: Check on the UI side if key count is greater than max limit and show nice notification. - deleted_dir_key_count = 0 for key in list(dir_keys): - if deleted_dir_key_count > GS_BULK_DELETE_DIR_KEYS_MAX_LIMIT.get(): - break - deleted_key = key.delete() - deleted_dir_key_count += 1 @translate_gs_error @auth_error_handler diff --git a/desktop/core/src/desktop/lib/fs/gc/gsconnection.py b/desktop/core/src/desktop/lib/fs/gc/gsconnection.py new file mode 100644 index 00000000000..13b1874b187 --- /dev/null +++ b/desktop/core/src/desktop/lib/fs/gc/gsconnection.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +# Licensed to Cloudera, Inc. under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. Cloudera, Inc. licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import re + +from boto.gs.key import Key +from boto.gs.connection import GSConnection +from boto.s3.connection import SubdomainCallingFormat + +from desktop.conf import RAZ +from desktop.lib.raz.clients import GSRazClient + + +LOG = logging.getLogger() + + +class SignedUrlGSConnection(GSConnection): + """ + Contact GS via a presigned Url of the resource hence not requiring any GS credentials. + + This is a client replacing the building of the Http Request of the GS resource via asking a third party providing for a presigned Urls. + The request information is then injected into the regular boto HTTPRequest as the format is the same. Raw calls via the requests + lib would work but the unmarshalling back from XML to boto2 Python object is tedious. + + The main logic consists in some light overrides in S3Connection#make_request() and AWSAuthConnection#make_request() so that we + send an updated HTTPRequest. + https://github.com/boto/boto/blob/develop/boto/s3/connection.py + https://github.com/boto/boto/blob/develop/boto/connection.py + + Example of a presigned GS Url declaring a `list bucket` call: + https://hue-gs-bucket.storage.googleapis.com/?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIA23E77ZX2HVY76YGL%2F20210505%2Fus-west-1%2Fs3%2Faws4_request&X-Amz-Date=20210505T171457Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=994d0ec2ca19a00aa2925fe62cab0e727591b1951a8a47504b2b9124facbd6cf + """ + def __init__(self, username, gs_access_key_id=None, gs_secret_access_key=None, + is_secure=True, port=None, proxy=None, proxy_port=None, + proxy_user=None, proxy_pass=None, + host=GSConnection.DefaultHost, debug=0, https_connection_factory=None, + calling_format=SubdomainCallingFormat(), path='/', + suppress_consec_slashes=True, anon=False): + + self.username = username + + # No auth handler with RAZ + anon = RAZ.IS_ENABLED.get() + + super(SignedUrlGSConnection, self).__init__( + gs_access_key_id=gs_access_key_id, gs_secret_access_key=gs_secret_access_key, + is_secure=is_secure, port=port, proxy=proxy, proxy_port=proxy_port, + proxy_user=proxy_user, proxy_pass=proxy_pass, + host=host, debug=debug, https_connection_factory=https_connection_factory, + calling_format=calling_format, path=path, + suppress_consec_slashes=suppress_consec_slashes, anon=anon + ) + + +class RazGSConnection(SignedUrlGSConnection): + """ + Class asking a RAZ server presigned Urls for all the operations on GS resources. + Some operations can be denied depending on the privileges of the users in Ranger. + + Then fill-up the boto HttpRequest with the presigned Url data and lets boto executes the request as usual, + so that we get the XML unmarshalling for free. + + Flow: + 1. signed_url = self.get_signed_url(/bucket/dir/key) + 2. request = http_request(signed_url) + 3. return self._mexe(requests) + """ + + def make_request(self, method, bucket='', key='', headers=None, data='', + query_args=None, sender=None, override_num_retries=None, + retry_handler=None): + + if isinstance(bucket, self.bucket_class): + bucket = bucket.name + if isinstance(key, Key): + key = key.name + + path = self.calling_format.build_path_base(bucket, key) + LOG.debug('path=%s' % path) + + auth_path = self.calling_format.build_auth_path(bucket, key) + LOG.debug('auth_path=%s' % auth_path) + + host = self.calling_format.build_host(self.server_name(), bucket) + + if query_args: + # Call to RAZ for getting the signed headers does not expect the 'generation' argument in the query_args + # This might be done to make their side RAZ-GS implementation similar to S3 + + # Using regex to remove the 'generation' arg and its value from the query string + query_args = re.sub(r'&?generation=[^&]*', '', query_args) + query_args = query_args.lstrip('&') # Remove any leading '&' if 'generation' is at the beginning + + path += '?' + query_args + LOG.debug('path=%s' % path) + auth_path += '?' + query_args + LOG.debug('auth_path=%s' % auth_path) + + params = {} + # GS expects only one type of headers, either all x-amz-* or all x-goog-*, and the signed headers returned from RAZ are of x-amz-* type + # So, we are converting all x-goog-* headers to x-amz-* headers before sending the final request to GS from Hue + if headers: + updated_headers = {'x-amz-' + key[7:]: value for key, value in headers.items() if key.startswith('x-goog-')} + headers.update(updated_headers) + + for key in list(headers.keys()): + if key.startswith('x-goog-'): + del headers[key] + + http_request = self.build_base_http_request(method, path, auth_path, params, headers, data, host) + + # Actual override starts here + LOG.debug('http_request: %s, %s, %s, %s, %s, %s, %s' % (method, path, auth_path, params, headers, data, host)) + LOG.debug('http_request object: %s' % http_request) + + url = 'https://%(host)s%(path)s' % {'host': host, 'path': path} + + # Do not send the xml data for signing for upload operation + xml_data = '' if query_args and 'uploadId=' in query_args else data + + raz_headers = self.get_signed_url(action=method, url=url, headers=headers, data=xml_data) + LOG.debug('Raz returned those headers: %s' % raz_headers) + + if raz_headers is not None: + http_request.headers.update(raz_headers) + else: + raise Exception('Aborting operation: We got back empty header from Raz for the request %s' % http_request) + + LOG.debug('Overriden: %s' % http_request) + + return self._mexe(http_request, sender, override_num_retries, + retry_handler=retry_handler) + + + def get_signed_url(self, action='GET', url=None, headers=None, data=None): + raz_client = GSRazClient(username=self.username) + + return raz_client.get_url(action, url, headers, data) diff --git a/desktop/core/src/desktop/lib/fs/proxyfs.py b/desktop/core/src/desktop/lib/fs/proxyfs.py index 06e9d42b710..ae4621754ee 100644 --- a/desktop/core/src/desktop/lib/fs/proxyfs.py +++ b/desktop/core/src/desktop/lib/fs/proxyfs.py @@ -13,33 +13,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from __future__ import absolute_import - -from future import standard_library -standard_library.install_aliases() from builtins import object import logging -import sys + +from urllib.parse import urlparse as lib_urlparse from crequest.middleware import CrequestMiddleware from useradmin.models import User from desktop.auth.backend import is_admin -from desktop.conf import DEFAULT_USER, ENABLE_ORGANIZATIONS, is_ofs_enabled +from desktop.conf import DEFAULT_USER, ENABLE_ORGANIZATIONS, is_ofs_enabled, is_raz_gs from desktop.lib.fs.ozone import OFS_ROOT +from desktop.lib.fs.gc.gs import get_gs_home_directory + from aws.conf import is_raz_s3 from aws.s3.s3fs import get_s3_home_directory from azure.conf import is_raz_abfs from azure.abfs.__init__ import get_home_dir_for_abfs -if sys.version_info[0] > 2: - from urllib.parse import urlparse as lib_urlparse -else: - from urlparse import urlparse as lib_urlparse - LOG = logging.getLogger() DEFAULT_USER = DEFAULT_USER.get() @@ -225,11 +218,13 @@ def create_home_dir(self, home_path=None): if is_ofs_enabled(): LOG.debug('Creation of user home path is not supported in Ozone.') - # Get the new home_path for S3/ABFS when RAZ is enabled. + # Get the new home_path for S3/ABFS/GS when RAZ is enabled. if is_raz_s3(): home_path = get_s3_home_directory(User.objects.get(username=self.getuser())) elif is_raz_abfs(): home_path = get_home_dir_for_abfs(User.objects.get(username=self.getuser())) + elif is_raz_gs(): + home_path = get_gs_home_directory(User.objects.get(username=self.getuser())) # Try getting user from the request and create home dirs. This helps when Hue admin is trying to create the dir for other users. # That way only Hue admin needs authorization to create for all Hue users and not each individual user. @@ -237,7 +232,7 @@ def create_home_dir(self, home_path=None): request = CrequestMiddleware.get_request() username = request.user.username if request and hasattr(request, 'user') and request.user.is_authenticated else self.getuser() - if RAZ.AUTOCREATE_USER_DIR.get() and (is_raz_s3() or is_raz_abfs()): + if RAZ.AUTOCREATE_USER_DIR.get() and (is_raz_s3() or is_raz_abfs() or is_raz_gs()): fs = self.do_as_user(username, self._get_fs, home_path) fs.create_home_dir(home_path) diff --git a/desktop/core/src/desktop/lib/raz/clients.py b/desktop/core/src/desktop/lib/raz/clients.py index 99ded646f03..c9bb585690c 100644 --- a/desktop/core/src/desktop/lib/raz/clients.py +++ b/desktop/core/src/desktop/lib/raz/clients.py @@ -33,13 +33,13 @@ def get_url(self, action='GET', path=None, headers=None, data=None): Example of headers: { u'x-amz-content-sha256': u'UNSIGNED-PAYLOAD', - u'Host': u'prakashmowdev1.s3-us-west-2.amazonaws.com', + u'Host': u'hue-testing.s3-us-west-2.amazonaws.com', u'X-Amz-Security-Token': u'IQoJb3JpZ2luX2Vj...C', u'X-Amz-Date': u'20210604T102022Z', u'Authorization': u'AWS4-HMAC-SHA256 Credential=ASIAYO3P24NAOAYMMDNN/20210604/us-west-2/s3/aws4_request, SignedHeaders=host;user-agent;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=d341a194c2998c64b6fc726b69d0c3c2b97d520265f80df7e1bc1ac59a21ef94', - u'User-Agent': u'user:csso_romain' + u'User-Agent': u'user:csso_gethue_user' } ''' c = get_raz_client( @@ -52,6 +52,35 @@ def get_url(self, action='GET', path=None, headers=None, data=None): return c.check_access(method=action, url=path, headers=headers, data=data) +class GSRazClient(): + + def __init__(self, username): + self.username = username + + def get_url(self, action='GET', path=None, headers=None, data=None): + ''' + Example of headers: + { + u'x-amz-content-sha256': u'UNSIGNED-PAYLOAD', + u'Host': u'hue-testing.s3-us-west-2.amazonaws.com', + u'X-Amz-Security-Token': u'IQoJb3JpZ2luX2Vj...C', + u'X-Amz-Date': u'20210604T102022Z', + u'Authorization': u'AWS4-HMAC-SHA256 Credential=ASIAYO3P24NAOAYMMDNN/20210604/us-west-2/s3/aws4_request, + SignedHeaders=host;user-agent;x-amz-content-sha256;x-amz-date;x-amz-security-token, + Signature=d341a194c2998c64b6fc726b69d0c3c2b97d520265f80df7e1bc1ac59a21ef94', + u'User-Agent': u'user:csso_gethue_user' + } + ''' + c = get_raz_client( + raz_url=RAZ.API_URL.get(), + username=self.username, + auth=RAZ.API_AUTHENTICATION.get(), + service='gs', + ) + + return c.check_access(method=action, url=path, headers=headers, data=data) + + class AdlsRazClient(): def __init__(self, username): diff --git a/desktop/core/src/desktop/lib/raz/raz_client.py b/desktop/core/src/desktop/lib/raz/raz_client.py index 9a94bf203f6..2009696dcce 100644 --- a/desktop/core/src/desktop/lib/raz/raz_client.py +++ b/desktop/core/src/desktop/lib/raz/raz_client.py @@ -55,7 +55,7 @@ def __init__(self, raz_url, auth_type, username, service='s3', service_name='cm_ 'service_name': 'adls', 'serviceType': 'adls' } - else: + elif self.service in ('s3', 'gs'): self.service_params = { 'endpoint_prefix': 's3', 'service_name': 's3', @@ -97,7 +97,7 @@ def check_access(self, method, url, params=None, headers=None, data=None): if self.service == 'adls': self._make_adls_request(request_data, method, path, url_params, resource_path) - elif self.service == 's3': + elif self.service in ('s3', 'gs'): self._make_s3_request(request_data, request_headers, method, params, headers, url_params, endpoint, resource_path, data=data) LOG.debug('Raz url: %s' % raz_url) @@ -127,7 +127,8 @@ def check_access(self, method, url, params=None, headers=None, data=None): if self.service == 'adls': LOG.debug("Received SAS %s" % signed_response_data["ADLS_DSAS"]) return {'token': signed_response_data["ADLS_DSAS"]} - else: + + elif self.service in ('s3', 'gs'): signed_response_result = signed_response_data["S3_SIGN_RESPONSE"] if signed_response_result is not None: