CDPD-48829: [raz] Make user access for GS compatible with existing RAZ implementation (#3511)

(cherry picked from commit df26eff)
(cherry picked from commit e244164)
Change-Id: I011f6d81925ec0f9cbc27ec11e7b32f0be382735
Harshg999 authored and wing2fly committed Nov 8, 2023
1 parent cf34868 commit 8ae91b6
Showing 10 changed files with 246 additions and 49 deletions.
5 changes: 3 additions & 2 deletions desktop/conf.dist/hue.ini
@@ -935,8 +935,6 @@ tls=no

# Settings for the Google Cloud lib
# ------------------------------------------------------------------------
# Maximum number of keys with specific directory prefix that can be deleted in a single bulk operation in GS.
## gs_bulk_delete_dir_keys_max_limit=100

[[gc_accounts]]
[[[default]]]
@@ -983,6 +981,9 @@ tls=no
## Autocreate the user home directory in the remote home storage path.
# autocreate_user_dir=true

## Enable integration with Google Storage for RAZ
# is_raz_gs_enabled=false

###########################################################################
# Settings to configure the snippets available in the Notebook
###########################################################################
5 changes: 3 additions & 2 deletions desktop/conf/pseudo-distributed.ini.tmpl
@@ -918,8 +918,6 @@

# Settings for the Google Cloud lib
# ------------------------------------------------------------------------
# Maximum number of keys with specific directory prefix that can be deleted in a single bulk operation in GS.
## gs_bulk_delete_dir_keys_max_limit=100

[[gc_accounts]]
[[[default]]]
@@ -966,6 +964,9 @@
## Autocreate the user home directory in the remote home storage path.
# autocreate_user_dir=true

## Enable integration with Google Storage for RAZ
# is_raz_gs_enabled=false

###########################################################################
# Settings to configure the snippets available in the Notebook
###########################################################################
4 changes: 2 additions & 2 deletions desktop/core/ext-py3/boto-2.49.0/boto/gs/connection.py
@@ -39,12 +39,12 @@ def __init__(self, gs_access_key_id=None, gs_secret_access_key=None,
proxy_user=None, proxy_pass=None,
host=DefaultHost, debug=0, https_connection_factory=None,
calling_format=SubdomainCallingFormat(), path='/',
suppress_consec_slashes=True):
suppress_consec_slashes=True, anon=False):
super(GSConnection, self).__init__(gs_access_key_id, gs_secret_access_key,
is_secure, port, proxy, proxy_port, proxy_user, proxy_pass,
host, debug, https_connection_factory, calling_format, path,
"google", Bucket,
suppress_consec_slashes=suppress_consec_slashes)
suppress_consec_slashes=suppress_consec_slashes, anon=anon)

def create_bucket(self, bucket_name, headers=None,
location=Location.DEFAULT, policy=None,
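The only change in the vendored boto file is threading the anon flag through to the base connection, which is what lets the RAZ path build a connection without any GS credentials. A minimal sketch of the patched constructor in use, with a hypothetical bucket name and assuming boto's anonymous auth handler takes over the signing:

from boto.gs.connection import GSConnection

# anon=True means no gs_access_key_id / gs_secret_access_key is needed;
# with RAZ, signed headers are injected per request instead of boto signing it.
conn = GSConnection(anon=True)
bucket = conn.get_bucket('hue-gs-bucket', validate=False)  # validate=False avoids an immediate request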
25 changes: 16 additions & 9 deletions desktop/core/src/desktop/conf.py
@@ -2348,6 +2348,12 @@ def handle_raz_api_auth():
type=coerce_bool,
default=True,
),
IS_RAZ_GS_ENABLED=Config(
help=_('Enable integration with Google Storage for RAZ'),
key='is_raz_gs_enabled',
default=False,
type=coerce_bool
)
)
)

@@ -2683,13 +2689,6 @@ def get_ldap_bind_password(ldap_config):

PERMISSION_ACTION_GS = "gs_access"

GS_BULK_DELETE_DIR_KEYS_MAX_LIMIT = Config(
help=_('Maximum number of keys with specific directory prefix that can be deleted in a single bulk operation in GS.'),
key='gs_bulk_delete_dir_keys_max_limit',
default=100,
type=coerce_zero_or_positive_integer
)

GC_ACCOUNTS = UnspecifiedConfigSection(
'gc_accounts',
help=_('One entry for each GC account'),
@@ -2711,12 +2710,20 @@ def is_cm_managed():

def is_gs_enabled():
from desktop.lib.idbroker import conf as conf_idbroker # Circular dependencies desktop.conf -> idbroker.conf -> desktop.conf

return ('default' in list(GC_ACCOUNTS.keys()) and GC_ACCOUNTS['default'].JSON_CREDENTIALS.get()) or \
conf_idbroker.is_idbroker_enabled('gs')
conf_idbroker.is_idbroker_enabled('gs') or \
is_raz_gs()

def has_gs_access(user):
from desktop.auth.backend import is_admin
return user.is_authenticated and user.is_active and (is_admin(user) or user.has_hue_permission(action="gs_access", app="filebrowser"))
return user.is_authenticated and user.is_active and (
is_admin(user) or user.has_hue_permission(action="gs_access", app="filebrowser") or is_raz_gs())

def is_raz_gs():
from desktop.conf import RAZ # Must be imported dynamically in order to have proper value

return (RAZ.IS_ENABLED.get() and RAZ.IS_RAZ_GS_ENABLED.get())


def get_ozone_conf_dir_default():
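With the new option, the RAZ-backed GS path is active only when both is_enabled and is_raz_gs_enabled are set under the [desktop] / [[raz]] block of hue.ini; is_gs_enabled() and has_gs_access() then return True without any gc_accounts or IDBroker configuration. A small sketch of the new helpers, assuming the usual set_for_testing() helper on Hue Config objects:

from desktop import conf

# set_for_testing() returns a callable that restores the previous value.
resets = [
  conf.RAZ.IS_ENABLED.set_for_testing(True),
  conf.RAZ.IS_RAZ_GS_ENABLED.set_for_testing(True),
]
try:
  assert conf.is_raz_gs()      # both flags on, so the RAZ GS path is active
  assert conf.is_gs_enabled()  # no 'default' gc_accounts credentials needed anymore
finally:
  for reset in resets:
    reset()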
19 changes: 13 additions & 6 deletions desktop/core/src/desktop/lib/fs/gc/client.py
@@ -32,6 +32,7 @@
from desktop.lib.idbroker import conf as conf_idbroker
from desktop.lib.idbroker.client import IDBroker
from desktop.lib.fs.gc.gs import GSFileSystem
from desktop.lib.fs.gc.gsconnection import RazGSConnection


def get_credential_provider(config, user):
@@ -40,13 +41,19 @@ def get_credential_provider(config, user):


def _make_client(identifier, user):
config = conf.GC_ACCOUNTS[identifier] if identifier in list(conf.GC_ACCOUNTS.keys()) else None
client = Client.from_config(config, get_credential_provider(config, user))
if conf.is_raz_gs():
gs_client_connection = RazGSConnection(username=user) # Note: Remaining GS configuration is fully skipped
gs_client_expiration = None
else:
config = conf.GC_ACCOUNTS[identifier] if identifier in list(conf.GC_ACCOUNTS.keys()) else None
gs_client_builder = Client.from_config(config, get_credential_provider(config, user))

gs_client_connection = gs_client_builder.get_gs_connection()
gs_client_expiration = gs_client_builder.expiration

return GSFileSystem(
client.get_s3_connection(),
client.expiration,
headers={"x-goog-project-id": client.project},
gs_client_connection,
gs_client_expiration,
) # It would be nice if the connection was lazy loaded


@@ -62,7 +69,7 @@ def from_config(cls, config, credential_provider):
credentials = credential_provider.get_credentials()
return Client(json_credentials=credentials.get('JsonCredentials'), expiration=credentials.get('Expiration'))

def get_s3_connection(self):
def get_gs_connection(self):
return HueGSConnection(provider=HueProvider('google', json_credentials=self.json_credentials))


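_make_client() now branches on conf.is_raz_gs(): on the RAZ path it builds a RazGSConnection keyed only on the username (with no expiration), otherwise it keeps the previous credential-based Client flow. A hedged sketch of what a caller gets back, with an illustrative user, account identifier and path:

from django.contrib.auth.models import User
from desktop.lib.fs.gc.client import _make_client

user = User.objects.get(username='alice')  # 'alice' is illustrative; any authenticated Hue user

# With is_raz_gs() true this wraps a RazGSConnection; otherwise it builds a
# HueGSConnection from the 'default' gc_accounts entry or from IDBroker.
fs = _make_client('default', user)
print(fs.stats('gs://hue-bucket/data'))  # hypothetical bucket/path, triggers a real GS call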
22 changes: 13 additions & 9 deletions desktop/core/src/desktop/lib/fs/gc/gs.py
@@ -27,7 +27,7 @@
from boto.s3.prefix import Prefix
from django.utils.translation import gettext as _

from desktop.conf import PERMISSION_ACTION_GS, GS_BULK_DELETE_DIR_KEYS_MAX_LIMIT
from desktop.conf import PERMISSION_ACTION_GS, is_raz_gs
from desktop.lib.fs.gc import GS_ROOT, abspath, parse_uri, translate_gs_error, normpath, join as gs_join
from desktop.lib.fs.gc.gsstat import GSStat
from desktop.lib.fs.gc.gsfile import open as gsfile_open
@@ -135,7 +135,18 @@ def parent_path(path):
parent_dir = abspath(bucket_path, key_path)

return parent_dir


def create_home_dir(self, home_path):
# When GS RAZ is enabled, try to create the user home dir for the REMOTE_STORAGE_HOME path
if is_raz_gs():
LOG.debug('Attempting to create user directory for path: %s' % home_path)
try:
self.mkdir(home_path)
except Exception as e:
LOG.exception('Failed to create user home directory for path %s with error: %s' % (home_path, str(e)))
else:
LOG.info('Create home directory is not available for GS filesystem')

@translate_gs_error
def stats(self, path):
"""Get file or directory stats for a GS path.
@@ -299,15 +310,8 @@ def rmtree(self, path, skipTrash=True):
else:
# The key.bucket.delete_keys() call is not supported on the GS side,
# so try deleting all keys with the directory prefix one by one

# TODO: Check on the UI side if key count is greater than max limit and show nice notification.
deleted_dir_key_count = 0
for key in list(dir_keys):
if deleted_dir_key_count > GS_BULK_DELETE_DIR_KEYS_MAX_LIMIT.get():
break

deleted_key = key.delete()
deleted_dir_key_count += 1

@translate_gs_error
@auth_error_handler
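Two behavioural changes land in this file: rmtree() drops the GS_BULK_DELETE_DIR_KEYS_MAX_LIMIT cap, so every key under the directory prefix is now deleted one by one, and the new create_home_dir() only attempts a mkdir() when RAZ GS is active, logging rather than raising on failure. A small sketch of the home-directory behaviour, reusing the fs object from the previous sketch and an illustrative path:

# fs is the GSFileSystem returned by _make_client() above.
home_path = 'gs://hue-bucket/user/alice'  # illustrative remote home path

# With RAZ GS enabled this calls fs.mkdir(home_path) and only logs if it fails;
# without RAZ it logs that home directory creation is unavailable and returns.
fs.create_home_dir(home_path)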
152 changes: 152 additions & 0 deletions desktop/core/src/desktop/lib/fs/gc/gsconnection.py
@@ -0,0 +1,152 @@
#!/usr/bin/env python
# Licensed to Cloudera, Inc. under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. Cloudera, Inc. licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import re

from boto.gs.key import Key
from boto.gs.connection import GSConnection
from boto.s3.connection import SubdomainCallingFormat

from desktop.conf import RAZ
from desktop.lib.raz.clients import GSRazClient


LOG = logging.getLogger()


class SignedUrlGSConnection(GSConnection):
"""
Contact GS via a presigned URL of the resource, so no GS credentials are required.
This client replaces the normal building of the HTTP request for a GS resource by asking a third party for presigned URLs.
The request information is then injected into the regular boto HTTPRequest, as the format is the same. Raw calls via the requests
lib would work, but the unmarshalling back from XML to boto2 Python objects is tedious.
The main logic consists of light overrides of S3Connection#make_request() and AWSAuthConnection#make_request() so that we
send an updated HTTPRequest.
https://github.com/boto/boto/blob/develop/boto/s3/connection.py
https://github.com/boto/boto/blob/develop/boto/connection.py
Example of a presigned GS Url declaring a `list bucket` call:
https://hue-gs-bucket.storage.googleapis.com/?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIA23E77ZX2HVY76YGL%2F20210505%2Fus-west-1%2Fs3%2Faws4_request&X-Amz-Date=20210505T171457Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=994d0ec2ca19a00aa2925fe62cab0e727591b1951a8a47504b2b9124facbd6cf
"""
def __init__(self, username, gs_access_key_id=None, gs_secret_access_key=None,
is_secure=True, port=None, proxy=None, proxy_port=None,
proxy_user=None, proxy_pass=None,
host=GSConnection.DefaultHost, debug=0, https_connection_factory=None,
calling_format=SubdomainCallingFormat(), path='/',
suppress_consec_slashes=True, anon=False):

self.username = username

# No auth handler with RAZ
anon = RAZ.IS_ENABLED.get()

super(SignedUrlGSConnection, self).__init__(
gs_access_key_id=gs_access_key_id, gs_secret_access_key=gs_secret_access_key,
is_secure=is_secure, port=port, proxy=proxy, proxy_port=proxy_port,
proxy_user=proxy_user, proxy_pass=proxy_pass,
host=host, debug=debug, https_connection_factory=https_connection_factory,
calling_format=calling_format, path=path,
suppress_consec_slashes=suppress_consec_slashes, anon=anon
)


class RazGSConnection(SignedUrlGSConnection):
"""
Class that asks a RAZ server for presigned URLs for all operations on GS resources.
Some operations can be denied depending on the user's privileges in Ranger.
It then fills the boto HTTPRequest with the presigned URL data and lets boto execute the request as usual,
so that we get the XML unmarshalling for free.
Flow:
1. signed_url = self.get_signed_url(/bucket/dir/key)
2. request = http_request(signed_url)
3. return self._mexe(request)
"""

def make_request(self, method, bucket='', key='', headers=None, data='',
query_args=None, sender=None, override_num_retries=None,
retry_handler=None):

if isinstance(bucket, self.bucket_class):
bucket = bucket.name
if isinstance(key, Key):
key = key.name

path = self.calling_format.build_path_base(bucket, key)
LOG.debug('path=%s' % path)

auth_path = self.calling_format.build_auth_path(bucket, key)
LOG.debug('auth_path=%s' % auth_path)

host = self.calling_format.build_host(self.server_name(), bucket)

if query_args:
# The call to RAZ for the signed headers does not expect the 'generation' argument in the query_args.
# This is likely done to keep the RAZ-side GS implementation similar to S3.

# Using regex to remove the 'generation' arg and its value from the query string
query_args = re.sub(r'&?generation=[^&]*', '', query_args)
query_args = query_args.lstrip('&') # Remove any leading '&' if 'generation' is at the beginning

path += '?' + query_args
LOG.debug('path=%s' % path)
auth_path += '?' + query_args
LOG.debug('auth_path=%s' % auth_path)

params = {}
# GS expects only one style of headers, either all x-amz-* or all x-goog-*, and the signed headers returned from RAZ are of the x-amz-* type.
# So, we convert all x-goog-* headers to x-amz-* headers before sending the final request to GS from Hue.
if headers:
updated_headers = {'x-amz-' + key[7:]: value for key, value in headers.items() if key.startswith('x-goog-')}
headers.update(updated_headers)

for key in list(headers.keys()):
if key.startswith('x-goog-'):
del headers[key]

http_request = self.build_base_http_request(method, path, auth_path, params, headers, data, host)

# Actual override starts here
LOG.debug('http_request: %s, %s, %s, %s, %s, %s, %s' % (method, path, auth_path, params, headers, data, host))
LOG.debug('http_request object: %s' % http_request)

url = 'https://%(host)s%(path)s' % {'host': host, 'path': path}

# Do not send the xml data for signing for upload operation
xml_data = '' if query_args and 'uploadId=' in query_args else data

raz_headers = self.get_signed_url(action=method, url=url, headers=headers, data=xml_data)
LOG.debug('Raz returned those headers: %s' % raz_headers)

if raz_headers is not None:
http_request.headers.update(raz_headers)
else:
raise Exception('Aborting operation: We got back empty header from Raz for the request %s' % http_request)

LOG.debug('Overridden: %s' % http_request)

return self._mexe(http_request, sender, override_num_retries,
retry_handler=retry_handler)


def get_signed_url(self, action='GET', url=None, headers=None, data=None):
raz_client = GSRazClient(username=self.username)

return raz_client.get_url(action, url, headers, data)
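Because make_request() only swaps boto's signing for RAZ-provided headers, ordinary boto calls keep working on top of the new connection class. A hedged usage sketch, assuming RAZ is enabled and reachable and using illustrative bucket and prefix names:

from desktop.lib.fs.gc.gsconnection import RazGSConnection

conn = RazGSConnection(username='alice')  # only the Hue username is needed, no GS keys
bucket = conn.get_bucket('hue-gs-bucket', validate=False)  # illustrative bucket, skip the initial request

# Each call below goes through the overridden make_request(): the URL is sent to RAZ,
# the returned x-amz-* headers are merged in, and boto executes and unmarshals as usual.
for key in bucket.list(prefix='data/'):
  print(key.name, key.size)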