From e8628979d2011f9237665b162a17710a19817cab Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 11 Apr 2022 12:07:58 +0200 Subject: [PATCH 001/119] conflict --- viadot/flows/adls_gen1_to_azure_sql_new.py | 1 + viadot/task_utils.py | 72 ++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/viadot/flows/adls_gen1_to_azure_sql_new.py b/viadot/flows/adls_gen1_to_azure_sql_new.py index 05f977280..8ae014b3c 100644 --- a/viadot/flows/adls_gen1_to_azure_sql_new.py +++ b/viadot/flows/adls_gen1_to_azure_sql_new.py @@ -149,3 +149,4 @@ def gen_flow(self) -> Flow: gen2_upload_task.set_upstream(df_to_csv_task, flow=self) create_table_task.set_upstream(df_to_csv_task, flow=self) bulk_insert_task.set_upstream(create_table_task, flow=self) + diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 2890cdc70..b663eeed9 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -345,6 +345,78 @@ def df_to_dataset( ) +@curry +def custom_mail_state_handler( + tracked_obj: Union["Flow", "Task"], + old_state: prefect.engine.state.State, + new_state: prefect.engine.state.State, + only_states: list = [Failed], + local_api_key: str = None, + credentials_secret: str = None, + vault_name: str = None, + from_email: str = None, + to_emails: str = None, +) -> prefect.engine.state.State: + + """ + Custom state handler configured to work with sendgrid. + Works as a standalone state handler, or can be called from within a custom state handler. + Args: + tracked_obj (Task or Flow): Task or Flow object the handler is registered with. + old_state (State): previous state of tracked object. + new_state (State): new state of tracked object. + only_states ([State], optional): similar to `ignore_states`, but instead _only_ + notifies you if the Task / Flow is in a state from the provided list of `State` + classes. + local_api_key (str, optional): Api key from local config. + credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with API KEY. + vault_name (str, optional): Name of key vault. + from_email (str): Sender mailbox address. + to_emails (str): Receiver mailbox address. + Returns: State: the `new_state` object that was provided + """ + + if credentials_secret is None: + try: + credentials_secret = PrefectSecret("mail_notifier_api_key").run() + except ValueError: + pass + + if credentials_secret is not None: + credentials_str = AzureKeyVaultSecret( + credentials_secret, vault_name=vault_name + ).run() + api_key = json.loads(credentials_str).get("API_KEY") + elif local_api_key is not None: + api_key = local_config.get(local_api_key).get("API_KEY") + else: + raise Exception("Please provide API KEY") + + curr_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + only_states = only_states or [] + if only_states and not any( + [isinstance(new_state, included) for included in only_states] + ): + return new_state + url = prefect.client.Client().get_cloud_url( + "flow-run", prefect.context["flow_run_id"], as_user=False + ) + message = Mail( + from_email=from_email, + to_emails=to_emails, + subject=f"The flow {tracked_obj.name} - Status {new_state}", + html_content=f"The flow {cast(str,tracked_obj.name)} FAILED at {curr_dt}. \ +
+    <br>More details here: {url}
", + ) + try: + send_grid = SendGridAPIClient(api_key) + response = send_grid.send(message) + except Exception as e: + raise e + + return new_state + + @task def df_clean_column( df: pd.DataFrame, columns_to_clean: List[str] = None From d1022eae12904428b6592314dc33c40afc6570ec Mon Sep 17 00:00:00 2001 From: winiar93 Date: Fri, 15 Apr 2022 15:22:53 +0200 Subject: [PATCH 002/119] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20edited=20git=20ign?= =?UTF-8?q?ore?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 929fa1d4d..fb0123b47 100644 --- a/.gitignore +++ b/.gitignore @@ -152,4 +152,6 @@ desktop.ini .viminfo # SAP RFC lib -sap_netweaver_rfc \ No newline at end of file +sap_netweaver_rfc + +michal \ No newline at end of file From 2abfedc8d369d6c3cdab11a88d2f658307ca7d54 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 19 Apr 2022 10:07:26 +0200 Subject: [PATCH 003/119] =?UTF-8?q?=F0=9F=90=9B=20=20Fixed=20and=20updated?= =?UTF-8?q?=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_azure_sql_transform.py | 1 + tests/integration/tasks/test_azure_data_lake.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/integration/flows/test_azure_sql_transform.py b/tests/integration/flows/test_azure_sql_transform.py index 9e401e2ca..732de7a33 100644 --- a/tests/integration/flows/test_azure_sql_transform.py +++ b/tests/integration/flows/test_azure_sql_transform.py @@ -11,6 +11,7 @@ @pytest.fixture() def TEST_TABLE(): run_sql_task = AzureSQLDBQuery() + run_sql_task.run(f"DROP TABLE IF EXISTS {FQN}") run_sql_task.run(f"CREATE TABLE {FQN} (id INT, name VARCHAR(25))") run_sql_task.run(f"INSERT INTO {FQN} VALUES (1, 'Mike')") yield diff --git a/tests/integration/tasks/test_azure_data_lake.py b/tests/integration/tasks/test_azure_data_lake.py index de3ecf49e..92a625932 100644 --- a/tests/integration/tasks/test_azure_data_lake.py +++ b/tests/integration/tasks/test_azure_data_lake.py @@ -1,5 +1,6 @@ import os import uuid +import pytest from viadot.sources import AzureDataLake from viadot.tasks import ( @@ -8,8 +9,9 @@ AzureDataLakeUpload, AzureDataLakeCopy, AzureDataLakeList, + AzureDataLakeRemove, ) -from viadot.tasks.azure_data_lake import AzureDataLakeRemove + uuid_4 = uuid.uuid4() uuid_4_2 = uuid.uuid4() @@ -22,17 +24,18 @@ file_name_parquet = f"test_file_{uuid_4}.parquet" adls_path_parquet = f"raw/supermetrics/{file_name_parquet}" -# TODO: add pytest-depends as download tests depend on the upload -# and can't be ran separately - def test_azure_data_lake_upload(TEST_CSV_FILE_PATH): upload_task = AzureDataLakeUpload() - upload_task.run(from_path=TEST_CSV_FILE_PATH, to_path=adls_path) + upload_task.run( + from_path=TEST_CSV_FILE_PATH, + to_path=adls_path, + ) file = AzureDataLake(adls_path) assert file.exists() +@pytest.mark.depends(on=["test_azure_data_lake_upload"]) def test_azure_data_lake_download(): download_task = AzureDataLakeDownload() download_task.run(from_path=adls_path) @@ -40,6 +43,7 @@ def test_azure_data_lake_download(): os.remove(file_name) +@pytest.mark.depends(on=["test_azure_data_lake_upload"]) def test_azure_data_lake_to_df(): task = AzureDataLakeToDF() df = task.run(path=adls_path, sep="\t") @@ -55,6 +59,7 @@ def test_azure_data_lake_to_df_parquet(TEST_PARQUET_FILE_PATH): assert not df.empty 
+@pytest.mark.depends(on=["test_azure_data_lake_upload"]) def test_azure_data_lake_copy(): copy_task = AzureDataLakeCopy() copy_task.run(from_path=adls_path, to_path=adls_path_2) @@ -68,6 +73,7 @@ def test_azure_data_lake_list(): assert adls_path in files +@pytest.mark.depends(on=["test_azure_data_lake_upload"]) def test_azure_data_lake_remove(): file = AzureDataLake(adls_path) assert file.exists() From 9498269e4442dd4a3d6e6fe499501e55243d1ee5 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 20 Apr 2022 14:30:26 +0200 Subject: [PATCH 004/119] =?UTF-8?q?=E2=9C=85=20Addad=20and=20updated=20tes?= =?UTF-8?q?ts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_adls_container_to_container.py | 16 ++++++++++++++++ .../tasks/test_cloud_for_customers.py | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 tests/integration/flows/test_adls_container_to_container.py diff --git a/tests/integration/flows/test_adls_container_to_container.py b/tests/integration/flows/test_adls_container_to_container.py new file mode 100644 index 000000000..511faad69 --- /dev/null +++ b/tests/integration/flows/test_adls_container_to_container.py @@ -0,0 +1,16 @@ +from viadot.flows import ADLSContainerToContainer +from viadot.sources import AzureDataLake + +TEST_FILE_BLOB_PATH = "raw/supermetrics/mp/test.csv" +TEST_FILE_BLOB_PATH2 = "operations/supermetrics/mp/test.csv" + + +def test_adls_container_to_container(): + flow = ADLSContainerToContainer( + name="test to container", + from_path=TEST_FILE_BLOB_PATH, + to_path=TEST_FILE_BLOB_PATH2, + ) + flow.run() + file = AzureDataLake(TEST_FILE_BLOB_PATH2) + assert file.exists() diff --git a/tests/integration/tasks/test_cloud_for_customers.py b/tests/integration/tasks/test_cloud_for_customers.py index 50a6a1b8c..9446161f9 100644 --- a/tests/integration/tasks/test_cloud_for_customers.py +++ b/tests/integration/tasks/test_cloud_for_customers.py @@ -7,7 +7,7 @@ def test_c4c_to_df(): url = "http://services.odata.org/V2/Northwind/Northwind.svc/" endpoint = "Employees" c4c_to_df = C4CToDF() - df = c4c_to_df.run(url=url, endpoint=endpoint) + df = c4c_to_df.run(url=url, endpoint=endpoint, params={}) answer = df.head() assert answer.shape[1] == 23 From f0da097c9425bee33f0ad0aaf6e5603f7b456cc4 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 20 Apr 2022 14:38:28 +0200 Subject: [PATCH 005/119] =?UTF-8?q?=F0=9F=93=9D=20Updated=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 29d26e3ec..01d3aff88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,14 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added `ADLSContainerToContainer` test - Added `Salesforce` source - Added `SalesforceUpsert` task - Added C4C secret handling to `CloudForCustomersReportToADLS` flow (`c4c_credentials_secret` parameter) ### Fixed +- `C4CToDF`, `TEST_TABLE` in AzureSQLTransform tests - Fixed `get_flow_last_run_date()` incorrectly parsing the date - Fixed C4C secret handling (tasks now correctly read the secret as the credentials, rather than assuming the secret is a container for credentials for all environments and trying to access specific key inside it). 
In other words, tasks now assume the secret holds credentials, rather than a dict of the form `{env: credentials, env2: credentials2}` +### Changed +- Changed `AzureDataLake` tests ## [0.4.2] - 2022-04-08 ### Added From b459f5ea350f47a29c9e19a4761462556be8df2d Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 20 Apr 2022 14:41:09 +0200 Subject: [PATCH 006/119] =?UTF-8?q?=F0=9F=93=9D=20Updated=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01d3aff88..d1bfc7799 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added C4C secret handling to `CloudForCustomersReportToADLS` flow (`c4c_credentials_secret` parameter) ### Fixed -- `C4CToDF`, `TEST_TABLE` in AzureSQLTransform tests +- Fixed `C4CToDF`, `TEST_TABLE` in AzureSQLTransform tests - Fixed `get_flow_last_run_date()` incorrectly parsing the date - Fixed C4C secret handling (tasks now correctly read the secret as the credentials, rather than assuming the secret is a container for credentials for all environments and trying to access specific key inside it). In other words, tasks now assume the secret holds credentials, rather than a dict of the form `{env: credentials, env2: credentials2}` From 6bab6eacdb084082287d3d82734ce2431557ac65 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 25 Apr 2022 16:21:26 +0200 Subject: [PATCH 007/119] =?UTF-8?q?=E2=9C=A8=20Added=20SAPRFCToADLS=20flow?= =?UTF-8?q?=20and=20edited=20SAPRFCToDF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/__init__.py | 1 + viadot/flows/sap_rfc_to_adls.py | 74 +++++++++++++++++++++++++++++++++ viadot/sources/sap_rfc.py | 9 ++-- viadot/tasks/sap_rfc.py | 12 ++++-- 4 files changed, 89 insertions(+), 7 deletions(-) create mode 100644 viadot/flows/sap_rfc_to_adls.py diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index f143e4d08..68ab0420b 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -19,3 +19,4 @@ from .duckdb_transform import DuckDBTransform from .duckdb_to_sql_server import DuckDBToSQLServer from .multiple_flows import MultipleFlows +from .sap_rfc_to_adls import SAPRFCToADLS diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py new file mode 100644 index 000000000..cd423822f --- /dev/null +++ b/viadot/flows/sap_rfc_to_adls.py @@ -0,0 +1,74 @@ +import csv +from typing import Any, Dict, List, Literal +from prefect import Flow + +from viadot.tasks import SAPRFCToDF +from viadot.tasks import AzureDataLakeUpload +from viadot.task_utils import df_to_csv + +download_sap_task = SAPRFCToDF() +file_to_adls_task = AzureDataLakeUpload() + + +class SAPRFCToADLS(Flow): + def __init__( + self, + name: str, + query: str = None, + sep: str = None, + func: str = "BBP_RFC_READ_TABLE", + sap_credentials: dict = None, + local_file_path: str = None, + file_sep: str = "\t", + if_exists: Literal["append", "replace", "skip"] = "replace", + adls_path: str = None, + overwrite: bool = False, + sp_credentials_secret: str = None, + vault_name: str = None, + gen: int = 2, + *args: List[any], + **kwargs: Dict[str, Any], + ): + """ """ + self.query = query + self.sep = sep + self.func = func + self.sap_credentials = sap_credentials + self.local_file_path = local_file_path + self.file_sep = file_sep + 
self.if_exists = if_exists + self.adls_path = adls_path + self.overwrite = overwrite + self.sp_credentials_secret = sp_credentials_secret + self.vault_name = vault_name + self.gen = gen + + super().__init__(*args, name=name, **kwargs) + + self.gen_flow() + + def gen_flow(self) -> Flow: + df = download_sap_task.bind( + query=self.query, + sep=self.sep, + func=self.func, + credentials=self.sap_credentials, + flow=self, + ) + csv = df_to_csv.bind( + df=df, + sep=self.file_sep, + path=self.local_file_path, + if_exists=self.if_exists, + flow=self, + ) + adls_upload = file_to_adls_task.bind( + from_path=self.local_file_path, + to_path=self.adls_path, + overwrite=self.overwrite, + sp_credentials_secret=self.sp_credentials_secret, + gen=self.gen, + flow=self, + ) + csv.set_upstream(df, flow=self) + adls_upload.set_upstream(csv, flow=self) diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index 94243d5e8..3c2104c18 100644 --- a/viadot/sources/sap_rfc.py +++ b/viadot/sources/sap_rfc.py @@ -93,12 +93,13 @@ class SAPRFC(Source): - etc. """ - def __init__(self, sep: str = None, *args, **kwargs): + def __init__(self, sep: str = None, func: str = "RFC_READ_TABLE", *args, **kwargs): """Create an instance of the SAPRFC class. Args: sep (str, optional): Which separator to use when querying SAP. If not provided, multiple options are automatically tried. + func (str, optional): SAP RFC function to use. Defaults to "RFC_READ_TABLE". Raises: CredentialError: If provided credentials are incorrect. @@ -114,6 +115,7 @@ def __init__(self, sep: str = None, *args, **kwargs): self.sep = sep self.client_side_filters = None + self.func = func @property def con(self) -> pyrfc.Connection: @@ -400,7 +402,8 @@ def to_df(self): params = self._query columns = self.select_columns_aliased sep = self._query.get("DELIMITER") - + func = self.func + print(func) if sep is None: # automatically find a working separator SEPARATORS = [ @@ -424,7 +427,7 @@ def to_df(self): for sep in SEPARATORS: self._query["DELIMITER"] = sep try: - response = self.call("RFC_READ_TABLE", **params) + response = self.call(func, **params) record_key = "WA" data_raw = response["DATA"] records = [row[record_key].split(sep) for row in data_raw] diff --git a/viadot/tasks/sap_rfc.py b/viadot/tasks/sap_rfc.py index 538c943d7..1e3c323e4 100644 --- a/viadot/tasks/sap_rfc.py +++ b/viadot/tasks/sap_rfc.py @@ -15,6 +15,7 @@ def __init__( self, query: str = None, sep: str = None, + func: str = "RFC_READ_TABLE", credentials: dict = None, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), @@ -39,12 +40,14 @@ def __init__( query (str, optional): The query to be executed with pyRFC. sep (str, optional): The separator to use when reading query results. If not provided, multiple options are automatically tried. Defaults to None. + func (str, optional): SAP RFC function to use. Defaults to "RFC_READ_TABLE". credentials (dict, optional): The credentials to use to authenticate with SAP. By default, they're taken from the local viadot config. 
""" self.query = query self.sep = sep self.credentials = credentials + self.func = func super().__init__( name="sap_rfc_to_df", @@ -57,6 +60,7 @@ def __init__( @defaults_from_attrs( "query", "sep", + "func", "credentials", "max_retries", "retry_delay", @@ -66,6 +70,7 @@ def run( query: str = None, sep: str = None, credentials: dict = None, + func: str = "RFC_READ_TABLE", max_retries: int = None, retry_delay: timedelta = None, ) -> pd.DataFrame: @@ -75,14 +80,13 @@ def run( query (str, optional): The query to be executed with pyRFC. sep (str, optional): The separator to use when reading query results. If not provided, multiple options are automatically tried. Defaults to None. + func (str, optional): SAP RFC function to use. Defaults to "RFC_READ_TABLE". """ - + self.func = func if query is None: raise ValueError("Please provide the query.") - - sap = SAPRFC(sep=sep, credentials=credentials) + sap = SAPRFC(sep=sep, credentials=credentials, func=self.func) sap.query(query) - self.logger.info(f"Downloading data from SAP to a DataFrame...") self.logger.debug(f"Running query: \n{query}.") From d222c53dd961e828319349f59b8cb54bcc938a66 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 26 Apr 2022 13:23:29 +0200 Subject: [PATCH 008/119] =?UTF-8?q?=E2=9C=A8=20Added=20=20option=20to=20pa?= =?UTF-8?q?ss=20more=20queries,=20concat=20dfs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sap_rfc_to_adls.py | 32 ++++++++++++++++++++++---------- viadot/sources/sap_rfc.py | 1 - 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py index cd423822f..87d31ed45 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -1,6 +1,7 @@ import csv +import pandas as pd from typing import Any, Dict, List, Literal -from prefect import Flow +from prefect import Flow, task, unmapped from viadot.tasks import SAPRFCToDF from viadot.tasks import AzureDataLakeUpload @@ -10,11 +11,19 @@ file_to_adls_task = AzureDataLakeUpload() +@task +def concat_dfs(dfs: List[pd.DataFrame]): + output_df = pd.DataFrame() + for i in range(len(dfs) - 1): + output_df = pd.concat([output_df, dfs[i]], axis=1) + return output_df + + class SAPRFCToADLS(Flow): def __init__( self, name: str, - query: str = None, + query_list: List[str] = None, sep: str = None, func: str = "BBP_RFC_READ_TABLE", sap_credentials: dict = None, @@ -30,7 +39,7 @@ def __init__( **kwargs: Dict[str, Any], ): """ """ - self.query = query + self.query_list = query_list self.sep = sep self.func = func self.sap_credentials = sap_credentials @@ -48,15 +57,17 @@ def __init__( self.gen_flow() def gen_flow(self) -> Flow: - df = download_sap_task.bind( - query=self.query, - sep=self.sep, - func=self.func, - credentials=self.sap_credentials, + + df = download_sap_task.map( + query=self.query_list, + sep=unmapped(self.sep), + func=unmapped(self.func), + credentials=unmapped(self.sap_credentials), flow=self, ) + df_full = concat_dfs.bind(df, flow=self) csv = df_to_csv.bind( - df=df, + df=df_full, sep=self.file_sep, path=self.local_file_path, if_exists=self.if_exists, @@ -70,5 +81,6 @@ def gen_flow(self) -> Flow: gen=self.gen, flow=self, ) - csv.set_upstream(df, flow=self) + df_full.set_upstream(df, flow=self) + csv.set_upstream(df_full, flow=self) adls_upload.set_upstream(csv, flow=self) diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index 3c2104c18..a47521cb6 100644 --- a/viadot/sources/sap_rfc.py 
+++ b/viadot/sources/sap_rfc.py @@ -403,7 +403,6 @@ def to_df(self): columns = self.select_columns_aliased sep = self._query.get("DELIMITER") func = self.func - print(func) if sep is None: # automatically find a working separator SEPARATORS = [ From 99eadacaa72c558ce72f2a13b9c292b93d1fd98a Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 26 Apr 2022 14:07:00 +0200 Subject: [PATCH 009/119] =?UTF-8?q?=F0=9F=93=9D=20Added=20docstrings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sap_rfc_to_adls.py | 49 +++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py index 87d31ed45..fdb6bfa30 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -13,17 +13,25 @@ @task def concat_dfs(dfs: List[pd.DataFrame]): - output_df = pd.DataFrame() + """ + Task to combine list of data frames into one + + Args: + dfs (List[pd.DataFrame]): List of dataframes to concat. + Returns: + full_df (pd.DataFrame()): Pandas dataframe containing all columns from dataframes from list. + """ + full_df = pd.DataFrame() for i in range(len(dfs) - 1): - output_df = pd.concat([output_df, dfs[i]], axis=1) - return output_df + full_df = pd.concat([full_df, dfs[i]], axis=1) + return full_df class SAPRFCToADLS(Flow): def __init__( self, name: str, - query_list: List[str] = None, + query_list: List[str], sep: str = None, func: str = "BBP_RFC_READ_TABLE", sap_credentials: dict = None, @@ -34,11 +42,38 @@ def __init__( overwrite: bool = False, sp_credentials_secret: str = None, vault_name: str = None, - gen: int = 2, *args: List[any], **kwargs: Dict[str, Any], ): - """ """ + """ + Flow for downloading data from SAP DataBase using the RFC protocol. + + Note that only a very limited subset of SQL is supported: + - aliases + - where clauses combined using the AND operator + - limit & offset + + Unsupported: + - aggregations + - joins + - subqueries + - etc. + + Args: + name (str): The name of the flow. + query_list(List[str]) The list of queries to be executed with pyRFC. + sep(str, optional): Which separator to use when querying SAP. If not provided, multiple options are automatically tried. + func (str, optional): SAP RFC function to use. Defaults to "BBP_RFC_READ_TABLE". + credentials (dict, optional): The credentials to use to authenticate with SAP. By default, they're taken from the local viadot config. + local_file_path (str, optional): Local destination path. Defaults to None. + file_sep(str, optional): The separator to use in the CSV. Defaults to "\t". + if_exists (Literal["append", "replace", "skip"], optional): What to do if the table exists. Defaults to "replace". + adls_path(str, optional): Azure Data Lake destination file path. Defaults to None. + overwrite(bool, optional) Whether to overwrite the file in ADLS. Defaults to False. + sp_credentials_secret(str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal + credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake.Defaults to None. + vault_name(str, optional): The name of the vault from which to obtain the secrets. Defaults to None. 
+ """ self.query_list = query_list self.sep = sep self.func = func @@ -50,7 +85,6 @@ def __init__( self.overwrite = overwrite self.sp_credentials_secret = sp_credentials_secret self.vault_name = vault_name - self.gen = gen super().__init__(*args, name=name, **kwargs) @@ -78,7 +112,6 @@ def gen_flow(self) -> Flow: to_path=self.adls_path, overwrite=self.overwrite, sp_credentials_secret=self.sp_credentials_secret, - gen=self.gen, flow=self, ) df_full.set_upstream(df, flow=self) From 6e1036fce38f37cdeb77d96e164489ed918df2f3 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 26 Apr 2022 14:09:27 +0200 Subject: [PATCH 010/119] =?UTF-8?q?=F0=9F=94=A5=20Removed=20unused=20impor?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sap_rfc_to_adls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py index fdb6bfa30..0428d75be 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -1,4 +1,3 @@ -import csv import pandas as pd from typing import Any, Dict, List, Literal from prefect import Flow, task, unmapped From 038b3fd6d4d840cc2c6a3629c4ac55884dbaee98 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 26 Apr 2022 14:17:31 +0200 Subject: [PATCH 011/119] =?UTF-8?q?=F0=9F=93=9D=20Updated=20changelog=20an?= =?UTF-8?q?d=20docstrings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 ++ viadot/flows/sap_rfc_to_adls.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00a9a1089..3ccd57ce0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added `func` parameter to `SAPRFC` +- Added `SAPRFCToADLS` flow which downloads data from SAP Database to df, exports df to csv and uploads it to Azure Storage Explorer. - Added `Salesforce` source - Added `SalesforceUpsert` task - Added `SalesforceBulkUpsert` task diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py index 0428d75be..423d969f9 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -45,7 +45,7 @@ def __init__( **kwargs: Dict[str, Any], ): """ - Flow for downloading data from SAP DataBase using the RFC protocol. + Flow for downloading data from SAP DataBase using the RFC protocol and uploading it to Azure Storage Explorer. Note that only a very limited subset of SQL is supported: - aliases From e37a4486f2cb415c49940f69bd6f0e2d8ca4a873 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 27 Apr 2022 09:56:16 +0200 Subject: [PATCH 012/119] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20consat=5Fdfs=20t?= =?UTF-8?q?ask?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sap_rfc_to_adls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py index 423d969f9..01f5039d9 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -21,8 +21,8 @@ def concat_dfs(dfs: List[pd.DataFrame]): full_df (pd.DataFrame()): Pandas dataframe containing all columns from dataframes from list. 
""" full_df = pd.DataFrame() - for i in range(len(dfs) - 1): - full_df = pd.concat([full_df, dfs[i]], axis=1) + for df in dfs: + full_df = pd.concat([full_df, df], axis=1) return full_df From 9628c27bc05b3a9a6bd3087cbe4258e0a88331cf Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 27 Apr 2022 09:56:43 +0200 Subject: [PATCH 013/119] =?UTF-8?q?=E2=9C=85=20Added=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../integration/flows/test_sap_rfc_to_adls.py | 25 +++++++++++++++++++ tests/integration/tasks/test_sap_rfc_to_df.py | 13 ++++++++++ 2 files changed, 38 insertions(+) create mode 100644 tests/integration/flows/test_sap_rfc_to_adls.py create mode 100644 tests/integration/tasks/test_sap_rfc_to_df.py diff --git a/tests/integration/flows/test_sap_rfc_to_adls.py b/tests/integration/flows/test_sap_rfc_to_adls.py new file mode 100644 index 000000000..6eb083e97 --- /dev/null +++ b/tests/integration/flows/test_sap_rfc_to_adls.py @@ -0,0 +1,25 @@ +from viadot.flows import SAPRFCToADLS +from viadot.config import local_config +from viadot.sources import AzureDataLake + +ADLS_PATH = "raw/supermetrics/mp/test_file_sap.csv" + + +def test_sap_rfc_to_adls(): + sap_test_creds = local_config.get("SAP").get("QA") + flow = SAPRFCToADLS( + name="test flow", + query_list=[ + "SELECT MATNR, MATKL FROM MARA WHERE LAEDA LIKE '2022%'", + "SELECT MTART, LAEDA FROM MARA WHERE LAEDA LIKE '2022%'", + ], + func="BBP_RFC_READ_TABLE", + sap_credentials=sap_test_creds, + local_file_path="test_file.csv", + adls_path=ADLS_PATH, + overwrite=True, + ) + result = flow.run() + assert result.is_successful() + file = AzureDataLake(ADLS_PATH) + assert file.exists() diff --git a/tests/integration/tasks/test_sap_rfc_to_df.py b/tests/integration/tasks/test_sap_rfc_to_df.py new file mode 100644 index 000000000..45eea5343 --- /dev/null +++ b/tests/integration/tasks/test_sap_rfc_to_df.py @@ -0,0 +1,13 @@ +from viadot.tasks import SAPRFCToDF +from viadot.config import local_config + + +def test_sap_rfc_to_df_bbp(): + sap_test_creds = local_config.get("SAP").get("QA") + task = SAPRFCToDF( + credentials=sap_test_creds, + query="SELECT MATNR, MATKL, MTART, LAEDA FROM MARA WHERE LAEDA LIKE '2022%'", + func="BBP_RFC_READ_TABLE", + ) + df = task.run() + assert len(df.columns) == 4 and df.empty == False From 32250e1730d5a63cabd13ce071fc00959388dbc5 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 27 Apr 2022 10:08:01 +0200 Subject: [PATCH 014/119] =?UTF-8?q?=F0=9F=94=A5=20Removed=20unused=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/sap_rfc.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/viadot/tasks/sap_rfc.py b/viadot/tasks/sap_rfc.py index 1e3c323e4..072b833ea 100644 --- a/viadot/tasks/sap_rfc.py +++ b/viadot/tasks/sap_rfc.py @@ -82,10 +82,9 @@ def run( multiple options are automatically tried. Defaults to None. func (str, optional): SAP RFC function to use. Defaults to "RFC_READ_TABLE". 
""" - self.func = func if query is None: raise ValueError("Please provide the query.") - sap = SAPRFC(sep=sep, credentials=credentials, func=self.func) + sap = SAPRFC(sep=sep, credentials=credentials, func=func) sap.query(query) self.logger.info(f"Downloading data from SAP to a DataFrame...") self.logger.debug(f"Running query: \n{query}.") From 2226c2106caac63ec637c84c299695ac1df3059e Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 27 Apr 2022 10:37:09 +0200 Subject: [PATCH 015/119] =?UTF-8?q?=F0=9F=8E=A8=20Removed=20space?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_adls_container_to_container.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/flows/test_adls_container_to_container.py b/tests/integration/flows/test_adls_container_to_container.py index 511faad69..91aeb14be 100644 --- a/tests/integration/flows/test_adls_container_to_container.py +++ b/tests/integration/flows/test_adls_container_to_container.py @@ -7,7 +7,7 @@ def test_adls_container_to_container(): flow = ADLSContainerToContainer( - name="test to container", + name="test to container", from_path=TEST_FILE_BLOB_PATH, to_path=TEST_FILE_BLOB_PATH2, ) From dea175760b4788925ea8fb8c4afbc450717e156d Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 27 Apr 2022 13:25:54 +0200 Subject: [PATCH 016/119] =?UTF-8?q?=E2=9C=85=20Added=20some=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_adls_gen1_to_azure_sql.py | 34 ++++++ .../flows/test_adls_gen1_to_azure_sql_new.py | 111 ++++++++++++++++++ .../flows/test_adls_gen1_to_gen2.py | 25 ++++ viadot/flows/adls_gen1_to_azure_sql_new.py | 1 - 4 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 tests/integration/flows/test_adls_gen1_to_azure_sql.py create mode 100644 tests/integration/flows/test_adls_gen1_to_azure_sql_new.py create mode 100644 tests/integration/flows/test_adls_gen1_to_gen2.py diff --git a/tests/integration/flows/test_adls_gen1_to_azure_sql.py b/tests/integration/flows/test_adls_gen1_to_azure_sql.py new file mode 100644 index 000000000..48b997690 --- /dev/null +++ b/tests/integration/flows/test_adls_gen1_to_azure_sql.py @@ -0,0 +1,34 @@ +from viadot.flows import ADLSGen1ToAzureSQL +from unittest import mock + + +def test_adls_gen1_to_azure_sql_new_init( + TEST_CSV_FILE_BLOB_PATH, TEST_PARQUET_FILE_PATH +): + instance = ADLSGen1ToAzureSQL( + name="test_adls_gen1_azure_sql_flow", + path=TEST_PARQUET_FILE_PATH, + blob_path=TEST_CSV_FILE_BLOB_PATH, + schema="sandbox", + table="test_bcp", + dtypes={"country": "VARCHAR(25)", "sales": "INT"}, + if_exists="replace", + ) + assert instance + + +def test_adls_gen1_to_azure_sql_new_mock( + TEST_CSV_FILE_BLOB_PATH, TEST_PARQUET_FILE_PATH +): + with mock.patch.object(ADLSGen1ToAzureSQL, "run", return_value=True) as mock_method: + instance = ADLSGen1ToAzureSQL( + name="test_adls_gen1_azure_sql_flow", + path=TEST_PARQUET_FILE_PATH, + blob_path=TEST_CSV_FILE_BLOB_PATH, + schema="sandbox", + table="test_bcp", + dtypes={"country": "VARCHAR(25)", "sales": "INT"}, + if_exists="replace", + ) + instance.run() + mock_method.assert_called_with() diff --git a/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py b/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py new file mode 100644 index 000000000..93ad69c96 --- /dev/null +++ b/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py @@ -0,0 +1,111 @@ +from viadot.flows 
import ADLSGen1ToAzureSQLNew +import pandas as pd +import pytest +from unittest.mock import MagicMock +from viadot.flows.adls_to_azure_sql import df_to_csv_task +from viadot.task_utils import METADATA_COLUMNS, add_ingestion_metadata_task +from viadot.flows.adls_gen1_to_azure_sql_new import df_replace_special_chars +from viadot.tasks import AzureDataLakeUpload, AzureSQLCreateTable, BCPTask +from unittest import mock + +gen2_upload_task = AzureDataLakeUpload(gen=2) +create_table_task = AzureSQLCreateTable() +bulk_insert_task = BCPTask() + +d = {"col1": [1, 2], "col2": [3, 4]} +df = pd.DataFrame(data=d) +SCHEMA = "sandbox" +TABLE = "test_bcp" + + +@pytest.fixture() +def test_adls_gen1_to_azure_sql_new_init_args(): + + flow = ADLSGen1ToAzureSQLNew( + name="test_adls_gen1_gen2_flow", + gen1_path="test_file_1.csv", + gen2_path="test_file_2.csv", + schema=SCHEMA, + table=TABLE, + dtypes={"country": "INT", "sales": "INT"}, + if_exists="replace", + ) + + assert flow + + +@pytest.fixture() +def test_adls_gen1_to_azure_sql_new_run(): + class TestMocker(ADLSGen1ToAzureSQLNew): + def gen_flow(self): + d = {"country": [1, 2], "sales": [3, 4]} + df = pd.DataFrame(data=d) + + df2 = df_replace_special_chars.bind(df=df, flow=self) + df_with_metadata = add_ingestion_metadata_task.bind(df=df2, flow=self) + df_to_csv_task.bind( + df=df_with_metadata, + path=self.local_file_path, + sep=self.write_sep, + flow=self, + remove_tab=True, + ) + gen2_upload_task.bind( + from_path=self.local_file_path, + to_path=self.gen2_path, + overwrite=self.overwrite, + sp_credentials_secret=self.gen2_sp_credentials_secret, + vault_name=self.vault_name, + flow=self, + ) + create_table_task.bind( + schema=self.schema, + table=self.table, + dtypes=self.dtypes, + if_exists=self.if_exists, + credentials_secret=self.sqldb_credentials_secret, + vault_name=self.vault_name, + flow=self, + ) + bulk_insert_task.bind( + path=self.local_file_path, + schema=self.schema, + table=self.table, + credentials_secret=self.sqldb_credentials_secret, + vault_name=self.vault_name, + flow=self, + ) + + df_with_metadata.set_upstream(df_replace_special_chars, flow=self) + df_to_csv_task.set_upstream(df_with_metadata, flow=self) + gen2_upload_task.set_upstream(df_to_csv_task, flow=self) + create_table_task.set_upstream(df_to_csv_task, flow=self) + bulk_insert_task.set_upstream(create_table_task, flow=self) + + flow = TestMocker( + name="test_adls_gen1_gen2_flow", + gen1_path="test_file_1.csv", + gen2_path="raw/supermetrics/test_file_2.csv", + schema=SCHEMA, + table=TABLE, + dtypes={"country": "VARCHAR(25)", "sales": "INT"}, + if_exists="replace", + ) + assert flow.run() + + +def test_adls_gen1_to_azure_sql_new_mock(): + with mock.patch.object( + ADLSGen1ToAzureSQLNew, "run", return_value=True + ) as mock_method: + instance = ADLSGen1ToAzureSQLNew( + name="test_adls_gen1_gen2_flow", + gen1_path="folder1/example_file.csv", + gen2_path="folder2/example_file.csv", + schema="sandbox", + table="test_bcp", + dtypes={"country": "VARCHAR(25)", "sales": "INT"}, + if_exists="replace", + ) + instance.run() + mock_method.assert_called_with() diff --git a/tests/integration/flows/test_adls_gen1_to_gen2.py b/tests/integration/flows/test_adls_gen1_to_gen2.py new file mode 100644 index 000000000..f91f56acc --- /dev/null +++ b/tests/integration/flows/test_adls_gen1_to_gen2.py @@ -0,0 +1,25 @@ +from viadot.flows import ADLSGen1ToGen2 +from unittest import mock + + +def test_adls_gen1_gen2_init(TEST_PARQUET_FILE_PATH_2): + + flow = ADLSGen1ToGen2( + "test_adls_gen1_gen2_init", + 
gen1_path=TEST_PARQUET_FILE_PATH_2, + gen2_path=TEST_PARQUET_FILE_PATH_2, + ) + assert flow + + +def test_adls_gen1_to_azure_sql_new_mock( + TEST_PARQUET_FILE_PATH, TEST_PARQUET_FILE_PATH_2 +): + with mock.patch.object(ADLSGen1ToGen2, "run", return_value=True) as mock_method: + instance = ADLSGen1ToGen2( + "test_adls_gen1_gen2_init", + gen1_path=TEST_PARQUET_FILE_PATH, + gen2_path=TEST_PARQUET_FILE_PATH_2, + ) + instance.run() + mock_method.assert_called_with() diff --git a/viadot/flows/adls_gen1_to_azure_sql_new.py b/viadot/flows/adls_gen1_to_azure_sql_new.py index 8ae014b3c..05f977280 100644 --- a/viadot/flows/adls_gen1_to_azure_sql_new.py +++ b/viadot/flows/adls_gen1_to_azure_sql_new.py @@ -149,4 +149,3 @@ def gen_flow(self) -> Flow: gen2_upload_task.set_upstream(df_to_csv_task, flow=self) create_table_task.set_upstream(df_to_csv_task, flow=self) bulk_insert_task.set_upstream(create_table_task, flow=self) - From 4db91dae4e3ceee0847e20e771453afdc9267c4a Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 27 Apr 2022 16:24:51 +0200 Subject: [PATCH 017/119] =?UTF-8?q?=E2=9C=85=20=20Added=20duckdb=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_duckdb_to_sql_server.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 tests/integration/flows/test_duckdb_to_sql_server.py diff --git a/tests/integration/flows/test_duckdb_to_sql_server.py b/tests/integration/flows/test_duckdb_to_sql_server.py new file mode 100644 index 000000000..c6b59d36d --- /dev/null +++ b/tests/integration/flows/test_duckdb_to_sql_server.py @@ -0,0 +1,73 @@ +import os +import json +import pytest +from viadot.flows import DuckDBToSQLServer +from unittest import mock +from viadot.sources import DuckDB +from viadot.tasks.azure_key_vault import AzureKeyVaultSecret +from prefect.tasks.secrets import PrefectSecret + +TABLE = "test_table" +SCHEMA = "test_schema" +TABLE_MULTIPLE_PARQUETS = "test_multiple_parquets" +DATABASE_PATH = "test_db_123.duckdb" + + +@pytest.fixture(scope="session") +def duckdb(): + duckdb = DuckDB(credentials=dict(database=DATABASE_PATH)) + yield duckdb + os.remove(DATABASE_PATH) + + +def test__check_if_schema_exists(duckdb): + duckdb.run(f"CREATE SCHEMA {SCHEMA}") + assert not duckdb._check_if_schema_exists(SCHEMA) + + +def test_create_table_from_parquet(duckdb, TEST_PARQUET_FILE_PATH): + duckdb.create_table_from_parquet( + schema=SCHEMA, table=TABLE, path=TEST_PARQUET_FILE_PATH + ) + + +def test_duckdb_sql_server_init(): + + flow = DuckDBToSQLServer("test_duckdb_init") + assert flow + + +def test_duckdb_sql_server_flow(): + + credentials_secret = PrefectSecret( + "AZURE_DEFAULT_SQLDB_SERVICE_PRINCIPAL_SECRET" + ).run() + vault_name = PrefectSecret("AZURE_DEFAULT_KEYVAULT").run() + azure_secret_task = AzureKeyVaultSecret() + credentials_str = azure_secret_task.run( + secret=credentials_secret, vault_name=vault_name + ) + + flow = DuckDBToSQLServer( + "test_duckdb_flow_run", + duckdb_credentials=dict(database=DATABASE_PATH), + sql_server_credentials=json.loads(credentials_str), + sql_server_schema="sandbox", + sql_server_table=TABLE, + duckdb_schema=SCHEMA, + duckdb_table=TABLE, + ) + r = flow.run() + assert r.is_successful() + + +def test_duckdb_sql_server_flow_mocked(): + with mock.patch.object(DuckDBToSQLServer, "run", return_value=True) as mock_method: + flow = DuckDBToSQLServer( + "test_duckdb_flow_run", + sql_server_table=TABLE, + duckdb_schema=SCHEMA, + duckdb_table=TABLE, + ) + flow.run() + 
mock_method.assert_called_with() From e2f79edc7fd6cad56adaa9c0fb986980d6b1bb4e Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 27 Apr 2022 16:39:34 +0200 Subject: [PATCH 018/119] =?UTF-8?q?=F0=9F=8E=A8=20Applied=20changes=20afte?= =?UTF-8?q?r=20review?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../integration/flows/test_sap_rfc_to_adls.py | 6 +- tests/integration/tasks/test_sap_rfc_to_df.py | 2 +- viadot/flows/sap_rfc_to_adls.py | 104 ++++++++++-------- viadot/task_utils.py | 16 +++ viadot/tasks/sap_rfc.py | 4 +- 5 files changed, 79 insertions(+), 53 deletions(-) diff --git a/tests/integration/flows/test_sap_rfc_to_adls.py b/tests/integration/flows/test_sap_rfc_to_adls.py index 6eb083e97..10665f470 100644 --- a/tests/integration/flows/test_sap_rfc_to_adls.py +++ b/tests/integration/flows/test_sap_rfc_to_adls.py @@ -2,20 +2,20 @@ from viadot.config import local_config from viadot.sources import AzureDataLake -ADLS_PATH = "raw/supermetrics/mp/test_file_sap.csv" +ADLS_PATH = "raw/supermetrics/mp/test_file_sap.parquet" def test_sap_rfc_to_adls(): sap_test_creds = local_config.get("SAP").get("QA") flow = SAPRFCToADLS( name="test flow", - query_list=[ + queries=[ "SELECT MATNR, MATKL FROM MARA WHERE LAEDA LIKE '2022%'", "SELECT MTART, LAEDA FROM MARA WHERE LAEDA LIKE '2022%'", ], func="BBP_RFC_READ_TABLE", sap_credentials=sap_test_creds, - local_file_path="test_file.csv", + local_file_path="test_file.parquet", adls_path=ADLS_PATH, overwrite=True, ) diff --git a/tests/integration/tasks/test_sap_rfc_to_df.py b/tests/integration/tasks/test_sap_rfc_to_df.py index 45eea5343..d5208625e 100644 --- a/tests/integration/tasks/test_sap_rfc_to_df.py +++ b/tests/integration/tasks/test_sap_rfc_to_df.py @@ -10,4 +10,4 @@ def test_sap_rfc_to_df_bbp(): func="BBP_RFC_READ_TABLE", ) df = task.run() - assert len(df.columns) == 4 and df.empty == False + assert len(df.columns) == 4 and not df.empty diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py index 01f5039d9..b023ecdf9 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -4,48 +4,34 @@ from viadot.tasks import SAPRFCToDF from viadot.tasks import AzureDataLakeUpload -from viadot.task_utils import df_to_csv +from viadot.task_utils import df_to_csv, df_to_parquet, concat_dfs download_sap_task = SAPRFCToDF() file_to_adls_task = AzureDataLakeUpload() -@task -def concat_dfs(dfs: List[pd.DataFrame]): - """ - Task to combine list of data frames into one - - Args: - dfs (List[pd.DataFrame]): List of dataframes to concat. - Returns: - full_df (pd.DataFrame()): Pandas dataframe containing all columns from dataframes from list. - """ - full_df = pd.DataFrame() - for df in dfs: - full_df = pd.concat([full_df, df], axis=1) - return full_df - - class SAPRFCToADLS(Flow): def __init__( self, name: str, - query_list: List[str], - sep: str = None, + query: str = None, + queries: List[str] = None, + rfc_sep: str = None, func: str = "BBP_RFC_READ_TABLE", sap_credentials: dict = None, + output_file_extension: str = ".parquet", local_file_path: str = None, file_sep: str = "\t", if_exists: Literal["append", "replace", "skip"] = "replace", adls_path: str = None, overwrite: bool = False, - sp_credentials_secret: str = None, + adls_sp_credentials_secret: str = None, vault_name: str = None, *args: List[any], **kwargs: Dict[str, Any], ): """ - Flow for downloading data from SAP DataBase using the RFC protocol and uploading it to Azure Storage Explorer. 
+ Flow for downloading data from SAP database using the RFC protocol and uploading it to Azure Data Lake. Note that only a very limited subset of SQL is supported: - aliases @@ -60,29 +46,33 @@ def __init__( Args: name (str): The name of the flow. - query_list(List[str]) The list of queries to be executed with pyRFC. - sep(str, optional): Which separator to use when querying SAP. If not provided, multiple options are automatically tried. + query (str): Query to be executed with pyRFC. If multiple queries needed use `queries` parmeter. Defaults to None. + queries(List[str]) The list of queries to be executed with pyRFC. Defaults to None. + rfc_sep(str, optional): Which separator to use when querying SAP. If not provided, multiple options are automatically tried. func (str, optional): SAP RFC function to use. Defaults to "BBP_RFC_READ_TABLE". - credentials (dict, optional): The credentials to use to authenticate with SAP. By default, they're taken from the local viadot config. + sap_credentials (dict, optional): The credentials to use to authenticate with SAP. By default, they're taken from the local viadot config. + local_file_path (str, optional): Local destination path. Defaults to None. file_sep(str, optional): The separator to use in the CSV. Defaults to "\t". if_exists (Literal["append", "replace", "skip"], optional): What to do if the table exists. Defaults to "replace". adls_path(str, optional): Azure Data Lake destination file path. Defaults to None. overwrite(bool, optional) Whether to overwrite the file in ADLS. Defaults to False. - sp_credentials_secret(str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal + adls_sp_credentials_secret(str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake.Defaults to None. vault_name(str, optional): The name of the vault from which to obtain the secrets. Defaults to None. 
""" - self.query_list = query_list - self.sep = sep + self.query = query + self.queries = queries + self.rfc_sep = rfc_sep self.func = func self.sap_credentials = sap_credentials + self.output_file_extension = output_file_extension self.local_file_path = local_file_path self.file_sep = file_sep self.if_exists = if_exists self.adls_path = adls_path self.overwrite = overwrite - self.sp_credentials_secret = sp_credentials_secret + self.adls_sp_credentials_secret = adls_sp_credentials_secret self.vault_name = vault_name super().__init__(*args, name=name, **kwargs) @@ -91,28 +81,48 @@ def __init__( def gen_flow(self) -> Flow: - df = download_sap_task.map( - query=self.query_list, - sep=unmapped(self.sep), - func=unmapped(self.func), - credentials=unmapped(self.sap_credentials), - flow=self, - ) - df_full = concat_dfs.bind(df, flow=self) - csv = df_to_csv.bind( - df=df_full, - sep=self.file_sep, - path=self.local_file_path, - if_exists=self.if_exists, - flow=self, - ) + if self.queries is not None: + df = download_sap_task.map( + query=self.queries, + sep=unmapped(self.rfc_sep), + func=unmapped(self.func), + credentials=unmapped(self.sap_credentials), + flow=self, + ) + df_final = concat_dfs.bind(df, flow=self) + df_final.set_upstream(df, flow=self) + else: + df_final = download_sap_task( + query=self.query, + sep=self.rfc_sep, + func=self.func, + credentials=self.sap_credentials, + flow=self, + ) + + if self.output_file_extension == ".parquet": + df_to_file = df_to_parquet.bind( + df=df_final, + path=self.local_file_path, + if_exists=self.if_exists, + flow=self, + ) + else: + df_to_file = df_to_csv.bind( + df=df_final, + sep=self.file_sep, + path=self.local_file_path, + if_exists=self.if_exists, + flow=self, + ) + adls_upload = file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_path, overwrite=self.overwrite, - sp_credentials_secret=self.sp_credentials_secret, + sp_credentials_secret=self.adls_sp_credentials_secret, flow=self, ) - df_full.set_upstream(df, flow=self) - csv.set_upstream(df_full, flow=self) - adls_upload.set_upstream(csv, flow=self) + + df_to_file.set_upstream(df_final, flow=self) + adls_upload.set_upstream(df_to_file, flow=self) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 846bee73b..e92ce4c82 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -462,6 +462,22 @@ def df_clean_column( return df +@task +def concat_dfs(dfs: List[pd.DataFrame]): + """ + Task to combine list of data frames into one + + Args: + dfs (List[pd.DataFrame]): List of dataframes to concat. + Returns: + full_df (pd.DataFrame()): Pandas dataframe containing all columns from dataframes from list. + """ + full_df = pd.DataFrame() + for df in dfs: + full_df = pd.concat([full_df, df], axis=1) + return full_df + + class Git(Git): @property def git_clone_url(self): diff --git a/viadot/tasks/sap_rfc.py b/viadot/tasks/sap_rfc.py index 072b833ea..b5b2ce281 100644 --- a/viadot/tasks/sap_rfc.py +++ b/viadot/tasks/sap_rfc.py @@ -15,7 +15,7 @@ def __init__( self, query: str = None, sep: str = None, - func: str = "RFC_READ_TABLE", + func: str = None, credentials: dict = None, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), @@ -40,7 +40,7 @@ def __init__( query (str, optional): The query to be executed with pyRFC. sep (str, optional): The separator to use when reading query results. If not provided, multiple options are automatically tried. Defaults to None. - func (str, optional): SAP RFC function to use. Defaults to "RFC_READ_TABLE". 
+ func (str, optional): SAP RFC function to use. Defaults to None. credentials (dict, optional): The credentials to use to authenticate with SAP. By default, they're taken from the local viadot config. """ From 781dc26ff53e119154dcad63a52657b63caa0cdb Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 27 Apr 2022 16:40:58 +0200 Subject: [PATCH 019/119] =?UTF-8?q?=F0=9F=93=9D=20Updated=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ccd57ce0..6b5b8ed28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Added `func` parameter to `SAPRFC` -- Added `SAPRFCToADLS` flow which downloads data from SAP Database to df, exports df to csv and uploads it to Azure Storage Explorer. +- Added `SAPRFCToADLS` flow which downloads data from SAP Database to to a pandas DataFrame, exports df to csv and uploads it to Azure Data Lake. - Added `Salesforce` source - Added `SalesforceUpsert` task - Added `SalesforceBulkUpsert` task From 94eee155431b27dbc2830ba2274ca5579c3a5d2c Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 27 Apr 2022 16:44:55 +0200 Subject: [PATCH 020/119] =?UTF-8?q?=F0=9F=8E=A8=20Changed=20func=20default?= =?UTF-8?q?=20value?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/sap_rfc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/tasks/sap_rfc.py b/viadot/tasks/sap_rfc.py index b5b2ce281..24800f47d 100644 --- a/viadot/tasks/sap_rfc.py +++ b/viadot/tasks/sap_rfc.py @@ -70,7 +70,7 @@ def run( query: str = None, sep: str = None, credentials: dict = None, - func: str = "RFC_READ_TABLE", + func: str = None, max_retries: int = None, retry_delay: timedelta = None, ) -> pd.DataFrame: @@ -80,7 +80,7 @@ def run( query (str, optional): The query to be executed with pyRFC. sep (str, optional): The separator to use when reading query results. If not provided, multiple options are automatically tried. Defaults to None. - func (str, optional): SAP RFC function to use. Defaults to "RFC_READ_TABLE". + func (str, optional): SAP RFC function to use. Defaults to None. """ if query is None: raise ValueError("Please provide the query.") From 15dd67de36e81e1e8b9cea01c12db2cdcdc9932d Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 27 Apr 2022 17:02:05 +0200 Subject: [PATCH 021/119] =?UTF-8?q?=F0=9F=8E=A8=20Changed=20concat=20dfs?= =?UTF-8?q?=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index e92ce4c82..350334510 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -470,12 +470,9 @@ def concat_dfs(dfs: List[pd.DataFrame]): Args: dfs (List[pd.DataFrame]): List of dataframes to concat. Returns: - full_df (pd.DataFrame()): Pandas dataframe containing all columns from dataframes from list. + pd.DataFrame(): Pandas dataframe containing all columns from dataframes from list. 
""" - full_df = pd.DataFrame() - for df in dfs: - full_df = pd.concat([full_df, df], axis=1) - return full_df + return pd.concat(dfs, axis=1) class Git(Git): From 6028d0d3964d0d13f9add98c718396128a0e5138 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 28 Apr 2022 08:21:09 +0200 Subject: [PATCH 022/119] =?UTF-8?q?=E2=9C=85=20Added=20test=20for=20query?= =?UTF-8?q?=20option?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../integration/flows/test_sap_rfc_to_adls.py | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/tests/integration/flows/test_sap_rfc_to_adls.py b/tests/integration/flows/test_sap_rfc_to_adls.py index 10665f470..9aa57eedf 100644 --- a/tests/integration/flows/test_sap_rfc_to_adls.py +++ b/tests/integration/flows/test_sap_rfc_to_adls.py @@ -1,11 +1,13 @@ from viadot.flows import SAPRFCToADLS from viadot.config import local_config from viadot.sources import AzureDataLake +from viadot.tasks import AzureDataLakeRemove ADLS_PATH = "raw/supermetrics/mp/test_file_sap.parquet" +FILE_NAME = "test_file.parquet" -def test_sap_rfc_to_adls(): +def test_sap_rfc_to_adls_queries(): sap_test_creds = local_config.get("SAP").get("QA") flow = SAPRFCToADLS( name="test flow", @@ -15,7 +17,7 @@ def test_sap_rfc_to_adls(): ], func="BBP_RFC_READ_TABLE", sap_credentials=sap_test_creds, - local_file_path="test_file.parquet", + local_file_path=FILE_NAME, adls_path=ADLS_PATH, overwrite=True, ) @@ -23,3 +25,22 @@ def test_sap_rfc_to_adls(): assert result.is_successful() file = AzureDataLake(ADLS_PATH) assert file.exists() + AzureDataLakeRemove(ADLS_PATH) + + +def test_sap_rfc_to_adls_query(): + sap_test_creds = local_config.get("SAP").get("QA") + flow = SAPRFCToADLS( + name="test flow", + query="SELECT MATNR, MATKL FROM MARA WHERE LAEDA LIKE '2022%'", + func="BBP_RFC_READ_TABLE", + sap_credentials=sap_test_creds, + local_file_path=FILE_NAME, + adls_path=ADLS_PATH, + overwrite=True, + ) + result = flow.run() + assert result.is_successful() + file = AzureDataLake(ADLS_PATH) + assert file.exists() + AzureDataLakeRemove(ADLS_PATH) From 28ab885a06600b5baba97991d79f536c0f7b2f79 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 28 Apr 2022 08:31:19 +0200 Subject: [PATCH 023/119] =?UTF-8?q?=F0=9F=8E=A8=20Changed=20func=20default?= =?UTF-8?q?=20valute?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sap_rfc_to_adls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py index b023ecdf9..52dad10fd 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -17,7 +17,7 @@ def __init__( query: str = None, queries: List[str] = None, rfc_sep: str = None, - func: str = "BBP_RFC_READ_TABLE", + func: str = "RFC_READ_TABLE", sap_credentials: dict = None, output_file_extension: str = ".parquet", local_file_path: str = None, @@ -49,9 +49,9 @@ def __init__( query (str): Query to be executed with pyRFC. If multiple queries needed use `queries` parmeter. Defaults to None. queries(List[str]) The list of queries to be executed with pyRFC. Defaults to None. rfc_sep(str, optional): Which separator to use when querying SAP. If not provided, multiple options are automatically tried. - func (str, optional): SAP RFC function to use. Defaults to "BBP_RFC_READ_TABLE". + func (str, optional): SAP RFC function to use. Defaults to "RFC_READ_TABLE". 
sap_credentials (dict, optional): The credentials to use to authenticate with SAP. By default, they're taken from the local viadot config. - + output_file_extension (str, optional): Output file extension - to allow selection of .csv for data which is not easy to handle with parquet. Defaults to ".parquet". local_file_path (str, optional): Local destination path. Defaults to None. file_sep(str, optional): The separator to use in the CSV. Defaults to "\t". if_exists (Literal["append", "replace", "skip"], optional): What to do if the table exists. Defaults to "replace". From b9140dd3b2e3f6fedac1d523c26b4e7a84a367ad Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 28 Apr 2022 09:14:06 +0200 Subject: [PATCH 024/119] =?UTF-8?q?=E2=9C=85=20adls=20g1=20g2=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_adls_gen1_to_azure_sql_new.py | 93 +++++-------------- 1 file changed, 25 insertions(+), 68 deletions(-) diff --git a/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py b/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py index 93ad69c96..20f59157d 100644 --- a/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py +++ b/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py @@ -1,16 +1,8 @@ from viadot.flows import ADLSGen1ToAzureSQLNew import pandas as pd import pytest -from unittest.mock import MagicMock -from viadot.flows.adls_to_azure_sql import df_to_csv_task -from viadot.task_utils import METADATA_COLUMNS, add_ingestion_metadata_task -from viadot.flows.adls_gen1_to_azure_sql_new import df_replace_special_chars -from viadot.tasks import AzureDataLakeUpload, AzureSQLCreateTable, BCPTask from unittest import mock -gen2_upload_task = AzureDataLakeUpload(gen=2) -create_table_task = AzureSQLCreateTable() -bulk_insert_task = BCPTask() d = {"col1": [1, 2], "col2": [3, 4]} df = pd.DataFrame(data=d) @@ -34,66 +26,6 @@ def test_adls_gen1_to_azure_sql_new_init_args(): assert flow -@pytest.fixture() -def test_adls_gen1_to_azure_sql_new_run(): - class TestMocker(ADLSGen1ToAzureSQLNew): - def gen_flow(self): - d = {"country": [1, 2], "sales": [3, 4]} - df = pd.DataFrame(data=d) - - df2 = df_replace_special_chars.bind(df=df, flow=self) - df_with_metadata = add_ingestion_metadata_task.bind(df=df2, flow=self) - df_to_csv_task.bind( - df=df_with_metadata, - path=self.local_file_path, - sep=self.write_sep, - flow=self, - remove_tab=True, - ) - gen2_upload_task.bind( - from_path=self.local_file_path, - to_path=self.gen2_path, - overwrite=self.overwrite, - sp_credentials_secret=self.gen2_sp_credentials_secret, - vault_name=self.vault_name, - flow=self, - ) - create_table_task.bind( - schema=self.schema, - table=self.table, - dtypes=self.dtypes, - if_exists=self.if_exists, - credentials_secret=self.sqldb_credentials_secret, - vault_name=self.vault_name, - flow=self, - ) - bulk_insert_task.bind( - path=self.local_file_path, - schema=self.schema, - table=self.table, - credentials_secret=self.sqldb_credentials_secret, - vault_name=self.vault_name, - flow=self, - ) - - df_with_metadata.set_upstream(df_replace_special_chars, flow=self) - df_to_csv_task.set_upstream(df_with_metadata, flow=self) - gen2_upload_task.set_upstream(df_to_csv_task, flow=self) - create_table_task.set_upstream(df_to_csv_task, flow=self) - bulk_insert_task.set_upstream(create_table_task, flow=self) - - flow = TestMocker( - name="test_adls_gen1_gen2_flow", - gen1_path="test_file_1.csv", - gen2_path="raw/supermetrics/test_file_2.csv", - schema=SCHEMA, - 
table=TABLE, - dtypes={"country": "VARCHAR(25)", "sales": "INT"}, - if_exists="replace", - ) - assert flow.run() - - def test_adls_gen1_to_azure_sql_new_mock(): with mock.patch.object( ADLSGen1ToAzureSQLNew, "run", return_value=True @@ -109,3 +41,28 @@ def test_adls_gen1_to_azure_sql_new_mock(): ) instance.run() mock_method.assert_called_with() + + +def test_adls_gen1_to_azure_sql_new_flow_run_mock(): + + d = {"country": [1, 2], "sales": [3, 4]} + df = pd.DataFrame(data=d) + + with mock.patch( + "viadot.flows.adls_gen1_to_azure_sql_new.gen1_to_df_task.bind" + ) as gen1_to_df_task_mock_bind_method_mock: + gen1_to_df_task_mock_bind_method_mock.return_value = df + + flow = ADLSGen1ToAzureSQLNew( + name="test_adls_g1g2", + gen1_path="example_path", + gen2_path="raw/test/test.csv", + dtypes={"country": "VARCHAR(25)", "sales": "INT"}, + if_exists="replace", + table="test", + schema="sandbox", + ) + + result = flow.run() + + assert result.is_successful() From 1ef55dbb83622dec37b3f75c424fa49caf4c6b24 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 28 Apr 2022 10:08:26 +0200 Subject: [PATCH 025/119] =?UTF-8?q?=E2=9C=85=20added=20duckdb=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_adls_gen1_to_azure_sql_new.py | 7 ++-- .../integration/flows/test_duck_transform.py | 40 +++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 tests/integration/flows/test_duck_transform.py diff --git a/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py b/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py index 20f59157d..52d798798 100644 --- a/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py +++ b/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py @@ -1,7 +1,8 @@ -from viadot.flows import ADLSGen1ToAzureSQLNew -import pandas as pd +import os import pytest +import pandas as pd from unittest import mock +from viadot.flows import ADLSGen1ToAzureSQLNew d = {"col1": [1, 2], "col2": [3, 4]} @@ -10,7 +11,6 @@ TABLE = "test_bcp" -@pytest.fixture() def test_adls_gen1_to_azure_sql_new_init_args(): flow = ADLSGen1ToAzureSQLNew( @@ -66,3 +66,4 @@ def test_adls_gen1_to_azure_sql_new_flow_run_mock(): result = flow.run() assert result.is_successful() + os.remove("test_adls_g1g2.csv") diff --git a/tests/integration/flows/test_duck_transform.py b/tests/integration/flows/test_duck_transform.py new file mode 100644 index 000000000..9913af33b --- /dev/null +++ b/tests/integration/flows/test_duck_transform.py @@ -0,0 +1,40 @@ +from viadot.flows import DuckDBTransform +import pytest +import pandas as pd +from unittest import mock +from viadot.sources import DuckDB +import os + +TABLE = "test_table" +SCHEMA = "test_schema" +TABLE_MULTIPLE_PARQUETS = "test_multiple_parquets" +DATABASE_PATH = "test_db_123.duckdb" + + +@pytest.fixture(scope="session") +def duckdb(): + duckdb = DuckDB(credentials=dict(database=DATABASE_PATH)) + yield duckdb + os.remove(DATABASE_PATH) + + +def test_create_table_from_parquet(duckdb, TEST_PARQUET_FILE_PATH): + duckdb.create_table_from_parquet( + schema=SCHEMA, table=TABLE, path=TEST_PARQUET_FILE_PATH + ) + + +def test_duckdb_transform_init(): + instance = DuckDBTransform("test_duckdb_transform", query="select * from test") + + assert instance + + +def test_duckdb_transform_flow_run(): + instance = DuckDBTransform( + "test_duckdb_transform", + query=f"select * from {SCHEMA}.{TABLE}", + credentials=dict(database=DATABASE_PATH), + ) + result = instance.run() + assert 
result.is_successful() From 56691fa6d88c11c82eec4a011f01705fa30b18af Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 28 Apr 2022 13:26:00 +0200 Subject: [PATCH 026/119] =?UTF-8?q?=E2=9C=85=20Added=20sharepoint=5Fto=5Fa?= =?UTF-8?q?dls=20flow=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_duckdb_transform.py | 40 +++++++++++++++++++ tests/integration/flows/test_flow_of_flows.py | 28 +++++++++++++ .../flows/test_sharepoint_to_adls.py | 28 +++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 tests/integration/flows/test_duckdb_transform.py create mode 100644 tests/integration/flows/test_flow_of_flows.py create mode 100644 tests/integration/flows/test_sharepoint_to_adls.py diff --git a/tests/integration/flows/test_duckdb_transform.py b/tests/integration/flows/test_duckdb_transform.py new file mode 100644 index 000000000..9913af33b --- /dev/null +++ b/tests/integration/flows/test_duckdb_transform.py @@ -0,0 +1,40 @@ +from viadot.flows import DuckDBTransform +import pytest +import pandas as pd +from unittest import mock +from viadot.sources import DuckDB +import os + +TABLE = "test_table" +SCHEMA = "test_schema" +TABLE_MULTIPLE_PARQUETS = "test_multiple_parquets" +DATABASE_PATH = "test_db_123.duckdb" + + +@pytest.fixture(scope="session") +def duckdb(): + duckdb = DuckDB(credentials=dict(database=DATABASE_PATH)) + yield duckdb + os.remove(DATABASE_PATH) + + +def test_create_table_from_parquet(duckdb, TEST_PARQUET_FILE_PATH): + duckdb.create_table_from_parquet( + schema=SCHEMA, table=TABLE, path=TEST_PARQUET_FILE_PATH + ) + + +def test_duckdb_transform_init(): + instance = DuckDBTransform("test_duckdb_transform", query="select * from test") + + assert instance + + +def test_duckdb_transform_flow_run(): + instance = DuckDBTransform( + "test_duckdb_transform", + query=f"select * from {SCHEMA}.{TABLE}", + credentials=dict(database=DATABASE_PATH), + ) + result = instance.run() + assert result.is_successful() diff --git a/tests/integration/flows/test_flow_of_flows.py b/tests/integration/flows/test_flow_of_flows.py new file mode 100644 index 000000000..2639e36b0 --- /dev/null +++ b/tests/integration/flows/test_flow_of_flows.py @@ -0,0 +1,28 @@ +import pytest +from unittest import mock +from viadot.flows import Pipeline +from prefect.engine.state import Success +from prefect.engine.state import State + + +def test_pipeline_init(): + instance = Pipeline( + "test_pipeline_flow", + project_name="example_project", + extract_flows_names=["flow1_extract", "flow2_load"], + transform_flow_name="flow1_extract", + ) + assert instance + + +def test_pipeline_flow_run_mock(): + with mock.patch.object(Pipeline, "run", return_value=Success) as mock_method: + + instance = Pipeline( + "test_pipeline_flow", + project_name="example_project", + extract_flows_names=["flow1_extract", "flow2_load"], + transform_flow_name="flow1_extract", + ) + result = instance.run() + assert result diff --git a/tests/integration/flows/test_sharepoint_to_adls.py b/tests/integration/flows/test_sharepoint_to_adls.py new file mode 100644 index 000000000..0a986f936 --- /dev/null +++ b/tests/integration/flows/test_sharepoint_to_adls.py @@ -0,0 +1,28 @@ +from viadot.flows import SharepointToADLS +from unittest import mock +import pandas as pd +from prefect.tasks.secrets import PrefectSecret + + +def test_sharepoint_to_adls_run_flow(): + + d = {"country": [1, 2], "sales": [3, 4]} + df = pd.DataFrame(data=d) + + credentials_secret = PrefectSecret( + 
"AZURE_DEFAULT_ADLS_SERVICE_PRINCIPAL_SECRET" + ).run() + + with mock.patch( + "viadot.flows.sharepoint_to_adls.excel_to_df_task.bind" + ) as excel_to_df_task_mock: + excel_to_df_task_mock.return_value = df + + flow = SharepointToADLS( + "test_sharepoint_to_adls_run_flow", + output_file_extension=".csv", + adls_sp_credentials_secret=credentials_secret, + adls_dir_path="raw/tests/test.csv", + ) + result = flow.run() + assert result.is_successful() From 83f297502a3beaacb5ecb9cac4a2a3ee309ef16d Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 28 Apr 2022 16:24:20 +0200 Subject: [PATCH 027/119] =?UTF-8?q?=E2=9C=85=20Added=20and=20corrected=20s?= =?UTF-8?q?harepoint=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_sharepoint_to_adls.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/flows/test_sharepoint_to_adls.py b/tests/integration/flows/test_sharepoint_to_adls.py index 0a986f936..31e1047db 100644 --- a/tests/integration/flows/test_sharepoint_to_adls.py +++ b/tests/integration/flows/test_sharepoint_to_adls.py @@ -2,6 +2,7 @@ from unittest import mock import pandas as pd from prefect.tasks.secrets import PrefectSecret +import os def test_sharepoint_to_adls_run_flow(): @@ -26,3 +27,5 @@ def test_sharepoint_to_adls_run_flow(): ) result = flow.run() assert result.is_successful() + os.remove("test_sharepoint_to_adls_run_flow.csv") + os.remove("test_sharepoint_to_adls_run_flow.json") From 06390637afa6e8f7e508aefd73fb5331a4de34a6 Mon Sep 17 00:00:00 2001 From: m-paz Date: Thu, 28 Apr 2022 17:34:14 +0200 Subject: [PATCH 028/119] =?UTF-8?q?=F0=9F=93=9D=20Bumped=20version=20after?= =?UTF-8?q?=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_viadot.py | 2 +- viadot/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_viadot.py b/tests/test_viadot.py index 0503a0ef0..8890de84e 100644 --- a/tests/test_viadot.py +++ b/tests/test_viadot.py @@ -2,4 +2,4 @@ def test_version(): - assert __version__ == "0.4.3" + assert __version__ == "0.4.4" diff --git a/viadot/__init__.py b/viadot/__init__.py index f6b7e267c..cd1ee63b7 100644 --- a/viadot/__init__.py +++ b/viadot/__init__.py @@ -1 +1 @@ -__version__ = "0.4.3" +__version__ = "0.4.4" From 284ac51ee52da8a62e3ea0e775577b08082c030f Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 2 May 2022 10:18:08 +0200 Subject: [PATCH 029/119] =?UTF-8?q?=E2=9C=85=20=20Added=20test=20for=20Sup?= =?UTF-8?q?ermetricsToAzureSQL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_adls_gen1_to_azure_sql_new.py | 2 +- .../flows/test_supermetrics_to_azure_sql.py | 54 +++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 tests/integration/flows/test_supermetrics_to_azure_sql.py diff --git a/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py b/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py index 52d798798..6c171ca28 100644 --- a/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py +++ b/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py @@ -5,7 +5,7 @@ from viadot.flows import ADLSGen1ToAzureSQLNew -d = {"col1": [1, 2], "col2": [3, 4]} +d = {"country": [1, 2], "sales": [3, 4]} df = pd.DataFrame(data=d) SCHEMA = "sandbox" TABLE = "test_bcp" diff --git a/tests/integration/flows/test_supermetrics_to_azure_sql.py 
b/tests/integration/flows/test_supermetrics_to_azure_sql.py new file mode 100644 index 000000000..ffc9d834e --- /dev/null +++ b/tests/integration/flows/test_supermetrics_to_azure_sql.py @@ -0,0 +1,54 @@ +import os +import pandas as pd +from unittest import mock +from viadot.flows import SupermetricsToAzureSQL +from io import StringIO +from csv import reader + + +def test_supermetrics_to_azure_sql_init(): + + tasks_set = """{, + , + , + }""" + + instance = SupermetricsToAzureSQL( + "test_name", + ds_id="example_id", + ds_accounts="example_accounts", + ds_user="example_user", + fields=["filed", "field2"], + ) + + assert instance + assert instance.__dict__["tasks"] == tasks_set + + +def test_supermetrics_to_azure_sql_run_flow(): + + with mock.patch.object( + SupermetricsToAzureSQL, "run", return_value=True + ) as mock_method: + flow = SupermetricsToAzureSQL( + "test_name_extract", + ds_id="example_id", + ds_accounts="example_accounts", + ds_user="example_user", + date_range_type="last_year_inc", + max_rows=10, + fields=["Date", "profile", "Campaignname"], + schema="raw", + table="test_name_extract", + local_file_path="test.csv", + blob_path="tests/supermetrics/test.csv", + if_exists="replace", + dtypes={ + "Date": "DATE", + "profile": "VARCHAR(255)", + "Campaignname": "VARCHAR(255)", + }, + ) + + flow.run() + mock_method.assert_called_with() From 21869da1b9f433c0772fcb70292872eaab719c88 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 2 May 2022 11:28:52 +0200 Subject: [PATCH 030/119] =?UTF-8?q?=E2=9C=85=20=20added=20test=20of=20Supe?= =?UTF-8?q?rmetricsToAzureSQL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_supermetrics_to_azure_sql.py | 64 ++++++++++++------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/tests/integration/flows/test_supermetrics_to_azure_sql.py b/tests/integration/flows/test_supermetrics_to_azure_sql.py index ffc9d834e..e20834681 100644 --- a/tests/integration/flows/test_supermetrics_to_azure_sql.py +++ b/tests/integration/flows/test_supermetrics_to_azure_sql.py @@ -2,16 +2,19 @@ import pandas as pd from unittest import mock from viadot.flows import SupermetricsToAzureSQL +from prefect.storage import Local from io import StringIO from csv import reader +from viadot.config import local_config +CWD = os.getcwd() +adls_dir_path = "raw/supermetrics" +STORAGE = Local(path=CWD) +SCHEMA = "sandbox" +TABLE = "test_supermetrics" -def test_supermetrics_to_azure_sql_init(): - tasks_set = """{, - , - , - }""" +def test_supermetrics_to_azure_sql_init(): instance = SupermetricsToAzureSQL( "test_name", @@ -22,32 +25,49 @@ def test_supermetrics_to_azure_sql_init(): ) assert instance - assert instance.__dict__["tasks"] == tasks_set + assert instance.__dict__["ds_id"] == "example_id" -def test_supermetrics_to_azure_sql_run_flow(): +def test_supermetrics_to_azure_sql_run_flow_mock(): with mock.patch.object( SupermetricsToAzureSQL, "run", return_value=True ) as mock_method: + + credentials = local_config.get("SUPERMETRICS") + flow = SupermetricsToAzureSQL( - "test_name_extract", - ds_id="example_id", - ds_accounts="example_accounts", - ds_user="example_user", - date_range_type="last_year_inc", - max_rows=10, - fields=["Date", "profile", "Campaignname"], - schema="raw", - table="test_name_extract", - local_file_path="test.csv", - blob_path="tests/supermetrics/test.csv", - if_exists="replace", + "test_supermetrics", + ds_id="GA", + ds_segments=[ + "R1fbzFNQQ3q_GYvdpRr42w", + "I8lnFFvdSFKc50lP7mBKNA", + 
"Lg7jR0VWS5OqGPARtGYKrw", + "h8ViuGLfRX-cCL4XKk6yfQ", + "-1", + ], + ds_accounts=["8326007", "58338899"], + date_range_type="last_month", + ds_user=credentials["USER"], + fields=[ + {"id": "Date"}, + {"id": "segment", "split": "column"}, + {"id": "AvgPageLoadTime_calc"}, + ], dtypes={ - "Date": "DATE", - "profile": "VARCHAR(255)", - "Campaignname": "VARCHAR(255)", + "date": "DATE", + "segment": "VARCHAR(255)", + "AvgPageLoadTime_calc": "VARCHAR(255)", }, + settings={"avoid_sampling": "true"}, + order_columns="alphabetic", + max_columns=10, + max_rows=1, + schema=SCHEMA, + table=TABLE, + local_file_path="test_supermetrics.csv", + blob_path="tests/test.csv", + storage=STORAGE, ) flow.run() From 43f46fb0d8e6ba332614b744abb3bb2069b299fd Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 2 May 2022 12:36:06 +0200 Subject: [PATCH 031/119] =?UTF-8?q?=E2=9C=85=20Added=20duckdb=20task=20tes?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_duckdb.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/integration/tasks/test_duckdb.py diff --git a/tests/integration/tasks/test_duckdb.py b/tests/integration/tasks/test_duckdb.py new file mode 100644 index 000000000..e69de29bb From af3d04dbf1c82c5f585c15bb459da9e7bbd781f9 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 2 May 2022 13:18:09 +0200 Subject: [PATCH 032/119] =?UTF-8?q?=E2=9C=85=20Added=20test=20of=20SQLServ?= =?UTF-8?q?erCreateTable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_sql_server.py | 41 ++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 tests/integration/tasks/test_sql_server.py diff --git a/tests/integration/tasks/test_sql_server.py b/tests/integration/tasks/test_sql_server.py new file mode 100644 index 000000000..42d960818 --- /dev/null +++ b/tests/integration/tasks/test_sql_server.py @@ -0,0 +1,41 @@ +import json +import logging +from viadot.tasks import SQLServerCreateTable +from viadot.tasks.azure_key_vault import AzureKeyVaultSecret +from prefect.tasks.secrets import PrefectSecret + +SCHEMA = "sandbox" +TABLE = "test" + + +def test_sql_server_create_table(caplog): + + credentials_secret = PrefectSecret( + "AZURE_DEFAULT_SQLDB_SERVICE_PRINCIPAL_SECRET" + ).run() + vault_name = PrefectSecret("AZURE_DEFAULT_KEYVAULT").run() + azure_secret_task = AzureKeyVaultSecret() + credentials_str = azure_secret_task.run( + secret=credentials_secret, vault_name=vault_name + ) + + dtypes = { + "Date": "DATE", + "profile": "VARCHAR(255)", + "Campaignname": "VARCHAR(255)", + "Impressions": "FLOAT(24)", + "Clicks": "FLOAT(24)", + "Cost_eur": "FLOAT(24)", + "SearchImpressionShare": "VARCHAR(255)", + } + + create_table_task = SQLServerCreateTable() + with caplog.at_level(logging.INFO): + create_table_task.run( + schema=SCHEMA, + table=TABLE, + dtypes=dtypes, + if_exists="replace", + credentials=json.loads(credentials_str), + ) + assert "Successfully created table" in caplog.text From b7562cc36649056f354bc3da9a503b2cb49e8b80 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 2 May 2022 13:58:16 +0200 Subject: [PATCH 033/119] =?UTF-8?q?=E2=9C=85=20=20Added=20tests=20SQLServe?= =?UTF-8?q?rCreateTable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_sql_server.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git 
a/tests/integration/tasks/test_sql_server.py b/tests/integration/tasks/test_sql_server.py index 42d960818..aea2bc0c6 100644 --- a/tests/integration/tasks/test_sql_server.py +++ b/tests/integration/tasks/test_sql_server.py @@ -1,5 +1,7 @@ import json import logging +import inspect +import types from viadot.tasks import SQLServerCreateTable from viadot.tasks.azure_key_vault import AzureKeyVaultSecret from prefect.tasks.secrets import PrefectSecret @@ -8,6 +10,14 @@ TABLE = "test" +def test_sql_server_create_table_init(): + instance = SQLServerCreateTable() + name = instance.__dict__["name"] + assert inspect.isclass(SQLServerCreateTable) + assert isinstance(instance, SQLServerCreateTable) + assert name == "sql_server_create_table" + + def test_sql_server_create_table(caplog): credentials_secret = PrefectSecret( From 480cbb4a0192b15bb6bf56300a70b183fa3b11e2 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 2 May 2022 14:53:43 +0200 Subject: [PATCH 034/119] =?UTF-8?q?=E2=9C=85=20Added=20more=20tests=20to?= =?UTF-8?q?=20duckdb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_duckdb_transform.py | 20 +++++++++++++++++++ viadot/task_utils.py | 3 --- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/integration/flows/test_duckdb_transform.py b/tests/integration/flows/test_duckdb_transform.py index 9913af33b..1a3fd9564 100644 --- a/tests/integration/flows/test_duckdb_transform.py +++ b/tests/integration/flows/test_duckdb_transform.py @@ -1,4 +1,5 @@ from viadot.flows import DuckDBTransform +from viadot.tasks import DuckDBQuery, DuckDBToDF import pytest import pandas as pd from unittest import mock @@ -24,6 +25,25 @@ def test_create_table_from_parquet(duckdb, TEST_PARQUET_FILE_PATH): ) +def test_duckdb_query(): + + db_query = DuckDBQuery(credentials=dict(database=DATABASE_PATH)) + + result = db_query.run(f"select * from {SCHEMA}.{TABLE}") + assert type(result) == list + assert len(result) > 1 + + +def test_duckdb_to_df(): + + instance = DuckDBToDF( + schema=SCHEMA, table=TABLE, credentials=dict(database=DATABASE_PATH) + ) + test_df = instance.run() + assert test_df.shape > (1, 1) + assert type(test_df) == pd.core.frame.DataFrame + + def test_duckdb_transform_init(): instance = DuckDBTransform("test_duckdb_transform", query="select * from test") diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 6c0374526..846bee73b 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -382,10 +382,7 @@ def custom_mail_state_handler( from_email (str): Sender mailbox address. to_emails (str): Receiver mailbox address. 
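For context, a brief sketch of how a curried state handler like the one documented above is typically attached to a Prefect flow. This is illustrative only: the secret name and mailbox addresses are placeholders, and the keyword-only call assumes the handler is wrapped with @curry as in task_utils.

from prefect import Flow
from prefect.engine.state import Failed

from viadot.task_utils import custom_mail_state_handler

# Bind the notification settings first; Prefect later calls the handler with
# (tracked_obj, old_state, new_state) on every state change.
handler = custom_mail_state_handler(
    only_states=[Failed],
    credentials_secret="mail_notifier_api_key",
    from_email="notifier@example.com",
    to_emails="owner@example.com",
)

with Flow("example_flow", state_handlers=[handler]) as flow:
    ...  # add tasks here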
Returns: State: the `new_state` object that was provided -<<<<<<< HEAD -======= ->>>>>>> a8c7a85218aee3536bbc81d517f509bd058126d5 """ if credentials_secret is None: From 44f4da70cb3316cae956cfc1f5d6aaaa71254fc8 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 5 May 2022 08:23:09 +0200 Subject: [PATCH 035/119] =?UTF-8?q?=E2=9C=A8=20Added=20func=20parameter=20?= =?UTF-8?q?to=20sap=5Fto=5Fduckdb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sap_to_duckdb.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/viadot/flows/sap_to_duckdb.py b/viadot/flows/sap_to_duckdb.py index 0c4867a6a..bf332ab56 100644 --- a/viadot/flows/sap_to_duckdb.py +++ b/viadot/flows/sap_to_duckdb.py @@ -17,6 +17,7 @@ def __init__( query: str, table: str, local_file_path: str, + func: str = "RFC_READ_TABLE", name: str = None, sep: str = None, schema: str = None, @@ -34,6 +35,7 @@ def __init__( query (str): The query to be executed on SAP with pyRFC. table (str): Destination table in DuckDB. local_file_path (str): The path to the source Parquet file. + func (str, optional): SAP RFC function to use. Defaults to "RFC_READ_TABLE". name (str, optional): The name of the flow. Defaults to None. sep (str, optional): The separator to use when reading query results. If not provided, multiple options are automatically tried. Defaults to None. @@ -46,6 +48,7 @@ def __init__( # SAPRFCToDF self.query = query + self.func = func self.sep = sep self.sap_credentials = sap_credentials @@ -70,6 +73,7 @@ def gen_flow(self) -> Flow: df = self.sap_to_df_task.bind( query=self.query, sep=self.sep, + func=self.func, flow=self, ) From 3c213e78c2c5da88f1b4d9adae4703309aeb2a7d Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 5 May 2022 08:25:40 +0200 Subject: [PATCH 036/119] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20import=20error?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_sap_rfc_to_adls.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/integration/flows/test_sap_rfc_to_adls.py b/tests/integration/flows/test_sap_rfc_to_adls.py index 9aa57eedf..48c28f7f1 100644 --- a/tests/integration/flows/test_sap_rfc_to_adls.py +++ b/tests/integration/flows/test_sap_rfc_to_adls.py @@ -3,6 +3,11 @@ from viadot.sources import AzureDataLake from viadot.tasks import AzureDataLakeRemove +try: + import pyrfc +except ModuleNotFoundError: + raise + ADLS_PATH = "raw/supermetrics/mp/test_file_sap.parquet" FILE_NAME = "test_file.parquet" From 5cd29a5f2475cb4ba5d2ba54c02265908953a186 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 5 May 2022 08:30:31 +0200 Subject: [PATCH 037/119] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20import=20error?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index b063a91d9..402b1afcf 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -20,4 +20,8 @@ from .duckdb_transform import DuckDBTransform from .duckdb_to_sql_server import DuckDBToSQLServer from .multiple_flows import MultipleFlows -from .sap_rfc_to_adls import SAPRFCToADLS + +try: + from .sap_rfc_to_adls import SAPRFCToADLS +except ImportError: + pass From 4aad22eafbb3a853dd771d9744b798ef6cc54cfc Mon Sep 17 00:00:00 2001 From: lzuchowska Date: Tue, 10 May 2022 15:51:30 +0200 Subject: [PATCH 038/119] 
=?UTF-8?q?=F0=9F=90=9B=20Fixed=20error=20handling?= =?UTF-8?q?=20for=20salesforce=20source?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/salesforce.py | 43 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/viadot/sources/salesforce.py b/viadot/sources/salesforce.py index b8931fcc1..656116801 100644 --- a/viadot/sources/salesforce.py +++ b/viadot/sources/salesforce.py @@ -3,7 +3,7 @@ import pandas as pd from prefect.utilities import logging from simple_salesforce import Salesforce as SF -from simple_salesforce.exceptions import SalesforceMalformedRequest +from simple_salesforce.exceptions import SalesforceResourceNotFound from ..config import local_config from ..exceptions import CredentialError @@ -91,40 +91,41 @@ def upsert( table_to_upsert = getattr(self.salesforce, table) records = df.to_dict("records") records_cp = records.copy() - + successes = 0 for record in records_cp: - response = 0 if external_id: if record[external_id] is None: continue else: merge_key = f"{external_id}/{record[external_id]}" record.pop(external_id) + record.pop("Id") else: merge_key = record.pop("Id") - try: response = table_to_upsert.upsert(data=record, record_id=merge_key) - except SalesforceMalformedRequest as e: - msg = f"Upsert of record {merge_key} failed." + codes = {200: "updated", 201: "created", 204: "updated"} + + if response not in codes: + msg = ( + f"Upsert failed for record: \n{record} with response {response}" + ) + if raise_on_error: + raise ValueError(msg) + else: + self.logger.warning(msg) + else: + successes += 1 + logger.info(f"Successfully {codes[response]} record {merge_key}.") + except SalesforceResourceNotFound as e: if raise_on_error: - raise ValueError(msg) from e + raise e else: - self.logger.warning(msg) + self.logger.warning( + f"Upsert failed for record: \n{record} with response {e}" + ) - codes = {200: "updated", 201: "created", 204: "updated"} - logger.info(f"Successfully {codes[response]} record {merge_key}.") - - if response not in codes: - raise ValueError( - f"Upsert failed for record: \n{record} with response {response}" - ) - else: - logger.info(f"Successfully {codes[response]} record {merge_key}.") - - logger.info( - f"Successfully upserted {len(records)} records into table '{table}'." 
- ) + logger.info(f"Successfully upserted {successes} records into table '{table}'.") def bulk_upsert( self, From 95891d8335b3c0ffb9568b1de118ce499a220905 Mon Sep 17 00:00:00 2001 From: lzuchowska Date: Tue, 10 May 2022 15:51:58 +0200 Subject: [PATCH 039/119] =?UTF-8?q?=E2=9C=85=20Added=20tests=20for=20sales?= =?UTF-8?q?force=20tasks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_salesforce.py | 53 ++++++++++++++++++++-- tests/integration/test_salesforce.py | 35 +++++++++----- 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/tests/integration/tasks/test_salesforce.py b/tests/integration/tasks/test_salesforce.py index df659eae0..2474b5037 100644 --- a/tests/integration/tasks/test_salesforce.py +++ b/tests/integration/tasks/test_salesforce.py @@ -1,14 +1,26 @@ import pandas as pd import pytest from viadot.tasks import SalesforceUpsert +from simple_salesforce import SalesforceResourceNotFound @pytest.fixture(scope="session") def test_df(): data = { "Id": ["111"], - "LastName": ["John Tester-External 3"], - "SAPContactId__c": [111], + "LastName": ["John Tester-External"], + "SAPContactId__c": ["111"], + } + df = pd.DataFrame(data=data) + yield df + + +@pytest.fixture(scope="session") +def test_df_wrong(): + data = { + "Id": ["123"], + "LastName": ["John Tester-Wrong"], + "SAPContactId__c": ["111"], } df = pd.DataFrame(data=data) yield df @@ -22,6 +34,41 @@ def test_salesforce_upsert(test_df): """ try: sf = SalesforceUpsert() - sf.run(test_df, table="Contact") + sf.run(test_df, table="Contact", raise_on_error=True) + except Exception as exception: + assert False, exception + + +def test_salesforce_upsert_incorrect(test_df_wrong): + """ + Checks if the error handling system catches errors regarding improper IDs. + """ + with pytest.raises(SalesforceResourceNotFound): + sf = SalesforceUpsert() + sf.run(test_df_wrong, table="Contact", raise_on_error=True) + + +def test_salesforce_upsert_incorrect_warn(test_df_wrong): + """ + Checks if the error handling system catches errors regarding improper IDs. + """ + try: + sf = SalesforceUpsert() + sf.run(test_df_wrong, table="Contact", raise_on_error=False) + except Exception as exception: + assert False, exception + + +def test_salesforce_upsert_external(test_df): + """ + Id and SAPContactId__c are unique values, you can update only non-unique values for this test. + If the combiantion of Id and SAPContactId__c do not exist, the test will fail. + The Id and SAPContactId__c values '111' needs to be replaced with proper one (that exist in the testing system). 
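For context, a small sketch of the upsert-by-external-id path that the new tests in this file exercise. It is illustrative only: the Id and SAPContactId__c values are placeholders that must already exist in the sandbox org, and the source is assumed to read its credentials from the local viadot config.

import pandas as pd
from viadot.sources import Salesforce

df = pd.DataFrame(
    {
        "Id": ["111"],
        "LastName": ["John Tester-External"],
        "SAPContactId__c": ["111"],
    }
)

sf = Salesforce()  # credentials assumed to come from the local viadot config
# Records are matched on the external field instead of the Salesforce Id;
# raise_on_error=True surfaces SalesforceResourceNotFound for unknown records.
sf.upsert(df=df, table="Contact", external_id="SAPContactId__c", raise_on_error=True)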
+ """ + try: + sf = SalesforceUpsert() + sf.run( + test_df, table="Contact", external_id="SAPContactId__c", raise_on_error=True + ) except Exception as exception: assert False, exception diff --git a/tests/integration/test_salesforce.py b/tests/integration/test_salesforce.py index d54a25933..97b230ed9 100644 --- a/tests/integration/test_salesforce.py +++ b/tests/integration/test_salesforce.py @@ -14,7 +14,7 @@ def test_df_external(): data = { "Id": ["111"], "LastName": ["John Tester-External"], - "SAPContactId__c": ["112"], + "SAPContactId__c": ["111"], } df = pd.DataFrame(data=data) yield df @@ -28,23 +28,36 @@ def test_upsert_empty(salesforce): assert False, exception -def test_upsert_external_id_correct(salesforce, test_df_external): +def test_upsert(salesforce): + new_name = "Test Upsert" + correct_row = [salesforce.download(table="Contact", columns=["Id", "LastName"])[0]] + to_upsert = pd.DataFrame(correct_row) + to_upsert["LastName"] = new_name + try: salesforce.upsert( - df=test_df_external, table="Contact", external_id="SAPContactId__c" + df=to_upsert, + table="Contact", + raise_on_error=True, ) except Exception as exception: assert False, exception - result = salesforce.download(table="Contact") - exists = list( - filter(lambda contact: contact["LastName"] == "John Tester-External", result) - ) - assert exists != None + result = salesforce.to_df(table="Contact", columns=["Id", "LastName"]) + assert len(result.loc[result["LastName"] == new_name]) > 0 -def test_upsert_external_id_wrong(salesforce, test_df_external): - with pytest.raises(ValueError): - salesforce.upsert(df=test_df_external, table="Contact", external_id="SAPId") +def test_upsert_external_id(salesforce, test_df_external): + try: + salesforce.upsert( + df=test_df_external, + table="Contact", + external_id="SAPContactId__c", + raise_on_error=True, + ) + except Exception as exception: + assert False, exception + result = salesforce.to_df(table="Contact", columns=["Id", "LastName"]) + assert len(result.loc[result["LastName"] == "John Tester-External"]) > 0 def test_download_no_query(salesforce): From 122ab512ef0054e34c67edde3f10cd45126303c1 Mon Sep 17 00:00:00 2001 From: lzuchowska Date: Tue, 10 May 2022 15:58:14 +0200 Subject: [PATCH 040/119] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20a=20typo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/salesforce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/salesforce.py b/viadot/sources/salesforce.py index 656116801..6d8fafdcc 100644 --- a/viadot/sources/salesforce.py +++ b/viadot/sources/salesforce.py @@ -150,7 +150,7 @@ def bulk_upsert( response = self.salesforce.bulk.__getattr__(table).upsert( data=records, external_id_field=external_id, batch_size=batch_size ) - except SalesforceMalformedRequest as e: + except SalesforceResourceNotFound as e: # Bulk insert didn't work at all. 
raise ValueError(f"Upsert of records failed: {e}") from e From d97f96e72d5ab1d775bf61da7dd66a7586b0d39c Mon Sep 17 00:00:00 2001 From: trymzet Date: Fri, 13 May 2022 10:02:15 +0200 Subject: [PATCH 041/119] =?UTF-8?q?=E2=9C=A8=20Add=20complete=20proxy=20se?= =?UTF-8?q?ttings=20in=20`SAPRFC`=20example?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 5 +++++ viadot/examples/sap_rfc/Dockerfile | 2 ++ viadot/examples/sap_rfc/build.sh | 1 + 3 files changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7a19b46d..042b4233f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- Added complete proxy set up in `SAPRFC` example (`viadot/examples/sap_rfc`) + + ## [0.4.3] - 2022-04-28 ### Added - Added `adls_file_name` in `SupermetricsToADLS` and `SharepointToADLS` flows @@ -21,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed `get_flow_last_run_date()` incorrectly parsing the date - Fixed `MultipleFlows` when one flow is passed and when last flow fails. + ## [0.4.2] - 2022-04-08 ### Added - Added `AzureDataLakeRemove` task diff --git a/viadot/examples/sap_rfc/Dockerfile b/viadot/examples/sap_rfc/Dockerfile index c08acb30f..a7fa620c8 100644 --- a/viadot/examples/sap_rfc/Dockerfile +++ b/viadot/examples/sap_rfc/Dockerfile @@ -13,7 +13,9 @@ COPY requirements.txt . RUN xargs -L 1 pip install < requirements.txt ARG HTTP_PROXY="" +ARG NO_PROXY="" ENV HTTP_PROXY=$HTTP_PROXY +ENV NO_PROXY=$NO_PROXY RUN git config --global http.proxy ${HTTP_PROXY:-""} USER viadot \ No newline at end of file diff --git a/viadot/examples/sap_rfc/build.sh b/viadot/examples/sap_rfc/build.sh index 9d5d45380..786d231f5 100644 --- a/viadot/examples/sap_rfc/build.sh +++ b/viadot/examples/sap_rfc/build.sh @@ -1 +1,2 @@ +# Add --build-arg NO_PROXY=$no_proxy etc. as needed docker build --no-cache . -t viadot:sap_rfc From 41bf545744319d5633e2b5267ff9284297c05f98 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Fri, 13 May 2022 13:09:21 +0200 Subject: [PATCH 042/119] =?UTF-8?q?=E2=9C=A8=20Added=20SQLServerToDF=20tas?= =?UTF-8?q?k?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/__init__.py | 2 +- viadot/tasks/sql_server.py | 46 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index 4710ef9fb..b7ebb0953 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -39,4 +39,4 @@ pass from .duckdb import DuckDBCreateTableFromParquet, DuckDBQuery, DuckDBToDF -from .sql_server import SQLServerCreateTable +from .sql_server import SQLServerCreateTable, SQLServerToDF diff --git a/viadot/tasks/sql_server.py b/viadot/tasks/sql_server.py index 349091c40..0be1c1f82 100644 --- a/viadot/tasks/sql_server.py +++ b/viadot/tasks/sql_server.py @@ -4,6 +4,8 @@ from prefect import Task from prefect.utilities.tasks import defaults_from_attrs +from viadot.sources import sql_server + from ..config import local_config from ..sources import SQLServer @@ -81,3 +83,47 @@ def run( self.logger.info( f"Table {fqn} has not been created as if_exists is set to {if_exists}." 
) + + +class SQLServerToDF(Task): + def __init__( + self, + config_key: str = None, + *args, + **kwargs, + ): + """ + Task for downloading data from SQL Server to a pandas DataFrame. + + Args: + config_key (str, optional): The key inside local config containing the credentials. Defaults to None. + + """ + self.config_key = config_key + + super().__init__(name="sql_server_to_df", *args, **kwargs) + + @defaults_from_attrs("config_key") + def run( + self, + query: str, + config_key: str = None, + ): + """ + Load the result of a SQL Server Database query into a pandas DataFrame. + + Args: + query (str, required): The query to execute on the SQL Server database. If don't start with "SELECT" + returns empty DataFrame. + config_key (str, optional): The key inside local config containing the credentials. Defaults to None. + + """ + sql_server = SQLServer(config_key=config_key) + df = sql_server.to_df(query=query) + nrows = df.shape[0] + ncols = df.shape[1] + + self.logger.info( + f"Successfully downloaded {nrows} rows and {ncols} columns of data to a DataFrame." + ) + return df From 43b052610ea0295fab3fb76d420279b46e021a73 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Fri, 13 May 2022 13:10:00 +0200 Subject: [PATCH 043/119] =?UTF-8?q?=E2=9C=A8=20Added=20=20SQLServerToDuckD?= =?UTF-8?q?B=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 + viadot/flows/__init__.py | 1 + viadot/flows/sql_server_to_duckdb.py | 81 ++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+) create mode 100644 viadot/flows/sql_server_to_duckdb.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 042b4233f..d309f90a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added `SQLServerToDF` task +- Added `SQLServerToDuckDB` flow which downloads data from SQLServer table, loads it to parquet file and then uplads it do DuckDB - Added complete proxy set up in `SAPRFC` example (`viadot/examples/sap_rfc`) diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index 55c705613..0414817c6 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -20,3 +20,4 @@ from .duckdb_transform import DuckDBTransform from .duckdb_to_sql_server import DuckDBToSQLServer from .multiple_flows import MultipleFlows +from .sql_server_to_duckdb import SQLServerToDuckDB diff --git a/viadot/flows/sql_server_to_duckdb.py b/viadot/flows/sql_server_to_duckdb.py new file mode 100644 index 000000000..361bb1242 --- /dev/null +++ b/viadot/flows/sql_server_to_duckdb.py @@ -0,0 +1,81 @@ +from prefect import Flow +from typing import Any, Dict, List, Literal + + +from ..task_utils import df_to_parquet, add_ingestion_metadata_task +from ..tasks import SQLServerToDF, DuckDBCreateTableFromParquet + +df_task = SQLServerToDF() + + +class SQLServerToDuckDB(Flow): + def __init__( + self, + name, + sql_query: str, + local_file_path: str, + sqlserver_config_key: str = None, + duckdb_table: str = None, + duckdb_schema: str = None, + if_exists: Literal["fail", "replace", "append", "skip", "delete"] = "fail", + duckdb_credentials: dict = None, + *args: List[any], + **kwargs: Dict[str, Any], + ): + """ + Flow for upolading data from SQL Server to DuckDB. + + Args: + name (str): The name of the flow. + sql_query (str, required): The query to execute on the SQL Server database. If don't start with "SELECT" + returns empty DataFrame. 
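For context, a minimal sketch of the new SQLServerToDF task used on its own, before it is wired into the flow described here. It is illustrative only: the "AZURE_SQL" config key and the table name are placeholders.

from viadot.tasks import SQLServerToDF

df_task = SQLServerToDF(config_key="AZURE_SQL")
# Queries that do not start with SELECT yield an empty DataFrame.
df = df_task.run(query="SELECT TOP 10 * FROM sandbox.test")
print(df.shape)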
+ local_file_path (str): Path to output parquet file. + sqlserver_config_key (str, optional): The key inside local config containing the credentials. Defaults to None. + duckdb_table (str, optional): Destination table in DuckDB. Defaults to None. + duckdb_schema (str, optional): Destination schema in DuckDB. Defaults to None. + if_exists (Literal, optional): What to do if the table already exists. Defaults to "fail". + duckdb_credentials (dict, optional): Credentials for the DuckDB connection. Defaults to None. + + """ + # SQLServerToDF + self.sql_query = sql_query + self.sqlserver_config_key = sqlserver_config_key + + # DuckDBCreateTableFromParquet + self.local_file_path = local_file_path + self.duckdb_table = duckdb_table + self.duckdb_schema = duckdb_schema + self.if_exists = if_exists + self.duckdb_credentials = duckdb_credentials + + super().__init__(*args, name=name, **kwargs) + + self.create_duckdb_table_task = DuckDBCreateTableFromParquet( + credentials=duckdb_credentials + ) + + self.gen_flow() + + def gen_flow(self) -> Flow: + df = df_task.bind( + config_key=self.sqlserver_config_key, query=self.sql_query, flow=self + ) + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) + + parquet = df_to_parquet.bind( + df=df_with_metadata, + path=self.local_file_path, + if_exists=self.if_exists, + flow=self, + ) + create_duckdb_table = self.create_duckdb_table_task.bind( + path=self.local_file_path, + schema=self.duckdb_schema, + table=self.duckdb_table, + if_exists=self.if_exists, + flow=self, + ) + + df_with_metadata.set_upstream(df, flow=self) + parquet.set_upstream(df_with_metadata, flow=self) + create_duckdb_table.set_upstream(parquet, flow=self) From e669ad8a37892f5e24ae2cdebb0cc1c587052e14 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 16 May 2022 09:55:08 +0200 Subject: [PATCH 044/119] =?UTF-8?q?=E2=9C=85=20corrected=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../integration/flows/test_duckdb_to_sql_server.py | 12 ++++++++---- tests/integration/tasks/test_sql_server.py | 14 +++++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/tests/integration/flows/test_duckdb_to_sql_server.py b/tests/integration/flows/test_duckdb_to_sql_server.py index c6b59d36d..b5d40d149 100644 --- a/tests/integration/flows/test_duckdb_to_sql_server.py +++ b/tests/integration/flows/test_duckdb_to_sql_server.py @@ -1,6 +1,7 @@ import os import json import pytest +import logging from viadot.flows import DuckDBToSQLServer from unittest import mock from viadot.sources import DuckDB @@ -25,10 +26,13 @@ def test__check_if_schema_exists(duckdb): assert not duckdb._check_if_schema_exists(SCHEMA) -def test_create_table_from_parquet(duckdb, TEST_PARQUET_FILE_PATH): - duckdb.create_table_from_parquet( - schema=SCHEMA, table=TABLE, path=TEST_PARQUET_FILE_PATH - ) +def test_create_table_from_parquet(duckdb, TEST_PARQUET_FILE_PATH, caplog): + with caplog.at_level(logging.INFO): + duckdb.create_table_from_parquet( + schema=SCHEMA, table=TABLE, path=TEST_PARQUET_FILE_PATH + ) + + assert "created successfully" in caplog.text def test_duckdb_sql_server_init(): diff --git a/tests/integration/tasks/test_sql_server.py b/tests/integration/tasks/test_sql_server.py index aea2bc0c6..6657ceaa4 100644 --- a/tests/integration/tasks/test_sql_server.py +++ b/tests/integration/tasks/test_sql_server.py @@ -30,13 +30,13 @@ def test_sql_server_create_table(caplog): ) dtypes = { - "Date": "DATE", - "profile": "VARCHAR(255)", - "Campaignname": 
"VARCHAR(255)", - "Impressions": "FLOAT(24)", - "Clicks": "FLOAT(24)", - "Cost_eur": "FLOAT(24)", - "SearchImpressionShare": "VARCHAR(255)", + "date": "DATE", + "name": "VARCHAR(255)", + "id": "VARCHAR(255)", + "weather": "FLOAT(24)", + "rain": "FLOAT(24)", + "temp": "FLOAT(24)", + "summary": "VARCHAR(255)", } create_table_task = SQLServerCreateTable() From 01ee4d5df23185fd6fbf3279a63b2c04c1ce4aa4 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 16 May 2022 09:55:43 +0200 Subject: [PATCH 045/119] =?UTF-8?q?=F0=9F=90=9B=20edited=20gitignore?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index fb0123b47..36fdc6f6b 100644 --- a/.gitignore +++ b/.gitignore @@ -153,5 +153,3 @@ desktop.ini # SAP RFC lib sap_netweaver_rfc - -michal \ No newline at end of file From 44a3cf7d0113a60261d211476067529f4efe47a5 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 16 May 2022 15:07:28 +0200 Subject: [PATCH 046/119] =?UTF-8?q?=E2=9C=85=20Added=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_sql_server_to_duckdb.py | 70 +++++++++++++++++++ tests/integration/tasks/test_sql_server.py | 18 ++++- 2 files changed, 85 insertions(+), 3 deletions(-) create mode 100644 tests/integration/flows/test_sql_server_to_duckdb.py diff --git a/tests/integration/flows/test_sql_server_to_duckdb.py b/tests/integration/flows/test_sql_server_to_duckdb.py new file mode 100644 index 000000000..fc923b211 --- /dev/null +++ b/tests/integration/flows/test_sql_server_to_duckdb.py @@ -0,0 +1,70 @@ +import json +import pytest + +from viadot.flows.sql_server_to_duckdb import SQLServerToDuckDB +from viadot.tasks import SQLServerCreateTable, DuckDBToDF +from viadot.tasks.azure_key_vault import AzureKeyVaultSecret +from prefect.tasks.secrets import PrefectSecret + +SCHEMA = "sandbox" +TABLE = "test" + + +@pytest.fixture(scope="session") +def create_sql_server_table(): + credentials_secret = PrefectSecret( + "AZURE_DEFAULT_SQLDB_SERVICE_PRINCIPAL_SECRET" + ).run() + vault_name = PrefectSecret("AZURE_DEFAULT_KEYVAULT").run() + azure_secret_task = AzureKeyVaultSecret() + credentials_str = azure_secret_task.run( + secret=credentials_secret, vault_name=vault_name + ) + dtypes = { + "date": "DATE", + "name": "VARCHAR(255)", + "id": "VARCHAR(255)", + "weather": "FLOAT(24)", + "rain": "FLOAT(24)", + "temp": "FLOAT(24)", + "summary": "VARCHAR(255)", + } + create_table_task = SQLServerCreateTable() + yield create_table_task.run( + schema=SCHEMA, + table=TABLE, + dtypes=dtypes, + if_exists="replace", + credentials=json.loads(credentials_str), + ) + + +def test_sql_server_to_duckdb(create_sql_server_table): + create_sql_server_table + duckdb_creds = {"database": "/home/viadot/database/test.duckdb"} + flow = SQLServerToDuckDB( + name="test", + sql_query=f"SELECT * FROM {SCHEMA}.{TABLE}", + local_file_path="test.parquet", + sqlserver_config_key="AZURE_SQL", + if_exists="replace", + duckdb_table=TABLE, + duckdb_schema=SCHEMA, + duckdb_credentials=duckdb_creds, + ) + result = flow.run() + assert result.is_successful() + + df_task = DuckDBToDF(credentials=duckdb_creds) + df = df_task.run(table=TABLE, schema=SCHEMA) + + assert df.columns.to_list() == [ + "date", + "name", + "id", + "weather", + "rain", + "temp", + "summary", + "_viadot_downloaded_at_utc", + ] diff --git a/tests/integration/tasks/test_sql_server.py 
b/tests/integration/tasks/test_sql_server.py index 6657ceaa4..972c2b83b 100644 --- a/tests/integration/tasks/test_sql_server.py +++ b/tests/integration/tasks/test_sql_server.py @@ -1,8 +1,7 @@ import json import logging import inspect -import types -from viadot.tasks import SQLServerCreateTable +from viadot.tasks import SQLServerCreateTable, SQLServerToDF from viadot.tasks.azure_key_vault import AzureKeyVaultSecret from prefect.tasks.secrets import PrefectSecret @@ -19,7 +18,6 @@ def test_sql_server_create_table_init(): def test_sql_server_create_table(caplog): - credentials_secret = PrefectSecret( "AZURE_DEFAULT_SQLDB_SERVICE_PRINCIPAL_SECRET" ).run() @@ -49,3 +47,17 @@ def test_sql_server_create_table(caplog): credentials=json.loads(credentials_str), ) assert "Successfully created table" in caplog.text + + +def test_sql_server_to_df(): + task = SQLServerToDF(config_key="AZURE_SQL") + df = task.run(query=f"SELECT * FROM {SCHEMA}.{TABLE}") + assert df.columns.to_list() == [ + "date", + "name", + "id", + "weather", + "rain", + "temp", + "summary", + ] From a66729411735d2c8784a957f1e1b1d7f78fdce6a Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 17 May 2022 14:54:34 +0200 Subject: [PATCH 047/119] =?UTF-8?q?=F0=9F=8E=A8Added=20changes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_sql_server_to_duckdb.py | 8 +++++++- viadot/flows/sql_server_to_duckdb.py | 3 --- viadot/tasks/sql_server.py | 4 ++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/integration/flows/test_sql_server_to_duckdb.py b/tests/integration/flows/test_sql_server_to_duckdb.py index fc923b211..03974a0ca 100644 --- a/tests/integration/flows/test_sql_server_to_duckdb.py +++ b/tests/integration/flows/test_sql_server_to_duckdb.py @@ -2,7 +2,7 @@ import pytest from viadot.flows.sql_server_to_duckdb import SQLServerToDuckDB -from viadot.tasks import SQLServerCreateTable, DuckDBToDF +from viadot.tasks import SQLServerCreateTable, DuckDBToDF, DuckDBQuery, AzureSQLDBQuery from viadot.tasks.azure_key_vault import AzureKeyVaultSecret from prefect.tasks.secrets import PrefectSecret @@ -37,6 +37,10 @@ def create_sql_server_table(): if_exists="replace", credentials=json.loads(credentials_str), ) + drop_sqlserver = AzureSQLDBQuery() + drop_sqlserver.run( + query=f"DROP TABLE {SCHEMA}.{TABLE}", credentials_secret=credentials_secret + ) def test_sql_server_to_duckdb(create_sql_server_table): @@ -68,3 +72,5 @@ def test_sql_server_to_duckdb(create_sql_server_table): "summary", "_viadot_downloaded_at_utc", ] + drop_duckdb = DuckDBQuery() + drop_duckdb.run(query=f"DROP TABLE {SCHEMA}.{TABLE}", credentials=duckdb_creds) diff --git a/viadot/flows/sql_server_to_duckdb.py b/viadot/flows/sql_server_to_duckdb.py index 361bb1242..d852331d0 100644 --- a/viadot/flows/sql_server_to_duckdb.py +++ b/viadot/flows/sql_server_to_duckdb.py @@ -75,7 +75,4 @@ def gen_flow(self) -> Flow: if_exists=self.if_exists, flow=self, ) - - df_with_metadata.set_upstream(df, flow=self) - parquet.set_upstream(df_with_metadata, flow=self) create_duckdb_table.set_upstream(parquet, flow=self) diff --git a/viadot/tasks/sql_server.py b/viadot/tasks/sql_server.py index 0be1c1f82..0ead13908 100644 --- a/viadot/tasks/sql_server.py +++ b/viadot/tasks/sql_server.py @@ -113,8 +113,8 @@ def run( Load the result of a SQL Server Database query into a pandas DataFrame. Args: - query (str, required): The query to execute on the SQL Server database. 
If don't start with "SELECT" - returns empty DataFrame. + query (str, required): The query to execute on the SQL Server database. If the qery doesn't start + with "SELECT" returns an empty DataFrame. config_key (str, optional): The key inside local config containing the credentials. Defaults to None. """ From 143581fd8f36f4cab2cad3c1c4cbf77f6dc2f8cc Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 17 May 2022 15:48:20 +0200 Subject: [PATCH 048/119] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Added=203=20more?= =?UTF-8?q?=20lib?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ba7146655..56649358a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,4 +31,7 @@ sql-metadata==2.3.0 duckdb==0.3.2 google-auth==2.6.2 sendgrid==6.9.7 -pandas-gbq==0.17.4 \ No newline at end of file +pandas-gbq==0.17.4 +PyMySQL==1.0.2 +paramiko==2.11.0 +sshtunnel==0.4.0 \ No newline at end of file From 106204d171280f4db880177b76813adeebcab3bf Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 17 May 2022 15:50:26 +0200 Subject: [PATCH 049/119] =?UTF-8?q?=F0=9F=90=9B=20solved=20conflict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 6c0374526..846bee73b 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -382,10 +382,7 @@ def custom_mail_state_handler( from_email (str): Sender mailbox address. to_emails (str): Receiver mailbox address. Returns: State: the `new_state` object that was provided -<<<<<<< HEAD -======= ->>>>>>> a8c7a85218aee3536bbc81d517f509bd058126d5 """ if credentials_secret is None: From 3fcb60bec4a5a8aad07e615866c628e0ba77143d Mon Sep 17 00:00:00 2001 From: Mike <70263671+winiar93@users.noreply.github.com> Date: Tue, 17 May 2022 15:53:47 +0200 Subject: [PATCH 050/119] update gitignore --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index fb0123b47..36fdc6f6b 100644 --- a/.gitignore +++ b/.gitignore @@ -153,5 +153,3 @@ desktop.ini # SAP RFC lib sap_netweaver_rfc - -michal \ No newline at end of file From 2d49708e754bd02bf6e8d1246080e0060a992378 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 17 May 2022 15:59:12 +0200 Subject: [PATCH 051/119] =?UTF-8?q?=F0=9F=90=9B=20black=20formatting=20-?= =?UTF-8?q?=20solved?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/adls_gen1_to_azure_sql_new.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/flows/adls_gen1_to_azure_sql_new.py b/viadot/flows/adls_gen1_to_azure_sql_new.py index 8ae014b3c..dfd21ea9d 100644 --- a/viadot/flows/adls_gen1_to_azure_sql_new.py +++ b/viadot/flows/adls_gen1_to_azure_sql_new.py @@ -10,6 +10,7 @@ from ..tasks import AzureDataLakeToDF, AzureDataLakeUpload, AzureSQLCreateTable, BCPTask + gen1_to_df_task = AzureDataLakeToDF(gen=1) gen2_upload_task = AzureDataLakeUpload(gen=2) create_table_task = AzureSQLCreateTable() @@ -149,4 +150,3 @@ def gen_flow(self) -> Flow: gen2_upload_task.set_upstream(df_to_csv_task, flow=self) create_table_task.set_upstream(df_to_csv_task, flow=self) bulk_insert_task.set_upstream(create_table_task, flow=self) - From c061fa1362edb2e2a2796d837b8d57cee60c4ce9 Mon Sep 17 00:00:00 2001 From: Hummer3099 Date: Wed, 18 May 2022 15:28:55 +0200 
Subject: [PATCH 052/119] =?UTF-8?q?=E2=9C=A8=20Added=20databricks-connect?= =?UTF-8?q?=20support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .databricks-connect | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .databricks-connect diff --git a/.databricks-connect b/.databricks-connect new file mode 100644 index 000000000..3cc53d274 --- /dev/null +++ b/.databricks-connect @@ -0,0 +1,7 @@ +{ + "host": "", + "token": "", + "cluster_id": "", + "org_id": "", + "port": "" +} \ No newline at end of file From 5588d1e4747b4234220a71d990728d2a2d1b2d97 Mon Sep 17 00:00:00 2001 From: Hummer3099 Date: Wed, 18 May 2022 15:32:11 +0200 Subject: [PATCH 053/119] =?UTF-8?q?=E2=9C=A8=20Added=20example=20notebook?= =?UTF-8?q?=20for=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- databricks_prefect_test.ipynb | 182 ++++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 databricks_prefect_test.ipynb diff --git a/databricks_prefect_test.ipynb b/databricks_prefect_test.ipynb new file mode 100644 index 000000000..be40ee8bd --- /dev/null +++ b/databricks_prefect_test.ipynb @@ -0,0 +1,182 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "656de0ec-f9bf-4303-bcb7-5413c07b801a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22/05/17 12:33:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n", + "22/05/17 12:33:00 WARN MetricsSystem: Using default name SparkStatusTracker for source because neither spark.metrics.namespace nor spark.app.id is set.\n", + "22/05/17 12:33:03 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state TERMINATED, waiting for it to start running...\n", + "22/05/17 12:33:13 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:33:23 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:33:33 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:33:44 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:33:54 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:34:04 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:34:14 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:34:24 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:34:34 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:34:44 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:34:54 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:35:04 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:35:14 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:35:25 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:35:35 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:35:45 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:35:55 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:36:05 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:36:15 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:36:25 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:36:35 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", + "22/05/17 12:37:03 WARN DBFS: DBFS listStatus on /mnt took 1136 ms\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[FileInfo(path='dbfs:/mnt/DLQ/', name='DLQ/', size=0, modificationTime=0), FileInfo(path='dbfs:/mnt/SPT/', name='SPT/', size=0, modificationTime=0), FileInfo(path='dbfs:/mnt/adls/', name='adls/', size=0, modificationTime=0), 
FileInfo(path='dbfs:/mnt/azuwevelbwdls01q/', name='azuwevelbwdls01q/', size=0, modificationTime=0), FileInfo(path='dbfs:/mnt/conformed/', name='conformed/', size=0, modificationTime=0), FileInfo(path='dbfs:/mnt/delta/', name='delta/', size=0, modificationTime=1611768799000), FileInfo(path='dbfs:/mnt/raw/', name='raw/', size=0, modificationTime=0)]\n" + ] + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.types import *\n", + "from IPython.display import display\n", + "from pyspark.dbutils import DBUtils\n", + "from datetime import date\n", + "import os\n", + "\n", + "spark = SparkSession.builder.appName('viadot').getOrCreate()\n", + "dbutils = DBUtils(spark)\n", + "\n", + "response = dbutils.fs.ls(\"/mnt/\")\n", + "print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "cb26af4f-96c9-4f8b-a632-54088dcfd304", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2022-05-17 13:51:28+0000] INFO - prefect.test | Waiting for next scheduled run at 2022-05-17T13:52:00+00:00\n", + "[2022-05-17 13:52:00+0000] INFO - prefect.FlowRunner | Beginning Flow run for 'test'\n", + "[2022-05-17 13:52:00+0000] INFO - prefect.TaskRunner | Task 'generate_fake_data': Starting task run...\n", + "[2022-05-17 13:52:00+0000] INFO - prefect.TaskRunner | Task 'generate_fake_data': Finished task run for task with final state: 'Success'\n", + "[2022-05-17 13:52:00+0000] INFO - prefect.TaskRunner | Task 'write_to_spark': Starting task run...\n", + "View job details at https://adb-1930462786844525.5.azuredatabricks.net/?o=1930462786844525#/setting/clusters/0427-122644-45iadnd/sparkUi\n", + "[2022-05-17 13:52:02+0000] INFO - prefect.TaskRunner | Task 'write_to_spark': Finished task run for task with final state: 'Success'\n", + "[2022-05-17 13:52:02+0000] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded\n", + "[2022-05-17 13:52:02+0000] INFO - prefect.test | Waiting for next scheduled run at 2022-05-17T13:53:00+00:00\n", + "[2022-05-17 13:53:00+0000] INFO - prefect.FlowRunner | Beginning Flow run for 'test'\n", + "[2022-05-17 13:53:00+0000] INFO - prefect.TaskRunner | Task 'generate_fake_data': Starting task run...\n", + "[2022-05-17 13:53:00+0000] INFO - prefect.TaskRunner | Task 'generate_fake_data': Finished task run for task with final state: 'Success'\n", + "[2022-05-17 13:53:00+0000] INFO - prefect.TaskRunner | Task 'write_to_spark': Starting task run...\n", + "View job details at https://adb-1930462786844525.5.azuredatabricks.net/?o=1930462786844525#/setting/clusters/0427-122644-45iadnd/sparkUi\n", + "[2022-05-17 13:53:02+0000] INFO - prefect.TaskRunner | Task 'write_to_spark': Finished task run for task with final state: 'Success'\n", + "[2022-05-17 13:53:02+0000] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded\n", + "[2022-05-17 13:53:02+0000] INFO - prefect.test | Waiting for next scheduled run at 2022-05-17T13:54:00+00:00\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [41]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 21\u001b[0m write_to_spark(df)\n\u001b[1;32m 23\u001b[0m f\u001b[38;5;241m.\u001b[39mvisualize()\n\u001b[0;32m---> 24\u001b[0m 
\u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.8/site-packages/prefect/core/flow.py:1274\u001b[0m, in \u001b[0;36mFlow.run\u001b[0;34m(self, parameters, run_on_schedule, runner_cls, **kwargs)\u001b[0m\n\u001b[1;32m 1271\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m run_on_schedule \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1272\u001b[0m run_on_schedule \u001b[38;5;241m=\u001b[39m cast(\u001b[38;5;28mbool\u001b[39m, prefect\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mflows\u001b[38;5;241m.\u001b[39mrun_on_schedule)\n\u001b[0;32m-> 1274\u001b[0m state \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1275\u001b[0m \u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparameters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1276\u001b[0m \u001b[43m \u001b[49m\u001b[43mrunner_cls\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrunner_cls\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1277\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_on_schedule\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrun_on_schedule\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1278\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1279\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1281\u001b[0m \u001b[38;5;66;03m# state always should return a dict of tasks. If it's empty (meaning the run was\u001b[39;00m\n\u001b[1;32m 1282\u001b[0m \u001b[38;5;66;03m# interrupted before any tasks were executed), we set the dict manually.\u001b[39;00m\n\u001b[1;32m 1283\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m state\u001b[38;5;241m.\u001b[39m_result:\n", + "File \u001b[0;32m/usr/local/lib/python3.8/site-packages/prefect/core/flow.py:1079\u001b[0m, in \u001b[0;36mFlow._run\u001b[0;34m(self, parameters, runner_cls, run_on_schedule, **kwargs)\u001b[0m\n\u001b[1;32m 1075\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m naptime \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1076\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39minfo(\n\u001b[1;32m 1077\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWaiting for next scheduled run at \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(next_run_time)\n\u001b[1;32m 1078\u001b[0m )\n\u001b[0;32m-> 1079\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnaptime\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1081\u001b[0m error \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 1083\u001b[0m \u001b[38;5;66;03m# begin a single flow run\u001b[39;00m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "from prefect import task, Flow\n", + "from prefect.schedules import IntervalSchedule\n", + "import pandas as pd\n", + "from datetime import timedelta\n", + "import datetime\n", + "\n", + "@task\n", + "def generate_fake_data(data: list):\n", + " return pd.DataFrame(data)\n", + "\n", + "@task\n", + "def write_to_spark(df: pd.DataFrame):\n", + " # this works for a Spark DataFrame 
when ran in a Databricks notebook; check if pyspark-python can handle pandas DataFrame as well\n", + " sparkdf = spark.createDataFrame(df)\n", + " sparkdf.write.mode(\"append\").saveAsTable(\"raw.c4c_test4\")\n", + "\n", + "schedule = IntervalSchedule(interval=timedelta(minutes=1))\n", + "with Flow(\"test\", schedule=schedule) as f:\n", + " data_raw =[{\"Id\": \"KVSzUaILfQZXDb\" + str(datetime.datetime.now()), \"AccountId\": \"EHNYKjSZsiy\", \"Name\": \"Turner-Black\", \"FirstName\": \"Adam\", \"LastName\": \"Carter\", \"ContactEMail\": \"Adam.Carter@TurnerBlack.com\", \"MailingCity\": \"Jamesport\"}]\n", + " df = generate_fake_data(data_raw)\n", + " write_to_spark(df)\n", + " \n", + "f.visualize()\n", + "f.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "1ae98113-4079-4ec1-a816-bffab1ca5a99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-05-17 13:42:54.358083\n" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdf7af8d-c075-4c78-a113-18b5afa1ebe1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ac7f7de3d93913fc83a1a3fa0377a7b8350ab5f2 Mon Sep 17 00:00:00 2001 From: Hummer3099 Date: Wed, 18 May 2022 15:34:53 +0200 Subject: [PATCH 054/119] =?UTF-8?q?=F0=9F=92=A1=20Added=20comments=20to=20?= =?UTF-8?q?code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- databricks_prefect_test.ipynb | 9 +++++++-- docker/Dockerfile | 36 ++++++++++++++++------------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/databricks_prefect_test.ipynb b/databricks_prefect_test.ipynb index be40ee8bd..031abb9b3 100644 --- a/databricks_prefect_test.ipynb +++ b/databricks_prefect_test.ipynb @@ -56,6 +56,8 @@ "from datetime import date\n", "import os\n", "\n", + "# Testing reading directory\n", + "\n", "spark = SparkSession.builder.appName('viadot').getOrCreate()\n", "dbutils = DBUtils(spark)\n", "\n", @@ -113,23 +115,26 @@ "from datetime import timedelta\n", "import datetime\n", "\n", + "# Flow for writing data every minute to a table\n", + "\n", + "# Convert list to pandas dataframe\n", "@task\n", "def generate_fake_data(data: list):\n", " return pd.DataFrame(data)\n", "\n", + "# Convert pandas dataframe to spark dataframe then write to table\n", "@task\n", "def write_to_spark(df: pd.DataFrame):\n", - " # this works for a Spark DataFrame when ran in a Databricks notebook; check if pyspark-python can handle pandas DataFrame as well\n", " sparkdf = spark.createDataFrame(df)\n", " sparkdf.write.mode(\"append\").saveAsTable(\"raw.c4c_test4\")\n", "\n", + "# Flow scheduled for every minute\n", "schedule = IntervalSchedule(interval=timedelta(minutes=1))\n", "with Flow(\"test\", schedule=schedule) as f:\n", " data_raw =[{\"Id\": \"KVSzUaILfQZXDb\" + str(datetime.datetime.now()), \"AccountId\": \"EHNYKjSZsiy\", \"Name\": \"Turner-Black\", \"FirstName\": \"Adam\", \"LastName\": \"Carter\", \"ContactEMail\": \"Adam.Carter@TurnerBlack.com\", \"MailingCity\": \"Jamesport\"}]\n", " df = 
generate_fake_data(data_raw)\n", " write_to_spark(df)\n", " \n", - "f.visualize()\n", "f.run()" ] }, diff --git a/docker/Dockerfile b/docker/Dockerfile index 97a5a3cbc..1ddc79f69 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,6 +1,4 @@ -FROM prefecthq/prefect:0.15.11-python3.8 - - +FROM prefecthq/prefect:latest-python3.8 # Add user RUN useradd --create-home viadot && \ chown -R viadot /home/viadot && \ @@ -11,14 +9,12 @@ RUN useradd --create-home viadot && \ RUN groupadd docker && \ usermod -aG docker viadot - # Release File Error # https://stackoverflow.com/questions/63526272/release-file-is-not-valid-yet-docker RUN echo "Acquire::Check-Valid-Until \"false\";\nAcquire::Check-Date \"false\";" | cat > /etc/apt/apt.conf.d/10no--check-valid-until - # System packages -RUN apt update -q && yes | apt install -q vim unixodbc-dev build-essential \ +RUN apt update && yes | apt install vim unixodbc-dev build-essential \ curl python3-dev libboost-all-dev libpq-dev graphviz python3-gi sudo git RUN pip install --upgrade cffi @@ -26,10 +22,6 @@ RUN curl http://archive.ubuntu.com/ubuntu/pool/main/g/glibc/multiarch-support_2. -o multiarch-support_2.27-3ubuntu1_amd64.deb && \ apt install ./multiarch-support_2.27-3ubuntu1_amd64.deb -# Fix for old SQL Servers still using TLS < 1.2 -RUN chmod +rwx /usr/lib/ssl/openssl.cnf && \ - sed -i 's/SECLEVEL=2/SECLEVEL=1/g' /usr/lib/ssl/openssl.cnf - # ODBC -- make sure to pin driver version as it's reflected in odbcinst.ini RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \ curl https://packages.microsoft.com/config/debian/10/prod.list > /etc/apt/sources.list.d/mssql-release.list && \ @@ -41,14 +33,7 @@ RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \ COPY docker/odbcinst.ini /etc - # Python env - -# This one's needed for the SAP RFC connector. -# It must be installed here as the SAP package does not define its dependencies, -# so `pip install pyrfc` breaks if all deps are not already present. -RUN pip install cython==0.29.24 - WORKDIR /code COPY requirements.txt /code/ RUN pip install --upgrade pip @@ -57,16 +42,27 @@ RUN pip install -r requirements.txt COPY . . RUN pip install . 
-RUN rm -rf /code +# Instaling databricks-connect +RUN apt-get update && apt-get -y install sudo +RUN sudo apt-get -y install software-properties-common +## Install Java 8 +RUN curl https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public | sudo apt-key add - && \ + add-apt-repository --yes https://adoptopenjdk.jfrog.io/adoptopenjdk/deb/ && \ + apt update && \ + apt install adoptopenjdk-11-hotspot -y && \ + find /usr/bin/java -type d -exec chmod 777 {} \; +### Export env variable +ENV SPARK_HOME /usr/local/lib/python3.8/site-packages/pyspark +RUN export SPARK_HOME +RUN rm -rf /code # Workdir ENV USER viadot ENV HOME="/home/$USER" + WORKDIR ${HOME} USER ${USER} - - EXPOSE 8000 \ No newline at end of file From 98fc142fa1d0f0c4fff33977a5dc92d3b5d0e6d0 Mon Sep 17 00:00:00 2001 From: Hummer3099 Date: Wed, 18 May 2022 15:37:32 +0200 Subject: [PATCH 055/119] =?UTF-8?q?=F0=9F=93=9D=20Updated=20README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 63 +++++++++++++------------------------------------------ 1 file changed, 15 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 03abb7c55..17d04ac0b 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,8 @@ Install the library in development mode (repeat for the `viadot_jupyter_lab` con docker exec -it viadot_testing pip install -e . --user ``` +### Databricks integration +To connect to a Databricks cluster, modify `/.databricks-connect` with the desired values. Follow step 2 of this [link](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/databricks-connect) to retrieve the values from the Databricks cluster. ## Running tests @@ -101,6 +103,13 @@ FLOW_NAME=hello_world; python -m viadot.examples.$FLOW_NAME However, when developing, the easiest way is to use the provided Jupyter Lab container available in the browser at `http://localhost:9000/`. +## Executing Spark jobs locally using databricks-connect +To begin using spark you must first create a Spark Session: `spark = SparkSession.builder.appName('session_name').getOrCreate()`. `spark` will be used to access all the spark methods. Here is a list of commonly used spark methods (WIP): +* spark.createDataFrame(df): Create a Spark dataframe from a Pandas dataframe +* sparkdf.write.saveAsTable("schema.table"): Takes a Spark Dataframe and saves it as a table in Databricks. +* Ensure to use the correct schema, as it should be created and specified by the administrator +* table = spark.sql("select * from schema.table"): example of a simple query run through Python + ## How to contribute @@ -108,59 +117,17 @@ However, when developing, the easiest way is to use the provided Jupyter Lab con 2. Set up locally 3. Test your changes with `pytest` 4. Submit a PR. The PR should contain the following: - - new/changed functionality - - tests for the changes - - changes added to `CHANGELOG.md` - - any other relevant resources updated (esp. `viadot/docs`) - -The general flow of working for this repository in case of forking: -1. Pull before making any changes -2. Create a new branch with -``` -git checkout -b -``` -3. Make some work on repository -4. Stage changes with -``` -git add -``` -5. Commit the changes with -``` -git commit -m -``` -__Note__: See out Style Guidelines for more information about commit messages and PR names - -6. Fetch and pull the changes that could happen while working with -``` -git fetch -git checkout / -``` -7. Push your changes on repostory using -``` -git push origin -``` -8. 
Use merge to finish your push to repository -``` -git checkout -git merge -``` +- new/changed functionality +- tests for the changes +- changes added to `CHANGELOG.md` +- any other relevant resources updated (esp. `viadot/docs`) Please follow the standards and best practices used within the library (eg. when adding tasks, see how other tasks are constructed, etc.). For any questions, please reach out to us here on GitHub. + ### Style guidelines - the code should be formatted with Black using default settings (easiest way is to use the VSCode extension) - commit messages should: - begin with an emoji - start with one of the following verbs, capitalized, immediately after the summary emoji: "Added", "Updated", "Removed", "Fixed", "Renamed", and, sporadically, other ones, such as "Upgraded", "Downgraded", or whatever you find relevant for your particular situation - - contain a useful description of what the commit is doing - -## Set up Black for development in VSCode -Your code should be formatted with Black when you want to contribute. To set up Black in Visual Studio Code follow instructions below. -1. Install `black` in your environment by writing in the terminal: -``` -pip install black -``` -2. Go to the settings - gear icon in the bottom left corner and select `Settings` or type "Ctrl" + ",". -3. Find the `Format On Save` setting - check the box. -4. Find the `Python Formatting Provider` and select "black" in the drop-down list. -5. Your code should auto format on save now. \ No newline at end of file + - contain a useful description of what the commit is doing \ No newline at end of file From 30bbfbd1355a4dc07d851ac44b11286b3240aab3 Mon Sep 17 00:00:00 2001 From: Hummer3099 Date: Wed, 18 May 2022 15:45:45 +0200 Subject: [PATCH 056/119] =?UTF-8?q?=F0=9F=93=9D=20Appended=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7a19b46d..8a0139d5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] +### Added + - Added `.databricks-connect` file for configuring databricks connectivity + +### Changed + - Changed README with added databricks/spark usage instructions + - Modified dockerfile to accomodate databricks installation ## [0.4.3] - 2022-04-28 ### Added - Added `adls_file_name` in `SupermetricsToADLS` and `SharepointToADLS` flows From 24264dcef7c98fce9aca5bc95d1471ad7567f318 Mon Sep 17 00:00:00 2001 From: Hummer3099 Date: Wed, 18 May 2022 15:53:13 +0200 Subject: [PATCH 057/119] =?UTF-8?q?=F0=9F=94=A5=20Removed=20unnecessary=20?= =?UTF-8?q?file(s)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 - databricks_prefect_test.ipynb | 187 ---------------------------------- 2 files changed, 188 deletions(-) delete mode 100644 databricks_prefect_test.ipynb diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a0139d5b..3ca37d15c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Added `.databricks-connect` file for configuring databricks connectivity - ### Changed - Changed README with added databricks/spark usage instructions - Modified dockerfile to accomodate databricks installation diff --git a/databricks_prefect_test.ipynb b/databricks_prefect_test.ipynb deleted file mode 100644 index 031abb9b3..000000000 --- a/databricks_prefect_test.ipynb +++ /dev/null @@ -1,187 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "656de0ec-f9bf-4303-bcb7-5413c07b801a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "22/05/17 12:33:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", - "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", - "Setting default log level to \"WARN\".\n", - "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n", - "22/05/17 12:33:00 WARN MetricsSystem: Using default name SparkStatusTracker for source because neither spark.metrics.namespace nor spark.app.id is set.\n", - "22/05/17 12:33:03 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state TERMINATED, waiting for it to start running...\n", - "22/05/17 12:33:13 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:33:23 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:33:33 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:33:44 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:33:54 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:34:04 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:34:14 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:34:24 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:34:34 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:34:44 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:34:54 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:35:04 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:35:14 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:35:25 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:35:35 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:35:45 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:35:55 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:36:05 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:36:15 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:36:25 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:36:35 WARN SparkServiceRPCClient: Cluster 0427-122644-45iadnd in state PENDING, waiting for it to start running...\n", - "22/05/17 12:37:03 WARN DBFS: DBFS listStatus on /mnt took 1136 ms\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[FileInfo(path='dbfs:/mnt/DLQ/', name='DLQ/', size=0, modificationTime=0), FileInfo(path='dbfs:/mnt/SPT/', name='SPT/', size=0, modificationTime=0), FileInfo(path='dbfs:/mnt/adls/', name='adls/', size=0, modificationTime=0), 
FileInfo(path='dbfs:/mnt/azuwevelbwdls01q/', name='azuwevelbwdls01q/', size=0, modificationTime=0), FileInfo(path='dbfs:/mnt/conformed/', name='conformed/', size=0, modificationTime=0), FileInfo(path='dbfs:/mnt/delta/', name='delta/', size=0, modificationTime=1611768799000), FileInfo(path='dbfs:/mnt/raw/', name='raw/', size=0, modificationTime=0)]\n" - ] - } - ], - "source": [ - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.types import *\n", - "from IPython.display import display\n", - "from pyspark.dbutils import DBUtils\n", - "from datetime import date\n", - "import os\n", - "\n", - "# Testing reading directory\n", - "\n", - "spark = SparkSession.builder.appName('viadot').getOrCreate()\n", - "dbutils = DBUtils(spark)\n", - "\n", - "response = dbutils.fs.ls(\"/mnt/\")\n", - "print(response)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "cb26af4f-96c9-4f8b-a632-54088dcfd304", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2022-05-17 13:51:28+0000] INFO - prefect.test | Waiting for next scheduled run at 2022-05-17T13:52:00+00:00\n", - "[2022-05-17 13:52:00+0000] INFO - prefect.FlowRunner | Beginning Flow run for 'test'\n", - "[2022-05-17 13:52:00+0000] INFO - prefect.TaskRunner | Task 'generate_fake_data': Starting task run...\n", - "[2022-05-17 13:52:00+0000] INFO - prefect.TaskRunner | Task 'generate_fake_data': Finished task run for task with final state: 'Success'\n", - "[2022-05-17 13:52:00+0000] INFO - prefect.TaskRunner | Task 'write_to_spark': Starting task run...\n", - "View job details at https://adb-1930462786844525.5.azuredatabricks.net/?o=1930462786844525#/setting/clusters/0427-122644-45iadnd/sparkUi\n", - "[2022-05-17 13:52:02+0000] INFO - prefect.TaskRunner | Task 'write_to_spark': Finished task run for task with final state: 'Success'\n", - "[2022-05-17 13:52:02+0000] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded\n", - "[2022-05-17 13:52:02+0000] INFO - prefect.test | Waiting for next scheduled run at 2022-05-17T13:53:00+00:00\n", - "[2022-05-17 13:53:00+0000] INFO - prefect.FlowRunner | Beginning Flow run for 'test'\n", - "[2022-05-17 13:53:00+0000] INFO - prefect.TaskRunner | Task 'generate_fake_data': Starting task run...\n", - "[2022-05-17 13:53:00+0000] INFO - prefect.TaskRunner | Task 'generate_fake_data': Finished task run for task with final state: 'Success'\n", - "[2022-05-17 13:53:00+0000] INFO - prefect.TaskRunner | Task 'write_to_spark': Starting task run...\n", - "View job details at https://adb-1930462786844525.5.azuredatabricks.net/?o=1930462786844525#/setting/clusters/0427-122644-45iadnd/sparkUi\n", - "[2022-05-17 13:53:02+0000] INFO - prefect.TaskRunner | Task 'write_to_spark': Finished task run for task with final state: 'Success'\n", - "[2022-05-17 13:53:02+0000] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded\n", - "[2022-05-17 13:53:02+0000] INFO - prefect.test | Waiting for next scheduled run at 2022-05-17T13:54:00+00:00\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Input \u001b[0;32mIn [41]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 21\u001b[0m write_to_spark(df)\n\u001b[1;32m 23\u001b[0m 
f\u001b[38;5;241m.\u001b[39mvisualize()\n\u001b[0;32m---> 24\u001b[0m \u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.8/site-packages/prefect/core/flow.py:1274\u001b[0m, in \u001b[0;36mFlow.run\u001b[0;34m(self, parameters, run_on_schedule, runner_cls, **kwargs)\u001b[0m\n\u001b[1;32m 1271\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m run_on_schedule \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1272\u001b[0m run_on_schedule \u001b[38;5;241m=\u001b[39m cast(\u001b[38;5;28mbool\u001b[39m, prefect\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mflows\u001b[38;5;241m.\u001b[39mrun_on_schedule)\n\u001b[0;32m-> 1274\u001b[0m state \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1275\u001b[0m \u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparameters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1276\u001b[0m \u001b[43m \u001b[49m\u001b[43mrunner_cls\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrunner_cls\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1277\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_on_schedule\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrun_on_schedule\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1278\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1279\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1281\u001b[0m \u001b[38;5;66;03m# state always should return a dict of tasks. If it's empty (meaning the run was\u001b[39;00m\n\u001b[1;32m 1282\u001b[0m \u001b[38;5;66;03m# interrupted before any tasks were executed), we set the dict manually.\u001b[39;00m\n\u001b[1;32m 1283\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m state\u001b[38;5;241m.\u001b[39m_result:\n", - "File \u001b[0;32m/usr/local/lib/python3.8/site-packages/prefect/core/flow.py:1079\u001b[0m, in \u001b[0;36mFlow._run\u001b[0;34m(self, parameters, runner_cls, run_on_schedule, **kwargs)\u001b[0m\n\u001b[1;32m 1075\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m naptime \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1076\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39minfo(\n\u001b[1;32m 1077\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWaiting for next scheduled run at \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(next_run_time)\n\u001b[1;32m 1078\u001b[0m )\n\u001b[0;32m-> 1079\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnaptime\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1081\u001b[0m error \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 1083\u001b[0m \u001b[38;5;66;03m# begin a single flow run\u001b[39;00m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "from prefect import task, Flow\n", - "from prefect.schedules import IntervalSchedule\n", - "import pandas as pd\n", - "from datetime import timedelta\n", - "import datetime\n", - "\n", - "# Flow for writing data every minute to a table\n", - "\n", - "# Convert list to pandas dataframe\n", - "@task\n", - "def 
generate_fake_data(data: list):\n", - " return pd.DataFrame(data)\n", - "\n", - "# Convert pandas dataframe to spark dataframe then write to table\n", - "@task\n", - "def write_to_spark(df: pd.DataFrame):\n", - " sparkdf = spark.createDataFrame(df)\n", - " sparkdf.write.mode(\"append\").saveAsTable(\"raw.c4c_test4\")\n", - "\n", - "# Flow scheduled for every minute\n", - "schedule = IntervalSchedule(interval=timedelta(minutes=1))\n", - "with Flow(\"test\", schedule=schedule) as f:\n", - " data_raw =[{\"Id\": \"KVSzUaILfQZXDb\" + str(datetime.datetime.now()), \"AccountId\": \"EHNYKjSZsiy\", \"Name\": \"Turner-Black\", \"FirstName\": \"Adam\", \"LastName\": \"Carter\", \"ContactEMail\": \"Adam.Carter@TurnerBlack.com\", \"MailingCity\": \"Jamesport\"}]\n", - " df = generate_fake_data(data_raw)\n", - " write_to_spark(df)\n", - " \n", - "f.run()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "1ae98113-4079-4ec1-a816-bffab1ca5a99", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-17 13:42:54.358083\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fdf7af8d-c075-4c78-a113-18b5afa1ebe1", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 76087c2cce340cc41c71795e3ab7466bb0813eeb Mon Sep 17 00:00:00 2001 From: Hummer3099 Date: Wed, 18 May 2022 15:57:20 +0200 Subject: [PATCH 058/119] =?UTF-8?q?=F0=9F=93=9D=20Fixed=20README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 54 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 17d04ac0b..532cc46cd 100644 --- a/README.md +++ b/README.md @@ -117,10 +117,42 @@ To begin using spark you must first create a Spark Session: `spark = SparkSessio 2. Set up locally 3. Test your changes with `pytest` 4. Submit a PR. The PR should contain the following: -- new/changed functionality -- tests for the changes -- changes added to `CHANGELOG.md` -- any other relevant resources updated (esp. `viadot/docs`) + - new/changed functionality + - tests for the changes + - changes added to `CHANGELOG.md` + - any other relevant resources updated (esp. `viadot/docs`) + +The general flow of working for this repository in case of forking: +1. Pull before making any changes +2. Create a new branch with +``` +git checkout -b +``` +3. Make some work on repository +4. Stage changes with +``` +git add +``` +5. Commit the changes with +``` +git commit -m +``` +__Note__: See out Style Guidelines for more information about commit messages and PR names + +6. Fetch and pull the changes that could happen while working with +``` +git fetch +git checkout / +``` +7. Push your changes on repostory using +``` +git push origin +``` +8. Use merge to finish your push to repository +``` +git checkout +git merge +``` Please follow the standards and best practices used within the library (eg. when adding tasks, see how other tasks are constructed, etc.). 
For any questions, please reach out to us here on GitHub. @@ -130,4 +162,16 @@ Please follow the standards and best practices used within the library (eg. when - commit messages should: - begin with an emoji - start with one of the following verbs, capitalized, immediately after the summary emoji: "Added", "Updated", "Removed", "Fixed", "Renamed", and, sporadically, other ones, such as "Upgraded", "Downgraded", or whatever you find relevant for your particular situation - - contain a useful description of what the commit is doing \ No newline at end of file + - contain a useful description of what the commit is doing + - contain a useful description of what the commit is doing + +## Set up Black for development in VSCode +Your code should be formatted with Black when you want to contribute. To set up Black in Visual Studio Code follow instructions below. +1. Install `black` in your environment by writing in the terminal: +``` +pip install black +``` +2. Go to the settings - gear icon in the bottom left corner and select `Settings` or type "Ctrl" + ",". +3. Find the `Format On Save` setting - check the box. +4. Find the `Python Formatting Provider` and select "black" in the drop-down list. +5. Your code should auto format on save now. \ No newline at end of file From 573fa5ec121150edddc2b10e2a97779aa61bd510 Mon Sep 17 00:00:00 2001 From: Hummer3099 Date: Wed, 18 May 2022 15:58:06 +0200 Subject: [PATCH 059/119] =?UTF-8?q?=F0=9F=93=9D=20Fixed=20README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 532cc46cd..9cc5c3f20 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,6 @@ Please follow the standards and best practices used within the library (eg. when - begin with an emoji - start with one of the following verbs, capitalized, immediately after the summary emoji: "Added", "Updated", "Removed", "Fixed", "Renamed", and, sporadically, other ones, such as "Upgraded", "Downgraded", or whatever you find relevant for your particular situation - contain a useful description of what the commit is doing - - contain a useful description of what the commit is doing ## Set up Black for development in VSCode Your code should be formatted with Black when you want to contribute. To set up Black in Visual Studio Code follow instructions below. 
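As a concrete illustration of the "formatted with Black using default settings" guideline above, here is a small, hypothetical before/after snippet (the function and its arguments are invented purely for the example; the output assumes Black's defaults of double quotes and an 88-character line length):

```
# Hand-written input: single quotes, everything crammed onto one line
def build_flow(name, query, file_path, to_path, sep='\t', if_exists='replace'): return {'name': name, 'query': query, 'file_path': file_path, 'to_path': to_path, 'sep': sep, 'if_exists': if_exists}

# The same code after running `black` with default settings
def build_flow(name, query, file_path, to_path, sep="\t", if_exists="replace"):
    return {
        "name": name,
        "query": query,
        "file_path": file_path,
        "to_path": to_path,
        "sep": sep,
        "if_exists": if_exists,
    }
```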
From c6e20d3324e09a6b9d127a5a56ead171e6239e68 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 18 May 2022 16:09:49 +0200 Subject: [PATCH 060/119] =?UTF-8?q?=E2=9C=A8=20mysql=20with=20ssh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/mysql.py | 109 ++++++++++++++++++++++++++++++++++++++++ viadot/utils.py | 25 +++++++-- 2 files changed, 131 insertions(+), 3 deletions(-) create mode 100644 viadot/sources/mysql.py diff --git a/viadot/sources/mysql.py b/viadot/sources/mysql.py new file mode 100644 index 000000000..51b6efc89 --- /dev/null +++ b/viadot/sources/mysql.py @@ -0,0 +1,109 @@ +import pymysql +import paramiko +import pandas as pd +from paramiko import SSHClient +from sshtunnel import SSHTunnelForwarder +from os.path import expanduser +import io +from viadot.tasks import AzureDataLakeToDF, AzureDataLakeUpload +from viadot.flows.adls_to_azure_sql import df_to_csv_task +from prefect import Flow, task, unmapped +from viadot.sources.base import Source +from viadot.config import local_config + +# from ..exceptions import CredentialError +from viadot.exceptions import CredentialError + + +class MySQL(Source): + def __init__( + self, + config_key: str = None, + credentials: dict = None, + *args, + **kwargs, + ): + + """A class for interacting with DuckDB. + + Args: + config_key (str, optional): The key inside local config containing the config. + User can choose to use this or pass credentials directly to the `credentials` + parameter. Defaults to None. + credentials (dict, optional): Credentials for the connection. Defaults to None. + """ + + if config_key: + config_credentials = local_config.get(config_key) + + credentials = credentials if credentials else config_credentials + if credentials is None: + raise CredentialError("Credentials not found.") + + super().__init__(*args, credentials=credentials, **kwargs) + + @property + def con(self) -> pymysql.connect: + """Return a new connection to the MySQL database. + + Returns: + pymysql.connect: database connection. 
+ """ + + if self.credentials.get("host") is None: + host = "127.0.0.1" + + conn = pymysql.connect( + host=host, + user=self.credentials.get("sql_username"), + passwd=self.credentials.get("sql_password"), + db=self.credentials.get("sql_main_database"), + port=self.credentials.get("sql_port"), + ) + return conn + + def to_df(self, query: str) -> pd.DataFrame: + data = pd.read_sql_query(query, self.con) + self.con.close() + return data + + def connect_sql_ssh( + self, + query, + ): + + if self.credentials.get("host") is None: + host = "127.0.0.1" + + sql_hostname = self.credentials.get("sql_hostname") + sql_username = self.credentials.get("sql_username") + sql_password = self.credentials.get("sql_password") + sql_db_name = self.credentials.get("sql_db_name") + sql_port = self.credentials.get("sql_port") + + ssh_host = self.credentials.get("ssh_host") + ssh_user = self.credentials.get("ssh_user") + ssh_port = self.credentials.get("ssh_port") + rsakey = self.credentials.get("rsakey") + + rsakey = self.credentials["rsakey"] + ssh_pkey = paramiko.RSAKey.from_private_key(io.StringIO(rsakey)) + + with SSHTunnelForwarder( + (ssh_host, ssh_port), + ssh_username=ssh_user, + ssh_pkey=ssh_pkey, + remote_bind_address=(sql_hostname, sql_port), + ) as ssh_tunnel: + + conn = pymysql.connect( + host=host, + user=sql_username, + passwd=sql_password, + db=sql_db_name, + port=ssh_tunnel.local_bind_port, + ) + + df = pd.read_sql_query(query, conn) + conn.close() + return df diff --git a/viadot/utils.py b/viadot/utils.py index bae5f8d6b..51fb1ce1d 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -7,11 +7,10 @@ import requests from prefect.utilities.graphql import EnumValue, with_args from requests.adapters import HTTPAdapter -from requests.exceptions import (ConnectionError, HTTPError, ReadTimeout, - Timeout) +from requests.exceptions import ConnectionError, HTTPError, ReadTimeout, Timeout from requests.packages.urllib3.util.retry import Retry from urllib3.exceptions import ProtocolError - +from itertools import chain from .exceptions import APIError @@ -325,3 +324,23 @@ def _gen_insert_query_from_records(records: List[tuple]) -> str: return insert_query else: return _gen_insert_query_from_records(tuples_escaped) + + +def union_credentials_dict(*dicts): + """Function that union list of dictionaries + + Args: + dicts (List[Dict]): list of dictionaries with credentials. + + Returns: + Dict: A single dictionary createb by union method. 
+ + Examples: + + >>> a = {"a":1} + >>> b = {"b":2} + >>> union_credentials_dict(a ,b) + {'a': 1, 'b': 2} + + """ + return dict(chain.from_iterable(dct.items() for dct in dicts)) From 52bcf4350ae881680391e36e0e2e70aac2722634 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 18 May 2022 16:46:58 +0200 Subject: [PATCH 061/119] =?UTF-8?q?=E2=9C=85=20added=20simple=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_mysql.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tests/unit/test_mysql.py diff --git a/tests/unit/test_mysql.py b/tests/unit/test_mysql.py new file mode 100644 index 000000000..0aa7873c7 --- /dev/null +++ b/tests/unit/test_mysql.py @@ -0,0 +1,19 @@ +from viadot.sources.mysql import MySQL +import pandas as pd +from unittest import mock + + +def test_create_mysql_instance(): + s = MySQL(credentials={"usr": 1, "pswd": 2}) + assert s + + +def test_connection_mysql(): + + d = {"country": [1, 2], "sales": [3, 4]} + df = pd.DataFrame(data=d) + + with mock.patch("viadot.sources.mysql.MySQL.to_df") as mock_method: + mock_method.return_value = df + + assert type(df) == pd.DataFrame From 6e25bfd7084d7695b7d1f05501ddf5a435804881 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 19 May 2022 09:26:19 +0200 Subject: [PATCH 062/119] =?UTF-8?q?=E2=9C=A8=20Added=20mysql=20to=20df=20t?= =?UTF-8?q?ask?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/mysql_to_df.py | 73 +++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 viadot/tasks/mysql_to_df.py diff --git a/viadot/tasks/mysql_to_df.py b/viadot/tasks/mysql_to_df.py new file mode 100644 index 000000000..14ded2e00 --- /dev/null +++ b/viadot/tasks/mysql_to_df.py @@ -0,0 +1,73 @@ +import json +import prefect +from typing import Any, Dict, Literal +from prefect import Task +from prefect.tasks.secrets import PrefectSecret +from .azure_key_vault import AzureKeyVaultSecret +from viadot.config import local_config +from viadot.sources.mysql import MySQL + + +class MySqlToDf(Task): + def __init__( + self, + country_short: Literal["AT", "DE", "CH"], + credentials: Dict[str, Any] = None, + query: str = None, + *args, + **kwargs, + ): + """ + Task for obtaining data from MySql source. + Args: + credentials (Dict[str, Any], optional): MySql Database credentials. Defaults to None. + query(str, optional): Query to perform on a database. Defaults to None. + country_short (Dict[str, Any], optional): country short to select proper credential. 
+ Returns: Pandas DataFrame + """ + self.credentials = credentials + self.country_short = country_short + self.query = query + + super().__init__( + name="MySqlToDf", + *args, + **kwargs, + ) + + def __call__(self, *args, **kwargs): + """Download from aselite database to df""" + return super().__call__(*args, **kwargs) + + def run( + self, + query: str, + credentials: Dict[str, Any] = None, + credentials_secret: str = None, + vault_name: str = None, + ): + logger = prefect.context.get("logger") + if not credentials_secret: + try: + credentials_secret = PrefectSecret("CONVIDERA").run() + except ValueError: + pass + + if credentials_secret: + credentials_str = AzureKeyVaultSecret( + credentials_secret, vault_name=vault_name + ).run() + credentials = json.loads(credentials_str) + logger.info("Loaded credentials from Key Vault") + else: + credentials = local_config.get("CONVIDERA") + logger.info("Loaded credentials from local source") + + country_cred = credentials.get(f"{self.country_short}") + ssh_creds = credentials.get("SSH_CREDS") + credentials_country = dict(country_cred, **ssh_creds) + mysql = MySQL(credentials=credentials_country) + logger.info("Connected to MySql Database") + df = mysql.connect_sql_ssh(query=query) + logger.info("Succefully collected data from query") + return df From 4576cd09920dd36f4831d9e6682b1ae658d0f5f3 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 19 May 2022 10:07:06 +0200 Subject: [PATCH 063/119] =?UTF-8?q?=E2=9C=A8=20Added=20mysql=20to=20adls?= =?UTF-8?q?=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + viadot/flows/mysql_to_adls.py | 86 +++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 viadot/flows/mysql_to_adls.py diff --git a/.gitignore b/.gitignore index 36fdc6f6b..81e1857e7 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,4 @@ desktop.ini # SAP RFC lib sap_netweaver_rfc +michal \ No newline at end of file diff --git a/viadot/flows/mysql_to_adls.py b/viadot/flows/mysql_to_adls.py new file mode 100644 index 000000000..bb65ac527 --- /dev/null +++ b/viadot/flows/mysql_to_adls.py @@ -0,0 +1,86 @@ +from typing import Any, Dict, List, Literal +from prefect import Flow +from viadot.task_utils import df_to_csv +from viadot.tasks import AzureDataLakeUpload +from viadot.tasks.mysql_to_df import MySqlToDf + +file_to_adls_task = AzureDataLakeUpload() + + +class MySqlToADLS(Flow): + def __init__( + self, + name: str, + country_short: Literal["AT", "DE", "CH"], + query: str = None, + sqldb_credentials_secret: str = None, + vault_name: str = None, + file_path: str = None, + sep: str = "\t", + to_path: str = None, + if_exists: Literal["replace", "append", "delete"] = "replace", + overwrite: bool = True, + sp_credentials_secret: str = None, + *args: List[any], + **kwargs: Dict[str, Any] + ): + """ + Flow for downloading data from ASElite to csv file, then uploading it to ADLS. + + Args: + name (str): The name of the flow. + query (str): Query to perform on a database. Defaults to None. + sqldb_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with + ASElite SQL Database credentials. Defaults to None. + vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. + file_path (str, optional): Local destination path. Defaults to None. + sep (str, optional): The delimiter for the output CSV file. Defaults to "\t". + to_path (str): The path to an ADLS file. 
Defaults to None. + if_exists (Literal, optional): What to do if the table exists. Defaults to "replace". + overwrite (str, optional): Whether to overwrite the destination file. Defaults to True. + sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with + ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. + remove_special_characters (str, optional): Call a function that remove special characters like escape symbols. Defaults to None. + columns_to_clean (List(str), optional): Select columns to clean, used with remove_special_characters. + If None whole data frame will be processed. Defaults to None. + """ + self.country_short = country_short + self.query = query + self.sqldb_credentials_secret = sqldb_credentials_secret + self.vault_name = vault_name + self.overwrite = overwrite + + self.file_path = file_path + self.sep = sep + self.to_path = to_path + self.if_exists = if_exists + self.sp_credentials_secret = sp_credentials_secret + + super().__init__(*args, name=name, **kwargs) + + self.gen_flow() + + def gen_flow(self) -> Flow: + + df_task = MySqlToDf(country_short=self.country_short) + + df = df_task.bind(query=self.query, flow=self) + + create_csv = df_to_csv.bind( + df, + path=self.file_path, + sep=self.sep, + if_exists=self.if_exists, + flow=self, + ) + + adls_upload = file_to_adls_task.bind( + from_path=self.file_path, + to_path=self.to_path, + overwrite=self.overwrite, + sp_credentials_secret=self.sp_credentials_secret, + flow=self, + ) + + create_csv.set_upstream(df, flow=self) + adls_upload.set_upstream(create_csv, flow=self) From c81b01e8f6abcbe83ac44f6218148ab0048baa7d Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 19 May 2022 10:08:55 +0200 Subject: [PATCH 064/119] =?UTF-8?q?=F0=9F=93=9D=20Changed=20docstrings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/mysql_to_adls.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/viadot/flows/mysql_to_adls.py b/viadot/flows/mysql_to_adls.py index bb65ac527..888abe4b7 100644 --- a/viadot/flows/mysql_to_adls.py +++ b/viadot/flows/mysql_to_adls.py @@ -25,13 +25,14 @@ def __init__( **kwargs: Dict[str, Any] ): """ - Flow for downloading data from ASElite to csv file, then uploading it to ADLS. + Flow for downloading data from MySQL to csv file, then uploading it to ADLS. Args: name (str): The name of the flow. + country_short (str): Country short to extract proper credentials. query (str): Query to perform on a database. Defaults to None. sqldb_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with - ASElite SQL Database credentials. Defaults to None. + MySQL Database credentials. Defaults to None. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. file_path (str, optional): Local destination path. Defaults to None. sep (str, optional): The delimiter for the output CSV file. Defaults to "\t". 
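Taken together, the preceding patches add a small MySQL extraction stack: the `MySQL` source opens the database connection (optionally through an SSH tunnel via `connect_sql_ssh`), the `MySqlToDf` task resolves credentials and returns a pandas DataFrame, and the `MySqlToADLS` flow dumps that DataFrame to a CSV file and uploads it to Azure Data Lake. A minimal usage sketch of the flow, with placeholder values for the query, file paths, and Key Vault secret name (adjust these to your environment):

```
from viadot.flows.mysql_to_adls import MySqlToADLS

flow = MySqlToADLS(
    "mysql_sales_to_adls",             # flow name (placeholder)
    country_short="DE",                # picks the per-country MySQL credentials
    query="SELECT * FROM `example-views`.`sales`",  # sample query
    file_path="sales.csv",             # local CSV produced by the df_to_csv task
    to_path="raw/examples/sales.csv",  # destination path in ADLS
    sp_credentials_secret="<adls-service-principal-secret>",  # Key Vault secret name (placeholder)
    overwrite=True,
)
flow.run()
```

Note that credential resolution happens inside the `MySqlToDf` task: it reads the `CONVIDERA` secret (or the local config), merges the per-country and SSH credentials, and runs the query through the SSH tunnel.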
From 78422daa5d8afabf77f3e3b9cd7a050603b812a5 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 19 May 2022 10:11:09 +0200 Subject: [PATCH 065/119] =?UTF-8?q?=F0=9F=93=9D=20renamed=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/mysql_to_df.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/tasks/mysql_to_df.py b/viadot/tasks/mysql_to_df.py index 14ded2e00..4194746cb 100644 --- a/viadot/tasks/mysql_to_df.py +++ b/viadot/tasks/mysql_to_df.py @@ -36,7 +36,7 @@ def __init__( ) def __call__(self, *args, **kwargs): - """Download from aselite database to df""" + """Download from MySQL database to df""" return super().__call__(*args, **kwargs) def run( From abe4e9d46f58d1866e59ae1bb0812aec41b0b758 Mon Sep 17 00:00:00 2001 From: Hummer3099 Date: Fri, 20 May 2022 09:54:10 +0200 Subject: [PATCH 066/119] =?UTF-8?q?=F0=9F=94=A5=20Removed=20databricks-con?= =?UTF-8?q?nect=20config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .databricks-connect | 7 ------- README.md | 25 ++++++++++++++++++++++--- 2 files changed, 22 insertions(+), 10 deletions(-) delete mode 100644 .databricks-connect diff --git a/.databricks-connect b/.databricks-connect deleted file mode 100644 index 3cc53d274..000000000 --- a/.databricks-connect +++ /dev/null @@ -1,7 +0,0 @@ -{ - "host": "", - "token": "", - "cluster_id": "", - "org_id": "", - "port": "" -} \ No newline at end of file diff --git a/README.md b/README.md index 9cc5c3f20..f9fc22e59 100644 --- a/README.md +++ b/README.md @@ -82,9 +82,6 @@ Install the library in development mode (repeat for the `viadot_jupyter_lab` con docker exec -it viadot_testing pip install -e . --user ``` -### Databricks integration -To connect to a Databricks cluster, modify `/.databricks-connect` with the desired values. Follow step 2 of this [link](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/databricks-connect) to retrieve the values from the Databricks cluster. - ## Running tests To run tests, log into the container and run pytest: @@ -104,6 +101,28 @@ FLOW_NAME=hello_world; python -m viadot.examples.$FLOW_NAME However, when developing, the easiest way is to use the provided Jupyter Lab container available in the browser at `http://localhost:9000/`. ## Executing Spark jobs locally using databricks-connect +### Setting up +To begin using spark, you must first declare the environmental variables as follows: +``` +DATABRICKS_HOST = os.getenv("DATABRICKS_HOST") +DATABRICKS_API_TOKEN = os.getenv("DATABRICKS_API_TOKEN") +DATABRICKS_ORG_ID = os.getenv("DATABRICKS_ORG_ID") +DATABRICKS_PORT = os.getenv("DATABRICKS_PORT") +DATABRICKS_CLUSTER_ID = os.getenv("DATABRICKS_CLUSTER_ID") +``` + +Alternatively, you can also create a file called `.databricks-connect` in the root directory and add the required variables there. It should follow the following format: +``` +{ + "host": "", + "token": "", + "cluster_id": "", + "org_id": "", + "port": "" +} +``` +To retrieve the values, follow step 2 in this [link](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/databricks-connect) +### Executing spark functions To begin using spark you must first create a Spark Session: `spark = SparkSession.builder.appName('session_name').getOrCreate()`. `spark` will be used to access all the spark methods. 
Here is a list of commonly used spark methods (WIP): * spark.createDataFrame(df): Create a Spark dataframe from a Pandas dataframe * sparkdf.write.saveAsTable("schema.table"): Takes a Spark Dataframe and saves it as a table in Databricks. From 8ca37418bc91031cb04e85d56ceea0ab52a8fae4 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Fri, 20 May 2022 15:47:56 +0200 Subject: [PATCH 067/119] =?UTF-8?q?=E2=9C=85=20Added=20test=20for=20flow?= =?UTF-8?q?=20and=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_mysql_to_adls.py | 24 ++++++++++++++ tests/integration/test_mysql.py | 31 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 tests/integration/flows/test_mysql_to_adls.py create mode 100644 tests/integration/test_mysql.py diff --git a/tests/integration/flows/test_mysql_to_adls.py b/tests/integration/flows/test_mysql_to_adls.py new file mode 100644 index 000000000..dc6e55092 --- /dev/null +++ b/tests/integration/flows/test_mysql_to_adls.py @@ -0,0 +1,24 @@ +from viadot.flows.mysql_to_adls import MySqlToADLS +from unittest import mock + +query = """SELECT * FROM `example-views`.`sales`""" + + +def test_instance_mysqltoadls(): + flow = MySqlToADLS("test_flow", country_short="DE") + assert flow + + +def test_adls_gen1_to_azure_sql_new_mock(TEST_PARQUET_FILE_PATH): + with mock.patch.object(MySqlToADLS, "run", return_value=True) as mock_method: + flow = MySqlToADLS( + "test_flow_de", + country_short="DE", + query=query, + file_path=TEST_PARQUET_FILE_PATH, + to_path=f"raw/examples/{TEST_PARQUET_FILE_PATH}", + sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA-DEV", + overwrite=True, + ) + flow.run() + mock_method.assert_called_with() diff --git a/tests/integration/test_mysql.py b/tests/integration/test_mysql.py new file mode 100644 index 000000000..8d6b934f9 --- /dev/null +++ b/tests/integration/test_mysql.py @@ -0,0 +1,31 @@ +from viadot.sources.mysql import MySQL +import pandas as pd +from unittest import mock + +d = {"country": [1, 2], "sales": [3, 4]} +df = pd.DataFrame(data=d) + +query = """SELECT * FROM `example-views`.`sales`""" + + +def test_create_mysql_instance(): + s = MySQL(credentials={"usr": 1, "pswd": 2}) + assert s + + +def test_connection_mysql(): + with mock.patch("viadot.sources.mysql.MySQL.to_df") as mock_method: + mock_method.return_value = df + s = MySQL(credentials={"usr": 1, "pswd": 2}) + + final_df = s.to_df(query=query) + assert type(final_df) == pd.DataFrame + + +def test_connect_mysql_ssh(): + with mock.patch("viadot.sources.mysql.MySQL.connect_sql_ssh") as mock_method: + mock_method.return_value = df + s = MySQL(credentials={"usr": 1, "pswd": 2}) + + final_df = s.connect_sql_ssh(query=query) + assert type(final_df) == pd.DataFrame From a1136b31742beed84482563cc56d327efefd7b71 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Fri, 20 May 2022 16:01:12 +0200 Subject: [PATCH 068/119] =?UTF-8?q?=E2=9C=85=20=20moved=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_mysql.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 tests/unit/test_mysql.py diff --git a/tests/unit/test_mysql.py b/tests/unit/test_mysql.py deleted file mode 100644 index 0aa7873c7..000000000 --- a/tests/unit/test_mysql.py +++ /dev/null @@ -1,19 +0,0 @@ -from viadot.sources.mysql import MySQL -import pandas as pd -from unittest import mock - - -def test_create_mysql_instance(): - s = MySQL(credentials={"usr": 1, 
"pswd": 2}) - assert s - - -def test_connection_mysql(): - - d = {"country": [1, 2], "sales": [3, 4]} - df = pd.DataFrame(data=d) - - with mock.patch("viadot.sources.mysql.MySQL.to_df") as mock_method: - mock_method.return_value = df - - assert type(df) == pd.DataFrame From e04b6386ac2083dffd4b2a773e9dd804ee32a5dc Mon Sep 17 00:00:00 2001 From: Abdallah Al Fraijat Date: Mon, 23 May 2022 09:42:06 +0200 Subject: [PATCH 069/119] Update requirements.txt --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ba7146655..831e361da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,4 +31,5 @@ sql-metadata==2.3.0 duckdb==0.3.2 google-auth==2.6.2 sendgrid==6.9.7 -pandas-gbq==0.17.4 \ No newline at end of file +pandas-gbq==0.17.4 +databricks-connect==10.4.0b0 From 78ead1a525b0c7ae5c95064f618fd364d714ea75 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 23 May 2022 11:43:49 +0200 Subject: [PATCH 070/119] =?UTF-8?q?=E2=9C=A8=20Added=20Epicor=20connector?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/flows/__init__.py | 1 + viadot/flows/epicor_to_duckdb.py | 90 ++++++++++++ viadot/sources/__init__.py | 1 + viadot/sources/epicor.py | 111 ++++++++++++++ viadot/tasks/__init__.py | 1 + viadot/tasks/epicor.py | 239 +++++++++++++++++++++++++++++++ 7 files changed, 444 insertions(+) create mode 100644 viadot/flows/epicor_to_duckdb.py create mode 100644 viadot/sources/epicor.py create mode 100644 viadot/tasks/epicor.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d309f90a2..09448be06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added new connector - Epicor. Created `Epicor` source, `EpicorToDF` task and `EpicorToDuckDB` flow. 
- Added `SQLServerToDF` task - Added `SQLServerToDuckDB` flow which downloads data from SQLServer table, loads it to parquet file and then uplads it do DuckDB - Added complete proxy set up in `SAPRFC` example (`viadot/examples/sap_rfc`) diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index 0414817c6..ea433bcfe 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -21,3 +21,4 @@ from .duckdb_to_sql_server import DuckDBToSQLServer from .multiple_flows import MultipleFlows from .sql_server_to_duckdb import SQLServerToDuckDB +from .epicor_to_duckdb import EpicorOrdersToDuckDB diff --git a/viadot/flows/epicor_to_duckdb.py b/viadot/flows/epicor_to_duckdb.py new file mode 100644 index 000000000..12fc3d055 --- /dev/null +++ b/viadot/flows/epicor_to_duckdb.py @@ -0,0 +1,90 @@ +from prefect import Flow +from typing import Any, Dict, List, Literal + +from ..tasks import EpicorOrdersToDF, DuckDBCreateTableFromParquet +from ..task_utils import df_to_parquet, add_ingestion_metadata_task + + +class EpicorOrdersToDuckDB(Flow): + def __init__( + self, + name: str, + base_url: str, + filters_xml: str, + local_file_path: str, + epicor_credentials: Dict[str, Any] = None, + epicor_config_key: str = None, + start_date_field: str = "BegInvoiceDate", + end_date_field: str = "EndInvoiceDate", + duckdb_table: str = None, + duckdb_schema: str = None, + if_exists: Literal["fail", "replace", "append", "skip", "delete"] = "fail", + duckdb_credentials: dict = None, + *args: List[any], + **kwargs: Dict[str, Any], + ): + """ + Flow for downloading orders data from Epicor API and uploading it to DuckDB using .parquet files. + + Args: + base_url (str, required): Base url to Epicor Orders. + filters_xml (str, required): Filters in form of XML. The date filter is necessary. + local_file_path (str): The path to the source Parquet file. + epicor_credentials (Dict[str, Any], optional): Credentials to connect with Epicor Api containing host, port, + username and password. Defaults to None. + epicor_config_key (str, optional): Credential key to dictionary where details are stored. Defaults to None. + start_date_field (str, optional) The name of filters filed containing start date. Defaults to "BegInvoiceDate". + end_date_field (str, optional) The name of filters filed containing end date. Defaults to "EndInvoiceDate". + duckdb_table (str, optional): Destination table in DuckDB. Defaults to None. + duckdb_schema (str, optional): Destination schema in DuckDB. Defaults to None. + if_exists (Literal, optional): What to do if the table already exists. Defaults to "fail". + duckdb_credentials (dict, optional): Credentials for the DuckDB connection. Defaults to None. 
+ """ + self.base_url = base_url + self.epicor_credentials = epicor_credentials + self.epicor_config_key = epicor_config_key + self.filters_xml = filters_xml + self.end_date_field = end_date_field + self.start_date_field = start_date_field + self.local_file_path = local_file_path + self.duckdb_table = duckdb_table + self.duckdb_schema = duckdb_schema + self.if_exists = if_exists + self.duckdb_credentials = duckdb_credentials + + super().__init__(*args, name=name, **kwargs) + + self.df_task = EpicorOrdersToDF( + base_url=self.base_url, + filters_xml=self.filters_xml, + ) + self.create_duckdb_table_task = DuckDBCreateTableFromParquet( + credentials=duckdb_credentials + ) + + self.gen_flow() + + def gen_flow(self) -> Flow: + df = self.df_task.bind( + flow=self, + credentials=self.epicor_credentials, + config_key=self.epicor_config_key, + end_date_field=self.end_date_field, + start_date_field=self.start_date_field, + ) + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) + + parquet = df_to_parquet.bind( + df=df_with_metadata, + path=self.local_file_path, + if_exists=self.if_exists, + flow=self, + ) + create_duckdb_table = self.create_duckdb_table_task.bind( + path=self.local_file_path, + schema=self.duckdb_schema, + table=self.duckdb_table, + if_exists=self.if_exists, + flow=self, + ) + create_duckdb_table.set_upstream(parquet, flow=self) diff --git a/viadot/sources/__init__.py b/viadot/sources/__init__.py index 9e22bdecf..5ef7e1e72 100644 --- a/viadot/sources/__init__.py +++ b/viadot/sources/__init__.py @@ -18,3 +18,4 @@ from .sqlite import SQLite from .duckdb import DuckDB from .sql_server import SQLServer +from .epicor import Epicor diff --git a/viadot/sources/epicor.py b/viadot/sources/epicor.py new file mode 100644 index 000000000..299505c4e --- /dev/null +++ b/viadot/sources/epicor.py @@ -0,0 +1,111 @@ +import requests +from typing import Any, Dict +import xml.etree.ElementTree as ET + +from .base import Source +from ..config import local_config +from ..exceptions import CredentialError, DataRangeError + + +class Epicor(Source): + def __init__( + self, + base_url: str, + filters_xml: str, + credentials: Dict[str, Any] = None, + config_key: str = None, + start_date_field: str = "BegInvoiceDate", + end_date_field: str = "EndInvoiceDate", + *args, + **kwargs, + ): + """ + Class to connect to Epicor API and download results. + + Args: + base_url (str, required): Base url to Epicor Orders. + filters_xml (str, required): Filters in form of XML. The date filter is necessary. + credentials (Dict[str, Any], optional): Credentials to connect with Epicor Api containing host, port, username and password. + Defaults to None. + config_key (str, optional): Credential key to dictionary where details are stored. + start_date_field (str, optional) The name of filters filed containing start date. Defaults to "BegInvoiceDate". + end_date_field (str, optional) The name of filters filed containing end date. Defaults to "EndInvoiceDate". 
+ """ + DEFAULT_CREDENTIALS = local_config.get(config_key) + credentials = credentials or DEFAULT_CREDENTIALS + + if credentials is None: + raise CredentialError("Credentials not found.") + + self.credentials = credentials + self.config_key = config_key + self.base_url = base_url + self.filters_xml = filters_xml + self.start_date_field = start_date_field + self.end_date_field = end_date_field + + super().__init__(*args, credentials=credentials, **kwargs) + + def generate_token(self) -> str: + "Function to generate API access token that last 24 hours" + + url = ( + "http://" + + self.credentials["host"] + + ":" + + str(self.credentials["port"]) + + "/api/security/token/?username=" + + self.credentials["username"] + + "&password=" + + self.credentials["password"] + ) + + payload = {} + files = {} + headers = { + "Content-Type": "application/xml", + } + + response = requests.request( + "POST", url, headers=headers, data=payload, files=files + ) + + return response.text + + def generate_url(self) -> str: + "Function to generate url to download data" + + return ( + "http://" + + self.credentials["host"] + + ":" + + str(self.credentials["port"]) + + self.base_url + + "?token=" + + str(self.generate_token()) + ) + + def check_filter(self) -> None: + "Function checking if user had specified date range filters." + + root = ET.fromstring(self.filters_xml) + for child in root: + for subchild in child: + if ( + subchild.tag == self.start_date_field + or subchild.tag == self.end_date_field + ) and subchild.text == None: + raise DataRangeError( + "The data filter must be provided due to full data size." + ) + + def get_xml_data(self): + "Function for getting response from Epicor API" + + self.check_filter() + payload = self.filters_xml + url = self.generate_url() + headers = {"Content-Type": "application/xml"} + response = requests.request("POST", url, headers=headers, data=payload) + + return response diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index b7ebb0953..cf9bed241 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -40,3 +40,4 @@ from .duckdb import DuckDBCreateTableFromParquet, DuckDBQuery, DuckDBToDF from .sql_server import SQLServerCreateTable, SQLServerToDF +from .epicor import EpicorOrdersToDF diff --git a/viadot/tasks/epicor.py b/viadot/tasks/epicor.py new file mode 100644 index 000000000..dbdffffbe --- /dev/null +++ b/viadot/tasks/epicor.py @@ -0,0 +1,239 @@ +import pandas as pd +import xml.etree.ElementTree as ET +from xml.etree.ElementTree import fromstring + +from prefect import Task +from prefect.utilities.tasks import defaults_from_attrs +from typing import Any, Dict, List, Optional + +from ..sources import Epicor + +from pydantic import BaseModel + + +class HeaderInformation(BaseModel): + CompanyNumber: Optional[str] + OrderNumber: Optional[str] + InvoiceNumber: Optional[str] + CustomerNumber: Optional[str] + CustomerDescription: Optional[str] + CustomerPurchaseOrderNumber: Optional[str] + Contact: Optional[str] + SellingWarehouse: Optional[str] + ShippingWarehouse: Optional[str] + ShippingMethod: Optional[str] + PaymentTerms: Optional[str] + PaymentTermsDescription: Optional[str] + FreightTermsDescription: Optional[str] + SalesRepOne: Optional[str] + SalesRepOneDescription: Optional[str] + EntryDate: Optional[str] + OrderDate: Optional[str] + RequiredDate: Optional[str] + ShippedDate: Optional[str] + InvoiceDate: Optional[str] + ShipToAddress: Optional[List[Any]] + TrackingNumbers: Optional[List[Any]] + InvoiceTotals: Optional[List[Any]] + + 
+class ShipToAddress(BaseModel): + ShipToNumber: Optional[str] + Attention: Optional[str] + AddressLine1: Optional[str] + AddressLine2: Optional[str] + City: Optional[str] + State: Optional[str] + Zip: Optional[str] + Country: Optional[str] + EmailAddress: Optional[str] + PhoneNumber: Optional[str] + FaxNumber: Optional[str] + + +class TrackingNumbers(BaseModel): + TrackingNumber: Optional[str] + + +class InvoiceTotals(BaseModel): + Merchandise: Optional[str] + InboundFreight: Optional[str] + OutboundFreight: Optional[str] + Handling: Optional[str] + Delivery: Optional[str] + Pickup: Optional[str] + Restocking: Optional[str] + MinimumCharge: Optional[str] + DiscountAllowance: Optional[str] + SalesTax: Optional[str] + TotalInvoice: Optional[str] + + +class LineItemDetail(BaseModel): + ProductNumber: Optional[str] + ProductDescription1: Optional[str] + ProductDescription2: Optional[str] + LineItemNumber: Optional[str] + QuantityOrdered: Optional[str] + QuantityShipped: Optional[str] + Price: Optional[str] + UnitOfMeasure: Optional[str] + ExtendedPrice: Optional[str] + QuantityShippedExtension: Optional[str] + LineItemShipWarehouse: Optional[str] + + +class LineItemDetails(BaseModel): + LineItemDetail: Optional[LineItemDetail] + + +class Order(BaseModel): + HeaderInformation: Optional[HeaderInformation] + LineItemDetails: Optional[LineItemDetails] + + +class Orders(BaseModel): + Order: Optional[List[Any]] + + +def parse_orders_xml(xml_data: str) -> pd.DataFrame: + """ + Function to parse xml containing Epicor Orders Data. + + Args: + xml_data (str, required): Response from Epicor API in form of xml + Returns: + pd.DataFrame: DataFrame containing parsed orders data. + + """ + full_df = pd.DataFrame() + ship_dict = {} + invoice_dict = {} + header_params_dict = {} + item_params_dict = {} + almost_full = {} + + root = ET.fromstring(xml_data.text) + for order in root.findall("Order"): + for header in order.findall("HeaderInformation"): + for tracking_numbers in header.findall("TrackingNumbers"): + numbers = "" + for tracking_number in tracking_numbers.findall("TrackingNumber"): + numbers = numbers + "'" + tracking_number.text + "'" + result_numbers = TrackingNumbers(TrackingNumber=numbers) + almost_full.update(result_numbers) + + for shipto in header.findall("ShipToAddress"): + for ship_param in ShipToAddress.__dict__.get("__annotations__"): + try: + ship_value = shipto.find(f"{ship_param}").text + except: + ship_value = None + ship_parameter = {ship_param: ship_value} + ship_dict.update(ship_parameter) + almost_full.update(ship_dict) + + for invoice in header.findall("InvoiceTotals"): + for invoice_param in InvoiceTotals.__dict__.get("__annotations__"): + try: + invoice_value = invoice.find(f"{invoice_param}").text + except: + invoice_value = None + invoice_parameter = {invoice_param: invoice_value} + invoice_dict.update(invoice_parameter) + almost_full.update(invoice_dict) + + for header_param in HeaderInformation.__dict__.get("__annotations__"): + try: + header_value = header.find(f"{header_param}").text + except: + header_value = None + header_parameter = {header_param: header_value} + header_params_dict.update(header_parameter) + almost_full.update(header_params_dict) + + for items in order.findall("LineItemDetails"): + for item in items.findall("LineItemDetail"): + for item_param in LineItemDetail.__dict__.get("__annotations__"): + try: + item_value = item.find(f"{item_param}").text + except: + item_value = None + item_parameter = {item_param: item_value} + 
item_params_dict.update(item_parameter) + almost_full.update(item_params_dict) + full_df = full_df.append(almost_full, ignore_index=True) + return full_df + + +class EpicorOrdersToDF(Task): + def __init__( + self, + base_url: str, + filters_xml: str, + credentials: Dict[str, Any] = None, + config_key: str = None, + start_date_field: str = "BegInvoiceDate", + end_date_field: str = "EndInvoiceDate", + *args, + **kwargs, + ) -> pd.DataFrame: + """ + Task for downloading and parsing orders data from Epicor API to a pandas DataFrame. + + Args: + name (str): The name of the flow. + base_url (str, required): Base url to Epicor Orders. + filters_xml (str, required): Filters in form of XML. The date filter is necessary. + credentials (Dict[str, Any], optional): Credentials to connect with Epicor Api containing host, port, username and password. Defaults to None. + config_key (str, optional): Credential key to dictionary where details are stored. Defauls to None. + start_date_field (str, optional) The name of filters filed containing start date. Defaults to "BegInvoiceDate". + end_date_field (str, optional) The name of filters filed containing end date. Defaults to "EndInvoiceDate". + + Returns: + pd.DataFrame: DataFrame with parsed API output + """ + self.credentials = credentials + self.config_key = config_key + self.base_url = base_url + self.filters_xml = filters_xml + self.start_date_field = start_date_field + self.end_date_field = end_date_field + super().__init__( + name="EpicorOrders_to_df", + *args, + **kwargs, + ) + + def __call__(self, *args, **kwargs): + """Load Epicor Orders to DF""" + return super().__call__(*args, **kwargs) + + @defaults_from_attrs( + "credentials", + "config_key", + "base_url", + "filters_xml", + "start_date_field", + "end_date_field", + ) + def run( + self, + credentials: Dict[str, Any] = None, + config_key: str = None, + base_url: str = None, + filters_xml: str = None, + start_date_field: str = None, + end_date_field: str = None, + ): + epicor = Epicor( + credentials=credentials, + config_key=config_key, + base_url=base_url, + filters_xml=filters_xml, + start_date_field=start_date_field, + end_date_field=end_date_field, + ) + data = epicor.get_xml_data() + df = parse_orders_xml(data) + return df From a39c7c190bc74d6257ebe0d244dceb478ea30097 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 23 May 2022 11:44:23 +0200 Subject: [PATCH 071/119] =?UTF-8?q?=E2=9C=A8=20Added=20DataRangeError=20ex?= =?UTF-8?q?ception?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/exceptions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/viadot/exceptions.py b/viadot/exceptions.py index 807a92976..b00272f95 100644 --- a/viadot/exceptions.py +++ b/viadot/exceptions.py @@ -12,3 +12,7 @@ class CredentialError(Exception): class DBDataAccessError(Exception): pass + + +class DataRangeError(Exception): + pass From 4a9009ac7d84de80d0cf27683047014399806184 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 23 May 2022 12:34:20 +0200 Subject: [PATCH 072/119] =?UTF-8?q?=F0=9F=8E=A8=20=20Added=20pydantic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ba7146655..1f0ca8976 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,4 +31,5 @@ sql-metadata==2.3.0 duckdb==0.3.2 google-auth==2.6.2 sendgrid==6.9.7 -pandas-gbq==0.17.4 \ No newline at end 
of file +pandas-gbq==0.17.4 +pydantic==1.9.0 \ No newline at end of file From aa3305aecb35a063d6ddeee35a5556d33b15b6fe Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 23 May 2022 14:08:28 +0200 Subject: [PATCH 073/119] =?UTF-8?q?=F0=9F=8E=A8=20Renamed=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/epicor.py | 2 +- viadot/tasks/epicor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/sources/epicor.py b/viadot/sources/epicor.py index 299505c4e..4237c8cd1 100644 --- a/viadot/sources/epicor.py +++ b/viadot/sources/epicor.py @@ -99,7 +99,7 @@ def check_filter(self) -> None: "The data filter must be provided due to full data size." ) - def get_xml_data(self): + def get_xml_response(self): "Function for getting response from Epicor API" self.check_filter() diff --git a/viadot/tasks/epicor.py b/viadot/tasks/epicor.py index dbdffffbe..540fc4a2d 100644 --- a/viadot/tasks/epicor.py +++ b/viadot/tasks/epicor.py @@ -234,6 +234,6 @@ def run( start_date_field=start_date_field, end_date_field=end_date_field, ) - data = epicor.get_xml_data() + data = epicor.get_xml_response() df = parse_orders_xml(data) return df From 8bbd44b456e1615ddeb23f0f9b11a3e7e432e4c4 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 23 May 2022 14:09:17 +0200 Subject: [PATCH 074/119] =?UTF-8?q?=E2=9C=85=20Added=20epicor=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_epicor_to_duckdb.py | 44 ++++++++++++++++ tests/integration/tasks/test_epicor.py | 20 ++++++++ tests/integration/test_epicor.py | 50 +++++++++++++++++++ 3 files changed, 114 insertions(+) create mode 100644 tests/integration/flows/test_epicor_to_duckdb.py create mode 100644 tests/integration/tasks/test_epicor.py create mode 100644 tests/integration/test_epicor.py diff --git a/tests/integration/flows/test_epicor_to_duckdb.py b/tests/integration/flows/test_epicor_to_duckdb.py new file mode 100644 index 000000000..16f933012 --- /dev/null +++ b/tests/integration/flows/test_epicor_to_duckdb.py @@ -0,0 +1,44 @@ +import os + +from viadot.tasks import DuckDBToDF, DuckDBQuery +from viadot.flows import EpicorOrdersToDuckDB +from viadot.config import local_config + +TABLE = "test_epicor" +SCHEMA = "sandbox" +LOCAL_PATH = "test_epicor.parquet" + + +def test_epicor_to_duckdb(): + duckdb_creds = {"database": "/home/viadot/database/test.duckdb"} + flow = EpicorOrdersToDuckDB( + name="test", + epicor_config_key="Epicor", + base_url=local_config.get("Epicor").get("test_url"), + filters_xml=""" + + + 001 + 2022-05-16 + 2022-05-16 + 3 + + """, + if_exists="replace", + duckdb_table=TABLE, + duckdb_schema=SCHEMA, + duckdb_credentials=duckdb_creds, + local_file_path=LOCAL_PATH, + ) + + result = flow.run() + assert result.is_successful() + + df_task = DuckDBToDF(credentials=duckdb_creds) + df = df_task.run(table=TABLE, schema=SCHEMA) + + assert df.shape == (24, 58) + + run_query = DuckDBQuery() + run_query.run(query=f"DROP TABLE {SCHEMA}.{TABLE}", credentials=duckdb_creds) + os.remove(LOCAL_PATH) diff --git a/tests/integration/tasks/test_epicor.py b/tests/integration/tasks/test_epicor.py new file mode 100644 index 000000000..92d8336f2 --- /dev/null +++ b/tests/integration/tasks/test_epicor.py @@ -0,0 +1,20 @@ +from viadot.tasks import EpicorOrdersToDF +from viadot.config import local_config + + +def test_epicor_orders_to_df(): + task = EpicorOrdersToDF( + config_key="Epicor", + 
base_url=local_config.get("Epicor").get("test_url"), + filters_xml=""" + + + 001 + 2022-05-16 + 2022-05-16 + 3 + +""", + ) + df = task.run() + assert df.shape == (24, 57) diff --git a/tests/integration/test_epicor.py b/tests/integration/test_epicor.py new file mode 100644 index 000000000..f3d7aa418 --- /dev/null +++ b/tests/integration/test_epicor.py @@ -0,0 +1,50 @@ +import pytest + +from viadot.sources import Epicor +from viadot.config import local_config +from viadot.exceptions import DataRangeError + + +@pytest.fixture(scope="session") +def epicor(): + epicor = Epicor( + base_url=local_config.get("Epicor").get("test_url"), + config_key="Epicor", + filters_xml=""" + + + 001 + 2022-05-16 + 2022-05-16 + 3 + + """, + ) + yield epicor + + +@pytest.fixture(scope="session") +def epicor_error(): + epicor_error = Epicor( + base_url=local_config.get("Epicor").get("test_url"), + config_key="Epicor", + filters_xml=""" + + + 001 + + 2022-05-16 + 3 + + """, + ) + yield epicor_error + + +def test_connection(epicor): + assert epicor.get_xml_response().status_code == 200 + + +def test_check_filter(epicor_error): + with pytest.raises(DataRangeError): + epicor_error.check_filter() From 23a7699fad9d874a2ebfb14e65403c335160fc16 Mon Sep 17 00:00:00 2001 From: afraijat Date: Mon, 23 May 2022 15:21:36 +0200 Subject: [PATCH 075/119] =?UTF-8?q?=F0=9F=90=9B=20Added=20lines=20deleted?= =?UTF-8?q?=20by=20mistake=20to=20Dockerfile?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 929fa1d4d..a0ee8ad72 100644 --- a/.gitignore +++ b/.gitignore @@ -152,4 +152,7 @@ desktop.ini .viminfo # SAP RFC lib -sap_netweaver_rfc \ No newline at end of file +sap_netweaver_rfc + +# Databricks-connect +.databricks-conenct \ No newline at end of file From 8d527640ed952a442aa44950968aa9983cc6999b Mon Sep 17 00:00:00 2001 From: afraijat Date: Mon, 23 May 2022 15:23:54 +0200 Subject: [PATCH 076/119] =?UTF-8?q?=F0=9F=90=9B=20Added=20previously=20rem?= =?UTF-8?q?oved=20lines=20to=20Dockerfile?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 +- CHANGELOG.md | 9 ++++++--- README.md | 13 +++++++------ docker/Dockerfile | 37 ++++++++++++++++++++++--------------- requirements.txt | 3 ++- 5 files changed, 38 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index a0ee8ad72..ee322950e 100644 --- a/.gitignore +++ b/.gitignore @@ -155,4 +155,4 @@ desktop.ini sap_netweaver_rfc # Databricks-connect -.databricks-conenct \ No newline at end of file +.databricks-connect \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ca37d15c..f2ce51cef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 +6,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - - Added `.databricks-connect` file for configuring databricks connectivity + - Enabled Databricks Connect in the image. 
Specify the variables [this way](./README.md#executing-spark-jobs) + ### Changed - - Changed README with added databricks/spark usage instructions - - Modified dockerfile to accomodate databricks installation + - Changed README with added Databricks/Spark usage instructions + - Modified Dockerfile to accomodate Databricks installation + + ## [0.4.3] - 2022-04-28 ### Added - Added `adls_file_name` in `SupermetricsToADLS` and `SharepointToADLS` flows diff --git a/README.md b/README.md index f9fc22e59..bad186ba7 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ FLOW_NAME=hello_world; python -m viadot.examples.$FLOW_NAME However, when developing, the easiest way is to use the provided Jupyter Lab container available in the browser at `http://localhost:9000/`. -## Executing Spark jobs locally using databricks-connect +## Executing Spark jobs ### Setting up To begin using spark, you must first declare the environmental variables as follows: ``` @@ -111,7 +111,7 @@ DATABRICKS_PORT = os.getenv("DATABRICKS_PORT") DATABRICKS_CLUSTER_ID = os.getenv("DATABRICKS_CLUSTER_ID") ``` -Alternatively, you can also create a file called `.databricks-connect` in the root directory and add the required variables there. It should follow the following format: +Alternatively, you can also create a file called `.databricks-connect` in the root directory of viadot and add the required variables there. It should follow the following format: ``` { "host": "", @@ -122,12 +122,13 @@ Alternatively, you can also create a file called `.databricks-connect` in the ro } ``` To retrieve the values, follow step 2 in this [link](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/databricks-connect) + ### Executing spark functions -To begin using spark you must first create a Spark Session: `spark = SparkSession.builder.appName('session_name').getOrCreate()`. `spark` will be used to access all the spark methods. Here is a list of commonly used spark methods (WIP): -* spark.createDataFrame(df): Create a Spark dataframe from a Pandas dataframe -* sparkdf.write.saveAsTable("schema.table"): Takes a Spark Dataframe and saves it as a table in Databricks. +To begin using Spark, you must first create a Spark Session: `spark = SparkSession.builder.appName('session_name').getOrCreate()`. `spark` will be used to access all the spark methods. Here is a list of commonly used Spark methods (WIP): +* `spark.createDataFrame(df)`: Create a Spark DataFrame from a Pandas DataFrame +* `sparkdf.write.saveAsTable("schema.table")`: Takes a Spark DataFrame and saves it as a table in Databricks. 
* Ensure to use the correct schema, as it should be created and specified by the administrator -* table = spark.sql("select * from schema.table"): example of a simple query run through Python +* `table = spark.sql("select * from schema.table")`: example of a simple query ran through Python ## How to contribute diff --git a/docker/Dockerfile b/docker/Dockerfile index 1ddc79f69..4fbd99096 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM prefecthq/prefect:latest-python3.8 +FROM prefecthq/prefect:0.15.11-python3.8 # Add user RUN useradd --create-home viadot && \ chown -R viadot /home/viadot && \ @@ -14,25 +14,34 @@ RUN groupadd docker && \ RUN echo "Acquire::Check-Valid-Until \"false\";\nAcquire::Check-Date \"false\";" | cat > /etc/apt/apt.conf.d/10no--check-valid-until # System packages -RUN apt update && yes | apt install vim unixodbc-dev build-essential \ - curl python3-dev libboost-all-dev libpq-dev graphviz python3-gi sudo git +RUN apt update -q && yes | apt install -q vim unixodbc-dev build-essential \ + curl python3-dev libboost-all-dev libpq-dev graphviz python3-gi sudo git software-properties-common RUN pip install --upgrade cffi RUN curl http://archive.ubuntu.com/ubuntu/pool/main/g/glibc/multiarch-support_2.27-3ubuntu1_amd64.deb \ -o multiarch-support_2.27-3ubuntu1_amd64.deb && \ - apt install ./multiarch-support_2.27-3ubuntu1_amd64.deb + apt install -q ./multiarch-support_2.27-3ubuntu1_amd64.deb + +# Fix for old SQL Servers still using TLS < 1.2 +RUN chmod +rwx /usr/lib/ssl/openssl.cnf && \ + sed -i 's/SECLEVEL=2/SECLEVEL=1/g' /usr/lib/ssl/openssl.cnf # ODBC -- make sure to pin driver version as it's reflected in odbcinst.ini RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \ curl https://packages.microsoft.com/config/debian/10/prod.list > /etc/apt/sources.list.d/mssql-release.list && \ - apt update && \ - apt install libsqliteodbc && \ - ACCEPT_EULA=Y apt install -y msodbcsql17=17.8.1.1-1 && \ - ACCEPT_EULA=Y apt install -y mssql-tools && \ + apt update -q && \ + apt install -q libsqliteodbc && \ + ACCEPT_EULA=Y apt install -q -y msodbcsql17=17.8.1.1-1 && \ + ACCEPT_EULA=Y apt install -q -y mssql-tools && \ echo 'export PATH="$PATH:/opt/mssql-tools/bin"' >> ~/.bashrc COPY docker/odbcinst.ini /etc +# This one's needed for the SAP RFC connector. +# It must be installed here as the SAP package does not define its dependencies, +# so `pip install pyrfc` breaks if all deps are not already present. +RUN pip install cython==0.29.24 + # Python env WORKDIR /code COPY requirements.txt /code/ @@ -42,15 +51,13 @@ RUN pip install -r requirements.txt COPY . . RUN pip install . 
-# Instaling databricks-connect -RUN apt-get update && apt-get -y install sudo -RUN sudo apt-get -y install software-properties-common -## Install Java 8 -RUN curl https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public | sudo apt-key add - && \ +## Install Java 11 +RUN curl https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public | apt-key add - && \ add-apt-repository --yes https://adoptopenjdk.jfrog.io/adoptopenjdk/deb/ && \ - apt update && \ - apt install adoptopenjdk-11-hotspot -y && \ + apt update -q && \ + apt install -q adoptopenjdk-11-hotspot -y && \ find /usr/bin/java -type d -exec chmod 777 {} \; + ### Export env variable ENV SPARK_HOME /usr/local/lib/python3.8/site-packages/pyspark RUN export SPARK_HOME diff --git a/requirements.txt b/requirements.txt index ba7146655..70463f7f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,4 +31,5 @@ sql-metadata==2.3.0 duckdb==0.3.2 google-auth==2.6.2 sendgrid==6.9.7 -pandas-gbq==0.17.4 \ No newline at end of file +pandas-gbq==0.17.4 +databricks-connect==10.4.0b0 \ No newline at end of file From 1e6a7de2c19a1f9960fc30aad247dad8ae7275e2 Mon Sep 17 00:00:00 2001 From: afraijat Date: Mon, 23 May 2022 15:29:39 +0200 Subject: [PATCH 077/119] =?UTF-8?q?=F0=9F=9A=91=20Removed=20merge=20confli?= =?UTF-8?q?ct=20message=20in=20requirements?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index c1d4d0424..831e361da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,8 +32,4 @@ duckdb==0.3.2 google-auth==2.6.2 sendgrid==6.9.7 pandas-gbq==0.17.4 -<<<<<<< HEAD databricks-connect==10.4.0b0 -======= -databricks-connect==10.4.0b0 ->>>>>>> e04b6386ac2083dffd4b2a773e9dd804ee32a5dc From dcc2a02a459b24cb32140fe00d61cfdeb97521ca Mon Sep 17 00:00:00 2001 From: Abdallah Al Fraijat Date: Tue, 24 May 2022 13:31:02 +0200 Subject: [PATCH 078/119] =?UTF-8?q?=F0=9F=93=9D=20Fixed=20formatting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index bad186ba7..91c8e6207 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,7 @@ However, when developing, the easiest way is to use the provided Jupyter Lab con ## Executing Spark jobs ### Setting up -To begin using spark, you must first declare the environmental variables as follows: +To begin using Spark, you must first declare the environmental variables as follows: ``` DATABRICKS_HOST = os.getenv("DATABRICKS_HOST") DATABRICKS_API_TOKEN = os.getenv("DATABRICKS_API_TOKEN") @@ -123,8 +123,8 @@ Alternatively, you can also create a file called `.databricks-connect` in the ro ``` To retrieve the values, follow step 2 in this [link](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/databricks-connect) -### Executing spark functions -To begin using Spark, you must first create a Spark Session: `spark = SparkSession.builder.appName('session_name').getOrCreate()`. `spark` will be used to access all the spark methods. Here is a list of commonly used Spark methods (WIP): +### Executing Spark functions +To begin using Spark, you must first create a Spark Session: `spark = SparkSession.builder.appName('session_name').getOrCreate()`. `spark` will be used to access all the Spark methods. 
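As a quick illustration, the methods listed below can be combined along these lines; `sandbox.example_table` is a placeholder schema and table name that would need to be available in the workspace.
```
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('table_roundtrip').getOrCreate()

# Save a Spark DataFrame as a Databricks table, then read it back with SQL.
spark_df = spark.createDataFrame(pd.DataFrame({"id": [1, 2]}))
spark_df.write.mode("overwrite").saveAsTable("sandbox.example_table")

table = spark.sql("SELECT * FROM sandbox.example_table")
table.show()
```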
Here is a list of commonly used Spark methods (WIP): * `spark.createDataFrame(df)`: Create a Spark DataFrame from a Pandas DataFrame * `sparkdf.write.saveAsTable("schema.table")`: Takes a Spark DataFrame and saves it as a table in Databricks. * Ensure to use the correct schema, as it should be created and specified by the administrator @@ -193,4 +193,4 @@ pip install black 2. Go to the settings - gear icon in the bottom left corner and select `Settings` or type "Ctrl" + ",". 3. Find the `Format On Save` setting - check the box. 4. Find the `Python Formatting Provider` and select "black" in the drop-down list. -5. Your code should auto format on save now. \ No newline at end of file +5. Your code should auto format on save now. From 41a97bca033b349e642e72147328d640090875ef Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 24 May 2022 17:18:39 +0200 Subject: [PATCH 079/119] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Added=20key=20vaul?= =?UTF-8?q?t=20option?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/mysql_to_adls.py | 7 ++++++- viadot/tasks/mysql_to_df.py | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/viadot/flows/mysql_to_adls.py b/viadot/flows/mysql_to_adls.py index 888abe4b7..4a29281c3 100644 --- a/viadot/flows/mysql_to_adls.py +++ b/viadot/flows/mysql_to_adls.py @@ -21,6 +21,7 @@ def __init__( if_exists: Literal["replace", "append", "delete"] = "replace", overwrite: bool = True, sp_credentials_secret: str = None, + credentials_secret: str = None, *args: List[any], **kwargs: Dict[str, Any] ): @@ -41,6 +42,7 @@ def __init__( overwrite (str, optional): Whether to overwrite the destination file. Defaults to True. sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. + credentials_secret (str, optional): Key Vault name. Defaults to None. remove_special_characters (str, optional): Call a function that remove special characters like escape symbols. Defaults to None. columns_to_clean (List(str), optional): Select columns to clean, used with remove_special_characters. If None whole data frame will be processed. Defaults to None. 
@@ -56,6 +58,7 @@ def __init__( self.to_path = to_path self.if_exists = if_exists self.sp_credentials_secret = sp_credentials_secret + self.credentials_secret = credentials_secret super().__init__(*args, name=name, **kwargs) @@ -65,7 +68,9 @@ def gen_flow(self) -> Flow: df_task = MySqlToDf(country_short=self.country_short) - df = df_task.bind(query=self.query, flow=self) + df = df_task.bind( + credentials_secret=self.credentials_secret, query=self.query, flow=self + ) create_csv = df_to_csv.bind( df, diff --git a/viadot/tasks/mysql_to_df.py b/viadot/tasks/mysql_to_df.py index 4194746cb..d794aa926 100644 --- a/viadot/tasks/mysql_to_df.py +++ b/viadot/tasks/mysql_to_df.py @@ -50,6 +50,7 @@ def run( if not credentials_secret: try: credentials_secret = PrefectSecret("CONVIDERA").run() + logger.info("Loaded credentials from Key Vault") except ValueError: pass From bf7ec9997d99de92aee6f34b44aa665ae8edcf92 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 30 May 2022 08:45:59 +0200 Subject: [PATCH 080/119] =?UTF-8?q?=F0=9F=8E=A8=20Corrected=20docstrings?= =?UTF-8?q?=20and=20function=20name?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_epicor.py | 6 +++--- viadot/flows/epicor_to_duckdb.py | 10 +++++----- viadot/sources/epicor.py | 14 +++++++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/integration/test_epicor.py b/tests/integration/test_epicor.py index f3d7aa418..cfb4aaa09 100644 --- a/tests/integration/test_epicor.py +++ b/tests/integration/test_epicor.py @@ -42,9 +42,9 @@ def epicor_error(): def test_connection(epicor): - assert epicor.get_xml_response().status_code == 200 + assert epicor.get_xml_response().ok -def test_check_filter(epicor_error): +def test_validate_filter(epicor_error): with pytest.raises(DataRangeError): - epicor_error.check_filter() + epicor_error.validate_filter() diff --git a/viadot/flows/epicor_to_duckdb.py b/viadot/flows/epicor_to_duckdb.py index 12fc3d055..aa5ed6f36 100644 --- a/viadot/flows/epicor_to_duckdb.py +++ b/viadot/flows/epicor_to_duckdb.py @@ -24,17 +24,17 @@ def __init__( **kwargs: Dict[str, Any], ): """ - Flow for downloading orders data from Epicor API and uploading it to DuckDB using .parquet files. + Flow for downloading orders data from Epicor API and uploading it to DuckDB using Parquet files. Args: base_url (str, required): Base url to Epicor Orders. - filters_xml (str, required): Filters in form of XML. The date filter is necessary. + filters_xml (str, required): Filters in form of XML. The date filter is required. local_file_path (str): The path to the source Parquet file. - epicor_credentials (Dict[str, Any], optional): Credentials to connect with Epicor Api containing host, port, + epicor_credentials (Dict[str, Any], optional): Credentials to connect with Epicor API containing host, port, username and password. Defaults to None. epicor_config_key (str, optional): Credential key to dictionary where details are stored. Defaults to None. - start_date_field (str, optional) The name of filters filed containing start date. Defaults to "BegInvoiceDate". - end_date_field (str, optional) The name of filters filed containing end date. Defaults to "EndInvoiceDate". + start_date_field (str, optional) The name of filters field containing start date. Defaults to "BegInvoiceDate". + end_date_field (str, optional) The name of filters field containing end date. Defaults to "EndInvoiceDate". duckdb_table (str, optional): Destination table in DuckDB. 
Defaults to None. duckdb_schema (str, optional): Destination schema in DuckDB. Defaults to None. if_exists (Literal, optional): What to do if the table already exists. Defaults to "fail". diff --git a/viadot/sources/epicor.py b/viadot/sources/epicor.py index 4237c8cd1..58cc5dd2c 100644 --- a/viadot/sources/epicor.py +++ b/viadot/sources/epicor.py @@ -24,12 +24,12 @@ def __init__( Args: base_url (str, required): Base url to Epicor Orders. - filters_xml (str, required): Filters in form of XML. The date filter is necessary. - credentials (Dict[str, Any], optional): Credentials to connect with Epicor Api containing host, port, username and password. + filters_xml (str, required): Filters in form of XML. The date filter is required. + credentials (Dict[str, Any], optional): Credentials to connect with Epicor API containing host, port, username and password. Defaults to None. config_key (str, optional): Credential key to dictionary where details are stored. - start_date_field (str, optional) The name of filters filed containing start date. Defaults to "BegInvoiceDate". - end_date_field (str, optional) The name of filters filed containing end date. Defaults to "EndInvoiceDate". + start_date_field (str, optional) The name of filters field containing start date. Defaults to "BegInvoiceDate". + end_date_field (str, optional) The name of filters field containing end date. Defaults to "EndInvoiceDate". """ DEFAULT_CREDENTIALS = local_config.get(config_key) credentials = credentials or DEFAULT_CREDENTIALS @@ -85,7 +85,7 @@ def generate_url(self) -> str: + str(self.generate_token()) ) - def check_filter(self) -> None: + def validate_filter(self) -> None: "Function checking if user had specified date range filters." root = ET.fromstring(self.filters_xml) @@ -96,13 +96,13 @@ def check_filter(self) -> None: or subchild.tag == self.end_date_field ) and subchild.text == None: raise DataRangeError( - "The data filter must be provided due to full data size." + "Too much data. Please provide a date range filter." 
) def get_xml_response(self): "Function for getting response from Epicor API" - self.check_filter() + self.validate_filter() payload = self.filters_xml url = self.generate_url() headers = {"Content-Type": "application/xml"} From 2508d9099fa037106bdcd2e024157c25727eca59 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 30 May 2022 12:59:42 +0200 Subject: [PATCH 081/119] edited gitignore --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 81e1857e7..929fa1d4d 100644 --- a/.gitignore +++ b/.gitignore @@ -152,5 +152,4 @@ desktop.ini .viminfo # SAP RFC lib -sap_netweaver_rfc -michal \ No newline at end of file +sap_netweaver_rfc \ No newline at end of file From 0d6945a3befe3810515bdf092935a3e9192c3e40 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 31 May 2022 09:48:23 +0200 Subject: [PATCH 082/119] =?UTF-8?q?=F0=9F=8E=A8=20Changed=20parsing=20func?= =?UTF-8?q?tion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_epicor_to_duckdb.py | 2 +- tests/integration/tasks/test_epicor.py | 2 +- viadot/sources/epicor.py | 185 +++++++++++++++++- viadot/tasks/epicor.py | 161 +-------------- 4 files changed, 180 insertions(+), 170 deletions(-) diff --git a/tests/integration/flows/test_epicor_to_duckdb.py b/tests/integration/flows/test_epicor_to_duckdb.py index 16f933012..1330d00f5 100644 --- a/tests/integration/flows/test_epicor_to_duckdb.py +++ b/tests/integration/flows/test_epicor_to_duckdb.py @@ -37,7 +37,7 @@ def test_epicor_to_duckdb(): df_task = DuckDBToDF(credentials=duckdb_creds) df = df_task.run(table=TABLE, schema=SCHEMA) - assert df.shape == (24, 58) + assert df.shape == (24, 59) run_query = DuckDBQuery() run_query.run(query=f"DROP TABLE {SCHEMA}.{TABLE}", credentials=duckdb_creds) diff --git a/tests/integration/tasks/test_epicor.py b/tests/integration/tasks/test_epicor.py index 92d8336f2..199a5fadd 100644 --- a/tests/integration/tasks/test_epicor.py +++ b/tests/integration/tasks/test_epicor.py @@ -17,4 +17,4 @@ def test_epicor_orders_to_df(): """, ) df = task.run() - assert df.shape == (24, 57) + assert df.shape == (24, 58) diff --git a/viadot/sources/epicor.py b/viadot/sources/epicor.py index 58cc5dd2c..5e912946e 100644 --- a/viadot/sources/epicor.py +++ b/viadot/sources/epicor.py @@ -1,11 +1,178 @@ import requests -from typing import Any, Dict +import pandas as pd +from typing import Any, Dict, Optional import xml.etree.ElementTree as ET from .base import Source from ..config import local_config from ..exceptions import CredentialError, DataRangeError +from pydantic import BaseModel + +""" +The official documentation does not specify the list of required +fields so they were set as optional in BaseModel classes. 
+""" + + +class TrackingNumbers(BaseModel): + TrackingNumber: Optional[str] + + +class ShipToAddress(BaseModel): + ShipToNumber: Optional[str] + Attention: Optional[str] + AddressLine1: Optional[str] + AddressLine2: Optional[str] + AddressLine3: Optional[str] + City: Optional[str] + State: Optional[str] + Zip: Optional[str] + Country: Optional[str] + EmailAddress: Optional[str] + PhoneNumber: Optional[str] + FaxNumber: Optional[str] + + +class InvoiceTotals(BaseModel): + Merchandise: Optional[str] + InboundFreight: Optional[str] + OutboundFreight: Optional[str] + Handling: Optional[str] + Delivery: Optional[str] + Pickup: Optional[str] + Restocking: Optional[str] + MinimumCharge: Optional[str] + DiscountAllowance: Optional[str] + SalesTax: Optional[str] + TotalInvoice: Optional[str] + + +class HeaderInformation(BaseModel): + CompanyNumber: Optional[str] + OrderNumber: Optional[str] + InvoiceNumber: Optional[str] + CustomerNumber: Optional[str] + CustomerDescription: Optional[str] + CustomerPurchaseOrderNumber: Optional[str] + Contact: Optional[str] + SellingWarehouse: Optional[str] + ShippingWarehouse: Optional[str] + ShippingMethod: Optional[str] + PaymentTerms: Optional[str] + PaymentTermsDescription: Optional[str] + FreightTerms: Optional[str] + FreightTermsDescription: Optional[str] + SalesRepOne: Optional[str] + SalesRepOneDescription: Optional[str] + EntryDate: Optional[str] + OrderDate: Optional[str] + RequiredDate: Optional[str] + ShippedDate: Optional[str] + InvoiceDate: Optional[str] + ShipToAddress: Optional[ShipToAddress] + TrackingNumbers: Optional[TrackingNumbers] + InvoiceTotals: Optional[InvoiceTotals] + + +class LineItemDetail(BaseModel): + ProductNumber: Optional[str] + ProductDescription1: Optional[str] + ProductDescription2: Optional[str] + CustomerProductNumber: Optional[str] + LineItemNumber: Optional[str] + QuantityOrdered: Optional[str] + QuantityShipped: Optional[str] + QuantityBackordered: Optional[str] + Price: Optional[str] + UnitOfMeasure: Optional[str] + ExtendedPrice: Optional[str] + QuantityShippedExtension: Optional[str] + LineItemShipWarehouse: Optional[str] + + +class Order(BaseModel): + HeaderInformation: Optional[HeaderInformation] + LineItemDetail: Optional[LineItemDetail] + + +def parse_orders_xml(xml_data: str) -> pd.DataFrame: + """ + Function to parse xml containing Epicor Orders Data. + + Args: + xml_data (str, required): Response from Epicor API in form of xml + Returns: + pd.DataFrame: DataFrame containing parsed orders data. 
+ """ + final_df = pd.DataFrame() + ship_dict = {} + invoice_dict = {} + header_params_dict = {} + item_params_dict = {} + + root = ET.fromstring(xml_data.text) + + for order in root.findall("Order"): + for header in order.findall("HeaderInformation"): + for tracking_numbers in header.findall("TrackingNumbers"): + numbers = "" + for tracking_number in tracking_numbers.findall("TrackingNumber"): + numbers = numbers + "'" + tracking_number.text + "'" + result_numbers = TrackingNumbers(TrackingNumber=numbers) + + for shipto in header.findall("ShipToAddress"): + for ship_param in ShipToAddress.__dict__.get("__annotations__"): + try: + ship_value = shipto.find(f"{ship_param}").text + except: + ship_value = None + ship_parameter = {ship_param: ship_value} + ship_dict.update(ship_parameter) + ship_address = ShipToAddress(**ship_dict) + + for invoice in header.findall("InvoiceTotals"): + for invoice_param in InvoiceTotals.__dict__.get("__annotations__"): + try: + invoice_value = invoice.find(f"{invoice_param}").text + except: + invoice_value = None + invoice_parameter = {invoice_param: invoice_value} + invoice_dict.update(invoice_parameter) + invoice_total = InvoiceTotals(**invoice_dict) + + for header_param in HeaderInformation.__dict__.get("__annotations__"): + try: + header_value = header.find(f"{header_param}").text + except: + header_value = None + if header_param == "TrackingNumbers": + header_parameter = {header_param: result_numbers} + elif header_param == "ShipToAddress": + header_parameter = {header_param: ship_address} + elif header_param == "InvoiceTotals": + header_parameter = {header_param: invoice_total} + else: + header_parameter = {header_param: header_value} + header_params_dict.update(header_parameter) + header_info = HeaderInformation(**header_params_dict) + for items in order.findall("LineItemDetails"): + for item in items.findall("LineItemDetail"): + for item_param in LineItemDetail.__dict__.get("__annotations__"): + try: + item_value = item.find(f"{item_param}").text + except: + item_value = None + item_parameter = {item_param: item_value} + item_params_dict.update(item_parameter) + line_item = LineItemDetail(**item_params_dict) + row = Order(HeaderInformation=header_info, LineItemDetail=line_item) + my_dict = row.dict() + final_df = final_df.append( + pd.json_normalize(my_dict, max_level=2), ignore_index=True + ) + return final_df + class Epicor(Source): def __init__( @@ -34,7 +201,8 @@ def __init__( DEFAULT_CREDENTIALS = local_config.get(config_key) credentials = credentials or DEFAULT_CREDENTIALS - if credentials is None: + required_credentials = ["host", "port", "username", "password"] + if any([cred_key not in credentials for cred_key in required_credentials]): raise CredentialError("Credentials not found.") self.credentials = credentials @@ -47,7 +215,7 @@ def __init__( super().__init__(*args, credentials=credentials, **kwargs) def generate_token(self) -> str: - "Function to generate API access token that last 24 hours" + "Function to generate API access token that is valid for 24 hours" url = ( "http://" @@ -60,15 +228,11 @@ def generate_token(self) -> str: + self.credentials["password"] ) - payload = {} - files = {} headers = { "Content-Type": "application/xml", } - response = requests.request( - "POST", url, headers=headers, data=payload, files=files - ) + response = requests.request("POST", url, headers=headers) return response.text @@ -109,3 +273,8 @@ def get_xml_response(self): response = requests.request("POST", url, headers=headers, data=payload) return 
response + + def to_df(self): + data = self.get_xml_response() + df = parse_orders_xml(data) + return df diff --git a/viadot/tasks/epicor.py b/viadot/tasks/epicor.py index 540fc4a2d..f47d41d88 100644 --- a/viadot/tasks/epicor.py +++ b/viadot/tasks/epicor.py @@ -8,163 +8,6 @@ from ..sources import Epicor -from pydantic import BaseModel - - -class HeaderInformation(BaseModel): - CompanyNumber: Optional[str] - OrderNumber: Optional[str] - InvoiceNumber: Optional[str] - CustomerNumber: Optional[str] - CustomerDescription: Optional[str] - CustomerPurchaseOrderNumber: Optional[str] - Contact: Optional[str] - SellingWarehouse: Optional[str] - ShippingWarehouse: Optional[str] - ShippingMethod: Optional[str] - PaymentTerms: Optional[str] - PaymentTermsDescription: Optional[str] - FreightTermsDescription: Optional[str] - SalesRepOne: Optional[str] - SalesRepOneDescription: Optional[str] - EntryDate: Optional[str] - OrderDate: Optional[str] - RequiredDate: Optional[str] - ShippedDate: Optional[str] - InvoiceDate: Optional[str] - ShipToAddress: Optional[List[Any]] - TrackingNumbers: Optional[List[Any]] - InvoiceTotals: Optional[List[Any]] - - -class ShipToAddress(BaseModel): - ShipToNumber: Optional[str] - Attention: Optional[str] - AddressLine1: Optional[str] - AddressLine2: Optional[str] - City: Optional[str] - State: Optional[str] - Zip: Optional[str] - Country: Optional[str] - EmailAddress: Optional[str] - PhoneNumber: Optional[str] - FaxNumber: Optional[str] - - -class TrackingNumbers(BaseModel): - TrackingNumber: Optional[str] - - -class InvoiceTotals(BaseModel): - Merchandise: Optional[str] - InboundFreight: Optional[str] - OutboundFreight: Optional[str] - Handling: Optional[str] - Delivery: Optional[str] - Pickup: Optional[str] - Restocking: Optional[str] - MinimumCharge: Optional[str] - DiscountAllowance: Optional[str] - SalesTax: Optional[str] - TotalInvoice: Optional[str] - - -class LineItemDetail(BaseModel): - ProductNumber: Optional[str] - ProductDescription1: Optional[str] - ProductDescription2: Optional[str] - LineItemNumber: Optional[str] - QuantityOrdered: Optional[str] - QuantityShipped: Optional[str] - Price: Optional[str] - UnitOfMeasure: Optional[str] - ExtendedPrice: Optional[str] - QuantityShippedExtension: Optional[str] - LineItemShipWarehouse: Optional[str] - - -class LineItemDetails(BaseModel): - LineItemDetail: Optional[LineItemDetail] - - -class Order(BaseModel): - HeaderInformation: Optional[HeaderInformation] - LineItemDetails: Optional[LineItemDetails] - - -class Orders(BaseModel): - Order: Optional[List[Any]] - - -def parse_orders_xml(xml_data: str) -> pd.DataFrame: - """ - Function to parse xml containing Epicor Orders Data. - - Args: - xml_data (str, required): Response from Epicor API in form of xml - Returns: - pd.DataFrame: DataFrame containing parsed orders data. 
- - """ - full_df = pd.DataFrame() - ship_dict = {} - invoice_dict = {} - header_params_dict = {} - item_params_dict = {} - almost_full = {} - - root = ET.fromstring(xml_data.text) - for order in root.findall("Order"): - for header in order.findall("HeaderInformation"): - for tracking_numbers in header.findall("TrackingNumbers"): - numbers = "" - for tracking_number in tracking_numbers.findall("TrackingNumber"): - numbers = numbers + "'" + tracking_number.text + "'" - result_numbers = TrackingNumbers(TrackingNumber=numbers) - almost_full.update(result_numbers) - - for shipto in header.findall("ShipToAddress"): - for ship_param in ShipToAddress.__dict__.get("__annotations__"): - try: - ship_value = shipto.find(f"{ship_param}").text - except: - ship_value = None - ship_parameter = {ship_param: ship_value} - ship_dict.update(ship_parameter) - almost_full.update(ship_dict) - - for invoice in header.findall("InvoiceTotals"): - for invoice_param in InvoiceTotals.__dict__.get("__annotations__"): - try: - invoice_value = invoice.find(f"{invoice_param}").text - except: - invoice_value = None - invoice_parameter = {invoice_param: invoice_value} - invoice_dict.update(invoice_parameter) - almost_full.update(invoice_dict) - - for header_param in HeaderInformation.__dict__.get("__annotations__"): - try: - header_value = header.find(f"{header_param}").text - except: - header_value = None - header_parameter = {header_param: header_value} - header_params_dict.update(header_parameter) - almost_full.update(header_params_dict) - - for items in order.findall("LineItemDetails"): - for item in items.findall("LineItemDetail"): - for item_param in LineItemDetail.__dict__.get("__annotations__"): - try: - item_value = item.find(f"{item_param}").text - except: - item_value = None - item_parameter = {item_param: item_value} - item_params_dict.update(item_parameter) - almost_full.update(item_params_dict) - full_df = full_df.append(almost_full, ignore_index=True) - return full_df - class EpicorOrdersToDF(Task): def __init__( @@ -234,6 +77,4 @@ def run( start_date_field=start_date_field, end_date_field=end_date_field, ) - data = epicor.get_xml_response() - df = parse_orders_xml(data) - return df + return epicor.to_df() From b80c960175bffd43b84b7a603bd66a6b39731847 Mon Sep 17 00:00:00 2001 From: m-paz <59165045+m-paz@users.noreply.github.com> Date: Wed, 1 Jun 2022 10:39:21 +0200 Subject: [PATCH 083/119] Update __init__.py --- viadot/flows/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index 59d53943b..7e7a298f8 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -25,6 +25,5 @@ from .sap_rfc_to_adls import SAPRFCToADLS except ImportError: pass - + from .sql_server_to_duckdb import SQLServerToDuckDB - From e2c2c09dc644ae8726a230b1edfe3905dcc53733 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 1 Jun 2022 11:09:38 +0200 Subject: [PATCH 084/119] =?UTF-8?q?=F0=9F=8E=A8=20Formatted=20code=20with?= =?UTF-8?q?=20black?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index 7e7a298f8..b3a2b90ae 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -25,5 +25,5 @@ from .sap_rfc_to_adls import SAPRFCToADLS except ImportError: pass - + from .sql_server_to_duckdb import SQLServerToDuckDB From 
4c2dc9c6c08e2223884951a291e99605d76c9bc7 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 1 Jun 2022 13:53:58 +0200 Subject: [PATCH 085/119] =?UTF-8?q?=F0=9F=8E=A8=20Changed=20API=20request?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/epicor.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/viadot/sources/epicor.py b/viadot/sources/epicor.py index 5e912946e..80912e4a7 100644 --- a/viadot/sources/epicor.py +++ b/viadot/sources/epicor.py @@ -222,19 +222,19 @@ def generate_token(self) -> str: + self.credentials["host"] + ":" + str(self.credentials["port"]) - + "/api/security/token/?username=" - + self.credentials["username"] - + "&password=" - + self.credentials["password"] + + "/api/security/token/" ) headers = { - "Content-Type": "application/xml", + "Content-Type": "application/json", + "username": self.credentials["username"], + "password": self.credentials["password"], } response = requests.request("POST", url, headers=headers) - - return response.text + root = ET.fromstring(response.text) + token = root.find("AccessToken").text + return token def generate_url(self) -> str: "Function to generate url to download data" @@ -245,8 +245,6 @@ def generate_url(self) -> str: + ":" + str(self.credentials["port"]) + self.base_url - + "?token=" - + str(self.generate_token()) ) def validate_filter(self) -> None: @@ -269,12 +267,16 @@ def get_xml_response(self): self.validate_filter() payload = self.filters_xml url = self.generate_url() - headers = {"Content-Type": "application/xml"} + headers = { + "Content-Type": "application/xml", + "Authorization": "Bearer " + self.generate_token(), + } response = requests.request("POST", url, headers=headers, data=payload) return response def to_df(self): + "Function for creating pandas DataFrame from Epicor API response" data = self.get_xml_response() df = parse_orders_xml(data) return df From bc93da62a63fcd673fc5620de935d20927ffeeb1 Mon Sep 17 00:00:00 2001 From: Michal Zawadzki Date: Wed, 1 Jun 2022 15:14:05 +0200 Subject: [PATCH 086/119] Updated the example Dockerfile for SAP RFC --- viadot/examples/sap_rfc/Dockerfile | 15 +++++++++++---- viadot/examples/sap_rfc/requirements.txt | 3 --- 2 files changed, 11 insertions(+), 7 deletions(-) delete mode 100644 viadot/examples/sap_rfc/requirements.txt diff --git a/viadot/examples/sap_rfc/Dockerfile b/viadot/examples/sap_rfc/Dockerfile index a7fa620c8..f1160c8e7 100644 --- a/viadot/examples/sap_rfc/Dockerfile +++ b/viadot/examples/sap_rfc/Dockerfile @@ -9,13 +9,20 @@ ENV SAPNWRFC_HOME=/usr/local/sap/nwrfcsdk RUN ldconfig -COPY requirements.txt . 
-RUN xargs -L 1 pip install < requirements.txt
-
 ARG HTTP_PROXY=""
+ARG HTTPS_PROXY=""
 ARG NO_PROXY=""
 ENV HTTP_PROXY=$HTTP_PROXY
+ENV HTTPS_PROXY=$HTTPS_PROXY
 ENV NO_PROXY=$NO_PROXY
 
 RUN git config --global http.proxy ${HTTP_PROXY:-""}
 
-USER viadot
\ No newline at end of file
+RUN pip install pyrfc==2.5.0
+
+ARG VIADOT_USER=viadot_user
+ARG GID=1111
+ARG UID=1111
+RUN groupadd -g $GID -o $VIADOT_USER
+RUN useradd -m -u $UID -g $GID -o -s /bin/bash $VIADOT_USER
+
+USER $VIADOT_USER
diff --git a/viadot/examples/sap_rfc/requirements.txt b/viadot/examples/sap_rfc/requirements.txt
deleted file mode 100644
index 3e0342e61..000000000
--- a/viadot/examples/sap_rfc/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-cython==0.29.24
-pyrfc==2.5.0
-sql-metadata==2.3.0
\ No newline at end of file

From f97acbe7dcf5f1bc577c54a04b8f1ec5abf4442a Mon Sep 17 00:00:00 2001
From: winiar93
Date: Thu, 2 Jun 2022 10:51:02 +0200
Subject: [PATCH 087/119] =?UTF-8?q?=F0=9F=90=9B=20Edited=20after=20code=20?=
 =?UTF-8?q?review?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 viadot/flows/mysql_to_adls.py | 14 +++++++-------
 viadot/sources/mysql.py       | 25 +++++++++++++++++++------
 viadot/tasks/mysql_to_df.py   | 16 +++++++++-------
 viadot/utils.py               |  5 +++--
 4 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/viadot/flows/mysql_to_adls.py b/viadot/flows/mysql_to_adls.py
index 4a29281c3..06d8cde8b 100644
--- a/viadot/flows/mysql_to_adls.py
+++ b/viadot/flows/mysql_to_adls.py
@@ -11,7 +11,7 @@ class MySqlToADLS(Flow):
     def __init__(
         self,
         name: str,
-        country_short: Literal["AT", "DE", "CH"],
+        country_short: Literal["AT", "DE", "CH", None],
         query: str = None,
         sqldb_credentials_secret: str = None,
         vault_name: str = None,
@@ -19,7 +19,7 @@ def __init__(
         sep: str = "\t",
         to_path: str = None,
         if_exists: Literal["replace", "append", "delete"] = "replace",
-        overwrite: bool = True,
+        overwrite_adls : bool = True,
         sp_credentials_secret: str = None,
         credentials_secret: str = None,
         *args: List[any],
@@ -39,20 +39,20 @@ def __init__(
         sep (str, optional): The delimiter for the output CSV file. Defaults to "\t".
         to_path (str): The path to an ADLS file. Defaults to None.
         if_exists (Literal, optional): What to do if the table exists. Defaults to "replace".
-        overwrite (str, optional): Whether to overwrite the destination file. Defaults to True.
+        overwrite_adls (bool, optional): Whether to overwrite the destination file in ADLS. Defaults to True.
         sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a
            dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID,
            CLIENT_SECRET). Defaults to None.
         credentials_secret (str, optional): Key Vault name. Defaults to None.
-        remove_special_characters (str, optional): Call a function that remove special characters like escape symbols. Defaults to None.
        columns_to_clean (List(str), optional): Select columns to clean, used with remove_special_characters. If None whole data frame will be processed. Defaults to None. 
""" + #Connect to sql self.country_short = country_short self.query = query self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name - self.overwrite = overwrite - + self.overwrite_adls = overwrite_adls + #Upload to ADLS self.file_path = file_path self.sep = sep self.to_path = to_path @@ -83,7 +83,7 @@ def gen_flow(self) -> Flow: adls_upload = file_to_adls_task.bind( from_path=self.file_path, to_path=self.to_path, - overwrite=self.overwrite, + overwrite_adls =self.overwrite_adls , sp_credentials_secret=self.sp_credentials_secret, flow=self, ) diff --git a/viadot/sources/mysql.py b/viadot/sources/mysql.py index 51b6efc89..57ea6e542 100644 --- a/viadot/sources/mysql.py +++ b/viadot/sources/mysql.py @@ -10,8 +10,6 @@ from prefect import Flow, task, unmapped from viadot.sources.base import Source from viadot.config import local_config - -# from ..exceptions import CredentialError from viadot.exceptions import CredentialError @@ -24,7 +22,8 @@ def __init__( **kwargs, ): - """A class for interacting with DuckDB. + """ + A class for interacting with DuckDB. Args: config_key (str, optional): The key inside local config containing the config. @@ -44,7 +43,8 @@ def __init__( @property def con(self) -> pymysql.connect: - """Return a new connection to the MySQL database. + """ + Return a new connection to the MySQL database. Returns: pymysql.connect: database connection. @@ -63,14 +63,27 @@ def con(self) -> pymysql.connect: return conn def to_df(self, query: str) -> pd.DataFrame: + """ + Get DataFrame from MySQL database by pandas and SQL query. + + Returns: + pd.DataFrame: Pandas dataframe. + """ data = pd.read_sql_query(query, self.con) self.con.close() return data def connect_sql_ssh( self, - query, - ): + query: str, + ) -> pd.DataFrame: + """ + Establish connection with database using Secure Shell Protocol (SSH) and + get pandas DataFrame by proper SQL query. + + Returns: + pd.DataFrame: Pandas dataframe. + """ if self.credentials.get("host") is None: host = "127.0.0.1" diff --git a/viadot/tasks/mysql_to_df.py b/viadot/tasks/mysql_to_df.py index d794aa926..50c5fc075 100644 --- a/viadot/tasks/mysql_to_df.py +++ b/viadot/tasks/mysql_to_df.py @@ -11,7 +11,7 @@ class MySqlToDf(Task): def __init__( self, - country_short: Literal["AT", "DE", "CH"], + country_short: Literal["AT", "DE", "CH", None], credentials: Dict[str, Any] = None, query: str = None, *args, @@ -19,10 +19,12 @@ def __init__( ): """ Task for obtaining data from MySql source. + Args: credentials (Dict[str, Any], optional): MySql Database credentials. Defaults to None. query(str, optional): Query to perform on a database. Defaults to None. - country_short (Dict[str, Any], optional): country short to select proper credential. + country_short (Dict[str, Any], optional): Country short to select proper credential. 
+ Returns: Pandas DataFrame """ self.credentials = credentials @@ -30,7 +32,7 @@ def __init__( self.query = query super().__init__( - name="MySqlToDf", + name="MySQLToDF", *args, **kwargs, ) @@ -50,7 +52,7 @@ def run( if not credentials_secret: try: credentials_secret = PrefectSecret("CONVIDERA").run() - logger.info("Loaded credentials from Key Vault") + logger.info("Loaded credentials from Key Vault.") except ValueError: pass @@ -59,16 +61,16 @@ def run( credentials_secret, vault_name=vault_name ).run() credentials = json.loads(credentials_str) - logger.info("Loaded credentials from Key Vault") + logger.info("Loaded credentials from Key Vault.") else: credentials = local_config.get("CONVIDERA") - logger.info("Loaded credentials from local source") + logger.info("Loaded credentials from local source.") country_cred = credentials.get(f"{self.country_short}") ssh_creds = credentials.get("SSH_CREDS") credentials_country = dict(country_cred, **ssh_creds) mysql = MySQL(credentials=credentials_country) - logger.info("Connected to MySql Database") + logger.info("Connected to MySql Database.") df = mysql.connect_sql_ssh(query=query) logger.info("Succefully collected data from query") return df diff --git a/viadot/utils.py b/viadot/utils.py index 51fb1ce1d..087b6fd7d 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -326,8 +326,9 @@ def _gen_insert_query_from_records(records: List[tuple]) -> str: return _gen_insert_query_from_records(tuples_escaped) -def union_credentials_dict(*dicts): - """Function that union list of dictionaries +def union_dict(*dicts): + """ + Function that union list of dictionaries Args: dicts (List[Dict]): list of dictionaries with credentials. From 11464f682bb9a09a392599e4df01819660a09380 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 2 Jun 2022 10:58:51 +0200 Subject: [PATCH 088/119] =?UTF-8?q?=F0=9F=90=9B=20correct=20black?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/mysql_to_adls.py | 11 ++++++----- viadot/sources/mysql.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/viadot/flows/mysql_to_adls.py b/viadot/flows/mysql_to_adls.py index 06d8cde8b..25e494afb 100644 --- a/viadot/flows/mysql_to_adls.py +++ b/viadot/flows/mysql_to_adls.py @@ -19,7 +19,7 @@ def __init__( sep: str = "\t", to_path: str = None, if_exists: Literal["replace", "append", "delete"] = "replace", - overwrite_adls : bool = True, + overwrite_adls: bool = True, sp_credentials_secret: str = None, credentials_secret: str = None, *args: List[any], @@ -46,13 +46,14 @@ def __init__( columns_to_clean (List(str), optional): Select columns to clean, used with remove_special_characters. If None whole data frame will be processed. Defaults to None. 
""" - #Connect to sql + + # Connect to sql self.country_short = country_short self.query = query self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name - self.overwrite_adls = overwrite_adls - #Upload to ADLS + self.overwrite_adls = overwrite_adls + # Upload to ADLS self.file_path = file_path self.sep = sep self.to_path = to_path @@ -83,7 +84,7 @@ def gen_flow(self) -> Flow: adls_upload = file_to_adls_task.bind( from_path=self.file_path, to_path=self.to_path, - overwrite_adls =self.overwrite_adls , + overwrite_adls=self.overwrite_adls, sp_credentials_secret=self.sp_credentials_secret, flow=self, ) diff --git a/viadot/sources/mysql.py b/viadot/sources/mysql.py index 57ea6e542..657215fa2 100644 --- a/viadot/sources/mysql.py +++ b/viadot/sources/mysql.py @@ -79,7 +79,7 @@ def connect_sql_ssh( ) -> pd.DataFrame: """ Establish connection with database using Secure Shell Protocol (SSH) and - get pandas DataFrame by proper SQL query. + get pandas DataFrame by proper SQL query. Returns: pd.DataFrame: Pandas dataframe. From 6eacf08850338df092b865dc046a8e5be07668c3 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 2 Jun 2022 13:59:33 +0200 Subject: [PATCH 089/119] =?UTF-8?q?=F0=9F=8E=A8=20Updated=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ad5381ae..778da920c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Enabled Databricks Connect in the image. To enable, [follow this guide](./README.md#executing-spark-jobs) + - Added `MySQL` source and `MySqlToADLS` flow ### Changed - Added `SQLServerToDF` task From 02015f3a31801cbc9821d4e559b0411d6e7ae3aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zawadzki?= Date: Thu, 2 Jun 2022 21:32:58 +0200 Subject: [PATCH 090/119] Revert "Tests update" --- CHANGELOG.md | 5 ----- .../flows/test_adls_container_to_container.py | 16 ---------------- .../flows/test_azure_sql_transform.py | 1 - tests/integration/tasks/test_azure_data_lake.py | 16 +++++----------- .../tasks/test_cloud_for_customers.py | 2 +- 5 files changed, 6 insertions(+), 34 deletions(-) delete mode 100644 tests/integration/flows/test_adls_container_to_container.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f08ad214e..97af9d397 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - - - Added `ADLSContainerToContainer` test - Enabled Databricks Connect in the image. To enable, [follow this guide](./README.md#executing-spark-jobs) - Added `MySQL` source and `MySqlToADLS` flow @@ -30,15 +28,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added C4C secret handling to `CloudForCustomersReportToADLS` flow (`c4c_credentials_secret` parameter) ### Fixed -- Fixed `C4CToDF`, `TEST_TABLE` in AzureSQLTransform tests - Fixed `get_flow_last_run_date()` incorrectly parsing the date - Fixed C4C secret handling (tasks now correctly read the secret as the credentials, rather than assuming the secret is a container for credentials for all environments and trying to access specific key inside it). 
In other words, tasks now assume the secret holds credentials, rather than a dict of the form `{env: credentials, env2: credentials2}` - Fixed `utils.gen_bulk_insert_query_from_df()` failing with > 1000 rows due to INSERT clause limit by chunking the data into multiple INSERTs - Fixed `get_flow_last_run_date()` incorrectly parsing the date - Fixed `MultipleFlows` when one flow is passed and when last flow fails. -### Changed -- Changed `AzureDataLake` tests ## [0.4.2] - 2022-04-08 ### Added diff --git a/tests/integration/flows/test_adls_container_to_container.py b/tests/integration/flows/test_adls_container_to_container.py deleted file mode 100644 index 91aeb14be..000000000 --- a/tests/integration/flows/test_adls_container_to_container.py +++ /dev/null @@ -1,16 +0,0 @@ -from viadot.flows import ADLSContainerToContainer -from viadot.sources import AzureDataLake - -TEST_FILE_BLOB_PATH = "raw/supermetrics/mp/test.csv" -TEST_FILE_BLOB_PATH2 = "operations/supermetrics/mp/test.csv" - - -def test_adls_container_to_container(): - flow = ADLSContainerToContainer( - name="test to container", - from_path=TEST_FILE_BLOB_PATH, - to_path=TEST_FILE_BLOB_PATH2, - ) - flow.run() - file = AzureDataLake(TEST_FILE_BLOB_PATH2) - assert file.exists() diff --git a/tests/integration/flows/test_azure_sql_transform.py b/tests/integration/flows/test_azure_sql_transform.py index 732de7a33..9e401e2ca 100644 --- a/tests/integration/flows/test_azure_sql_transform.py +++ b/tests/integration/flows/test_azure_sql_transform.py @@ -11,7 +11,6 @@ @pytest.fixture() def TEST_TABLE(): run_sql_task = AzureSQLDBQuery() - run_sql_task.run(f"DROP TABLE IF EXISTS {FQN}") run_sql_task.run(f"CREATE TABLE {FQN} (id INT, name VARCHAR(25))") run_sql_task.run(f"INSERT INTO {FQN} VALUES (1, 'Mike')") yield diff --git a/tests/integration/tasks/test_azure_data_lake.py b/tests/integration/tasks/test_azure_data_lake.py index 92a625932..de3ecf49e 100644 --- a/tests/integration/tasks/test_azure_data_lake.py +++ b/tests/integration/tasks/test_azure_data_lake.py @@ -1,6 +1,5 @@ import os import uuid -import pytest from viadot.sources import AzureDataLake from viadot.tasks import ( @@ -9,9 +8,8 @@ AzureDataLakeUpload, AzureDataLakeCopy, AzureDataLakeList, - AzureDataLakeRemove, ) - +from viadot.tasks.azure_data_lake import AzureDataLakeRemove uuid_4 = uuid.uuid4() uuid_4_2 = uuid.uuid4() @@ -24,18 +22,17 @@ file_name_parquet = f"test_file_{uuid_4}.parquet" adls_path_parquet = f"raw/supermetrics/{file_name_parquet}" +# TODO: add pytest-depends as download tests depend on the upload +# and can't be ran separately + def test_azure_data_lake_upload(TEST_CSV_FILE_PATH): upload_task = AzureDataLakeUpload() - upload_task.run( - from_path=TEST_CSV_FILE_PATH, - to_path=adls_path, - ) + upload_task.run(from_path=TEST_CSV_FILE_PATH, to_path=adls_path) file = AzureDataLake(adls_path) assert file.exists() -@pytest.mark.depends(on=["test_azure_data_lake_upload"]) def test_azure_data_lake_download(): download_task = AzureDataLakeDownload() download_task.run(from_path=adls_path) @@ -43,7 +40,6 @@ def test_azure_data_lake_download(): os.remove(file_name) -@pytest.mark.depends(on=["test_azure_data_lake_upload"]) def test_azure_data_lake_to_df(): task = AzureDataLakeToDF() df = task.run(path=adls_path, sep="\t") @@ -59,7 +55,6 @@ def test_azure_data_lake_to_df_parquet(TEST_PARQUET_FILE_PATH): assert not df.empty -@pytest.mark.depends(on=["test_azure_data_lake_upload"]) def test_azure_data_lake_copy(): copy_task = AzureDataLakeCopy() copy_task.run(from_path=adls_path, 
to_path=adls_path_2) @@ -73,7 +68,6 @@ def test_azure_data_lake_list(): assert adls_path in files -@pytest.mark.depends(on=["test_azure_data_lake_upload"]) def test_azure_data_lake_remove(): file = AzureDataLake(adls_path) assert file.exists() diff --git a/tests/integration/tasks/test_cloud_for_customers.py b/tests/integration/tasks/test_cloud_for_customers.py index 9446161f9..50a6a1b8c 100644 --- a/tests/integration/tasks/test_cloud_for_customers.py +++ b/tests/integration/tasks/test_cloud_for_customers.py @@ -7,7 +7,7 @@ def test_c4c_to_df(): url = "http://services.odata.org/V2/Northwind/Northwind.svc/" endpoint = "Employees" c4c_to_df = C4CToDF() - df = c4c_to_df.run(url=url, endpoint=endpoint, params={}) + df = c4c_to_df.run(url=url, endpoint=endpoint) answer = df.head() assert answer.shape[1] == 23 From a077afda893c691b22299c4ccd251cb937d4d20a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zawadzki?= Date: Thu, 2 Jun 2022 21:54:15 +0200 Subject: [PATCH 091/119] Revert "Fix upsert error handling" --- tests/integration/tasks/test_salesforce.py | 53 ++-------------------- tests/integration/test_salesforce.py | 35 +++++--------- viadot/sources/salesforce.py | 45 +++++++++--------- 3 files changed, 36 insertions(+), 97 deletions(-) diff --git a/tests/integration/tasks/test_salesforce.py b/tests/integration/tasks/test_salesforce.py index 2474b5037..df659eae0 100644 --- a/tests/integration/tasks/test_salesforce.py +++ b/tests/integration/tasks/test_salesforce.py @@ -1,26 +1,14 @@ import pandas as pd import pytest from viadot.tasks import SalesforceUpsert -from simple_salesforce import SalesforceResourceNotFound @pytest.fixture(scope="session") def test_df(): data = { "Id": ["111"], - "LastName": ["John Tester-External"], - "SAPContactId__c": ["111"], - } - df = pd.DataFrame(data=data) - yield df - - -@pytest.fixture(scope="session") -def test_df_wrong(): - data = { - "Id": ["123"], - "LastName": ["John Tester-Wrong"], - "SAPContactId__c": ["111"], + "LastName": ["John Tester-External 3"], + "SAPContactId__c": [111], } df = pd.DataFrame(data=data) yield df @@ -34,41 +22,6 @@ def test_salesforce_upsert(test_df): """ try: sf = SalesforceUpsert() - sf.run(test_df, table="Contact", raise_on_error=True) - except Exception as exception: - assert False, exception - - -def test_salesforce_upsert_incorrect(test_df_wrong): - """ - Checks if the error handling system catches errors regarding improper IDs. - """ - with pytest.raises(SalesforceResourceNotFound): - sf = SalesforceUpsert() - sf.run(test_df_wrong, table="Contact", raise_on_error=True) - - -def test_salesforce_upsert_incorrect_warn(test_df_wrong): - """ - Checks if the error handling system catches errors regarding improper IDs. - """ - try: - sf = SalesforceUpsert() - sf.run(test_df_wrong, table="Contact", raise_on_error=False) - except Exception as exception: - assert False, exception - - -def test_salesforce_upsert_external(test_df): - """ - Id and SAPContactId__c are unique values, you can update only non-unique values for this test. - If the combiantion of Id and SAPContactId__c do not exist, the test will fail. - The Id and SAPContactId__c values '111' needs to be replaced with proper one (that exist in the testing system). 
- """ - try: - sf = SalesforceUpsert() - sf.run( - test_df, table="Contact", external_id="SAPContactId__c", raise_on_error=True - ) + sf.run(test_df, table="Contact") except Exception as exception: assert False, exception diff --git a/tests/integration/test_salesforce.py b/tests/integration/test_salesforce.py index 97b230ed9..d54a25933 100644 --- a/tests/integration/test_salesforce.py +++ b/tests/integration/test_salesforce.py @@ -14,7 +14,7 @@ def test_df_external(): data = { "Id": ["111"], "LastName": ["John Tester-External"], - "SAPContactId__c": ["111"], + "SAPContactId__c": ["112"], } df = pd.DataFrame(data=data) yield df @@ -28,36 +28,23 @@ def test_upsert_empty(salesforce): assert False, exception -def test_upsert(salesforce): - new_name = "Test Upsert" - correct_row = [salesforce.download(table="Contact", columns=["Id", "LastName"])[0]] - to_upsert = pd.DataFrame(correct_row) - to_upsert["LastName"] = new_name - +def test_upsert_external_id_correct(salesforce, test_df_external): try: salesforce.upsert( - df=to_upsert, - table="Contact", - raise_on_error=True, + df=test_df_external, table="Contact", external_id="SAPContactId__c" ) except Exception as exception: assert False, exception - result = salesforce.to_df(table="Contact", columns=["Id", "LastName"]) - assert len(result.loc[result["LastName"] == new_name]) > 0 + result = salesforce.download(table="Contact") + exists = list( + filter(lambda contact: contact["LastName"] == "John Tester-External", result) + ) + assert exists != None -def test_upsert_external_id(salesforce, test_df_external): - try: - salesforce.upsert( - df=test_df_external, - table="Contact", - external_id="SAPContactId__c", - raise_on_error=True, - ) - except Exception as exception: - assert False, exception - result = salesforce.to_df(table="Contact", columns=["Id", "LastName"]) - assert len(result.loc[result["LastName"] == "John Tester-External"]) > 0 +def test_upsert_external_id_wrong(salesforce, test_df_external): + with pytest.raises(ValueError): + salesforce.upsert(df=test_df_external, table="Contact", external_id="SAPId") def test_download_no_query(salesforce): diff --git a/viadot/sources/salesforce.py b/viadot/sources/salesforce.py index 6d8fafdcc..b8931fcc1 100644 --- a/viadot/sources/salesforce.py +++ b/viadot/sources/salesforce.py @@ -3,7 +3,7 @@ import pandas as pd from prefect.utilities import logging from simple_salesforce import Salesforce as SF -from simple_salesforce.exceptions import SalesforceResourceNotFound +from simple_salesforce.exceptions import SalesforceMalformedRequest from ..config import local_config from ..exceptions import CredentialError @@ -91,41 +91,40 @@ def upsert( table_to_upsert = getattr(self.salesforce, table) records = df.to_dict("records") records_cp = records.copy() - successes = 0 + for record in records_cp: + response = 0 if external_id: if record[external_id] is None: continue else: merge_key = f"{external_id}/{record[external_id]}" record.pop(external_id) - record.pop("Id") else: merge_key = record.pop("Id") + try: response = table_to_upsert.upsert(data=record, record_id=merge_key) - codes = {200: "updated", 201: "created", 204: "updated"} - - if response not in codes: - msg = ( - f"Upsert failed for record: \n{record} with response {response}" - ) - if raise_on_error: - raise ValueError(msg) - else: - self.logger.warning(msg) - else: - successes += 1 - logger.info(f"Successfully {codes[response]} record {merge_key}.") - except SalesforceResourceNotFound as e: + except SalesforceMalformedRequest as e: + msg = 
f"Upsert of record {merge_key} failed." if raise_on_error: - raise e + raise ValueError(msg) from e else: - self.logger.warning( - f"Upsert failed for record: \n{record} with response {e}" - ) + self.logger.warning(msg) - logger.info(f"Successfully upserted {successes} records into table '{table}'.") + codes = {200: "updated", 201: "created", 204: "updated"} + logger.info(f"Successfully {codes[response]} record {merge_key}.") + + if response not in codes: + raise ValueError( + f"Upsert failed for record: \n{record} with response {response}" + ) + else: + logger.info(f"Successfully {codes[response]} record {merge_key}.") + + logger.info( + f"Successfully upserted {len(records)} records into table '{table}'." + ) def bulk_upsert( self, @@ -150,7 +149,7 @@ def bulk_upsert( response = self.salesforce.bulk.__getattr__(table).upsert( data=records, external_id_field=external_id, batch_size=batch_size ) - except SalesforceResourceNotFound as e: + except SalesforceMalformedRequest as e: # Bulk insert didn't work at all. raise ValueError(f"Upsert of records failed: {e}") from e From f530b9b9aae09d4b57252dc16d31fc6ea66ecd15 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Fri, 3 Jun 2022 15:32:02 +0200 Subject: [PATCH 092/119] =?UTF-8?q?=F0=9F=8E=A8=20=20Changed=20missing=20c?= =?UTF-8?q?redentials=20error=20message?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/epicor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/viadot/sources/epicor.py b/viadot/sources/epicor.py index 80912e4a7..05c2d1132 100644 --- a/viadot/sources/epicor.py +++ b/viadot/sources/epicor.py @@ -124,7 +124,7 @@ def parse_orders_xml(xml_data: str) -> pd.DataFrame: for shipto in header.findall("ShipToAddress"): for ship_param in ShipToAddress.__dict__.get("__annotations__"): try: - ship_value = shipto.find(f"{ship_param}").text + ship_value = shipto.find(ship_param).text except: ship_value = None ship_parameter = {ship_param: ship_value} @@ -134,7 +134,7 @@ def parse_orders_xml(xml_data: str) -> pd.DataFrame: for invoice in header.findall("InvoiceTotals"): for invoice_param in InvoiceTotals.__dict__.get("__annotations__"): try: - invoice_value = invoice.find(f"{invoice_param}").text + invoice_value = invoice.find(invoice_param).text except: invoice_value = None invoice_parameter = {invoice_param: invoice_value} @@ -143,7 +143,7 @@ def parse_orders_xml(xml_data: str) -> pd.DataFrame: for header_param in HeaderInformation.__dict__.get("__annotations__"): try: - header_value = header.find(f"{header_param}").text + header_value = header.find(header_param).text except: header_value = None if header_param == "TrackingNumbers": @@ -160,7 +160,7 @@ def parse_orders_xml(xml_data: str) -> pd.DataFrame: for item in items.findall("LineItemDetail"): for item_param in LineItemDetail.__dict__.get("__annotations__"): try: - item_value = item.find(f"{item_param}").text + item_value = item.find(item_param).text except: item_value = None item_parameter = {item_param: item_value} @@ -203,7 +203,8 @@ def __init__( required_credentials = ["host", "port", "username", "password"] if any([cred_key not in credentials for cred_key in required_credentials]): - raise CredentialError("Credentials not found.") + not_found = [c for c in required_credentials if c not in credentials] + raise CredentialError(f"Missing credential(s): '{not_found}'.") self.credentials = credentials self.config_key = config_key From 9d1b1fe0aca07a899bb6a9f2f3be16acf153c2d1 Mon Sep 17 
00:00:00 2001 From: Angelika Tarnawa Date: Fri, 3 Jun 2022 16:05:31 +0200 Subject: [PATCH 093/119] =?UTF-8?q?=F0=9F=8E=A8=20Changed=20credentials=20?= =?UTF-8?q?name?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_epicor_to_duckdb.py | 4 ++-- tests/integration/tasks/test_epicor.py | 4 ++-- tests/integration/test_epicor.py | 8 ++++---- viadot/tasks/epicor.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/integration/flows/test_epicor_to_duckdb.py b/tests/integration/flows/test_epicor_to_duckdb.py index 1330d00f5..03de302dc 100644 --- a/tests/integration/flows/test_epicor_to_duckdb.py +++ b/tests/integration/flows/test_epicor_to_duckdb.py @@ -13,8 +13,8 @@ def test_epicor_to_duckdb(): duckdb_creds = {"database": "/home/viadot/database/test.duckdb"} flow = EpicorOrdersToDuckDB( name="test", - epicor_config_key="Epicor", - base_url=local_config.get("Epicor").get("test_url"), + epicor_config_key="EPICOR", + base_url=local_config.get("EPICOR").get("test_url"), filters_xml=""" diff --git a/tests/integration/tasks/test_epicor.py b/tests/integration/tasks/test_epicor.py index 199a5fadd..fbbd04b2b 100644 --- a/tests/integration/tasks/test_epicor.py +++ b/tests/integration/tasks/test_epicor.py @@ -4,8 +4,8 @@ def test_epicor_orders_to_df(): task = EpicorOrdersToDF( - config_key="Epicor", - base_url=local_config.get("Epicor").get("test_url"), + config_key="EPICOR", + base_url=local_config.get("EPICOR").get("test_url"), filters_xml=""" diff --git a/tests/integration/test_epicor.py b/tests/integration/test_epicor.py index cfb4aaa09..83a350115 100644 --- a/tests/integration/test_epicor.py +++ b/tests/integration/test_epicor.py @@ -8,8 +8,8 @@ @pytest.fixture(scope="session") def epicor(): epicor = Epicor( - base_url=local_config.get("Epicor").get("test_url"), - config_key="Epicor", + base_url=local_config.get("EPICOR").get("test_url"), + config_key="EPICOR", filters_xml=""" @@ -26,8 +26,8 @@ def epicor(): @pytest.fixture(scope="session") def epicor_error(): epicor_error = Epicor( - base_url=local_config.get("Epicor").get("test_url"), - config_key="Epicor", + base_url=local_config.get("EPICOR").get("test_url"), + config_key="EPICOR", filters_xml=""" diff --git a/viadot/tasks/epicor.py b/viadot/tasks/epicor.py index f47d41d88..a30b39f06 100644 --- a/viadot/tasks/epicor.py +++ b/viadot/tasks/epicor.py @@ -43,7 +43,7 @@ def __init__( self.start_date_field = start_date_field self.end_date_field = end_date_field super().__init__( - name="EpicorOrders_to_df", + name="epicor_orders_to_df", *args, **kwargs, ) From 596c4fa3ce07231d0b8be602e7db1a3cfef3387e Mon Sep 17 00:00:00 2001 From: trymzet Date: Tue, 7 Jun 2022 14:05:24 +0200 Subject: [PATCH 094/119] Changed default name for the Prefect secret holding the name of the Azure KV secret storing Sendgrid credentials --- viadot/task_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 350334510..51ae395a4 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -387,7 +387,7 @@ def custom_mail_state_handler( if credentials_secret is None: try: - credentials_secret = PrefectSecret("mail_notifier_api_key").run() + credentials_secret = PrefectSecret("SENDGRID_DEFAULT_SECRET").run() except ValueError: pass From 64d25c9a3b39ef348a241afb14e54f1c5102e8e0 Mon Sep 17 00:00:00 2001 From: trymzet Date: Tue, 7 Jun 2022 14:09:30 +0200 Subject: [PATCH 095/119] Update changelog --- CHANGELOG.md | 11 
++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 97af9d397..de4c51ecc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,15 +6,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - - Enabled Databricks Connect in the image. To enable, [follow this guide](./README.md#executing-spark-jobs) - - Added `MySQL` source and `MySqlToADLS` flow - -### Changed +- Enabled Databricks Connect in the image. To enable, [follow this guide](./README.md#executing-spark-jobs) +- Added `MySQL` source and `MySqlToADLS` flow - Added `SQLServerToDF` task - Added `SQLServerToDuckDB` flow which downloads data from SQLServer table, loads it to parquet file and then uplads it do DuckDB - Added complete proxy set up in `SAPRFC` example (`viadot/examples/sap_rfc`) - Added Databricks/Spark setup to the image. See README for setup & usage instructions. +### Changed +- Changed default name for the Prefect secret holding the name of the Azure KV secret storing Sendgrid credentials + ## [0.4.3] - 2022-04-28 ### Added @@ -53,7 +54,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.4.0] - 2022-04-07 ### Added -- Added `custom_mail_state_handler` function that sends mail notification using custom smtp server. +- Added `custom_mail_state_handler` task that sends email notification using a custom SMTP server. - Added new function `df_clean_column` that cleans data frame columns from special characters - Added `df_clean_column` util task that removes special characters from a pandas DataFrame - Added `MultipleFlows` flow class which enables running multiple flows in a given order. From 754e7bef241a8af58917feca6ee47a7de8dbbd63 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 8 Jun 2022 12:09:00 +0200 Subject: [PATCH 096/119] =?UTF-8?q?=F0=9F=90=9B=20Changed=20overwrite=5Fad?= =?UTF-8?q?ls=20to=20overwrite?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/mysql_to_adls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/flows/mysql_to_adls.py b/viadot/flows/mysql_to_adls.py index 25e494afb..cb36c4c23 100644 --- a/viadot/flows/mysql_to_adls.py +++ b/viadot/flows/mysql_to_adls.py @@ -84,7 +84,7 @@ def gen_flow(self) -> Flow: adls_upload = file_to_adls_task.bind( from_path=self.file_path, to_path=self.to_path, - overwrite_adls=self.overwrite_adls, + overwrite=self.overwrite_adls, sp_credentials_secret=self.sp_credentials_secret, flow=self, ) From b095c4c87c4592e78dbb61b61718f2da4aac2a71 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 8 Jun 2022 12:15:01 +0200 Subject: [PATCH 097/119] =?UTF-8?q?=E2=9C=85=20Changed=20args=20name=20in?= =?UTF-8?q?=20tests=20mysql=20and=20bigquery?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_mysql_to_adls.py | 2 +- tests/integration/tasks/test_bigquery.py | 2 +- tests/integration/test_bigquery.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/flows/test_mysql_to_adls.py b/tests/integration/flows/test_mysql_to_adls.py index dc6e55092..ebe1aa837 100644 --- a/tests/integration/flows/test_mysql_to_adls.py +++ b/tests/integration/flows/test_mysql_to_adls.py @@ -18,7 +18,7 @@ def test_adls_gen1_to_azure_sql_new_mock(TEST_PARQUET_FILE_PATH): file_path=TEST_PARQUET_FILE_PATH, to_path=f"raw/examples/{TEST_PARQUET_FILE_PATH}", 
sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA-DEV", - overwrite=True, + overwrite_adls=True, ) flow.run() mock_method.assert_called_with() diff --git a/tests/integration/tasks/test_bigquery.py b/tests/integration/tasks/test_bigquery.py index 2d5d373f8..cae34ef1c 100644 --- a/tests/integration/tasks/test_bigquery.py +++ b/tests/integration/tasks/test_bigquery.py @@ -4,7 +4,7 @@ from viadot.tasks import BigQueryToDF logger = logging.getLogger(__name__) -DATASET_NAME = "official_empty" +DATASET_NAME = "manigeo" TABLE_NAME = "space" diff --git a/tests/integration/test_bigquery.py b/tests/integration/test_bigquery.py index 50d00860b..06adee5c8 100644 --- a/tests/integration/test_bigquery.py +++ b/tests/integration/test_bigquery.py @@ -18,7 +18,7 @@ def test_list_datasets(): def test_list_tables(): datasets = BIGQ.list_datasets() tables = list(BIGQ.list_tables(datasets[0])) - assert tables == ["test_data", "manigeo_tab"] + assert tables == ["space", "test_data", "manigeo_tab"] def test_query_is_df(): From 01bae6ea44024eaf2520086c073522a57be7c759 Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Thu, 28 Apr 2022 20:50:58 +0200 Subject: [PATCH 098/119] =?UTF-8?q?=E2=9C=A8=20Adding=20new=20source=20Out?= =?UTF-8?q?look?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/__init__.py | 2 +- viadot/sources/outlook.py | 117 +++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 viadot/sources/outlook.py diff --git a/viadot/sources/__init__.py b/viadot/sources/__init__.py index 9e22bdecf..beac987e8 100644 --- a/viadot/sources/__init__.py +++ b/viadot/sources/__init__.py @@ -6,7 +6,7 @@ from .sharepoint import Sharepoint from .bigquery import BigQuery from .salesforce import Salesforce - +from .outlook import Outlook try: from .sap_rfc import SAPRFC diff --git a/viadot/sources/outlook.py b/viadot/sources/outlook.py new file mode 100644 index 000000000..e705308a4 --- /dev/null +++ b/viadot/sources/outlook.py @@ -0,0 +1,117 @@ +from .base import Source +from O365 import Account +import pandas as pd +import datetime +from typing import Any, Dict, List +from ..config import local_config + + +class Outlook(Source): + def __init__( + self, + mailbox_name: str, + start_date: str = None, + end_date: str = None, + credentials: Dict[str, Any] = None, + extension_file: str = ".csv", + limit: int = 10000, + *args: List[Any], + **kwargs: Dict[str, Any], + ): + """Outlook connector build for fetching Outlook API source. + + Args: + mailbox_name (str): Mailbox name. + start_date (str, optional): A filtering start date parameter e.g. "2022-01-01". Defaults to None. + end_date (str, optional): A filtering end date parameter e.g. "2022-01-02". Defaults to None. + credentials (Dict[str, Any], optional): The name of the Azure Key Vault secret containing a dictionary with + ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Application. + Defaults to None. + extension_file (str, optional): Output file extension - to allow selection of .csv for data which is not easy + to handle with parquet. Defaults to ".csv". + limit (int, optional): Number of fetched top messages. Defaults to 10000. 
+ """ + super().__init__(*args, **kwargs) + + try: + DEFAULT_CREDENTIALS = local_config["OUTLOOK"] + except KeyError: + DEFAULT_CREDENTIALS = None + + self.credentials = credentials or DEFAULT_CREDENTIALS + self.extension_file = extension_file + self.mailbox_name = mailbox_name + self.start_date = start_date + self.end_date = end_date + self.account = Account( + (self.credentials["client_id"], self.credentials["client_secret"]), + auth_flow_type="credentials", + tenant_id=self.credentials["tenant_id"], + main_resource=self.mailbox_name, + ) + if self.account.authenticate(): + print(f"{self.mailbox_name} Authenticated!") + else: + print(f"{self.mailbox_name} NOT Authenticated!") + + self.mailbox_obj = self.account.mailbox() + self.mailbox_messages = self.mailbox_obj.get_messages(limit) + super().__init__(*args, credentials=self.credentials, **kwargs) + + def to_df(self): + date_range_end_time = datetime.datetime.strptime(self.end_date, "%Y-%m-%d") + date_range_start_time = datetime.datetime.strptime(self.start_date, "%Y-%m-%d") + data_list_outbox, data_list_inbox = [], [] + + while True: + try: + message = next(self.mailbox_messages) + received_time = message.received + date_time_str = str(received_time) + dd = date_time_str[0:19] + date_obj = datetime.datetime.strptime(dd, "%Y-%m-%d %H:%M:%S") + if date_obj < date_range_start_time or date_obj > date_range_end_time: + continue + else: + fetched = message.to_api_data() + try: + sender_mail = fetched["from"]["emailAddress"]["address"] + reciver_list = fetched.get("toRecipients") + recivers = "" + if reciver_list is not None: + recivers = ", ".join( + r["emailAddress"]["address"] for r in reciver_list + ) + else: + recivers = "" + categories = "" + if message.categories: + categories = ", ".join(c for c in message.categories) + row = { + "subject": fetched.get("subject"), + "conversation ID": fetched.get("conversationId"), + "conversation index": message.conversation_index, + "categories": categories, + "sender": sender_mail, + "recivers": recivers, + "read": fetched.get("isRead"), + "received time": fetched.get("receivedDateTime"), + } + if sender_mail == self.mailbox_name: + data_list_outbox.append(row) + else: + data_list_inbox.append(row) + except KeyError: + print(f"KeyError - nie ma w:") + except StopIteration: + break + df_inbox = pd.DataFrame(data=data_list_inbox) + df_outbox = pd.DataFrame(data=data_list_outbox) + + return df_inbox, df_outbox + + def to_csv(self): + df_inbox, df_outbox = self.to_df() + file_name = self.mailbox_name.split("@")[0].replace(".", "_").replace("-", "_") + df_inbox.to_csv(f"{file_name}_Inbox{self.extension_file}", index=False) + df_outbox.to_csv(f"{file_name}_Outbox{self.extension_file}", index=False) From 6bc2faeb69c4d0989f301c8d5309a799f4dcf4d2 Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Thu, 28 Apr 2022 20:53:00 +0200 Subject: [PATCH 099/119] =?UTF-8?q?=E2=9C=A8=20Adding=20new=20Task=20Outlo?= =?UTF-8?q?okToDF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/__init__.py | 1 + viadot/tasks/outlook.py | 71 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 viadot/tasks/outlook.py diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index b7ebb0953..fa75c3262 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -32,6 +32,7 @@ from .aselite import ASELiteToDF from .bigquery import BigQueryToDF from .salesforce import 
SalesforceUpsert, SalesforceBulkUpsert +from .outlook import OutlookToDF try: from .sap_rfc import SAPRFCToDF diff --git a/viadot/tasks/outlook.py b/viadot/tasks/outlook.py new file mode 100644 index 000000000..accf0a867 --- /dev/null +++ b/viadot/tasks/outlook.py @@ -0,0 +1,71 @@ +import pandas as pd +from viadot.config import local_config +from typing import Any, Dict, List +from prefect import Task +from prefect.utilities.tasks import defaults_from_attrs + +from ..sources import Outlook + + +class OutlookToDF(Task): + def __init__( + self, + mailbox_name: str = None, + start_date: str = None, + end_date: str = None, + credentials: Dict[str, Any] = None, + extension_file: str = ".csv", + limit: int = 10000, + *args: List[Any], + **kwargs: Dict[str, Any] + ): + + self.mailbox_name = mailbox_name + self.start_date = start_date + self.end_date = end_date + self.extension_file = extension_file + self.limit = limit + + try: + DEFAULT_CREDENTIALS = local_config["OUTLOOK"] + except KeyError: + DEFAULT_CREDENTIALS = None + + self.credentials = credentials or DEFAULT_CREDENTIALS + + super().__init__( + name="outlook_to_csv", + *args, + **kwargs, + ) + + def __call__(self, *args, **kwargs): + """Download Outlook Mesagess to DF""" + return super().__call__(*args, **kwargs) + + @defaults_from_attrs( + "mailbox_name", + "start_date", + "end_date", + ) + def run( + self, + mailbox_name: str, + start_date: str = None, + end_date: str = None, + ): + """ + Task for downloading data from the Outlook API to a CSV file. + + Args: + mailbox_name (str): Mailbox name. + start_date (str, optional): A filtering start date parameter e.g. "2022-01-01". Defaults to None. + end_date (str, optional): A filtering end date parameter e.g. "2022-01-02". Defaults to None. + + Returns: + pd.DataFrame: The API GET as a pandas DataFrame. + """ + df_inbox, df_outbox = Outlook( + mailbox_name=mailbox_name, start_date=start_date, end_date=end_date + ).to_df() + return df_inbox, df_outbox From a07601d4fd9cab74aab17316b9136fa2cb93c175 Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Fri, 29 Apr 2022 15:24:01 +0200 Subject: [PATCH 100/119] =?UTF-8?q?=F0=9F=9A=91=20Adding=20multiple=20retu?= =?UTF-8?q?rn=20in=20Task=20ToDF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/outlook.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/viadot/tasks/outlook.py b/viadot/tasks/outlook.py index accf0a867..15a14ec92 100644 --- a/viadot/tasks/outlook.py +++ b/viadot/tasks/outlook.py @@ -1,6 +1,6 @@ import pandas as pd from viadot.config import local_config -from typing import Any, Dict, List +from typing import Any, Dict, List, Tuple from prefect import Task from prefect.utilities.tasks import defaults_from_attrs @@ -43,17 +43,15 @@ def __call__(self, *args, **kwargs): """Download Outlook Mesagess to DF""" return super().__call__(*args, **kwargs) - @defaults_from_attrs( - "mailbox_name", - "start_date", - "end_date", - ) + @defaults_from_attrs("mailbox_name", "start_date", "end_date", "limit") def run( self, mailbox_name: str, start_date: str = None, end_date: str = None, - ): + limit: int = 10000, + nout=2, + ) -> Tuple[int, int]: """ Task for downloading data from the Outlook API to a CSV file. @@ -66,6 +64,9 @@ def run( pd.DataFrame: The API GET as a pandas DataFrame. 
""" df_inbox, df_outbox = Outlook( - mailbox_name=mailbox_name, start_date=start_date, end_date=end_date + mailbox_name=mailbox_name, + start_date=start_date, + end_date=end_date, + limit=limit, ).to_df() - return df_inbox, df_outbox + return (df_inbox, df_outbox) From f20b2521e0142e655792adee1f2a4dbf55566abd Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Fri, 29 Apr 2022 15:25:54 +0200 Subject: [PATCH 101/119] =?UTF-8?q?=E2=9C=A8=20Adding=20Outlook=20Flow=20v?= =?UTF-8?q?0.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/__init__.py | 1 + viadot/flows/outlook_to_adls.py | 118 ++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 viadot/flows/outlook_to_adls.py diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index b3a2b90ae..232dc8eaa 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -11,6 +11,7 @@ from .cloud_for_customers_report_to_adls import CloudForCustomersReportToADLS from .aselite_to_adls import ASELiteToADLS from .bigquery_to_adls import BigQueryToADLS +from .outlook_to_adls import OutlookToCSVs try: from .sap_to_duckdb import SAPToDuckDB diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py new file mode 100644 index 000000000..c04417cc2 --- /dev/null +++ b/viadot/flows/outlook_to_adls.py @@ -0,0 +1,118 @@ +import os +from typing import Any, Dict, List, Union, Tuple + +import pendulum +from prefect import Flow, Task, apply_map, task +import pandas as pd +from ..utils import slugify +from ..task_utils import df_to_csv + +from ..tasks import OutlookToDF, AzureDataLakeUpload + +file_to_adls_task = AzureDataLakeUpload() +outlook_to_df = OutlookToDF() + +# @task +# def df_to_csv_file_task(df, file_path, extension_file: str = ".csv"): +# df.to_csv(f"{file_path}{extension_file}", index=False) +COLUMN_LIST = [ + "subject", + "conversation ID", + "conversation index", + "categories", + "sender", + "unread", + "received time", +] + + +@task +def df_to_csv_file_task( + df_tuple: Tuple[pd.DataFrame, pd.DataFrame], + dir_path: str, + extension_file: str = ".csv", + # header: List[str] = COLUMN_LIST, +): + # DF_IN = pd.DataFrame(columns=header) + # DF_OUT = pd.DataFrame(columns=header) + df_in = df_tuple[0] + df_out = df_tuple[1] + # df_in = DF_IN.append(df_in, ignore_index=True) + # df_out = DF_OUT.append(df_out, ignore_index=True) + df_out.to_csv( + f"{dir_path}/Outbox{extension_file}", mode="a", index=False + ) # , header=False) + df_in.to_csv( + f"{dir_path}/Inbox{extension_file}", mode="a", index=False + ) # , header=False) + + +class OutlookToCSVs(Flow): + def __init__( + self, + mailbox_list: List[str], + name: str = None, + start_date: str = None, + end_date: str = None, + local_file_path: str = None, + extension_file: str = ".csv", + adls_dir_path: str = None, + # adls_file_path: str = None, + overwrite_adls: bool = True, + adls_sp_credentials_secret: str = None, + limit: int = 10000, + *args: List[Any], + **kwargs: Dict[str, Any], + ): + + self.mailbox_list = mailbox_list + self.start_date = start_date + self.end_date = end_date + self.limit = limit + + # AzureDataLakeUpload + self.extension_file = extension_file + self.overwrite_adls = overwrite_adls + self.adls_sp_credentials_secret = adls_sp_credentials_secret + self.dir_names = [ + mailbox.split("@")[0].replace(".", "_").replace("-", "_") + for mailbox in self.mailbox_list + ] + self.local_file_paths = [ + 
f"{dir_path}/Outbox{self.extension_file}" for dir_path in self.dir_names + ] + self.adls_file_paths = [ + f"{adls_dir_path}/{file}" for file in self.local_file_paths + ] + for dir in self.dir_names: + if not os.path.exists(dir): + os.makedirs(dir) + + super().__init__(*args, name=name, **kwargs) + + self.gen_flow() + + def gen_outlook_df( + self, mailbox_list: Union[str, List[str]], flow: Flow = None + ) -> Task: + dfs_tuple = outlook_to_df.bind( + mailbox_name=mailbox_list, + start_date=self.start_date, + end_date=self.end_date, + limit=self.limit, + flow=flow, + ) + return dfs_tuple + + def gen_flow(self) -> Flow: + df_tuples = apply_map(self.gen_outlook_df, self.mailbox_list, flow=self) + + df_to_csv_file_task.map(df_tuple=df_tuples, dir_path=self.dir_names, flow=self) + + # file_to_adls_task.map( + # from_path=self.local_file_paths, + # to_path=self.adls_file_paths, + # overwrite=self.overwrite_adls, + # sp_credentials_secret=self.adls_sp_credentials_secret, + # flow=self, + # ) From 4b88201740d8f4ed23a902fad3ab9ecc14d8c2f3 Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Thu, 5 May 2022 22:28:18 +0200 Subject: [PATCH 102/119] =?UTF-8?q?=F0=9F=8E=A8=20Changing=20dfetching=20s?= =?UTF-8?q?tructure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/outlook_to_adls.py | 89 +++++++++++++++++++++------------ viadot/sources/outlook.py | 32 +++++++++--- viadot/tasks/outlook.py | 23 ++++++--- 3 files changed, 100 insertions(+), 44 deletions(-) diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index c04417cc2..c9a8d5164 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -1,20 +1,18 @@ import os -from typing import Any, Dict, List, Union, Tuple +from typing import Any, Dict, List, Union, Tuple, Literal import pendulum from prefect import Flow, Task, apply_map, task import pandas as pd from ..utils import slugify -from ..task_utils import df_to_csv +from ..task_utils import df_to_csv, union_dfs_task from ..tasks import OutlookToDF, AzureDataLakeUpload file_to_adls_task = AzureDataLakeUpload() outlook_to_df = OutlookToDF() -# @task -# def df_to_csv_file_task(df, file_path, extension_file: str = ".csv"): -# df.to_csv(f"{file_path}{extension_file}", index=False) + COLUMN_LIST = [ "subject", "conversation ID", @@ -57,10 +55,11 @@ def __init__( local_file_path: str = None, extension_file: str = ".csv", adls_dir_path: str = None, - # adls_file_path: str = None, + adls_file_path: str = None, overwrite_adls: bool = True, adls_sp_credentials_secret: str = None, limit: int = 10000, + if_exists: Literal["append", "replace", "skip"] = "append", *args: List[Any], **kwargs: Dict[str, Any], ): @@ -69,50 +68,78 @@ def __init__( self.start_date = start_date self.end_date = end_date self.limit = limit - + self.local_file_path = local_file_path # AzureDataLakeUpload + self.adls_dir_path = adls_dir_path + self.adls_file_path = adls_file_path self.extension_file = extension_file self.overwrite_adls = overwrite_adls self.adls_sp_credentials_secret = adls_sp_credentials_secret - self.dir_names = [ - mailbox.split("@")[0].replace(".", "_").replace("-", "_") - for mailbox in self.mailbox_list - ] - self.local_file_paths = [ - f"{dir_path}/Outbox{self.extension_file}" for dir_path in self.dir_names - ] - self.adls_file_paths = [ - f"{adls_dir_path}/{file}" for file in self.local_file_paths - ] - for dir in self.dir_names: - if not os.path.exists(dir): - 
os.makedirs(dir) + # self.dir_names = [ + # mailbox.split("@")[0].replace(".", "_").replace("-", "_") + # for mailbox in self.mailbox_list + # ] + # self.local_file_paths = [ + # dir + "/" + box + self.extension_file + # for dir in self.dir_names + # for box in ["Inbox", "Outbox"] + # ] + + # self.adls_file_paths = [ + # f"{self.adls_dir_path}/{file}" for file in self.local_file_paths + # ] + # for dir in self.dir_names: + # if not os.path.exists(dir): + # os.makedirs(dir) + self.if_exsists = if_exists super().__init__(*args, name=name, **kwargs) self.gen_flow() + # def gen_outlook_df( + # self, mailbox_list: Union[str, List[str]], flow: Flow = None + # ) -> Task: + # df = outlook_to_df.bind( + # mailbox_name=mailbox_list, + # start_date=self.start_date, + # end_date=self.end_date, + # limit=self.limit, + # flow=flow, + # ) + # return df # dfs_tuple + def gen_outlook_df( self, mailbox_list: Union[str, List[str]], flow: Flow = None ) -> Task: - dfs_tuple = outlook_to_df.bind( + + report = outlook_to_df.bind( mailbox_name=mailbox_list, start_date=self.start_date, end_date=self.end_date, limit=self.limit, flow=flow, ) - return dfs_tuple + + return report def gen_flow(self) -> Flow: - df_tuples = apply_map(self.gen_outlook_df, self.mailbox_list, flow=self) - df_to_csv_file_task.map(df_tuple=df_tuples, dir_path=self.dir_names, flow=self) + dfs = apply_map(self.gen_outlook_df, self.mailbox_list, flow=self) # df_tuples + + df = union_dfs_task.bind(dfs, flow=self) + + df_to_file = df_to_csv.bind( + df=df, path=self.local_file_path, if_exists=self.if_exsists, flow=self + ) + + file_to_adls_task.bind( + from_path=self.local_file_path, + to_path=self.adls_file_path, + overwrite=self.overwrite_adls, + sp_credentials_secret=self.adls_sp_credentials_secret, + flow=self, + ) - # file_to_adls_task.map( - # from_path=self.local_file_paths, - # to_path=self.adls_file_paths, - # overwrite=self.overwrite_adls, - # sp_credentials_secret=self.adls_sp_credentials_secret, - # flow=self, - # ) + df_to_file.set_upstream(df, flow=self) + file_to_adls_task.set_upstream(df_to_file, flow=self) diff --git a/viadot/sources/outlook.py b/viadot/sources/outlook.py index e705308a4..fedfad392 100644 --- a/viadot/sources/outlook.py +++ b/viadot/sources/outlook.py @@ -15,6 +15,8 @@ def __init__( credentials: Dict[str, Any] = None, extension_file: str = ".csv", limit: int = 10000, + request_retries: int = 10, + # token_backend: str = "token_backend.txt", *args: List[Any], **kwargs: Dict[str, Any], ): @@ -38,6 +40,8 @@ def __init__( except KeyError: DEFAULT_CREDENTIALS = None + # self.token_backend = token_backend + self.request_retries = request_retries self.credentials = credentials or DEFAULT_CREDENTIALS self.extension_file = extension_file self.mailbox_name = mailbox_name @@ -48,6 +52,8 @@ def __init__( auth_flow_type="credentials", tenant_id=self.credentials["tenant_id"], main_resource=self.mailbox_name, + request_retries=self.request_retries, + # token_backend=self.token_backend, ) if self.account.authenticate(): print(f"{self.mailbox_name} Authenticated!") @@ -61,7 +67,8 @@ def __init__( def to_df(self): date_range_end_time = datetime.datetime.strptime(self.end_date, "%Y-%m-%d") date_range_start_time = datetime.datetime.strptime(self.start_date, "%Y-%m-%d") - data_list_outbox, data_list_inbox = [], [] + data = [] + # data_list_outbox, data_list_inbox = [], [] while True: try: @@ -87,6 +94,7 @@ def to_df(self): categories = "" if message.categories: categories = ", ".join(c for c in message.categories) + row = { "subject": 
fetched.get("subject"), "conversation ID": fetched.get("conversationId"), @@ -97,18 +105,30 @@ def to_df(self): "read": fetched.get("isRead"), "received time": fetched.get("receivedDateTime"), } + row["mail adress"] = ( + self.mailbox_name.split("@")[0] + .replace(".", "_") + .replace("-", "_") + ) + if sender_mail == self.mailbox_name: - data_list_outbox.append(row) + row["Inbox"] = False + # data_list_outbox.append(row) else: - data_list_inbox.append(row) + row["Inbox"] = True + # data_list_inbox.append(row) + + data.append(row) except KeyError: print(f"KeyError - nie ma w:") except StopIteration: break - df_inbox = pd.DataFrame(data=data_list_inbox) - df_outbox = pd.DataFrame(data=data_list_outbox) + df = pd.DataFrame(data=data) + + # df_inbox = pd.DataFrame(data=data_list_inbox) + # df_outbox = pd.DataFrame(data=data_list_outbox) - return df_inbox, df_outbox + return df # df_inbox, df_outbox def to_csv(self): df_inbox, df_outbox = self.to_df() diff --git a/viadot/tasks/outlook.py b/viadot/tasks/outlook.py index 15a14ec92..c86cd8692 100644 --- a/viadot/tasks/outlook.py +++ b/viadot/tasks/outlook.py @@ -3,9 +3,12 @@ from typing import Any, Dict, List, Tuple from prefect import Task from prefect.utilities.tasks import defaults_from_attrs +from prefect.utilities import logging from ..sources import Outlook +logger = logging.get_logger() + class OutlookToDF(Task): def __init__( @@ -17,7 +20,7 @@ def __init__( extension_file: str = ".csv", limit: int = 10000, *args: List[Any], - **kwargs: Dict[str, Any] + **kwargs: Dict[str, Any], ): self.mailbox_name = mailbox_name @@ -50,8 +53,8 @@ def run( start_date: str = None, end_date: str = None, limit: int = 10000, - nout=2, - ) -> Tuple[int, int]: + # nout: int = 2, + ) -> pd.DataFrame: # Tuple[int, int]: """ Task for downloading data from the Outlook API to a CSV file. @@ -59,14 +62,20 @@ def run( mailbox_name (str): Mailbox name. start_date (str, optional): A filtering start date parameter e.g. "2022-01-01". Defaults to None. end_date (str, optional): A filtering end date parameter e.g. "2022-01-02". Defaults to None. + limit (str, optional): A limit to access last top messages. Defaults to 10_000. Returns: - pd.DataFrame: The API GET as a pandas DataFrame. + pd.DataFrame: The API GET as a pandas DataFrames from Outlook inbox and outbox, respectively. """ - df_inbox, df_outbox = Outlook( + outlook = Outlook( mailbox_name=mailbox_name, start_date=start_date, end_date=end_date, limit=limit, - ).to_df() - return (df_inbox, df_outbox) + ) + df = outlook.to_df() + # df_inbox, df_outbox = outlook.to_df() + logger.info( + f"Downloaded the data from the '{outlook.mailbox_name}' into the Data Frame." 
+ ) + return df # (df_inbox, df_outbox) From a3b2669e23dcecb83708a3c3eb0d3379ad712537 Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Thu, 5 May 2022 22:31:24 +0200 Subject: [PATCH 103/119] =?UTF-8?q?=F0=9F=93=9D=20Removing=20coments=20fro?= =?UTF-8?q?m=20v0.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/outlook_to_adls.py | 63 ++------------------------------- viadot/sources/outlook.py | 12 +------ viadot/tasks/outlook.py | 7 ++-- 3 files changed, 6 insertions(+), 76 deletions(-) diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index c9a8d5164..b528b1c9d 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict, List, Union, Tuple, Literal +from typing import Any, Dict, List, Union, Literal import pendulum from prefect import Flow, Task, apply_map, task @@ -13,38 +13,6 @@ outlook_to_df = OutlookToDF() -COLUMN_LIST = [ - "subject", - "conversation ID", - "conversation index", - "categories", - "sender", - "unread", - "received time", -] - - -@task -def df_to_csv_file_task( - df_tuple: Tuple[pd.DataFrame, pd.DataFrame], - dir_path: str, - extension_file: str = ".csv", - # header: List[str] = COLUMN_LIST, -): - # DF_IN = pd.DataFrame(columns=header) - # DF_OUT = pd.DataFrame(columns=header) - df_in = df_tuple[0] - df_out = df_tuple[1] - # df_in = DF_IN.append(df_in, ignore_index=True) - # df_out = DF_OUT.append(df_out, ignore_index=True) - df_out.to_csv( - f"{dir_path}/Outbox{extension_file}", mode="a", index=False - ) # , header=False) - df_in.to_csv( - f"{dir_path}/Inbox{extension_file}", mode="a", index=False - ) # , header=False) - - class OutlookToCSVs(Flow): def __init__( self, @@ -75,40 +43,13 @@ def __init__( self.extension_file = extension_file self.overwrite_adls = overwrite_adls self.adls_sp_credentials_secret = adls_sp_credentials_secret - # self.dir_names = [ - # mailbox.split("@")[0].replace(".", "_").replace("-", "_") - # for mailbox in self.mailbox_list - # ] - # self.local_file_paths = [ - # dir + "/" + box + self.extension_file - # for dir in self.dir_names - # for box in ["Inbox", "Outbox"] - # ] - - # self.adls_file_paths = [ - # f"{self.adls_dir_path}/{file}" for file in self.local_file_paths - # ] - # for dir in self.dir_names: - # if not os.path.exists(dir): - # os.makedirs(dir) + self.if_exsists = if_exists super().__init__(*args, name=name, **kwargs) self.gen_flow() - # def gen_outlook_df( - # self, mailbox_list: Union[str, List[str]], flow: Flow = None - # ) -> Task: - # df = outlook_to_df.bind( - # mailbox_name=mailbox_list, - # start_date=self.start_date, - # end_date=self.end_date, - # limit=self.limit, - # flow=flow, - # ) - # return df # dfs_tuple - def gen_outlook_df( self, mailbox_list: Union[str, List[str]], flow: Flow = None ) -> Task: diff --git a/viadot/sources/outlook.py b/viadot/sources/outlook.py index fedfad392..5e095a2e4 100644 --- a/viadot/sources/outlook.py +++ b/viadot/sources/outlook.py @@ -16,7 +16,6 @@ def __init__( extension_file: str = ".csv", limit: int = 10000, request_retries: int = 10, - # token_backend: str = "token_backend.txt", *args: List[Any], **kwargs: Dict[str, Any], ): @@ -40,7 +39,6 @@ def __init__( except KeyError: DEFAULT_CREDENTIALS = None - # self.token_backend = token_backend self.request_retries = request_retries self.credentials = credentials or DEFAULT_CREDENTIALS self.extension_file = extension_file @@ 
-53,7 +51,6 @@ def __init__( tenant_id=self.credentials["tenant_id"], main_resource=self.mailbox_name, request_retries=self.request_retries, - # token_backend=self.token_backend, ) if self.account.authenticate(): print(f"{self.mailbox_name} Authenticated!") @@ -68,7 +65,6 @@ def to_df(self): date_range_end_time = datetime.datetime.strptime(self.end_date, "%Y-%m-%d") date_range_start_time = datetime.datetime.strptime(self.start_date, "%Y-%m-%d") data = [] - # data_list_outbox, data_list_inbox = [], [] while True: try: @@ -110,13 +106,10 @@ def to_df(self): .replace(".", "_") .replace("-", "_") ) - if sender_mail == self.mailbox_name: row["Inbox"] = False - # data_list_outbox.append(row) else: row["Inbox"] = True - # data_list_inbox.append(row) data.append(row) except KeyError: @@ -125,10 +118,7 @@ def to_df(self): break df = pd.DataFrame(data=data) - # df_inbox = pd.DataFrame(data=data_list_inbox) - # df_outbox = pd.DataFrame(data=data_list_outbox) - - return df # df_inbox, df_outbox + return df def to_csv(self): df_inbox, df_outbox = self.to_df() diff --git a/viadot/tasks/outlook.py b/viadot/tasks/outlook.py index c86cd8692..61aab63b9 100644 --- a/viadot/tasks/outlook.py +++ b/viadot/tasks/outlook.py @@ -53,8 +53,7 @@ def run( start_date: str = None, end_date: str = None, limit: int = 10000, - # nout: int = 2, - ) -> pd.DataFrame: # Tuple[int, int]: + ) -> pd.DataFrame: """ Task for downloading data from the Outlook API to a CSV file. @@ -74,8 +73,8 @@ def run( limit=limit, ) df = outlook.to_df() - # df_inbox, df_outbox = outlook.to_df() + logger.info( f"Downloaded the data from the '{outlook.mailbox_name}' into the Data Frame." ) - return df # (df_inbox, df_outbox) + return df From 4ec12881f7776987c2229b3e28af07d3010fa65f Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Fri, 6 May 2022 09:37:23 +0200 Subject: [PATCH 104/119] =?UTF-8?q?=F0=9F=93=9D=20Adding=20docstring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/outlook_to_adls.py | 25 ++++++++++++++++++++----- viadot/tasks/outlook.py | 2 +- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index b528b1c9d..353365c5a 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -22,7 +22,6 @@ def __init__( end_date: str = None, local_file_path: str = None, extension_file: str = ".csv", - adls_dir_path: str = None, adls_file_path: str = None, overwrite_adls: bool = True, adls_sp_credentials_secret: str = None, @@ -31,21 +30,37 @@ def __init__( *args: List[Any], **kwargs: Dict[str, Any], ): + """Flow for downloading data from Outlook source to a local CSV + using Outlook API, then uploading it to Azure Data Lake. + + Args: + mailbox_list (List[str]): Mailbox name. + name (str, optional): The name of the flow. Defaults to None. + start_date (str, optional): A filtering start date parameter e.g. "2022-01-01". Defaults to None. + end_date (str, optional): A filtering end date parameter e.g. "2022-01-02". Defaults to None. + local_file_path (str, optional): Local destination path. Defaults to None. + extension_file (str, optional): Output file extension. Defaults to ".csv". + adls_file_path (str, optional): Azure Data Lake destination file path. Defaults to None. + overwrite_adls (bool, optional): Whether to overwrite the file in ADLS. Defaults to True. 
+ adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with + ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. + limit (int, optional): Number of fetched top messages. Defaults to 10000. + if_exists (Literal['append', 'replace', 'skip'], optional): What to do if the local file already exists. Defaults to "append". + """ self.mailbox_list = mailbox_list self.start_date = start_date self.end_date = end_date self.limit = limit self.local_file_path = local_file_path + self.if_exsists = if_exists + # AzureDataLakeUpload - self.adls_dir_path = adls_dir_path self.adls_file_path = adls_file_path self.extension_file = extension_file self.overwrite_adls = overwrite_adls self.adls_sp_credentials_secret = adls_sp_credentials_secret - self.if_exsists = if_exists - super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -66,7 +81,7 @@ def gen_outlook_df( def gen_flow(self) -> Flow: - dfs = apply_map(self.gen_outlook_df, self.mailbox_list, flow=self) # df_tuples + dfs = apply_map(self.gen_outlook_df, self.mailbox_list, flow=self) df = union_dfs_task.bind(dfs, flow=self) diff --git a/viadot/tasks/outlook.py b/viadot/tasks/outlook.py index 61aab63b9..a72c78cc7 100644 --- a/viadot/tasks/outlook.py +++ b/viadot/tasks/outlook.py @@ -64,7 +64,7 @@ def run( limit (str, optional): A limit to access last top messages. Defaults to 10_000. Returns: - pd.DataFrame: The API GET as a pandas DataFrames from Outlook inbox and outbox, respectively. + pd.DataFrame: The API GET as a pandas DataFrames from Outlook. """ outlook = Outlook( mailbox_name=mailbox_name, From 179324377ec8f6b030a2d64a3f6e68eaa73754f8 Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Mon, 9 May 2022 13:35:42 +0200 Subject: [PATCH 105/119] =?UTF-8?q?=F0=9F=93=9D=20Changing=20Flow=20name?= =?UTF-8?q?=20and=20to=5Fcsv=20method?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/__init__.py | 2 +- viadot/flows/outlook_to_adls.py | 6 +++--- viadot/sources/outlook.py | 5 ++--- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index 232dc8eaa..35a1d66dc 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -11,7 +11,7 @@ from .cloud_for_customers_report_to_adls import CloudForCustomersReportToADLS from .aselite_to_adls import ASELiteToADLS from .bigquery_to_adls import BigQueryToADLS -from .outlook_to_adls import OutlookToCSVs +from .outlook_to_adls import OutlookToADLS try: from .sap_to_duckdb import SAPToDuckDB diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index 353365c5a..c6bd0042f 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -13,7 +13,7 @@ outlook_to_df = OutlookToDF() -class OutlookToCSVs(Flow): +class OutlookToADLS(Flow): def __init__( self, mailbox_list: List[str], @@ -69,7 +69,7 @@ def gen_outlook_df( self, mailbox_list: Union[str, List[str]], flow: Flow = None ) -> Task: - report = outlook_to_df.bind( + df = outlook_to_df.bind( mailbox_name=mailbox_list, start_date=self.start_date, end_date=self.end_date, @@ -77,7 +77,7 @@ def gen_outlook_df( flow=flow, ) - return report + return df def gen_flow(self) -> Flow: diff --git a/viadot/sources/outlook.py b/viadot/sources/outlook.py index 5e095a2e4..32f2293a2 100644 --- a/viadot/sources/outlook.py +++ 
b/viadot/sources/outlook.py @@ -121,7 +121,6 @@ def to_df(self): return df def to_csv(self): - df_inbox, df_outbox = self.to_df() + df = self.to_df() file_name = self.mailbox_name.split("@")[0].replace(".", "_").replace("-", "_") - df_inbox.to_csv(f"{file_name}_Inbox{self.extension_file}", index=False) - df_outbox.to_csv(f"{file_name}_Outbox{self.extension_file}", index=False) + df.to_csv(f"{file_name}{self.extension_file}", index=False) From 6ee6a8f620d639c4c52c4122dce2febfcf791124 Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Mon, 9 May 2022 13:44:37 +0200 Subject: [PATCH 106/119] =?UTF-8?q?=E2=9C=85=20Adding=20test=20for=20Outlo?= =?UTF-8?q?ok=20to=5Fdf=20method?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_outlook.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 tests/integration/test_outlook.py diff --git a/tests/integration/test_outlook.py b/tests/integration/test_outlook.py new file mode 100644 index 000000000..d3aa655ef --- /dev/null +++ b/tests/integration/test_outlook.py @@ -0,0 +1,12 @@ +from viadot.sources import Outlook + + +def test_outlook_to_df(): + outlook = Outlook( + mailbox_name="bulgaria@velux.com", + start_date="2022-04-28", + end_date="2022-04-29", + ) + df = outlook.to_df() + assert df.shape[1] == 10 + assert df.empty == False From b715960be23605530e288bc0548955799c74fb6e Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Wed, 8 Jun 2022 15:19:26 +0200 Subject: [PATCH 107/119] changing CHANGELOG.md --- CHANGELOG.md | 131 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 86 insertions(+), 45 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de4c51ecc..27ffa595f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,26 +1,22 @@ # Changelog + All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] -### Added -- Enabled Databricks Connect in the image. To enable, [follow this guide](./README.md#executing-spark-jobs) -- Added `MySQL` source and `MySqlToADLS` flow -- Added `SQLServerToDF` task -- Added `SQLServerToDuckDB` flow which downloads data from SQLServer table, loads it to parquet file and then uplads it do DuckDB -- Added complete proxy set up in `SAPRFC` example (`viadot/examples/sap_rfc`) -- Added Databricks/Spark setup to the image. See README for setup & usage instructions. -### Changed -- Changed default name for the Prefect secret holding the name of the Azure KV secret storing Sendgrid credentials +### Added +-Added `Outlook` source +-Added `Outlook` task +-Added `Outlook` flow ## [0.4.3] - 2022-04-28 + ### Added -- Added `func` parameter to `SAPRFC` -- Added `SAPRFCToADLS` flow which downloads data from SAP Database to to a pandas DataFrame, exports df to csv and uploads it to Azure Data Lake. + - Added `adls_file_name` in `SupermetricsToADLS` and `SharepointToADLS` flows - Added `BigQueryToADLS` flow class which anables extract data from BigQuery. 
- Added `Salesforce` source @@ -29,38 +25,44 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added C4C secret handling to `CloudForCustomersReportToADLS` flow (`c4c_credentials_secret` parameter) ### Fixed + - Fixed `get_flow_last_run_date()` incorrectly parsing the date - Fixed C4C secret handling (tasks now correctly read the secret as the credentials, rather than assuming the secret is a container for credentials for all environments and trying to access specific key inside it). In other words, tasks now assume the secret holds credentials, rather than a dict of the form `{env: credentials, env2: credentials2}` - Fixed `utils.gen_bulk_insert_query_from_df()` failing with > 1000 rows due to INSERT clause limit by chunking the data into multiple INSERTs - Fixed `get_flow_last_run_date()` incorrectly parsing the date - Fixed `MultipleFlows` when one flow is passed and when last flow fails. - ## [0.4.2] - 2022-04-08 + ### Added + - Added `AzureDataLakeRemove` task ### Changed + - Changed name of task file from `prefect` to `prefect_date_range` ### Fixed -- Fixed out of range issue in `prefect_date_range` +- Fixed out of range issue in `prefect_date_range` ## [0.4.1] - 2022-04-07 + ### Changed -- bumped version +- bumped version ## [0.4.0] - 2022-04-07 + ### Added -- Added `custom_mail_state_handler` task that sends email notification using a custom SMTP server. + +- Added `custom_mail_state_handler` function that sends mail notification using custom smtp server. - Added new function `df_clean_column` that cleans data frame columns from special characters - Added `df_clean_column` util task that removes special characters from a pandas DataFrame - Added `MultipleFlows` flow class which enables running multiple flows in a given order. - Added `GetFlowNewDateRange` task to change date range based on Prefect flows - Added `check_col_order` parameter in `ADLSToAzureSQL` -- Added new source `ASElite` +- Added new source `ASElite` - Added KeyVault support in `CloudForCustomers` tasks - Added `SQLServer` source - Added `DuckDBToDF` task @@ -81,15 +83,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `AzureSQLUpsert` task ### Changed + - Changed the base class of `AzureSQL` to `SQLServer` - `df_to_parquet()` task now creates directories if needed - Added several more separators to check for automatically in `SAPRFC.to_df()` - Upgraded `duckdb` version to 0.3.2 ### Fixed + - Fixed bug with `CheckColumnOrder` task - Fixed OpenSSL config for old SQL Servers still using TLS < 1.2 -- `BCPTask` now correctly handles custom SQL Server port +- `BCPTask` now correctly handles custom SQL Server port - Fixed `SAPRFC.to_df()` ignoring user-specified separator - Fixed temporary CSV generated by the `DuckDBToSQLServer` flow not being cleaned up - Fixed some mappings in `get_sql_dtypes_from_df()` and optimized performance @@ -99,25 +103,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed `SQL.to_df()` incorrectly handling queries that begin with whitespace ### Removed + - Removed `autopick_sep` parameter from `SAPRFC` functions. The separator is now always picked automatically if not provided. - Removed `dtypes_to_json` task to task_utils.py - ## [0.3.2] - 2022-02-17 + ### Fixed -- fixed an issue with schema info within `CheckColumnOrder` class. +- fixed an issue with schema info within `CheckColumnOrder` class. 
## [0.3.1] - 2022-02-17 + ### Changed --`ADLSToAzureSQL` - added `remove_tab` parameter to remove uncessery tab separators from data. + +-`ADLSToAzureSQL` - added `remove_tab` parameter to remove uncessery tab separators from data. ### Fixed -- fixed an issue with return df within `CheckColumnOrder` class. +- fixed an issue with return df within `CheckColumnOrder` class. ## [0.3.0] - 2022-02-16 + ### Added + - new source `SAPRFC` for connecting with SAP using the `pyRFC` library (requires pyrfc as well as the SAP NW RFC library that can be downloaded [here](https://support.sap.com/en/product/connectors/nwrfcsdk.html) - new source `DuckDB` for connecting with the `DuckDB` database - new task `SAPRFCToDF` for loading data from SAP to a pandas DataFrame @@ -130,49 +139,57 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - KeyVault support in `CloudForCustomers` tasks ### Changed + - pinned Prefect version to 0.15.11 - `df_to_csv` now creates dirs if they don't exist - `ADLSToAzureSQL` - when data in csv coulmns has unnecessary "\t" then removes them ### Fixed + - fixed an issue with duckdb calls seeing initial db snapshot instead of the updated state (#282) - C4C connection with url and report_url optimization - column mapper in C4C source - ## [0.2.15] - 2022-01-12 + ### Added + - new option to `ADLSToAzureSQL` Flow - `if_exists="delete"` - `SQL` source: `create_table()` already handles `if_exists`; now it handles a new option for `if_exists()` - `C4CToDF` and `C4CReportToDF` tasks are provided as a class instead of function +### Fixed -### Fixed - Appending issue within CloudForCustomers source - An early return bug in `UKCarbonIntensity` in `to_df` method - ## [0.2.14] - 2021-12-01 + ### Fixed -- authorization issue within `CloudForCustomers` source +- authorization issue within `CloudForCustomers` source ## [0.2.13] - 2021-11-30 + ### Added + - Added support for file path to `CloudForCustomersReportToADLS` flow - Added `flow_of_flows` list handling - Added support for JSON files in `AzureDataLakeToDF` ### Fixed + - `Supermetrics` source: `to_df()` now correctly handles `if_empty` in case of empty results ### Changed + - `Sharepoint` and `CloudForCustomers` sources will now provide an informative `CredentialError` which is also raised early. This will make issues with input credenials immediately clear to the user. - Removed set_key_value from `CloudForCustomersReportToADLS` flow - ## [0.2.12] - 2021-11-25 + ### Added + - Added `Sharepoint` source - Added `SharepointToDF` task - Added `SharepointToADLS` flow @@ -184,35 +201,39 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `df_to_parquet` task to task_utils.py - Added `dtypes_to_json` task to task_utils.py - ## [0.2.11] - 2021-10-30 + ### Fixed -- `ADLSToAzureSQL` - fixed path to csv issue. -- `SupermetricsToADLS` - fixed local json path issue. +- `ADLSToAzureSQL` - fixed path to csv issue. +- `SupermetricsToADLS` - fixed local json path issue. ## [0.2.10] - 2021-10-29 -### Release due to CI/CD error +### Release due to CI/CD error ## [0.2.9] - 2021-10-29 -### Release due to CI/CD error +### Release due to CI/CD error ## [0.2.8] - 2021-10-29 + ### Changed + - CI/CD: `dev` image is now only published on push to the `dev` branch -- Docker: +- Docker: - updated registry links to use the new `ghcr.io` domain - `run.sh` now also accepts the `-t` option. When run in standard mode, it will only spin up the `viadot_jupyter_lab` service. 
When ran with `-t dev`, it will also spin up `viadot_testing` and `viadot_docs` containers. ### Fixed -- ADLSToAzureSQL - fixed path parameter issue. +- ADLSToAzureSQL - fixed path parameter issue. ## [0.2.7] - 2021-10-04 + ### Added + - Added `SQLiteQuery` task - Added `CloudForCustomers` source - Added `CloudForCustomersToDF` and `CloudForCustomersToCSV` tasks @@ -222,40 +243,47 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added local setup and commands to the `README` ### Changed + - Changed CI/CD algorithm - the `latest` Docker image is now only updated on release and is the same exact image as the latest release - the `dev` image is released only on pushes and PRs to the `dev` branch (so dev branch = dev image) - Modified `ADLSToAzureSQL` - *read_sep* and *write_sep* parameters added to the flow. ### Fixed -- Fixed `ADLSToAzureSQL` breaking in `"append"` mode if the table didn't exist (#145). -- Fixed `ADLSToAzureSQL` breaking in promotion path for csv files. +- Fixed `ADLSToAzureSQL` breaking in `"append"` mode if the table didn't exist (#145). +- Fixed `ADLSToAzureSQL` breaking in promotion path for csv files. ## [0.2.6] - 2021-09-22 + ### Added + - Added flows library docs to the references page ### Changed + - Moved task library docs page to topbar - Updated docs for task and flows - ## [0.2.5] - 2021-09-20 + ### Added + - Added `start` and `end_date` parameters to `SupermetricsToADLS` flow - Added a tutorial on how to pull data from `Supermetrics` - ## [0.2.4] - 2021-09-06 + ### Added + - Added documentation (both docstrings and MKDocs docs) for multiple tasks - Added `start_date` and `end_date` parameters to the `SupermetricsToAzureSQL` flow - Added a temporary workaround `df_to_csv_task` task to the `SupermetricsToADLS` flow to handle mixed dtype columns not handled automatically by DataFrame's `to_parquet()` method - ## [0.2.3] - 2021-08-19 + ### Changed + - Modified `RunGreatExpectationsValidation` task to use the built in support for evaluation parameters added in Prefect v0.15.3 - Modified `SupermetricsToADLS` and `ADLSGen1ToAzureSQLNew` flows to align with this [recipe](https://docs.prefect.io/orchestration/flow_config/storage.html#loading-additional-files-with-git-storage) for reading the expectation suite JSON The suite now has to be loaded before flow initialization in the flow's python file and passed as an argument to the flow's constructor. @@ -264,6 +292,7 @@ Great Expectations project directory, which was confusing. 
The project directory - Changed the logging of docs URL for `RunGreatExpectationsValidation` task to use GE's recipe from [the docs](https://docs.greatexpectations.io/docs/guides/validation/advanced/how_to_implement_custom_notifications/) ### Added + - Added a test for `SupermetricsToADLS` flow -Added a test for `AzureDataLakeList` task - Added PR template for new PRs @@ -275,12 +304,14 @@ This allows the user to simply pass a dict with their expectations and not worry - Added `keep_validation_output` parameter and `cleanup_validation_clutter` task to the `SupermetricsToADLS` flow to control Great Expectations output to the filesystem ### Removed + - Removed `SupermetricsToAzureSQLv2` and `SupermetricsToAzureSQLv3` flows - Removed `geopy` dependency - ## [0.2.2] - 2021-07-27 + ### Added + - Added support for parquet in `AzureDataLakeToDF` - Added proper logging to the `RunGreatExpectationsValidation` task - Added the `viz` Prefect extra to requirements to allow flow visualizaion @@ -289,36 +320,42 @@ This allows the user to simply pass a dict with their expectations and not worry - Tasks: - `AzureDataLakeList` - for listing files in an ADLS directory - Flows: - - `ADLSToAzureSQL` - promoting files to conformed, operations, + - `ADLSToAzureSQL` - promoting files to conformed, operations, creating an SQL table and inserting the data into it - `ADLSContainerToContainer` - copying files between ADLS containers ### Changed + - Renamed `ReadAzureKeyVaultSecret` and `RunAzureSQLDBQuery` tasks to match Prefect naming style - Flows: - - `SupermetricsToADLS` - changed csv to parquet file extension. File and schema info are loaded to the `RAW` container. + - `SupermetricsToADLS` - changed csv to parquet file extension. File and schema info are loaded to the `RAW` container. 
### Fixed -- Removed the broken version autobump from CI +- Removed the broken version autobump from CI ## [0.2.1] - 2021-07-14 + ### Added + - Flows: - `SupermetricsToADLS` - supporting immutable ADLS setup ### Changed -- A default value for the `ds_user` parameter in `SupermetricsToAzureSQLv3` can now be + +- A default value for the `ds_user` parameter in `SupermetricsToAzureSQLv3` can now be specified in the `SUPERMETRICS_DEFAULT_USER` secret - Updated multiple dependencies ### Fixed + - Fixed "Local run of `SupermetricsToAzureSQLv3` skips all tasks after `union_dfs_task`" (#59) - Fixed the `release` GitHub action - ## [0.2.0] - 2021-07-12 + ### Added + - Sources: - `AzureDataLake` (supports gen1 & gen2) - `SQLite` @@ -354,17 +391,21 @@ specified in the `SUPERMETRICS_DEFAULT_USER` secret - Supermetrics Google Ads extract ### Changed + - Tasks now use secrets for credential management (azure tasks use Azure Key Vault secrets) - SQL source now has a default query timeout of 1 hour ### Fixed + - Fix `SQLite` tests - Multiple stability improvements with retries and timeouts - ## [0.1.12] - 2021-05-08 + ### Changed + - Moved from poetry to pip ### Fixed + - Fix `AzureBlobStorage`'s `to_storage()` method is missing the final upload blob part From b8ef150380a3b463511de178644837a3c387f62a Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Wed, 8 Jun 2022 15:24:02 +0200 Subject: [PATCH 108/119] updating requirements.txt --- requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/requirements.txt b/requirements.txt index 33caca751..3761dfc1e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,11 @@ duckdb==0.3.2 google-auth==2.6.2 sendgrid==6.9.7 pandas-gbq==0.17.4 +<<<<<<< HEAD PyMySQL==1.0.2 paramiko==2.11.0 sshtunnel==0.4.0 databricks-connect==10.4.0b0 +======= +O365==2.0.18.1 +>>>>>>> 959bf71 (🔧 Updating requirements) From 8b8f0ed966bf9ee5644b234a089b9a70deb2825b Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Thu, 2 Jun 2022 09:48:35 +0200 Subject: [PATCH 109/119] =?UTF-8?q?=E2=9C=85=20Adding=20env=20var=20in=20t?= =?UTF-8?q?est?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_outlook.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_outlook.py b/tests/integration/test_outlook.py index d3aa655ef..51542b6db 100644 --- a/tests/integration/test_outlook.py +++ b/tests/integration/test_outlook.py @@ -1,9 +1,11 @@ from viadot.sources import Outlook +from viadot.config import local_config def test_outlook_to_df(): + outlook_env_vars = local_config.get("OUTLOOK") outlook = Outlook( - mailbox_name="bulgaria@velux.com", + mailbox_name=outlook_env_vars["mail_example"], start_date="2022-04-28", end_date="2022-04-29", ) From 9c0a710cf6f9f03cfe6b8ff6bfc94c2a4ebe93b0 Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Thu, 2 Jun 2022 10:06:46 +0200 Subject: [PATCH 110/119] =?UTF-8?q?Revert=20"=F0=9F=93=9D=20Updating=20the?= =?UTF-8?q?=20CHANGELOG"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 808cad1edf7a1612e716a9981665ad067542f6bc. 
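As an aside to the test change in PATCH 109 above (reading the mailbox from `local_config["OUTLOOK"]` instead of hard-coding an address), here is a minimal sketch of the same pattern with a skip guard for machines that lack that config entry. The `skipif` guard, the `OUTLOOK_CONFIG` variable and the test name are illustrative assumptions, not content of any patch in this series:

    import pytest

    from viadot.config import local_config
    from viadot.sources import Outlook

    # Read the whole OUTLOOK block once; it may be absent on CI machines.
    OUTLOOK_CONFIG = local_config.get("OUTLOOK")


    @pytest.mark.skipif(OUTLOOK_CONFIG is None, reason="no OUTLOOK entry in local_config")
    def test_outlook_to_df_guarded():
        # Same call pattern as the integration test added in PATCH 109,
        # but skipped cleanly instead of failing when credentials are missing.
        outlook = Outlook(
            mailbox_name=OUTLOOK_CONFIG["mail_example"],
            start_date="2022-04-28",
            end_date="2022-04-29",
        )
        df = outlook.to_df()
        assert not df.empty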
--- CHANGELOG.md | 120 ++++++++++++++------------------------------------- 1 file changed, 32 insertions(+), 88 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27ffa595f..c7a19b46d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,22 +1,12 @@ # Changelog - All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] - -### Added - --Added `Outlook` source --Added `Outlook` task --Added `Outlook` flow - ## [0.4.3] - 2022-04-28 - ### Added - - Added `adls_file_name` in `SupermetricsToADLS` and `SharepointToADLS` flows - Added `BigQueryToADLS` flow class which anables extract data from BigQuery. - Added `Salesforce` source @@ -25,7 +15,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added C4C secret handling to `CloudForCustomersReportToADLS` flow (`c4c_credentials_secret` parameter) ### Fixed - - Fixed `get_flow_last_run_date()` incorrectly parsing the date - Fixed C4C secret handling (tasks now correctly read the secret as the credentials, rather than assuming the secret is a container for credentials for all environments and trying to access specific key inside it). In other words, tasks now assume the secret holds credentials, rather than a dict of the form `{env: credentials, env2: credentials2}` - Fixed `utils.gen_bulk_insert_query_from_df()` failing with > 1000 rows due to INSERT clause limit by chunking the data into multiple INSERTs @@ -33,36 +22,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed `MultipleFlows` when one flow is passed and when last flow fails. ## [0.4.2] - 2022-04-08 - ### Added - - Added `AzureDataLakeRemove` task ### Changed - - Changed name of task file from `prefect` to `prefect_date_range` ### Fixed - - Fixed out of range issue in `prefect_date_range` -## [0.4.1] - 2022-04-07 +## [0.4.1] - 2022-04-07 ### Changed - - bumped version -## [0.4.0] - 2022-04-07 +## [0.4.0] - 2022-04-07 ### Added - - Added `custom_mail_state_handler` function that sends mail notification using custom smtp server. - Added new function `df_clean_column` that cleans data frame columns from special characters - Added `df_clean_column` util task that removes special characters from a pandas DataFrame - Added `MultipleFlows` flow class which enables running multiple flows in a given order. 
- Added `GetFlowNewDateRange` task to change date range based on Prefect flows - Added `check_col_order` parameter in `ADLSToAzureSQL` -- Added new source `ASElite` +- Added new source `ASElite` - Added KeyVault support in `CloudForCustomers` tasks - Added `SQLServer` source - Added `DuckDBToDF` task @@ -83,17 +66,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `AzureSQLUpsert` task ### Changed - - Changed the base class of `AzureSQL` to `SQLServer` - `df_to_parquet()` task now creates directories if needed - Added several more separators to check for automatically in `SAPRFC.to_df()` - Upgraded `duckdb` version to 0.3.2 ### Fixed - - Fixed bug with `CheckColumnOrder` task - Fixed OpenSSL config for old SQL Servers still using TLS < 1.2 -- `BCPTask` now correctly handles custom SQL Server port +- `BCPTask` now correctly handles custom SQL Server port - Fixed `SAPRFC.to_df()` ignoring user-specified separator - Fixed temporary CSV generated by the `DuckDBToSQLServer` flow not being cleaned up - Fixed some mappings in `get_sql_dtypes_from_df()` and optimized performance @@ -103,30 +84,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed `SQL.to_df()` incorrectly handling queries that begin with whitespace ### Removed - - Removed `autopick_sep` parameter from `SAPRFC` functions. The separator is now always picked automatically if not provided. - Removed `dtypes_to_json` task to task_utils.py -## [0.3.2] - 2022-02-17 +## [0.3.2] - 2022-02-17 ### Fixed +- fixed an issue with schema info within `CheckColumnOrder` class. -- fixed an issue with schema info within `CheckColumnOrder` class. ## [0.3.1] - 2022-02-17 - ### Changed - --`ADLSToAzureSQL` - added `remove_tab` parameter to remove uncessery tab separators from data. +-`ADLSToAzureSQL` - added `remove_tab` parameter to remove uncessery tab separators from data. ### Fixed +- fixed an issue with return df within `CheckColumnOrder` class. -- fixed an issue with return df within `CheckColumnOrder` class. 
## [0.3.0] - 2022-02-16 - ### Added - - new source `SAPRFC` for connecting with SAP using the `pyRFC` library (requires pyrfc as well as the SAP NW RFC library that can be downloaded [here](https://support.sap.com/en/product/connectors/nwrfcsdk.html) - new source `DuckDB` for connecting with the `DuckDB` database - new task `SAPRFCToDF` for loading data from SAP to a pandas DataFrame @@ -139,57 +115,49 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - KeyVault support in `CloudForCustomers` tasks ### Changed - - pinned Prefect version to 0.15.11 - `df_to_csv` now creates dirs if they don't exist - `ADLSToAzureSQL` - when data in csv coulmns has unnecessary "\t" then removes them ### Fixed - - fixed an issue with duckdb calls seeing initial db snapshot instead of the updated state (#282) - C4C connection with url and report_url optimization - column mapper in C4C source -## [0.2.15] - 2022-01-12 +## [0.2.15] - 2022-01-12 ### Added - - new option to `ADLSToAzureSQL` Flow - `if_exists="delete"` - `SQL` source: `create_table()` already handles `if_exists`; now it handles a new option for `if_exists()` - `C4CToDF` and `C4CReportToDF` tasks are provided as a class instead of function -### Fixed +### Fixed - Appending issue within CloudForCustomers source - An early return bug in `UKCarbonIntensity` in `to_df` method -## [0.2.14] - 2021-12-01 +## [0.2.14] - 2021-12-01 ### Fixed - - authorization issue within `CloudForCustomers` source -## [0.2.13] - 2021-11-30 +## [0.2.13] - 2021-11-30 ### Added - - Added support for file path to `CloudForCustomersReportToADLS` flow - Added `flow_of_flows` list handling - Added support for JSON files in `AzureDataLakeToDF` ### Fixed - - `Supermetrics` source: `to_df()` now correctly handles `if_empty` in case of empty results ### Changed - - `Sharepoint` and `CloudForCustomers` sources will now provide an informative `CredentialError` which is also raised early. This will make issues with input credenials immediately clear to the user. - Removed set_key_value from `CloudForCustomersReportToADLS` flow -## [0.2.12] - 2021-11-25 +## [0.2.12] - 2021-11-25 ### Added - - Added `Sharepoint` source - Added `SharepointToDF` task - Added `SharepointToADLS` flow @@ -201,39 +169,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `df_to_parquet` task to task_utils.py - Added `dtypes_to_json` task to task_utils.py -## [0.2.11] - 2021-10-30 +## [0.2.11] - 2021-10-30 ### Fixed +- `ADLSToAzureSQL` - fixed path to csv issue. +- `SupermetricsToADLS` - fixed local json path issue. -- `ADLSToAzureSQL` - fixed path to csv issue. -- `SupermetricsToADLS` - fixed local json path issue. ## [0.2.10] - 2021-10-29 - ### Release due to CI/CD error -## [0.2.9] - 2021-10-29 +## [0.2.9] - 2021-10-29 ### Release due to CI/CD error -## [0.2.8] - 2021-10-29 +## [0.2.8] - 2021-10-29 ### Changed - - CI/CD: `dev` image is now only published on push to the `dev` branch -- Docker: +- Docker: - updated registry links to use the new `ghcr.io` domain - `run.sh` now also accepts the `-t` option. When run in standard mode, it will only spin up the `viadot_jupyter_lab` service. When ran with `-t dev`, it will also spin up `viadot_testing` and `viadot_docs` containers. ### Fixed - - ADLSToAzureSQL - fixed path parameter issue. 
-## [0.2.7] - 2021-10-04 +## [0.2.7] - 2021-10-04 ### Added - - Added `SQLiteQuery` task - Added `CloudForCustomers` source - Added `CloudForCustomersToDF` and `CloudForCustomersToCSV` tasks @@ -243,47 +207,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added local setup and commands to the `README` ### Changed - - Changed CI/CD algorithm - the `latest` Docker image is now only updated on release and is the same exact image as the latest release - the `dev` image is released only on pushes and PRs to the `dev` branch (so dev branch = dev image) - Modified `ADLSToAzureSQL` - *read_sep* and *write_sep* parameters added to the flow. ### Fixed - - Fixed `ADLSToAzureSQL` breaking in `"append"` mode if the table didn't exist (#145). -- Fixed `ADLSToAzureSQL` breaking in promotion path for csv files. +- Fixed `ADLSToAzureSQL` breaking in promotion path for csv files. -## [0.2.6] - 2021-09-22 +## [0.2.6] - 2021-09-22 ### Added - - Added flows library docs to the references page ### Changed - - Moved task library docs page to topbar - Updated docs for task and flows -## [0.2.5] - 2021-09-20 +## [0.2.5] - 2021-09-20 ### Added - - Added `start` and `end_date` parameters to `SupermetricsToADLS` flow - Added a tutorial on how to pull data from `Supermetrics` -## [0.2.4] - 2021-09-06 +## [0.2.4] - 2021-09-06 ### Added - - Added documentation (both docstrings and MKDocs docs) for multiple tasks - Added `start_date` and `end_date` parameters to the `SupermetricsToAzureSQL` flow - Added a temporary workaround `df_to_csv_task` task to the `SupermetricsToADLS` flow to handle mixed dtype columns not handled automatically by DataFrame's `to_parquet()` method -## [0.2.3] - 2021-08-19 +## [0.2.3] - 2021-08-19 ### Changed - - Modified `RunGreatExpectationsValidation` task to use the built in support for evaluation parameters added in Prefect v0.15.3 - Modified `SupermetricsToADLS` and `ADLSGen1ToAzureSQLNew` flows to align with this [recipe](https://docs.prefect.io/orchestration/flow_config/storage.html#loading-additional-files-with-git-storage) for reading the expectation suite JSON The suite now has to be loaded before flow initialization in the flow's python file and passed as an argument to the flow's constructor. @@ -292,7 +249,6 @@ Great Expectations project directory, which was confusing. 
The project directory - Changed the logging of docs URL for `RunGreatExpectationsValidation` task to use GE's recipe from [the docs](https://docs.greatexpectations.io/docs/guides/validation/advanced/how_to_implement_custom_notifications/) ### Added - - Added a test for `SupermetricsToADLS` flow -Added a test for `AzureDataLakeList` task - Added PR template for new PRs @@ -304,14 +260,12 @@ This allows the user to simply pass a dict with their expectations and not worry - Added `keep_validation_output` parameter and `cleanup_validation_clutter` task to the `SupermetricsToADLS` flow to control Great Expectations output to the filesystem ### Removed - - Removed `SupermetricsToAzureSQLv2` and `SupermetricsToAzureSQLv3` flows - Removed `geopy` dependency -## [0.2.2] - 2021-07-27 +## [0.2.2] - 2021-07-27 ### Added - - Added support for parquet in `AzureDataLakeToDF` - Added proper logging to the `RunGreatExpectationsValidation` task - Added the `viz` Prefect extra to requirements to allow flow visualizaion @@ -320,42 +274,36 @@ This allows the user to simply pass a dict with their expectations and not worry - Tasks: - `AzureDataLakeList` - for listing files in an ADLS directory - Flows: - - `ADLSToAzureSQL` - promoting files to conformed, operations, + - `ADLSToAzureSQL` - promoting files to conformed, operations, creating an SQL table and inserting the data into it - `ADLSContainerToContainer` - copying files between ADLS containers ### Changed - - Renamed `ReadAzureKeyVaultSecret` and `RunAzureSQLDBQuery` tasks to match Prefect naming style - Flows: - - `SupermetricsToADLS` - changed csv to parquet file extension. File and schema info are loaded to the `RAW` container. + - `SupermetricsToADLS` - changed csv to parquet file extension. File and schema info are loaded to the `RAW` container. 
### Fixed - - Removed the broken version autobump from CI -## [0.2.1] - 2021-07-14 +## [0.2.1] - 2021-07-14 ### Added - - Flows: - `SupermetricsToADLS` - supporting immutable ADLS setup ### Changed - -- A default value for the `ds_user` parameter in `SupermetricsToAzureSQLv3` can now be +- A default value for the `ds_user` parameter in `SupermetricsToAzureSQLv3` can now be specified in the `SUPERMETRICS_DEFAULT_USER` secret - Updated multiple dependencies ### Fixed - - Fixed "Local run of `SupermetricsToAzureSQLv3` skips all tasks after `union_dfs_task`" (#59) - Fixed the `release` GitHub action -## [0.2.0] - 2021-07-12 +## [0.2.0] - 2021-07-12 ### Added - - Sources: - `AzureDataLake` (supports gen1 & gen2) - `SQLite` @@ -391,21 +339,17 @@ specified in the `SUPERMETRICS_DEFAULT_USER` secret - Supermetrics Google Ads extract ### Changed - - Tasks now use secrets for credential management (azure tasks use Azure Key Vault secrets) - SQL source now has a default query timeout of 1 hour ### Fixed - - Fix `SQLite` tests - Multiple stability improvements with retries and timeouts -## [0.1.12] - 2021-05-08 +## [0.1.12] - 2021-05-08 ### Changed - - Moved from poetry to pip ### Fixed - - Fix `AzureBlobStorage`'s `to_storage()` method is missing the final upload blob part From 1f29638bf14954154d74969ab12b6286d2879577 Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Thu, 2 Jun 2022 12:29:28 +0200 Subject: [PATCH 111/119] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Changing=20default?= =?UTF-8?q?=20load=20to=20adls=20as=20parquet?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/outlook_to_adls.py | 36 ++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index c6bd0042f..772fc6d95 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -5,7 +5,12 @@ from prefect import Flow, Task, apply_map, task import pandas as pd from ..utils import slugify -from ..task_utils import df_to_csv, union_dfs_task +from ..task_utils import ( + df_to_csv, + union_dfs_task, + add_ingestion_metadata_task, + df_to_parquet, +) from ..tasks import OutlookToDF, AzureDataLakeUpload @@ -21,7 +26,7 @@ def __init__( start_date: str = None, end_date: str = None, local_file_path: str = None, - extension_file: str = ".csv", + extension_file: str = ".parquet", adls_file_path: str = None, overwrite_adls: bool = True, adls_sp_credentials_secret: str = None, @@ -30,7 +35,7 @@ def __init__( *args: List[Any], **kwargs: Dict[str, Any], ): - """Flow for downloading data from Outlook source to a local CSV + """Flow for downloading data from Outlook source to a local file (parquet by default, otherwise csv for example) using Outlook API, then uploading it to Azure Data Lake. Args: @@ -39,7 +44,7 @@ def __init__( start_date (str, optional): A filtering start date parameter e.g. "2022-01-01". Defaults to None. end_date (str, optional): A filtering end date parameter e.g. "2022-01-02". Defaults to None. local_file_path (str, optional): Local destination path. Defaults to None. - extension_file (str, optional): Output file extension. Defaults to ".csv". + extension_file (str, optional): Output file extension. Defaults to ".parquet". adls_file_path (str, optional): Azure Data Lake destination file path. Defaults to None. overwrite_adls (bool, optional): Whether to overwrite the file in ADLS. Defaults to True. 
adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with @@ -84,10 +89,22 @@ def gen_flow(self) -> Flow: dfs = apply_map(self.gen_outlook_df, self.mailbox_list, flow=self) df = union_dfs_task.bind(dfs, flow=self) - - df_to_file = df_to_csv.bind( - df=df, path=self.local_file_path, if_exists=self.if_exsists, flow=self - ) + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) + + if self.extension_file == ".parquet": + df_to_file = df_to_parquet.bind( + df=df_with_metadata, + path=self.local_file_path, + if_exists=self.if_exsists, + flow=self, + ) + else: + df_to_file = df_to_csv.bind( + df=df_with_metadata, + path=self.local_file_path, + if_exists=self.if_exsists, + flow=self, + ) file_to_adls_task.bind( from_path=self.local_file_path, @@ -97,5 +114,6 @@ def gen_flow(self) -> Flow: flow=self, ) - df_to_file.set_upstream(df, flow=self) + df_with_metadata.set_upstream(df, flow=self) + df_to_file.set_upstream(df_with_metadata, flow=self) file_to_adls_task.set_upstream(df_to_file, flow=self) From 54da19feaaf0001b7cda9e3892c02c89235bfc04 Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Thu, 2 Jun 2022 13:08:12 +0200 Subject: [PATCH 112/119] =?UTF-8?q?=F0=9F=8E=A8=20Adding=20data=20range=20?= =?UTF-8?q?to=20Outlook=20source?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/outlook_to_adls.py | 17 +++++++---------- viadot/tasks/outlook.py | 15 +++++---------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index 772fc6d95..f18a9596c 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -1,10 +1,7 @@ -import os from typing import Any, Dict, List, Union, Literal -import pendulum -from prefect import Flow, Task, apply_map, task +from prefect import Flow, Task, apply_map import pandas as pd -from ..utils import slugify from ..task_utils import ( df_to_csv, union_dfs_task, @@ -26,7 +23,7 @@ def __init__( start_date: str = None, end_date: str = None, local_file_path: str = None, - extension_file: str = ".parquet", + output_file_extension: str = ".parquet", adls_file_path: str = None, overwrite_adls: bool = True, adls_sp_credentials_secret: str = None, @@ -35,8 +32,8 @@ def __init__( *args: List[Any], **kwargs: Dict[str, Any], ): - """Flow for downloading data from Outlook source to a local file (parquet by default, otherwise csv for example) - using Outlook API, then uploading it to Azure Data Lake. + """ + Flow for downloading data from Outlook source to Azure Data Lake in parquet format by default. Args: mailbox_list (List[str]): Mailbox name. @@ -44,7 +41,7 @@ def __init__( start_date (str, optional): A filtering start date parameter e.g. "2022-01-01". Defaults to None. end_date (str, optional): A filtering end date parameter e.g. "2022-01-02". Defaults to None. local_file_path (str, optional): Local destination path. Defaults to None. - extension_file (str, optional): Output file extension. Defaults to ".parquet". + output_file_extension (str, optional): Output file extension. Defaults to ".parquet". adls_file_path (str, optional): Azure Data Lake destination file path. Defaults to None. overwrite_adls (bool, optional): Whether to overwrite the file in ADLS. Defaults to True. 
adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with @@ -62,7 +59,7 @@ def __init__( # AzureDataLakeUpload self.adls_file_path = adls_file_path - self.extension_file = extension_file + self.output_file_extension = output_file_extension self.overwrite_adls = overwrite_adls self.adls_sp_credentials_secret = adls_sp_credentials_secret @@ -91,7 +88,7 @@ def gen_flow(self) -> Flow: df = union_dfs_task.bind(dfs, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) - if self.extension_file == ".parquet": + if self.output_file_extension == ".parquet": df_to_file = df_to_parquet.bind( df=df_with_metadata, path=self.local_file_path, diff --git a/viadot/tasks/outlook.py b/viadot/tasks/outlook.py index a72c78cc7..e2440e74e 100644 --- a/viadot/tasks/outlook.py +++ b/viadot/tasks/outlook.py @@ -17,7 +17,7 @@ def __init__( start_date: str = None, end_date: str = None, credentials: Dict[str, Any] = None, - extension_file: str = ".csv", + output_file_extension: str = ".csv", limit: int = 10000, *args: List[Any], **kwargs: Dict[str, Any], @@ -26,15 +26,9 @@ def __init__( self.mailbox_name = mailbox_name self.start_date = start_date self.end_date = end_date - self.extension_file = extension_file + self.output_file_extension = output_file_extension self.limit = limit - - try: - DEFAULT_CREDENTIALS = local_config["OUTLOOK"] - except KeyError: - DEFAULT_CREDENTIALS = None - - self.credentials = credentials or DEFAULT_CREDENTIALS + self.credentials = credentials super().__init__( name="outlook_to_csv", @@ -43,7 +37,7 @@ def __init__( ) def __call__(self, *args, **kwargs): - """Download Outlook Mesagess to DF""" + """Download Outlook Messages to DF""" return super().__call__(*args, **kwargs) @defaults_from_attrs("mailbox_name", "start_date", "end_date", "limit") @@ -67,6 +61,7 @@ def run( pd.DataFrame: The API GET as a pandas DataFrames from Outlook. """ outlook = Outlook( + credentials=self.credentials, mailbox_name=mailbox_name, start_date=start_date, end_date=end_date, From 66839cae575438fe376f1e9fe472c9ea253a3e9b Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Thu, 2 Jun 2022 15:12:41 +0200 Subject: [PATCH 113/119] =?UTF-8?q?=F0=9F=8E=A8=20Small=20changes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/outlook.py | 92 +++++++++++++++++++++++++++------------ viadot/tasks/outlook.py | 8 +++- 2 files changed, 70 insertions(+), 30 deletions(-) diff --git a/viadot/sources/outlook.py b/viadot/sources/outlook.py index 32f2293a2..178eca312 100644 --- a/viadot/sources/outlook.py +++ b/viadot/sources/outlook.py @@ -4,6 +4,7 @@ import datetime from typing import Any, Dict, List from ..config import local_config +from ..exceptions import CredentialError class Outlook(Source): @@ -13,13 +14,15 @@ def __init__( start_date: str = None, end_date: str = None, credentials: Dict[str, Any] = None, - extension_file: str = ".csv", limit: int = 10000, request_retries: int = 10, *args: List[Any], **kwargs: Dict[str, Any], ): - """Outlook connector build for fetching Outlook API source. + """ + Outlook connector build for fetching Outlook API source. + Data are fetched from start to end date range. If start or end date are not provided + then flow fetched data from yestarday by default. Args: mailbox_name (str): Mailbox name. 
@@ -28,23 +31,42 @@ def __init__( credentials (Dict[str, Any], optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Application. Defaults to None. - extension_file (str, optional): Output file extension - to allow selection of .csv for data which is not easy - to handle with parquet. Defaults to ".csv". limit (int, optional): Number of fetched top messages. Defaults to 10000. """ - super().__init__(*args, **kwargs) - try: DEFAULT_CREDENTIALS = local_config["OUTLOOK"] except KeyError: DEFAULT_CREDENTIALS = None + self.credentials = credentials or DEFAULT_CREDENTIALS + if self.credentials is None: + raise CredentialError("You do not provide credentials!") self.request_retries = request_retries - self.credentials = credentials or DEFAULT_CREDENTIALS - self.extension_file = extension_file self.mailbox_name = mailbox_name self.start_date = start_date self.end_date = end_date + if self.start_date is not None and self.end_date is not None: + self.date_range_end_time = datetime.datetime.strptime( + self.end_date, "%Y-%m-%d" + ) + self.date_range_start_time = datetime.datetime.strptime( + self.start_date, "%Y-%m-%d" + ) + else: + self.date_range_end_time = datetime.date.today() - datetime.timedelta( + days=1 + ) + self.date_range_start_time = datetime.date.today() - datetime.timedelta( + days=2 + ) + min_time = datetime.datetime.min.time() + self.date_range_end_time = datetime.datetime.combine( + self.date_range_end_time, min_time + ) + self.date_range_start_time = datetime.datetime.combine( + self.date_range_start_time, min_time + ) + self.account = Account( (self.credentials["client_id"], self.credentials["client_secret"]), auth_flow_type="credentials", @@ -61,9 +83,12 @@ def __init__( self.mailbox_messages = self.mailbox_obj.get_messages(limit) super().__init__(*args, credentials=self.credentials, **kwargs) - def to_df(self): - date_range_end_time = datetime.datetime.strptime(self.end_date, "%Y-%m-%d") - date_range_start_time = datetime.datetime.strptime(self.start_date, "%Y-%m-%d") + def to_df(self) -> pd.DataFrame: + """Download Outlook data into a pandas DataFrame. 
+ + Returns: + pd.DataFrame: the DataFrame with time range + """ data = [] while True: @@ -71,37 +96,46 @@ def to_df(self): message = next(self.mailbox_messages) received_time = message.received date_time_str = str(received_time) - dd = date_time_str[0:19] - date_obj = datetime.datetime.strptime(dd, "%Y-%m-%d %H:%M:%S") - if date_obj < date_range_start_time or date_obj > date_range_end_time: + date_time_string = date_time_str[0:19] + date_obj = datetime.datetime.strptime( + date_time_string, "%Y-%m-%d %H:%M:%S" + ) + if ( + date_obj < self.date_range_start_time + or date_obj > self.date_range_end_time + ): continue else: fetched = message.to_api_data() try: sender_mail = fetched["from"]["emailAddress"]["address"] reciver_list = fetched.get("toRecipients") - recivers = "" + recivers = " " if reciver_list is not None: recivers = ", ".join( - r["emailAddress"]["address"] for r in reciver_list + reciver["emailAddress"]["address"] + for reciver in reciver_list ) - else: - recivers = "" - categories = "" - if message.categories: - categories = ", ".join(c for c in message.categories) + categories = " " + if message.categories is not None: + categories = ", ".join( + categories for categories in message.categories + ) + + conversation_index = " " + if message.conversation_index is not None: + conversation_index = message.conversation_index row = { - "subject": fetched.get("subject"), "conversation ID": fetched.get("conversationId"), - "conversation index": message.conversation_index, + "conversation index": conversation_index, "categories": categories, "sender": sender_mail, "recivers": recivers, - "read": fetched.get("isRead"), - "received time": fetched.get("receivedDateTime"), + "received_time": fetched.get("receivedDateTime"), } - row["mail adress"] = ( + + row["mail_adress"] = ( self.mailbox_name.split("@")[0] .replace(".", "_") .replace("-", "_") @@ -112,8 +146,8 @@ def to_df(self): row["Inbox"] = True data.append(row) - except KeyError: - print(f"KeyError - nie ma w:") + except KeyError as e: + print("KeyError : " + str(e)) except StopIteration: break df = pd.DataFrame(data=data) @@ -123,4 +157,4 @@ def to_df(self): def to_csv(self): df = self.to_df() file_name = self.mailbox_name.split("@")[0].replace(".", "_").replace("-", "_") - df.to_csv(f"{file_name}{self.extension_file}", index=False) + df.to_csv(f"{file_name}.csv", index=False) diff --git a/viadot/tasks/outlook.py b/viadot/tasks/outlook.py index e2440e74e..c28c8bf19 100644 --- a/viadot/tasks/outlook.py +++ b/viadot/tasks/outlook.py @@ -30,6 +30,12 @@ def __init__( self.limit = limit self.credentials = credentials + # try: + # DEFAULT_CREDENTIALS = local_config["OUTLOOK"] + # except KeyError: + # DEFAULT_CREDENTIALS = None + # self.credentials = credentials or DEFAULT_CREDENTIALS + super().__init__( name="outlook_to_csv", *args, @@ -49,7 +55,7 @@ def run( limit: int = 10000, ) -> pd.DataFrame: """ - Task for downloading data from the Outlook API to a CSV file. + Task for downloading data from the Outlook API to DF. Args: mailbox_name (str): Mailbox name. 
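The date-range fallback introduced in PATCH 113 above is worth spelling out: when no start and end dates are passed, the source builds a window from two days ago to yesterday, both normalised to midnight with `datetime.datetime.min.time()`. A standalone sketch of that logic (the helper name `default_date_window` is illustrative, not part of the patch):

    import datetime


    def default_date_window(start_date: str = None, end_date: str = None):
        """Return (start, end) datetimes; fall back to a window ending yesterday."""
        if start_date is not None and end_date is not None:
            start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
            end = datetime.datetime.strptime(end_date, "%Y-%m-%d")
        else:
            # Same fallback as the source: end = yesterday, start = two days ago,
            # both combined with midnight so they compare cleanly against
            # the parsed receivedDateTime of each message.
            end = datetime.date.today() - datetime.timedelta(days=1)
            start = datetime.date.today() - datetime.timedelta(days=2)
            midnight = datetime.datetime.min.time()
            end = datetime.datetime.combine(end, midnight)
            start = datetime.datetime.combine(start, midnight)
        return start, end


    # Example: default_date_window() -> (two days ago 00:00, yesterday 00:00)
    # Example: default_date_window("2022-04-28", "2022-04-29") -> parsed datetimes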
From 173d072f8ca16bf32e847b1b9da8881b7747737f Mon Sep 17 00:00:00 2001 From: AdamSulek <48438721+AdamSulek@users.noreply.github.com> Date: Wed, 8 Jun 2022 15:46:11 +0200 Subject: [PATCH 114/119] Changes in requirements.txt --- requirements.txt | 7 ------- 1 file changed, 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3761dfc1e..308d431af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,11 +32,4 @@ duckdb==0.3.2 google-auth==2.6.2 sendgrid==6.9.7 pandas-gbq==0.17.4 -<<<<<<< HEAD -PyMySQL==1.0.2 -paramiko==2.11.0 -sshtunnel==0.4.0 -databricks-connect==10.4.0b0 -======= O365==2.0.18.1 ->>>>>>> 959bf71 (🔧 Updating requirements) From b43bd752826f2821022e801a1d3e9deb49a55527 Mon Sep 17 00:00:00 2001 From: m-paz <59165045+m-paz@users.noreply.github.com> Date: Thu, 9 Jun 2022 11:53:36 +0200 Subject: [PATCH 115/119] Update CHANGELOG.md --- CHANGELOG.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7a19b46d..bf6a6e15f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,23 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- Added new connector - Outlook. Created `Outlook` source, `OutlookToDF` task and `OutlookToADLS` flow. +- Added new connector - Epicor. Created `Epicor` source, `EpicorToDF` task and `EpicorToDuckDB` flow. +- Enabled Databricks Connect in the image. To enable, [follow this guide](./README.md#executing-spark-jobs) +- Added `MySQL` source and `MySqlToADLS` flow +- Added `SQLServerToDF` task +- Added `SQLServerToDuckDB` flow which downloads data from SQLServer table, loads it to parquet file and then uplads it do DuckDB +- Added complete proxy set up in `SAPRFC` example (`viadot/examples/sap_rfc`) + +### Changed +- Changed default name for the Prefect secret holding the name of the Azure KV secret storing Sendgrid credentials + + ## [0.4.3] - 2022-04-28 ### Added +- Added `func` parameter to `SAPRFC` +- Added `SAPRFCToADLS` flow which downloads data from SAP Database to to a pandas DataFrame, exports df to csv and uploads it to Azure Data Lake. - Added `adls_file_name` in `SupermetricsToADLS` and `SharepointToADLS` flows - Added `BigQueryToADLS` flow class which anables extract data from BigQuery. - Added `Salesforce` source @@ -21,6 +36,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed `get_flow_last_run_date()` incorrectly parsing the date - Fixed `MultipleFlows` when one flow is passed and when last flow fails. + ## [0.4.2] - 2022-04-08 ### Added - Added `AzureDataLakeRemove` task @@ -39,7 +55,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.4.0] - 2022-04-07 ### Added -- Added `custom_mail_state_handler` function that sends mail notification using custom smtp server. +- Added `custom_mail_state_handler` task that sends email notification using a custom SMTP server. - Added new function `df_clean_column` that cleans data frame columns from special characters - Added `df_clean_column` util task that removes special characters from a pandas DataFrame - Added `MultipleFlows` flow class which enables running multiple flows in a given order. 
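The CHANGELOG entry above summarises the new Outlook pieces. For reference, a minimal local-run sketch of the `OutlookToADLS` flow as assembled in this series; the mailbox address, file paths and secret name below are placeholders, not values taken from the PR:

    from viadot.flows import OutlookToADLS

    # Placeholder mailbox, paths and secret name; adjust to your environment.
    flow = OutlookToADLS(
        name="outlook_to_adls_example",
        mailbox_list=["shared.mailbox@example.com"],
        start_date="2022-04-28",
        end_date="2022-04-29",
        local_file_path="outlook_extract.parquet",   # parquet is the default extension
        adls_file_path="raw/outlook/outlook_extract.parquet",
        overwrite_adls=True,
        adls_sp_credentials_secret="adls-sp-credentials",  # placeholder Key Vault secret
    )

    # Runs the Prefect flow locally: fetch each mailbox, union the DataFrames,
    # add ingestion metadata, write the local file and upload it to ADLS.
    flow.run()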
From 37a1106602e9b97e34bfbc2abde5717f8c0de498 Mon Sep 17 00:00:00 2001 From: m-paz <59165045+m-paz@users.noreply.github.com> Date: Thu, 9 Jun 2022 11:56:05 +0200 Subject: [PATCH 116/119] Update requirements.txt From cf83fbc6277636568d3fdf131442730dca043ea9 Mon Sep 17 00:00:00 2001 From: m-paz <59165045+m-paz@users.noreply.github.com> Date: Thu, 9 Jun 2022 12:06:49 +0200 Subject: [PATCH 117/119] Update requirements.txt --- requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/requirements.txt b/requirements.txt index 308d431af..5d80baa5d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,4 +32,9 @@ duckdb==0.3.2 google-auth==2.6.2 sendgrid==6.9.7 pandas-gbq==0.17.4 +pydantic==1.9.0 +PyMySQL==1.0.2 +paramiko==2.11.0 +sshtunnel==0.4.0 +databricks-connect==10.4.0b0 O365==2.0.18.1 From e9cf4bab136d5afa87b9b27714bf3c273b19b9a1 Mon Sep 17 00:00:00 2001 From: m-paz <59165045+m-paz@users.noreply.github.com> Date: Thu, 9 Jun 2022 13:55:34 +0200 Subject: [PATCH 118/119] Update __init__.py From 8718803c9a7e0976a60148a6a4bc5383b6c0efbe Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 9 Jun 2022 14:45:01 +0200 Subject: [PATCH 119/119] =?UTF-8?q?=F0=9F=93=9D=20Updated=20Changelog=20be?= =?UTF-8?q?fore=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0bd094e4..c6ca36533 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] + +## [0.4.4] - 2022-06-09 ### Added - Added new connector - Outlook. Created `Outlook` source, `OutlookToDF` task and `OutlookToADLS` flow.
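Finally, a short usage sketch of the released `Outlook` source, mirroring the integration test added earlier in the series. It assumes an `OUTLOOK` entry in the local viadot config (or an explicit `credentials` dict), and the mailbox address is a placeholder:

    from viadot.sources import Outlook

    outlook = Outlook(
        mailbox_name="shared.mailbox@example.com",  # placeholder address
        start_date="2022-04-28",
        end_date="2022-04-29",
        limit=10000,
    )

    # Columns built in to_df() include the conversation ID/index, categories,
    # sender, recivers, received_time, mail_adress and the Inbox flag.
    df = outlook.to_df()
    print(df.shape)

    # Optionally dump to a CSV named after the mailbox, as in Outlook.to_csv().
    outlook.to_csv()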