From 8202644e3ebf868de9eb78116f12318e602f5728 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 10 Feb 2022 11:15:22 +0100 Subject: [PATCH 001/135] =?UTF-8?q?=F0=9F=8E=89=20Added=20new=20task=20for?= =?UTF-8?q?=20Prefect?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/prefect.py | 59 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 viadot/tasks/prefect.py diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py new file mode 100644 index 000000000..db30894a6 --- /dev/null +++ b/viadot/tasks/prefect.py @@ -0,0 +1,59 @@ +from typing import List +import prefect +from datetime import date, datetime +import pandas as pd + +from prefect import Task +from prefect.utilities.tasks import defaults_from_attrs + +logger = logging.get_logger() + + +class PrefectExtract(Task): + def __init__( + self, + *args, + **kwargs, + ): + + super().__init__( + name="prefect_extract_details", + *args, + **kwargs, + ) + + def __call__(self): + """Extract details from Prefect Flow""" + super().__call__(self) + + def iter_throught_flow_runs_ids(self, run_ids_list: List[str] = None): + """ + Generate Flow run ids + """ + for ind in range(len(run_ids_list)): + yield run_ids_list[ind] + + def check_fails(self, flow_run_ids: str = None): + """ + Get start_time from last Flow run where state was success + """ + for flow_run in self.iter_throught_flow_runs_ids(flow_run_ids): + if flow_run.state == "Success": + return flow_run.start_time + + def format_date(self, last_success: str = None): + """ + Split date to date and time. Calculations for set new date are needed. + """ + today = date.today() + full_date_success = last_success.split("T") + date_success = full_date_success[0] + time_success = full_date_success[1].split(".")[0] + return [date_success, time_success] + + @defaults_from_attrs() + def run( + self, + **kwargs, + ) -> None: + pass From 9ea987a5b32117af260e9df61212e444e0f64116 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 11 Feb 2022 13:27:37 +0100 Subject: [PATCH 002/135] =?UTF-8?q?=F0=9F=9A=A7=20Added=20PrefectExtract?= =?UTF-8?q?=20task=20to=20SupermetricsToADLS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/supermetrics_to_adls.py | 12 ++++++++++++ viadot/tasks/__init__.py | 1 + 2 files changed, 13 insertions(+) diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 083e9fcdd..53c0d443f 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -24,6 +24,7 @@ DownloadGitHubFile, RunGreatExpectationsValidation, SupermetricsToDF, + PrefectExtract, ) logger = logging.get_logger(__name__) @@ -33,6 +34,7 @@ validation_task = RunGreatExpectationsValidation() file_to_adls_task = AzureDataLakeUpload() json_to_adls_task = AzureDataLakeUpload() +prefect_extract = PrefectExtract() class SupermetricsToADLS(Flow): @@ -115,6 +117,7 @@ def __init__( msg = "Neither 'ds_user' parameter nor 'SUPERMETRICS_DEFAULT_USER' secret were not specified" raise ValueError(msg) from e + self.flow_name = name # SupermetricsToDF self.ds_id = ds_id self.ds_accounts = ds_accounts @@ -197,6 +200,15 @@ def gen_supermetrics_task( return t def gen_flow(self) -> Flow: + if self.date_range_type is not None: + prefect_extract.run( + flow_name=self.flow_name, if_date_range_type=True, flow=self + ) + if self.start_date is not None and self.end_date is not None: + prefect_extract.run( + 
flow_name=self.flow_name, if_date_range_type=False, flow=self + ) + if self.parallel: # generate a separate task for each account dfs = apply_map(self.gen_supermetrics_task, self.ds_accounts, flow=self) diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index 000e3d42e..e1051dfe9 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -24,3 +24,4 @@ from .supermetrics import SupermetricsToCSV, SupermetricsToDF from .sharepoint import SharepointToDF from .cloud_for_customers import C4CReportToDF, C4CToDF +from .prefect import PrefectExtract From 0d681bf000cdcd555be6b4e5b42eead4d81cdd90 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 11 Feb 2022 13:31:50 +0100 Subject: [PATCH 003/135] =?UTF-8?q?=F0=9F=9A=A7=20Added=20class=20argument?= =?UTF-8?q?s=20and=20code=20to=20run()=20method?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/prefect.py | 56 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py index db30894a6..46f0fa019 100644 --- a/viadot/tasks/prefect.py +++ b/viadot/tasks/prefect.py @@ -5,6 +5,7 @@ from prefect import Task from prefect.utilities.tasks import defaults_from_attrs +from prefect.utilities import logging logger = logging.get_logger() @@ -12,10 +13,17 @@ class PrefectExtract(Task): def __init__( self, + flow_name: str = None, + if_date_range_type: bool = None, + date: List[str] = None, *args, **kwargs, ): + self.flow_name = flow_name + self.if_date_range_type = if_date_range_type + self.date = date + super().__init__( name="prefect_extract_details", *args, @@ -30,8 +38,8 @@ def iter_throught_flow_runs_ids(self, run_ids_list: List[str] = None): """ Generate Flow run ids """ - for ind in range(len(run_ids_list)): - yield run_ids_list[ind] + for id in range(len(run_ids_list)): + yield run_ids_list[id] def check_fails(self, flow_run_ids: str = None): """ @@ -51,9 +59,49 @@ def format_date(self, last_success: str = None): time_success = full_date_success[1].split(".")[0] return [date_success, time_success] - @defaults_from_attrs() + @defaults_from_attrs( + "flow_name", + "if_date_range_type", + "date", + ) def run( self, + flow_name, + if_date_range_type, + date, **kwargs, ) -> None: - pass + + client = prefect.Client() + + query = ( + """ + { + flow (where: { name: { _eq: "%s" } } ) + { + flow_runs( + order_by: {end_time: desc} + where: {start_time:{ _is_null:false } } ) + { + id + end_time + start_time + state + } + } + } + """ + % flow_name + ) + + flow_runs = client.graphql(query) + flow_runs_ids = flow_runs.data.flow[0]["flow_runs"] + + last_success = self.check_fails(flow_runs_ids) + if if_date_range_type is True: + print(self.if_date_range_type) + + new_date = self.format_date(last_success)[0] + new_last_days = self.format_date(last_success)[1] + + return [new_date, new_last_days] From aa65ac5b8a3e2f3f944b3293233bc1f0890624e4 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Mon, 14 Feb 2022 16:02:24 +0100 Subject: [PATCH 004/135] =?UTF-8?q?=F0=9F=9A=A7=20Added=20ability=20to=20a?= =?UTF-8?q?utomatic=20change=20date=5Frange=20if=20flows=20have=20failed?= =?UTF-8?q?=20in=20the=20last=20days?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/supermetrics_to_adls.py | 11 ++++++++++- viadot/tasks/prefect.py | 26 ++++++++++++++------------ 2 files changed, 24 insertions(+), 13 deletions(-) diff --git 
a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 53c0d443f..ee50fa954 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -201,9 +201,18 @@ def gen_supermetrics_task( def gen_flow(self) -> Flow: if self.date_range_type is not None: - prefect_extract.run( + difference = prefect_extract.run( flow_name=self.flow_name, if_date_range_type=True, flow=self ) + old_range_splitted = self.date_range_type.split("_") + + old_range = int(old_range_splitted[1]) + new_range = old_range + difference + + new_range_splitted = old_range_splitted + new_range_splitted[1] = str(new_range) + self.date_range_type = "_".join(new_range_splitted) + if self.start_date is not None and self.end_date is not None: prefect_extract.run( flow_name=self.flow_name, if_date_range_type=False, flow=self diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py index 46f0fa019..f7cb82143 100644 --- a/viadot/tasks/prefect.py +++ b/viadot/tasks/prefect.py @@ -49,15 +49,21 @@ def check_fails(self, flow_run_ids: str = None): if flow_run.state == "Success": return flow_run.start_time - def format_date(self, last_success: str = None): + def format_date(self, last_success: str = None, data_range: bool = None): """ Split date to date and time. Calculations for set new date are needed. """ - today = date.today() - full_date_success = last_success.split("T") - date_success = full_date_success[0] - time_success = full_date_success[1].split(".")[0] - return [date_success, time_success] + today = datetime.today() + date_success = last_success.split("T")[0] + date_success = datetime.strptime(date_success, "%Y-%m-%d") + + if data_range is True: + difference = today - date_success + return difference.days + if data_range is False: + formated_date = date_success + + return formated_date @defaults_from_attrs( "flow_name", @@ -98,10 +104,6 @@ def run( flow_runs_ids = flow_runs.data.flow[0]["flow_runs"] last_success = self.check_fails(flow_runs_ids) - if if_date_range_type is True: - print(self.if_date_range_type) - - new_date = self.format_date(last_success)[0] - new_last_days = self.format_date(last_success)[1] + new_date = self.format_date(last_success, data_range=if_date_range_type) - return [new_date, new_last_days] + return new_date From 8a77863981ba68e24b882ff4e57173bfddaab902 Mon Sep 17 00:00:00 2001 From: m-paz Date: Thu, 17 Feb 2022 11:14:11 +0100 Subject: [PATCH 005/135] =?UTF-8?q?=F0=9F=93=9D=20Bumped=20version=20after?= =?UTF-8?q?=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_viadot.py | 2 +- viadot/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_viadot.py b/tests/test_viadot.py index d9e54d633..5c3f36d5a 100644 --- a/tests/test_viadot.py +++ b/tests/test_viadot.py @@ -2,4 +2,4 @@ def test_version(): - assert __version__ == "0.3.2" + assert __version__ == "0.3.3" diff --git a/viadot/__init__.py b/viadot/__init__.py index f9aa3e110..e19434e2e 100644 --- a/viadot/__init__.py +++ b/viadot/__init__.py @@ -1 +1 @@ -__version__ = "0.3.2" +__version__ = "0.3.3" From 29e1706cc4f11f3533ef570dddce787988ee7d7b Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 17 Feb 2022 14:52:18 +0100 Subject: [PATCH 006/135] =?UTF-8?q?=E2=9C=8F=EF=B8=8F=20Changed=20class=20?= =?UTF-8?q?name?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/supermetrics_to_adls.py | 2 +- viadot/tasks/__init__.py | 2 +- 2 
files changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index ee50fa954..727780e08 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -34,7 +34,7 @@ validation_task = RunGreatExpectationsValidation() file_to_adls_task = AzureDataLakeUpload() json_to_adls_task = AzureDataLakeUpload() -prefect_extract = PrefectExtract() +prefect_extract = GetFlowLastSuccessfulRun() class SupermetricsToADLS(Flow): diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index e1051dfe9..5955f735f 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -24,4 +24,4 @@ from .supermetrics import SupermetricsToCSV, SupermetricsToDF from .sharepoint import SharepointToDF from .cloud_for_customers import C4CReportToDF, C4CToDF -from .prefect import PrefectExtract +from .prefect import GetFlowLastSuccessfulRun From d104e2fe74a02d72b2f288834e677068e5ca906f Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 17 Feb 2022 14:53:14 +0100 Subject: [PATCH 007/135] =?UTF-8?q?=E2=9C=8F=EF=B8=8F=20Changed=20class=20?= =?UTF-8?q?name?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/supermetrics_to_adls.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 727780e08..000414bc1 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -6,6 +6,7 @@ from prefect.backend import set_key_value from prefect.tasks.secrets import PrefectSecret from prefect.utilities import logging +from tasks.prefect import GetFlowLastSuccessfulRun from ..task_utils import ( add_ingestion_metadata_task, @@ -34,7 +35,7 @@ validation_task = RunGreatExpectationsValidation() file_to_adls_task = AzureDataLakeUpload() json_to_adls_task = AzureDataLakeUpload() -prefect_extract = GetFlowLastSuccessfulRun() +prefect_get_successful_run = GetFlowLastSuccessfulRun() class SupermetricsToADLS(Flow): @@ -201,8 +202,8 @@ def gen_supermetrics_task( def gen_flow(self) -> Flow: if self.date_range_type is not None: - difference = prefect_extract.run( - flow_name=self.flow_name, if_date_range_type=True, flow=self + difference = prefect_get_successful_run.run( + flow_name=self.flow_name, is_date_range_type=True, flow=self ) old_range_splitted = self.date_range_type.split("_") @@ -214,7 +215,7 @@ def gen_flow(self) -> Flow: self.date_range_type = "_".join(new_range_splitted) if self.start_date is not None and self.end_date is not None: - prefect_extract.run( + prefect_get_successful_run.run( flow_name=self.flow_name, if_date_range_type=False, flow=self ) From 00f464aecebc55edd5c463340da6586e2bccd20b Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 17 Feb 2022 14:54:46 +0100 Subject: [PATCH 008/135] =?UTF-8?q?=F0=9F=9A=A7=20Added=20check=5Fif=5Fsch?= =?UTF-8?q?eduled=5Frun()=20and=20modified=20calculate=5Fdifference()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/prefect.py | 88 ++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py index f7cb82143..fda202c4f 100644 --- a/viadot/tasks/prefect.py +++ b/viadot/tasks/prefect.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Literal import prefect from datetime import date, datetime import pandas as pd @@ 
-10,18 +10,18 @@ logger = logging.get_logger() -class PrefectExtract(Task): +class GetFlowLastSuccessfulRun(Task): def __init__( self, flow_name: str = None, - if_date_range_type: bool = None, + is_date_range_type: bool = None, date: List[str] = None, *args, **kwargs, ): self.flow_name = flow_name - self.if_date_range_type = if_date_range_type + self.is_date_range_type = is_date_range_type self.date = date super().__init__( @@ -49,31 +49,51 @@ def check_fails(self, flow_run_ids: str = None): if flow_run.state == "Success": return flow_run.start_time - def format_date(self, last_success: str = None, data_range: bool = None): - """ - Split date to date and time. Calculations for set new date are needed. - """ - today = datetime.today() - date_success = last_success.split("T")[0] - date_success = datetime.strptime(date_success, "%Y-%m-%d") - - if data_range is True: - difference = today - date_success - return difference.days - if data_range is False: - formated_date = date_success - - return formated_date + def calculate_difference( + self, + date_to_compare: datetime = None, + base_date: datetime = datetime.today(), + is_date_range_type: bool = None, + diff_type: Literal["time", "date"] = "date", + ): + """Calculations for set new date are needed.""" + if is_date_range_type is True: + if diff_type == "date": + difference = base_date - date_to_compare + return difference.days + if diff_type == "time": + difference_h = abs(base_date.hour - date_to_compare.hour) + if difference_h >= 1: + return difference_h + else: + return difference_h + if is_date_range_type is False: + return date_to_compare + + def check_if_scheduled_run( + self, time_run: str = None, time_schedule: str = None + ) -> bool: + """Check if run was schduled or started by user""" + diff = self.calculate_difference( + date_to_compare=time_run, + base_date=time_schedule, + is_date_range_type=True, + diff_type="time", + ) + if diff < 1: + return True + if diff > 1: + return False @defaults_from_attrs( "flow_name", - "if_date_range_type", + "is_date_range_type", "date", ) def run( self, flow_name, - if_date_range_type, + is_date_range_type, date, **kwargs, ) -> None: @@ -93,6 +113,7 @@ def run( end_time start_time state + scheduled_start_time } } } @@ -100,10 +121,31 @@ def run( % flow_name ) + ## check if is scheduled + flow_runs = client.graphql(query) flow_runs_ids = flow_runs.data.flow[0]["flow_runs"] - last_success = self.check_fails(flow_runs_ids) - new_date = self.format_date(last_success, data_range=if_date_range_type) + last_success_start_time = self.check_fails(flow_runs_ids) + time_schedule = flow_runs_ids[0]["scheduled_start_time"] + + is_scheduled = self.check_if_scheduled_run( + time_run=last_success_start_time, + time_schedule=time_schedule, + ) + if is_scheduled is True: + new_date = self.calculate_difference( + date_to_compare=last_success_start_time, + base_date=time_schedule, + is_date_range_type=is_date_range_type, + diff_type="date", + ) + if is_scheduled is False: + new_date = self.calculate_difference( + date_to_compare=last_success_start_time, + base_date=time_schedule, + is_date_range_type=is_date_range_type, + diff_type="date", + ) return new_date From a162822b5d7cd0e1640a10681d290e3dae034fb6 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 17 Feb 2022 15:01:28 +0100 Subject: [PATCH 009/135] =?UTF-8?q?=F0=9F=9A=A7=20Added=20get=5Fformatted?= =?UTF-8?q?=5Fdate()=20method?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/prefect.py | 20 
++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py index fda202c4f..b2fd6ace5 100644 --- a/viadot/tasks/prefect.py +++ b/viadot/tasks/prefect.py @@ -57,6 +57,9 @@ def calculate_difference( diff_type: Literal["time", "date"] = "date", ): """Calculations for set new date are needed.""" + base_date = self.get_formatted_date(base_date, diff_type) + date_to_compare = self.get_formatted_date(date_to_compare, diff_type) + if is_date_range_type is True: if diff_type == "date": difference = base_date - date_to_compare @@ -85,6 +88,23 @@ def check_if_scheduled_run( if diff > 1: return False + def get_formatted_date( + self, + time_unclean: str = None, + return_value: Literal["time", "date"] = "date", + ): + """from prefect format date (in string) get clean time or date in datetime type.""" + if return_value == "time": + time_extracted = time_unclean.split("T")[1] + time_clean_str = time_extracted.split(".")[0] + time_clean = datetime.strptime(time_clean_str[:8], "%H:%M:%S") + return time_clean.time() + + if return_value == "date": + date_extracted = time_unclean.split("T")[0] + date_clean = datetime.strptime(date_extracted, "%Y-%m-%d") + return date_clean.date() + @defaults_from_attrs( "flow_name", "is_date_range_type", From a52efe5f6101f998e4976fd9444939ca3b409ddf Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 18 Feb 2022 16:26:36 +0100 Subject: [PATCH 010/135] =?UTF-8?q?=F0=9F=9A=A7=20Added=20change=5Fdate=5F?= =?UTF-8?q?range=5Ftask()=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/supermetrics_to_adls.py | 33 +++++++++++++++------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 000414bc1..726e5fab6 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -6,7 +6,6 @@ from prefect.backend import set_key_value from prefect.tasks.secrets import PrefectSecret from prefect.utilities import logging -from tasks.prefect import GetFlowLastSuccessfulRun from ..task_utils import ( add_ingestion_metadata_task, @@ -25,7 +24,7 @@ DownloadGitHubFile, RunGreatExpectationsValidation, SupermetricsToDF, - PrefectExtract, + GetFlowLastSuccessfulRun, ) logger = logging.get_logger(__name__) @@ -175,6 +174,18 @@ def __init__( def slugify(name): return name.replace(" ", "_").lower() + def change_date_range_task( + date_range_type: str = None, difference: int = None, flow: Flow = None + ) -> Task: + old_range_splitted = date_range_type.split("_") + old_range = int(old_range_splitted[1]) + new_range = old_range + difference + + new_range_splitted = old_range_splitted + new_range_splitted[1] = str(new_range) + date_range_type = "_".join(new_range_splitted) + return date_range_type + def gen_supermetrics_task( self, ds_accounts: Union[str, List[str]], flow: Flow = None ) -> Task: @@ -203,20 +214,12 @@ def gen_supermetrics_task( def gen_flow(self) -> Flow: if self.date_range_type is not None: difference = prefect_get_successful_run.run( - flow_name=self.flow_name, is_date_range_type=True, flow=self + flow_name=self.flow_name, + date_range_type=self.date_range_type, + flow=self, ) - old_range_splitted = self.date_range_type.split("_") - - old_range = int(old_range_splitted[1]) - new_range = old_range + difference - - new_range_splitted = old_range_splitted - new_range_splitted[1] = str(new_range) - self.date_range_type = "_".join(new_range_splitted) 
- - if self.start_date is not None and self.end_date is not None: - prefect_get_successful_run.run( - flow_name=self.flow_name, if_date_range_type=False, flow=self + self.date_range_type = self.change_date_range_task( + date_range_type=self.date_range_type, difference=difference, flow=self ) if self.parallel: From c0fbb4c5b79315f6a133ed78d2d32d625c6a8291 Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Feb 2022 11:37:21 +0100 Subject: [PATCH 011/135] =?UTF-8?q?=F0=9F=90=9B=20Fix=20for=20old=20SQL=20?= =?UTF-8?q?Servers=20still=20using=20TLS=20<=201.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 4 ++++ docker/Dockerfile | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53ccb56b6..4ca3fef36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Fixed +- fixed OpenSSL config for old SQL Servers still using TLS < 1.2 + ## [0.3.2] - 2022-02-17 ### Fixed - fixed an issue with schema info within `CheckColumnOrder` class. + ## [0.3.1] - 2022-02-17 ### Changed -`ADLSToAzureSQL` - added `remove_tab` parameter to remove uncessery tab separators from data. diff --git a/docker/Dockerfile b/docker/Dockerfile index 6eeab2934..97a5a3cbc 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -18,7 +18,7 @@ RUN echo "Acquire::Check-Valid-Until \"false\";\nAcquire::Check-Date \"false\";" # System packages -RUN apt update && yes | apt install vim unixodbc-dev build-essential \ +RUN apt update -q && yes | apt install -q vim unixodbc-dev build-essential \ curl python3-dev libboost-all-dev libpq-dev graphviz python3-gi sudo git RUN pip install --upgrade cffi @@ -26,6 +26,9 @@ RUN curl http://archive.ubuntu.com/ubuntu/pool/main/g/glibc/multiarch-support_2. 
-o multiarch-support_2.27-3ubuntu1_amd64.deb && \ apt install ./multiarch-support_2.27-3ubuntu1_amd64.deb +# Fix for old SQL Servers still using TLS < 1.2 +RUN chmod +rwx /usr/lib/ssl/openssl.cnf && \ + sed -i 's/SECLEVEL=2/SECLEVEL=1/g' /usr/lib/ssl/openssl.cnf # ODBC -- make sure to pin driver version as it's reflected in odbcinst.ini RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \ From 0f174c43c5e250a638a9555c606f9373dc9964f6 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Mon, 21 Feb 2022 15:11:10 +0100 Subject: [PATCH 012/135] =?UTF-8?q?=F0=9F=9A=A7=20Modified=20calculate=5Fd?= =?UTF-8?q?ifference()=20method?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/supermetrics_to_adls.py | 6 +-- viadot/tasks/prefect.py | 67 ++++++++++++---------------- 2 files changed, 32 insertions(+), 41 deletions(-) diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 726e5fab6..ce847716e 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -175,9 +175,9 @@ def slugify(name): return name.replace(" ", "_").lower() def change_date_range_task( - date_range_type: str = None, difference: int = None, flow: Flow = None + self, date_range: str = None, difference: int = None, flow: Flow = None ) -> Task: - old_range_splitted = date_range_type.split("_") + old_range_splitted = date_range.split("_") old_range = int(old_range_splitted[1]) new_range = old_range + difference @@ -219,7 +219,7 @@ def gen_flow(self) -> Flow: flow=self, ) self.date_range_type = self.change_date_range_task( - date_range_type=self.date_range_type, difference=difference, flow=self + date_range=self.date_range_type, difference=difference, flow=self ) if self.parallel: diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py index b2fd6ace5..dccfa9698 100644 --- a/viadot/tasks/prefect.py +++ b/viadot/tasks/prefect.py @@ -1,3 +1,4 @@ +from os import times_result from typing import List, Literal import prefect from datetime import date, datetime @@ -14,15 +15,13 @@ class GetFlowLastSuccessfulRun(Task): def __init__( self, flow_name: str = None, - is_date_range_type: bool = None, - date: List[str] = None, + date_range_type: bool = None, *args, **kwargs, ): self.flow_name = flow_name - self.is_date_range_type = is_date_range_type - self.date = date + self.date_range_type = date_range_type super().__init__( name="prefect_extract_details", @@ -41,7 +40,7 @@ def iter_throught_flow_runs_ids(self, run_ids_list: List[str] = None): for id in range(len(run_ids_list)): yield run_ids_list[id] - def check_fails(self, flow_run_ids: str = None): + def get_time_from_last_successful_run(self, flow_run_ids: str = None): """ Get start_time from last Flow run where state was success """ @@ -51,27 +50,28 @@ def check_fails(self, flow_run_ids: str = None): def calculate_difference( self, - date_to_compare: datetime = None, - base_date: datetime = datetime.today(), - is_date_range_type: bool = None, + date_to_compare: str = None, + base_date: str = str(datetime.today()), diff_type: Literal["time", "date"] = "date", ): """Calculations for set new date are needed.""" base_date = self.get_formatted_date(base_date, diff_type) date_to_compare = self.get_formatted_date(date_to_compare, diff_type) - if is_date_range_type is True: - if diff_type == "date": - difference = base_date - date_to_compare - return difference.days - if diff_type == "time": - difference_h = abs(base_date.hour - 
date_to_compare.hour) - if difference_h >= 1: - return difference_h - else: - return difference_h - if is_date_range_type is False: - return date_to_compare + if diff_type == "date": + difference = abs(base_date - date_to_compare) + return difference.days + + if diff_type == "time": + difference_h = abs(base_date.hour - date_to_compare.hour) + if difference_h <= 1: + difference_m = date_to_compare.minute - base_date.minute + if difference_m <= 0: + return 1 + if difference_m > 0: + return float(f"1.{(abs(difference_m))}") + if difference_h > 1: + return difference_h def check_if_scheduled_run( self, time_run: str = None, time_schedule: str = None @@ -80,12 +80,11 @@ def check_if_scheduled_run( diff = self.calculate_difference( date_to_compare=time_run, base_date=time_schedule, - is_date_range_type=True, diff_type="time", ) if diff < 1: return True - if diff > 1: + if diff >= 1: return False def get_formatted_date( @@ -93,7 +92,10 @@ def get_formatted_date( time_unclean: str = None, return_value: Literal["time", "date"] = "date", ): - """from prefect format date (in string) get clean time or date in datetime type.""" + """ + from prefect format date (in string) get clean time or date in datetime type. + - date from Prefect: '2022-02-21T01:00:00+00:00' + """ if return_value == "time": time_extracted = time_unclean.split("T")[1] time_clean_str = time_extracted.split(".")[0] @@ -107,14 +109,12 @@ def get_formatted_date( @defaults_from_attrs( "flow_name", - "is_date_range_type", - "date", + "date_range_type", ) def run( self, flow_name, - is_date_range_type, - date, + date_range_type, **kwargs, ) -> None: @@ -141,14 +141,11 @@ def run( % flow_name ) - ## check if is scheduled - flow_runs = client.graphql(query) flow_runs_ids = flow_runs.data.flow[0]["flow_runs"] - last_success_start_time = self.check_fails(flow_runs_ids) + last_success_start_time = self.get_time_from_last_successful_run(flow_runs_ids) time_schedule = flow_runs_ids[0]["scheduled_start_time"] - is_scheduled = self.check_if_scheduled_run( time_run=last_success_start_time, time_schedule=time_schedule, @@ -157,15 +154,9 @@ def run( new_date = self.calculate_difference( date_to_compare=last_success_start_time, base_date=time_schedule, - is_date_range_type=is_date_range_type, diff_type="date", ) if is_scheduled is False: - new_date = self.calculate_difference( - date_to_compare=last_success_start_time, - base_date=time_schedule, - is_date_range_type=is_date_range_type, - diff_type="date", - ) + return self.date_range_type return new_date From 3d36890da3c0269bed4f22e715c8cee22e7369c3 Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Feb 2022 15:47:10 +0100 Subject: [PATCH 013/135] =?UTF-8?q?=E2=9C=A8=20Added=20`SQLServer`=20sourc?= =?UTF-8?q?e?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 +++ viadot/sources/__init__.py | 1 + viadot/sources/sql_server.py | 48 ++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 viadot/sources/sql_server.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ca3fef36..208ee7d21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] +### Added +- Added `SQLServer` source + ### Fixed - fixed OpenSSL config for old SQL Servers still using TLS < 1.2 diff --git a/viadot/sources/__init__.py b/viadot/sources/__init__.py index a4d9d1204..ee99fba90 100644 --- a/viadot/sources/__init__.py +++ b/viadot/sources/__init__.py @@ -14,3 +14,4 @@ from .uk_carbon_intensity import UKCarbonIntensity from .sqlite import SQLite from .duckdb import DuckDB +from .sql_server import SQLServer diff --git a/viadot/sources/sql_server.py b/viadot/sources/sql_server.py new file mode 100644 index 000000000..1c646c4e1 --- /dev/null +++ b/viadot/sources/sql_server.py @@ -0,0 +1,48 @@ +from .base import SQL +from typing import List + + +class SQLServer(SQL): + DEFAULT_SCHEMA = "dbo" + + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.credentials["driver"] = "ODBC Driver 17 for SQL Server" + + @property + def schemas(self) -> List[str]: + """Returns list of schemas""" + schemas_tuples = self.run("SELECT s.name as schema_name from sys.schemas s") + return [schema_tuple[0] for schema_tuple in schemas_tuples] + + @property + def tables(self) -> List[str]: + """Returns list of tables""" + tables_tuples = self.run("SELECT * FROM information_schema.tables") + return [table for row in tables_tuples for table in row] + + def exists(self, table: str, schema: str = None) -> bool: + """Check whether a table exists. + Args: + table (str): The table to be checked. + schema (str, optional): The schema whethe the table is located. Defaults to 'dbo'. + Returns: + bool: Whether the table exists. + """ + + if not schema: + schema = self.DEFAULT_SCHEMA + + list_table_info_query = f""" + SELECT * + FROM sys.tables t + JOIN sys.schemas s + ON t.schema_id = s.schema_id + WHERE s.name = '{schema}' AND t.name = '{table}' + """ + exists = bool(self.run(list_table_info_query)) + return exists \ No newline at end of file From c2527758fe25a8c9506754967051d0ce0ee300c9 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Mon, 21 Feb 2022 15:47:41 +0100 Subject: [PATCH 014/135] =?UTF-8?q?=E2=9C=85=20=20Added=20tests=20for=20Ge?= =?UTF-8?q?tFlowLastSuccessfulRun=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_prefect.py | 49 +++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tests/integration/tasks/test_prefect.py diff --git a/tests/integration/tasks/test_prefect.py b/tests/integration/tasks/test_prefect.py new file mode 100644 index 000000000..7281bdfda --- /dev/null +++ b/tests/integration/tasks/test_prefect.py @@ -0,0 +1,49 @@ +import pytest +import pandas as pd +import datetime +from datetime import date + +from viadot.tasks import GetFlowLastSuccessfulRun + +PREFECT_TASK = GetFlowLastSuccessfulRun() +DATE_FROM_PREFECT = "2022-01-01T01:30:00+00:00" +DATE_FROM_PREFECT2 = "2022-01-04T02:20:00+00:00" + + +def test_get_formatted_dated(): + new_date = PREFECT_TASK.get_formatted_date( + time_unclean=DATE_FROM_PREFECT, return_value="date" + ) + assert new_date == datetime.date(2022, 1, 1) + assert isinstance(new_date, date) + + new_time = PREFECT_TASK.get_formatted_date( + time_unclean=DATE_FROM_PREFECT, return_value="time" + ) + assert new_time == datetime.time(1, 30) + assert isinstance(new_time, datetime.time) + + +def test_calculate_difference_date(): + diff_days = PREFECT_TASK.calculate_difference( + date_to_compare=DATE_FROM_PREFECT2, + base_date=DATE_FROM_PREFECT, + diff_type="date", + ) + assert diff_days == 3 + + +def 
test_calculate_difference_time(): + diff_hours = PREFECT_TASK.calculate_difference( + date_to_compare=DATE_FROM_PREFECT2, + base_date=DATE_FROM_PREFECT, + diff_type="time", + ) + assert diff_hours == 1 + + diff_hours = PREFECT_TASK.calculate_difference( + date_to_compare="2022-01-04T02:50:00+00:00", + base_date=DATE_FROM_PREFECT, + diff_type="time", + ) + assert diff_hours == 1.20 From 1ecc24048d73cb696eddfa8c34f8e50fb28d6fcb Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Feb 2022 15:48:46 +0100 Subject: [PATCH 015/135] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Changed=20the=20ba?= =?UTF-8?q?se=20class=20of=20`AzureSQL`=20to=20`SQLServer`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 +++ viadot/sources/azure_sql.py | 52 +++---------------------------------- 2 files changed, 6 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 208ee7d21..34df682ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added `SQLServer` source +### Changed +- Changed the base class of `AzureSQL` to `SQLServer` + ### Fixed - fixed OpenSSL config for old SQL Servers still using TLS < 1.2 diff --git a/viadot/sources/azure_sql.py b/viadot/sources/azure_sql.py index 64a98ea99..d9168a89f 100644 --- a/viadot/sources/azure_sql.py +++ b/viadot/sources/azure_sql.py @@ -1,35 +1,13 @@ -from typing import List, Literal +from typing import Literal from prefect.utilities import logging -from .base import SQL +from .sql_server import SQLServer logger = logging.get_logger(__name__) -class AzureSQL(SQL): - DEFAULT_SCHEMA = "dbo" - - def __init__( - self, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.credentials["driver"] = "ODBC Driver 17 for SQL Server" - - @property - def schemas(self) -> List[str]: - """Returns list of schemas""" - schemas_tuples = self.run("SELECT s.name as schema_name from sys.schemas s") - return [schema_tuple[0] for schema_tuple in schemas_tuples] - - @property - def tables(self) -> List[str]: - """Returns list of tables""" - tables_tuples = self.run("SELECT * FROM information_schema.tables") - return [table for row in tables_tuples for table in row] - +class AzureSQL(SQLServer): def bulk_insert( self, table: str, @@ -120,27 +98,3 @@ def create_external_database( self.run(create_master_key_sql) self.run(create_external_db_credential_sql) self.run(create_external_db_sql) - - def exists(self, table: str, schema: str = None) -> bool: - """Check whether a table exists. - - Args: - table (str): The table to be checked. - schema (str, optional): The schema whethe the table is located. Defaults to 'dbo'. - - Returns: - bool: Whether the table exists. 
- """ - - if not schema: - schema = self.DEFAULT_SCHEMA - - list_table_info_query = f""" - SELECT * - FROM sys.tables t - JOIN sys.schemas s - ON t.schema_id = s.schema_id - WHERE s.name = '{schema}' AND t.name = '{table}' - """ - exists = bool(self.run(list_table_info_query)) - return exists From d7128ee316a07e5af880fb7830ee0b86c5145af9 Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Feb 2022 15:57:15 +0100 Subject: [PATCH 016/135] =?UTF-8?q?=F0=9F=8E=A8=20Fixed=20formatting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/sql_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/sql_server.py b/viadot/sources/sql_server.py index 1c646c4e1..af0682da2 100644 --- a/viadot/sources/sql_server.py +++ b/viadot/sources/sql_server.py @@ -45,4 +45,4 @@ def exists(self, table: str, schema: str = None) -> bool: WHERE s.name = '{schema}' AND t.name = '{table}' """ exists = bool(self.run(list_table_info_query)) - return exists \ No newline at end of file + return exists From ad55c1dfc8baeabb749ab863aa1dca1e1dad7b48 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Mon, 21 Feb 2022 16:14:21 +0100 Subject: [PATCH 017/135] =?UTF-8?q?=F0=9F=93=9D=20Updated=20docstrings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/prefect.py | 48 +++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py index dccfa9698..7bca7a07d 100644 --- a/viadot/tasks/prefect.py +++ b/viadot/tasks/prefect.py @@ -30,19 +30,31 @@ def __init__( ) def __call__(self): - """Extract details from Prefect Flow""" + """Extract time from Prefect Flow run""" super().__call__(self) def iter_throught_flow_runs_ids(self, run_ids_list: List[str] = None): """ Generate Flow run ids + + Args: + run_ids_list (List[str], optional): List of Flow run ids. Defaults to None. + + Yields: + str: Flow id """ for id in range(len(run_ids_list)): yield run_ids_list[id] - def get_time_from_last_successful_run(self, flow_run_ids: str = None): + def get_time_from_last_successful_run(self, flow_run_ids: List[str] = None) -> str: """ - Get start_time from last Flow run where state was success + Get start_time from last Flow run where state was success. + + Args: + flow_run_ids (List[str], optional): List of Flow run ids. Defaults to None. + + Returns: + str: start_time of Flow run """ for flow_run in self.iter_throught_flow_runs_ids(flow_run_ids): if flow_run.state == "Success": @@ -54,7 +66,14 @@ def calculate_difference( base_date: str = str(datetime.today()), diff_type: Literal["time", "date"] = "date", ): - """Calculations for set new date are needed.""" + """ + Calculate diffrence between two dates. + + Args: + date_to_compare (str, optional): Date to compare with base_date. Defaults to None. + base_date (str, optional): The base date - can be saved as Prefect schedule date. Defaults to str(datetime.today()). + diff_type (Literal["time", "date"], optional): _description_. Defaults to "date". + """ base_date = self.get_formatted_date(base_date, diff_type) date_to_compare = self.get_formatted_date(date_to_compare, diff_type) @@ -76,7 +95,16 @@ def calculate_difference( def check_if_scheduled_run( self, time_run: str = None, time_schedule: str = None ) -> bool: - """Check if run was schduled or started by user""" + """ + Check if run was schduled or started by user. 
+ + Args: + time_run (str, optional): The time the Flow was started. Defaults to None. + time_schedule (str, optional): Scheduled time of Flow. Defaults to None. + + Returns: + bool: True if flow run was started automatically. False if Flow was started by user. + """ diff = self.calculate_difference( date_to_compare=time_run, base_date=time_schedule, @@ -93,8 +121,14 @@ def get_formatted_date( return_value: Literal["time", "date"] = "date", ): """ - from prefect format date (in string) get clean time or date in datetime type. - - date from Prefect: '2022-02-21T01:00:00+00:00' + Format date from "2022-02-21T01:00:00+00:00" to date or time. + + Args: + time_unclean (str, optional): _description_. Defaults to None. + return_value (Literal["time", "date"], optional): Choose the format to be extracted from datetime - time or date. Defaults to "date". + + Returns: + datetime: Date (datetime.date) or time (datetime.time) """ if return_value == "time": time_extracted = time_unclean.split("T")[1] From 48024a6a9b4ee46da034c515f714d1ad90bc94ab Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Feb 2022 18:15:54 +0100 Subject: [PATCH 018/135] =?UTF-8?q?=E2=9C=A8=20Added=20`DuckDBToDF`=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/tasks/__init__.py | 2 +- viadot/tasks/duckdb.py | 68 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34df682ec..88f988dca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Added `SQLServer` source +- Added `DuckDBToDF` task ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index 7860a77df..c52285f50 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -31,4 +31,4 @@ except ImportError: pass -from .duckdb import DuckDBCreateTableFromParquet, DuckDBQuery +from .duckdb import DuckDBCreateTableFromParquet, DuckDBQuery, DuckDBToDF diff --git a/viadot/tasks/duckdb.py b/viadot/tasks/duckdb.py index cb4d5e3c8..461728cd8 100644 --- a/viadot/tasks/duckdb.py +++ b/viadot/tasks/duckdb.py @@ -2,6 +2,7 @@ from prefect import Task from prefect.utilities.tasks import defaults_from_attrs +import pandas as pd from ..sources import DuckDB @@ -128,3 +129,70 @@ def run( self.logger.info( f"Table {fqn} has not been created as if_exists is set to {if_exists}." ) + + +class DuckDBToDF(Task): + """ + Load a table from DuckDB to a pandas DataFrame. + + Args: + schema (str, optional): Source schema. + table (str, optional): Source table. + if_empty (Literal[, optional): What to do if the query returns no data. + Defaults to "warn". + credentials (dict, optional): The config to use for connecting with the db. + + Returns: + pd.DataFrame: a pandas DataFrame containing the table data. 
+ """ + + def __init__( + self, + schema: str = None, + table: str = None, + if_empty: Literal["warn", "skip", "fail"] = "warn", + credentials: dict = None, + *args, + **kwargs, + ): + + self.schema = schema + self.table = table + self.if_empty = if_empty + self.credentials = credentials + + super().__init__(name="duckdb_to_df", *args, **kwargs) + + @defaults_from_attrs("schema", "table", "if_empty", "credentials") + def run( + self, + schema: str = None, + table: str = None, + if_empty: Literal["warn", "skip", "fail"] = None, + credentials: dict = None, + ) -> pd.DataFrame: + """Load a DuckDB table into a pandas DataFrame. + + Args: + schema (str, optional): Source schema. + table (str, optional): Source table. + if_empty (Literal[, optional): What to do if the query returns no data. + Defaults to "warn". + credentials (dict, optional): The config to use for connecting with the db. + + Returns: + pd.DataFrame: a pandas DataFrame containing the table data. + """ + + if table is None: + raise ValueError("Table is required.") + + duckdb = DuckDB(credentials=credentials) + + # run the query and fetch the results if it's a select + fqn = f"{schema}.{table}" if schema else table + query = f"SELECT * FROM {fqn}" + df = duckdb.to_df(query, if_empty=if_empty) + + self.logger.info(f"Data has been loaded sucessfully.") + return df From 7a3eb2ae57d65ad71abc20d8e4bb2d5fef93f8a2 Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Feb 2022 18:16:30 +0100 Subject: [PATCH 019/135] =?UTF-8?q?=F0=9F=9A=B8=20Changed=20default=20`if?= =?UTF-8?q?=5Fempty`=20value=20to=20"warn"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/duckdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/duckdb.py b/viadot/sources/duckdb.py index 3d7a00fdc..37486f4d6 100644 --- a/viadot/sources/duckdb.py +++ b/viadot/sources/duckdb.py @@ -115,7 +115,7 @@ def run( cursor.close() return result - def _handle_if_empty(self, if_empty: str = None) -> NoReturn: + def _handle_if_empty(self, if_empty: str = "warn") -> NoReturn: if if_empty == "warn": logger.warning("The query produced no data.") elif if_empty == "skip": From c41e1afa8cd6c688485b01a48a4c369cc8e6679c Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 22 Feb 2022 12:02:40 +0100 Subject: [PATCH 020/135] aselite --- viadot/sources/__init__.py | 1 + viadot/sources/aselite.py | 48 +++++++++++++++++++++++++++++++++ viadot/tasks/__init__.py | 1 + viadot/tasks/aselite.py | 54 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+) create mode 100644 viadot/sources/aselite.py create mode 100644 viadot/tasks/aselite.py diff --git a/viadot/sources/__init__.py b/viadot/sources/__init__.py index 177d13485..f8fa0a7f6 100644 --- a/viadot/sources/__init__.py +++ b/viadot/sources/__init__.py @@ -8,3 +8,4 @@ # APIS from .uk_carbon_intensity import UKCarbonIntensity from .sqlite import SQLite +from .aselite import ASELite diff --git a/viadot/sources/aselite.py b/viadot/sources/aselite.py new file mode 100644 index 000000000..689ba04ed --- /dev/null +++ b/viadot/sources/aselite.py @@ -0,0 +1,48 @@ +from viadot.sources.base import SQL +from viadot.exceptions import CredentialError +from typing import Any, Dict, List +from viadot.config import local_config +import pandas as pd +from prefect import task + + +@task +def query_result_to_df_task(result: List[tuple], cols: List[str]): + return pd.DataFrame.from_records(result, columns=cols) + + +class ASELite(SQL): + """ + Python 
class that inheridate form SQL class located in viadot platform. + + Args: + credentials: (Dict[str, Any] , optional): Also credentials can be stored in credantials.json file in config folder. + It can be necessary to change driver type: "driver": "PostgreSQL Unicode" + """ + + def __init__( + self, credentials: Dict[str, Any] = None, db_name: str = None, *args, **kwargs + ): + DEFAULT_CREDENTIALS = local_config.get("ASLite_SQL") + credentials = DEFAULT_CREDENTIALS or credentials + if credentials is None: + raise CredentialError("Missing credentials.") + + super().__init__(*args, credentials=credentials, **kwargs) + self.credentials = credentials + self.credentials["db_name"] = db_name + + def to_df(self, query: str, if_empty: str = None) -> pd.DataFrame: + """Creates DataFrame form SQL query. + Args: + query (str): SQL query. If don't start with "SELECT" returns empty DataFrame. + if_empty (str, optional): What to do if the query returns no data. Defaults to None. + """ + conn = self.con + if query.upper().startswith("SELECT"): + df = pd.read_sql_query(query, conn) + if df.empty: + self._handle_if_empty(if_empty=if_empty) + else: + df = pd.DataFrame() + return df diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index 000e3d42e..d08358912 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -24,3 +24,4 @@ from .supermetrics import SupermetricsToCSV, SupermetricsToDF from .sharepoint import SharepointToDF from .cloud_for_customers import C4CReportToDF, C4CToDF +from .aselite import ASELiteToDF diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py new file mode 100644 index 000000000..7f7c8ab55 --- /dev/null +++ b/viadot/tasks/aselite.py @@ -0,0 +1,54 @@ +from prefect import Task +from ..sources import ASELite +from typing import Any, Dict, List +from prefect.tasks.secrets import PrefectSecret +from .azure_key_vault import AzureKeyVaultSecret +from viadot.config import local_config +import json + + +class ASELiteToDF(Task): + def __init__( + self, credentials: Dict[str, Any] = None, db_name: str = None, *args, **kwargs + ): + self.credentials = credentials + self.db_name = db_name + + super().__init__( + name="aselite", + *args, + **kwargs, + ) + + def __call__(self, *args, **kwargs): + """Download aselite to df""" + return super().__call__(*args, **kwargs) + + def run( + self, + credentials: Dict[str, Any] = None, + db_name: str = None, + query: str = None, + if_empty: str = None, + credentials_secret: str = None, + vault_name: str = None, + ): + + if not credentials_secret: + try: + credentials_secret = PrefectSecret("xxxxxxxxxxxxxxxxxxxxxxxx").run() + except ValueError: + pass + + if credentials_secret: + credentials_str = AzureKeyVaultSecret( + credentials_secret, vault_name=vault_name + ).run() + credentials = json.loads(credentials_str) + else: + credentials = local_config.get("xxxxxxxxxxxxxxxxxxxxxxxxxxx") + aselite = ASElite(credentials=credentials, db_name=db_name) + + df = aselite.to_df(query=query, if_empty=if_empty) + + return df From 39269bf1baba38576d37ae438822052b265538bd Mon Sep 17 00:00:00 2001 From: trymzet Date: Tue, 22 Feb 2022 12:25:19 +0100 Subject: [PATCH 021/135] =?UTF-8?q?=E2=9C=A8=20Add=20`DuckDBTransform`=20f?= =?UTF-8?q?low?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/flows/test_duckdb_transform.py | 48 +++++++++++++++++++++++ tests/unit/test_duckdb.py | 6 +-- viadot/flows/__init__.py | 2 + viadot/flows/duckdb_transform.py | 42 ++++++++++++++++++++ 
viadot/sources/duckdb.py | 19 +++++++-- viadot/tasks/duckdb.py | 2 +- 6 files changed, 109 insertions(+), 10 deletions(-) create mode 100644 tests/unit/flows/test_duckdb_transform.py create mode 100644 viadot/flows/duckdb_transform.py diff --git a/tests/unit/flows/test_duckdb_transform.py b/tests/unit/flows/test_duckdb_transform.py new file mode 100644 index 000000000..8cbc78f6b --- /dev/null +++ b/tests/unit/flows/test_duckdb_transform.py @@ -0,0 +1,48 @@ +import os + +import pytest +from viadot.flows import DuckDBTransform +from viadot.sources import DuckDB + +BRONZE_SCHEMA = "bronze_schema" +SILVER_SCHEMA = "silver_schema" +TABLE = "test_table" +DATABASE_PATH = "test_db_123.duckdb" +CREDENTIALS = dict(database=DATABASE_PATH) + + +@pytest.fixture(scope="session") +def duckdb(): + duckdb = DuckDB(credentials=CREDENTIALS) + duckdb.run(f"CREATE SCHEMA IF NOT EXISTS {BRONZE_SCHEMA}") + duckdb.run(f"CREATE SCHEMA IF NOT EXISTS {SILVER_SCHEMA}") + + # create placeholder tables so that we can list schemas later on + # (DuckDB does not expose a way to list schemas without a table) + duckdb.run(f"CREATE TABLE {BRONZE_SCHEMA}.placeholder(a INTEGER)") + duckdb.run(f"CREATE TABLE {SILVER_SCHEMA}.placeholder(a INTEGER)") + yield duckdb + os.remove(DATABASE_PATH) + + +def test_duckdb_transform(duckdb, TEST_PARQUET_FILE_PATH): + silver_table_fqn = SILVER_SCHEMA + "." + TABLE + assert silver_table_fqn not in duckdb.tables + + # create a table to transform + duckdb.create_table_from_parquet( + schema=BRONZE_SCHEMA, table=TABLE, path=TEST_PARQUET_FILE_PATH + ) + + # run the flow + flow = DuckDBTransform( + name="First DuckDBTransform flow", + query=f"CREATE TABLE {SILVER_SCHEMA}.{TABLE} AS SELECT * FROM {BRONZE_SCHEMA}.{TABLE}", + credentials=CREDENTIALS, + ) + result = flow.run() + + assert result.is_successful() + + df = duckdb.to_df(f"SELECT * FROM {SILVER_SCHEMA}.{TABLE}") + assert df.shape[0] == 3 diff --git a/tests/unit/test_duckdb.py b/tests/unit/test_duckdb.py index 05b1778da..ae0177063 100644 --- a/tests/unit/test_duckdb.py +++ b/tests/unit/test_duckdb.py @@ -7,15 +7,11 @@ TABLE = "test_table" SCHEMA = "test_schema" TABLE_MULTIPLE_PARQUETS = "test_multiple_parquets" -DATABASE_PATH = "test.duckdb" +DATABASE_PATH = "test_db_123.duckdb" @pytest.fixture(scope="session") def duckdb(): - try: - os.remove(DATABASE_PATH) - except FileNotFoundError: - pass duckdb = DuckDB(credentials=dict(database=DATABASE_PATH)) yield duckdb os.remove(DATABASE_PATH) diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index 696c7aeec..cb9e7a7d1 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -14,3 +14,5 @@ from .sap_to_duckdb import SAPToDuckDB except ImportError: pass + +from .duckdb_transform import DuckDBTransform diff --git a/viadot/flows/duckdb_transform.py b/viadot/flows/duckdb_transform.py new file mode 100644 index 000000000..934539cc7 --- /dev/null +++ b/viadot/flows/duckdb_transform.py @@ -0,0 +1,42 @@ +from typing import Any, Dict, List + +from prefect import Flow + +from ..tasks.duckdb import DuckDBQuery + +query_task = DuckDBQuery() + + +class DuckDBTransform(Flow): + def __init__( + self, + name: str, + query: str, + credentials: dict = None, + tags: List[str] = ["transform"], + *args: List[any], + **kwargs: Dict[str, Any] + ): + """ + Flow for running SQL queries on top of DuckDB. + + Args: + name (str): The name of the flow. + query (str, required): The query to execute on the database. + credentials (dict, optional): Credentials for the connection. Defaults to None. 
+ tags (list, optional): Tag for marking flow. Defaults to "transform". + """ + self.query = query + self.credentials = credentials + self.tags = tags + self.tasks = [query_task] + + super().__init__(*args, name=name, **kwargs) + self.gen_flow() + + def gen_flow(self) -> Flow: + query_task.bind( + query=self.query, + credentials=self.credentials, + flow=self, + ) diff --git a/viadot/sources/duckdb.py b/viadot/sources/duckdb.py index 37486f4d6..d385640b8 100644 --- a/viadot/sources/duckdb.py +++ b/viadot/sources/duckdb.py @@ -1,4 +1,3 @@ -from multiprocessing.sharedctypes import Value from typing import Any, List, Literal, NoReturn, Tuple, Union import pandas as pd @@ -70,6 +69,20 @@ def tables(self) -> List[str]: tables = [table_meta[1] + "." + table_meta[2] for table_meta in tables_meta] return tables + @property + def schemas(self) -> List[str]: + """Show the list of schemas. + + Returns: + List[str]: The list ofschemas. + """ + self.logger.warning( + "DuckDB does not expose a way to list schemas. `DuckDB.schemas` only contains schemas with tables." + ) + tables_meta: List[Tuple] = self.run("SELECT * FROM information_schema.tables") + schemas = [table_meta[1] for table_meta in tables_meta] + return schemas + def to_df(self, query: str, if_empty: str = None) -> pd.DataFrame: if query.upper().startswith("SELECT"): df = self.run(query, fetch_type="dataframe") @@ -162,9 +175,7 @@ def create_table_from_parquet( elif if_exists == "skip": return False - schema_exists = self._check_if_schema_exists(schema) - if not schema_exists: - self.run(f"CREATE SCHEMA {schema}") + self.run(f"CREATE SCHEMA IF NOT EXISTS {schema}") self.logger.info(f"Creating table {fqn}...") ingest_query = f"CREATE TABLE {fqn} AS SELECT * FROM '{path}';" diff --git a/viadot/tasks/duckdb.py b/viadot/tasks/duckdb.py index 461728cd8..88ca3fcb1 100644 --- a/viadot/tasks/duckdb.py +++ b/viadot/tasks/duckdb.py @@ -190,7 +190,7 @@ def run( duckdb = DuckDB(credentials=credentials) # run the query and fetch the results if it's a select - fqn = f"{schema}.{table}" if schema else table + fqn = f"{schema}.{table}" if schema is not None else table query = f"SELECT * FROM {fqn}" df = duckdb.to_df(query, if_empty=if_empty) From c0a6f24fe0f49438454b69da1fc613cad2af7736 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 22 Feb 2022 13:37:14 +0100 Subject: [PATCH 022/135] =?UTF-8?q?=E2=9C=A8=20added=20new=20source?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/aselite.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py index 7f7c8ab55..ec5c5a6df 100644 --- a/viadot/tasks/aselite.py +++ b/viadot/tasks/aselite.py @@ -5,6 +5,7 @@ from .azure_key_vault import AzureKeyVaultSecret from viadot.config import local_config import json +from viadot.sources.azure_sql import AzureSQL class ASELiteToDF(Task): @@ -46,9 +47,9 @@ def run( ).run() credentials = json.loads(credentials_str) else: - credentials = local_config.get("xxxxxxxxxxxxxxxxxxxxxxxxxxx") - aselite = ASElite(credentials=credentials, db_name=db_name) + credentials = local_config.get("ASLite_SQL") - df = aselite.to_df(query=query, if_empty=if_empty) - - return df + ase = AzureSQL(credentials=credentials) + ase.conn_str + ase.con + return ase.to_df(query) From 6bf77a5af52c51d5acddb2a4f38968f3bcf1aef5 Mon Sep 17 00:00:00 2001 From: trymzet Date: Tue, 22 Feb 2022 13:51:20 +0100 Subject: [PATCH 023/135] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20conflict=20with?= 
=?UTF-8?q?=20DuckDB=20file=20in=20another=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/flows/test_duckdb_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/flows/test_duckdb_transform.py b/tests/unit/flows/test_duckdb_transform.py index 8cbc78f6b..79491614d 100644 --- a/tests/unit/flows/test_duckdb_transform.py +++ b/tests/unit/flows/test_duckdb_transform.py @@ -7,7 +7,7 @@ BRONZE_SCHEMA = "bronze_schema" SILVER_SCHEMA = "silver_schema" TABLE = "test_table" -DATABASE_PATH = "test_db_123.duckdb" +DATABASE_PATH = "test_db_1234.duckdb" CREDENTIALS = dict(database=DATABASE_PATH) From 4603b12575e4e4be5da4684224212c705034bf96 Mon Sep 17 00:00:00 2001 From: trymzet Date: Tue, 22 Feb 2022 13:51:31 +0100 Subject: [PATCH 024/135] =?UTF-8?q?=F0=9F=93=9D=20Add=20changelog=20entry?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88f988dca..3de9642c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added `SQLServer` source - Added `DuckDBToDF` task +- Added `DuckDBTransform` flow ### Changed - Changed the base class of `AzureSQL` to `SQLServer` From 10a37c2b94d7f8af6010fdf7059e24525fdeb15d Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 22 Feb 2022 14:45:21 +0100 Subject: [PATCH 025/135] new source --- viadot/task_utils.py | 5 +++++ viadot/tasks/aselite.py | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index aeb7375d9..0805a7114 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -195,6 +195,11 @@ def cleanup_validation_clutter(expectations_path): shutil.rmtree(ge_project_path) +@task +def df_converts_bytes_to_int(cls, df): + return df.applymap(lambda x: list(map(int, x)) if isinstance(x, bytes) else x) + + class Git(Git): @property def git_clone_url(self): diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py index ec5c5a6df..30fc4026d 100644 --- a/viadot/tasks/aselite.py +++ b/viadot/tasks/aselite.py @@ -10,10 +10,11 @@ class ASELiteToDF(Task): def __init__( - self, credentials: Dict[str, Any] = None, db_name: str = None, *args, **kwargs + self, credentials: Dict[str, Any] = None, db_name: str = None, query: str =None, *args, **kwargs ): self.credentials = credentials self.db_name = db_name + self.query = query super().__init__( name="aselite", @@ -52,4 +53,5 @@ def run( ase = AzureSQL(credentials=credentials) ase.conn_str ase.con - return ase.to_df(query) + final_df = ase.to_df(self.query) + return final_df From 4a76ce2bfab4b62fb8fdca83c339d6bd0fb66180 Mon Sep 17 00:00:00 2001 From: trymzet Date: Tue, 22 Feb 2022 14:46:31 +0100 Subject: [PATCH 026/135] =?UTF-8?q?=F0=9F=9A=B8=20Add=20HTTP=20proxy=20to?= =?UTF-8?q?=20SAP=20RFC=20example?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/examples/sap_rfc/Dockerfile | 6 +++++- viadot/examples/sap_rfc/build.sh | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 viadot/examples/sap_rfc/build.sh diff --git a/viadot/examples/sap_rfc/Dockerfile b/viadot/examples/sap_rfc/Dockerfile index 2832a3ecd..c08acb30f 100644 --- a/viadot/examples/sap_rfc/Dockerfile +++ b/viadot/examples/sap_rfc/Dockerfile @@ -1,4 +1,4 @@ -FROM viadot:latest +FROM 
viadot:dev USER root @@ -12,4 +12,8 @@ RUN ldconfig COPY requirements.txt . RUN xargs -L 1 pip install < requirements.txt +ARG HTTP_PROXY="" +ENV HTTP_PROXY=$HTTP_PROXY +RUN git config --global http.proxy ${HTTP_PROXY:-""} + USER viadot \ No newline at end of file diff --git a/viadot/examples/sap_rfc/build.sh b/viadot/examples/sap_rfc/build.sh new file mode 100644 index 000000000..9d5d45380 --- /dev/null +++ b/viadot/examples/sap_rfc/build.sh @@ -0,0 +1 @@ +docker build --no-cache . -t viadot:sap_rfc From e3ac2b3e45c6cc6eb70ad43a53241254a0303741 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 22 Feb 2022 15:06:15 +0100 Subject: [PATCH 027/135] added new flow class --- viadot/flows/aselite_to_adls.py | 68 +++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 viadot/flows/aselite_to_adls.py diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py new file mode 100644 index 000000000..cc4cf785b --- /dev/null +++ b/viadot/flows/aselite_to_adls.py @@ -0,0 +1,68 @@ +import pandas as pd +from typing import Any, Dict, List, Literal +from prefect import Flow, task +from viadot.tasks import AzureDataLakeUpload, AzureSQLDBQuery, A +from viadot.task_utils import df_to_csv +from viadot.tasks.aselite import ASELiteToDF +#tasks +df_task = ASELiteToDF() +#query to df albo od razu query to csv +#df_to_csv_task = df_to_csv() +file_to_adls_task = AzureDataLakeUpload() + +@task +def query_result_to_df_task(result: List[tuple], cols: List[str] ): + return pd.DataFrame.from_records(result, columns = cols) + +class ASLitetoADLS(Flow): + def __init__( + self, + name: str, + query: str = None, + sqldb_credentials_secret: str = None, + adls_sp_credentials_secret: str = None, + vault_name: str = None, + # schema: str = None, + # table: str = None, + if_empty: str = "warn", + file_path: str = "None", #from path + sep: str = "\t", + to_path: str = None, # storage + if_exists: Literal["replace", "append", "delete"] = "replace", + col_names: List[str] = None, + *args: List[any], + **kwargs: Dict[str, Any] + ): + + # Query task + self.query = query + self.sqldb_credentials_secret = sqldb_credentials_secret + self.adls_sp_credentials_secret = adls_sp_credentials_secret + self.vault_name = vault_name + + # table and schema needed only in query + # self.schema = schema + # self.table = table + self.if_empty = if_empty # empty query + self.col_names = col_names + + self.file_path = file_path # path where to write file (csv) locally + self.sep = sep + # Svae to storage + self.to_path = to_path + self.if_exists = if_exists + + super().__init__(*args, name=name, **kwargs) + + self.gen_flow() + + def gen_flow(self) -> Flow: + query_result = query_task.bind( query = self.query, credentials_secret = self.sqldb_credentials_secret, vault_name = self.vault_name, flow = self) + df = query_result_to_df_task.bind(query_result, cols = self.col_names, flow = self) + csv = df_to_csv.bind(df, path= self.file_path, sep = self.sep, if_exists = self.if_exists, flow = self) + adls_upload = file_to_adls_task.bind(from_path = self.file_path, to_path = self.to_path, flow =self) + print(query_result) + df.set_upstream(query_result, flow=self) + csv.set_upstream(df, flow =self) + adls_upload.set_upstream(csv, flow =self) + From cef37d0940e2b720f49afead7d3817d820a4f5a5 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 22 Feb 2022 15:08:27 +0100 Subject: [PATCH 028/135] =?UTF-8?q?=E2=9C=A8=20added=20new=20flow=20aselit?= =?UTF-8?q?e=5Fto=5Fadls?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/aselite_to_adls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index cc4cf785b..9618b7fd2 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -1,7 +1,7 @@ import pandas as pd from typing import Any, Dict, List, Literal from prefect import Flow, task -from viadot.tasks import AzureDataLakeUpload, AzureSQLDBQuery, A +from viadot.tasks import AzureDataLakeUpload, AzureSQLDBQuery from viadot.task_utils import df_to_csv from viadot.tasks.aselite import ASELiteToDF #tasks From 5f6a938e7961ca57ac87727d94d7c9fb35f955f6 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 23 Feb 2022 10:18:08 +0100 Subject: [PATCH 029/135] =?UTF-8?q?=E2=9C=85=20Added=20tests=20for=20GetFl?= =?UTF-8?q?owLastSuccessfulRun=20Task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_prefect.py | 50 ++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/tests/integration/tasks/test_prefect.py b/tests/integration/tasks/test_prefect.py index 7281bdfda..94731d8a7 100644 --- a/tests/integration/tasks/test_prefect.py +++ b/tests/integration/tasks/test_prefect.py @@ -8,6 +8,34 @@ PREFECT_TASK = GetFlowLastSuccessfulRun() DATE_FROM_PREFECT = "2022-01-01T01:30:00+00:00" DATE_FROM_PREFECT2 = "2022-01-04T02:20:00+00:00" +PREFECT_JSON = { + "data": { + "flow": [ + { + "flow_runs": [ + { + "id": "88a505ec-f7c9-4242-baa4-81c9db1aaafe", + "start_time": "2022-02-21T01:04:44.146191+00:00", + "state": "Failed", + "scheduled_start_time": "2022-02-21T01:00:00+00:00", + }, + { + "id": "9d310a47-2fb0-4435-971c-4fcaf3efca06", + "start_time": "2022-02-20T01:05:36.142547+00:00", + "state": "Success", + "scheduled_start_time": "2022-02-20T01:00:00+00:00", + }, + { + "id": "90c32b13-4a0a-467b-a07f-c568a34f1bc2", + "start_time": "2022-02-19T01:05:30.572431+00:00", + "state": "Failed", + "scheduled_start_time": "2022-02-19T01:00:00+00:00", + }, + ] + } + ] + } +} def test_get_formatted_dated(): @@ -39,7 +67,7 @@ def test_calculate_difference_time(): base_date=DATE_FROM_PREFECT, diff_type="time", ) - assert diff_hours == 1 + assert diff_hours == 0 diff_hours = PREFECT_TASK.calculate_difference( date_to_compare="2022-01-04T02:50:00+00:00", @@ -47,3 +75,23 @@ def test_calculate_difference_time(): diff_type="time", ) assert diff_hours == 1.20 + + +def test_get_time_from_last_successful_run(): + flow_runs = PREFECT_JSON["data"]["flow"][0]["flow_runs"] + start_time_success = PREFECT_TASK.get_time_from_last_successful_run( + flow_runs_details=flow_runs + ) + assert start_time_success == "2022-02-20T01:05:36.142547+00:00" + + +def test_check_if_scheduled_run(): + is_scheduled = PREFECT_TASK.check_if_scheduled_run( + time_run="2022-02-21T01:40:00+00:00", time_schedule="2022-02-15T01:00:00+00:00" + ) + assert is_scheduled is True + + is_scheduled = PREFECT_TASK.check_if_scheduled_run( + time_run="2022-02-21T02:20:00+00:00", time_schedule="2022-02-15T01:00:00+00:00" + ) + assert is_scheduled is False From 7fca68730c9cc20e6178d56bf948790fc4b9566b Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 23 Feb 2022 11:14:45 +0100 Subject: [PATCH 030/135] =?UTF-8?q?=E2=9C=A8=20added/=20edited=20ASELite?= =?UTF-8?q?=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/__init__.py | 1 + viadot/flows/aselite_to_adls.py 
| 80 ++++++++++++++++++--------------- viadot/sources/__init__.py | 1 - viadot/tasks/aselite.py | 32 +++++++++---- 4 files changed, 69 insertions(+), 45 deletions(-) diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index 122016a08..a2fcca9be 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -9,3 +9,4 @@ from .adls_container_to_container import ADLSContainerToContainer from .sharepoint_to_adls import SharepointToADLS from .cloud_for_customers_report_to_adls import CloudForCustomersReportToADLS +from .aselite_to_adls import ASELitetoADLS diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index 9618b7fd2..5efcab9dc 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -1,54 +1,54 @@ -import pandas as pd from typing import Any, Dict, List, Literal -from prefect import Flow, task -from viadot.tasks import AzureDataLakeUpload, AzureSQLDBQuery +from prefect import Flow +from viadot.tasks import AzureDataLakeUpload from viadot.task_utils import df_to_csv from viadot.tasks.aselite import ASELiteToDF -#tasks + + df_task = ASELiteToDF() -#query to df albo od razu query to csv -#df_to_csv_task = df_to_csv() file_to_adls_task = AzureDataLakeUpload() -@task -def query_result_to_df_task(result: List[tuple], cols: List[str] ): - return pd.DataFrame.from_records(result, columns = cols) -class ASLitetoADLS(Flow): +class ASELitetoADLS(Flow): def __init__( self, name: str, query: str = None, + db_name: str = None, sqldb_credentials_secret: str = None, - adls_sp_credentials_secret: str = None, vault_name: str = None, - # schema: str = None, - # table: str = None, - if_empty: str = "warn", - file_path: str = "None", #from path + file_path: str = "None", sep: str = "\t", - to_path: str = None, # storage + to_path: str = None, if_exists: Literal["replace", "append", "delete"] = "replace", - col_names: List[str] = None, + overwrite: bool = True, *args: List[any], **kwargs: Dict[str, Any] ): + """ + Flow for downloading data from ASElite to csv file, then uploading it to Azure Storage Explorer. - # Query task + Args: + name (str): The name of the flow. + query (str): + db_name (str): + sqldb_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with + ASElite SQL Database credentials. Defaults to None. + vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. + file_path (str, optional): Local destination path. Defaults to None. + sep (str, optional): The delimiter for the output CSV file. Defaults to "\t". + to_path (str): The path to an ADLS file. Defaults to None. + if_exists (Literal, optional): What to do if the table exists. Defaults to "replace". + overwrite (str, optional): Whether to overwrite the destination file. Defaults to True. 
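With those arguments in mind, running the whole extract is a matter of instantiating the flow. A sketch under assumed names: the query, Key Vault secret, vault name, and ADLS path below are placeholders, not values from the patch:

```python
from viadot.flows import ASELitetoADLS

flow = ASELitetoADLS(
    name="ASELite extract to ADLS",
    query="SELECT TOP 10 * FROM dbo.some_table",         # placeholder query
    sqldb_credentials_secret="aselite-sql-credentials",  # placeholder secret name
    vault_name="my-key-vault",                           # placeholder vault name
    file_path="aselite_extract.csv",                     # local staging CSV
    sep="\t",
    to_path="raw/aselite/aselite_extract.csv",           # placeholder ADLS path
    if_exists="replace",
    overwrite=True,
)

state = flow.run()
assert state.is_successful()
```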
+ """ self.query = query + self.db_name = db_name self.sqldb_credentials_secret = sqldb_credentials_secret - self.adls_sp_credentials_secret = adls_sp_credentials_secret self.vault_name = vault_name + self.overwrite = overwrite - # table and schema needed only in query - # self.schema = schema - # self.table = table - self.if_empty = if_empty # empty query - self.col_names = col_names - - self.file_path = file_path # path where to write file (csv) locally + self.file_path = file_path self.sep = sep - # Svae to storage self.to_path = to_path self.if_exists = if_exists @@ -57,12 +57,22 @@ def __init__( self.gen_flow() def gen_flow(self) -> Flow: - query_result = query_task.bind( query = self.query, credentials_secret = self.sqldb_credentials_secret, vault_name = self.vault_name, flow = self) - df = query_result_to_df_task.bind(query_result, cols = self.col_names, flow = self) - csv = df_to_csv.bind(df, path= self.file_path, sep = self.sep, if_exists = self.if_exists, flow = self) - adls_upload = file_to_adls_task.bind(from_path = self.file_path, to_path = self.to_path, flow =self) - print(query_result) - df.set_upstream(query_result, flow=self) - csv.set_upstream(df, flow =self) - adls_upload.set_upstream(csv, flow =self) + df = df_task.bind( + query=self.query, + db_name=self.db_name, + credentials_secret=self.sqldb_credentials_secret, + vault_name=self.vault_name, + flow=self, + ) + csv = df_to_csv.bind( + df, path=self.file_path, sep=self.sep, if_exists=self.if_exists, flow=self + ) + adls_upload = file_to_adls_task.bind( + from_path=self.file_path, + to_path=self.to_path, + overwrite=self.overwrite, + flow=self, + ) + csv.set_upstream(df, flow=self) + adls_upload.set_upstream(csv, flow=self) diff --git a/viadot/sources/__init__.py b/viadot/sources/__init__.py index f8fa0a7f6..177d13485 100644 --- a/viadot/sources/__init__.py +++ b/viadot/sources/__init__.py @@ -8,4 +8,3 @@ # APIS from .uk_carbon_intensity import UKCarbonIntensity from .sqlite import SQLite -from .aselite import ASELite diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py index 30fc4026d..5f4ceafea 100644 --- a/viadot/tasks/aselite.py +++ b/viadot/tasks/aselite.py @@ -1,17 +1,31 @@ from prefect import Task from ..sources import ASELite +from ..sources import AzureSQL +from viadot.sources.base import SQL from typing import Any, Dict, List from prefect.tasks.secrets import PrefectSecret from .azure_key_vault import AzureKeyVaultSecret from viadot.config import local_config import json -from viadot.sources.azure_sql import AzureSQL class ASELiteToDF(Task): def __init__( - self, credentials: Dict[str, Any] = None, db_name: str = None, query: str =None, *args, **kwargs + self, + credentials: Dict[str, Any] = None, + db_name: str = None, + query: str = None, + *args, + **kwargs ): + """ + Task for obtaining data from ASElite source. + Args: + credentials (Dict[str, Any], optional): ASElite SQL Database credentials. Defaults to None. + db_name(str, optional): Name of ASElite database. Defaults to None. 
+ query(str, optional), + Returns: Pandas DataFrame + """ self.credentials = credentials self.db_name = db_name self.query = query @@ -38,7 +52,7 @@ def run( if not credentials_secret: try: - credentials_secret = PrefectSecret("xxxxxxxxxxxxxxxxxxxxxxxx").run() + credentials_secret = PrefectSecret("aselite").run() except ValueError: pass @@ -48,10 +62,10 @@ def run( ).run() credentials = json.loads(credentials_str) else: - credentials = local_config.get("ASLite_SQL") + credentials = local_config.get("ASELite_SQL") + + aselite = SQL(credentials=credentials, db_name=db_name) + + df = aselite.to_df(query=query, if_empty=if_empty) - ase = AzureSQL(credentials=credentials) - ase.conn_str - ase.con - final_df = ase.to_df(self.query) - return final_df + return df From d65e260621e4b80fda39466f609d19aa3318ff5b Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 23 Feb 2022 11:15:10 +0100 Subject: [PATCH 031/135] =?UTF-8?q?=F0=9F=94=A5=20removed=20aselite=20sour?= =?UTF-8?q?ce?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/aselite.py | 48 --------------------------------------- 1 file changed, 48 deletions(-) delete mode 100644 viadot/sources/aselite.py diff --git a/viadot/sources/aselite.py b/viadot/sources/aselite.py deleted file mode 100644 index 689ba04ed..000000000 --- a/viadot/sources/aselite.py +++ /dev/null @@ -1,48 +0,0 @@ -from viadot.sources.base import SQL -from viadot.exceptions import CredentialError -from typing import Any, Dict, List -from viadot.config import local_config -import pandas as pd -from prefect import task - - -@task -def query_result_to_df_task(result: List[tuple], cols: List[str]): - return pd.DataFrame.from_records(result, columns=cols) - - -class ASELite(SQL): - """ - Python class that inheridate form SQL class located in viadot platform. - - Args: - credentials: (Dict[str, Any] , optional): Also credentials can be stored in credantials.json file in config folder. - It can be necessary to change driver type: "driver": "PostgreSQL Unicode" - """ - - def __init__( - self, credentials: Dict[str, Any] = None, db_name: str = None, *args, **kwargs - ): - DEFAULT_CREDENTIALS = local_config.get("ASLite_SQL") - credentials = DEFAULT_CREDENTIALS or credentials - if credentials is None: - raise CredentialError("Missing credentials.") - - super().__init__(*args, credentials=credentials, **kwargs) - self.credentials = credentials - self.credentials["db_name"] = db_name - - def to_df(self, query: str, if_empty: str = None) -> pd.DataFrame: - """Creates DataFrame form SQL query. - Args: - query (str): SQL query. If don't start with "SELECT" returns empty DataFrame. - if_empty (str, optional): What to do if the query returns no data. Defaults to None. 
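The `run()` method above resolves credentials in two ways: from an Azure Key Vault secret that stores the connection details as a JSON string, or from the local `ASELite_SQL` config entry when no secret is available. Both paths, sketched with placeholder secret and vault names:

```python
from viadot.tasks.aselite import ASELiteToDF

task = ASELiteToDF()

# Key Vault path: the secret is expected to hold the credentials dict as JSON.
df = task.run(
    query="SELECT TOP 5 * FROM dbo.some_table",     # placeholder query
    credentials_secret="aselite-sql-credentials",   # placeholder secret name
    vault_name="my-key-vault",                      # placeholder vault name
)

# Local path: with no secret passed, the task first tries the "aselite" Prefect
# secret and then falls back to the local "ASELite_SQL" config entry.
df = task.run(query="SELECT TOP 5 * FROM dbo.some_table")
print(df.shape)
```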
- """ - conn = self.con - if query.upper().startswith("SELECT"): - df = pd.read_sql_query(query, conn) - if df.empty: - self._handle_if_empty(if_empty=if_empty) - else: - df = pd.DataFrame() - return df From 85a26c29ee96f9da3cd687cecf79dff340c39fc3 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 23 Feb 2022 11:19:34 +0100 Subject: [PATCH 032/135] =?UTF-8?q?=F0=9F=8E=A8=20formated=20aselite=20tas?= =?UTF-8?q?k?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/aselite.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py index 5f4ceafea..acd2ad26c 100644 --- a/viadot/tasks/aselite.py +++ b/viadot/tasks/aselite.py @@ -1,8 +1,6 @@ from prefect import Task -from ..sources import ASELite -from ..sources import AzureSQL from viadot.sources.base import SQL -from typing import Any, Dict, List +from typing import Any, Dict from prefect.tasks.secrets import PrefectSecret from .azure_key_vault import AzureKeyVaultSecret from viadot.config import local_config @@ -23,7 +21,7 @@ def __init__( Args: credentials (Dict[str, Any], optional): ASElite SQL Database credentials. Defaults to None. db_name(str, optional): Name of ASElite database. Defaults to None. - query(str, optional), + query(str, optional): Query to perform on a database. Defaults to None. Returns: Pandas DataFrame """ self.credentials = credentials @@ -31,13 +29,13 @@ def __init__( self.query = query super().__init__( - name="aselite", + name="ASElite_to_df", *args, **kwargs, ) def __call__(self, *args, **kwargs): - """Download aselite to df""" + """Download from aselite database to df""" return super().__call__(*args, **kwargs) def run( From c7ac322e571f79bafe3fa2800e15d517f6c018c5 Mon Sep 17 00:00:00 2001 From: trymzet Date: Wed, 23 Feb 2022 12:43:03 +0100 Subject: [PATCH 033/135] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20`df=5Fto=5Fparquet?= =?UTF-8?q?()`=20task=20now=20creates=20directories=20if=20needed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/task_utils.py | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3de9642c2..564a22000 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Changed the base class of `AzureSQL` to `SQLServer` +- `df_to_parquet()` task now creates directories if needed ### Fixed - fixed OpenSSL config for old SQL Servers still using TLS < 1.2 diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 059222c38..05df5df47 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -194,6 +194,15 @@ def df_to_parquet( return else: out_df = df + + # create directories if they don't exist + try: + if not os.path.isfile(path): + directory = os.path.dirname(path) + os.makedirs(directory, exist_ok=True) + except: + pass + out_df.to_parquet(path, index=False, **kwargs) From 61e5845cef5abf2296e82d7da255908497f3b500 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 23 Feb 2022 13:30:43 +0100 Subject: [PATCH 034/135] :sparkles: added logging --- viadot/flows/aselite_to_adls.py | 25 +++++++++++++++++-------- viadot/tasks/aselite.py | 13 ++++++++----- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index 5efcab9dc..f130d32d6 100644 --- a/viadot/flows/aselite_to_adls.py +++ 
b/viadot/flows/aselite_to_adls.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Literal from prefect import Flow from viadot.tasks import AzureDataLakeUpload -from viadot.task_utils import df_to_csv +from viadot.task_utils import df_to_csv, df_converts_bytes_to_int from viadot.tasks.aselite import ASELiteToDF @@ -14,7 +14,7 @@ def __init__( self, name: str, query: str = None, - db_name: str = None, + #db_name: str = None, sqldb_credentials_secret: str = None, vault_name: str = None, file_path: str = "None", @@ -42,7 +42,7 @@ def __init__( overwrite (str, optional): Whether to overwrite the destination file. Defaults to True. """ self.query = query - self.db_name = db_name + #self.db_name = db_name self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name self.overwrite = overwrite @@ -59,14 +59,22 @@ def __init__( def gen_flow(self) -> Flow: df = df_task.bind( query=self.query, - db_name=self.db_name, + #db_name=self.db_name, credentials_secret=self.sqldb_credentials_secret, vault_name=self.vault_name, flow=self, ) - csv = df_to_csv.bind( - df, path=self.file_path, sep=self.sep, if_exists=self.if_exists, flow=self + + convert_df = df_converts_bytes_to_int.bind(df, flow=self) + + create_csv = df_to_csv.bind( + convert_df, + path=self.file_path, + sep=self.sep, + if_exists=self.if_exists, + flow=self, ) + adls_upload = file_to_adls_task.bind( from_path=self.file_path, to_path=self.to_path, @@ -74,5 +82,6 @@ def gen_flow(self) -> Flow: flow=self, ) - csv.set_upstream(df, flow=self) - adls_upload.set_upstream(csv, flow=self) + convert_df.set_upstream(df, flow=self) + create_csv.set_upstream(convert_df, flow=self) + adls_upload.set_upstream(create_csv, flow=self) diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py index acd2ad26c..250363d1c 100644 --- a/viadot/tasks/aselite.py +++ b/viadot/tasks/aselite.py @@ -5,6 +5,7 @@ from .azure_key_vault import AzureKeyVaultSecret from viadot.config import local_config import json +import prefect class ASELiteToDF(Task): @@ -41,13 +42,13 @@ def __call__(self, *args, **kwargs): def run( self, credentials: Dict[str, Any] = None, - db_name: str = None, + # db_name: str = None, query: str = None, if_empty: str = None, credentials_secret: str = None, vault_name: str = None, ): - + logger = prefect.context.get("logger") if not credentials_secret: try: credentials_secret = PrefectSecret("aselite").run() @@ -59,11 +60,13 @@ def run( credentials_secret, vault_name=vault_name ).run() credentials = json.loads(credentials_str) + logger.info("Loaded credentials from Key Vault") else: credentials = local_config.get("ASELite_SQL") + logger.info("Loaded credentials from local source") - aselite = SQL(credentials=credentials, db_name=db_name) - + aselite = SQL(credentials=credentials) # , db_name=db_name + logger.info("Connected to ASELITE SOURCE") df = aselite.to_df(query=query, if_empty=if_empty) - + logger.info("Succefully collected data from query") return df From 00a28fbba0133837c73245432db60d6353cbb601 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 23 Feb 2022 13:40:04 +0100 Subject: [PATCH 035/135] =?UTF-8?q?=E2=9C=A8=20added=20logger?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 0805a7114..c8f461db5 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -196,7 +196,9 @@ def cleanup_validation_clutter(expectations_path): 
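The `df_converts_bytes_to_int` step bound into the flow above exists because binary columns come back from the driver as `bytes`, which do not serialize cleanly to CSV. What the helper does, shown here as a standalone re-implementation of the same `applymap` logic (in viadot it is wrapped in a Prefect `@task`):

```python
import pandas as pd


def bytes_to_int_lists(df: pd.DataFrame) -> pd.DataFrame:
    # Same logic as viadot's df_converts_bytes_to_int, for illustration only.
    return df.applymap(lambda x: list(map(int, x)) if isinstance(x, bytes) else x)


df = pd.DataFrame({"id": [1, 2], "raw": [b"\x00\x01", b"\xff\x10"]})
converted = bytes_to_int_lists(df)

print(converted["raw"].tolist())  # [[0, 1], [255, 16]]
assert all(isinstance(value, int) for value in converted["raw"][0])
```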
@task -def df_converts_bytes_to_int(cls, df): +def df_converts_bytes_to_int(df): + logger = prefect.context.get("logger") + logger.info("Converting bytes in dataframe columns to list of integers") return df.applymap(lambda x: list(map(int, x)) if isinstance(x, bytes) else x) From 5b31a8b566ece8e81fbf2010c8362e46bbdcab18 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 23 Feb 2022 13:49:00 +0100 Subject: [PATCH 036/135] =?UTF-8?q?=F0=9F=9A=A7=20Moved=20change=5Fdate=5F?= =?UTF-8?q?range=5Ftask()=20from=20SupermetricsToADLS=20to=20GetFlowNewDat?= =?UTF-8?q?eRange=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/supermetrics_to_adls.py | 21 +------ viadot/tasks/__init__.py | 2 +- viadot/tasks/prefect.py | 82 ++++++++++++++++++---------- 3 files changed, 58 insertions(+), 47 deletions(-) diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index ce847716e..991a4c659 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -24,7 +24,7 @@ DownloadGitHubFile, RunGreatExpectationsValidation, SupermetricsToDF, - GetFlowLastSuccessfulRun, + GetFlowNewDateRange, ) logger = logging.get_logger(__name__) @@ -34,7 +34,7 @@ validation_task = RunGreatExpectationsValidation() file_to_adls_task = AzureDataLakeUpload() json_to_adls_task = AzureDataLakeUpload() -prefect_get_successful_run = GetFlowLastSuccessfulRun() +prefect_get_new_date_range = GetFlowNewDateRange() class SupermetricsToADLS(Flow): @@ -174,18 +174,6 @@ def __init__( def slugify(name): return name.replace(" ", "_").lower() - def change_date_range_task( - self, date_range: str = None, difference: int = None, flow: Flow = None - ) -> Task: - old_range_splitted = date_range.split("_") - old_range = int(old_range_splitted[1]) - new_range = old_range + difference - - new_range_splitted = old_range_splitted - new_range_splitted[1] = str(new_range) - date_range_type = "_".join(new_range_splitted) - return date_range_type - def gen_supermetrics_task( self, ds_accounts: Union[str, List[str]], flow: Flow = None ) -> Task: @@ -213,14 +201,11 @@ def gen_supermetrics_task( def gen_flow(self) -> Flow: if self.date_range_type is not None: - difference = prefect_get_successful_run.run( + self.date_range_type = prefect_get_new_date_range.run( flow_name=self.flow_name, date_range_type=self.date_range_type, flow=self, ) - self.date_range_type = self.change_date_range_task( - date_range=self.date_range_type, difference=difference, flow=self - ) if self.parallel: # generate a separate task for each account diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index 5955f735f..087f5767e 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -24,4 +24,4 @@ from .supermetrics import SupermetricsToCSV, SupermetricsToDF from .sharepoint import SharepointToDF from .cloud_for_customers import C4CReportToDF, C4CToDF -from .prefect import GetFlowLastSuccessfulRun +from .prefect import GetFlowNewDateRange diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py index 7bca7a07d..58f841fe1 100644 --- a/viadot/tasks/prefect.py +++ b/viadot/tasks/prefect.py @@ -11,7 +11,7 @@ logger = logging.get_logger() -class GetFlowLastSuccessfulRun(Task): +class GetFlowNewDateRange(Task): def __init__( self, flow_name: str = None, @@ -33,32 +33,38 @@ def __call__(self): """Extract time from Prefect Flow run""" super().__call__(self) - def iter_throught_flow_runs_ids(self, run_ids_list: List[str] = None): + def 
iter_throught_flow_runs(self, flow_runs_details: List[dict] = None): """ Generate Flow run ids Args: - run_ids_list (List[str], optional): List of Flow run ids. Defaults to None. + flow_runs_details (List[dict], optional): List of Flow run details. Defaults to None. Yields: - str: Flow id + dict: Flow run details """ - for id in range(len(run_ids_list)): - yield run_ids_list[id] + for x in range(len(flow_runs_details)): + for flow_run in flow_runs_details[x]["flow_runs"]: + yield flow_run - def get_time_from_last_successful_run(self, flow_run_ids: List[str] = None) -> str: + def get_time_from_last_successful_run( + self, flow_runs_details: List[dict] = None + ) -> str: """ Get start_time from last Flow run where state was success. Args: - flow_run_ids (List[str], optional): List of Flow run ids. Defaults to None. + flow_runs_details (List[dict], optional): List of Flow run details. Defaults to None. Returns: - str: start_time of Flow run + str: Flow run start_time """ - for flow_run in self.iter_throught_flow_runs_ids(flow_run_ids): - if flow_run.state == "Success": - return flow_run.start_time + + for flow_run in self.iter_throught_flow_runs( + flow_runs_details=flow_runs_details + ): + if flow_run["state"] == "Failed": + return flow_run["start_time"] def calculate_difference( self, @@ -83,12 +89,16 @@ def calculate_difference( if diff_type == "time": difference_h = abs(base_date.hour - date_to_compare.hour) - if difference_h <= 1: - difference_m = date_to_compare.minute - base_date.minute - if difference_m <= 0: - return 1 + difference_m = date_to_compare.minute - base_date.minute + if difference_h == 1: + if difference_m < 0: + return 0 if difference_m > 0: return float(f"1.{(abs(difference_m))}") + if difference_m == 0: + return 1 + if difference_h < 1: + return 0 if difference_h > 1: return difference_h @@ -110,9 +120,9 @@ def check_if_scheduled_run( base_date=time_schedule, diff_type="time", ) - if diff < 1: + if diff <= 1: return True - if diff >= 1: + if diff > 1: return False def get_formatted_date( @@ -141,6 +151,16 @@ def get_formatted_date( date_clean = datetime.strptime(date_extracted, "%Y-%m-%d") return date_clean.date() + def change_date_range(self, date_range: str = None, difference: int = None): + old_range_splitted = date_range.split("_") + old_range = int(old_range_splitted[1]) + new_range = old_range + difference + + new_range_splitted = old_range_splitted + new_range_splitted[1] = str(new_range) + date_range_type = "_".join(new_range_splitted) + return date_range_type + @defaults_from_attrs( "flow_name", "date_range_type", @@ -151,9 +171,6 @@ def run( date_range_type, **kwargs, ) -> None: - - client = prefect.Client() - query = ( """ { @@ -175,22 +192,31 @@ def run( % flow_name ) + client = prefect.Client() flow_runs = client.graphql(query) - flow_runs_ids = flow_runs.data.flow[0]["flow_runs"] + flow_runs_details = flow_runs.data.flow + + time_schedule = flow_runs_details[0]["flow_runs"][0]["scheduled_start_time"] + + last_success_start_time = self.get_time_from_last_successful_run( + flow_runs_details + ) - last_success_start_time = self.get_time_from_last_successful_run(flow_runs_ids) - time_schedule = flow_runs_ids[0]["scheduled_start_time"] is_scheduled = self.check_if_scheduled_run( time_run=last_success_start_time, time_schedule=time_schedule, ) + if is_scheduled is True: - new_date = self.calculate_difference( + difference_days = self.calculate_difference( date_to_compare=last_success_start_time, base_date=time_schedule, diff_type="date", ) - if is_scheduled is 
False: - return self.date_range_type + date_range_type = self.change_date_range( + date_range=date_range_type, difference=difference_days + ) + return date_range_type - return new_date + if is_scheduled is False: + return 0 From 48bca88a5990b13bdb1e39dac475612187e95ddb Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 23 Feb 2022 13:50:37 +0100 Subject: [PATCH 037/135] =?UTF-8?q?=E2=9C=8F=EF=B8=8F=20Changed=20from=20'?= =?UTF-8?q?Failed'=20to=20'Success'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/prefect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py index 58f841fe1..31c00dc59 100644 --- a/viadot/tasks/prefect.py +++ b/viadot/tasks/prefect.py @@ -63,7 +63,7 @@ def get_time_from_last_successful_run( for flow_run in self.iter_throught_flow_runs( flow_runs_details=flow_runs_details ): - if flow_run["state"] == "Failed": + if flow_run["state"] == "Success": return flow_run["start_time"] def calculate_difference( From 43ace93b8aca25f05413deae015437ec4fba8a75 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 23 Feb 2022 16:25:44 +0100 Subject: [PATCH 038/135] =?UTF-8?q?=E2=9C=85=20added=20super=20crazy=20tes?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index f4bc977ee..6d0af450c 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -1,8 +1,12 @@ import pytest import pandas as pd from typing import List - -from viadot.task_utils import df_get_data_types_task, df_map_mixed_dtypes_for_parquet +import numpy as np +from viadot.task_utils import ( + df_get_data_types_task, + df_map_mixed_dtypes_for_parquet, + df_converts_bytes_to_int, +) def count_dtypes(dtypes_dict: dict = None, dtypes_to_count: List[str] = None) -> int: @@ -32,3 +36,30 @@ def test_map_dtypes_for_parquet(): sum_of_mapped_dtypes = count_dtypes(dtyps_dict_mapped, ["String"]) assert sum_of_dtypes == sum_of_mapped_dtypes + + +def test_df_converts_bytes_to_int(): + dane = { + "ID": {0: 1, 1: 2, 2: 100, 3: 101, 4: 102}, + "SpracheText": { + 0: "TE_CATALOG_BASE_LANG", + 1: "TE_Docu", + 2: "TE_German", + 3: "TE_English", + 4: "TE_French", + }, + "RKZ": { + 0: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\r\x01\x00\x00\x00\x00\x00\x00\x04\x00\x00q\x9f#NV\x8dG\x00", + 1: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\r\x01\x00\x00\x00\x00\x00\x00\x04\x00\x00<\xa0#NV\x8dG\x00", + 2: b"\r\xa3\x86\x01\x00\x01\x00\x00\x00\x00\x04\r\x01\x00\x00\x00\x00\x00\x00\x04\x00\x003\x9f#NV\x8dG\x00", + 3: b"\r\xa3\x86\x01\x00\x01\x00\x00\x00\x00\x04\r\x01\x00\x00\x00\x00\x00\x00\x04\x00\x00R\x9f#NV\x8dG\x00", + 4: b"\r\xa3\x86\x01\x00\x01\x00\x00\x00\x00\x04\r\x01\x00\x00\x00\x00\x00\x00\x04\x00\x00\xee\x9f#NV\x8dG\x00", + }, + } + + df = pd.DataFrame.from_dict(dane) + test_df = df_converts_bytes_to_int.run(df) + test_df = df_converts_bytes_to_int.run(df) + lst = test_df["RKZ"][0] + is_it_or_not = all(isinstance(x, (int, int)) for x in lst) + assert is_it_or_not == True From 53bb51afd128fa74167a49f6256ea083bdbeff0b Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 24 Feb 2022 08:36:06 +0100 Subject: [PATCH 039/135] =?UTF-8?q?=F0=9F=90=9B=20remove=20a=20duplicate?= =?UTF-8?q?=20line=20of=20code?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index 6d0af450c..dc6e1f820 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -59,7 +59,6 @@ def test_df_converts_bytes_to_int(): df = pd.DataFrame.from_dict(dane) test_df = df_converts_bytes_to_int.run(df) - test_df = df_converts_bytes_to_int.run(df) lst = test_df["RKZ"][0] is_it_or_not = all(isinstance(x, (int, int)) for x in lst) assert is_it_or_not == True From ef383b00e4ed6585e85ff195d21f874e4acfcad9 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 24 Feb 2022 11:15:15 +0100 Subject: [PATCH 040/135] =?UTF-8?q?=F0=9F=8E=A8=20Edited=20aselite=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/aselite.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py index 250363d1c..b400de18c 100644 --- a/viadot/tasks/aselite.py +++ b/viadot/tasks/aselite.py @@ -10,23 +10,16 @@ class ASELiteToDF(Task): def __init__( - self, - credentials: Dict[str, Any] = None, - db_name: str = None, - query: str = None, - *args, - **kwargs + self, credentials: Dict[str, Any] = None, query: str = None, *args, **kwargs ): """ Task for obtaining data from ASElite source. Args: credentials (Dict[str, Any], optional): ASElite SQL Database credentials. Defaults to None. - db_name(str, optional): Name of ASElite database. Defaults to None. query(str, optional): Query to perform on a database. Defaults to None. Returns: Pandas DataFrame """ self.credentials = credentials - self.db_name = db_name self.query = query super().__init__( @@ -42,9 +35,7 @@ def __call__(self, *args, **kwargs): def run( self, credentials: Dict[str, Any] = None, - # db_name: str = None, query: str = None, - if_empty: str = None, credentials_secret: str = None, vault_name: str = None, ): @@ -65,8 +56,8 @@ def run( credentials = local_config.get("ASELite_SQL") logger.info("Loaded credentials from local source") - aselite = SQL(credentials=credentials) # , db_name=db_name + aselite = SQL(credentials=credentials) logger.info("Connected to ASELITE SOURCE") - df = aselite.to_df(query=query, if_empty=if_empty) + df = aselite.to_df(query=self.query) logger.info("Succefully collected data from query") return df From f0efb518d52373bea522d4b0db18529de7aae846 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 24 Feb 2022 11:40:29 +0100 Subject: [PATCH 041/135] =?UTF-8?q?=E2=9C=85=20=20Added=20ASElitetoDF=20ta?= =?UTF-8?q?sk=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_aselite.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/integration/tasks/test_aselite.py diff --git a/tests/integration/tasks/test_aselite.py b/tests/integration/tasks/test_aselite.py new file mode 100644 index 000000000..81e827e3a --- /dev/null +++ b/tests/integration/tasks/test_aselite.py @@ -0,0 +1,15 @@ +from viadot.tasks import ASELiteToDF +import pandas as pd + + +def test_aselite_to_df(): + query = """SELECT TOP (10) [usageid] + ,[configid] + ,[verticalid] + ,[textgroupid] + ,[nr] + ,[storedate] + FROM [UCRMDEV_DESIGNER].[dbo].[PORTAL_APPLICATION_TEXTUSAGE]""" + task = ASELiteToDF() + df = task.run(query=query) + assert isinstance(df, pd.DataFrame) From 
6e5087a0bd4e5b4535c8ec7752d59b4235822e58 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 24 Feb 2022 11:41:03 +0100 Subject: [PATCH 042/135] =?UTF-8?q?=F0=9F=8E=A8=20Edited=20aselite=20flow?= =?UTF-8?q?=20and=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/aselite_to_adls.py | 9 ++++----- viadot/tasks/aselite.py | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index f130d32d6..ba1c2d8d1 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -14,7 +14,7 @@ def __init__( self, name: str, query: str = None, - #db_name: str = None, + # db_name: str = None, sqldb_credentials_secret: str = None, vault_name: str = None, file_path: str = "None", @@ -30,8 +30,7 @@ def __init__( Args: name (str): The name of the flow. - query (str): - db_name (str): + query (str): Query to perform on a database. Defaults to None. sqldb_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ASElite SQL Database credentials. Defaults to None. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. @@ -42,7 +41,7 @@ def __init__( overwrite (str, optional): Whether to overwrite the destination file. Defaults to True. """ self.query = query - #self.db_name = db_name + # self.db_name = db_name self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name self.overwrite = overwrite @@ -59,7 +58,7 @@ def __init__( def gen_flow(self) -> Flow: df = df_task.bind( query=self.query, - #db_name=self.db_name, + # db_name=self.db_name, credentials_secret=self.sqldb_credentials_secret, vault_name=self.vault_name, flow=self, diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py index b400de18c..b0214b410 100644 --- a/viadot/tasks/aselite.py +++ b/viadot/tasks/aselite.py @@ -34,8 +34,8 @@ def __call__(self, *args, **kwargs): def run( self, + query: str, credentials: Dict[str, Any] = None, - query: str = None, credentials_secret: str = None, vault_name: str = None, ): @@ -58,6 +58,6 @@ def run( aselite = SQL(credentials=credentials) logger.info("Connected to ASELITE SOURCE") - df = aselite.to_df(query=self.query) + df = aselite.to_df(query=query) logger.info("Succefully collected data from query") return df From 816039be7589f18e359ed330a1fc667fda8b58c3 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 24 Feb 2022 11:42:24 +0100 Subject: [PATCH 043/135] =?UTF-8?q?=E2=9C=85=20=20added=20aselite=20flow?= =?UTF-8?q?=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../integration/flows/test_aselite_to_adls.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 tests/integration/flows/test_aselite_to_adls.py diff --git a/tests/integration/flows/test_aselite_to_adls.py b/tests/integration/flows/test_aselite_to_adls.py new file mode 100644 index 000000000..97f957ef3 --- /dev/null +++ b/tests/integration/flows/test_aselite_to_adls.py @@ -0,0 +1,72 @@ +from typing import Any, Dict, List, Literal +from prefect import Flow +from viadot.tasks import AzureDataLakeUpload +from prefect.run_configs import DockerRun +from viadot.task_utils import df_to_csv, df_converts_bytes_to_int +from viadot.tasks.aselite import ASELiteToDF +import logging +from viadot.flows.aselite_to_adls import ASELitetoADLS +from prefect.tasks.secrets import PrefectSecret +import 
pandas as pd +import os + +TMP_FILE_NAME = "test_flow.csv" +MAIN_DF = None + +df_task = ASELiteToDF() +file_to_adls_task = AzureDataLakeUpload() + + +def test_aselite_to_adls(): + + credentials_secret = PrefectSecret("aselite").run() + vault_name = PrefectSecret("AZURE_DEFAULT_KEYVAULT").run() + + query_designer = """SELECT TOP 10 [ID] + ,[SpracheText] + ,[SpracheKat] + ,[SpracheMM] + ,[KatSprache] + ,[KatBasisSprache] + ,[CodePage] + ,[Font] + ,[Neu] + ,[Upd] + ,[UpdL] + ,[LosKZ] + ,[AstNr] + ,[KomKz] + ,[RKZ] + ,[ParentLanguageNo] + ,[UPD_FIELD] + FROM [UCRMDEV].[dbo].[CRM_00]""" + + RUN_CONFIG = DockerRun( + image="docker.pkg.github.com/dyvenia/viadot/viadot:latest", + labels=["prod"], + ) + + flow = ASELitetoADLS( + "Test flow ", + query=query_designer, + sqldb_credentials_secret="AIA-ASELITE-QA", + vault_name="azuwevelcrkeyv001s", + file_path=TMP_FILE_NAME, + to_path="raw/supermetrics/mp/result_df_flow_at_des_m.csv", + run_config=RUN_CONFIG, + ) + + result = flow.run() + assert result.is_successful() + + +def test_generated_csv_file(): + + MAIN_DF = pd.read_csv(TMP_FILE_NAME, delimiter="\t") + + if isinstance(MAIN_DF, pd.DataFrame): + assert True + + assert MAIN_DF.shape == (10, 17) + + os.remove(TMP_FILE_NAME) From 3a8374db65b321cc39ab04e7fb6055cb0872ec71 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 24 Feb 2022 11:44:38 +0100 Subject: [PATCH 044/135] =?UTF-8?q?=E2=9C=85=20=20added=20flow=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_aselite_to_adls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/flows/test_aselite_to_adls.py b/tests/integration/flows/test_aselite_to_adls.py index 97f957ef3..5608cd133 100644 --- a/tests/integration/flows/test_aselite_to_adls.py +++ b/tests/integration/flows/test_aselite_to_adls.py @@ -49,8 +49,8 @@ def test_aselite_to_adls(): flow = ASELitetoADLS( "Test flow ", query=query_designer, - sqldb_credentials_secret="AIA-ASELITE-QA", - vault_name="azuwevelcrkeyv001s", + sqldb_credentials_secret=credentials_secret, + vault_name=vault_name, file_path=TMP_FILE_NAME, to_path="raw/supermetrics/mp/result_df_flow_at_des_m.csv", run_config=RUN_CONFIG, From 939eec1a7c70ed4c21fd750a1870872517a0a73f Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 24 Feb 2022 12:32:06 +0100 Subject: [PATCH 045/135] =?UTF-8?q?=E2=9C=85=20=20added=20ASElite=20flow?= =?UTF-8?q?=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/conftest.py | 6 ++++++ tests/integration/flows/test_aselite_to_adls.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3e5b9327c..790678faa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -44,3 +44,9 @@ def create_test_parquet_file(DF, TEST_PARQUET_FILE_PATH): DF.to_parquet(TEST_PARQUET_FILE_PATH, index=False) yield os.remove(TEST_PARQUET_FILE_PATH) + + +@pytest.fixture(scope="session") +def TEST_CSV_ASELITE_PATH(): + file_path = "raw/supermetrics/mp/result_df_flow_at_des_m.csv" + return file_path diff --git a/tests/integration/flows/test_aselite_to_adls.py b/tests/integration/flows/test_aselite_to_adls.py index 5608cd133..1c89436db 100644 --- a/tests/integration/flows/test_aselite_to_adls.py +++ b/tests/integration/flows/test_aselite_to_adls.py @@ -53,7 +53,7 @@ def test_aselite_to_adls(): vault_name=vault_name, file_path=TMP_FILE_NAME, to_path="raw/supermetrics/mp/result_df_flow_at_des_m.csv", - 
run_config=RUN_CONFIG, + run_config=None, ) result = flow.run() From 53e6c244edf222c621d6c315119c8d06828dde38 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 24 Feb 2022 13:45:48 +0100 Subject: [PATCH 046/135] =?UTF-8?q?=F0=9F=93=9D=20Upadated=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b69c8a31b..aafe49923 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- added `ASEliteToDF` task and `ASEliteToADLS` flow - added KeyVault support in `CloudForCustomers` tasks ## [0.2.15] - 2022-01-12 From aa5d9e0698b00f295342bcffbf77310ef7ccde27 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 25 Feb 2022 08:55:00 +0100 Subject: [PATCH 047/135] =?UTF-8?q?=E2=9A=A1=EF=B8=8FAdded=20a=20restricti?= =?UTF-8?q?on=20that=20only=20considers=20last=5Fdays?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/supermetrics_to_adls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 991a4c659..6a19dd360 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -200,7 +200,7 @@ def gen_supermetrics_task( return t def gen_flow(self) -> Flow: - if self.date_range_type is not None: + if self.date_range_type is not None and "days" in self.date_range_type: self.date_range_type = prefect_get_new_date_range.run( flow_name=self.flow_name, date_range_type=self.date_range_type, From 369356d0e02972576c673052f60cb12c6f759e4d Mon Sep 17 00:00:00 2001 From: lzuchowska Date: Fri, 25 Feb 2022 10:21:40 +0100 Subject: [PATCH 048/135] =?UTF-8?q?=F0=9F=93=9D=20Added=20git=20forking=20?= =?UTF-8?q?flow=20for=20development?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 11862577c..03abb7c55 100644 --- a/README.md +++ b/README.md @@ -108,10 +108,42 @@ However, when developing, the easiest way is to use the provided Jupyter Lab con 2. Set up locally 3. Test your changes with `pytest` 4. Submit a PR. The PR should contain the following: -- new/changed functionality -- tests for the changes -- changes added to `CHANGELOG.md` -- any other relevant resources updated (esp. `viadot/docs`) + - new/changed functionality + - tests for the changes + - changes added to `CHANGELOG.md` + - any other relevant resources updated (esp. `viadot/docs`) + +The general flow of working for this repository in case of forking: +1. Pull before making any changes +2. Create a new branch with +``` +git checkout -b +``` +3. Make some work on repository +4. Stage changes with +``` +git add +``` +5. Commit the changes with +``` +git commit -m +``` +__Note__: See out Style Guidelines for more information about commit messages and PR names + +6. Fetch and pull the changes that could happen while working with +``` +git fetch +git checkout / +``` +7. Push your changes on repostory using +``` +git push origin +``` +8. 
Use merge to finish your push to repository +``` +git checkout +git merge +``` Please follow the standards and best practices used within the library (eg. when adding tasks, see how other tasks are constructed, etc.). For any questions, please reach out to us here on GitHub. From 2c560d35399dc1cef6db326c1165b6467f37ebdf Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 25 Feb 2022 10:24:22 +0100 Subject: [PATCH 049/135] =?UTF-8?q?=E2=9C=85=20Added=20test=20test=5Fchang?= =?UTF-8?q?e=5Fdate=5Frange?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_prefect.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/integration/tasks/test_prefect.py b/tests/integration/tasks/test_prefect.py index 94731d8a7..a2332a73c 100644 --- a/tests/integration/tasks/test_prefect.py +++ b/tests/integration/tasks/test_prefect.py @@ -3,9 +3,9 @@ import datetime from datetime import date -from viadot.tasks import GetFlowLastSuccessfulRun +from viadot.tasks import GetFlowNewDateRange -PREFECT_TASK = GetFlowLastSuccessfulRun() +PREFECT_TASK = GetFlowNewDateRange() DATE_FROM_PREFECT = "2022-01-01T01:30:00+00:00" DATE_FROM_PREFECT2 = "2022-01-04T02:20:00+00:00" PREFECT_JSON = { @@ -78,7 +78,7 @@ def test_calculate_difference_time(): def test_get_time_from_last_successful_run(): - flow_runs = PREFECT_JSON["data"]["flow"][0]["flow_runs"] + flow_runs = PREFECT_JSON["data"]["flow"] start_time_success = PREFECT_TASK.get_time_from_last_successful_run( flow_runs_details=flow_runs ) @@ -95,3 +95,12 @@ def test_check_if_scheduled_run(): time_run="2022-02-21T02:20:00+00:00", time_schedule="2022-02-15T01:00:00+00:00" ) assert is_scheduled is False + + +def test_change_date_range(): + date_range = "last_5_days" + difference = 10 + date_range_new = PREFECT_TASK.change_date_range( + date_range=date_range, difference=difference + ) + assert date_range_new == "last_15_days" From cc730e6246aa02206f80dfcbbf6cb88990287c81 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 25 Feb 2022 10:26:02 +0100 Subject: [PATCH 050/135] =?UTF-8?q?=F0=9F=8E=A8=20Moved=20functions=20outs?= =?UTF-8?q?ide=20the=20GetFlowNewDateRange=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/prefect.py | 249 +++++++++++++++++++++------------------- 1 file changed, 128 insertions(+), 121 deletions(-) diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py index 31c00dc59..71a6c2a52 100644 --- a/viadot/tasks/prefect.py +++ b/viadot/tasks/prefect.py @@ -1,8 +1,6 @@ -from os import times_result from typing import List, Literal import prefect from datetime import date, datetime -import pandas as pd from prefect import Task from prefect.utilities.tasks import defaults_from_attrs @@ -11,6 +9,125 @@ logger = logging.get_logger() +def iter_throught_flow_runs(flow_runs_details: List[dict] = None) -> dict: + """ + Generate Flow run details from dict containing flow runs + + Args: + flow_runs_details (List[dict], optional): List of Flow run details. Defaults to None. + + Yields: + dict: Flow run details + """ + for x in range(len(flow_runs_details)): + for flow_run in flow_runs_details[x]["flow_runs"]: + yield flow_run + + +def get_time_from_last_successful_run(flow_runs_details: List[dict] = None) -> str: + """ + Get start_time from last flow run where state was success. + + Args: + flow_runs_details (List[dict], optional): List of Flow run details. Defaults to None. 
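The new `test_change_date_range` above pins the behaviour down; the same check, run interactively against the task (importable from `viadot.tasks` as registered earlier in this series):

```python
from viadot.tasks import GetFlowNewDateRange

task = GetFlowNewDateRange()

# If the last successful scheduled run was 10 days ago, "last_5_days" is widened
# so that the next refresh also covers the gap left by the failed runs.
assert task.change_date_range(date_range="last_5_days", difference=10) == "last_15_days"
```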
+ + Returns: + str: Flow run start_time + """ + + for flow_run in iter_throught_flow_runs(flow_runs_details=flow_runs_details): + if flow_run["state"] == "Success": + return flow_run["start_time"] + + +def calculate_difference( + date_to_compare: str = None, + base_date: str = str(datetime.today()), + diff_type: Literal["time", "date"] = "date", +): + """ + Calculate diffrence between two dates. + + Args: + date_to_compare (str, optional): Date to compare with base_date (Flow run start_time). Defaults to None. + base_date (str, optional): The base date (be saved as Prefect schedule date. Defaults to str(datetime.today()). + diff_type (Literal["time", "date"], optional): _description_. Defaults to "date". + + Returns: + (int, float): Differences in days when calculating date or time (hours, minutes) when calculating time. + """ + base_date = get_formatted_date(base_date, diff_type) + date_to_compare = get_formatted_date(date_to_compare, diff_type) + + if diff_type == "date": + difference = abs(base_date - date_to_compare) + return difference.days + + if diff_type == "time": + difference_h = abs(base_date.hour - date_to_compare.hour) + difference_m = date_to_compare.minute - base_date.minute + if difference_h == 1: + if difference_m < 0: + return 0 + if difference_m > 0: + return float(f"1.{(abs(difference_m))}") + if difference_m == 0: + return 1 + if difference_h < 1: + return 0 + if difference_h > 1: + return difference_h + + +def check_if_scheduled_run(time_run: str = None, time_schedule: str = None) -> bool: + """ + Check if run was scheduled or started by user. + + Args: + time_run (str, optional): The time the Flow was started. Defaults to None. + time_schedule (str, optional): Scheduled time of Flow. Defaults to None. + + Returns: + bool: True if flow run was started automatically. False if Flow was started by user. + """ + diff = calculate_difference( + date_to_compare=time_run, + base_date=time_schedule, + diff_type="time", + ) + if diff <= 1: + return True + if diff > 1: + return False + + +def get_formatted_date( + time_unclean: str = None, + return_value: Literal["time", "date"] = "date", +): + """ + Format date from "2022-02-21T01:00:00+00:00" to date or time. + + Args: + time_unclean (str, optional): Time in datetime format obtained from Prefect. Defaults to None. + return_value (Literal["time", "date"], optional): Choose the format to be extracted from datetime - time or date. + Defaults to "date". + + Returns: + datetime: Date (datetime.date) or time (datetime.time) + """ + if return_value == "time": + time_extracted = time_unclean.split("T")[1] + time_clean_str = time_extracted.split(".")[0] + time_clean = datetime.strptime(time_clean_str[:8], "%H:%M:%S") + return time_clean.time() + + if return_value == "date": + date_extracted = time_unclean.split("T")[0] + date_clean = datetime.strptime(date_extracted, "%Y-%m-%d") + return date_clean.date() + + class GetFlowNewDateRange(Task): def __init__( self, @@ -33,125 +150,18 @@ def __call__(self): """Extract time from Prefect Flow run""" super().__call__(self) - def iter_throught_flow_runs(self, flow_runs_details: List[dict] = None): - """ - Generate Flow run ids - - Args: - flow_runs_details (List[dict], optional): List of Flow run details. Defaults to None. 
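Because this refactor turns the helpers into plain module-level functions, they can be exercised without instantiating the task. A quick sketch using the same timestamps as the unit tests earlier in this series: a run starting within roughly an hour of the scheduled time counts as a scheduled run, anything further off as a manual one:

```python
from viadot.tasks.prefect import check_if_scheduled_run

SCHEDULE = "2022-02-15T01:00:00+00:00"

# Started 40 minutes into the scheduled hour -> treated as a scheduled run.
assert check_if_scheduled_run(
    time_run="2022-02-21T01:40:00+00:00", time_schedule=SCHEDULE
)

# Started an hour and twenty minutes off the schedule -> treated as a manual run.
assert not check_if_scheduled_run(
    time_run="2022-02-21T02:20:00+00:00", time_schedule=SCHEDULE
)
```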
- - Yields: - dict: Flow run details - """ - for x in range(len(flow_runs_details)): - for flow_run in flow_runs_details[x]["flow_runs"]: - yield flow_run - - def get_time_from_last_successful_run( - self, flow_runs_details: List[dict] = None - ) -> str: + def change_date_range(self, date_range: str = None, difference: int = None) -> str: """ - Get start_time from last Flow run where state was success. + Replace number of days in string "last_X_days" based on the sum of number extracted from string + and difference passed to function. Args: - flow_runs_details (List[dict], optional): List of Flow run details. Defaults to None. + date_range (str, optional): Date range in format "last_X_days". Defaults to None. + difference (int, optional): Int value representing days. Defaults to None. Returns: - str: Flow run start_time + str: Date range in "last_X_days" format """ - - for flow_run in self.iter_throught_flow_runs( - flow_runs_details=flow_runs_details - ): - if flow_run["state"] == "Success": - return flow_run["start_time"] - - def calculate_difference( - self, - date_to_compare: str = None, - base_date: str = str(datetime.today()), - diff_type: Literal["time", "date"] = "date", - ): - """ - Calculate diffrence between two dates. - - Args: - date_to_compare (str, optional): Date to compare with base_date. Defaults to None. - base_date (str, optional): The base date - can be saved as Prefect schedule date. Defaults to str(datetime.today()). - diff_type (Literal["time", "date"], optional): _description_. Defaults to "date". - """ - base_date = self.get_formatted_date(base_date, diff_type) - date_to_compare = self.get_formatted_date(date_to_compare, diff_type) - - if diff_type == "date": - difference = abs(base_date - date_to_compare) - return difference.days - - if diff_type == "time": - difference_h = abs(base_date.hour - date_to_compare.hour) - difference_m = date_to_compare.minute - base_date.minute - if difference_h == 1: - if difference_m < 0: - return 0 - if difference_m > 0: - return float(f"1.{(abs(difference_m))}") - if difference_m == 0: - return 1 - if difference_h < 1: - return 0 - if difference_h > 1: - return difference_h - - def check_if_scheduled_run( - self, time_run: str = None, time_schedule: str = None - ) -> bool: - """ - Check if run was schduled or started by user. - - Args: - time_run (str, optional): The time the Flow was started. Defaults to None. - time_schedule (str, optional): Scheduled time of Flow. Defaults to None. - - Returns: - bool: True if flow run was started automatically. False if Flow was started by user. - """ - diff = self.calculate_difference( - date_to_compare=time_run, - base_date=time_schedule, - diff_type="time", - ) - if diff <= 1: - return True - if diff > 1: - return False - - def get_formatted_date( - self, - time_unclean: str = None, - return_value: Literal["time", "date"] = "date", - ): - """ - Format date from "2022-02-21T01:00:00+00:00" to date or time. - - Args: - time_unclean (str, optional): _description_. Defaults to None. - return_value (Literal["time", "date"], optional): Choose the format to be extracted from datetime - time or date. Defaults to "date". 
- - Returns: - datetime: Date (datetime.date) or time (datetime.time) - """ - if return_value == "time": - time_extracted = time_unclean.split("T")[1] - time_clean_str = time_extracted.split(".")[0] - time_clean = datetime.strptime(time_clean_str[:8], "%H:%M:%S") - return time_clean.time() - - if return_value == "date": - date_extracted = time_unclean.split("T")[0] - date_clean = datetime.strptime(date_extracted, "%Y-%m-%d") - return date_clean.date() - - def change_date_range(self, date_range: str = None, difference: int = None): old_range_splitted = date_range.split("_") old_range = int(old_range_splitted[1]) new_range = old_range + difference @@ -197,18 +207,15 @@ def run( flow_runs_details = flow_runs.data.flow time_schedule = flow_runs_details[0]["flow_runs"][0]["scheduled_start_time"] + last_success_start_time = get_time_from_last_successful_run(flow_runs_details) - last_success_start_time = self.get_time_from_last_successful_run( - flow_runs_details - ) - - is_scheduled = self.check_if_scheduled_run( + is_scheduled = check_if_scheduled_run( time_run=last_success_start_time, time_schedule=time_schedule, ) if is_scheduled is True: - difference_days = self.calculate_difference( + difference_days = calculate_difference( date_to_compare=last_success_start_time, base_date=time_schedule, diff_type="date", From 7c2c0e138ca4c1f3cbd5954176fde58ed050936c Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 25 Feb 2022 10:30:41 +0100 Subject: [PATCH 051/135] =?UTF-8?q?=F0=9F=93=9D=20Updated=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b10342b22..7b3e398b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
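A minimal sketch of how the module-level helpers from `viadot/tasks/prefect.py` can be combined to work out how many days a flow has fallen behind its schedule (the import path matches the one used in the integration tests; the timestamps are made up):

```
from viadot.tasks.prefect import calculate_difference, check_if_scheduled_run

time_schedule = "2022-02-21T01:00:00+00:00"  # scheduled start time
last_success = "2022-02-18T01:03:00+00:00"   # start time of the last successful run

# Only adjust the date range when the last successful run was a scheduled one
if check_if_scheduled_run(time_run=last_success, time_schedule=time_schedule):
    days_behind = calculate_difference(
        date_to_compare=last_success,
        base_date=time_schedule,
        diff_type="date",
    )
    # days_behind == 3 here, so e.g. "last_5_days" would become "last_8_days"
```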
## [Unreleased] + +### Added +- Added new task `GetFlowNewDateRange` to change date range based on Prefect flows ### Changed - pinned Prefect version to 0.15.11 - `df_to_csv` now creates dirs if they don't exist From 2f5a791a3f810b9680d1f8766761f0a3e47d6a9d Mon Sep 17 00:00:00 2001 From: trymzet Date: Fri, 25 Feb 2022 16:55:59 +0100 Subject: [PATCH 052/135] =?UTF-8?q?=E2=9C=A8=20Add=20`SQLServerCreateTable?= =?UTF-8?q?`=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/sources/sql_server.py | 6 ++- viadot/tasks/__init__.py | 1 + viadot/tasks/sql_server.py | 83 ++++++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 viadot/tasks/sql_server.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 564a22000..8cb6b24c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `SQLServer` source - Added `DuckDBToDF` task - Added `DuckDBTransform` flow +- Added `SQLServerCreateTable` task ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/viadot/sources/sql_server.py b/viadot/sources/sql_server.py index af0682da2..821d5b15a 100644 --- a/viadot/sources/sql_server.py +++ b/viadot/sources/sql_server.py @@ -22,8 +22,10 @@ def schemas(self) -> List[str]: @property def tables(self) -> List[str]: """Returns list of tables""" - tables_tuples = self.run("SELECT * FROM information_schema.tables") - return [table for row in tables_tuples for table in row] + tables_tuples = self.run( + "SELECT schema_name(t.schema_id), t.name FROM sys.tables t" + ) + return [".".join(row) for row in tables_tuples] def exists(self, table: str, schema: str = None) -> bool: """Check whether a table exists. diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index c52285f50..0d496890f 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -32,3 +32,4 @@ pass from .duckdb import DuckDBCreateTableFromParquet, DuckDBQuery, DuckDBToDF +from .sql_server import SQLServerCreateTable diff --git a/viadot/tasks/sql_server.py b/viadot/tasks/sql_server.py new file mode 100644 index 000000000..349091c40 --- /dev/null +++ b/viadot/tasks/sql_server.py @@ -0,0 +1,83 @@ +from datetime import timedelta +from typing import Any, Dict, Literal + +from prefect import Task +from prefect.utilities.tasks import defaults_from_attrs + +from ..config import local_config +from ..sources import SQLServer + + +class SQLServerCreateTable(Task): + """ + Create a table in SQL Server. + + Args: + schema (str, optional): Destination schema. + table (str, optional): Destination table. + dtypes (Dict[str, Any], optional): Data types to enforce. + if_exists (Literal, optional): What to do if the table already exists. + credentials (dict, optional): Credentials for the connection. 
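    Example (an illustrative sketch; the schema, table and dtype values are
    placeholders, and credentials fall back to the local viadot config when omitted)::

        create_table = SQLServerCreateTable()
        create_table.run(
            schema="sandbox",
            table="example_table",
            dtypes={"id": "INT", "name": "VARCHAR(100)"},
            if_exists="replace",
        )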
+ """ + + def __init__( + self, + schema: str = None, + table: str = None, + dtypes: Dict[str, Any] = None, + if_exists: Literal["fail", "replace", "skip", "delete"] = "fail", + credentials: dict = None, + max_retries: int = 3, + retry_delay: timedelta = timedelta(seconds=10), + *args, + **kwargs, + ): + self.schema = schema + self.table = table + self.dtypes = dtypes + self.if_exists = if_exists + self.credentials = credentials + super().__init__( + name="sql_server_create_table", + max_retries=max_retries, + retry_delay=retry_delay, + *args, + **kwargs, + ) + + @defaults_from_attrs("if_exists") + def run( + self, + schema: str = None, + table: str = None, + dtypes: Dict[str, Any] = None, + if_exists: Literal["fail", "replace", "skip", "delete"] = None, + credentials: str = None, + max_retries: int = None, + retry_delay: timedelta = None, + ): + """ + Create a table in SQL Server. + + Args: + schema (str, optional): Destination schema. + table (str, optional): Destination table. + dtypes (Dict[str, Any], optional): Data types to enforce. + if_exists (Literal, optional): What to do if the table already exists. + credentials (dict, optional): Credentials for the connection. + """ + + if credentials is None: + credentials = local_config.get("SQL_SERVER").get("DEV") + sql_server = SQLServer(credentials=credentials) + + fqn = f"{schema}.{table}" if schema is not None else table + created = sql_server.create_table( + schema=schema, table=table, dtypes=dtypes, if_exists=if_exists + ) + if created: + self.logger.info(f"Successfully created table {fqn}.") + else: + self.logger.info( + f"Table {fqn} has not been created as if_exists is set to {if_exists}." + ) From 7632d0a85bdcee9083bb26a9f828e003bfdc2abf Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 28 Feb 2022 12:09:34 +0100 Subject: [PATCH 053/135] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20CheckColumnOrder?= =?UTF-8?q?=20=20if=5Fexists=20=20=3D=20append?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/adls_to_azure_sql.py | 20 +++++++++++-------- viadot/tasks/azure_sql.py | 32 +++++++++++++++++++------------ 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index 2f21bf19c..311f94d0e 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -75,14 +75,18 @@ def map_data_types_task(json_shema_path: str): @task def df_to_csv_task(df, remove_tab, path: str, sep: str = "\t"): - if remove_tab == True: - for col in range(len(df.columns)): - df[df.columns[col]] = ( - df[df.columns[col]].astype(str).str.replace(r"\t", "", regex=True) - ) - df.to_csv(path, sep=sep, index=False) + # if table doesn't exist it will be created later - df equals None + if df == None: + logger.warning("DataFrame is None") else: - df.to_csv(path, sep=sep, index=False) + if remove_tab == True: + for col in range(len(df.columns)): + df[df.columns[col]] = ( + df[df.columns[col]].astype(str).str.replace(r"\t", "", regex=True) + ) + df.to_csv(path, sep=sep, index=False) + else: + df.to_csv(path, sep=sep, index=False) class ADLSToAzureSQL(Flow): @@ -238,7 +242,7 @@ def gen_flow(self) -> Flow: credentials_secret=self.sqldb_credentials_secret, flow=self, ) - + print(df) df_to_csv = df_to_csv_task.bind( df=df_reorder, path=self.local_file_path, diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index 71f0d75cd..fafe1d79c 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ 
-2,6 +2,7 @@ from datetime import timedelta from typing import Any, Dict, List, Literal import pandas as pd +import sys from prefect import Task from prefect.tasks.secrets import PrefectSecret @@ -339,19 +340,26 @@ def run( credentials = get_credentials(credentials_secret, vault_name=vault_name) azure_sql = AzureSQL(credentials=credentials) + check_if_exists_query = f"""SELECT COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table}' AND TABLE_SCHEMA='{schema}'""" + check_result = azure_sql.run(query=check_if_exists_query) if if_exists not in ["replace", "fail"]: - query = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}'" - result = azure_sql.run(query=query) - sql_column_list = [table for row in result for table in row] - df_column_list = list(df.columns) - - if sql_column_list != df_column_list: - self.logger.warning( - "Detected column order difference between the CSV file and the table. Reordering..." - ) - df = self.df_change_order(df=df, sql_column_list=sql_column_list) - else: - return df + if if_exists == "append" and not check_result: + self.logger.warning("Table doesn't exists.") + return + elif check_result: + + query = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}'" + result = azure_sql.run(query=query) + sql_column_list = [table for row in result for table in row] + df_column_list = list(df.columns) + + if sql_column_list != df_column_list: + self.logger.warning( + "Detected column order difference between the CSV file and the table. Reordering..." + ) + df = self.df_change_order(df=df, sql_column_list=sql_column_list) + else: + return df else: self.logger.info("The table will be replaced.") return df From f17f4d1522ba77cd3a6775237833a6aab665c3ed Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 28 Feb 2022 14:00:35 +0100 Subject: [PATCH 054/135] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20bug=20while=20ap?= =?UTF-8?q?pend=20in=20df=5Fto=5Fcsv=5Ftask?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/adls_to_azure_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index 311f94d0e..f33e1edf0 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -76,7 +76,7 @@ def map_data_types_task(json_shema_path: str): @task def df_to_csv_task(df, remove_tab, path: str, sep: str = "\t"): # if table doesn't exist it will be created later - df equals None - if df == None: + if df is None: logger.warning("DataFrame is None") else: if remove_tab == True: From 3dae71255833a1bc793ba049bd9e7314d8a1406d Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 28 Feb 2022 14:08:35 +0100 Subject: [PATCH 055/135] =?UTF-8?q?=F0=9F=90=9B=20Add=20`credentials`=20pa?= =?UTF-8?q?ram=20to=20`BCPTask`=20and=20make=20more=20resilient?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/bcp.py | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/viadot/tasks/bcp.py b/viadot/tasks/bcp.py index 196090e8b..c38426c80 100644 --- a/viadot/tasks/bcp.py +++ b/viadot/tasks/bcp.py @@ -16,6 +16,7 @@ class BCPTask(ShellTask): - path (str, optional): the path to the local CSV file to be inserted - schema (str, optional): the destination schema - table (str, optional): the destination 
table + - credentials (dict, optional): The credentials to use for connecting with the database. - vault_name (str): the name of the vault from which to fetch the secret - **kwargs (dict, optional): additional keyword arguments to pass to the Task constructor """ @@ -25,6 +26,7 @@ def __init__( path: str = None, schema: str = None, table: str = None, + credentials: dict = None, vault_name: str = None, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), @@ -34,6 +36,7 @@ def __init__( self.path = path self.schema = schema self.table = table + self.credentials = credentials self.vault_name = vault_name super().__init__( @@ -47,13 +50,20 @@ def __init__( ) @defaults_from_attrs( - "path", "schema", "table", "vault_name", "max_retries", "retry_delay" + "path", + "schema", + "table", + "credentials", + "vault_name", + "max_retries", + "retry_delay", ) def run( self, path: str = None, schema: str = None, table: str = None, + credentials: dict = None, credentials_secret: str = None, vault_name: str = None, max_retries: int = None, @@ -67,6 +77,7 @@ def run( - path (str, optional): the path to the local CSV file to be inserted - schema (str, optional): the destination schema - table (str, optional): the destination table + - credentials (dict, optional): The credentials to use for connecting with SQL Server. - credentials_secret (str, optional): the name of the Key Vault secret containing database credentials (server, db_name, user, password) - vault_name (str): the name of the vault from which to fetch the secret @@ -74,20 +85,21 @@ def run( Returns: str: the output of the bcp CLI command """ - if not credentials_secret: - # attempt to read a default for the service principal secret name - try: - credentials_secret = PrefectSecret( - "AZURE_DEFAULT_SQLDB_SERVICE_PRINCIPAL_SECRET" - ).run() - except ValueError: - pass + if not credentials: + if not credentials_secret: + # attempt to read a default for the service principal secret name + try: + credentials_secret = PrefectSecret( + "AZURE_DEFAULT_SQLDB_SERVICE_PRINCIPAL_SECRET" + ).run() + except ValueError: + pass - if credentials_secret: - credentials_str = AzureKeyVaultSecret( - credentials_secret, vault_name=vault_name - ).run() - credentials = json.loads(credentials_str) + if credentials_secret: + credentials_str = AzureKeyVaultSecret( + credentials_secret, vault_name=vault_name + ).run() + credentials = json.loads(credentials_str) fqn = f"{schema}.{table}" if schema else table @@ -96,5 +108,10 @@ def run( uid = credentials["user"] pwd = credentials["password"] + if "," in server: + # A space after the comma is allowed in the ODBC connection string + # but not in BCP's 'server' argument. 
+ server = server.replace(" ", "") + command = f"/opt/mssql-tools/bin/bcp {fqn} in {path} -S {server} -d {db_name} -U {uid} -P '{pwd}' -c -F 2 -b 5000 -h 'TABLOCK'" return super().run(command=command, **kwargs) From 7d30b38460dc6bfeb4fe52b862fb569b625b023e Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 28 Feb 2022 14:32:30 +0100 Subject: [PATCH 056/135] =?UTF-8?q?=E2=9C=A8=20Add=20`get=5Fsql=5Fdtypes?= =?UTF-8?q?=5Ffrom=5Fdf`=20and=20`update=5Fdict`=20tasks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 63 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 05df5df47..c6820fc91 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -1,21 +1,18 @@ +import copy import json import os +import shutil from datetime import datetime, timezone +from pathlib import Path from typing import List, Literal import pandas as pd -from pathlib import Path -import shutil -import json - -from visions.functional import infer_type -from visions.typesets.complete_set import CompleteSet - import prefect from prefect import task from prefect.storage import Git from prefect.utilities import logging - +from visions.functional import infer_type +from visions.typesets.complete_set import CompleteSet logger = logging.get_logger() METADATA_COLUMNS = {"_viadot_downloaded_at_utc": "DATETIME"} @@ -90,6 +87,56 @@ def df_get_data_types_task(df: pd.DataFrame) -> dict: return dtypes_dict +@task +def get_sql_dtypes_from_df(df: pd.DataFrame) -> dict: + """Obtain SQL data types from a pandas DataFrame""" + typeset = CompleteSet() + dtypes = infer_type(df, typeset) + dtypes_dict = {k: str(v) for k, v in dtypes.items()} + dict_mapping = { + "Float": "REAL", + "Image": None, + "Categorical": "VARCHAR(500)", + "Time": "TIME", + "Boolean": "BIT", + "DateTime": "DATETIMEOFFSET", # DATETIMEOFFSET is the only timezone-aware dtype in TSQL + "Object": "VARCHAR(500)", + "EmailAddress": "VARCHAR(50)", + "File": None, + "Geometry": "GEOMETRY", + "Ordinal": "VARCHAR(500)", + "Integer": "INT", + "Generic": "VARCHAR(500)", + "UUID": "UNIQUEIDENTIFIER", + "Complex": None, + "Date": "DATE", + "String": "VARCHAR(500)", + "IPAddress": "VARCHAR(39)", + "Path": "VARCHAR(500)", + "TimeDelta": "VARCHAR(20)", # datetime.datetime.timedelta; eg. 
'1 days 11:00:00' + "URL": "VARCHAR(500)", + "Count": "INT", + } + dict_dtypes_mapped = {} + for k in dtypes_dict: + dict_dtypes_mapped[k] = dict_mapping[dtypes_dict[k]] + + # This is required as pandas cannot handle mixed dtypes in Object columns + dtypes_dict_fixed = { + k: ("String" if v == "Object" else str(v)) + for k, v in dict_dtypes_mapped.items() + } + + return dtypes_dict_fixed + + +@task +def update_dict(d: dict, d_new: dict) -> dict: + d_copy = copy.deepcopy(d) + d_copy.update(d_new) + return d_copy + + @task def df_map_mixed_dtypes_for_parquet( df: pd.DataFrame, dtypes_dict: dict From b7836fb6ae1ddbef66f69d61c556d0291664dbb0 Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 28 Feb 2022 14:38:15 +0100 Subject: [PATCH 057/135] =?UTF-8?q?=E2=9C=A8=20Add=20`DuckDBToSQLServer`?= =?UTF-8?q?=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 4 + viadot/flows/__init__.py | 1 + viadot/flows/adls_to_azure_sql.py | 7 +- viadot/flows/duckdb_to_sql_server.py | 130 +++++++++++++++++++++++++++ 4 files changed, 138 insertions(+), 4 deletions(-) create mode 100644 viadot/flows/duckdb_to_sql_server.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cb6b24c5..098a4986f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `DuckDBToDF` task - Added `DuckDBTransform` flow - Added `SQLServerCreateTable` task +- Added `credentials` param to `BCPTask` +- Added `get_sql_dtypes_from_df` and `update_dict` util tasks +- Added `DuckDBToSQLServer` flow ### Changed - Changed the base class of `AzureSQL` to `SQLServer` @@ -16,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - fixed OpenSSL config for old SQL Servers still using TLS < 1.2 +- `BCPTask` now correctly handles custom SQL Server port ## [0.3.2] - 2022-02-17 ### Fixed diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index cb9e7a7d1..ee63c5902 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -16,3 +16,4 @@ pass from .duckdb_transform import DuckDBTransform +from .duckdb_to_sql_server import DuckDBToSQLServer diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index 2f21bf19c..26449baf2 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -3,9 +3,8 @@ from typing import Any, Dict, List, Literal import pandas as pd -from prefect import Flow, Parameter, task +from prefect import Flow, task from prefect.backend import get_key_value -from prefect.storage import Local from prefect.utilities import logging from viadot.tasks.azure_data_lake import AzureDataLakeDownload @@ -13,7 +12,6 @@ from ..tasks import ( AzureDataLakeCopy, AzureDataLakeToDF, - AzureDataLakeUpload, AzureSQLCreateTable, BCPTask, DownloadGitHubFile, @@ -46,7 +44,7 @@ def map_data_types_task(json_shema_path: str): dict_mapping = { "Float": "REAL", "Image": None, - "" "Categorical": "VARCHAR(500)", + "Categorical": "VARCHAR(500)", "Time": "TIME", "Boolean": "BIT", "DateTime": "DATETIMEOFFSET", # DATETIMEOFFSET is the only timezone-aware dtype in TSQL @@ -174,6 +172,7 @@ def __init__( # Generate CSV self.remove_tab = remove_tab + # BCPTask self.sqldb_credentials_secret = sqldb_credentials_secret diff --git a/viadot/flows/duckdb_to_sql_server.py b/viadot/flows/duckdb_to_sql_server.py new file mode 100644 index 000000000..4ebafbbd8 --- /dev/null +++ 
b/viadot/flows/duckdb_to_sql_server.py @@ -0,0 +1,130 @@ +from typing import Any, Dict, List, Literal + +from prefect import Flow +from prefect.utilities import logging + +from ..task_utils import df_to_csv as df_to_csv_task +from ..task_utils import get_sql_dtypes_from_df as get_sql_dtypes_from_df_task +from ..task_utils import update_dict as update_dict_task +from ..tasks import BCPTask, DuckDBToDF, SQLServerCreateTable + +logger = logging.get_logger(__name__) + +duckdb_to_df_task = DuckDBToDF() +create_table_task = SQLServerCreateTable() +bulk_insert_task = BCPTask() + + +class DuckDBToSQLServer(Flow): + def __init__( + self, + name: str, + duckdb_schema: str = None, + duckdb_table: str = None, + if_empty: str = "warn", + duckdb_credentials: dict = None, + local_file_path: str = None, + write_sep: str = "\t", + sql_server_schema: str = None, + sql_server_table: str = None, + dtypes: Dict[str, Any] = None, + if_exists: Literal["fail", "replace", "append", "delete"] = "replace", + sql_server_credentials: dict = None, + tags: List[str] = ["load"], + *args: List[any], + **kwargs: Dict[str, Any], + ): + """ + Flow for moving a table from DuckDB to SQL Server. + + Args: + name (str): The name of the flow. + duckdb_schema (str, optional): Destination schema. Defaults to None. + duckdb_table (str, optional): Destination table. Defaults to None. + if_empty (str, optional): What to do if the query returns no data. Defaults to "warn". + duckdb_credentials (dict, optional): The config to use for connecting with DuckDB. + local_file_path (str, optional): Local destination path. Defaults to None. + write_sep (str, optional): The delimiter for the output CSV file. Defaults to "\t". + sql_server_schema (str, optional): Destination schema. Defaults to None. + sql_server_table (str, optional): Destination table. Defaults to None. + dtypes (dict, optional): The data types to be enforced for the resulting table. By default, + we infer them from the DataFrame. Defaults to None. + if_exists (Literal, optional): What to do if the table exists. Defaults to "replace". + sql_server_credentials (dict, optional): The credentials to use for connecting with SQL Server. + tags (List[str], optional): Flow tags to use, eg. to control flow concurrency. Defaults to ["load"]. 
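        Example (an illustrative sketch; schema and table names are placeholders,
        and connection credentials are assumed to be available from the local
        viadot config)::

            flow = DuckDBToSQLServer(
                "duckdb_to_sql_server_example",
                duckdb_schema="staging",
                duckdb_table="sales",
                sql_server_schema="sandbox",
                sql_server_table="sales",
            )
            flow.run()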
+ """ + + # DuckDBToDF + self.duckdb_schema = duckdb_schema + self.duckdb_table = duckdb_table + self.duckdb_credentials = duckdb_credentials + + # df_to_csv_task + self.local_file_path = local_file_path or self.slugify(name) + ".csv" + self.write_sep = write_sep + self.if_empty = if_empty + + # SQLServerCreateTable + self.sql_server_table = sql_server_table + self.sql_server_schema = sql_server_schema + self.dtypes = dtypes + self.if_exists = self._map_if_exists(if_exists) + self.sql_server_credentials = sql_server_credentials + + # Global + self.tags = tags + + super().__init__(*args, name=name, **kwargs) + + self.gen_flow() + + @staticmethod + def _map_if_exists(if_exists: str) -> str: + mapping = {"append": "skip"} + return mapping.get(if_exists, if_exists) + + @staticmethod + def slugify(name): + return name.replace(" ", "_").lower() + + def gen_flow(self) -> Flow: + df = duckdb_to_df_task.bind( + schema=self.duckdb_schema, + table=self.duckdb_table, + if_empty=self.if_empty, + credentials=self.duckdb_credentials, + flow=self, + ) + + df_to_csv = df_to_csv_task.bind( + df=df, + path=self.local_file_path, + sep=self.write_sep, + flow=self, + ) + + dtypes_from_df = get_sql_dtypes_from_df_task.bind(df=df, flow=self) + if self.dtypes: + # Update dtypes with the ones provided by user + dtypes = update_dict_task.bind(dtypes_from_df, self.dtypes, flow=self) + else: + dtypes = dtypes_from_df + + create_table_task.bind( + schema=self.sql_server_schema, + table=self.sql_server_table, + dtypes=dtypes, + if_exists=self.if_exists, + credentials=self.sql_server_credentials, + flow=self, + ) + bulk_insert_task.bind( + path=self.local_file_path, + schema=self.sql_server_schema, + table=self.sql_server_table, + credentials=self.sql_server_credentials, + flow=self, + ) + + create_table_task.set_upstream(df_to_csv, flow=self) + bulk_insert_task.set_upstream(create_table_task, flow=self) From 7cf67f39caca478eaca715026a5e206711e993cb Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 28 Feb 2022 14:46:46 +0100 Subject: [PATCH 058/135] =?UTF-8?q?=E2=9C=85=20Added=20tests=20CheckColumn?= =?UTF-8?q?Order=20-=20append?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_adls_to_azure_sql.py | 12 +++++++++++- tests/integration/tasks/test_azure_sql.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tests/integration/flows/test_adls_to_azure_sql.py b/tests/integration/flows/test_adls_to_azure_sql.py index ca6c30a25..75b85b19b 100644 --- a/tests/integration/flows/test_adls_to_azure_sql.py +++ b/tests/integration/flows/test_adls_to_azure_sql.py @@ -1,4 +1,5 @@ import pandas as pd +import os from viadot.flows import ADLSToAzureSQL from viadot.flows.adls_to_azure_sql import df_to_csv_task @@ -53,5 +54,14 @@ def test_df_to_csv_task(): df = pd.DataFrame(data=d) assert df["col1"].astype(str).str.contains("\t")[1] == True task = df_to_csv_task - task.run(df, "result.csv") + task.run(df, path="result.csv", remove_tab=True) assert df["col1"].astype(str).str.contains("\t")[1] != True + + +def test_df_to_csv_task_none(caplog): + df = None + task = df_to_csv_task + path = "result_none.csv" + task.run(df, path=path, remove_tab=False) + assert "DataFrame is None" in caplog.text + assert os.path.isfile(path) == False diff --git a/tests/integration/tasks/test_azure_sql.py b/tests/integration/tasks/test_azure_sql.py index 4fbcc48de..6628a05b2 100644 --- a/tests/integration/tasks/test_azure_sql.py +++ 
b/tests/integration/tasks/test_azure_sql.py @@ -132,3 +132,13 @@ def test_check_column_order_replace(caplog): with caplog.at_level(logging.INFO): check_column_order.run(table=TABLE, if_exists="replace", df=df) assert "The table will be replaced." in caplog.text + + +def test_check_column_order_append_not_exists(caplog): + check_column_order = CheckColumnOrder() + data = {"id": [1], "street": ["Green"], "name": ["Tom"]} + df = pd.DataFrame(data) + check_column_order.run( + table="non_existing_table_123", schema="sandbox", if_exists="append", df=df + ) + assert "Table doesn't exists" in caplog.text From d477d3759a89e54b82c411fb2cd6b25ce001fc85 Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 28 Feb 2022 18:27:23 +0100 Subject: [PATCH 059/135] =?UTF-8?q?=E2=9C=A8=20Add=20append=20option=20to?= =?UTF-8?q?=20`DuckDB.create=5Ftable=5Ffrom=5Fparquet()`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + tests/unit/test_duckdb.py | 18 ++++++++++++++++++ viadot/flows/sap_to_duckdb.py | 4 +++- viadot/sources/duckdb.py | 12 +++++++----- viadot/tasks/duckdb.py | 4 ++-- 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 098a4986f..51f26f612 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `credentials` param to `BCPTask` - Added `get_sql_dtypes_from_df` and `update_dict` util tasks - Added `DuckDBToSQLServer` flow +- Added `if_exists="append"` option to `DuckDB.create_table_from_parquet()` ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/tests/unit/test_duckdb.py b/tests/unit/test_duckdb.py index ae0177063..e31d4acf4 100644 --- a/tests/unit/test_duckdb.py +++ b/tests/unit/test_duckdb.py @@ -38,6 +38,24 @@ def test_create_table_from_parquet(duckdb, TEST_PARQUET_FILE_PATH): duckdb.run(f"DROP SCHEMA {SCHEMA}") +def test_create_table_from_parquet_append(duckdb, TEST_PARQUET_FILE_PATH): + duckdb.create_table_from_parquet( + schema=SCHEMA, table=TABLE, path=TEST_PARQUET_FILE_PATH + ) + df = duckdb.to_df(f"SELECT * FROM {SCHEMA}.{TABLE}") + assert df.shape[0] == 3 + + # now append + duckdb.create_table_from_parquet( + schema=SCHEMA, table=TABLE, path=TEST_PARQUET_FILE_PATH, if_exists="append" + ) + df = duckdb.to_df(f"SELECT * FROM {SCHEMA}.{TABLE}") + assert df.shape[0] == 6 + + duckdb.drop_table(TABLE, schema=SCHEMA) + duckdb.run(f"DROP SCHEMA {SCHEMA}") + + def test_create_table_from_multiple_parquet(duckdb): # we use the two Parquet files generated by fixtures in conftest duckdb.create_table_from_parquet( diff --git a/viadot/flows/sap_to_duckdb.py b/viadot/flows/sap_to_duckdb.py index e50e1817d..d41085695 100644 --- a/viadot/flows/sap_to_duckdb.py +++ b/viadot/flows/sap_to_duckdb.py @@ -21,7 +21,9 @@ def __init__( sep: str = "\t", autopick_sep: bool = True, schema: str = None, - table_if_exists: Literal["fail", "replace", "skip", "delete"] = "fail", + table_if_exists: Literal[ + "fail", "replace", "append", "skip", "delete" + ] = "fail", sap_credentials: dict = None, duckdb_credentials: dict = None, *args: List[any], diff --git a/viadot/sources/duckdb.py b/viadot/sources/duckdb.py index d385640b8..6c77bbeb4 100644 --- a/viadot/sources/duckdb.py +++ b/viadot/sources/duckdb.py @@ -141,7 +141,7 @@ def create_table_from_parquet( table: str, path: str, schema: str = None, - if_exists: Literal["fail", "replace", "skip", "delete"] = "fail", + if_exists: Literal["fail", 
"replace", "append", "skip", "delete"] = "fail", ) -> NoReturn: """Create a DuckDB table with a CTAS from Parquet file(s). @@ -165,6 +165,12 @@ def create_table_from_parquet( if exists: if if_exists == "replace": self.run(f"DROP TABLE {fqn}") + elif if_exists == "append": + self.logger.info(f"Appending to table {fqn}...") + ingest_query = f"COPY {fqn} FROM '{path}' (FORMAT 'parquet')" + self.run(ingest_query) + self.logger.info(f"Successfully appended data to table '{fqn}'.") + return True elif if_exists == "delete": self.run(f"DELETE FROM {fqn}") return True @@ -182,10 +188,6 @@ def create_table_from_parquet( self.run(ingest_query) self.logger.info(f"Table {fqn} has been created successfully.") - def insert_into_from_parquet(): - # check with Marcin if needed - pass - def drop_table(self, table: str, schema: str = None) -> bool: """ Drop a table. diff --git a/viadot/tasks/duckdb.py b/viadot/tasks/duckdb.py index 88ca3fcb1..fa040a82d 100644 --- a/viadot/tasks/duckdb.py +++ b/viadot/tasks/duckdb.py @@ -77,7 +77,7 @@ class DuckDBCreateTableFromParquet(Task): def __init__( self, schema: str = None, - if_exists: Literal["fail", "replace", "skip", "delete"] = "fail", + if_exists: Literal["fail", "replace", "append", "skip", "delete"] = "fail", credentials: dict = None, *args, **kwargs, @@ -98,7 +98,7 @@ def run( table: str, path: str, schema: str = None, - if_exists: Literal["fail", "replace", "skip", "delete"] = None, + if_exists: Literal["fail", "replace", "append", "skip", "delete"] = None, ) -> NoReturn: """ Create a DuckDB table with a CTAS from Parquet file(s). From 9b3289b3f91a76ddb6c7b540db928393f4cc886b Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 28 Feb 2022 19:08:11 +0100 Subject: [PATCH 060/135] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20`SAPRFC.to=5Fdf(?= =?UTF-8?q?)`=20ignoring=20user-specified=20separator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 6 +++++- viadot/flows/sap_to_duckdb.py | 10 +++------- viadot/sources/sap_rfc.py | 11 +++++------ viadot/tasks/sap_rfc.py | 18 ++++++------------ 4 files changed, 19 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51f26f612..341cefa65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,8 +19,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `df_to_parquet()` task now creates directories if needed ### Fixed -- fixed OpenSSL config for old SQL Servers still using TLS < 1.2 +- Fixed OpenSSL config for old SQL Servers still using TLS < 1.2 - `BCPTask` now correctly handles custom SQL Server port +- Fixed `SAPRFC.to_df()` ignoring user-specified separator + +### Removed +- Removed `autopick_sep` parameter from `SAPRFC` functions. The separator is now always picked automatically if not provided. ## [0.3.2] - 2022-02-17 ### Fixed diff --git a/viadot/flows/sap_to_duckdb.py b/viadot/flows/sap_to_duckdb.py index d41085695..0c4867a6a 100644 --- a/viadot/flows/sap_to_duckdb.py +++ b/viadot/flows/sap_to_duckdb.py @@ -18,8 +18,7 @@ def __init__( table: str, local_file_path: str, name: str = None, - sep: str = "\t", - autopick_sep: bool = True, + sep: str = None, schema: str = None, table_if_exists: Literal[ "fail", "replace", "append", "skip", "delete" @@ -36,9 +35,8 @@ def __init__( table (str): Destination table in DuckDB. local_file_path (str): The path to the source Parquet file. name (str, optional): The name of the flow. Defaults to None. - sep (str, optional): The separator to use when reading query results. 
Defaults to "\t". - autopick_sep (bool, optional): Whether SAPRFC should try different separators - in case the query fails with the default one. Defaults to True. + sep (str, optional): The separator to use when reading query results. If not provided, + multiple options are automatically tried. Defaults to None. schema (str, optional): Destination schema in DuckDB. Defaults to None. table_if_exists (Literal, optional): What to do if the table already exists. Defaults to "fail". sap_credentials (dict, optional): The credentials to use to authenticate with SAP. @@ -49,7 +47,6 @@ def __init__( # SAPRFCToDF self.query = query self.sep = sep - self.autopick_sep = autopick_sep self.sap_credentials = sap_credentials # DuckDBCreateTableFromParquet @@ -73,7 +70,6 @@ def gen_flow(self) -> Flow: df = self.sap_to_df_task.bind( query=self.query, sep=self.sep, - autopick_sep=self.autopick_sep, flow=self, ) diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index f45a9645c..6b83ceec7 100644 --- a/viadot/sources/sap_rfc.py +++ b/viadot/sources/sap_rfc.py @@ -93,13 +93,12 @@ class SAPRFC(Source): - etc. """ - def __init__(self, sep: str = None, autopick_sep: bool = True, *args, **kwargs): + def __init__(self, sep: str = None, *args, **kwargs): """Create an instance of the SAPRFC class. Args: - sep (str, optional): Which separator to use when querying SAP. Defaults to None. - autopick_sep (bool, optional): Whether to automatically pick a working separator. - Defaults to True. + sep (str, optional): Which separator to use when querying SAP. If not provided, + multiple options are automatically tried. Raises: CredentialError: If provided credentials are incorrect. @@ -114,7 +113,6 @@ def __init__(self, sep: str = None, autopick_sep: bool = True, *args, **kwargs): super().__init__(*args, credentials=credentials, **kwargs) self.sep = sep - self.autopick_sep = autopick_sep self.client_side_filters = None @property @@ -403,7 +401,8 @@ def to_df(self): columns = self.select_columns_aliased sep = self._query.get("DELIMITER") - if sep is None or self.autopick_sep: + if sep is None: + # automatically find a working separator SEPARATORS = ["|", "/t", "#", ";", "@"] for sep in SEPARATORS: self._query["DELIMITER"] = sep diff --git a/viadot/tasks/sap_rfc.py b/viadot/tasks/sap_rfc.py index d44dac2bb..538c943d7 100644 --- a/viadot/tasks/sap_rfc.py +++ b/viadot/tasks/sap_rfc.py @@ -14,8 +14,7 @@ class SAPRFCToDF(Task): def __init__( self, query: str = None, - sep: str = "\t", - autopick_sep: bool = True, + sep: str = None, credentials: dict = None, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), @@ -38,15 +37,13 @@ def __init__( Args: query (str, optional): The query to be executed with pyRFC. - sep (str, optional): The separator to use when reading query results. Defaults to "\t". - autopick_sep (str, optional): Whether SAPRFC should try different separators in case - the query fails with the default one. + sep (str, optional): The separator to use when reading query results. If not provided, + multiple options are automatically tried. Defaults to None. credentials (dict, optional): The credentials to use to authenticate with SAP. By default, they're taken from the local viadot config. 
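        Example (a sketch only; the table and column names are invented, and
        credentials are taken from the local viadot config by default)::

            task = SAPRFCToDF()
            df = task.run(
                query="SELECT MATNR, MATKL, MTART FROM MARA WHERE MTART = 'FERT'",
                sep="|",
            )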
""" self.query = query self.sep = sep - self.autopick_sep = autopick_sep self.credentials = credentials super().__init__( @@ -60,7 +57,6 @@ def __init__( @defaults_from_attrs( "query", "sep", - "autopick_sep", "credentials", "max_retries", "retry_delay", @@ -69,7 +65,6 @@ def run( self, query: str = None, sep: str = None, - autopick_sep: bool = None, credentials: dict = None, max_retries: int = None, retry_delay: timedelta = None, @@ -78,15 +73,14 @@ def run( Args: query (str, optional): The query to be executed with pyRFC. - sep (str, optional): The separator to use when reading a CSV file. Defaults to "\t". - autopick_sep (str, optional): Whether SAPRFC should try different separators in case - the query fails with the default one. + sep (str, optional): The separator to use when reading query results. If not provided, + multiple options are automatically tried. Defaults to None. """ if query is None: raise ValueError("Please provide the query.") - sap = SAPRFC(sep=sep, autopick_sep=autopick_sep, credentials=credentials) + sap = SAPRFC(sep=sep, credentials=credentials) sap.query(query) self.logger.info(f"Downloading data from SAP to a DataFrame...") From 7b3fff396d7caf8a2cdded7d8783e1de90d1bbcd Mon Sep 17 00:00:00 2001 From: trymzet Date: Tue, 1 Mar 2022 13:44:09 +0100 Subject: [PATCH 061/135] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20temporary=20CSV?= =?UTF-8?q?=20generated=20by=20the=20`DuckDBToSQLServer`=20flow=20not=20be?= =?UTF-8?q?ing=20cleaned=20up?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/flows/duckdb_to_sql_server.py | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 341cefa65..2abd11f26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed OpenSSL config for old SQL Servers still using TLS < 1.2 - `BCPTask` now correctly handles custom SQL Server port - Fixed `SAPRFC.to_df()` ignoring user-specified separator +- Fixed temporary CSV generated by the `DuckDBToSQLServer` flow not being cleaned up ### Removed - Removed `autopick_sep` parameter from `SAPRFC` functions. The separator is now always picked automatically if not provided. 
diff --git a/viadot/flows/duckdb_to_sql_server.py b/viadot/flows/duckdb_to_sql_server.py index 4ebafbbd8..b76bbd85b 100644 --- a/viadot/flows/duckdb_to_sql_server.py +++ b/viadot/flows/duckdb_to_sql_server.py @@ -1,6 +1,8 @@ +import os from typing import Any, Dict, List, Literal -from prefect import Flow +import prefect +from prefect import Flow, task from prefect.utilities import logging from ..task_utils import df_to_csv as df_to_csv_task @@ -15,6 +17,21 @@ bulk_insert_task = BCPTask() +@task +def cleanup_csv_task(path: str): + + logger = prefect.context.get("logger") + + logger.info(f"Removing file {path}...") + try: + os.remove(path) + logger.info(f"File {path} has been successfully removed.") + return True + except Exception as e: + logger.exception(f"File {path} could not be removed.") + return False + + class DuckDBToSQLServer(Flow): def __init__( self, @@ -125,6 +142,8 @@ def gen_flow(self) -> Flow: credentials=self.sql_server_credentials, flow=self, ) + cleanup_csv_task.bind(path=self.local_file_path, flow=self) create_table_task.set_upstream(df_to_csv, flow=self) bulk_insert_task.set_upstream(create_table_task, flow=self) + cleanup_csv_task.set_upstream(bulk_insert_task, flow=self) From 0b0795a21f708ae2de0096869a610412a0ad633f Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 2 Mar 2022 09:27:05 +0100 Subject: [PATCH 062/135] =?UTF-8?q?=E2=9C=85=20Updated=20check=5Fcolumn=5F?= =?UTF-8?q?order=5Fappend=5Fdiffcol=5Fnumber?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_azure_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/tasks/test_azure_sql.py b/tests/integration/tasks/test_azure_sql.py index 6628a05b2..b43541f88 100644 --- a/tests/integration/tasks/test_azure_sql.py +++ b/tests/integration/tasks/test_azure_sql.py @@ -111,7 +111,7 @@ def test_check_column_order_append_diff_col_number(caplog): ValidationError, match=r"Detected discrepancies in number of columns or different column names between the CSV file and the SQL table!", ): - check_column_order.run(table=TABLE, if_exists="append", df=df) + check_column_order.run(table=TABLE, schema=SCHEMA, if_exists="append", df=df) def test_check_column_order_replace(caplog): From ce16e6475534ef8453ab37b8120e2773cdd3bd7c Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 2 Mar 2022 10:21:23 +0100 Subject: [PATCH 063/135] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Added=20check=5Fco?= =?UTF-8?q?l=5Foreder=20parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/adls_to_azure_sql.py | 28 +++++++++++++++++++--------- viadot/tasks/azure_sql.py | 2 +- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index f33e1edf0..661a7679a 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -105,6 +105,7 @@ def __init__( table: str = None, schema: str = None, if_exists: Literal["fail", "replace", "append", "delete"] = "replace", + check_col_order: bool = True, sqldb_credentials_secret: str = None, max_download_retries: int = 5, tags: List[str] = ["promotion"], @@ -135,6 +136,7 @@ def __init__( table (str, optional): Destination table. Defaults to None. schema (str, optional): Destination schema. Defaults to None. if_exists (Literal, optional): What to do if the table exists. Defaults to "replace". 
+ check_col_order (bool, optional): Whether to check column order. Defaults to True. sqldb_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with Azure SQL Database credentials. Defaults to None. max_download_retries (int, optional): How many times to retry the download. Defaults to 5. @@ -175,7 +177,7 @@ def __init__( self.table = table self.schema = schema self.if_exists = self._map_if_exists(if_exists) - + self.check_col_oreder = check_col_order # Generate CSV self.remove_tab = remove_tab # BCPTask @@ -242,14 +244,22 @@ def gen_flow(self) -> Flow: credentials_secret=self.sqldb_credentials_secret, flow=self, ) - print(df) - df_to_csv = df_to_csv_task.bind( - df=df_reorder, - path=self.local_file_path, - sep=self.write_sep, - remove_tab=self.remove_tab, - flow=self, - ) + if self.check_col_oreder == False: + df_to_csv = df_to_csv_task.bind( + df=df, + path=self.local_file_path, + sep=self.write_sep, + remove_tab=self.remove_tab, + flow=self, + ) + else: + df_to_csv = df_to_csv_task.bind( + df=df_reorder, + path=self.local_file_path, + sep=self.write_sep, + remove_tab=self.remove_tab, + flow=self, + ) promote_to_conformed_task.bind( from_path=self.adls_path, diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index fafe1d79c..1a2fa1a05 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -344,7 +344,7 @@ def run( check_result = azure_sql.run(query=check_if_exists_query) if if_exists not in ["replace", "fail"]: if if_exists == "append" and not check_result: - self.logger.warning("Table doesn't exists.") + self.logger.warning("Aimed table doesn't exists.") return elif check_result: From aec4b9275d4bf666a5ab36ad34f9fe6466f47ab9 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 2 Mar 2022 10:25:01 +0100 Subject: [PATCH 064/135] =?UTF-8?q?=E2=9C=85=20Updated=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 564a22000..7a203d896 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] +### Fixed +- Fixed bug with `CheckColumnOrder` task +### Added + - Added `check_col_order` parameter in `ADLSToAzureSQL` ### Added - Added `SQLServer` source - Added `DuckDBToDF` task From 06746ee0f8eb69d7ed20d6b1dccd88622a27f530 Mon Sep 17 00:00:00 2001 From: trymzet Date: Wed, 2 Mar 2022 11:44:01 +0100 Subject: [PATCH 065/135] =?UTF-8?q?=F0=9F=9A=B8=20Add=20several=20more=20s?= =?UTF-8?q?eparators=20to=20check=20for=20automatically?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/sources/sap_rfc.py | 40 +++++++++++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2abd11f26..2a8b1ab4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Changed the base class of `AzureSQL` to `SQLServer` - `df_to_parquet()` task now creates directories if needed +- Added several more separators to check for automatically in `SAPRFC.to_df()` ### Fixed - Fixed OpenSSL config for old SQL Servers still using TLS < 1.2 diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index 6b83ceec7..94243d5e8 100644 --- a/viadot/sources/sap_rfc.py +++ b/viadot/sources/sap_rfc.py @@ -403,16 +403,36 @@ def to_df(self): if sep is None: # automatically find a working separator - SEPARATORS = ["|", "/t", "#", ";", "@"] - for sep in SEPARATORS: - self._query["DELIMITER"] = sep - try: - response = self.call("RFC_READ_TABLE", **params) - record_key = "WA" - data_raw = response["DATA"] - records = [row[record_key].split(sep) for row in data_raw] - except ValueError: - continue + SEPARATORS = [ + "|", + "/t", + "#", + ";", + "@", + "%", + "^", + "`", + "~", + "{", + "}", + "$", + ] + else: + SEPARATORS = [sep] + + records = None + for sep in SEPARATORS: + self._query["DELIMITER"] = sep + try: + response = self.call("RFC_READ_TABLE", **params) + record_key = "WA" + data_raw = response["DATA"] + records = [row[record_key].split(sep) for row in data_raw] + except ValueError: + continue + if records is None: + raise ValueError("None of the separators worked.") + df = pd.DataFrame(records, columns=columns) if self.client_side_filters: From ef1236afefaab3c07c3d3a97bffe812c98ed148f Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Fri, 4 Mar 2022 09:25:34 +0100 Subject: [PATCH 066/135] =?UTF-8?q?=F0=9F=94=A5=20Removed=20dtypes=5Fto=5F?= =?UTF-8?q?json?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 ++ tests/unit/test_task_utils.py | 6 +++--- viadot/flows/sharepoint_to_adls.py | 10 +++++----- viadot/task_utils.py | 18 ++++++------------ 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a8b1ab4e..08f441d7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] +### Removed +- Removed `dtypes_to_json` task to task_utils.py ### Added - Added `SQLServer` source - Added `DuckDBToDF` task diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index aea910957..18ea1992c 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -11,7 +11,7 @@ df_to_csv, df_to_parquet, union_dfs_task, - dtypes_to_json, + dtypes_to_json_task, write_to_json, ) @@ -132,9 +132,9 @@ def test_union_dfs_task(): assert len(res) == 5 -def test_dtypes_to_json(): +def test_dtypes_to_json_task(): dtypes = {"country": "VARCHAR(100)", "sales": "FLOAT(24)"} - dtypes_to_json.run(dtypes_dict=dtypes, local_json_path="dtypes.json") + dtypes_to_json_task.run(dtypes_dict=dtypes, local_json_path="dtypes.json") assert os.path.exists("dtypes.json") os.remove("dtypes.json") diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 3a42cacb5..9d9d4c819 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -12,7 +12,7 @@ add_ingestion_metadata_task, df_to_csv, df_to_parquet, - dtypes_to_json, + dtypes_to_json_task, df_map_mixed_dtypes_for_parquet, ) from ..tasks import AzureDataLakeUpload @@ -131,7 +131,7 @@ def gen_flow(self) -> Flow: flow=self, ) - dtypes_to_json.bind( + dtypes_to_json_task.bind( dtypes_dict=dtypes_dict, local_json_path=self.local_json_path, flow=self ) json_to_adls_task.bind( @@ -142,11 +142,11 @@ def gen_flow(self) -> Flow: ) df_mapped.set_upstream(df_with_metadata, flow=self) - dtypes_to_json.set_upstream(df_mapped, flow=self) - df_to_file.set_upstream(dtypes_to_json, flow=self) + dtypes_to_json_task.set_upstream(df_mapped, flow=self) + df_to_file.set_upstream(dtypes_to_json_task, flow=self) file_to_adls_task.set_upstream(df_to_file, flow=self) - json_to_adls_task.set_upstream(dtypes_to_json, flow=self) + json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) set_key_value(key=self.adls_dir_path, value=self.adls_file_path) @staticmethod diff --git a/viadot/task_utils.py b/viadot/task_utils.py index c6820fc91..5e4d73c8c 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -57,6 +57,12 @@ def get_latest_timestamp_file_path(files: List[str]) -> str: @task def dtypes_to_json_task(dtypes_dict, local_json_path: str): + """ + Creates json file from a dictionary. + Args: + dtypes_dict (dict): Dictionary containing data types. + local_json_path (str): Path to local json file. + """ with open(local_json_path, "w") as fp: json.dump(dtypes_dict, fp) @@ -253,18 +259,6 @@ def df_to_parquet( out_df.to_parquet(path, index=False, **kwargs) -@task -def dtypes_to_json(dtypes_dict: dict, local_json_path: str) -> None: - """ - Creates json file from a dictionary. - Args: - dtypes_dict (dict): Dictionary containing data types. - local_json_path (str): Path to local json file. 
- """ - with open(local_json_path, "w") as fp: - json.dump(dtypes_dict, fp) - - @task def union_dfs_task(dfs: List[pd.DataFrame]): """ From 109d0d1c8a7eaf5748c2be4e00a2dece5b1c9a50 Mon Sep 17 00:00:00 2001 From: trymzet Date: Fri, 4 Mar 2022 12:33:18 +0100 Subject: [PATCH 067/135] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Upgraded=20`duckdb?= =?UTF-8?q?`=20version=20to=200.3.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + requirements.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a8b1ab4e..14ba2574d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Changed the base class of `AzureSQL` to `SQLServer` - `df_to_parquet()` task now creates directories if needed - Added several more separators to check for automatically in `SAPRFC.to_df()` +- Upgraded `duckdb` version to 0.3.2 ### Fixed - Fixed OpenSSL config for old SQL Servers still using TLS < 1.2 diff --git a/requirements.txt b/requirements.txt index 6074b74dc..56bced663 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,4 +26,4 @@ imagehash==4.2.1 visions==0.7.4 sharepy==1.3.0 sql-metadata==2.3.0 -duckdb==0.3.1 \ No newline at end of file +duckdb==0.3.2 \ No newline at end of file From 39a359f8446d3de45036cc90249a6c498eccef8e Mon Sep 17 00:00:00 2001 From: winiar93 Date: Fri, 4 Mar 2022 14:09:44 +0100 Subject: [PATCH 068/135] =?UTF-8?q?=F0=9F=8E=A8=20Added=20convert=5Fbytes?= =?UTF-8?q?=20parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/aselite_to_adls.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index ba1c2d8d1..2943bbb57 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -14,7 +14,6 @@ def __init__( self, name: str, query: str = None, - # db_name: str = None, sqldb_credentials_secret: str = None, vault_name: str = None, file_path: str = "None", @@ -22,6 +21,7 @@ def __init__( to_path: str = None, if_exists: Literal["replace", "append", "delete"] = "replace", overwrite: bool = True, + convert_bytes: bool = False, *args: List[any], **kwargs: Dict[str, Any] ): @@ -41,7 +41,6 @@ def __init__( overwrite (str, optional): Whether to overwrite the destination file. Defaults to True. 
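        Example (a sketch only; the flow class name used here, the query, the
        paths and the secret name are all placeholders)::

            flow = ASELiteToADLS(
                "aselite_to_adls_example",
                query="SELECT TOP 10 * FROM dbo.example_table",
                sqldb_credentials_secret="aselite-credentials",
                file_path="example.csv",
                to_path="raw/examples/example.csv",
                convert_bytes=True,
            )
            flow.run()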
""" self.query = query - # self.db_name = db_name self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name self.overwrite = overwrite @@ -50,6 +49,7 @@ def __init__( self.sep = sep self.to_path = to_path self.if_exists = if_exists + self.convert_bytes = convert_bytes super().__init__(*args, name=name, **kwargs) @@ -58,16 +58,16 @@ def __init__( def gen_flow(self) -> Flow: df = df_task.bind( query=self.query, - # db_name=self.db_name, credentials_secret=self.sqldb_credentials_secret, vault_name=self.vault_name, flow=self, ) - convert_df = df_converts_bytes_to_int.bind(df, flow=self) + if self.convert_bytes == True: + df = df_converts_bytes_to_int.bind(df, flow=self) create_csv = df_to_csv.bind( - convert_df, + df, path=self.file_path, sep=self.sep, if_exists=self.if_exists, @@ -81,6 +81,5 @@ def gen_flow(self) -> Flow: flow=self, ) - convert_df.set_upstream(df, flow=self) - create_csv.set_upstream(convert_df, flow=self) + create_csv.set_upstream(df, flow=self) adls_upload.set_upstream(create_csv, flow=self) From ec416e1463c66ddb2a3dede188b061182ba945fd Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 7 Mar 2022 18:05:13 +0100 Subject: [PATCH 069/135] =?UTF-8?q?=F0=9F=92=84=20Optimize=20type=20infere?= =?UTF-8?q?nce=20in=20=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/duckdb_to_sql_server.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/viadot/flows/duckdb_to_sql_server.py b/viadot/flows/duckdb_to_sql_server.py index b76bbd85b..f86753ed6 100644 --- a/viadot/flows/duckdb_to_sql_server.py +++ b/viadot/flows/duckdb_to_sql_server.py @@ -7,7 +7,6 @@ from ..task_utils import df_to_csv as df_to_csv_task from ..task_utils import get_sql_dtypes_from_df as get_sql_dtypes_from_df_task -from ..task_utils import update_dict as update_dict_task from ..tasks import BCPTask, DuckDBToDF, SQLServerCreateTable logger = logging.get_logger(__name__) @@ -119,13 +118,11 @@ def gen_flow(self) -> Flow: sep=self.write_sep, flow=self, ) - - dtypes_from_df = get_sql_dtypes_from_df_task.bind(df=df, flow=self) if self.dtypes: - # Update dtypes with the ones provided by user - dtypes = update_dict_task.bind(dtypes_from_df, self.dtypes, flow=self) + # Use user-provided dtypes. 
+ dtypes = self.dtypes else: - dtypes = dtypes_from_df + dtypes = get_sql_dtypes_from_df_task.bind(df=df, flow=self) create_table_task.bind( schema=self.sql_server_schema, From 6d0011e3c4dcea2b2716fadc21cb179da2dbd0b7 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 9 Mar 2022 11:11:23 +0100 Subject: [PATCH 070/135] =?UTF-8?q?=E2=9C=85=20Updated=20tests=20-=20impor?= =?UTF-8?q?ted=20functions=20as=20external?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_prefect.py | 29 +++++++++++++------------ 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/integration/tasks/test_prefect.py b/tests/integration/tasks/test_prefect.py index a2332a73c..f16a2eab1 100644 --- a/tests/integration/tasks/test_prefect.py +++ b/tests/integration/tasks/test_prefect.py @@ -4,6 +4,13 @@ from datetime import date from viadot.tasks import GetFlowNewDateRange +from viadot.tasks.prefect import ( + get_time_from_last_successful_run, + calculate_difference, + check_if_scheduled_run, + get_formatted_date, +) + PREFECT_TASK = GetFlowNewDateRange() DATE_FROM_PREFECT = "2022-01-01T01:30:00+00:00" @@ -39,21 +46,17 @@ def test_get_formatted_dated(): - new_date = PREFECT_TASK.get_formatted_date( - time_unclean=DATE_FROM_PREFECT, return_value="date" - ) + new_date = get_formatted_date(time_unclean=DATE_FROM_PREFECT, return_value="date") assert new_date == datetime.date(2022, 1, 1) assert isinstance(new_date, date) - new_time = PREFECT_TASK.get_formatted_date( - time_unclean=DATE_FROM_PREFECT, return_value="time" - ) + new_time = get_formatted_date(time_unclean=DATE_FROM_PREFECT, return_value="time") assert new_time == datetime.time(1, 30) assert isinstance(new_time, datetime.time) def test_calculate_difference_date(): - diff_days = PREFECT_TASK.calculate_difference( + diff_days = calculate_difference( date_to_compare=DATE_FROM_PREFECT2, base_date=DATE_FROM_PREFECT, diff_type="date", @@ -62,14 +65,14 @@ def test_calculate_difference_date(): def test_calculate_difference_time(): - diff_hours = PREFECT_TASK.calculate_difference( + diff_hours = calculate_difference( date_to_compare=DATE_FROM_PREFECT2, base_date=DATE_FROM_PREFECT, diff_type="time", ) assert diff_hours == 0 - diff_hours = PREFECT_TASK.calculate_difference( + diff_hours = calculate_difference( date_to_compare="2022-01-04T02:50:00+00:00", base_date=DATE_FROM_PREFECT, diff_type="time", @@ -79,19 +82,17 @@ def test_calculate_difference_time(): def test_get_time_from_last_successful_run(): flow_runs = PREFECT_JSON["data"]["flow"] - start_time_success = PREFECT_TASK.get_time_from_last_successful_run( - flow_runs_details=flow_runs - ) + start_time_success = get_time_from_last_successful_run(flow_runs_details=flow_runs) assert start_time_success == "2022-02-20T01:05:36.142547+00:00" def test_check_if_scheduled_run(): - is_scheduled = PREFECT_TASK.check_if_scheduled_run( + is_scheduled = check_if_scheduled_run( time_run="2022-02-21T01:40:00+00:00", time_schedule="2022-02-15T01:00:00+00:00" ) assert is_scheduled is True - is_scheduled = PREFECT_TASK.check_if_scheduled_run( + is_scheduled = check_if_scheduled_run( time_run="2022-02-21T02:20:00+00:00", time_schedule="2022-02-15T01:00:00+00:00" ) assert is_scheduled is False From 7bc33df66c9e3a21f0beee31c82670b72c984f2b Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 9 Mar 2022 13:09:06 +0100 Subject: [PATCH 071/135] =?UTF-8?q?=E2=9C=85=20Updated=20test=20aselitetoa?= =?UTF-8?q?dls?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_aselite_to_adls.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/integration/flows/test_aselite_to_adls.py b/tests/integration/flows/test_aselite_to_adls.py index 1c89436db..ade511194 100644 --- a/tests/integration/flows/test_aselite_to_adls.py +++ b/tests/integration/flows/test_aselite_to_adls.py @@ -59,9 +59,6 @@ def test_aselite_to_adls(): result = flow.run() assert result.is_successful() - -def test_generated_csv_file(): - MAIN_DF = pd.read_csv(TMP_FILE_NAME, delimiter="\t") if isinstance(MAIN_DF, pd.DataFrame): From 4c60246ce02984e7c0dace9aa41406616a711020 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 10 Mar 2022 10:37:19 +0100 Subject: [PATCH 072/135] =?UTF-8?q?=E2=9C=A8=20Added=20MultipleFlows=20cla?= =?UTF-8?q?ss?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/multiple_flows.py | 55 ++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 viadot/flows/multiple_flows.py diff --git a/viadot/flows/multiple_flows.py b/viadot/flows/multiple_flows.py new file mode 100644 index 000000000..f785534fc --- /dev/null +++ b/viadot/flows/multiple_flows.py @@ -0,0 +1,55 @@ +from prefect import Flow, task +from prefect.tasks.prefect import create_flow_run, wait_for_flow_run +from typing import Dict, List, Any +from prefect.utilities import logging + +logger = logging.get_logger(__name__) + + +@task +def run_flows_list(flow_name: str, flows_list: List[List] = [List[None]]): + """ + Task for running multiple flows in given order. Task will create flow of flows. + Args: + flow_name(str): Name of new flow. + flows_list(List[List]): List containing lists of flow names and project names - [["flow1_name" , "project_name"], ["flow2_name" , "project_name"]]. Defaults to [List[None]]. + """ + with Flow(flow_name) as flow: + for i in range(len(flows_list) - 1): + exec( + f"flow_{i}= create_flow_run(flow_name=flows_list[i][0], project_name=flows_list[i][1])" + ) + exec( + f"wait_for_flow_{i} = wait_for_flow_run(flow_{i}, raise_final_state=True)" + ) + exec( + f"flow_{len(flows_list)-1} = create_flow_run(flow_name=flows_list[len(flows_list)-1][0], project_name=flows_list[i][1])" + ) + for i in range(1, len(flows_list)): + exec(f"flow_{i}.set_upstream(wait_for_flow_{i-1})") + flow_state = flow.run() + print(flow_state.is_failed()) + if flow_state.is_failed(): + raise ValueError("One of the flows has failed!") + + +class MultipleFlows(Flow): + """Flow to run multiple flows in given order. + Args: + flow_name(str): Name of new flow. + flows_list(List[List]): List containing lists of flow names and project names - [["flow1_name" , "project_name"], ["flow2_name" , "project_name"]]. Defaults to [List[None]]. 
+ """ + + def __init__( + self, + name: str, + flows_list: List[List] = [List[None]], + *args: List[any], + **kwargs: Dict[str, Any], + ): + self.flows_list = flows_list + super().__init__(*args, name=name, **kwargs) + self.gen_flow() + + def gen_flow(self) -> Flow: + run_flows_list.bind(flow_name=self.name, flows_list=self.flows_list, flow=self) From 98fd2f743e5072900b72973542dbe8a11a159ab7 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 10 Mar 2022 13:08:10 +0100 Subject: [PATCH 073/135] =?UTF-8?q?=E2=9C=85=20Added=20tests=20and=20added?= =?UTF-8?q?=20small=20changes=20in=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/flows/test_multiple_flows.py | 25 +++++++++++++++++++++++++ viadot/flows/__init__.py | 1 + viadot/flows/multiple_flows.py | 16 ++++++++++------ 3 files changed, 36 insertions(+), 6 deletions(-) create mode 100644 tests/unit/flows/test_multiple_flows.py diff --git a/tests/unit/flows/test_multiple_flows.py b/tests/unit/flows/test_multiple_flows.py new file mode 100644 index 000000000..cd56f124b --- /dev/null +++ b/tests/unit/flows/test_multiple_flows.py @@ -0,0 +1,25 @@ +from viadot.flows import MultipleFlows +import logging + + +def test_multiple_flows_working(caplog): + list = [ + ["Flow of flows 1 test", "dev"], + ["Flow of flows 2 - working", "dev"], + ["Flow of flows 3", "dev"], + ] + flow = MultipleFlows(name="test", flows_list=list) + with caplog.at_level(logging.INFO): + flow.run() + assert "All of the tasks succeeded." in caplog.text + + +def test_multiple_flows_not_working(caplog): + list = [ + ["Flow of flows 1 test", "dev"], + ["Flow of flows 2 test - not working", "dev"], + ["Flow of flows 3", "dev"], + ] + flow = MultipleFlows(name="test", flows_list=list) + flow.run() + assert "One of the flows has failed!" in caplog.text diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index ee63c5902..0f4dc0acf 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -17,3 +17,4 @@ from .duckdb_transform import DuckDBTransform from .duckdb_to_sql_server import DuckDBToSQLServer +from .multiple_flows import MultipleFlows diff --git a/viadot/flows/multiple_flows.py b/viadot/flows/multiple_flows.py index f785534fc..0670adde3 100644 --- a/viadot/flows/multiple_flows.py +++ b/viadot/flows/multiple_flows.py @@ -3,7 +3,7 @@ from typing import Dict, List, Any from prefect.utilities import logging -logger = logging.get_logger(__name__) +logger = logging.get_logger() @task @@ -11,8 +11,9 @@ def run_flows_list(flow_name: str, flows_list: List[List] = [List[None]]): """ Task for running multiple flows in given order. Task will create flow of flows. Args: - flow_name(str): Name of new flow. - flows_list(List[List]): List containing lists of flow names and project names - [["flow1_name" , "project_name"], ["flow2_name" , "project_name"]]. Defaults to [List[None]]. + flow_name(str): Name of a new flow. + flows_list(List[List]): List containing lists of flow names and project names - [["flow1_name" , "project_name"], ["flow2_name" , "project_name"]]. + Flows have to be in the correct oreder. Defaults to [List[None]]. 
""" with Flow(flow_name) as flow: for i in range(len(flows_list) - 1): @@ -28,16 +29,19 @@ def run_flows_list(flow_name: str, flows_list: List[List] = [List[None]]): for i in range(1, len(flows_list)): exec(f"flow_{i}.set_upstream(wait_for_flow_{i-1})") flow_state = flow.run() - print(flow_state.is_failed()) if flow_state.is_failed(): + logger.error("One of the flows has failed!") raise ValueError("One of the flows has failed!") + else: + logger.info("All of the tasks succeeded.") class MultipleFlows(Flow): """Flow to run multiple flows in given order. Args: - flow_name(str): Name of new flow. - flows_list(List[List]): List containing lists of flow names and project names - [["flow1_name" , "project_name"], ["flow2_name" , "project_name"]]. Defaults to [List[None]]. + flow_name(str): Name of a new flow. + flows_list(List[List]): List containing lists of flow names and project names - [["flow1_name" , "project_name"], ["flow2_name" , "project_name"]]. + Flows have to be in the correct oreder. Defaults to [List[None]]. """ def __init__( From 48fae0a8949ceed8ebcdb31afaf560384e6b3c42 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 10 Mar 2022 13:10:57 +0100 Subject: [PATCH 074/135] =?UTF-8?q?=F0=9F=93=9D=20Updated=20Changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14ba2574d..a8044a204 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ### Added +- Added `MultipleFlows` flow class which enables running multiple flows in a given order. +### Added - Added `SQLServer` source - Added `DuckDBToDF` task - Added `DuckDBTransform` flow From 874792085a4d42822e82e1e17188fb1ac73769f3 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 10 Mar 2022 15:48:14 +0100 Subject: [PATCH 075/135] =?UTF-8?q?=F0=9F=93=9D=20Updated=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/multiple_flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/flows/multiple_flows.py b/viadot/flows/multiple_flows.py index 0670adde3..8d80995a4 100644 --- a/viadot/flows/multiple_flows.py +++ b/viadot/flows/multiple_flows.py @@ -9,7 +9,7 @@ @task def run_flows_list(flow_name: str, flows_list: List[List] = [List[None]]): """ - Task for running multiple flows in given order. Task will create flow of flows. + Task for running multiple flows in the given order. Task will create flow of flows. Args: flow_name(str): Name of a new flow. flows_list(List[List]): List containing lists of flow names and project names - [["flow1_name" , "project_name"], ["flow2_name" , "project_name"]]. @@ -37,7 +37,7 @@ def run_flows_list(flow_name: str, flows_list: List[List] = [List[None]]): class MultipleFlows(Flow): - """Flow to run multiple flows in given order. + """Flow to run multiple flows in the given order. Args: flow_name(str): Name of a new flow. flows_list(List[List]): List containing lists of flow names and project names - [["flow1_name" , "project_name"], ["flow2_name" , "project_name"]]. 
From 294484a1f6ac95b5fe909d14c96b47587b985ddf Mon Sep 17 00:00:00 2001
From: Angelika Tarnawa
Date: Tue, 15 Mar 2022 10:18:34 +0100
Subject: [PATCH 076/135] =?UTF-8?q?=F0=9F=8E=A8=20Formatted=20code?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md                              | 3 +--
 tests/integration/tasks/test_azure_sql.py | 2 +-
 viadot/flows/adls_to_azure_sql.py         | 4 ++--
 viadot/tasks/azure_sql.py                 | 6 ++----
 4 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7a203d896..282bbb30e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,8 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 - Fixed bug with `CheckColumnOrder` task
 ### Added
- - Added `check_col_order` parameter in `ADLSToAzureSQL`
-### Added
+- Added `check_col_order` parameter in `ADLSToAzureSQL`
 - Added `SQLServer` source
 - Added `DuckDBToDF` task
 - Added `DuckDBTransform` flow
diff --git a/tests/integration/tasks/test_azure_sql.py b/tests/integration/tasks/test_azure_sql.py
index b43541f88..07b201dfd 100644
--- a/tests/integration/tasks/test_azure_sql.py
+++ b/tests/integration/tasks/test_azure_sql.py
@@ -141,4 +141,4 @@ def test_check_column_order_append_not_exists(caplog):
     check_column_order.run(
         table="non_existing_table_123", schema="sandbox", if_exists="append", df=df
     )
-    assert "Table doesn't exists" in caplog.text
+    assert "table doesn't exists" in caplog.text
diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py
index 661a7679a..7959c2de6 100644
--- a/viadot/flows/adls_to_azure_sql.py
+++ b/viadot/flows/adls_to_azure_sql.py
@@ -177,7 +177,7 @@ def __init__(
         self.table = table
         self.schema = schema
         self.if_exists = self._map_if_exists(if_exists)
-        self.check_col_oreder = check_col_order
+        self.check_col_order = check_col_order
         # Generate CSV
         self.remove_tab = remove_tab
         # BCPTask
@@ -244,7 +244,7 @@ def gen_flow(self) -> Flow:
             credentials_secret=self.sqldb_credentials_secret,
             flow=self,
         )
-        if self.check_col_oreder == False:
+        if self.check_col_order == False:
             df_to_csv = df_to_csv_task.bind(
                 df=df,
                 path=self.local_file_path,
diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py
index 1a2fa1a05..900d49802 100644
--- a/viadot/tasks/azure_sql.py
+++ b/viadot/tasks/azure_sql.py
@@ -340,15 +340,13 @@ def run(
         credentials = get_credentials(credentials_secret, vault_name=vault_name)
         azure_sql = AzureSQL(credentials=credentials)
-        check_if_exists_query = f"""SELECT COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table}' AND TABLE_SCHEMA='{schema}'"""
-        check_result = azure_sql.run(query=check_if_exists_query)
+        query = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}'"
+        check_result = azure_sql.run(query=query)
         if if_exists not in ["replace", "fail"]:
             if if_exists == "append" and not check_result:
                 self.logger.warning("Aimed table doesn't exists.")
                 return
             elif check_result:
-
-                query = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}'"
                 result = azure_sql.run(query=query)
                 sql_column_list = [table for row in result for table in row]
                 df_column_list = list(df.columns)

From ac5f447432433696ba766a687ccbfd065dd00d52 Mon Sep 17 00:00:00 2001
From: Angelika Tarnawa
Date: Tue, 15 Mar 2022 11:49:04 +0100
Subject: [PATCH 077/135] =?UTF-8?q?=E2=9C=A8=20Added=20rename=5Fcolumns=20f?=
 =?UTF-8?q?unction?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/azure_sql.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index 900d49802..a9e9a12ef 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -316,6 +316,16 @@ def df_change_order( return df_changed + def rename_columns(self, df: pd.DataFrame = None): + """ + Function to remove spaces at the end of column name. + Args: + df(pd.DataFrame): Dataframe to transform. Defaults to None. + """ + for col in df.columns: + df = df.rename(columns={col: col.strip()}) + return df + def run( self, table: str = None, @@ -339,7 +349,10 @@ def run( """ credentials = get_credentials(credentials_secret, vault_name=vault_name) azure_sql = AzureSQL(credentials=credentials) - + print(list(df.columns)) + print("****") + df = self.rename_columns(df) + print(list(df.columns)) query = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}'" check_result = azure_sql.run(query=query) if if_exists not in ["replace", "fail"]: From f18ca84d63cd0db97da4ca91621c0169fca76dd0 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 15 Mar 2022 13:13:31 +0100 Subject: [PATCH 078/135] Added new parameter to class --- viadot/flows/aselite_to_adls.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index 2943bbb57..f9d6ce0e2 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -22,6 +22,7 @@ def __init__( if_exists: Literal["replace", "append", "delete"] = "replace", overwrite: bool = True, convert_bytes: bool = False, + sp_credentials_secret: str = None, *args: List[any], **kwargs: Dict[str, Any] ): @@ -39,6 +40,8 @@ def __init__( to_path (str): The path to an ADLS file. Defaults to None. if_exists (Literal, optional): What to do if the table exists. Defaults to "replace". overwrite (str, optional): Whether to overwrite the destination file. Defaults to True. + sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with + ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. """ self.query = query self.sqldb_credentials_secret = sqldb_credentials_secret @@ -50,6 +53,7 @@ def __init__( self.to_path = to_path self.if_exists = if_exists self.convert_bytes = convert_bytes + self.sp_credentials_secret = sp_credentials_secret super().__init__(*args, name=name, **kwargs) @@ -78,6 +82,7 @@ def gen_flow(self) -> Flow: from_path=self.file_path, to_path=self.to_path, overwrite=self.overwrite, + sp_credentials_secret=self.sp_credentials_secret, flow=self, ) From e4ad287f3d285d616804443204435fbc98b5c906 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 15 Mar 2022 13:42:26 +0100 Subject: [PATCH 079/135] =?UTF-8?q?=F0=9F=93=9D=20Updated=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 282bbb30e..d3104b801 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,6 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] -### Fixed -- Fixed bug with `CheckColumnOrder` task ### Added - Added `check_col_order` parameter in `ADLSToAzureSQL` - Added `SQLServer` source @@ -17,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `df_to_parquet()` task now creates directories if needed ### Fixed +- Fixed bug with `CheckColumnOrder` task - fixed OpenSSL config for old SQL Servers still using TLS < 1.2 ## [0.3.2] - 2022-02-17 From 6d45aeb90dc911ab1149b0d08ac4be39445f2c6b Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 15 Mar 2022 13:44:04 +0100 Subject: [PATCH 080/135] =?UTF-8?q?=F0=9F=94=A5=20removed=20print?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/azure_sql.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index a9e9a12ef..1af636fd2 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -349,10 +349,7 @@ def run( """ credentials = get_credentials(credentials_secret, vault_name=vault_name) azure_sql = AzureSQL(credentials=credentials) - print(list(df.columns)) - print("****") df = self.rename_columns(df) - print(list(df.columns)) query = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}'" check_result = azure_sql.run(query=query) if if_exists not in ["replace", "fail"]: From 868cecad85247030ef93ac02c672d82d536c4829 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 15 Mar 2022 14:15:56 +0100 Subject: [PATCH 081/135] =?UTF-8?q?=F0=9F=8E=A8=20Changed=20if=20statement?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/azure_sql.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index 1af636fd2..e15f3eb89 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -352,11 +352,14 @@ def run( df = self.rename_columns(df) query = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}'" check_result = azure_sql.run(query=query) + print("******") + print(check_result) + print("******") if if_exists not in ["replace", "fail"]: if if_exists == "append" and not check_result: self.logger.warning("Aimed table doesn't exists.") return - elif check_result: + elif check_result is not []: result = azure_sql.run(query=query) sql_column_list = [table for row in result for table in row] df_column_list = list(df.columns) From dbb7bdc4ce4c0bea6f62b0f80dcdcc0c94efb3f5 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 15 Mar 2022 14:16:30 +0100 Subject: [PATCH 082/135] =?UTF-8?q?=F0=9F=94=A5=20Reomoved=20print?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/azure_sql.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index e15f3eb89..a04c31a3a 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -352,9 +352,6 @@ def run( df = self.rename_columns(df) query = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}'" check_result = azure_sql.run(query=query) - print("******") - print(check_result) - print("******") if if_exists not in ["replace", "fail"]: if if_exists == "append" and not check_result: self.logger.warning("Aimed table doesn't exists.") From 
91443ef983725513d754b2ec495a55a75d63a753 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 15 Mar 2022 15:39:00 +0100 Subject: [PATCH 083/135] =?UTF-8?q?=F0=9F=8E=A8=20Formatted=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 ++- .../integration/flows/test_aselite_to_adls.py | 25 ++++++++----------- tests/unit/test_task_utils.py | 8 +++--- viadot/flows/__init__.py | 2 +- viadot/flows/adls_to_azure_sql.py | 4 +-- viadot/flows/aselite_to_adls.py | 6 ++--- viadot/task_utils.py | 2 +- viadot/tasks/aselite.py | 10 ++++---- 8 files changed, 28 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c39eb87e4..f7ce16572 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] + +### Added - added `ASEliteToDF` task and `ASEliteToADLS` flow - added KeyVault support in `CloudForCustomers` tasks -### Added - Added `SQLServer` source - Added `DuckDBToDF` task - Added `DuckDBTransform` flow diff --git a/tests/integration/flows/test_aselite_to_adls.py b/tests/integration/flows/test_aselite_to_adls.py index ade511194..e9f14b125 100644 --- a/tests/integration/flows/test_aselite_to_adls.py +++ b/tests/integration/flows/test_aselite_to_adls.py @@ -1,14 +1,15 @@ +import logging +import pandas as pd +import os from typing import Any, Dict, List, Literal from prefect import Flow -from viadot.tasks import AzureDataLakeUpload +from prefect.tasks.secrets import PrefectSecret from prefect.run_configs import DockerRun from viadot.task_utils import df_to_csv, df_converts_bytes_to_int from viadot.tasks.aselite import ASELiteToDF -import logging -from viadot.flows.aselite_to_adls import ASELitetoADLS -from prefect.tasks.secrets import PrefectSecret -import pandas as pd -import os +from viadot.tasks import AzureDataLakeUpload +from viadot.flows.aselite_to_adls import ASELiteToADLS + TMP_FILE_NAME = "test_flow.csv" MAIN_DF = None @@ -41,13 +42,8 @@ def test_aselite_to_adls(): ,[UPD_FIELD] FROM [UCRMDEV].[dbo].[CRM_00]""" - RUN_CONFIG = DockerRun( - image="docker.pkg.github.com/dyvenia/viadot/viadot:latest", - labels=["prod"], - ) - - flow = ASELitetoADLS( - "Test flow ", + flow = ASELiteToADLS( + "Test flow", query=query_designer, sqldb_credentials_secret=credentials_secret, vault_name=vault_name, @@ -61,8 +57,7 @@ def test_aselite_to_adls(): MAIN_DF = pd.read_csv(TMP_FILE_NAME, delimiter="\t") - if isinstance(MAIN_DF, pd.DataFrame): - assert True + assert isinstance(MAIN_DF, pd.DataFrame) == True assert MAIN_DF.shape == (10, 17) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index 4894c7049..387b76cfd 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -46,7 +46,7 @@ def test_map_dtypes_for_parquet(): def test_df_converts_bytes_to_int(): - dane = { + data = { "ID": {0: 1, 1: 2, 2: 100, 3: 101, 4: 102}, "SpracheText": { 0: "TE_CATALOG_BASE_LANG", @@ -64,11 +64,11 @@ def test_df_converts_bytes_to_int(): }, } - df = pd.DataFrame.from_dict(dane) + df = pd.DataFrame.from_dict(data) test_df = df_converts_bytes_to_int.run(df) lst = test_df["RKZ"][0] - is_it_or_not = all(isinstance(x, (int, int)) for x in lst) - assert is_it_or_not == True + is_int = all(isinstance(x, (int, int)) for x in lst) + assert is_int == 
True def test_chunk_df(): diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index 0c46d4e72..e56bb4b94 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -9,7 +9,7 @@ from .adls_container_to_container import ADLSContainerToContainer from .sharepoint_to_adls import SharepointToADLS from .cloud_for_customers_report_to_adls import CloudForCustomersReportToADLS -from .aselite_to_adls import ASELitetoADLS +from .aselite_to_adls import ASELiteToADLS try: from .sap_to_duckdb import SAPToDuckDB diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index 661a7679a..7959c2de6 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -177,7 +177,7 @@ def __init__( self.table = table self.schema = schema self.if_exists = self._map_if_exists(if_exists) - self.check_col_oreder = check_col_order + self.check_col_order = check_col_order # Generate CSV self.remove_tab = remove_tab # BCPTask @@ -244,7 +244,7 @@ def gen_flow(self) -> Flow: credentials_secret=self.sqldb_credentials_secret, flow=self, ) - if self.check_col_oreder == False: + if self.check_col_order == False: df_to_csv = df_to_csv_task.bind( df=df, path=self.local_file_path, diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index f9d6ce0e2..1575eca49 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -9,14 +9,14 @@ file_to_adls_task = AzureDataLakeUpload() -class ASELitetoADLS(Flow): +class ASELiteToADLS(Flow): def __init__( self, name: str, query: str = None, sqldb_credentials_secret: str = None, vault_name: str = None, - file_path: str = "None", + file_path: str = None, sep: str = "\t", to_path: str = None, if_exists: Literal["replace", "append", "delete"] = "replace", @@ -27,7 +27,7 @@ def __init__( **kwargs: Dict[str, Any] ): """ - Flow for downloading data from ASElite to csv file, then uploading it to Azure Storage Explorer. + Flow for downloading data from ASElite to csv file, then uploading it to ADLS. Args: name (str): The name of the flow. 
diff --git a/viadot/task_utils.py b/viadot/task_utils.py index a1fc3a9ec..1bcdeaddf 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -258,7 +258,7 @@ def cleanup_validation_clutter(expectations_path): @task -def df_converts_bytes_to_int(df): +def df_converts_bytes_to_int(df: pd.DataFrame) -> pd.DataFrame: logger = prefect.context.get("logger") logger.info("Converting bytes in dataframe columns to list of integers") return df.applymap(lambda x: list(map(int, x)) if isinstance(x, bytes) else x) diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py index b0214b410..005db1df0 100644 --- a/viadot/tasks/aselite.py +++ b/viadot/tasks/aselite.py @@ -1,11 +1,11 @@ -from prefect import Task -from viadot.sources.base import SQL +import json +import prefect from typing import Any, Dict +from prefect import Task from prefect.tasks.secrets import PrefectSecret from .azure_key_vault import AzureKeyVaultSecret from viadot.config import local_config -import json -import prefect +from viadot.sources import AzureSQL class ASELiteToDF(Task): @@ -56,7 +56,7 @@ def run( credentials = local_config.get("ASELite_SQL") logger.info("Loaded credentials from local source") - aselite = SQL(credentials=credentials) + aselite = AzureSQL(credentials=credentials) logger.info("Connected to ASELITE SOURCE") df = aselite.to_df(query=query) logger.info("Succefully collected data from query") From 5d0c182f58250fdb1d92f064da12668cf9be7356 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 16 Mar 2022 09:29:17 +0100 Subject: [PATCH 084/135] =?UTF-8?q?=F0=9F=9A=9A=20Moved=20test=20from=20un?= =?UTF-8?q?it=20to=20integration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/{unit => integration}/flows/test_multiple_flows.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{unit => integration}/flows/test_multiple_flows.py (100%) diff --git a/tests/unit/flows/test_multiple_flows.py b/tests/integration/flows/test_multiple_flows.py similarity index 100% rename from tests/unit/flows/test_multiple_flows.py rename to tests/integration/flows/test_multiple_flows.py From 1bde1235064005e39c997b7a37631adc0a9a9312 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 16 Mar 2022 15:48:00 +0100 Subject: [PATCH 085/135] =?UTF-8?q?=F0=9F=90=9B=20Changed=20return=20value?= =?UTF-8?q?=20in=20task=20and=20added=20option=20in=20SupermetricsToADLS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/supermetrics_to_adls.py | 16 ++++++++++------ viadot/tasks/prefect.py | 7 +++---- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 6a19dd360..a1c8da473 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -69,6 +69,7 @@ def __init__( parallel: bool = True, tags: List[str] = ["extract"], vault_name: str = None, + check_missing_data: bool = True, *args: List[any], **kwargs: Dict[str, Any], ): @@ -109,6 +110,7 @@ def __init__( parallel (bool, optional): Whether to parallelize the downloads. Defaults to True. tags (List[str], optional): Flow tags to use, eg. to control flow concurrency. Defaults to ["extract"]. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. + check_missing_data (bool, optional): Whether to check missing data. Defaults to True. 
""" if not ds_user: try: @@ -118,6 +120,7 @@ def __init__( raise ValueError(msg) from e self.flow_name = name + self.check_missing_data = check_missing_data # SupermetricsToDF self.ds_id = ds_id self.ds_accounts = ds_accounts @@ -200,12 +203,13 @@ def gen_supermetrics_task( return t def gen_flow(self) -> Flow: - if self.date_range_type is not None and "days" in self.date_range_type: - self.date_range_type = prefect_get_new_date_range.run( - flow_name=self.flow_name, - date_range_type=self.date_range_type, - flow=self, - ) + if self.check_missing_data is True: + if self.date_range_type is not None and "days" in self.date_range_type: + self.date_range_type = prefect_get_new_date_range.run( + flow_name=self.flow_name, + date_range_type=self.date_range_type, + flow=self, + ) if self.parallel: # generate a separate task for each account diff --git a/viadot/tasks/prefect.py b/viadot/tasks/prefect.py index 71a6c2a52..af000db2c 100644 --- a/viadot/tasks/prefect.py +++ b/viadot/tasks/prefect.py @@ -208,7 +208,6 @@ def run( time_schedule = flow_runs_details[0]["flow_runs"][0]["scheduled_start_time"] last_success_start_time = get_time_from_last_successful_run(flow_runs_details) - is_scheduled = check_if_scheduled_run( time_run=last_success_start_time, time_schedule=time_schedule, @@ -220,10 +219,10 @@ def run( base_date=time_schedule, diff_type="date", ) - date_range_type = self.change_date_range( + new_date_range_type = self.change_date_range( date_range=date_range_type, difference=difference_days ) - return date_range_type + return new_date_range_type if is_scheduled is False: - return 0 + return date_range_type From 1ba1d77030101d0ed0fdda5088962a179ea74341 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 17 Mar 2022 11:13:59 +0100 Subject: [PATCH 086/135] =?UTF-8?q?=F0=9F=8E=A8=20Rmoced=20added=20section?= =?UTF-8?q?=20in=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a8044a204..a6396097b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Added `MultipleFlows` flow class which enables running multiple flows in a given order. 
-### Added - Added `SQLServer` source - Added `DuckDBToDF` task - Added `DuckDBTransform` flow From 0efba9a812ab938fa646c3f2ad03cf62ade0f23c Mon Sep 17 00:00:00 2001 From: m-paz <59165045+m-paz@users.noreply.github.com> Date: Thu, 17 Mar 2022 11:48:30 +0100 Subject: [PATCH 087/135] Update conftest.py --- tests/conftest.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index c7b7cd7e0..28f9fff39 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -51,12 +51,6 @@ def create_test_parquet_file(DF, TEST_PARQUET_FILE_PATH): os.remove(TEST_PARQUET_FILE_PATH) -@pytest.fixture(scope="session") -def TEST_CSV_ASELITE_PATH(): - file_path = "raw/supermetrics/mp/result_df_flow_at_des_m.csv" - return file_path - - @pytest.fixture(scope="session", autouse=True) def create_test_parquet_file_2(DF, TEST_PARQUET_FILE_PATH_2): DF.to_parquet(TEST_PARQUET_FILE_PATH_2, index=False) From 0e872fa2fda6eeaf3a0ad8c1886744d062db9056 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Mar 2022 21:06:41 +0000 Subject: [PATCH 088/135] Bump great-expectations from 0.13.44 to 0.14.11 Bumps [great-expectations](https://github.com/great-expectations/great_expectations) from 0.13.44 to 0.14.11. - [Release notes](https://github.com/great-expectations/great_expectations/releases) - [Changelog](https://github.com/great-expectations/great_expectations/blob/develop/docs/changelog.md) - [Commits](https://github.com/great-expectations/great_expectations/compare/0.13.44...0.14.11) --- updated-dependencies: - dependency-name: great-expectations dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6074b74dc..d5c180678 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ openpyxl==3.0.9 jupyterlab==3.2.4 azure-keyvault==4.1.0 azure-identity==1.7.1 -great-expectations==0.13.44 +great-expectations==0.14.11 matplotlib adlfs==2021.10.0 PyGithub==1.55 From 9940c67df1197d312a28e8f2689d83a40919c1ab Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Mar 2022 23:02:44 +0100 Subject: [PATCH 089/135] =?UTF-8?q?=F0=9F=90=9B=20Fix=20some=20mappings=20?= =?UTF-8?q?in=20`get=5Fsql=5Fdtypes=5Ffrom=5Fdf()`=20and=20optimize=20perf?= =?UTF-8?q?ormance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 5abdfed5e..c0a3e3b32 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -97,30 +97,30 @@ def df_get_data_types_task(df: pd.DataFrame) -> dict: def get_sql_dtypes_from_df(df: pd.DataFrame) -> dict: """Obtain SQL data types from a pandas DataFrame""" typeset = CompleteSet() - dtypes = infer_type(df, typeset) + dtypes = infer_type(df.head(10000), typeset) dtypes_dict = {k: str(v) for k, v in dtypes.items()} dict_mapping = { "Float": "REAL", "Image": None, "Categorical": "VARCHAR(500)", "Time": "TIME", - "Boolean": "BIT", + "Boolean": "VARCHAR(5)", # Bool is True/False, Microsoft expects 0/1 "DateTime": "DATETIMEOFFSET", # DATETIMEOFFSET is the only timezone-aware dtype in TSQL "Object": "VARCHAR(500)", "EmailAddress": "VARCHAR(50)", "File": None, "Geometry": "GEOMETRY", - "Ordinal": "VARCHAR(500)", + "Ordinal": "INT", "Integer": "INT", 
"Generic": "VARCHAR(500)", - "UUID": "UNIQUEIDENTIFIER", + "UUID": "VARCHAR(50)", # Microsoft uses a custom UUID format so we can't use it "Complex": None, "Date": "DATE", "String": "VARCHAR(500)", "IPAddress": "VARCHAR(39)", - "Path": "VARCHAR(500)", + "Path": "VARCHAR(255)", "TimeDelta": "VARCHAR(20)", # datetime.datetime.timedelta; eg. '1 days 11:00:00' - "URL": "VARCHAR(500)", + "URL": "VARCHAR(255)", "Count": "INT", } dict_dtypes_mapped = {} From 3c5fda8fea7c8aa25ce6c9895744ff252dbba12f Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Mar 2022 23:03:29 +0100 Subject: [PATCH 090/135] =?UTF-8?q?=F0=9F=8E=A8=20Format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c945635b2..75405c397 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,10 +3,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] -### Added +## [Unreleased] +### Added - Added `MultipleFlows` flow class which enables running multiple flows in a given order. - Added new task `GetFlowNewDateRange` to change date range based on Prefect flows - Added `check_col_order` parameter in `ADLSToAzureSQL` @@ -28,12 +28,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Upgraded `duckdb` version to 0.3.2 ### Fixed - - Fixed bug with `CheckColumnOrder` task - Fixed OpenSSL config for old SQL Servers still using TLS < 1.2 - `BCPTask` now correctly handles custom SQL Server port - Fixed `SAPRFC.to_df()` ignoring user-specified separator - Fixed temporary CSV generated by the `DuckDBToSQLServer` flow not being cleaned up +- Fixed some mappings in `get_sql_dtypes_from_df()` and optimized performance ### Removed - Removed `autopick_sep` parameter from `SAPRFC` functions. The separator is now always picked automatically if not provided. @@ -44,6 +44,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - fixed an issue with schema info within `CheckColumnOrder` class. + ## [0.3.1] - 2022-02-17 ### Changed -`ADLSToAzureSQL` - added `remove_tab` parameter to remove uncessery tab separators from data. @@ -51,6 +52,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - fixed an issue with return df within `CheckColumnOrder` class. 
+ ## [0.3.0] - 2022-02-16 ### Added - new source `SAPRFC` for connecting with SAP using the `pyRFC` library (requires pyrfc as well as the SAP NW RFC library that can be downloaded [here](https://support.sap.com/en/product/connectors/nwrfcsdk.html) @@ -74,6 +76,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - C4C connection with url and report_url optimization - column mapper in C4C source + ## [0.2.15] - 2022-01-12 ### Added - new option to `ADLSToAzureSQL` Flow - `if_exists="delete"` @@ -87,10 +90,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.2.14] - 2021-12-01 - ### Fixed - authorization issue within `CloudForCustomers` source + ## [0.2.13] - 2021-11-30 ### Added - Added support for file path to `CloudForCustomersReportToADLS` flow @@ -104,6 +107,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `Sharepoint` and `CloudForCustomers` sources will now provide an informative `CredentialError` which is also raised early. This will make issues with input credenials immediately clear to the user. - Removed set_key_value from `CloudForCustomersReportToADLS` flow + ## [0.2.12] - 2021-11-25 ### Added - Added `Sharepoint` source @@ -117,18 +121,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `df_to_parquet` task to task_utils.py - Added `dtypes_to_json` task to task_utils.py -## [0.2.11] - 2021-10-30 +## [0.2.11] - 2021-10-30 ### Fixed - `ADLSToAzureSQL` - fixed path to csv issue. - `SupermetricsToADLS` - fixed local json path issue. + ## [0.2.10] - 2021-10-29 ### Release due to CI/CD error + ## [0.2.9] - 2021-10-29 ### Release due to CI/CD error + ## [0.2.8] - 2021-10-29 ### Changed - CI/CD: `dev` image is now only published on push to the `dev` branch @@ -161,6 +168,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed `ADLSToAzureSQL` breaking in `"append"` mode if the table didn't exist (#145). - Fixed `ADLSToAzureSQL` breaking in promotion path for csv files. 
+ ## [0.2.6] - 2021-09-22 ### Added - Added flows library docs to the references page @@ -286,14 +294,12 @@ specified in the `SUPERMETRICS_DEFAULT_USER` secret - Tasks now use secrets for credential management (azure tasks use Azure Key Vault secrets) - SQL source now has a default query timeout of 1 hour - ### Fixed - Fix `SQLite` tests - Multiple stability improvements with retries and timeouts ## [0.1.12] - 2021-05-08 - ### Changed - Moved from poetry to pip From 20b57054cc226793bf79018a24becd93751ae80e Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Mar 2022 23:11:04 +0100 Subject: [PATCH 091/135] =?UTF-8?q?=E2=9C=A8=20Add=20`get=5Fflow=5Flast=5F?= =?UTF-8?q?run=5Fdate`=20util?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/utils.py | 50 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 75405c397..d101aad0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `get_sql_dtypes_from_df` and `update_dict` util tasks - Added `DuckDBToSQLServer` flow - Added `if_exists="append"` option to `DuckDB.create_table_from_parquet()` +- Added `get_flow_last_run_date` util function ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/viadot/utils.py b/viadot/utils.py index 671ea61d4..3b2b8afdd 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -1,10 +1,15 @@ -from .exceptions import APIError +from typing import Any, Dict + +import pendulum +import prefect +import requests +from prefect.utilities.graphql import EnumValue, with_args from requests.adapters import HTTPAdapter from requests.exceptions import ConnectionError, HTTPError, ReadTimeout, Timeout -from urllib3.exceptions import ProtocolError from requests.packages.urllib3.util.retry import Retry -from typing import Any, Dict -import requests +from urllib3.exceptions import ProtocolError + +from .exceptions import APIError def slugify(name: str) -> str: @@ -75,3 +80,40 @@ def handle_api_response( raise APIError("Unknown error.") from e return response + + +def get_flow_last_run_date(flow_name: str) -> str: + """ + Retrieve a flow's last run date as an ISO datetime string. + + This function assumes you are already authenticated with Prefect Cloud. 
+ """ + client = prefect.Client() + result = client.graphql( + { + "query": { + with_args( + "flow_run", + { + "where": { + "flow": {"name": {"_eq": flow_name}}, + "start_time": {"_is_null": False}, + "state": {"_eq": "Success"}, + }, + "order_by": {"start_time": EnumValue("desc")}, + "limit": 1, + }, + ): {"start_time"} + } + } + ) + flow_run_data = result.get("data", {}).get("flow_run") + + if not flow_run_data: + return None + + last_run_date_raw_format = flow_run_data[0]["start_time"] + last_run_date = ( + pendulum.parse(last_run_date_raw_format).format("YYYY-MM-DDTHH:MM:SS") + "Z" + ) + return last_run_date From 999fcd096e6fd5a1c8b0bed2e1613f20580ca5fa Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Mar 2022 23:16:16 +0100 Subject: [PATCH 092/135] =?UTF-8?q?=F0=9F=90=9B=20Fix=20`BCPTask`=20-=20ca?= =?UTF-8?q?se=20when=20the=20file=20path=20contained=20a=20space?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/tasks/bcp.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d101aad0a..110fa6731 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed `SAPRFC.to_df()` ignoring user-specified separator - Fixed temporary CSV generated by the `DuckDBToSQLServer` flow not being cleaned up - Fixed some mappings in `get_sql_dtypes_from_df()` and optimized performance +- Fixed `BCPTask` - the case when the file path contained a space ### Removed - Removed `autopick_sep` parameter from `SAPRFC` functions. The separator is now always picked automatically if not provided. diff --git a/viadot/tasks/bcp.py b/viadot/tasks/bcp.py index c38426c80..cb986ba91 100644 --- a/viadot/tasks/bcp.py +++ b/viadot/tasks/bcp.py @@ -113,5 +113,5 @@ def run( # but not in BCP's 'server' argument. 
server = server.replace(" ", "") - command = f"/opt/mssql-tools/bin/bcp {fqn} in {path} -S {server} -d {db_name} -U {uid} -P '{pwd}' -c -F 2 -b 5000 -h 'TABLOCK'" + command = f"/opt/mssql-tools/bin/bcp {fqn} in '{path}' -S {server} -d {db_name} -U {uid} -P '{pwd}' -c -F 2 -b 5000 -h 'TABLOCK'" return super().run(command=command, **kwargs) From 4bd2a7dac51fb7c96cab122349d3769786fc7f45 Mon Sep 17 00:00:00 2001 From: trymzet Date: Mon, 21 Mar 2022 23:24:47 +0100 Subject: [PATCH 093/135] =?UTF-8?q?=E2=9C=A8=20Add=20`df=5Fto=5Fdataset`?= =?UTF-8?q?=20task=20util?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/task_utils.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 110fa6731..32b8871f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `DuckDBToSQLServer` flow - Added `if_exists="append"` option to `DuckDB.create_table_from_parquet()` - Added `get_flow_last_run_date` util function +- Added `df_to_dataset` task util for writing DataFrames to data lakes using `pyarrow` ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/viadot/task_utils.py b/viadot/task_utils.py index c0a3e3b32..5a6fdbb45 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -8,6 +8,8 @@ import pandas as pd import prefect +import pyarrow as pa +import pyarrow.dataset as ds from prefect import task from prefect.storage import Git from prefect.utilities import logging @@ -305,6 +307,41 @@ def df_converts_bytes_to_int(df: pd.DataFrame) -> pd.DataFrame: return df.applymap(lambda x: list(map(int, x)) if isinstance(x, bytes) else x) +@task +def df_to_dataset( + df: pd.DataFrame, partitioning_flavor="hive", format="parquet", **kwargs +) -> None: + """ + Use `pyarrow.dataset.write_to_dataset()` to write from a pandas DataFrame to a dataset. + This enables several data lake-specific optimizations such as parallel writes, partitioning, + and file size (via `max_rows_per_file` parameter). + + Args: + df (pd.DataFrame): The pandas DataFrame to write. + partitioning_flavor (str, optional): The partitioning flavor to use. Defaults to "hive". + format (str, optional): The dataset format. Defaults to 'parquet'. + kwargs: Keyword arguments to be passed to `write_to_dataset()`. See + https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html. 
+ + Examples: + table = pa.Table.from_pandas(df_contact) + base_dir = "/home/viadot/contact" + partition_cols = ["updated_at_year", "updated_at_month", "updated_at_day"] + + df_to_dataset( + data=table, + base_dir=base_dir, + partitioning=partition_cols, + existing_data_behavior='overwrite_or_ignore', + max_rows_per_file=100_000 + ) + """ + table = pa.Table.from_pandas(df) + ds.write_dataset( + data=table, partitioning_flavor=partitioning_flavor, format=format, **kwargs + ) + + class Git(Git): @property def git_clone_url(self): From bdd0bff5fa93118730d1077e72a4dc2ccdd26663 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 23 Mar 2022 10:06:20 +0100 Subject: [PATCH 094/135] init files --- viadot/flows/aselite_to_adls.py | 12 +++++++++++- viadot/task_utils.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index 1575eca49..c7eaaab55 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Literal from prefect import Flow from viadot.tasks import AzureDataLakeUpload -from viadot.task_utils import df_to_csv, df_converts_bytes_to_int +from viadot.task_utils import df_to_csv, df_converts_bytes_to_int, df_clean_column from viadot.tasks.aselite import ASELiteToDF @@ -23,6 +23,8 @@ def __init__( overwrite: bool = True, convert_bytes: bool = False, sp_credentials_secret: str = None, + remove_special_characters: bool = None, + columns_to_clean: List[str] = None, *args: List[any], **kwargs: Dict[str, Any] ): @@ -42,6 +44,9 @@ def __init__( overwrite (str, optional): Whether to overwrite the destination file. Defaults to True. sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. + remove_special_characters (str, optional): Call a function that remove special characters like escape symbols. Defaults to None. + columns_to_clean (List(str), optional): Select columns to clean, used with remove_special_characters. + If None whole data frame will be processed. Defaults to None. """ self.query = query self.sqldb_credentials_secret = sqldb_credentials_secret @@ -54,6 +59,8 @@ def __init__( self.if_exists = if_exists self.convert_bytes = convert_bytes self.sp_credentials_secret = sp_credentials_secret + self.remove_special_characters = remove_special_characters + self.columns_to_clean = columns_to_clean super().__init__(*args, name=name, **kwargs) @@ -70,6 +77,9 @@ def gen_flow(self) -> Flow: if self.convert_bytes == True: df = df_converts_bytes_to_int.bind(df, flow=self) + if self.remove_special_characters == True: + df = df_clean_column(df, columns_to_clean=self.columns_to_clean, flow=self) + create_csv = df_to_csv.bind( df, path=self.file_path, diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 5a6fdbb45..7faedb676 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -342,6 +342,36 @@ def df_to_dataset( ) +@task +def df_clean_column( + df: pd.DataFrame, columns_to_clean: List[str] = None +) -> pd.DataFrame: + """ + Function that remove special characters from data frame like escape symbols etc. 
+ Args: + df (pd.DataFrame): DataFrame + columns_to_clean (List[str]): List of columns + """ + if columns_to_clean == None: + df.replace( + to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], + value=["", ""], + regex=True, + inplace=True, + ) + + else: + for x in columns_to_clean: + df[x].replace( + to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], + value=["", ""], + regex=True, + inplace=True, + ) + + return df + + class Git(Git): @property def git_clone_url(self): From 60f78347134c36677c2a93561e50dd0568225d15 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 23 Mar 2022 10:27:22 +0100 Subject: [PATCH 095/135] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20conflict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index 4b6101d25..17ee5254e 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -13,6 +13,7 @@ dtypes_to_json_task, write_to_json, df_converts_bytes_to_int, + df_clean_column, ) @@ -170,3 +171,17 @@ def test_write_to_json(): write_to_json.run(dict, "dict.json") assert os.path.exists("dict.json") os.remove("dict.json") + + +def test_df_clean_column(): + data = { + "col_1": ["a", "b \\r", "\t c", "d \r\n a"], + "col_2": ["a", "b \\r", "\t c", "d \r\n a"], + } + expected_output = { + "col_1": {0: "a", 1: "b ", 2: " c", 3: "d a"}, + "col_2": {0: "a", 1: "b ", 2: " c", 3: "d a"}, + } + df = pd.DataFrame.from_dict(data) + output = df_clean_column.run(df).to_dict() + assert expected_output == output From ae9a561b2359ec8c24e7b26b4a262d573ffb3df3 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 23 Mar 2022 10:37:18 +0100 Subject: [PATCH 096/135] =?UTF-8?q?=F0=9F=93=9D=20Updated=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32b8871f5..59f7c531b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added new function that clean data frame columns from special characters - Added `MultipleFlows` flow class which enables running multiple flows in a given order. 
- Added new task `GetFlowNewDateRange` to change date range based on Prefect flows - Added `check_col_order` parameter in `ADLSToAzureSQL` From a40e49985d6695f67a8908ae7dbebc632e3ba51a Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 24 Mar 2022 01:43:44 +0100 Subject: [PATCH 097/135] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Add=20retries=20to?= =?UTF-8?q?=20C4C=20tasks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/tasks/cloud_for_customers.py | 24 +++++++++++++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32b8871f5..5a28d83dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `if_exists="append"` option to `DuckDB.create_table_from_parquet()` - Added `get_flow_last_run_date` util function - Added `df_to_dataset` task util for writing DataFrames to data lakes using `pyarrow` +- Added retries to Cloud for Customers tasks ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/viadot/tasks/cloud_for_customers.py b/viadot/tasks/cloud_for_customers.py index aefec958e..eaf01505e 100644 --- a/viadot/tasks/cloud_for_customers.py +++ b/viadot/tasks/cloud_for_customers.py @@ -1,12 +1,16 @@ -from prefect import task, Task import json -import pandas as pd -from ..sources import CloudForCustomers +from datetime import timedelta from typing import Any, Dict, List -from prefect.utilities.tasks import defaults_from_attrs + +import pandas as pd +from viadot.config import local_config + +from prefect import Task from prefect.tasks.secrets import PrefectSecret +from prefect.utilities.tasks import defaults_from_attrs + +from ..sources import CloudForCustomers from .azure_key_vault import AzureKeyVaultSecret -from viadot.config import local_config class C4CReportToDF(Task): @@ -17,6 +21,8 @@ def __init__( env: str = "QA", skip: int = 0, top: int = 1000, + max_retries: int = 3, + retry_delay: timedelta = timedelta(seconds=10), **kwargs, ): @@ -27,6 +33,8 @@ def __init__( super().__init__( name="c4c_report_to_df", + max_retries=max_retries, + retry_delay=retry_delay, *args, **kwargs, ) @@ -49,6 +57,8 @@ def run( top: int = 1000, credentials_secret: str = None, vault_name: str = None, + max_retries: int = 3, + retry_delay: timedelta = timedelta(seconds=10), ): """ Task for downloading data from the Cloud for Customers to a pandas DataFrame using report URL @@ -111,6 +121,8 @@ def __init__( params: Dict[str, Any] = {}, env: str = "QA", if_empty: str = "warn", + max_retries: int = 3, + retry_delay: timedelta = timedelta(seconds=10), **kwargs, ): @@ -123,6 +135,8 @@ def __init__( super().__init__( name="c4c_to_df", + max_retries=max_retries, + retry_delay=retry_delay, *args, **kwargs, ) From 2989e682f64d05863633e44c626b3da6e58a01d2 Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 24 Mar 2022 01:46:59 +0100 Subject: [PATCH 098/135] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20credential=20eva?= =?UTF-8?q?luation=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/sources/azure_sql.py | 3 +++ viadot/sources/base.py | 9 ++++++--- viadot/sources/sql_server.py | 5 +++-- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a28d83dd..20bf275e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ and this project adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0 - Fixed temporary CSV generated by the `DuckDBToSQLServer` flow not being cleaned up - Fixed some mappings in `get_sql_dtypes_from_df()` and optimized performance - Fixed `BCPTask` - the case when the file path contained a space +- Fixed credential evaluation logic (`credentials` is now evaluated before `config_key`) ### Removed - Removed `autopick_sep` parameter from `SAPRFC` functions. The separator is now always picked automatically if not provided. diff --git a/viadot/sources/azure_sql.py b/viadot/sources/azure_sql.py index d9168a89f..f14645837 100644 --- a/viadot/sources/azure_sql.py +++ b/viadot/sources/azure_sql.py @@ -8,6 +8,9 @@ class AzureSQL(SQLServer): + def __init__(self, *args, config_key="AZURE_SQL", **kwargs): + super().__init__(*args, config_key=config_key, **kwargs) + def bulk_insert( self, table: str, diff --git a/viadot/sources/base.py b/viadot/sources/base.py index 6dc39160f..a1ba325f7 100644 --- a/viadot/sources/base.py +++ b/viadot/sources/base.py @@ -159,8 +159,10 @@ def __init__( if config_key: config_credentials = local_config.get(config_key) + else: + config_credentials = None - credentials = config_credentials if config_key else credentials or {} + credentials = credentials or config_credentials or {} if driver: credentials["driver"] = driver @@ -172,7 +174,7 @@ def __init__( @property def conn_str(self) -> str: """Generate a connection string from params or config. - Note that the user and password are escapedd with '{}' characters. + Note that the user and password are escaped with '{}' characters. Returns: str: The ODBC connection string. @@ -206,7 +208,8 @@ def run(self, query: str) -> Union[List[Record], bool]: cursor = self.con.cursor() cursor.execute(query) - if query.strip().upper().startswith("SELECT"): + query_sanitized = query.strip().upper() + if query_sanitized.startswith("SELECT") or query_sanitized.startswith("WITH"): result = cursor.fetchall() else: result = True diff --git a/viadot/sources/sql_server.py b/viadot/sources/sql_server.py index 821d5b15a..7f17c0c1d 100644 --- a/viadot/sources/sql_server.py +++ b/viadot/sources/sql_server.py @@ -7,11 +7,12 @@ class SQLServer(SQL): def __init__( self, + config_key="SQL_SERVER", + driver="ODBC Driver 17 for SQL Server", *args, **kwargs, ): - super().__init__(*args, **kwargs) - self.credentials["driver"] = "ODBC Driver 17 for SQL Server" + super().__init__(*args, driver=driver, config_key=config_key, **kwargs) @property def schemas(self) -> List[str]: From 8a6f327f8ea136350cbf7269f87fedfffca63858 Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 24 Mar 2022 01:47:24 +0100 Subject: [PATCH 099/135] =?UTF-8?q?=F0=9F=8E=A8=20Formatting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/azure_sql.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index a04c31a3a..0f7dee380 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -1,18 +1,17 @@ import json from datetime import timedelta from typing import Any, Dict, List, Literal + import pandas as pd -import sys from prefect import Task from prefect.tasks.secrets import PrefectSecret from prefect.utilities.tasks import defaults_from_attrs +from ..exceptions import ValidationError from ..sources import AzureSQL from .azure_key_vault import AzureKeyVaultSecret -from ..exceptions import ValidationError - def get_credentials(credentials_secret: str, vault_name: str = 
None): """ From 645268d8820ce06ec4ea4434ce9f4d4483f19210 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 24 Mar 2022 11:09:01 +0100 Subject: [PATCH 100/135] =?UTF-8?q?=F0=9F=90=9B=20Eddited=20if=20statement?= =?UTF-8?q?=20and=20doc=20string?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 7faedb676..a14d90c1d 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -348,11 +348,15 @@ def df_clean_column( ) -> pd.DataFrame: """ Function that remove special characters from data frame like escape symbols etc. + Args: df (pd.DataFrame): DataFrame - columns_to_clean (List[str]): List of columns + columns_to_clean (List[str]): List of columns. Defaults is None. + + Returns: + pd.DataFrame """ - if columns_to_clean == None: + if columns_to_clean is None: df.replace( to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["", ""], From b1877c7721d00751860e51ee8bed936132611668 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 24 Mar 2022 15:21:54 +0100 Subject: [PATCH 101/135] =?UTF-8?q?=F0=9F=90=9B=20Added=20test=20with=20ne?= =?UTF-8?q?w=20parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 +- tests/unit/test_task_utils.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59f7c531b..a5a67591e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- Added new function that clean data frame columns from special characters +- Added new function `df_clean_column` that clean data frame columns from special characters - Added `MultipleFlows` flow class which enables running multiple flows in a given order. 
- Added new task `GetFlowNewDateRange` to change date range based on Prefect flows - Added `check_col_order` parameter in `ADLSToAzureSQL` diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index 17ee5254e..bdf13e5db 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -185,3 +185,17 @@ def test_df_clean_column(): df = pd.DataFrame.from_dict(data) output = df_clean_column.run(df).to_dict() assert expected_output == output + + +def test_df_clean_column_defined(): + data = { + "col_1": ["a", "b", "c", "d a"], + "col_2": ["a", "b \\r", "\t c", "d \r\n a"], + } + expected_output = { + "col_1": {0: "a", 1: "b", 2: "c", 3: "d a"}, + "col_2": {0: "a", 1: "b ", 2: " c", 3: "d a"}, + } + df = pd.DataFrame.from_dict(data) + output = df_clean_column.run(df, ["col_2"]).to_dict() + assert output == expected_output From d16525cf60be0daeeba8b454fecf481c1ef16baf Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 24 Mar 2022 21:37:43 +0100 Subject: [PATCH 102/135] =?UTF-8?q?=E2=9C=A8=20Add=20`chunksize`=20paramet?= =?UTF-8?q?er=20to=20`C4CToDF`=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/tasks/cloud_for_customers.py | 70 ++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20bf275e6..937d597d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `get_flow_last_run_date` util function - Added `df_to_dataset` task util for writing DataFrames to data lakes using `pyarrow` - Added retries to Cloud for Customers tasks +- Added `chunksize` parameter to `C4CToDF` task to allow pulling data in chunks ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/viadot/tasks/cloud_for_customers.py b/viadot/tasks/cloud_for_customers.py index eaf01505e..2378ccf28 100644 --- a/viadot/tasks/cloud_for_customers.py +++ b/viadot/tasks/cloud_for_customers.py @@ -1,6 +1,6 @@ import json from datetime import timedelta -from typing import Any, Dict, List +from typing import Dict, List import pandas as pd from viadot.config import local_config @@ -61,13 +61,11 @@ def run( retry_delay: timedelta = timedelta(seconds=10), ): """ - Task for downloading data from the Cloud for Customers to a pandas DataFrame using report URL - (generated in Azure Data Factory). - C4CReportToDF task can not contain endpoint and params, this parameters are stored in generated report_url. + Task for downloading data from the Cloud for Customers to a pandas DataFrame using report URL. Args: - report_url (str, optional): The url to the API in case of prepared report. Defaults to None. - env (str, optional): The development environments. Defaults to 'QA'. + report_url (str, optional): The URL to the report. Defaults to None. + env (str, optional): The environment to use. Defaults to 'QA'. skip (int, optional): Initial index value of reading row. Defaults to 0. top (int, optional): The value of top reading row. Defaults to 1000. 
credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary @@ -118,7 +116,8 @@ def __init__( url: str = None, endpoint: str = None, fields: List[str] = None, - params: Dict[str, Any] = {}, + params: Dict[str, str] = None, + chunksize: int = 10000, env: str = "QA", if_empty: str = "warn", max_retries: int = 3, @@ -130,6 +129,7 @@ def __init__( self.endpoint = endpoint self.fields = fields self.params = params + self.chunksize = chunksize self.env = env self.if_empty = if_empty @@ -141,14 +141,17 @@ def __init__( **kwargs, ) - @defaults_from_attrs("url", "endpoint", "fields", "params", "env", "if_empty") + @defaults_from_attrs( + "url", "endpoint", "fields", "params", "chunksize", "env", "if_empty" + ) def run( self, url: str = None, env: str = "QA", endpoint: str = None, fields: List[str] = None, - params: List[str] = None, + params: Dict[str, str] = None, + chunksize: int = None, if_empty: str = "warn", credentials_secret: str = None, vault_name: str = None, @@ -161,14 +164,15 @@ def run( Example: url = "https://mysource.com/sap/c4c/odata/v1/c4codataapi" endpoint = "ServiceRequestCollection" - params = {"filter": "CreationDateTime > 2021-12-21T00:00:00Z"} + params = {"$filter": "CreationDateTime ge 2021-12-21T00:00:00Z"} Args: url (str, optional): The url to the API in case of prepared report. Defaults to None. - env (str, optional): The development environments. Defaults to 'QA'. + env (str, optional): The environment to use. Defaults to 'QA'. endpoint (str, optional): The endpoint of the API. Defaults to None. fields (List[str], optional): The C4C Table fields. Defaults to None. - params (Dict[str, Any]): The query parameters like filter by creation date time. Defaults to json format. + params (Dict[str, str]): Query parameters. Defaults to $format=json. + chunksize (int, optional): How many rows to retrieve from C4C at a time. Uses a server-side cursor. if_empty (str, optional): What to do if query returns no data. Defaults to "warn". credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with C4C credentials. Defaults to None. @@ -190,15 +194,39 @@ def run( credentials = json.loads(credentials_str)[env] else: credentials = local_config.get("CLOUD_FOR_CUSTOMERS")[env] - cloud_for_customers = CloudForCustomers( - url=url, - params=params, - endpoint=endpoint, - env=env, - fields=fields, - credentials=credentials, - ) - df = cloud_for_customers.to_df(if_empty=if_empty, fields=fields) + self.logger.info(f"Downloading data from {url}...") + + chunks = [] + offset = 0 + while True: + chunk_no = int(offset / chunksize) + 1 + boundaries = {"$skip": offset, "$top": chunksize} + params.update(boundaries) + + self.logger.info(f"Downloading chunk no. {chunk_no}...") + + chunk = CloudForCustomers( + url=url, + endpoint=endpoint, + params=params, + env=env, + credentials=credentials, + ).to_df(if_empty=if_empty, fields=fields) + + self.logger.debug(f"Chunk no. 
{chunk_no} has been downloaded successfully.") + + chunks.append(chunk) + + if chunk.shape[0] < chunksize: + break + + offset += chunksize + + self.logger.info(f"Data from {url+endpoint} has been downloaded successfully.") + + df = pd.concat(chunks) + + del chunks return df From fd392017c90e44fe33490e527e4dd4cb08f4f198 Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 24 Mar 2022 21:41:08 +0100 Subject: [PATCH 103/135] =?UTF-8?q?=F0=9F=94=8A=20Add=20more=20detail=20to?= =?UTF-8?q?=20the=20log=20message?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/cloud_for_customers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/tasks/cloud_for_customers.py b/viadot/tasks/cloud_for_customers.py index 2378ccf28..17630bfd0 100644 --- a/viadot/tasks/cloud_for_customers.py +++ b/viadot/tasks/cloud_for_customers.py @@ -195,7 +195,7 @@ def run( else: credentials = local_config.get("CLOUD_FOR_CUSTOMERS")[env] - self.logger.info(f"Downloading data from {url}...") + self.logger.info(f"Downloading data from {url+endpoint}...") chunks = [] offset = 0 From 16d59b164a6e1204ec7559bfa862b01a570fd577 Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 24 Mar 2022 21:57:07 +0100 Subject: [PATCH 104/135] =?UTF-8?q?=E2=9C=A8=20Add=20`chunksize`=20paramet?= =?UTF-8?q?er=20to=20`BCPTask`=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/tasks/bcp.py | 30 ++++++++++++++++++------------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 937d597d0..d190cef12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `df_to_dataset` task util for writing DataFrames to data lakes using `pyarrow` - Added retries to Cloud for Customers tasks - Added `chunksize` parameter to `C4CToDF` task to allow pulling data in chunks +- Added `chunksize` parameter to `BCPTask` task to allow more control over the load process ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/viadot/tasks/bcp.py b/viadot/tasks/bcp.py index cb986ba91..0d64111a1 100644 --- a/viadot/tasks/bcp.py +++ b/viadot/tasks/bcp.py @@ -13,12 +13,13 @@ class BCPTask(ShellTask): Task for bulk inserting data into SQL Server-compatible databases. Args: - - path (str, optional): the path to the local CSV file to be inserted - - schema (str, optional): the destination schema - - table (str, optional): the destination table + - path (str, optional): The path to the local CSV file to be inserted. + - schema (str, optional): The destination schema. + - table (str, optional): The destination table. + - chunksize (int, optional): The chunk size to use. - credentials (dict, optional): The credentials to use for connecting with the database. - - vault_name (str): the name of the vault from which to fetch the secret - - **kwargs (dict, optional): additional keyword arguments to pass to the Task constructor + - vault_name (str): The name of the vault from which to fetch the secret. + - **kwargs (dict, optional): Additional keyword arguments to pass to the Task constructor. 
""" def __init__( @@ -26,6 +27,7 @@ def __init__( path: str = None, schema: str = None, table: str = None, + chunksize: int = 5000, credentials: dict = None, vault_name: str = None, max_retries: int = 3, @@ -36,6 +38,7 @@ def __init__( self.path = path self.schema = schema self.table = table + self.chunksize = chunksize self.credentials = credentials self.vault_name = vault_name @@ -53,6 +56,7 @@ def __init__( "path", "schema", "table", + "chunksize", "credentials", "vault_name", "max_retries", @@ -63,6 +67,7 @@ def run( path: str = None, schema: str = None, table: str = None, + chunksize: int = None, credentials: dict = None, credentials_secret: str = None, vault_name: str = None, @@ -74,16 +79,17 @@ def run( Task run method. Args: - - path (str, optional): the path to the local CSV file to be inserted - - schema (str, optional): the destination schema - - table (str, optional): the destination table + - path (str, optional): The path to the local CSV file to be inserted. + - schema (str, optional): The destination schema. + - table (str, optional): The destination table. + - chunksize (int, optional): The chunk size to use. By default 5000. - credentials (dict, optional): The credentials to use for connecting with SQL Server. - - credentials_secret (str, optional): the name of the Key Vault secret containing database credentials + - credentials_secret (str, optional): The name of the Key Vault secret containing database credentials. (server, db_name, user, password) - - vault_name (str): the name of the vault from which to fetch the secret + - vault_name (str): The name of the vault from which to fetch the secret. Returns: - str: the output of the bcp CLI command + str: The output of the bcp CLI command. """ if not credentials: if not credentials_secret: @@ -113,5 +119,5 @@ def run( # but not in BCP's 'server' argument. server = server.replace(" ", "") - command = f"/opt/mssql-tools/bin/bcp {fqn} in '{path}' -S {server} -d {db_name} -U {uid} -P '{pwd}' -c -F 2 -b 5000 -h 'TABLOCK'" + command = f"/opt/mssql-tools/bin/bcp {fqn} in '{path}' -S {server} -d {db_name} -U {uid} -P '{pwd}' -c -F 2 -b {chunksize} -h 'TABLOCK'" return super().run(command=command, **kwargs) From c48253c85de92e356ae089471844c8846a06dac9 Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 24 Mar 2022 22:18:12 +0100 Subject: [PATCH 105/135] =?UTF-8?q?=F0=9F=90=9B=20Fix=20"$top"=20and=20"$s?= =?UTF-8?q?kip"=20values=20being=20ignored=20by=20`C4CToDF`=20task=20if=20?= =?UTF-8?q?provided=20in=20the=20`params`=20parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/tasks/cloud_for_customers.py | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d190cef12..3b9baf09c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed some mappings in `get_sql_dtypes_from_df()` and optimized performance - Fixed `BCPTask` - the case when the file path contained a space - Fixed credential evaluation logic (`credentials` is now evaluated before `config_key`) +- Fix "$top" and "$skip" values being ignored by `C4CToDF` task if provided in the `params` parameter ### Removed - Removed `autopick_sep` parameter from `SAPRFC` functions. The separator is now always picked automatically if not provided. 
diff --git a/viadot/tasks/cloud_for_customers.py b/viadot/tasks/cloud_for_customers.py index 17630bfd0..a8d9dae2e 100644 --- a/viadot/tasks/cloud_for_customers.py +++ b/viadot/tasks/cloud_for_customers.py @@ -197,6 +197,16 @@ def run( self.logger.info(f"Downloading data from {url+endpoint}...") + # If we get any of these in params, we don't perform any chunking + if any(["$skip" in params, "$top" in params]): + return CloudForCustomers( + url=url, + endpoint=endpoint, + params=params, + env=env, + credentials=credentials, + ).to_df(if_empty=if_empty, fields=fields) + chunks = [] offset = 0 while True: From 7d3f52173417a81b0c395398f5415ccef6a63c1c Mon Sep 17 00:00:00 2001 From: m-paz <59165045+m-paz@users.noreply.github.com> Date: Fri, 25 Mar 2022 15:17:56 +0100 Subject: [PATCH 106/135] Updated requirements great-expectations==0.14.12 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1d38ce277..0c0d11aa8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ openpyxl==3.0.9 jupyterlab==3.2.4 azure-keyvault==4.1.0 azure-identity==1.7.1 -great-expectations==0.14.11 +great-expectations==0.14.12 matplotlib adlfs==2021.10.0 PyGithub==1.55 @@ -26,4 +26,4 @@ imagehash==4.2.1 visions==0.7.4 sharepy==1.3.0 sql-metadata==2.3.0 -duckdb==0.3.2 \ No newline at end of file +duckdb==0.3.2 From d67a40a2ba8924cdf67c2d3612a50a6ebc67109e Mon Sep 17 00:00:00 2001 From: trymzet Date: Sun, 27 Mar 2022 17:06:24 +0200 Subject: [PATCH 107/135] =?UTF-8?q?=E2=9C=A8=20Add=20support=20for=20SQL?= =?UTF-8?q?=20Server's=20custom=20`datetimeoffset`=20type?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/sources/base.py | 8 ++++++-- viadot/sources/sql_server.py | 37 +++++++++++++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b9baf09c..fcdb18859 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added retries to Cloud for Customers tasks - Added `chunksize` parameter to `C4CToDF` task to allow pulling data in chunks - Added `chunksize` parameter to `BCPTask` task to allow more control over the load process +- Added support for SQL Server's custom `datetimeoffset` type ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/viadot/sources/base.py b/viadot/sources/base.py index a1ba325f7..50c3ddbb9 100644 --- a/viadot/sources/base.py +++ b/viadot/sources/base.py @@ -219,13 +219,17 @@ def run(self, query: str) -> Union[List[Record], bool]: return result - def to_df(self, query: str, if_empty: str = None) -> pd.DataFrame: + def to_df( + self, query: str, con: pyodbc.Connection = None, if_empty: str = None + ) -> pd.DataFrame: """Creates DataFrame form SQL query. Args: query (str): SQL query. If don't start with "SELECT" returns empty DataFrame. + con (pyodbc.Connection, optional): The connection to use to pull the data. if_empty (str, optional): What to do if the query returns no data. Defaults to None. 
""" - conn = self.con + conn = con or self.con + if query.upper().startswith("SELECT"): df = pd.read_sql_query(query, conn) if df.empty: diff --git a/viadot/sources/sql_server.py b/viadot/sources/sql_server.py index 7f17c0c1d..753ad4b97 100644 --- a/viadot/sources/sql_server.py +++ b/viadot/sources/sql_server.py @@ -1,6 +1,9 @@ -from .base import SQL +import struct +from datetime import datetime, timedelta, timezone from typing import List +from .base import SQL + class SQLServer(SQL): DEFAULT_SCHEMA = "dbo" @@ -13,6 +16,7 @@ def __init__( **kwargs, ): super().__init__(*args, driver=driver, config_key=config_key, **kwargs) + self.con.add_output_converter(-155, self._handle_datetimeoffset) @property def schemas(self) -> List[str]: @@ -28,6 +32,37 @@ def tables(self) -> List[str]: ) return [".".join(row) for row in tables_tuples] + @staticmethod + def _handle_datetimeoffset(dto_value): + """ + Adds support for SQL Server's custom `datetimeoffset` type, which is not + handled natively by ODBC/pyodbc. + + See: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794 + """ + ( + year, + month, + day, + hour, + minute, + second, + nanoseconds, + offset_hours, + offset_minutes, + ) = struct.unpack("<6hI2h", dto_value) + dt = datetime( + year, + month, + day, + hour, + minute, + second, + nanoseconds // 1000, + tzinfo=timezone(timedelta(hours=offset_hours, minutes=offset_minutes)), + ) + return dt + def exists(self, table: str, schema: str = None) -> bool: """Check whether a table exists. Args: From 5e6c4f3640c3586e54957315dca9090693abc856 Mon Sep 17 00:00:00 2001 From: trymzet Date: Sun, 27 Mar 2022 17:27:58 +0200 Subject: [PATCH 108/135] =?UTF-8?q?=E2=9C=A8=20Add=20`AzureSQLToDF`=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/tasks/__init__.py | 2 +- viadot/tasks/azure_sql.py | 53 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fcdb18859..f4336c238 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `chunksize` parameter to `C4CToDF` task to allow pulling data in chunks - Added `chunksize` parameter to `BCPTask` task to allow more control over the load process - Added support for SQL Server's custom `datetimeoffset` type +- Added `AzureSQLToDF` task ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index 7f9254229..86aee449b 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -16,6 +16,7 @@ AzureSQLCreateTable, AzureSQLDBQuery, CreateTableFromBlob, + AzureSQLToDF, CheckColumnOrder, ) from .bcp import BCPTask @@ -35,4 +36,3 @@ from .duckdb import DuckDBCreateTableFromParquet, DuckDBQuery, DuckDBToDF from .sql_server import SQLServerCreateTable - diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index 0f7dee380..a6133c688 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -253,7 +253,7 @@ def __init__( self.credentials_secret = credentials_secret self.vault_name = vault_name - super().__init__(name="run_azure_sql_db_query", *args, **kwargs) + super().__init__(name="azure_sql_db_query", *args, **kwargs) def run( self, @@ -280,6 +280,53 @@ def run( return result +class AzureSQLToDF(Task): + """ + Task for loading the result of an Azure SQL Database query into a pandas DataFrame. 
+ + Args: + query (str, required): The query to execute on the database. + credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary + with SQL db credentials (server, db_name, user, and password). + vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + """ + + def __init__( + self, + credentials_secret: str = None, + vault_name: str = None, + *args, + **kwargs, + ): + self.credentials_secret = credentials_secret + self.vault_name = vault_name + + super().__init__(name="azure_sql_to_df", *args, **kwargs) + + def run( + self, + query: str, + credentials_secret: str = None, + vault_name: str = None, + ): + """Load the result of an Azure SQL Database query into a pandas DataFrame. + + Args: + query (str, required): The query to execute on the database. + credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary + with SQL db credentials (server, db_name, user, and password). + vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + """ + + credentials = get_credentials(credentials_secret, vault_name=vault_name) + azure_sql = AzureSQL(credentials=credentials) + + df = azure_sql.to_df(query) + + self.logger.info(f"Successfully downloaded data to a DataFrame.") + return df + + class CheckColumnOrder(Task): """ Task for checking the order of columns in the loaded DF and in the SQL table into which the data from DF will be loaded. @@ -315,7 +362,7 @@ def df_change_order( return df_changed - def rename_columns(self, df: pd.DataFrame = None): + def sanitize_columns(self, df: pd.DataFrame = None): """ Function to remove spaces at the end of column name. Args: @@ -348,7 +395,7 @@ def run( """ credentials = get_credentials(credentials_secret, vault_name=vault_name) azure_sql = AzureSQL(credentials=credentials) - df = self.rename_columns(df) + df = self.sanitize_columns(df) query = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}'" check_result = azure_sql.run(query=query) if if_exists not in ["replace", "fail"]: From bb33f4e1ea610a981e4c9c56fc9d06eac559265b Mon Sep 17 00:00:00 2001 From: trymzet Date: Sun, 27 Mar 2022 17:38:53 +0200 Subject: [PATCH 109/135] =?UTF-8?q?=F0=9F=90=9B=20Fix=20`SQL.to=5Fdf()`=20?= =?UTF-8?q?incorrectly=20handling=20queries=20that=20begin=20with=20whites?= =?UTF-8?q?pace?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 ++- viadot/sources/base.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4336c238..4486bb938 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,7 +43,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed some mappings in `get_sql_dtypes_from_df()` and optimized performance - Fixed `BCPTask` - the case when the file path contained a space - Fixed credential evaluation logic (`credentials` is now evaluated before `config_key`) -- Fix "$top" and "$skip" values being ignored by `C4CToDF` task if provided in the `params` parameter +- Fixed "$top" and "$skip" values being ignored by `C4CToDF` task if provided in the `params` parameter +- Fixed `SQL.to_df()` incorrectly handling queries that begin with whitespace ### Removed - Removed `autopick_sep` parameter from `SAPRFC` functions. The separator is now always picked automatically if not provided. 
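A minimal usage sketch of the new AzureSQLToDF task together with the to_df() fix below; the table, column, and Key Vault secret name are hypothetical:

from viadot.tasks import AzureSQLToDF

azure_sql_to_df = AzureSQLToDF()

# Leading whitespace and CTE ("WITH ...") queries are both recognized as
# row-returning statements once SQL.to_df() sanitizes the query (see the diff below).
df = azure_sql_to_df.run(
    query="""
        WITH recent AS (
            SELECT * FROM dbo.sales WHERE created_at >= '2022-01-01'
        )
        SELECT * FROM recent
    """,
    credentials_secret="AZURE-SQL-CREDENTIALS",  # hypothetical Key Vault secret name
)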
diff --git a/viadot/sources/base.py b/viadot/sources/base.py index 50c3ddbb9..6e30db497 100644 --- a/viadot/sources/base.py +++ b/viadot/sources/base.py @@ -230,7 +230,8 @@ def to_df( """ conn = con or self.con - if query.upper().startswith("SELECT"): + query_sanitized = query.strip().upper() + if query_sanitized.startswith("SELECT") or query_sanitized.startswith("WITH"): df = pd.read_sql_query(query, conn) if df.empty: self._handle_if_empty(if_empty=if_empty) From fad6956236ce79fde70ea0a4b3c7c737a99047b9 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 29 Mar 2022 12:54:40 +0200 Subject: [PATCH 110/135] =?UTF-8?q?=E2=9C=A8=20Added=20custom=20mail=20not?= =?UTF-8?q?ifier?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 16 +++++++++ viadot/utils.py | 61 +++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index 4b6101d25..44caa98bf 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -2,7 +2,11 @@ import numpy as np import os import pandas as pd +import prefect from typing import List +from viadot.utils import custom_state_handler +from prefect.engine.state import Failed, Success +from prefect import Task, Flow from viadot.task_utils import ( chunk_df, df_get_data_types_task, @@ -170,3 +174,15 @@ def test_write_to_json(): write_to_json.run(dict, "dict.json") assert os.path.exists("dict.json") os.remove("dict.json") + + +def test_custom_state_handler(): + final_state = custom_state_handler( + obj="Flow", + old_state=Success, + new_state=Failed, + only_states=[Failed], + API_KEY=None, + ) + + assert final_state == prefect.engine.state.Failed diff --git a/viadot/utils.py b/viadot/utils.py index 3b2b8afdd..ccd6c9ec6 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -1,14 +1,23 @@ from typing import Any, Dict +from datetime import datetime +from typing import TYPE_CHECKING, Any, Callable, Union, cast import pendulum import prefect import requests +import prefect.client +from prefect.engine.state import Failed from prefect.utilities.graphql import EnumValue, with_args +from prefect import Task, Flow +from sendgrid import SendGridAPIClient +from sendgrid.helpers.mail import Mail +from toolz import curry from requests.adapters import HTTPAdapter from requests.exceptions import ConnectionError, HTTPError, ReadTimeout, Timeout from requests.packages.urllib3.util.retry import Retry from urllib3.exceptions import ProtocolError + from .exceptions import APIError @@ -117,3 +126,55 @@ def get_flow_last_run_date(flow_name: str) -> str: pendulum.parse(last_run_date_raw_format).format("YYYY-MM-DDTHH:MM:SS") + "Z" ) return last_run_date + + +@curry +def custom_state_handler( + obj: Union["Flow", "Task"], + old_state: "prefect.engine.state.State", + new_state: "prefect.engine.state.State", + only_states: list = None, + API_KEY: str = None, +) -> "prefect.engine.state.State": + """ + Custom state handler configured to work with sendgrid. + Works as a standalone state handler, or can be called from within a custom state handler. 
+ Args: + - tracked_obj (Task or Flow): Task or Flow object the handler is registered with + - old_state (State): previous state of tracked object + - new_state (State): new state of tracked object + - only_states ([State], optional): similar to `ignore_states`, but instead _only_ + notifies you if the Task / Flow is in a state from the provided list of `State` + classes + Returns: + - State: the `new_state` object that was provided + Raises: + - ValueError: if the email notification fails for any reason + + """ + curr_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + only_states = only_states or [] + if only_states and not any( + [isinstance(new_state, included) for included in only_states] + ): + return new_state + url = prefect.client.Client().get_cloud_url( + "flow-run", prefect.context["flow_run_id"], as_user=False + ) + message = Mail( + from_email="notifications@dyvenia.com", + to_emails="notifications@dyvenia.com", + subject=f"The flow {obj.name} - Status {new_state}", + html_content=f"The flow {cast(str,obj.name)} FAILED at {curr_dt}. \ +
    <br><br>More details here: {url}</br></br>
", + ) + try: + sg = SendGridAPIClient(API_KEY) + response = sg.send(message) + print(response.status_code) + print(response.body) + print(response.headers) + except Exception as e: + print(e.message) + + return new_state From e3a16d5424d81f91ad4306d399a5e6f883dedfad Mon Sep 17 00:00:00 2001 From: winiar93 Date: Tue, 29 Mar 2022 13:03:25 +0200 Subject: [PATCH 111/135] =?UTF-8?q?=F0=9F=90=9B=20renamed=20variable=20in?= =?UTF-8?q?=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 2 +- viadot/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index 44caa98bf..df12eb9a4 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -178,7 +178,7 @@ def test_write_to_json(): def test_custom_state_handler(): final_state = custom_state_handler( - obj="Flow", + tracked_obj="Flow", old_state=Success, new_state=Failed, only_states=[Failed], diff --git a/viadot/utils.py b/viadot/utils.py index ccd6c9ec6..a9da6028c 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -130,7 +130,7 @@ def get_flow_last_run_date(flow_name: str) -> str: @curry def custom_state_handler( - obj: Union["Flow", "Task"], + tracked_obj: Union["Flow", "Task"], old_state: "prefect.engine.state.State", new_state: "prefect.engine.state.State", only_states: list = None, @@ -164,8 +164,8 @@ def custom_state_handler( message = Mail( from_email="notifications@dyvenia.com", to_emails="notifications@dyvenia.com", - subject=f"The flow {obj.name} - Status {new_state}", - html_content=f"The flow {cast(str,obj.name)} FAILED at {curr_dt}. \ + subject=f"The flow {tracked_obj.name} - Status {new_state}", + html_content=f"The flow {cast(str,tracked_obj.name)} FAILED at {curr_dt}. \
    <br><br>More details here: {url}</br></br>
", ) try: From 10e19a6c23f8b6f43a962a6d4fdcb60fc4a42107 Mon Sep 17 00:00:00 2001 From: m-paz <59165045+m-paz@users.noreply.github.com> Date: Wed, 30 Mar 2022 14:01:37 +0200 Subject: [PATCH 112/135] Updated requirments to fix Black issue The most recent release of Click, 8.1.0, is breaking Black so I pined click==8.0.1 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 0c0d11aa8..293ace33a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ azure-core==1.20.1 azure-storage-blob==12.9.0 +click==8.0.1 black==21.11b1 mkdocs-autorefs==0.3.0 mkdocs-material-extensions==1.0.3 From 4cde56dfbe8709f279bea6c3b65edb07b9444004 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 30 Mar 2022 16:06:45 +0200 Subject: [PATCH 113/135] =?UTF-8?q?=F0=9F=90=9B=20Added=20improvments=20in?= =?UTF-8?q?=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 16 +------- viadot/task_utils.py | 77 ++++++++++++++++++++++++++++++++++- viadot/utils.py | 63 +--------------------------- 3 files changed, 79 insertions(+), 77 deletions(-) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index df12eb9a4..abb62a849 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -4,9 +4,7 @@ import pandas as pd import prefect from typing import List -from viadot.utils import custom_state_handler -from prefect.engine.state import Failed, Success -from prefect import Task, Flow + from viadot.task_utils import ( chunk_df, df_get_data_types_task, @@ -174,15 +172,3 @@ def test_write_to_json(): write_to_json.run(dict, "dict.json") assert os.path.exists("dict.json") os.remove("dict.json") - - -def test_custom_state_handler(): - final_state = custom_state_handler( - tracked_obj="Flow", - old_state=Success, - new_state=Failed, - only_states=[Failed], - API_KEY=None, - ) - - assert final_state == prefect.engine.state.Failed diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 5a6fdbb45..618478fde 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -4,7 +4,8 @@ import shutil from datetime import datetime, timezone from pathlib import Path -from typing import List, Literal +from typing import TYPE_CHECKING, Any, Callable, Union, cast, List, Literal +from toolz import curry import pandas as pd import prefect @@ -13,8 +14,14 @@ from prefect import task from prefect.storage import Git from prefect.utilities import logging +from prefect.tasks.secrets import PrefectSecret +from prefect import Task, Flow +from sendgrid import SendGridAPIClient +from sendgrid.helpers.mail import Mail from visions.functional import infer_type from visions.typesets.complete_set import CompleteSet +from viadot.tasks import AzureKeyVaultSecret + logger = logging.get_logger() METADATA_COLUMNS = {"_viadot_downloaded_at_utc": "DATETIME"} @@ -342,6 +349,74 @@ def df_to_dataset( ) +@curry +def custom_mail_state_handler( + tracked_obj: Union["Flow", "Task"], + old_state: prefect.engine.state.State, + new_state: prefect.engine.state.State, + only_states: list = None, + local_api_key: str = None, + credentials_secret: str = None, + vault_name: str = None, +) -> prefect.engine.state.State: + + """ + Custom state handler configured to work with sendgrid. + Works as a standalone state handler, or can be called from within a custom state handler. + Args: + tracked_obj (Task or Flow): Task or Flow object the handler is registered with. 
+ old_state (State): previous state of tracked object. + new_state (State): new state of tracked object. + only_states ([State], optional): similar to `ignore_states`, but instead _only_ + notifies you if the Task / Flow is in a state from the provided list of `State` + classes. + local_api_key (str, optional): Api key from local config. + credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with API KEY. + vault_name (str, optional): Name of key vault. + Returns: State: the `new_state` object that was provided + + """ + if credentials_secret is None: + try: + credentials_secret = PrefectSecret("mail_notifier_api_key").run() + except ValueError: + pass + + if credentials_secret is not None: + credentials_str = AzureKeyVaultSecret( + credentials_secret, vault_name=vault_name + ).run() + api_key = json.loads(credentials_str).get("API_KEY") + elif local_api_key is not None: + api_key = local_config.get(local_api_key).get("API_KEY") + else: + print("Please provide API KEY") + + curr_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + only_states = only_states or [] + if only_states and not any( + [isinstance(new_state, included) for included in only_states] + ): + return new_state + url = prefect.client.Client().get_cloud_url( + "flow-run", prefect.context["flow_run_id"], as_user=False + ) + message = Mail( + from_email="notifications@dyvenia.com", + to_emails="notifications@dyvenia.com", + subject=f"The flow {tracked_obj.name} - Status {new_state}", + html_content=f"The flow {cast(str,tracked_obj.name)} FAILED at {curr_dt}. \ +
    <br><br>More details here: {url}</br></br>
", + ) + try: + sg = SendGridAPIClient(api_key) + response = sg.send(message) + except Exception as e: + print(e.message) + + return new_state + + class Git(Git): @property def git_clone_url(self): diff --git a/viadot/utils.py b/viadot/utils.py index a9da6028c..b98fc60e6 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -1,23 +1,16 @@ from typing import Any, Dict -from datetime import datetime -from typing import TYPE_CHECKING, Any, Callable, Union, cast - +from typing import Any import pendulum import prefect import requests import prefect.client -from prefect.engine.state import Failed + from prefect.utilities.graphql import EnumValue, with_args -from prefect import Task, Flow -from sendgrid import SendGridAPIClient -from sendgrid.helpers.mail import Mail -from toolz import curry from requests.adapters import HTTPAdapter from requests.exceptions import ConnectionError, HTTPError, ReadTimeout, Timeout from requests.packages.urllib3.util.retry import Retry from urllib3.exceptions import ProtocolError - from .exceptions import APIError @@ -126,55 +119,3 @@ def get_flow_last_run_date(flow_name: str) -> str: pendulum.parse(last_run_date_raw_format).format("YYYY-MM-DDTHH:MM:SS") + "Z" ) return last_run_date - - -@curry -def custom_state_handler( - tracked_obj: Union["Flow", "Task"], - old_state: "prefect.engine.state.State", - new_state: "prefect.engine.state.State", - only_states: list = None, - API_KEY: str = None, -) -> "prefect.engine.state.State": - """ - Custom state handler configured to work with sendgrid. - Works as a standalone state handler, or can be called from within a custom state handler. - Args: - - tracked_obj (Task or Flow): Task or Flow object the handler is registered with - - old_state (State): previous state of tracked object - - new_state (State): new state of tracked object - - only_states ([State], optional): similar to `ignore_states`, but instead _only_ - notifies you if the Task / Flow is in a state from the provided list of `State` - classes - Returns: - - State: the `new_state` object that was provided - Raises: - - ValueError: if the email notification fails for any reason - - """ - curr_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - only_states = only_states or [] - if only_states and not any( - [isinstance(new_state, included) for included in only_states] - ): - return new_state - url = prefect.client.Client().get_cloud_url( - "flow-run", prefect.context["flow_run_id"], as_user=False - ) - message = Mail( - from_email="notifications@dyvenia.com", - to_emails="notifications@dyvenia.com", - subject=f"The flow {tracked_obj.name} - Status {new_state}", - html_content=f"The flow {cast(str,tracked_obj.name)} FAILED at {curr_dt}. \ -
    <br><br>More details here: {url}</br></br>
", - ) - try: - sg = SendGridAPIClient(API_KEY) - response = sg.send(message) - print(response.status_code) - print(response.body) - print(response.headers) - except Exception as e: - print(e.message) - - return new_state From 2e7edf0730eff7291ca7804da5f5a0875dd0a94b Mon Sep 17 00:00:00 2001 From: winiar93 Date: Wed, 30 Mar 2022 16:48:05 +0200 Subject: [PATCH 114/135] =?UTF-8?q?=F0=9F=8E=A8=20=20Set=20Failed=20as=20d?= =?UTF-8?q?efault=20in=20only=5Fstates=20parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 17 +++++++++++++++++ viadot/task_utils.py | 3 ++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index abb62a849..771078193 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -4,6 +4,8 @@ import pandas as pd import prefect from typing import List +from prefect.engine.state import Failed, Success + from viadot.task_utils import ( chunk_df, @@ -15,6 +17,7 @@ dtypes_to_json_task, write_to_json, df_converts_bytes_to_int, + custom_mail_state_handler, ) @@ -172,3 +175,17 @@ def test_write_to_json(): write_to_json.run(dict, "dict.json") assert os.path.exists("dict.json") os.remove("dict.json") + + +def test_custom_state_handler(): + final_state = custom_mail_state_handler( + tracked_obj="Flow", + old_state=Success, + new_state=Failed, + only_states=[Failed], + local_api_key=None, + credentials_secret="SENDGRIND", + vault_name="azuwevelcrkeyv001s", + ) + + assert final_state == prefect.engine.state.Failed diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 618478fde..5dac2fcfe 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -15,6 +15,7 @@ from prefect.storage import Git from prefect.utilities import logging from prefect.tasks.secrets import PrefectSecret +from prefect.engine.state import Failed from prefect import Task, Flow from sendgrid import SendGridAPIClient from sendgrid.helpers.mail import Mail @@ -354,7 +355,7 @@ def custom_mail_state_handler( tracked_obj: Union["Flow", "Task"], old_state: prefect.engine.state.State, new_state: prefect.engine.state.State, - only_states: list = None, + only_states: list = [Failed], local_api_key: str = None, credentials_secret: str = None, vault_name: str = None, From 02688abf6fb53e04d7d0f34c550b7845ca5020d7 Mon Sep 17 00:00:00 2001 From: Mike <70263671+winiar93@users.noreply.github.com> Date: Thu, 31 Mar 2022 09:36:13 +0200 Subject: [PATCH 115/135] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4486bb938..e9e7b84ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added `custom_mail_state_handler` function that send mail notification using custom smtp server. - Added `MultipleFlows` flow class which enables running multiple flows in a given order. 
- Added new task `GetFlowNewDateRange` to change date range based on Prefect flows - Added `check_col_order` parameter in `ADLSToAzureSQL` From a61cf04d5102c64f26c7644018fb302b9ade2072 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 31 Mar 2022 11:10:48 +0200 Subject: [PATCH 116/135] =?UTF-8?q?=F0=9F=90=9B=20Corrected=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 3 +-- viadot/utils.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 5dac2fcfe..071ef776b 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -11,12 +11,11 @@ import prefect import pyarrow as pa import pyarrow.dataset as ds -from prefect import task +from prefect import task, Task, Flow from prefect.storage import Git from prefect.utilities import logging from prefect.tasks.secrets import PrefectSecret from prefect.engine.state import Failed -from prefect import Task, Flow from sendgrid import SendGridAPIClient from sendgrid.helpers.mail import Mail from visions.functional import infer_type diff --git a/viadot/utils.py b/viadot/utils.py index b98fc60e6..10e138f2f 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -1,5 +1,4 @@ from typing import Any, Dict -from typing import Any import pendulum import prefect import requests From da95621005e5a30f4e8ca0ffc0412a8e0800fbce Mon Sep 17 00:00:00 2001 From: m-paz Date: Thu, 31 Mar 2022 12:21:38 +0200 Subject: [PATCH 117/135] =?UTF-8?q?=E2=9E=95=20added=20sendgrid=20and=20go?= =?UTF-8?q?ogle-cloud?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 293ace33a..9816ffb1c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,5 @@ visions==0.7.4 sharepy==1.3.0 sql-metadata==2.3.0 duckdb==0.3.2 +sendgrid==6.9.7 +google-cloud==0.34.0 From 3cb92561b49d287006725db42f4f842e99f9208e Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 31 Mar 2022 15:26:38 +0200 Subject: [PATCH 118/135] =?UTF-8?q?=F0=9F=90=9B=20Added=20import=20local?= =?UTF-8?q?=5Fconfig?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 071ef776b..27402cd75 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -21,6 +21,7 @@ from visions.functional import infer_type from visions.typesets.complete_set import CompleteSet from viadot.tasks import AzureKeyVaultSecret +from viadot.config import local_config logger = logging.get_logger() From 3c9fdb5cce67d9b0211d25e09bd245bd84379414 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 31 Mar 2022 15:41:19 +0200 Subject: [PATCH 119/135] =?UTF-8?q?=F0=9F=8E=A8=20Formatted=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 2 ++ viadot/task_utils.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index d0a6caa83..e42139703 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -190,6 +190,8 @@ def test_custom_state_handler(): ) assert final_state == prefect.engine.state.Failed + + def test_df_clean_column(): data = { "col_1": ["a", "b \\r", "\t c", "d \r\n a"], diff --git a/viadot/task_utils.py b/viadot/task_utils.py 
index 748a5e3bf..7d74bfeff 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -416,6 +416,8 @@ def custom_mail_state_handler( print(e.message) return new_state + + @task def df_clean_column( df: pd.DataFrame, columns_to_clean: List[str] = None From 7eb600b98ea1cab36669d26860f6cc412692f664 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Thu, 31 Mar 2022 15:49:23 +0200 Subject: [PATCH 120/135] =?UTF-8?q?=F0=9F=8E=A8=20Edited=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_task_utils.py | 20 ++++++++++++++++++++ tests/unit/test_task_utils.py | 16 ---------------- 2 files changed, 20 insertions(+), 16 deletions(-) create mode 100644 tests/integration/tasks/test_task_utils.py diff --git a/tests/integration/tasks/test_task_utils.py b/tests/integration/tasks/test_task_utils.py new file mode 100644 index 000000000..65583a2f2 --- /dev/null +++ b/tests/integration/tasks/test_task_utils.py @@ -0,0 +1,20 @@ +from viadot.task_utils import custom_mail_state_handler +from prefect.tasks.secrets import PrefectSecret +from prefect.engine.state import Failed, Success +import prefect + + +def test_custom_state_handler(): + vault_name = PrefectSecret("AZURE_DEFAULT_KEYVAULT").run() + + final_state = custom_mail_state_handler( + tracked_obj="Flow", + old_state=Success, + new_state=Failed, + only_states=[Failed], + local_api_key=None, + credentials_secret="SENDGRIND", + vault_name=vault_name, + ) + + assert final_state == prefect.engine.state.Failed diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index e42139703..af442364a 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -4,7 +4,6 @@ import pandas as pd import prefect from typing import List -from prefect.engine.state import Failed, Success from viadot.task_utils import ( @@ -17,7 +16,6 @@ dtypes_to_json_task, write_to_json, df_converts_bytes_to_int, - custom_mail_state_handler, df_clean_column, ) @@ -178,20 +176,6 @@ def test_write_to_json(): os.remove("dict.json") -def test_custom_state_handler(): - final_state = custom_mail_state_handler( - tracked_obj="Flow", - old_state=Success, - new_state=Failed, - only_states=[Failed], - local_api_key=None, - credentials_secret="SENDGRIND", - vault_name="azuwevelcrkeyv001s", - ) - - assert final_state == prefect.engine.state.Failed - - def test_df_clean_column(): data = { "col_1": ["a", "b \\r", "\t c", "d \r\n a"], From 77f93ac782affd47e7785a936927d4b9894d98d1 Mon Sep 17 00:00:00 2001 From: winiar93 Date: Fri, 1 Apr 2022 13:43:22 +0200 Subject: [PATCH 121/135] =?UTF-8?q?=F0=9F=8E=A8=20Eddited=20structure=20of?= =?UTF-8?q?=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_task_utils.py | 3 +-- tests/unit/test_task_utils.py | 14 +++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/integration/tasks/test_task_utils.py b/tests/integration/tasks/test_task_utils.py index 65583a2f2..8a9a154cd 100644 --- a/tests/integration/tasks/test_task_utils.py +++ b/tests/integration/tasks/test_task_utils.py @@ -1,7 +1,6 @@ from viadot.task_utils import custom_mail_state_handler from prefect.tasks.secrets import PrefectSecret from prefect.engine.state import Failed, Success -import prefect def test_custom_state_handler(): @@ -17,4 +16,4 @@ def test_custom_state_handler(): vault_name=vault_name, ) - assert final_state == prefect.engine.state.Failed + assert final_state == 
Failed diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index af442364a..e1108b092 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -176,14 +176,14 @@ def test_write_to_json(): os.remove("dict.json") -def test_df_clean_column(): +def test_df_clean_column_all(): data = { - "col_1": ["a", "b \\r", "\t c", "d \r\n a"], - "col_2": ["a", "b \\r", "\t c", "d \r\n a"], + "col_1": ["a", "b\\r", "\tc", "d \r\n a"], + "col_2": ["a", "b\\r", "\tc", "d \r\n a"], } expected_output = { - "col_1": {0: "a", 1: "b ", 2: " c", 3: "d a"}, - "col_2": {0: "a", 1: "b ", 2: " c", 3: "d a"}, + "col_1": {0: "a", 1: "b", 2: "c", 3: "d a"}, + "col_2": {0: "a", 1: "b", 2: "c", 3: "d a"}, } df = pd.DataFrame.from_dict(data) output = df_clean_column.run(df).to_dict() @@ -193,11 +193,11 @@ def test_df_clean_column(): def test_df_clean_column_defined(): data = { "col_1": ["a", "b", "c", "d a"], - "col_2": ["a", "b \\r", "\t c", "d \r\n a"], + "col_2": ["a\t\r", "b\\r", "\tc", "d \r\n a"], } expected_output = { "col_1": {0: "a", 1: "b", 2: "c", 3: "d a"}, - "col_2": {0: "a", 1: "b ", 2: " c", 3: "d a"}, + "col_2": {0: "a", 1: "b", 2: "c", 3: "d a"}, } df = pd.DataFrame.from_dict(data) output = df_clean_column.run(df, ["col_2"]).to_dict() From aa4e3bc2d7e071d942afb69e6bd70dc1854699d5 Mon Sep 17 00:00:00 2001 From: trymzet Date: Sun, 3 Apr 2022 14:14:26 +0200 Subject: [PATCH 122/135] =?UTF-8?q?=F0=9F=90=9B=20Fix=20`df=5Fclean=5Fcolu?= =?UTF-8?q?mn`=20modifying=20input=20DataFrame?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index a14d90c1d..05dbfa644 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -2,7 +2,7 @@ import json import os import shutil -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from pathlib import Path from typing import List, Literal @@ -307,6 +307,10 @@ def df_converts_bytes_to_int(df: pd.DataFrame) -> pd.DataFrame: return df.applymap(lambda x: list(map(int, x)) if isinstance(x, bytes) else x) +@task( + max_retries=3, + retry_delay=timedelta(seconds=10), +) @task def df_to_dataset( df: pd.DataFrame, partitioning_flavor="hive", format="parquet", **kwargs @@ -347,15 +351,19 @@ def df_clean_column( df: pd.DataFrame, columns_to_clean: List[str] = None ) -> pd.DataFrame: """ - Function that remove special characters from data frame like escape symbols etc. + Function that removes special characters (such as escape symbols) + from a pandas DataFrame. Args: - df (pd.DataFrame): DataFrame - columns_to_clean (List[str]): List of columns. Defaults is None. + df (pd.DataFrame): The DataFrame to clean. + columns_to_clean (List[str]): A list of columns to clean. Defaults is None. 
Returns: - pd.DataFrame + pd.DataFrame: The cleaned DataFrame """ + + df = df.copy() + if columns_to_clean is None: df.replace( to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], @@ -363,16 +371,14 @@ def df_clean_column( regex=True, inplace=True, ) - else: - for x in columns_to_clean: - df[x].replace( + for col in columns_to_clean: + df[col].replace( to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["", ""], regex=True, inplace=True, ) - return df From 58e34fa0b46a78df94029b0554849639409a57ad Mon Sep 17 00:00:00 2001 From: trymzet Date: Sun, 3 Apr 2022 14:15:10 +0200 Subject: [PATCH 123/135] =?UTF-8?q?=F0=9F=93=9D=20Fix=20English=20in=20cha?= =?UTF-8?q?ngelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 63f365baf..7ac37c3fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- Added new function `df_clean_column` that clean data frame columns from special characters +- Added `df_clean_column` util task that removes special characters from a pandas DataFrame - Added `MultipleFlows` flow class which enables running multiple flows in a given order. -- Added new task `GetFlowNewDateRange` to change date range based on Prefect flows +- Added `GetFlowNewDateRange` task to change date range based on Prefect flows - Added `check_col_order` parameter in `ADLSToAzureSQL` - Added `ASEliteToDF` task and `ASEliteToADLS` flow - Added KeyVault support in `CloudForCustomers` tasks From e1501f6f4797ed0666b0ed535d7775f8d63c09d2 Mon Sep 17 00:00:00 2001 From: trymzet Date: Sun, 3 Apr 2022 20:58:57 +0200 Subject: [PATCH 124/135] =?UTF-8?q?=F0=9F=90=9B=20Fix=20typo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 05dbfa644..2890cdc70 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -311,7 +311,6 @@ def df_converts_bytes_to_int(df: pd.DataFrame) -> pd.DataFrame: max_retries=3, retry_delay=timedelta(seconds=10), ) -@task def df_to_dataset( df: pd.DataFrame, partitioning_flavor="hive", format="parquet", **kwargs ) -> None: From 6e1b1079d2b4b3e64a2190081a166552cc65279c Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 4 Apr 2022 14:07:50 +0200 Subject: [PATCH 125/135] =?UTF-8?q?=F0=9F=8E=A8=20Corrected=20code=20revie?= =?UTF-8?q?w?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 11 +++++++---- viadot/utils.py | 1 - 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 7d74bfeff..5f5aab652 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -377,6 +377,9 @@ def custom_mail_state_handler( Returns: State: the `new_state` object that was provided """ + + logger = prefect.context.get("logger") + if credentials_secret is None: try: credentials_secret = PrefectSecret("mail_notifier_api_key").run() @@ -391,7 +394,7 @@ def custom_mail_state_handler( elif local_api_key is not None: api_key = local_config.get(local_api_key).get("API_KEY") else: - print("Please provide API KEY") + raise Exception("Please provide API KEY") curr_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S") only_states = only_states or [] @@ -410,10 +413,10 @@ def 
custom_mail_state_handler(
         html_content=f"The flow {cast(str,tracked_obj.name)} FAILED at {curr_dt}. \
More details here: {url}
", ) try: - sg = SendGridAPIClient(api_key) - response = sg.send(message) + send_grid = SendGridAPIClient(api_key) + response = send_grid.send(message) except Exception as e: - print(e.message) + raise e return new_state diff --git a/viadot/utils.py b/viadot/utils.py index 10e138f2f..834b63fb5 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -2,7 +2,6 @@ import pendulum import prefect import requests -import prefect.client from prefect.utilities.graphql import EnumValue, with_args from requests.adapters import HTTPAdapter From ac5ab6024c02cb7c3d2d5b60f8596aa7d1c6fb1d Mon Sep 17 00:00:00 2001 From: winiar93 Date: Mon, 4 Apr 2022 14:18:58 +0200 Subject: [PATCH 126/135] =?UTF-8?q?=F0=9F=8E=A8=20Deleted=20unused=20line?= =?UTF-8?q?=20of=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index dd66f2f3a..eedd2216d 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -381,8 +381,6 @@ def custom_mail_state_handler( """ - logger = prefect.context.get("logger") - if credentials_secret is None: try: credentials_secret = PrefectSecret("mail_notifier_api_key").run() From c40d9c34d69ec36123a83295d68588d414fa2349 Mon Sep 17 00:00:00 2001 From: m-paz <59165045+m-paz@users.noreply.github.com> Date: Tue, 5 Apr 2022 10:32:38 +0200 Subject: [PATCH 127/135] Update CHANGELOG.md --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5195ee9fb..db0e2183b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- Added `custom_mail_state_handler` function that send mail notification using custom smtp server. -- Added new function `df_clean_column` that clean data frame columns from special characters +- Added `custom_mail_state_handler` function that sends mail notification using custom smtp server. +- Added new function `df_clean_column` that cleans data frame columns from special characters - Added `df_clean_column` util task that removes special characters from a pandas DataFrame - Added `MultipleFlows` flow class which enables running multiple flows in a given order. 
- Added `GetFlowNewDateRange` task to change date range based on Prefect flows From b11e366aabe1f2f3a5abf5f96426da8a3a805639 Mon Sep 17 00:00:00 2001 From: m-paz <59165045+m-paz@users.noreply.github.com> Date: Wed, 6 Apr 2022 10:17:23 +0200 Subject: [PATCH 128/135] Added google-auth to the requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9816ffb1c..1f1f6568d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,5 +28,6 @@ visions==0.7.4 sharepy==1.3.0 sql-metadata==2.3.0 duckdb==0.3.2 -sendgrid==6.9.7 google-cloud==0.34.0 +google-auth==2.6.2 +sendgrid==6.9.7 From 4357ed56d8d566f58be320165a411f95d3b9390f Mon Sep 17 00:00:00 2001 From: m-paz Date: Wed, 6 Apr 2022 14:34:20 +0200 Subject: [PATCH 129/135] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20hardcoding=20iss?= =?UTF-8?q?ue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index eedd2216d..846bee73b 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -362,6 +362,8 @@ def custom_mail_state_handler( local_api_key: str = None, credentials_secret: str = None, vault_name: str = None, + from_email: str = None, + to_emails: str = None, ) -> prefect.engine.state.State: """ @@ -377,6 +379,8 @@ def custom_mail_state_handler( local_api_key (str, optional): Api key from local config. credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with API KEY. vault_name (str, optional): Name of key vault. + from_email (str): Sender mailbox address. + to_emails (str): Receiver mailbox address. Returns: State: the `new_state` object that was provided """ @@ -407,8 +411,8 @@ def custom_mail_state_handler( "flow-run", prefect.context["flow_run_id"], as_user=False ) message = Mail( - from_email="notifications@dyvenia.com", - to_emails="notifications@dyvenia.com", + from_email=from_email, + to_emails=to_emails, subject=f"The flow {tracked_obj.name} - Status {new_state}", html_content=f"The flow {cast(str,tracked_obj.name)} FAILED at {curr_dt}. \
More details here: {url}
", From 80d7c1dbe86cf06a613af060dbfbc8e89fc472d1 Mon Sep 17 00:00:00 2001 From: trymzet Date: Wed, 6 Apr 2022 19:26:38 +0200 Subject: [PATCH 130/135] =?UTF-8?q?=E2=9C=A8=20Add=20`AzureSQLUpsert`=20ta?= =?UTF-8?q?sk?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/tasks/__init__.py | 1 + viadot/tasks/azure_sql.py | 103 ++++++++++++++++++++- viadot/utils.py | 183 +++++++++++++++++++++++++++++++++++++- 4 files changed, 286 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db0e2183b..2f5a85e44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `chunksize` parameter to `BCPTask` task to allow more control over the load process - Added support for SQL Server's custom `datetimeoffset` type - Added `AzureSQLToDF` task +- Added `AzureSQLUpsert` task ### Changed - Changed the base class of `AzureSQL` to `SQLServer` diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index 86aee449b..eee05fe43 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -18,6 +18,7 @@ CreateTableFromBlob, AzureSQLToDF, CheckColumnOrder, + AzureSQLUpsert, ) from .bcp import BCPTask from .github import DownloadGitHubFile diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index a6133c688..6e2183c5e 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -10,6 +10,11 @@ from ..exceptions import ValidationError from ..sources import AzureSQL +from ..utils import ( + build_merge_query, + gen_bulk_insert_query_from_df, + get_sql_server_table_dtypes, +) from .azure_key_vault import AzureKeyVaultSecret @@ -322,8 +327,12 @@ def run( azure_sql = AzureSQL(credentials=credentials) df = azure_sql.to_df(query) + nrows = df.shape[0] + ncols = df.shape[1] - self.logger.info(f"Successfully downloaded data to a DataFrame.") + self.logger.info( + f"Successfully downloaded {nrows} rows and {ncols} columns of data to a DataFrame." + ) return df @@ -417,3 +426,95 @@ def run( else: self.logger.info("The table will be replaced.") return df + + +class AzureSQLUpsert(Task): + """Task for upserting data from a pandas DataFrame into AzureSQL. + + Args: + schema (str, optional): The schema where the data should be upserted. Defaults to None. + table (str, optional): The table where the data should be upserted. Defaults to None. + on (str, optional): The field on which to merge (upsert). Defaults to None. + credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary + vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + """ + + def __init__( + self, + schema: str = None, + table: str = None, + on: str = None, + credentials_secret: str = None, + *args, + **kwargs, + ): + self.schema = schema + self.table = table + self.on = on + self.credentials_secret = credentials_secret + super().__init__(name="azure_sql_upsert", *args, **kwargs) + + @defaults_from_attrs( + "schema", + "table", + "on", + "credentials_secret", + ) + def run( + self, + df: pd.DataFrame, + schema: str = None, + table: str = None, + on: str = None, + credentials_secret: str = None, + vault_name: str = None, + ): + """Upsert data from a pandas DataFrame into AzureSQL using a temporary staging table. + + Args: + df (pd.DataFrame): The DataFrame to upsert. + schema (str, optional): The schema where the data should be upserted. Defaults to None. 
+ table (str, optional): The table where the data should be upserted. Defaults to None. + on (str, optional): The field on which to merge (upsert). Defaults to None. + credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary + vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + """ + + credentials = get_credentials(credentials_secret, vault_name=vault_name) + azure_sql = AzureSQL(credentials=credentials) + + # Create a temporary staging table. + # Hashtag marks a temp table in SQL server. + stg_table = "#" + "stg_" + table + dtypes = get_sql_server_table_dtypes( + schema=schema, table=table, con=azure_sql.con + ) + created = azure_sql.create_table( + schema=schema, table=stg_table, dtypes=dtypes, if_exists="fail" + ) + + # Insert data into the temp table + stg_table_fqn = f"{schema}.{stg_table}" + insert_query = gen_bulk_insert_query_from_df(df, table_fqn=stg_table_fqn) + inserted = azure_sql.run(insert_query) + + # Upsert into prod table + merge_query = build_merge_query( + stg_schema=schema, + stg_table=stg_table, + schema=schema, + table=table, + primary_key=on, + con=azure_sql.con, + ) + + merged = azure_sql.run(merge_query) + + if merged: + rows = df.shape[0] + table_fqn = f"{schema}.{table}" + self.logger.info( + f"Successfully upserted {rows} rows of data into table '{table_fqn}'." + ) + + return True diff --git a/viadot/utils.py b/viadot/utils.py index 834b63fb5..590b59e01 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -1,8 +1,11 @@ +import re from typing import Any, Dict + +import pandas as pd import pendulum import prefect +import pyodbc import requests - from prefect.utilities.graphql import EnumValue, with_args from requests.adapters import HTTPAdapter from requests.exceptions import ConnectionError, HTTPError, ReadTimeout, Timeout @@ -117,3 +120,181 @@ def get_flow_last_run_date(flow_name: str) -> str: pendulum.parse(last_run_date_raw_format).format("YYYY-MM-DDTHH:MM:SS") + "Z" ) return last_run_date + + +def get_sql_server_table_dtypes( + table, con: pyodbc.Connection, schema: str = None +) -> dict: + """Get column names and types from a SQL Server database table. + + Args: + table (_type_): The table for which to fetch dtypes. + con (pyodbc.Connection): The connection to the database where the table is located. + schema (str, optional): The schema where the table is located. Defaults to None. + + Returns: + dict: A dictionary of the form {column_name: dtype, column_name2: dtype2, ...}. 
+ """ + + query = f""" + SELECT + col.name, + t.name, + col.max_length + FROM sys.tables AS tab + INNER JOIN sys.columns AS col + ON tab.object_id = col.object_id + LEFT JOIN sys.types AS t + ON col.user_type_id = t.user_type_id + WHERE tab.name = '{table}' + AND schema_name(tab.schema_id) = '{schema}' + ORDER BY column_id; + """ + cursor = con.cursor() + query_result = cursor.execute(query).fetchall() + cursor.close() + + dtypes = {} + for row in query_result: + column_name = row[0] + dtype = row[1] + length = row[2] + if dtype == "varchar": + dtypes[column_name] = dtype + f"({length})" + else: + dtypes[column_name] = dtype + + return dtypes + + +def _cast_df_cols(df): + + df = df.replace({"False": False, "True": True}) + + datetime_cols = (col for col, dtype in df.dtypes.items() if dtype.kind == "M") + bool_cols = (col for col, dtype in df.dtypes.items() if dtype.kind == "b") + int_cols = (col for col, dtype in df.dtypes.items() if dtype.kind == "i") + + for col in datetime_cols: + df[col] = df[col].dt.strftime("%Y-%m-%d %H:%M:%S+00:00") + + for col in bool_cols: + df[col] = df[col].astype(pd.Int64Dtype()) + + for col in int_cols: + df[col] = df[col].astype(pd.Int64Dtype()) + + return df + + +def build_merge_query( + stg_schema: str, + stg_table: str, + schema: str, + table: str, + primary_key: str, + con: pyodbc.Connection, +) -> str: + """ + Build a merge query for the simplest possible upsert scenario: + - updating and inserting all fields + - merging on a single column, which has the same name in both tables + + Args: + stg_schema (str): The schema where the staging table is located. + stg_table (str): The table with new/updated data. + schema (str): The schema where the table is located. + table (str): The table to merge into. + primary_key (str): The column on which to merge. + con (pyodbc.Connection) The connection to the database on which the + query will be executed. + """ + + # Get column names + columns_query = f""" + SELECT + col.name + FROM sys.tables AS tab + INNER JOIN sys.columns AS col + ON tab.object_id = col.object_id + WHERE tab.name = '{table}' + AND schema_name(tab.schema_id) = '{schema}' + ORDER BY column_id; + """ + cursor = con.cursor() + columns_query_result = cursor.execute(columns_query).fetchall() + cursor.close() + + columns = [tup[0] for tup in columns_query_result] + columns_stg_fqn = [f"stg.{col}" for col in columns] + + # Build merge query + update_pairs = [f"existing.{col} = stg.{col}" for col in columns] + merge_query = f""" + MERGE INTO {schema}.{table} existing + USING {stg_schema}.{stg_table} stg + ON stg.{primary_key} = existing.{primary_key} + WHEN MATCHED + THEN UPDATE SET {", ".join(update_pairs)} + WHEN NOT MATCHED + THEN INSERT({", ".join(columns)}) + VALUES({", ".join(columns_stg_fqn)}); + """ + return merge_query + + +def gen_bulk_insert_query_from_df( + df: pd.DataFrame, table_fqn: str, **kwargs +) -> str: + """ + Converts a DataFrame to a bulk INSERT query. + + Args: + df (pd.DataFrame): The DataFrame which data should be put into the INSERT query. + table_fqn (str): The fully qualified name (schema.table) of the table to be inserted into. + + Returns: + str: A bulk insert query that will insert all data from `df` into `table_fqn`. 
+ + Examples: + >>> data = [(1, "_suffixnan", 1), (2, "Noneprefix", 0), (3, "fooNULLbar", 1, 2.34)] + >>> df = pd.DataFrame(data, columns=["id", "name", "is_deleted", "balance"]) + >>> df + id name is_deleted balance + 0 1 _suffixnan 1 NaN + 1 2 Noneprefix 0 NaN + 2 3 fooNULLbar 1 2.34 + >>> query = gen_bulk_insert_query_from_df(df, "users", status="APPROVED", address=None) + >>> print(query) + INSERT INTO users (id, name, is_deleted, balance, status, address) + VALUES (1, '_suffixnan', 1, NULL, 'APPROVED', NULL), + (2, 'Noneprefix', 0, NULL, 'APPROVED', NULL), + (3, 'fooNULLbar', 1, 2.34, 'APPROVED', NULL); + """ + df = df.copy().assign(**kwargs) + df = _cast_df_cols(df) + + columns = ", ".join(df.columns) + + tuples_raw = df.itertuples(index=False, name=None) + # Escape values with single quotes inside by adding another single quote + # ("val'ue" -> "val''ue"). + # As Python wraps such strings in double quotes, which are interpreted as + # column names by SQL Server, we later also replace the double quotes with + # single quotes. + tuples_escaped = [ + tuple( + f"""{value.replace("'", "''")}""" if type(value) == str else value + for value in row + ) + for row in tuples_raw + ] + tuples = map(str, tuple(tuples_escaped)) + + # Change Nones to NULLs + none_nan_pattern = r"(?<=\W)(nan|None)(?=\W)" + values = re.sub(none_nan_pattern, "NULL", (",\n" + " " * 7).join(tuples)) + + # Change the double quotes into single quotes, as explained above. + values_clean = values.replace('"', "'") + return f"INSERT INTO {table_fqn} ({columns})\n\nVALUES {values_clean}" From 7834f61630df3ff4b68c4dfd9ea0196f6f6ca3bd Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 7 Apr 2022 00:52:47 +0200 Subject: [PATCH 131/135] =?UTF-8?q?=F0=9F=8E=A8=20Fix=20formatting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/viadot/utils.py b/viadot/utils.py index 590b59e01..a10d81fe8 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -243,9 +243,7 @@ def build_merge_query( return merge_query -def gen_bulk_insert_query_from_df( - df: pd.DataFrame, table_fqn: str, **kwargs -) -> str: +def gen_bulk_insert_query_from_df(df: pd.DataFrame, table_fqn: str, **kwargs) -> str: """ Converts a DataFrame to a bulk INSERT query. @@ -271,6 +269,11 @@ def gen_bulk_insert_query_from_df( (2, 'Noneprefix', 0, NULL, 'APPROVED', NULL), (3, 'fooNULLbar', 1, 2.34, 'APPROVED', NULL); """ + if df.shape[1] == 1: + raise NotImplementedError( + "Currently, this function only handles DataFrames with at least two columns." + ) + df = df.copy().assign(**kwargs) df = _cast_df_cols(df) From ef84e65aabb515b4fcbde8d3ff1c9ab2f4b6dced Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 7 Apr 2022 10:08:37 +0200 Subject: [PATCH 132/135] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Improve=20reliabil?= =?UTF-8?q?ity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/azure_sql.py | 6 ++++++ viadot/utils.py | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index 6e2183c5e..af63cac5f 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -480,6 +480,12 @@ def run( vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. 
""" + if not table: + raise ValueError("'table' was not provided.") + + if not on: + raise ValueError("'on' was not provided.") + credentials = get_credentials(credentials_secret, vault_name=vault_name) azure_sql = AzureSQL(credentials=credentials) diff --git a/viadot/utils.py b/viadot/utils.py index a10d81fe8..7fa14dc22 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -299,5 +299,7 @@ def gen_bulk_insert_query_from_df(df: pd.DataFrame, table_fqn: str, **kwargs) -> values = re.sub(none_nan_pattern, "NULL", (",\n" + " " * 7).join(tuples)) # Change the double quotes into single quotes, as explained above. - values_clean = values.replace('"', "'") + # Note this pattern should be improved at a later time to cover more edge cases. + pattern = r'(")(.*)(")(\)|,)' + values_clean = re.sub(pattern, r"'\2'\4", values) return f"INSERT INTO {table_fqn} ({columns})\n\nVALUES {values_clean}" From c9300e5b26a9168ba26ff54251d3fded1979d4be Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 7 Apr 2022 10:24:57 +0200 Subject: [PATCH 133/135] =?UTF-8?q?=E2=9C=85=20Add=20tests=20for=20insert?= =?UTF-8?q?=20query=20util?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_utils.py | 68 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 tests/unit/test_utils.py diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py new file mode 100644 index 000000000..02957b0d5 --- /dev/null +++ b/tests/unit/test_utils.py @@ -0,0 +1,68 @@ +from viadot.utils import gen_bulk_insert_query_from_df +import pandas as pd + + +def test_single_quotes_inside(): + TEST_VALUE = "a'b" + df1 = pd.DataFrame( + { + "a": [ + TEST_VALUE, + ], + "b": ["a"], + } + ) + test_insert_query = gen_bulk_insert_query_from_df( + df1, table_fqn="test_schema.test_table" + ) + TEST_VALUE_ESCAPED = "'a''b'" + assert ( + test_insert_query + == f"""INSERT INTO test_schema.test_table (a, b) + +VALUES ({TEST_VALUE_ESCAPED}, 'a')""" + ), test_insert_query + + +def test_single_quotes_outside(): + TEST_VALUE = "'a'" + df1 = pd.DataFrame( + { + "a": [ + TEST_VALUE, + ], + "b": ["b"], + } + ) + test_insert_query = gen_bulk_insert_query_from_df( + df1, table_fqn="test_schema.test_table" + ) + TEST_VALUE_ESCAPED = "'''a'''" + assert ( + test_insert_query + == f"""INSERT INTO test_schema.test_table (a, b) + +VALUES ({TEST_VALUE_ESCAPED}, 'b')""" + ), test_insert_query + + +def test_double_quotes_inside(): + TEST_VALUE = 'a "b"' + df1 = pd.DataFrame( + { + "a": [ + TEST_VALUE, + ], + "b": ["c"], + } + ) + test_insert_query = gen_bulk_insert_query_from_df( + df1, table_fqn="test_schema.test_table" + ) + TEST_VALUE_ESCAPED = """'a "b"'""" + assert ( + test_insert_query + == f"""INSERT INTO test_schema.test_table (a, b) + +VALUES ({TEST_VALUE_ESCAPED}, 'c')""" + ), test_insert_query From fe81071933d21843429225ab80200b0610c72cdf Mon Sep 17 00:00:00 2001 From: trymzet Date: Thu, 7 Apr 2022 11:44:45 +0200 Subject: [PATCH 134/135] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Improved=20variabl?= =?UTF-8?q?e=20name?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/utils.py b/viadot/utils.py index 7fa14dc22..ad5d6852a 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -300,6 +300,6 @@ def gen_bulk_insert_query_from_df(df: pd.DataFrame, table_fqn: str, **kwargs) -> # Change the double quotes into single quotes, as explained above. 
# Note this pattern should be improved at a later time to cover more edge cases. - pattern = r'(")(.*)(")(\)|,)' - values_clean = re.sub(pattern, r"'\2'\4", values) + double_quotes_pattern = r'(")(.*)(")(\)|,)' + values_clean = re.sub(double_quotes_pattern, r"'\2'\4", values) return f"INSERT INTO {table_fqn} ({columns})\n\nVALUES {values_clean}" From df4270a334c75f48f91ca49189273c30bc4b276e Mon Sep 17 00:00:00 2001 From: m-paz Date: Thu, 7 Apr 2022 14:18:33 +0200 Subject: [PATCH 135/135] =?UTF-8?q?=F0=9F=93=9D=20Updated=20changelog=20be?= =?UTF-8?q?fore=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f5a85e44..cb016bfe1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] + +## [0.4.0] - 2022-04-07 ### Added - Added `custom_mail_state_handler` function that sends mail notification using custom smtp server. - Added new function `df_clean_column` that cleans data frame columns from special characters @@ -13,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `MultipleFlows` flow class which enables running multiple flows in a given order. - Added `GetFlowNewDateRange` task to change date range based on Prefect flows - Added `check_col_order` parameter in `ADLSToAzureSQL` -- Added `ASEliteToDF` task and `ASEliteToADLS` flow +- Added new source `ASElite` - Added KeyVault support in `CloudForCustomers` tasks - Added `SQLServer` source - Added `DuckDBToDF` task
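Usage sketch for the `AzureSQLUpsert` task introduced in PATCH 130/135. This is a minimal, hypothetical example assuming Prefect 1.x and the viadot 0.4.0 API shown in these patches; the schema, table, merge key, and Key Vault secret names below are illustrative placeholders, not values taken from the patch series.

# Hypothetical usage sketch, not part of the patches above.
# Placeholder names: "sandbox", "example_table", "id", "AZURE-SQL".
import pandas as pd
from prefect import Flow

from viadot.tasks import AzureSQLUpsert

upsert = AzureSQLUpsert(
    schema="sandbox",                # placeholder target schema
    table="example_table",           # placeholder target table; must already exist
    on="id",                         # single merge key present in both tables
    credentials_secret="AZURE-SQL",  # placeholder Key Vault secret with DB credentials
)

with Flow("example_upsert") as flow:
    # A constant DataFrame stands in for an upstream extraction task.
    df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
    upsert(df=df)

# flow.run()  # requires network access to the database and the Key Vault secret

Under the hood the task stages the DataFrame in a temporary `#stg_*` table created with the target table's column types, bulk-inserts the rows via `gen_bulk_insert_query_from_df`, and merges them into the target with the `MERGE` statement built by `build_merge_query`, updating rows matched on the `on` column and inserting the rest.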