From 751f26eb38534c1b5b51add9b0a85319c6a5b0e5 Mon Sep 17 00:00:00 2001 From: Michael Kofi Armah Date: Thu, 31 Oct 2024 23:42:22 +0000 Subject: [PATCH] [Integration][Gitlab] | Fix unintended deletion of file kinds (#1104) # Description What - Addresses unintended deletion of file-kind entities by enforcing the use of advanced search functionality for project file lookups. Why - Entities created with a file-kind type get unexpectedly deleted due to inconsistent search results. How - This fix introduces an explicit search_type parameter set to advanced within project.search, ensuring that all relevant files are accurately located during searches, thereby preventing inadvertent deletions during resource updates. ## Type of change Please leave one option from the following and delete the rest: - [x] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] New Integration (non-breaking change which adds a new integration) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [ ] Non-breaking change (fix of existing functionality that will not change current behavior) - [ ] Documentation (added/updated documentation)

All tests should be run against the Port production environment (using a testing org).

### Core testing checklist - [ ] Integration able to create all default resources from scratch - [ ] Resync finishes successfully - [ ] Resync able to create entities - [ ] Resync able to update entities - [ ] Resync able to detect and delete entities - [ ] Scheduled resync able to abort existing resync and start a new one - [ ] Tested with at least 2 integrations from scratch - [ ] Tested with Kafka and Polling event listeners - [ ] Tested deletion of entities that don't pass the selector ### Integration testing checklist - [x] Integration able to create all default resources from scratch - [x] Resync able to create entities - [x] Resync able to update entities - [x] Resync able to detect and delete entities - [x] Resync finishes successfully - [x] If new resource kind is added or updated in the integration, add example raw data, mapping and expected result to the `examples` folder in the integration directory. - [x] If resource kind is updated, run the integration with the example data and check if the expected result is achieved - [x] If new resource kind is added or updated, validate that live-events for that resource are working as expected - [ ] Docs PR link [here](#) ### Preflight checklist - [ ] Handled rate limiting - [ ] Handled pagination - [ ] Implemented the code in async - [ ] Support Multi account ## Screenshots Include screenshots from your environment showing how the resources of the integration will look. ## API Documentation Provide links to the API documentation used for this integration. 
--------- Co-authored-by: tankilevitch Co-authored-by: Tom Tankilevitch <59158507+Tankilevitch@users.noreply.github.com> --- integrations/gitlab/CHANGELOG.md | 9 +++ .../gitlab_integration/git_integration.py | 1 - .../gitlab_integration/gitlab_service.py | 55 ++++++++++++++----- .../gitlab/gitlab_integration/ocean.py | 33 ++++++++--- integrations/gitlab/pyproject.toml | 2 +- 5 files changed, 77 insertions(+), 23 deletions(-) diff --git a/integrations/gitlab/CHANGELOG.md b/integrations/gitlab/CHANGELOG.md index f695c1660e..cf361a7279 100644 --- a/integrations/gitlab/CHANGELOG.md +++ b/integrations/gitlab/CHANGELOG.md @@ -7,6 +7,15 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm +0.1.135 (2024-10-31) +==================== + +### Improvements + +- Explicitly declaring the file search in projects to use the advanced search type, in cases where the default search in gitlab changes. +- Enhanced more verbosity on file kind + + 0.1.134 (2024-10-23) ==================== diff --git a/integrations/gitlab/gitlab_integration/git_integration.py b/integrations/gitlab/gitlab_integration/git_integration.py index a3281c8e0c..1a5f7ac456 100644 --- a/integrations/gitlab/gitlab_integration/git_integration.py +++ b/integrations/gitlab/gitlab_integration/git_integration.py @@ -3,7 +3,6 @@ from gitlab.v4.objects import Project from loguru import logger from pydantic import Field, BaseModel - from gitlab_integration.core.async_fetcher import AsyncFetcher from gitlab_integration.core.entities import ( FILE_PROPERTY_PREFIX, diff --git a/integrations/gitlab/gitlab_integration/gitlab_service.py b/integrations/gitlab/gitlab_integration/gitlab_service.py index db11e3b7f3..b70311b009 100644 --- a/integrations/gitlab/gitlab_integration/gitlab_service.py +++ b/integrations/gitlab/gitlab_integration/gitlab_service.py @@ -160,10 +160,11 @@ async def get_all_file_paths( ] async def search_files_in_project( - self, - project: Project, - path: str | List[str], + self, 
project: Project, path: str | List[str] ) -> AsyncIterator[list[dict[str, Any]]]: + logger.info( + f"Searching project {project.path_with_namespace} for files with path pattern {path}" + ) paths = [path] if not isinstance(path, list) else path for path in paths: file_pattern = os.path.basename(path) @@ -175,23 +176,39 @@ async def search_files_in_project( project.search, scope="blobs", search=f"filename:{file_pattern}", + search_type="advanced", retry_transient_errors=True, ): logger.info( f"Found {len(files)} files in project {project.path_with_namespace} with file pattern {file_pattern}, filtering all that don't match path pattern {path}" ) files = typing.cast(Union[GitlabList, List[Dict[str, Any]]], files) - tasks = [ - self.get_and_parse_single_file( - project, file["path"], project.default_branch - ) - for file in files - if does_pattern_apply(path, file["path"]) - ] + tasks = [] + for file in files: + if does_pattern_apply(path, file["path"]): + tasks.append( + self.get_and_parse_single_file( + project, file["path"], project.default_branch + ) + ) + else: + logger.debug( + f"Skipping file {file['path']} as it doesn't match path pattern {path} for project {project.path_with_namespace}" + ) + logger.info( + f"Found {len(tasks)} files in project {project.path_with_namespace} that match path pattern {path}" + ) parsed_files = await asyncio.gather(*tasks) files_with_content = [file for file in parsed_files if file] if files_with_content: + logger.info( + f"Found {len(files_with_content)} files with content for project {project.path_with_namespace} for path {path}" + ) yield files_with_content + else: + logger.info( + f"No files with content found for project {project.path_with_namespace} for path {path}" + ) async def _get_entities_from_git( self, project: Project, file_path: str | List[str], sha: str, ref: str @@ -693,7 +710,7 @@ async def get_entities_diff( return entities_before, entities_after def _parse_file_content( - self, file: ProjectFile + self, project: 
Project, file: ProjectFile ) -> Union[str, dict[str, Any], list[Any]] | None: """ Process a file from a project. If the file is a JSON or YAML, it will be parsed, otherwise the raw content will be returned @@ -702,18 +719,30 @@ def _parse_file_content( """ if file.size > MAX_ALLOWED_FILE_SIZE_IN_BYTES: logger.warning( - f"File {file.file_path} is too large to be processed. Maximum size allowed is 1MB. Actual size of file: {file.size}" + f"File {file.file_path} in {project.path_with_namespace} is too large to be processed. " + f"Maximum size allowed is 1MB. Actual size of file: {file.size}" ) return None try: return json.loads(file.decode()) except json.JSONDecodeError: try: + logger.debug( + f"Trying to process file {file.file_path} in project {project.path_with_namespace} as YAML" + ) documents = list(yaml.load_all(file.decode(), Loader=yaml.SafeLoader)) if not documents: + logger.debug( + f"Failed to parse file {file.file_path} in project {project.path_with_namespace} as YAML," + f" returning raw content" + ) return file.decode().decode("utf-8") return documents if len(documents) > 1 else documents[0] except yaml.YAMLError: + logger.debug( + f"Failed to parse file {file.file_path} in project {project.path_with_namespace} as JSON or YAML," + f" returning raw content" + ) return file.decode().decode("utf-8") async def get_and_parse_single_file( @@ -730,7 +759,7 @@ async def get_and_parse_single_file( f"Fetched file {file_path} in project {project.path_with_namespace}" ) project_file = typing.cast(ProjectFile, project_file) - parsed_file = self._parse_file_content(project_file) + parsed_file = self._parse_file_content(project, project_file) project_file_dict = project_file.asdict() if not parsed_file: diff --git a/integrations/gitlab/gitlab_integration/ocean.py b/integrations/gitlab/gitlab_integration/ocean.py index 45bbde1784..31f6c1be97 100644 --- a/integrations/gitlab/gitlab_integration/ocean.py +++ b/integrations/gitlab/gitlab_integration/ocean.py @@ -185,6 
+185,7 @@ async def resync_files(kind: str) -> ASYNC_GENERATOR_RESYNC_TYPE: "GitLabFilesResourceConfig", event.resource_config ) if not isinstance(gitlab_resource_config, GitLabFilesResourceConfig): + logger.error("Invalid resource config type for GitLab files resync") return selector = gitlab_resource_config.selector @@ -201,20 +202,36 @@ async def resync_files(kind: str) -> ASYNC_GENERATOR_RESYNC_TYPE: ): projects_processed_in_full_batch += len(projects_batch) logger.info( - f"Processing projects files for {projects_processed_in_full_batch}/{len(projects)} projects in batch" + f"Processing project files for {projects_processed_in_full_batch}/{len(projects)} " + f"projects in batch: {[project.path_with_namespace for project in projects_batch]}" ) - tasks = [ - service.search_files_in_project(project, selector.files.path) - for project in projects_batch - if service.should_process_project(project, selector.files.repos) - ] + tasks = [] + matching_projects = [] + for project in projects_batch: + if service.should_process_project(project, selector.files.repos): + matching_projects.append(project) + tasks.append( + service.search_files_in_project( + project, selector.files.path + ) + ) if tasks: - logger.info(f"Found {len(tasks)} relevant projects in batch") + logger.info( + f"Found {len(tasks)} relevant projects in batch, projects: {[project.path_with_namespace for project in matching_projects]}" + ) async for batch in stream_async_iterators_tasks(*tasks): yield batch else: - logger.info("No relevant projects were found in batch, skipping it") + logger.info( + f"No relevant projects were found in batch for path '{selector.files.path}', skipping projects: {[project.path_with_namespace for project in projects_batch]}" + ) + logger.info( + f"Finished Processing project files for {projects_processed_in_full_batch}/{len(projects)}" + ) + logger.info( + f"Finished processing all projects for path '{selector.files.path}'" + ) @ocean.on_resync(ObjectKind.MERGE_REQUEST) diff 
--git a/integrations/gitlab/pyproject.toml b/integrations/gitlab/pyproject.toml index 1979a5304d..0f44db8100 100644 --- a/integrations/gitlab/pyproject.toml +++ b/integrations/gitlab/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gitlab" -version = "0.1.134" +version = "0.1.135" description = "Gitlab integration for Port using Port-Ocean Framework" authors = ["Yair Siman-Tov "]