diff --git a/integrations/gitlab/CHANGELOG.md b/integrations/gitlab/CHANGELOG.md
index f695c1660e..cf361a7279 100644
--- a/integrations/gitlab/CHANGELOG.md
+++ b/integrations/gitlab/CHANGELOG.md
@@ -7,6 +7,15 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
+0.1.135 (2024-10-31)
+====================
+
+### Improvements
+
+- Explicitly declared the advanced search type for the file search in projects, guarding against changes to a GitLab instance's default search type
+- Added more verbose logging to the file kind
+
+
 0.1.134 (2024-10-23)
 ====================
 
diff --git a/integrations/gitlab/gitlab_integration/git_integration.py b/integrations/gitlab/gitlab_integration/git_integration.py
index a3281c8e0c..1a5f7ac456 100644
--- a/integrations/gitlab/gitlab_integration/git_integration.py
+++ b/integrations/gitlab/gitlab_integration/git_integration.py
@@ -3,7 +3,6 @@
 from gitlab.v4.objects import Project
 from loguru import logger
 from pydantic import Field, BaseModel
-
 from gitlab_integration.core.async_fetcher import AsyncFetcher
 from gitlab_integration.core.entities import (
     FILE_PROPERTY_PREFIX,
diff --git a/integrations/gitlab/gitlab_integration/gitlab_service.py b/integrations/gitlab/gitlab_integration/gitlab_service.py
index db11e3b7f3..b70311b009 100644
--- a/integrations/gitlab/gitlab_integration/gitlab_service.py
+++ b/integrations/gitlab/gitlab_integration/gitlab_service.py
@@ -160,10 +160,11 @@ async def get_all_file_paths(
         ]
 
     async def search_files_in_project(
-        self,
-        project: Project,
-        path: str | List[str],
+        self, project: Project, path: str | List[str]
     ) -> AsyncIterator[list[dict[str, Any]]]:
+        logger.info(
+            f"Searching project {project.path_with_namespace} for files with path pattern {path}"
+        )
         paths = [path] if not isinstance(path, list) else path
         for path in paths:
             file_pattern = os.path.basename(path)
@@ -175,23 +176,39 @@ async def search_files_in_project(
                 project.search,
                 scope="blobs",
                 search=f"filename:{file_pattern}",
+                search_type="advanced",
                 retry_transient_errors=True,
             ):
                 logger.info(
                     f"Found {len(files)} files in project {project.path_with_namespace} with file pattern {file_pattern}, filtering all that don't match path pattern {path}"
                 )
                 files = typing.cast(Union[GitlabList, List[Dict[str, Any]]], files)
-                tasks = [
-                    self.get_and_parse_single_file(
-                        project, file["path"], project.default_branch
-                    )
-                    for file in files
-                    if does_pattern_apply(path, file["path"])
-                ]
+                tasks = []
+                for file in files:
+                    if does_pattern_apply(path, file["path"]):
+                        tasks.append(
+                            self.get_and_parse_single_file(
+                                project, file["path"], project.default_branch
+                            )
+                        )
+                    else:
+                        logger.debug(
+                            f"Skipping file {file['path']} as it doesn't match path pattern {path} for project {project.path_with_namespace}"
+                        )
+                logger.info(
+                    f"Found {len(tasks)} files in project {project.path_with_namespace} that match path pattern {path}"
+                )
                 parsed_files = await asyncio.gather(*tasks)
                 files_with_content = [file for file in parsed_files if file]
                 if files_with_content:
+                    logger.info(
+                        f"Found {len(files_with_content)} files with content for project {project.path_with_namespace} for path {path}"
+                    )
                     yield files_with_content
+                else:
+                    logger.info(
+                        f"No files with content found for project {project.path_with_namespace} for path {path}"
+                    )
 
     async def _get_entities_from_git(
         self, project: Project, file_path: str | List[str], sha: str, ref: str
@@ -693,7 +710,7 @@ async def get_entities_diff(
         return entities_before, entities_after
 
     def _parse_file_content(
-        self, file: ProjectFile
+        self, project: Project, file: ProjectFile
     ) -> Union[str, dict[str, Any], list[Any]] | None:
         """
         Process a file from a project. If the file is a JSON or YAML, it will be parsed, otherwise the raw content will be returned
@@ -702,18 +719,30 @@ def _parse_file_content(
         """
         if file.size > MAX_ALLOWED_FILE_SIZE_IN_BYTES:
             logger.warning(
-                f"File {file.file_path} is too large to be processed. Maximum size allowed is 1MB. Actual size of file: {file.size}"
+                f"File {file.file_path} in {project.path_with_namespace} is too large to be processed. "
+                f"Maximum size allowed is 1MB. Actual size of file: {file.size}"
             )
             return None
         try:
             return json.loads(file.decode())
         except json.JSONDecodeError:
             try:
+                logger.debug(
+                    f"Trying to process file {file.file_path} in project {project.path_with_namespace} as YAML"
+                )
                 documents = list(yaml.load_all(file.decode(), Loader=yaml.SafeLoader))
                 if not documents:
+                    logger.debug(
+                        f"Failed to parse file {file.file_path} in project {project.path_with_namespace} as YAML,"
+                        f" returning raw content"
+                    )
                     return file.decode().decode("utf-8")
                 return documents if len(documents) > 1 else documents[0]
             except yaml.YAMLError:
+                logger.debug(
+                    f"Failed to parse file {file.file_path} in project {project.path_with_namespace} as JSON or YAML,"
+                    f" returning raw content"
+                )
                 return file.decode().decode("utf-8")
 
     async def get_and_parse_single_file(
@@ -730,7 +759,7 @@ async def get_and_parse_single_file(
                 f"Fetched file {file_path} in project {project.path_with_namespace}"
             )
             project_file = typing.cast(ProjectFile, project_file)
-            parsed_file = self._parse_file_content(project_file)
+            parsed_file = self._parse_file_content(project, project_file)
             project_file_dict = project_file.asdict()
 
             if not parsed_file:
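The `search_type="advanced"` argument above is the crux of the change: it pins the blob search to GitLab's advanced search instead of whichever search type the instance defaults to. A minimal standalone sketch of the same call using plain python-gitlab; the URL, token, project path, and the `fnmatch` stand-in for the integration's `does_pattern_apply` are illustrative assumptions:

```python
import fnmatch
import os

from gitlab import Gitlab

# Hypothetical instance and credentials, for illustration only.
gl = Gitlab("https://gitlab.example.com", private_token="glpat-...")
project = gl.projects.get("group/my-project")

path_pattern = "**/port.yml"
file_pattern = os.path.basename(path_pattern)

# scope="blobs" searches file blobs by filename; search_type="advanced"
# requests GitLab's advanced search explicitly instead of relying on the
# instance default. python-gitlab forwards extra kwargs as query parameters.
blobs = project.search(
    scope="blobs",
    search=f"filename:{file_pattern}",
    search_type="advanced",
    retry_transient_errors=True,
)

# The search matches filenames only, so results are filtered against the
# full path pattern afterwards (fnmatch stands in for does_pattern_apply).
matching = [b for b in blobs if fnmatch.fnmatch(b["path"], path_pattern)]
print(f"{len(matching)} blobs match {path_pattern}")
```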
diff --git a/integrations/gitlab/gitlab_integration/ocean.py b/integrations/gitlab/gitlab_integration/ocean.py
index 45bbde1784..31f6c1be97 100644
--- a/integrations/gitlab/gitlab_integration/ocean.py
+++ b/integrations/gitlab/gitlab_integration/ocean.py
@@ -185,6 +185,7 @@ async def resync_files(kind: str) -> ASYNC_GENERATOR_RESYNC_TYPE:
         "GitLabFilesResourceConfig", event.resource_config
     )
     if not isinstance(gitlab_resource_config, GitLabFilesResourceConfig):
+        logger.error("Invalid resource config type for GitLab files resync")
         return
 
     selector = gitlab_resource_config.selector
@@ -201,20 +202,36 @@ async def resync_files(kind: str) -> ASYNC_GENERATOR_RESYNC_TYPE:
         ):
             projects_processed_in_full_batch += len(projects_batch)
             logger.info(
-                f"Processing projects files for {projects_processed_in_full_batch}/{len(projects)} projects in batch"
+                f"Processing project files for {projects_processed_in_full_batch}/{len(projects)} "
+                f"projects in batch: {[project.path_with_namespace for project in projects_batch]}"
             )
-            tasks = [
-                service.search_files_in_project(project, selector.files.path)
-                for project in projects_batch
-                if service.should_process_project(project, selector.files.repos)
-            ]
+            tasks = []
+            matching_projects = []
+            for project in projects_batch:
+                if service.should_process_project(project, selector.files.repos):
+                    matching_projects.append(project)
+                    tasks.append(
+                        service.search_files_in_project(
+                            project, selector.files.path
+                        )
+                    )
 
             if tasks:
-                logger.info(f"Found {len(tasks)} relevant projects in batch")
+                logger.info(
+                    f"Found {len(tasks)} relevant projects in batch, projects: {[project.path_with_namespace for project in matching_projects]}"
+                )
                 async for batch in stream_async_iterators_tasks(*tasks):
                     yield batch
             else:
-                logger.info("No relevant projects were found in batch, skipping it")
+                logger.info(
+                    f"No relevant projects were found in batch for path '{selector.files.path}', skipping projects: {[project.path_with_namespace for project in projects_batch]}"
+                )
+            logger.info(
+                f"Finished processing project files for {projects_processed_in_full_batch}/{len(projects)}"
+            )
+    logger.info(
+        f"Finished processing all projects for path '{selector.files.path}'"
+    )
 
 
 @ocean.on_resync(ObjectKind.MERGE_REQUEST)
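`stream_async_iterators_tasks` above drains one `search_files_in_project` iterator per matching project and yields batches as soon as any project produces one. A self-contained sketch of that fan-out pattern, assuming nothing about Ocean's helper beyond this interleaving behavior; all names here are illustrative:

```python
import asyncio
from typing import Any, AsyncIterator


async def interleave(*iterators: AsyncIterator[Any]) -> AsyncIterator[Any]:
    """Yield items from several async iterators as soon as each produces one."""
    queue: asyncio.Queue[Any] = asyncio.Queue()
    done = object()  # sentinel marking that one iterator is exhausted

    async def drain(it: AsyncIterator[Any]) -> None:
        async for item in it:
            await queue.put(item)
        await queue.put(done)

    tasks = [asyncio.create_task(drain(it)) for it in iterators]
    finished = 0
    while finished < len(tasks):
        item = await queue.get()
        if item is done:
            finished += 1
        else:
            yield item


async def fake_search(name: str) -> AsyncIterator[list[str]]:
    # Stand-in for search_files_in_project: yields batches of file names.
    for i in range(2):
        await asyncio.sleep(0.01)
        yield [f"{name}-batch-{i}"]


async def main() -> None:
    async for batch in interleave(fake_search("proj-a"), fake_search("proj-b")):
        print(batch)


asyncio.run(main())
```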
found in batch, skipping it") + logger.info( + f"No relevant projects were found in batch for path '{selector.files.path}', skipping projects: {[project.path_with_namespace for project in projects_batch]}" + ) + logger.info( + f"Finished Processing project files for {projects_processed_in_full_batch}/{len(projects)}" + ) + logger.info( + f"Finished processing all projects for path '{selector.files.path}'" + ) @ocean.on_resync(ObjectKind.MERGE_REQUEST) diff --git a/integrations/gitlab/pyproject.toml b/integrations/gitlab/pyproject.toml index 1979a5304d..0f44db8100 100644 --- a/integrations/gitlab/pyproject.toml +++ b/integrations/gitlab/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gitlab" -version = "0.1.134" +version = "0.1.135" description = "Gitlab integration for Port using Port-Ocean Framework" authors = ["Yair Siman-Tov "]