diff --git a/atomic_reactor/constants.py b/atomic_reactor/constants.py
index 7ae80148e..16bb1c66e 100644
--- a/atomic_reactor/constants.py
+++ b/atomic_reactor/constants.py
@@ -143,6 +143,8 @@
 HTTP_CLIENT_STATUS_RETRY = (408, 429, 500, 502, 503, 504)
 # requests timeout in seconds
 HTTP_REQUEST_TIMEOUT = 600
+# git cmd timeout in seconds
+GIT_CMD_TIMEOUT = 600
 # max retries for git clone
 GIT_MAX_RETRIES = 3
 # how many seconds should wait before another try of git clone
diff --git a/atomic_reactor/plugins/cachi2_init.py b/atomic_reactor/plugins/cachi2_init.py
index bf6d6b4c3..d2da530f7 100644
--- a/atomic_reactor/plugins/cachi2_init.py
+++ b/atomic_reactor/plugins/cachi2_init.py
@@ -10,6 +10,7 @@
 from typing import Any, Optional, List, Dict
 from pathlib import Path
 
+import git
 from osbs.utils import clone_git_repo
 
 from atomic_reactor.constants import (
@@ -28,6 +29,8 @@
 from atomic_reactor.utils.cachi2 import (
     remote_source_to_cachi2, clone_only,
     validate_paths, normalize_gomod_pkg_manager,
     enforce_sandbox,
+    has_git_submodule_manager, update_submodules,
+    get_submodules_sbom, get_submodules_request_json_deps,
 )
 
@@ -135,6 +138,15 @@ def process_remote_sources(self) -> List[Dict[str, Any]]:
                 remote_source_data["ref"]
             )
 
+            if has_git_submodule_manager(remote_source_data):
+                update_submodules(source_path_app)
+                repo = git.Repo(str(source_path_app))
+                git_submodules = {
+                    "sbom": get_submodules_sbom(repo),
+                    "request_json": get_submodules_request_json_deps(repo)
+                }
+                remote_source["git_submodules"] = git_submodules
+
             enforce_sandbox(source_path_app, remove_unsafe_symlinks=False)
 
             validate_paths(source_path_app, remote_source_data.get("packages", {}))
diff --git a/atomic_reactor/plugins/cachi2_postprocess.py b/atomic_reactor/plugins/cachi2_postprocess.py
index e7f5faa73..67146928c 100644
--- a/atomic_reactor/plugins/cachi2_postprocess.py
+++ b/atomic_reactor/plugins/cachi2_postprocess.py
@@ -20,6 +20,7 @@
 from atomic_reactor.constants import (
     CACHITO_ENV_ARG_ALIAS,
     CACHITO_ENV_FILENAME,
+    CACHI2_BUILD_DIR,
     PLUGIN_CACHI2_INIT,
     PLUGIN_CACHI2_POSTPROCESS,
     REMOTE_SOURCE_DIR,
@@ -110,6 +111,7 @@ def run(self) -> Optional[List[Dict[str, Any]]]:
             return None
 
         processed_remote_sources = self.postprocess_remote_sources()
+        self.postprocess_git_submodules_global_sbom()
         self.inject_remote_sources(processed_remote_sources)
 
         return [
@@ -117,6 +119,30 @@ def run(self) -> Optional[List[Dict[str, Any]]]:
             for remote_source in processed_remote_sources
         ]
 
+    def postprocess_git_submodules_global_sbom(self):
+        """atomic-reactor is responsible for handling git-submodules; the global SBOM must be updated"""
+        all_sboms = []
+        for remote_source in self.init_plugin_data:
+            git_submodules = remote_source.get('git_submodules')
+            if not git_submodules:
+                continue
+
+            all_sboms.extend(git_submodules['sbom'])
+
+        if not all_sboms:
+            return
+
+        # TODO: deduplicate sboms?
+
+        global_sbom_path = self.workflow.build_dir.path / CACHI2_BUILD_DIR / "bom.json"
+        with open(global_sbom_path, 'r') as global_sbom_f:
+            global_sbom_data = json.load(global_sbom_f)
+            global_sbom_data['components'].extend(all_sboms)
+
+        with open(global_sbom_path, 'w') as global_sbom_f:
+            json.dump(global_sbom_data, global_sbom_f)
+            global_sbom_f.flush()
+
     def postprocess_remote_sources(self) -> List[Cachi2RemoteSource]:
         """Process remote source requests and return information about the processed sources."""
 
@@ -132,12 +158,26 @@
             with open(sbom_path, 'r') as sbom_f:
                 sbom_data = json.load(sbom_f)
 
+            # request_json must be generated before modifications to the SBOM are done
+            request_json = generate_request_json(
+                remote_source['remote_source'], sbom_data, json_env_data)
+
+            # update metadata with submodules info
+            git_submodules = remote_source.get('git_submodules')
+            if git_submodules:
+                sbom_data['components'].extend(git_submodules['sbom'])
+
+                with open(sbom_path, 'w') as sbom_f:
+                    json.dump(sbom_data, sbom_f)
+                    sbom_f.flush()
+
+                request_json['dependencies'].extend(git_submodules['request_json'])
+
             remote_source_obj = Cachi2RemoteSource(
                 name=remote_source['name'],
                 tarball_path=Path(remote_source['source_path'], 'remote-source.tar.gz'),
                 sources_path=Path(remote_source['source_path']),
-                json_data=generate_request_json(
-                    remote_source['remote_source'], sbom_data, json_env_data),
+                json_data=request_json,
                 json_env_data=json_env_data,
             )
             processed_remote_sources.append(remote_source_obj)
diff --git a/atomic_reactor/utils/cachi2.py b/atomic_reactor/utils/cachi2.py
index 5fdc732f2..074923096 100644
--- a/atomic_reactor/utils/cachi2.py
+++ b/atomic_reactor/utils/cachi2.py
@@ -13,9 +13,14 @@
 from typing import Any, Callable, Dict, Optional, Tuple, List
 from pathlib import Path
 import os.path
+import urllib.parse
 
+import git
 from packageurl import PackageURL
 
+from atomic_reactor import constants
+from atomic_reactor.utils import retries
+
 logger = logging.getLogger(__name__)
 
 
@@ -286,4 +291,90 @@ def clone_only(remote_source: Dict[str, Any]) -> bool:
     if pkg_managers is not None and len(pkg_managers) == 0:
         return True
 
+    # only git-submodule
+    if pkg_managers == ['git-submodule']:
+        return True
+
     return False
+
+
+def has_git_submodule_manager(remote_source: Dict[str, Any]) -> bool:
+    """Return True when the git-submodule package manager is requested for the remote source"""
+    pkg_managers = remote_source.get("pkg_managers") or []
+    return 'git-submodule' in pkg_managers
+
+
+def update_submodules(repopath: Path):
+    """Update submodules in the given repo"""
+    cmd = ["git", "submodule", "update", "--init", "--filter=blob:none"]
+    params = {
+        "cwd": str(repopath),
+        "universal_newlines": True,
+        "timeout": constants.GIT_CMD_TIMEOUT,
+    }
+    retries.run_cmd(cmd, **params)
+
+
+def get_submodules_sbom(repo: git.Repo) -> List[Dict]:
+    """Get SBOM components for the submodules in the specified repository"""
+
+    def to_vcs_purl(pkg_name, repo_url, ref):
+        """
+        Generate the vcs purl representation of the package.
+
+        Use the most specific purl type possible, e.g. pkg:github if repo comes from
+        github.com. Fall back to using pkg:generic with a ?vcs_url qualifier.
+
+        :param str pkg_name: name of package
+        :param str repo_url: url of git repository for package
+        :param str ref: git ref of package
+        :return: the PURL string of the Package object
+        :rtype: str
+        """
+        repo_url = repo_url.rstrip("/")
+        parsed_url = urllib.parse.urlparse(repo_url)
+
+        pkg_type_for_hostname = {
+            "github.com": "github",
+            "bitbucket.org": "bitbucket",
+        }
+        pkg_type = pkg_type_for_hostname.get(parsed_url.hostname, "generic")
+
+        if pkg_type == "generic":
+            vcs_url = urllib.parse.quote(f"{repo_url}@{ref}", safe="")
+            purl = f"pkg:generic/{pkg_name}?vcs_url={vcs_url}"
+        else:
+            # pkg:github and pkg:bitbucket use the same format
+            namespace, repo = parsed_url.path.lstrip("/").rsplit("/", 1)
+            if repo.endswith(".git"):
+                repo = repo[: -len(".git")]
+            purl = f"pkg:{pkg_type}/{namespace.lower()}/{repo.lower()}@{ref}"
+
+        return purl
+
+    submodules_sbom = [
+        {
+            "type": "library",
+            "name": sm.name,
+            "version": f"{sm.url}#{sm.hexsha}",
+            "purl": to_vcs_purl(sm.name, sm.url, sm.hexsha)
+        }
+        for sm in repo.submodules
+    ]
+
+    return submodules_sbom
+
+
+def get_submodules_request_json_deps(repo: git.Repo) -> List[Dict]:
+    """Get dependencies for request.json from submodules"""
+    submodules_request_json_dependencies = [
+        {
+            "type": "git-submodule",
+            "name": sm.name,
+            "path": sm.name,
+            "version": f"{sm.url}#{sm.hexsha}",
+        }
+        for sm in repo.submodules
+    ]
+
+    return submodules_request_json_dependencies
diff --git a/atomic_reactor/utils/retries.py b/atomic_reactor/utils/retries.py
index a46486fb7..5e843a9fe 100644
--- a/atomic_reactor/utils/retries.py
+++ b/atomic_reactor/utils/retries.py
@@ -89,7 +89,7 @@ def get_retrying_requests_session(client_statuses=HTTP_CLIENT_STATUS_RETRY,
     max_tries=SUBPROCESS_MAX_RETRIES + 1,  # total tries is N retries + 1 initial attempt
     jitter=None,  # use deterministic backoff, do not apply random jitter
 )
-def run_cmd(cmd: List[str], cleanup_cmd: List[str] = None) -> bytes:
+def run_cmd(cmd: List[str], cleanup_cmd: List[str] = None, **params) -> bytes:
     """Run a subprocess command, retry on any non-zero exit status.
 
     Whenever an attempt fails, the stdout and stderr of the failed command will be logged.
@@ -98,12 +98,14 @@ def run_cmd(cmd: List[str], cleanup_cmd: List[str] = None) -> bytes:
 
     If a cleanup command is specified it'll be run on exception before retry.
 
+    :param params: optional params to be passed to the subprocess.run function
+
     :return: bytes, the combined stdout and stderr (if any) of the command
     """
     logger.debug("Running %s", " ".join(cmd))
 
     try:
-        process = subprocess.run(cmd, check=True, capture_output=True)
+        process = subprocess.run(cmd, check=True, capture_output=True, **params)
     except subprocess.CalledProcessError as e:
         logger.warning(
             "%s failed:\nSTDOUT:\n%s\nSTDERR:\n%s",
@@ -114,7 +116,7 @@ def run_cmd(cmd: List[str], cleanup_cmd: List[str] = None) -> bytes:
         if cleanup_cmd:
             try:
                 logger.debug("Running %s", " ".join(cleanup_cmd))
-                subprocess.run(cleanup_cmd, check=True, capture_output=True)
+                subprocess.run(cleanup_cmd, check=True, capture_output=True, **params)
             except subprocess.CalledProcessError as c_e:
                 logger.warning(
                     "Cleanup command: %s failed:\nSTDOUT:\n%s\nSTDERR:\n%s",
diff --git a/requirements.in b/requirements.in
index 2377aab21..183ee4728 100644
--- a/requirements.in
+++ b/requirements.in
@@ -2,6 +2,7 @@
 backoff
 dockerfile-parse>=0.0.13
 flatpak-module-tools>=0.14
+gitpython
 jsonschema
 paramiko>=3.4.0
 PyYAML
diff --git a/requirements.txt b/requirements.txt
index 3a7df5321..3ee8bb632 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -47,6 +47,10 @@ editables==0.5
     # via hatchling
 flatpak-module-tools==0.14
     # via -r requirements.in
+gitdb==4.0.12
+    # via gitpython
+gitpython==3.1.44
+    # via -r requirements.in
 googleapis-common-protos==1.60.0
     # via
     #   opentelemetry-exporter-otlp-proto-grpc
@@ -176,6 +180,8 @@ six==1.16.0
     # via
     #   koji
     #   osbs-client
     #   python-dateutil
+smmap==5.0.2
+    # via gitdb
 tomli==2.0.1
     # via hatchling
 trove-classifiers==2023.8.7
diff --git a/tests/utils/test_cachi2.py b/tests/utils/test_cachi2.py
index e11885f36..30949d954 100644
--- a/tests/utils/test_cachi2.py
+++ b/tests/utils/test_cachi2.py
@@ -10,6 +10,8 @@
 from pathlib import Path
 from typing import Union
 
+import git
+
 from atomic_reactor.utils.cachi2 import (
     SymlinkSandboxError,
     convert_SBOM_to_ICM,
@@ -19,6 +21,7 @@
     generate_request_json,
     clone_only,
     validate_paths,
+    has_git_submodule_manager,
 )
 
 import pytest
@@ -26,6 +29,11 @@
 from unittest import mock
 
 
+@pytest.fixture
+def mocked_repo_submodules():
+    """Mock submodules repo"""
+
+
 @pytest.mark.parametrize(('input_remote_source', 'expected_cachi2'), [
     pytest.param(
         {"pkg_managers": ["gomod"]},
@@ -563,6 +571,13 @@ def test_generate_request_json():
         True,
         id="empty_list"
     ),
+    pytest.param(
+        {
+            "pkg_managers": ["git-submodule"]
+        },
+        True,
+        id="git_submodule"
+    ),
     pytest.param(
         {
             "pkg_managers": ["gomod"]
@@ -570,6 +585,13 @@
         False,
         id="gomod"
     ),
+    pytest.param(
+        {
+            "pkg_managers": ["gomod", "git-submodule"]
+        },
+        False,
+        id="gomod_and_git_submodule"
+    ),
     pytest.param(
         {},
         False,
         id="undefined"
     ),
@@ -588,6 +610,53 @@ def test_clone_only(remote_source, expected):
     assert clone_only(remote_source) == expected
 
 
+@pytest.mark.parametrize('remote_source,expected', [
+    pytest.param(
+        {
+            "pkg_managers": []
+        },
+        False,
+        id="empty_list"
+    ),
+    pytest.param(
+        {
+            "pkg_managers": ["git-submodule"]
+        },
+        True,
+        id="git_submodule"
+    ),
+    pytest.param(
+        {
+            "pkg_managers": ["gomod"]
+        },
+        False,
+        id="gomod"
+    ),
+    pytest.param(
+        {
+            "pkg_managers": ["gomod", "git-submodule"]
+        },
+        True,
+        id="gomod_and_git_submodule"
+    ),
+    pytest.param(
+        {},
+        False,
+        id="undefined"
+    ),
+    pytest.param(
+        {
+            "pkg_managers": None
+        },
+        False,
+        id="explicit_none"
+    ),
+])
+def test_has_git_submodule_manager(remote_source, expected):
+    """Test if has_git_submodule_manager correctly detects git-submodule"""
+    assert has_git_submodule_manager(remote_source) == expected
+
+
 class Symlink(str):
     """
     Use this to create symlinks via write_file_tree().
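
The mocked_repo_submodules fixture added above is still an empty stub. Below is a minimal sketch, not part of the diff, of how a test could fake a gitpython repository with a single submodule and what get_submodules_sbom() then returns; the submodule name, URL, and hexsha are hypothetical, while the attribute names (name, url, hexsha) follow gitpython's Submodule API:

from unittest import mock

from atomic_reactor.utils.cachi2 import get_submodules_sbom


def test_get_submodules_sbom_sketch():
    # fake a gitpython Repo exposing one submodule (hypothetical values)
    submodule = mock.Mock()
    submodule.name = "example"
    submodule.url = "https://github.com/org/example.git"
    submodule.hexsha = "0123456789abcdef0123456789abcdef01234567"
    repo = mock.Mock(submodules=[submodule])

    # to_vcs_purl() recognizes github.com, strips the .git suffix
    # and emits a pkg:github purl
    assert get_submodules_sbom(repo) == [{
        "type": "library",
        "name": "example",
        "version": "https://github.com/org/example.git#0123456789abcdef0123456789abcdef01234567",
        "purl": "pkg:github/org/example@0123456789abcdef0123456789abcdef01234567",
    }]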
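The git_submodules dict that cachi2_init.py attaches to each remote source is the contract later consumed by cachi2_postprocess.py: the "sbom" entries are merged into the per-source bom.json and the global CACHI2_BUILD_DIR/bom.json, while the "request_json" entries are appended to request_json["dependencies"]. For the hypothetical submodule above it would look like this (illustrative shape, not output of a real run):

git_submodules = {
    # SBOM components merged into the CycloneDX "components" lists
    "sbom": [{
        "type": "library",
        "name": "example",
        "version": "https://github.com/org/example.git#0123456789abcdef0123456789abcdef01234567",
        "purl": "pkg:github/org/example@0123456789abcdef0123456789abcdef01234567",
    }],
    # dependency entries appended to request.json
    "request_json": [{
        "type": "git-submodule",
        "name": "example",
        "path": "example",
        "version": "https://github.com/org/example.git#0123456789abcdef0123456789abcdef01234567",
    }],
}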
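The **params change in retries.run_cmd() is what lets update_submodules() pass git-specific options through to subprocess.run, on the initial attempt, on every retry, and on the cleanup command. A usage sketch, assuming a hypothetical checkout path:

from atomic_reactor import constants
from atomic_reactor.utils import retries

retries.run_cmd(
    ["git", "submodule", "update", "--init", "--filter=blob:none"],
    cwd="/tmp/some-clone",              # hypothetical clone location
    universal_newlines=True,            # decode stdout/stderr as text
    timeout=constants.GIT_CMD_TIMEOUT,  # 600 s limit added in constants.py
)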