From a0381de06a288a796e4b57885512cb49e6c89cb9 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Fri, 16 Feb 2024 17:31:48 +0100 Subject: [PATCH 01/12] Write a new helper function: gather_registries() to read the config. --- nf_core/download.py | 449 +++++++++++++++++++++++++++++++++----------- 1 file changed, 337 insertions(+), 112 deletions(-) diff --git a/nf_core/download.py b/nf_core/download.py index bb7b2ae473..d70008597b 100644 --- a/nf_core/download.py +++ b/nf_core/download.py @@ -1,6 +1,5 @@ """Downloads a nf-core pipeline to the local file system.""" - import concurrent.futures import io import logging @@ -34,13 +33,17 @@ log = logging.getLogger(__name__) stderr = rich.console.Console( - stderr=True, style="dim", highlight=False, force_terminal=nf_core.utils.rich_force_colors() + stderr=True, + style="dim", + highlight=False, + force_terminal=nf_core.utils.rich_force_colors(), ) class DownloadError(RuntimeError): """A custom exception that is raised when nf-core download encounters a problem that we already took into consideration. - In this case, we do not want to print the traceback, but give the user some concise, helpful feedback instead.""" + In this case, we do not want to print the traceback, but give the user some concise, helpful feedback instead. + """ class DownloadProgress(rich.progress.Progress): @@ -120,9 +123,15 @@ def __init__( # if flag is not specified, do not assume deliberate choice and prompt config inclusion interactively. # this implies that non-interactive "no" choice is only possible implicitly (e.g. with --tower or if prompt is suppressed by !stderr.is_interactive). # only alternative would have been to make it a parameter with argument, e.g. -d="yes" or -d="no". - self.include_configs = True if download_configuration else False if bool(tower) else None + self.include_configs = ( + True if download_configuration else False if bool(tower) else None + ) # Specifying a cache index or container library implies that containers should be downloaded. - self.container_system = "singularity" if container_cache_index or bool(container_library) else container_system + self.container_system = ( + "singularity" + if container_cache_index or bool(container_library) + else container_system + ) # Manually specified container library (registry) if isinstance(container_library, str) and bool(len(container_library)): self.container_library = [container_library] @@ -131,7 +140,9 @@ def __init__( else: self.container_library = ["quay.io"] # if a container_cache_index is given, use the file and overrule choice. - self.container_cache_utilisation = "remote" if container_cache_index else container_cache_utilisation + self.container_cache_utilisation = ( + "remote" if container_cache_index else container_cache_utilisation + ) self.container_cache_index = container_cache_index # allows to specify a container library / registry or a respective mirror to download images from self.parallel_downloads = parallel_downloads @@ -154,8 +165,8 @@ def download_workflow(self): # Get workflow details try: self.prompt_pipeline_name() - self.pipeline, self.wf_revisions, self.wf_branches = nf_core.utils.get_repo_releases_branches( - self.pipeline, self.wfs + self.pipeline, self.wf_revisions, self.wf_branches = ( + nf_core.utils.get_repo_releases_branches(self.pipeline, self.wfs) ) self.prompt_revision() self.get_revision_hash() @@ -181,9 +192,16 @@ def download_workflow(self): f"Use containers: '{self.container_system}'", ] if self.container_system: - summary_log.append(f"Container library: '{', '.join(self.container_library)}'") - if self.container_system == "singularity" and os.environ.get("NXF_SINGULARITY_CACHEDIR") is not None: - summary_log.append(f"Using [blue]$NXF_SINGULARITY_CACHEDIR[/]': {os.environ['NXF_SINGULARITY_CACHEDIR']}'") + summary_log.append( + f"Container library: '{', '.join(self.container_library)}'" + ) + if ( + self.container_system == "singularity" + and os.environ.get("NXF_SINGULARITY_CACHEDIR") is not None + ): + summary_log.append( + f"Using [blue]$NXF_SINGULARITY_CACHEDIR[/]': {os.environ['NXF_SINGULARITY_CACHEDIR']}'" + ) if self.containers_remote: summary_log.append( f"Successfully read {len(self.containers_remote)} containers from the remote '$NXF_SINGULARITY_CACHEDIR' contents." @@ -201,9 +219,13 @@ def download_workflow(self): if not self.tower: # Only show entry, if option was prompted. - summary_log.append(f"Include default institutional configuration: '{self.include_configs}'") + summary_log.append( + f"Include default institutional configuration: '{self.include_configs}'" + ) else: - summary_log.append(f"Enabled for seqeralabs® Nextflow Tower: '{self.tower}'") + summary_log.append( + f"Enabled for seqeralabs® Nextflow Tower: '{self.tower}'" + ) # Check that the outdir doesn't already exist if os.path.exists(self.outdir): @@ -243,14 +265,20 @@ def download_workflow_static(self): # Download the pipeline files for each selected revision log.info("Downloading workflow files from GitHub") - for item in zip(self.revision, self.wf_sha.values(), self.wf_download_url.values()): - revision_dirname = self.download_wf_files(revision=item[0], wf_sha=item[1], download_url=item[2]) + for item in zip( + self.revision, self.wf_sha.values(), self.wf_download_url.values() + ): + revision_dirname = self.download_wf_files( + revision=item[0], wf_sha=item[1], download_url=item[2] + ) if self.include_configs: try: self.wf_use_local_configs(revision_dirname) except FileNotFoundError as e: - raise DownloadError("Error editing pipeline config file to use local configs!") from e + raise DownloadError( + "Error editing pipeline config file to use local configs!" + ) from e # Collect all required singularity images if self.container_system == "singularity": @@ -275,7 +303,9 @@ def download_workflow_tower(self, location=None): remote_url=f"https://github.com/{self.pipeline}.git", revision=self.revision if self.revision else None, commit=self.wf_sha.values() if bool(self.wf_sha) else None, - location=location if location else None, # manual location is required for the tests to work + location=( + location if location else None + ), # manual location is required for the tests to work in_cache=False, ) @@ -300,13 +330,17 @@ def download_workflow_tower(self, location=None): # Justify why compression is skipped for Tower downloads (Prompt is not shown, but CLI argument could have been set) if self.compress_type is not None: - log.info("Compression choice is ignored for Tower downloads since nothing can be reasonably compressed.") + log.info( + "Compression choice is ignored for Tower downloads since nothing can be reasonably compressed." + ) def prompt_pipeline_name(self): """Prompt for the pipeline name if not set with a flag""" if self.pipeline is None: - stderr.print("Specify the name of a nf-core pipeline or a GitHub repository name (user/repo).") + stderr.print( + "Specify the name of a nf-core pipeline or a GitHub repository name (user/repo)." + ) self.pipeline = nf_core.utils.prompt_remote_pipeline_name(self.wfs) def prompt_revision(self): @@ -335,18 +369,28 @@ def prompt_revision(self): if bool(choice): # have to make sure that self.revision is a list of strings, regardless if choice is str or list of strings. - self.revision.append(choice) if isinstance(choice, str) else self.revision.extend(choice) + ( + self.revision.append(choice) + if isinstance(choice, str) + else self.revision.extend(choice) + ) else: if bool(tag_set): self.revision = tag_set - log.info("No particular revision was selected, all available will be downloaded.") + log.info( + "No particular revision was selected, all available will be downloaded." + ) else: - raise AssertionError(f"No revisions of {self.pipeline} available for download.") + raise AssertionError( + f"No revisions of {self.pipeline} available for download." + ) def get_revision_hash(self): """Find specified revision / branch hash""" - for revision in self.revision: # revision is a list of strings, but may be of length 1 + for ( + revision + ) in self.revision: # revision is a list of strings, but may be of length 1 # Branch if revision in self.wf_branches.keys(): self.wf_sha = {**self.wf_sha, revision: self.wf_branches[revision]} @@ -362,18 +406,27 @@ def get_revision_hash(self): else: log.info( "Available {} revisions: '{}'".format( - self.pipeline, "', '".join([r["tag_name"] for r in self.wf_revisions]) + self.pipeline, + "', '".join([r["tag_name"] for r in self.wf_revisions]), + ) + ) + log.info( + "Available {} branches: '{}'".format( + self.pipeline, "', '".join(self.wf_branches.keys()) ) ) - log.info("Available {} branches: '{}'".format(self.pipeline, "', '".join(self.wf_branches.keys()))) - raise AssertionError(f"Not able to find revision / branch '{revision}' for {self.pipeline}") + raise AssertionError( + f"Not able to find revision / branch '{revision}' for {self.pipeline}" + ) # Set the outdir if not self.outdir: if len(self.wf_sha) > 1: self.outdir = f"{self.pipeline.replace('/', '-').lower()}_{datetime.now().strftime('%Y-%m-%d_%H-%M')}" else: - self.outdir = f"{self.pipeline.replace('/', '-').lower()}_{self.revision[0]}" + self.outdir = ( + f"{self.pipeline.replace('/', '-').lower()}_{self.revision[0]}" + ) if not self.tower: for revision, wf_sha in self.wf_sha.items(): @@ -398,7 +451,9 @@ def prompt_container_download(self): """Prompt whether to download container images or not""" if self.container_system is None and stderr.is_interactive and not self.tower: - stderr.print("\nIn addition to the pipeline code, this tool can download software containers.") + stderr.print( + "\nIn addition to the pipeline code, this tool can download software containers." + ) self.container_system = questionary.select( "Download software container images:", choices=["none", "singularity"], @@ -425,9 +480,13 @@ def prompt_singularity_cachedir_creation(self): cachedir_path = None while cachedir_path is None: prompt_cachedir_path = questionary.path( - "Specify the path:", only_directories=True, style=nf_core.utils.nfcore_question_style + "Specify the path:", + only_directories=True, + style=nf_core.utils.nfcore_question_style, ).unsafe_ask() - cachedir_path = os.path.abspath(os.path.expanduser(prompt_cachedir_path)) + cachedir_path = os.path.abspath( + os.path.expanduser(prompt_cachedir_path) + ) if prompt_cachedir_path == "": log.error("Not using [blue]$NXF_SINGULARITY_CACHEDIR[/]") cachedir_path = False @@ -477,7 +536,9 @@ def prompt_singularity_cachedir_creation(self): + f'export NXF_SINGULARITY_CACHEDIR="{cachedir_path}"' + "\n#######################################\n" ) - log.info(f"Successfully wrote to [blue]{shellprofile_path}[/]") + log.info( + f"Successfully wrote to [blue]{shellprofile_path}[/]" + ) log.warning( "You will need reload your terminal after the download completes for this to take effect." ) @@ -485,7 +546,8 @@ def prompt_singularity_cachedir_creation(self): def prompt_singularity_cachedir_utilization(self): """Ask if we should *only* use $NXF_SINGULARITY_CACHEDIR without copying into target""" if ( - self.container_cache_utilisation is None # no choice regarding singularity cache has been made. + self.container_cache_utilisation + is None # no choice regarding singularity cache has been made. and self.container_system == "singularity" and os.environ.get("NXF_SINGULARITY_CACHEDIR") is not None and stderr.is_interactive @@ -517,9 +579,13 @@ def prompt_singularity_cachedir_remote(self): validate=SingularityCacheFilePathValidator, style=nf_core.utils.nfcore_question_style, ).unsafe_ask() - cachedir_index = os.path.abspath(os.path.expanduser(prompt_cachedir_index)) + cachedir_index = os.path.abspath( + os.path.expanduser(prompt_cachedir_index) + ) if prompt_cachedir_index == "": - log.error("Will disregard contents of a remote [blue]$NXF_SINGULARITY_CACHEDIR[/]") + log.error( + "Will disregard contents of a remote [blue]$NXF_SINGULARITY_CACHEDIR[/]" + ) self.container_cache_index = None self.container_cache_utilisation = "copy" elif not os.access(cachedir_index, os.R_OK): @@ -546,18 +612,30 @@ def read_remote_containers(self): n_total_images += 1 self.containers_remote.append(match.group(0)) if n_total_images == 0: - raise LookupError("Could not find valid container names in the index file.") + raise LookupError( + "Could not find valid container names in the index file." + ) self.containers_remote = sorted(list(set(self.containers_remote))) except (FileNotFoundError, LookupError) as e: - log.error(f"[red]Issue with reading the specified remote $NXF_SINGULARITY_CACHE index:[/]\n{e}\n") - if stderr.is_interactive and rich.prompt.Confirm.ask("[blue]Specify a new index file and try again?"): - self.container_cache_index = None # reset chosen path to index file. + log.error( + f"[red]Issue with reading the specified remote $NXF_SINGULARITY_CACHE index:[/]\n{e}\n" + ) + if stderr.is_interactive and rich.prompt.Confirm.ask( + "[blue]Specify a new index file and try again?" + ): + self.container_cache_index = ( + None # reset chosen path to index file. + ) self.prompt_singularity_cachedir_remote() else: - log.info("Proceeding without consideration of the remote $NXF_SINGULARITY_CACHE index.") + log.info( + "Proceeding without consideration of the remote $NXF_SINGULARITY_CACHE index." + ) self.container_cache_index = None if os.environ.get("NXF_SINGULARITY_CACHEDIR"): - self.container_cache_utilisation = "copy" # default to copy if possible, otherwise skip. + self.container_cache_utilisation = ( + "copy" # default to copy if possible, otherwise skip. + ) else: self.container_cache_utilisation = None @@ -599,14 +677,21 @@ def download_wf_files(self, revision, wf_sha, download_url): revision_dirname = re.sub("[^0-9a-zA-Z]+", "_", revision) # account for name collisions, if there is a branch / release named "configs" or "singularity-images" if revision_dirname in ["configs", "singularity-images"]: - revision_dirname = re.sub("[^0-9a-zA-Z]+", "_", self.pipeline + revision_dirname) + revision_dirname = re.sub( + "[^0-9a-zA-Z]+", "_", self.pipeline + revision_dirname + ) # Rename the internal directory name to be more friendly gh_name = f"{self.pipeline}-{wf_sha if bool(wf_sha) else ''}".split("/")[-1] - os.rename(os.path.join(self.outdir, gh_name), os.path.join(self.outdir, revision_dirname)) + os.rename( + os.path.join(self.outdir, gh_name), + os.path.join(self.outdir, revision_dirname), + ) # Make downloaded files executable - for dirpath, _, filelist in os.walk(os.path.join(self.outdir, revision_dirname)): + for dirpath, _, filelist in os.walk( + os.path.join(self.outdir, revision_dirname) + ): for fname in filelist: os.chmod(os.path.join(dirpath, fname), 0o775) @@ -624,7 +709,10 @@ def download_configs(self): zipfile.extractall(self.outdir) # Rename the internal directory name to be more friendly - os.rename(os.path.join(self.outdir, configs_local_dir), os.path.join(self.outdir, "configs")) + os.rename( + os.path.join(self.outdir, configs_local_dir), + os.path.join(self.outdir, "configs"), + ) # Make downloaded files executable for dirpath, _, filelist in os.walk(os.path.join(self.outdir, "configs")): @@ -647,7 +735,10 @@ def wf_use_local_configs(self, revision_dirname): nfconfig = nfconfig.replace(find_str, repl_str) # Append the singularity.cacheDir to the end if we need it - if self.container_system == "singularity" and self.container_cache_utilisation == "copy": + if ( + self.container_system == "singularity" + and self.container_cache_utilisation == "copy" + ): nfconfig += ( f"\n\n// Added by `nf-core download` v{nf_core.__version__} //\n" + 'singularity.cacheDir = "${projectDir}/../singularity-images/"' @@ -682,7 +773,9 @@ def find_container_images(self, workflow_directory): # Find any config variables that look like a container for k, v in self.nf_config.items(): - if (k.startswith("process.") or k.startswith("params.")) and k.endswith(".container"): + if (k.startswith("process.") or k.startswith("params.")) and k.endswith( + ".container" + ): """ Can be plain string / Docker URI or DSL2 syntax @@ -699,7 +792,9 @@ def find_container_images(self, workflow_directory): if bool(config_findings_dsl2): # finding fill always be a tuple of length 2, first the quote used and second the enquoted value. for finding in config_findings_dsl2: - config_findings.append(finding + (self.nf_config, "Nextflow configs")) + config_findings.append( + finding + (self.nf_config, "Nextflow configs") + ) else: # no regex match, likely just plain string """ Append string also as finding-like tuple for consistency @@ -707,7 +802,9 @@ def find_container_images(self, workflow_directory): self.nf_config is needed, because we need to restart search over raw input if no proper container matches are found. """ - config_findings.append((k, v.strip('"').strip("'"), self.nf_config, "Nextflow configs")) + config_findings.append( + (k, v.strip('"').strip("'"), self.nf_config, "Nextflow configs") + ) # rectify the container paths found in the config # Raw config_findings may yield multiple containers, so better create a shallow copy of the list, since length of input and output may be different ?!? @@ -734,10 +831,13 @@ def find_container_images(self, workflow_directory): re.DOTALL is used to account for the string to be spread out across multiple lines. """ container_regex = re.compile( - r"container\s+[\\s{}=$]*(?P[\'\"])(?P(?:.(?!\1))*.?)\1[\\s}]*", re.DOTALL + r"container\s+[\\s{}=$]*(?P[\'\"])(?P(?:.(?!\1))*.?)\1[\\s}]*", + re.DOTALL, ) - local_module_findings = re.findall(container_regex, search_space) + local_module_findings = re.findall( + container_regex, search_space + ) # finding fill always be a tuple of length 2, first the quote used and second the enquoted value. for finding in local_module_findings: @@ -750,7 +850,9 @@ def find_container_images(self, workflow_directory): module_findings = self.rectify_raw_container_matches(module_findings[:]) # Again clean list, in case config declares Docker URI but module or previous finding already had the http:// download - self.containers = self.prioritize_direct_download(previous_findings + config_findings + module_findings) + self.containers = self.prioritize_direct_download( + previous_findings + config_findings + module_findings + ) def rectify_raw_container_matches(self, raw_findings): """Helper function to rectify the raw extracted container matches into fully qualified container names. @@ -796,9 +898,7 @@ def rectify_raw_container_matches(self, raw_findings): cleaned_matches = [] # Thanks Stack Overflow for the regex: https://stackoverflow.com/a/3809435/713980 - url_regex = ( - r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)" - ) + url_regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)" # Thanks Stack Overflow for the regex: https://stackoverflow.com/a/39672069/713980 docker_regex = r"^(?:(?=[^:\/]{1,253})(?!-)[a-zA-Z0-9-]{1,63}(?(?(?:.(?!(?(?(?:.(?!(? Date: Fri, 16 Feb 2024 20:09:20 +0100 Subject: [PATCH 02/12] nf-core download: Write function to symlink containers to different libraries (registries). --- nf_core/download.py | 424 ++++++++++++++++---------------------------- 1 file changed, 157 insertions(+), 267 deletions(-) diff --git a/nf_core/download.py b/nf_core/download.py index d70008597b..fb64b27ada 100644 --- a/nf_core/download.py +++ b/nf_core/download.py @@ -123,15 +123,9 @@ def __init__( # if flag is not specified, do not assume deliberate choice and prompt config inclusion interactively. # this implies that non-interactive "no" choice is only possible implicitly (e.g. with --tower or if prompt is suppressed by !stderr.is_interactive). # only alternative would have been to make it a parameter with argument, e.g. -d="yes" or -d="no". - self.include_configs = ( - True if download_configuration else False if bool(tower) else None - ) + self.include_configs = True if download_configuration else False if bool(tower) else None # Specifying a cache index or container library implies that containers should be downloaded. - self.container_system = ( - "singularity" - if container_cache_index or bool(container_library) - else container_system - ) + self.container_system = "singularity" if container_cache_index or bool(container_library) else container_system # Manually specified container library (registry) if isinstance(container_library, str) and bool(len(container_library)): self.container_library = [container_library] @@ -140,9 +134,7 @@ def __init__( else: self.container_library = ["quay.io"] # if a container_cache_index is given, use the file and overrule choice. - self.container_cache_utilisation = ( - "remote" if container_cache_index else container_cache_utilisation - ) + self.container_cache_utilisation = "remote" if container_cache_index else container_cache_utilisation self.container_cache_index = container_cache_index # allows to specify a container library / registry or a respective mirror to download images from self.parallel_downloads = parallel_downloads @@ -165,8 +157,8 @@ def download_workflow(self): # Get workflow details try: self.prompt_pipeline_name() - self.pipeline, self.wf_revisions, self.wf_branches = ( - nf_core.utils.get_repo_releases_branches(self.pipeline, self.wfs) + self.pipeline, self.wf_revisions, self.wf_branches = nf_core.utils.get_repo_releases_branches( + self.pipeline, self.wfs ) self.prompt_revision() self.get_revision_hash() @@ -192,16 +184,9 @@ def download_workflow(self): f"Use containers: '{self.container_system}'", ] if self.container_system: - summary_log.append( - f"Container library: '{', '.join(self.container_library)}'" - ) - if ( - self.container_system == "singularity" - and os.environ.get("NXF_SINGULARITY_CACHEDIR") is not None - ): - summary_log.append( - f"Using [blue]$NXF_SINGULARITY_CACHEDIR[/]': {os.environ['NXF_SINGULARITY_CACHEDIR']}'" - ) + summary_log.append(f"Container library: '{', '.join(self.container_library)}'") + if self.container_system == "singularity" and os.environ.get("NXF_SINGULARITY_CACHEDIR") is not None: + summary_log.append(f"Using [blue]$NXF_SINGULARITY_CACHEDIR[/]': {os.environ['NXF_SINGULARITY_CACHEDIR']}'") if self.containers_remote: summary_log.append( f"Successfully read {len(self.containers_remote)} containers from the remote '$NXF_SINGULARITY_CACHEDIR' contents." @@ -219,13 +204,9 @@ def download_workflow(self): if not self.tower: # Only show entry, if option was prompted. - summary_log.append( - f"Include default institutional configuration: '{self.include_configs}'" - ) + summary_log.append(f"Include default institutional configuration: '{self.include_configs}'") else: - summary_log.append( - f"Enabled for seqeralabs® Nextflow Tower: '{self.tower}'" - ) + summary_log.append(f"Enabled for seqeralabs® Nextflow Tower: '{self.tower}'") # Check that the outdir doesn't already exist if os.path.exists(self.outdir): @@ -265,24 +246,19 @@ def download_workflow_static(self): # Download the pipeline files for each selected revision log.info("Downloading workflow files from GitHub") - for item in zip( - self.revision, self.wf_sha.values(), self.wf_download_url.values() - ): - revision_dirname = self.download_wf_files( - revision=item[0], wf_sha=item[1], download_url=item[2] - ) + for item in zip(self.revision, self.wf_sha.values(), self.wf_download_url.values()): + revision_dirname = self.download_wf_files(revision=item[0], wf_sha=item[1], download_url=item[2]) if self.include_configs: try: self.wf_use_local_configs(revision_dirname) except FileNotFoundError as e: - raise DownloadError( - "Error editing pipeline config file to use local configs!" - ) from e + raise DownloadError("Error editing pipeline config file to use local configs!") from e # Collect all required singularity images if self.container_system == "singularity": self.find_container_images(os.path.join(self.outdir, revision_dirname)) + self.gather_registries(os.path.join(self.outdir, revision_dirname)) try: self.get_singularity_images(current_revision=item[0]) @@ -303,9 +279,7 @@ def download_workflow_tower(self, location=None): remote_url=f"https://github.com/{self.pipeline}.git", revision=self.revision if self.revision else None, commit=self.wf_sha.values() if bool(self.wf_sha) else None, - location=( - location if location else None - ), # manual location is required for the tests to work + location=(location if location else None), # manual location is required for the tests to work in_cache=False, ) @@ -322,6 +296,7 @@ def download_workflow_tower(self, location=None): self.workflow_repo.checkout(commit) # Collect all required singularity images self.find_container_images(self.workflow_repo.access()) + self.gather_registries(self.workflow_repo.access()) try: self.get_singularity_images(current_revision=revision) @@ -330,17 +305,13 @@ def download_workflow_tower(self, location=None): # Justify why compression is skipped for Tower downloads (Prompt is not shown, but CLI argument could have been set) if self.compress_type is not None: - log.info( - "Compression choice is ignored for Tower downloads since nothing can be reasonably compressed." - ) + log.info("Compression choice is ignored for Tower downloads since nothing can be reasonably compressed.") def prompt_pipeline_name(self): """Prompt for the pipeline name if not set with a flag""" if self.pipeline is None: - stderr.print( - "Specify the name of a nf-core pipeline or a GitHub repository name (user/repo)." - ) + stderr.print("Specify the name of a nf-core pipeline or a GitHub repository name (user/repo).") self.pipeline = nf_core.utils.prompt_remote_pipeline_name(self.wfs) def prompt_revision(self): @@ -369,28 +340,18 @@ def prompt_revision(self): if bool(choice): # have to make sure that self.revision is a list of strings, regardless if choice is str or list of strings. - ( - self.revision.append(choice) - if isinstance(choice, str) - else self.revision.extend(choice) - ) + (self.revision.append(choice) if isinstance(choice, str) else self.revision.extend(choice)) else: if bool(tag_set): self.revision = tag_set - log.info( - "No particular revision was selected, all available will be downloaded." - ) + log.info("No particular revision was selected, all available will be downloaded.") else: - raise AssertionError( - f"No revisions of {self.pipeline} available for download." - ) + raise AssertionError(f"No revisions of {self.pipeline} available for download.") def get_revision_hash(self): """Find specified revision / branch hash""" - for ( - revision - ) in self.revision: # revision is a list of strings, but may be of length 1 + for revision in self.revision: # revision is a list of strings, but may be of length 1 # Branch if revision in self.wf_branches.keys(): self.wf_sha = {**self.wf_sha, revision: self.wf_branches[revision]} @@ -410,23 +371,15 @@ def get_revision_hash(self): "', '".join([r["tag_name"] for r in self.wf_revisions]), ) ) - log.info( - "Available {} branches: '{}'".format( - self.pipeline, "', '".join(self.wf_branches.keys()) - ) - ) - raise AssertionError( - f"Not able to find revision / branch '{revision}' for {self.pipeline}" - ) + log.info("Available {} branches: '{}'".format(self.pipeline, "', '".join(self.wf_branches.keys()))) + raise AssertionError(f"Not able to find revision / branch '{revision}' for {self.pipeline}") # Set the outdir if not self.outdir: if len(self.wf_sha) > 1: self.outdir = f"{self.pipeline.replace('/', '-').lower()}_{datetime.now().strftime('%Y-%m-%d_%H-%M')}" else: - self.outdir = ( - f"{self.pipeline.replace('/', '-').lower()}_{self.revision[0]}" - ) + self.outdir = f"{self.pipeline.replace('/', '-').lower()}_{self.revision[0]}" if not self.tower: for revision, wf_sha in self.wf_sha.items(): @@ -451,9 +404,7 @@ def prompt_container_download(self): """Prompt whether to download container images or not""" if self.container_system is None and stderr.is_interactive and not self.tower: - stderr.print( - "\nIn addition to the pipeline code, this tool can download software containers." - ) + stderr.print("\nIn addition to the pipeline code, this tool can download software containers.") self.container_system = questionary.select( "Download software container images:", choices=["none", "singularity"], @@ -484,9 +435,7 @@ def prompt_singularity_cachedir_creation(self): only_directories=True, style=nf_core.utils.nfcore_question_style, ).unsafe_ask() - cachedir_path = os.path.abspath( - os.path.expanduser(prompt_cachedir_path) - ) + cachedir_path = os.path.abspath(os.path.expanduser(prompt_cachedir_path)) if prompt_cachedir_path == "": log.error("Not using [blue]$NXF_SINGULARITY_CACHEDIR[/]") cachedir_path = False @@ -536,9 +485,7 @@ def prompt_singularity_cachedir_creation(self): + f'export NXF_SINGULARITY_CACHEDIR="{cachedir_path}"' + "\n#######################################\n" ) - log.info( - f"Successfully wrote to [blue]{shellprofile_path}[/]" - ) + log.info(f"Successfully wrote to [blue]{shellprofile_path}[/]") log.warning( "You will need reload your terminal after the download completes for this to take effect." ) @@ -546,8 +493,7 @@ def prompt_singularity_cachedir_creation(self): def prompt_singularity_cachedir_utilization(self): """Ask if we should *only* use $NXF_SINGULARITY_CACHEDIR without copying into target""" if ( - self.container_cache_utilisation - is None # no choice regarding singularity cache has been made. + self.container_cache_utilisation is None # no choice regarding singularity cache has been made. and self.container_system == "singularity" and os.environ.get("NXF_SINGULARITY_CACHEDIR") is not None and stderr.is_interactive @@ -579,13 +525,9 @@ def prompt_singularity_cachedir_remote(self): validate=SingularityCacheFilePathValidator, style=nf_core.utils.nfcore_question_style, ).unsafe_ask() - cachedir_index = os.path.abspath( - os.path.expanduser(prompt_cachedir_index) - ) + cachedir_index = os.path.abspath(os.path.expanduser(prompt_cachedir_index)) if prompt_cachedir_index == "": - log.error( - "Will disregard contents of a remote [blue]$NXF_SINGULARITY_CACHEDIR[/]" - ) + log.error("Will disregard contents of a remote [blue]$NXF_SINGULARITY_CACHEDIR[/]") self.container_cache_index = None self.container_cache_utilisation = "copy" elif not os.access(cachedir_index, os.R_OK): @@ -612,30 +554,18 @@ def read_remote_containers(self): n_total_images += 1 self.containers_remote.append(match.group(0)) if n_total_images == 0: - raise LookupError( - "Could not find valid container names in the index file." - ) + raise LookupError("Could not find valid container names in the index file.") self.containers_remote = sorted(list(set(self.containers_remote))) except (FileNotFoundError, LookupError) as e: - log.error( - f"[red]Issue with reading the specified remote $NXF_SINGULARITY_CACHE index:[/]\n{e}\n" - ) - if stderr.is_interactive and rich.prompt.Confirm.ask( - "[blue]Specify a new index file and try again?" - ): - self.container_cache_index = ( - None # reset chosen path to index file. - ) + log.error(f"[red]Issue with reading the specified remote $NXF_SINGULARITY_CACHE index:[/]\n{e}\n") + if stderr.is_interactive and rich.prompt.Confirm.ask("[blue]Specify a new index file and try again?"): + self.container_cache_index = None # reset chosen path to index file. self.prompt_singularity_cachedir_remote() else: - log.info( - "Proceeding without consideration of the remote $NXF_SINGULARITY_CACHE index." - ) + log.info("Proceeding without consideration of the remote $NXF_SINGULARITY_CACHE index.") self.container_cache_index = None if os.environ.get("NXF_SINGULARITY_CACHEDIR"): - self.container_cache_utilisation = ( - "copy" # default to copy if possible, otherwise skip. - ) + self.container_cache_utilisation = "copy" # default to copy if possible, otherwise skip. else: self.container_cache_utilisation = None @@ -677,9 +607,7 @@ def download_wf_files(self, revision, wf_sha, download_url): revision_dirname = re.sub("[^0-9a-zA-Z]+", "_", revision) # account for name collisions, if there is a branch / release named "configs" or "singularity-images" if revision_dirname in ["configs", "singularity-images"]: - revision_dirname = re.sub( - "[^0-9a-zA-Z]+", "_", self.pipeline + revision_dirname - ) + revision_dirname = re.sub("[^0-9a-zA-Z]+", "_", self.pipeline + revision_dirname) # Rename the internal directory name to be more friendly gh_name = f"{self.pipeline}-{wf_sha if bool(wf_sha) else ''}".split("/")[-1] @@ -689,9 +617,7 @@ def download_wf_files(self, revision, wf_sha, download_url): ) # Make downloaded files executable - for dirpath, _, filelist in os.walk( - os.path.join(self.outdir, revision_dirname) - ): + for dirpath, _, filelist in os.walk(os.path.join(self.outdir, revision_dirname)): for fname in filelist: os.chmod(os.path.join(dirpath, fname), 0o775) @@ -735,10 +661,7 @@ def wf_use_local_configs(self, revision_dirname): nfconfig = nfconfig.replace(find_str, repl_str) # Append the singularity.cacheDir to the end if we need it - if ( - self.container_system == "singularity" - and self.container_cache_utilisation == "copy" - ): + if self.container_system == "singularity" and self.container_cache_utilisation == "copy": nfconfig += ( f"\n\n// Added by `nf-core download` v{nf_core.__version__} //\n" + 'singularity.cacheDir = "${projectDir}/../singularity-images/"' @@ -773,9 +696,7 @@ def find_container_images(self, workflow_directory): # Find any config variables that look like a container for k, v in self.nf_config.items(): - if (k.startswith("process.") or k.startswith("params.")) and k.endswith( - ".container" - ): + if (k.startswith("process.") or k.startswith("params.")) and k.endswith(".container"): """ Can be plain string / Docker URI or DSL2 syntax @@ -792,9 +713,7 @@ def find_container_images(self, workflow_directory): if bool(config_findings_dsl2): # finding fill always be a tuple of length 2, first the quote used and second the enquoted value. for finding in config_findings_dsl2: - config_findings.append( - finding + (self.nf_config, "Nextflow configs") - ) + config_findings.append(finding + (self.nf_config, "Nextflow configs")) else: # no regex match, likely just plain string """ Append string also as finding-like tuple for consistency @@ -802,9 +721,7 @@ def find_container_images(self, workflow_directory): self.nf_config is needed, because we need to restart search over raw input if no proper container matches are found. """ - config_findings.append( - (k, v.strip('"').strip("'"), self.nf_config, "Nextflow configs") - ) + config_findings.append((k, v.strip('"').strip("'"), self.nf_config, "Nextflow configs")) # rectify the container paths found in the config # Raw config_findings may yield multiple containers, so better create a shallow copy of the list, since length of input and output may be different ?!? @@ -835,9 +752,7 @@ def find_container_images(self, workflow_directory): re.DOTALL, ) - local_module_findings = re.findall( - container_regex, search_space - ) + local_module_findings = re.findall(container_regex, search_space) # finding fill always be a tuple of length 2, first the quote used and second the enquoted value. for finding in local_module_findings: @@ -850,9 +765,7 @@ def find_container_images(self, workflow_directory): module_findings = self.rectify_raw_container_matches(module_findings[:]) # Again clean list, in case config declares Docker URI but module or previous finding already had the http:// download - self.containers = self.prioritize_direct_download( - previous_findings + config_findings + module_findings - ) + self.containers = self.prioritize_direct_download(previous_findings + config_findings + module_findings) def rectify_raw_container_matches(self, raw_findings): """Helper function to rectify the raw extracted container matches into fully qualified container names. @@ -898,7 +811,9 @@ def rectify_raw_container_matches(self, raw_findings): cleaned_matches = [] # Thanks Stack Overflow for the regex: https://stackoverflow.com/a/3809435/713980 - url_regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)" + url_regex = ( + r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)" + ) # Thanks Stack Overflow for the regex: https://stackoverflow.com/a/39672069/713980 docker_regex = r"^(?:(?=[^:\/]{1,253})(?!-)[a-zA-Z0-9-]{1,63}(? Date: Mon, 19 Feb 2024 17:46:43 +0100 Subject: [PATCH 03/12] Refactor: Always trim registry from output name. Naming collisions are presumably unlikely, names including the registry are now consistently always symlinks. --- nf_core/download.py | 63 +++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/nf_core/download.py b/nf_core/download.py index fb64b27ada..96e6469dee 100644 --- a/nf_core/download.py +++ b/nf_core/download.py @@ -133,6 +133,8 @@ def __init__( self.container_library = [*container_library] else: self.container_library = ["quay.io"] + # Create a new set and add all values from self.container_library (CLI arguments to --container-library) + self.registry_set = set(self.container_library) if hasattr(self, "container_library") else set() # if a container_cache_index is given, use the file and overrule choice. self.container_cache_utilisation = "remote" if container_cache_index else container_cache_utilisation self.container_cache_index = container_cache_index @@ -985,9 +987,6 @@ def gather_registries(self, workflow_directory): if not self.nf_config: self.nf_config = nf_core.utils.fetch_wf_config(workflow_directory) - # Create a new set and add all values from self.container_library (CLI arguments to --container-library) - self.registry_set = set(self.container_library) if hasattr(self, "container_library") else set() - # Select registries defined in pipeline config configured_registries = [ "apptainer.registry", @@ -1000,6 +999,9 @@ def gather_registries(self, workflow_directory): if registry in self.nf_config: self.registry_set.add(self.nf_config[registry]) + # add depot.galaxyproject.org to the set, because it is the default registry for singularity hardcoded in modules + self.registry_set.add("depot.galaxyproject.org") + def symlink_singularity_images(self, image_out_path): """Create a symlink for each registry in the registry set that points to the image. We have dropped the explicit registries from the modules in favor of the configurable registries. @@ -1014,35 +1016,31 @@ def symlink_singularity_images(self, image_out_path): """ if self.registry_set: - # add depot.galaxyproject.org to the set, because it is the default registry for singularity hardcoded in modules - self.registry_set.add("depot.galaxyproject.org") - - # Create a regex pattern from the set + # Create a regex pattern from the set, in case trimming is needed. trim_pattern = "|".join(f"{re.escape(registry)}-?" for registry in self.registry_set) - # Use the pattern to trim the string - trimmed_image_name = re.sub(f"^{trim_pattern}", "", os.path.basename(image_out_path)) - for registry in self.registry_set: - # avoid symlinking the same file to itself if not os.path.basename(image_out_path).startswith(registry): - symlink_name = f"./{registry}-{trimmed_image_name}" - - symlink_full = os.path.join(os.path.dirname(image_out_path), symlink_name) - target_name = os.path.join("./", os.path.basename(image_out_path)) - - if not os.path.exists(symlink_full): - os.makedirs(os.path.dirname(symlink_full), exist_ok=True) - image_dir = os.open(os.path.dirname(image_out_path), os.O_RDONLY) - try: - os.symlink( - target_name, - symlink_name, - dir_fd=image_dir, - ) - log.debug(f"Symlinked {target_name} as {symlink_name}.") - finally: - os.close(image_dir) + symlink_name = f"./{registry}-{os.path.basename(image_out_path)}" + else: + trimmed_name = re.sub(f"^{trim_pattern}", "", os.path.basename(image_out_path)) + symlink_name = f"./{registry}-{trimmed_name}" + + symlink_full = os.path.join(os.path.dirname(image_out_path), symlink_name) + target_name = os.path.join("./", os.path.basename(image_out_path)) + + if not os.path.exists(symlink_full): + os.makedirs(os.path.dirname(symlink_full), exist_ok=True) + image_dir = os.open(os.path.dirname(image_out_path), os.O_RDONLY) + try: + os.symlink( + target_name, + symlink_name, + dir_fd=image_dir, + ) + log.debug(f"Symlinked {target_name} as {symlink_name}.") + finally: + os.close(image_dir) def get_singularity_images(self, current_revision=""): """Loop through container names and download Singularity images""" @@ -1234,6 +1232,15 @@ def singularity_image_filenames(self, container): # Add file extension out_name = out_name + extension + # Trim potential registries from the name for consistency. + # This will allow pipelines to work offline without symlinked images, + # if docker.registry / singularity.registry are set to empty strings at runtime, which can be included in the HPC config profiles easily. + if self.registry_set: + # Create a regex pattern from the set of registries + trim_pattern = "|".join(f"{re.escape(registry)}-?" for registry in self.registry_set) + # Use the pattern to trim the string + out_name = re.sub(f"^{trim_pattern}", "", out_name) + # Full destination and cache paths out_path = os.path.abspath(os.path.join(self.outdir, "singularity-images", out_name)) cache_path = None From 7954bcbc5fc541cecb4961eed5fd5df1b6b17c85 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Tue, 20 Feb 2024 16:00:40 +0100 Subject: [PATCH 04/12] Modify the pytest.yml action such that Singularity is used for the nf-core download tests. --- .github/workflows/pytest.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 5faeef49da..d5345bdb33 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -108,6 +108,12 @@ jobs: sudo add-apt-repository --remove ppa:git-core/ppa sudo apt install -y git + - name: Set up Singularity + if: ${{ matrix.test == 'test_download.py'}} + uses: eWaterCycle/setup-singularity@931d4e31109e875b13309ae1d07c70ca8fbc8537 # v7 + with: + singularity-version: 3.8.3 + - name: Get current date id: date run: echo "date=$(date +'%Y-%m')" >> $GITHUB_ENV From 5cfccd3357839757d346e62987731e1183bcf453 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Tue, 20 Feb 2024 18:26:29 +0100 Subject: [PATCH 05/12] Extend ContainerError to match ghcr.io error message. --- nf_core/download.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nf_core/download.py b/nf_core/download.py index 96e6469dee..14fa907426 100644 --- a/nf_core/download.py +++ b/nf_core/download.py @@ -1722,6 +1722,7 @@ def __init__( elif ( re.search(r"requested\saccess\sto\sthe\sresource\sis\sdenied", line) or re.search(r"StatusCode:\s404", line) + or re.search(r"400|Bad\s?Request", line) or re.search(r"invalid\sstatus\scode\sfrom\sregistry\s400", line) ): # Unfortunately, every registry seems to return an individual error here: From 08170240c6512cbcc4fc43f811051a90aef01a13 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 21 Feb 2024 11:51:36 +0100 Subject: [PATCH 06/12] Add test_gather_registries() to test_download.py --- nf_core/download.py | 6 +++--- tests/test_download.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/nf_core/download.py b/nf_core/download.py index 14fa907426..cb2fb173b2 100644 --- a/nf_core/download.py +++ b/nf_core/download.py @@ -713,7 +713,7 @@ def find_container_images(self, workflow_directory): config_findings_dsl2 = re.findall(config_regex, v) if bool(config_findings_dsl2): - # finding fill always be a tuple of length 2, first the quote used and second the enquoted value. + # finding will always be a tuple of length 2, first the quote used and second the enquoted value. for finding in config_findings_dsl2: config_findings.append(finding + (self.nf_config, "Nextflow configs")) else: # no regex match, likely just plain string @@ -1021,10 +1021,10 @@ def symlink_singularity_images(self, image_out_path): for registry in self.registry_set: if not os.path.basename(image_out_path).startswith(registry): - symlink_name = f"./{registry}-{os.path.basename(image_out_path)}" + symlink_name = f"{registry}-{os.path.basename(image_out_path)}" else: trimmed_name = re.sub(f"^{trim_pattern}", "", os.path.basename(image_out_path)) - symlink_name = f"./{registry}-{trimmed_name}" + symlink_name = f"{registry}-{trimmed_name}" symlink_full = os.path.join(os.path.dirname(image_out_path), symlink_name) target_name = os.path.join("./", os.path.basename(image_out_path)) diff --git a/tests/test_download.py b/tests/test_download.py index 7f34f7fbc6..826884b618 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -352,6 +352,38 @@ def test_get_singularity_images(self, tmp_path, mock_fetch_wf_config): # Test that they are all caught inside get_singularity_images(). download_obj.get_singularity_images() + # + # Test for gather_registries' + # + @with_temporary_folder + @mock.patch("nf_core.utils.fetch_wf_config") + def test_gather_registries(self, tmp_path, mock_fetch_wf_config): + download_obj = DownloadWorkflow( + pipeline="dummy", + outdir=tmp_path, + container_library=None, + ) + mock_fetch_wf_config.return_value = { + "apptainer.registry": "apptainer-registry.io", + "docker.registry": "docker.io", + "podman.registry": "podman-registry.io", + "singularity.registry": "singularity-registry.io", + "someother.registry": "fake-registry.io", + } + download_obj.gather_registries(tmp_path) + assert download_obj.registry_set + assert isinstance(download_obj.registry_set, set) + assert len(download_obj.registry_set) == 6 + + assert "quay.io" in download_obj.registry_set # default registry, if no container library is provided. + assert "depot.galaxyproject.org" in download_obj.registry_set # default registry, often hardcoded in modules + assert "apptainer-registry.io" in download_obj.registry_set + assert "docker.io" in download_obj.registry_set + assert "podman-registry.io" in download_obj.registry_set + assert "singularity-registry.io" in download_obj.registry_set + # it should only pull the apptainer, docker, podman and singularity registry from the config, but not any registry. + assert "fake-registry.io" not in download_obj.registry_set + # If Singularity is not installed, it raises a OSError because the singularity command can't be found. @pytest.mark.skipif( shutil.which("singularity") is not None, From d1fb738282771b400ce6f2779024003017da553c Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 21 Feb 2024 21:40:54 +0100 Subject: [PATCH 07/12] Finally a comprehensive test for symlink_singularity_images() --- nf_core/download.py | 6 ++--- tests/test_download.py | 56 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/nf_core/download.py b/nf_core/download.py index cb2fb173b2..eac0516fb6 100644 --- a/nf_core/download.py +++ b/nf_core/download.py @@ -1021,15 +1021,15 @@ def symlink_singularity_images(self, image_out_path): for registry in self.registry_set: if not os.path.basename(image_out_path).startswith(registry): - symlink_name = f"{registry}-{os.path.basename(image_out_path)}" + symlink_name = os.path.join("./", f"{registry}-{os.path.basename(image_out_path)}") else: trimmed_name = re.sub(f"^{trim_pattern}", "", os.path.basename(image_out_path)) - symlink_name = f"{registry}-{trimmed_name}" + symlink_name = os.path.join("./", f"{registry}-{trimmed_name}") symlink_full = os.path.join(os.path.dirname(image_out_path), symlink_name) target_name = os.path.join("./", os.path.basename(image_out_path)) - if not os.path.exists(symlink_full): + if not os.path.exists(symlink_full) and target_name != symlink_name: os.makedirs(os.path.dirname(symlink_full), exist_ok=True) image_dir = os.open(os.path.dirname(image_out_path), os.O_RDONLY) try: diff --git a/tests/test_download.py b/tests/test_download.py index 826884b618..3a11f4d41a 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -352,6 +352,60 @@ def test_get_singularity_images(self, tmp_path, mock_fetch_wf_config): # Test that they are all caught inside get_singularity_images(). download_obj.get_singularity_images() + @with_temporary_folder + @mock.patch("os.makedirs") + @mock.patch("os.symlink") + @mock.patch("os.open") + @mock.patch("os.close") + @mock.patch("re.sub") + @mock.patch("os.path.basename") + @mock.patch("os.path.dirname") + def test_symlink_singularity_images( + self, + tmp_path, + mock_dirname, + mock_basename, + mock_resub, + mock_close, + mock_open, + mock_symlink, + mock_makedirs, + ): + # Setup + mock_resub.return_value = "singularity-image.img" + mock_dirname.return_value = f"{tmp_path}/path/to" + mock_basename.return_value = "quay.io-singularity-image.img" + mock_open.return_value = 12 # file descriptor + mock_close.return_value = 12 # file descriptor + + download_obj = DownloadWorkflow( + pipeline="dummy", + outdir=tmp_path, + container_library=("mirage-the-imaginative-registry.io", "quay.io"), + ) + + # Call the method + download_obj.symlink_singularity_images(f"{tmp_path}/path/to/quay.io-singularity-image.img") + print(mock_resub.call_args) + + # Check that os.makedirs was called with the correct arguments + mock_makedirs.assert_any_call(f"{tmp_path}/path/to", exist_ok=True) + + # Check that os.open was called with the correct arguments + mock_open.assert_called_once_with(f"{tmp_path}/path/to", os.O_RDONLY) + + # Check that os.symlink was called with the correct arguments + mock_symlink.assert_any_call( + "./quay.io-singularity-image.img", + "./mirage-the-imaginative-registry.io-quay.io-singularity-image.img", + dir_fd=12, + ) + # Check that there is no attempt to symlink to itself (test parameters would result in that behavior if not checked in the function) + assert ( + unittest.mock.call("./quay.io-singularity-image.img", "./quay.io-singularity-image.img", dir_fd=12) + not in mock_symlink.call_args_list + ) + # # Test for gather_registries' # @@ -384,7 +438,9 @@ def test_gather_registries(self, tmp_path, mock_fetch_wf_config): # it should only pull the apptainer, docker, podman and singularity registry from the config, but not any registry. assert "fake-registry.io" not in download_obj.registry_set + # # If Singularity is not installed, it raises a OSError because the singularity command can't be found. + # @pytest.mark.skipif( shutil.which("singularity") is not None, reason="Can't test how the code behaves when singularity is not installed if it is.", From 5a3fa12bdcaf557675f0495905cd6aeafb6a472f Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 21 Feb 2024 22:12:54 +0100 Subject: [PATCH 08/12] Changelog. --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89468b4cb0..f6c0b056b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ - Remove obsolete editor settings in `devcontainer.json` and `gitpod.yml` ([#2795](https://github.com/nf-core/tools/pull/2795)) +### Download + +- Improved offline container image resolution by introducing symlinks, fixes issues [#2751](https://github.com/nf-core/tools/issues/2751), [#2644](https://github.com/nf-core/tools/issues/2644) and [demultiplex#164](https://github.com/nf-core/demultiplex/issues/164): ([#2768](https://github.com/nf-core/tools/pull/2768)) + ### Linting ### Components From d1179a48628e1c7048c62bfd91382453b564f56c Mon Sep 17 00:00:00 2001 From: Matthias Zepper <6963520+MatthiasZepper@users.noreply.github.com> Date: Mon, 26 Feb 2024 12:15:24 +0100 Subject: [PATCH 09/12] Apply @Aratz 's suggestions from code review Co-authored-by: Adrien Coulier --- nf_core/download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nf_core/download.py b/nf_core/download.py index eac0516fb6..4afee7fab0 100644 --- a/nf_core/download.py +++ b/nf_core/download.py @@ -1023,7 +1023,7 @@ def symlink_singularity_images(self, image_out_path): if not os.path.basename(image_out_path).startswith(registry): symlink_name = os.path.join("./", f"{registry}-{os.path.basename(image_out_path)}") else: - trimmed_name = re.sub(f"^{trim_pattern}", "", os.path.basename(image_out_path)) + trimmed_name = re.sub(f"^({trim_pattern})", "", os.path.basename(image_out_path)) symlink_name = os.path.join("./", f"{registry}-{trimmed_name}") symlink_full = os.path.join(os.path.dirname(image_out_path), symlink_name) @@ -1239,7 +1239,7 @@ def singularity_image_filenames(self, container): # Create a regex pattern from the set of registries trim_pattern = "|".join(f"{re.escape(registry)}-?" for registry in self.registry_set) # Use the pattern to trim the string - out_name = re.sub(f"^{trim_pattern}", "", out_name) + out_name = re.sub(f"^({trim_pattern})", "", out_name) # Full destination and cache paths out_path = os.path.abspath(os.path.join(self.outdir, "singularity-images", out_name)) From e3c7f5b9dd454260806cfbde3292c14611d506fb Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Mon, 26 Feb 2024 19:28:04 +0100 Subject: [PATCH 10/12] Improved the documentation of the singularity_image_filenames() function. --- nf_core/download.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/nf_core/download.py b/nf_core/download.py index 4afee7fab0..31192676c0 100644 --- a/nf_core/download.py +++ b/nf_core/download.py @@ -1017,13 +1017,13 @@ def symlink_singularity_images(self, image_out_path): if self.registry_set: # Create a regex pattern from the set, in case trimming is needed. - trim_pattern = "|".join(f"{re.escape(registry)}-?" for registry in self.registry_set) + trim_pattern = "|".join(f"^{re.escape(registry)}-?" for registry in self.registry_set) for registry in self.registry_set: if not os.path.basename(image_out_path).startswith(registry): symlink_name = os.path.join("./", f"{registry}-{os.path.basename(image_out_path)}") else: - trimmed_name = re.sub(f"^({trim_pattern})", "", os.path.basename(image_out_path)) + trimmed_name = re.sub(f"{trim_pattern}", "", os.path.basename(image_out_path)) symlink_name = os.path.join("./", f"{registry}-{trimmed_name}") symlink_full = os.path.join(os.path.dirname(image_out_path), symlink_name) @@ -1209,8 +1209,11 @@ def singularity_image_filenames(self, container): or a Docker Hub repository ID. Returns: - results (bool, str): Returns True if we have the image in the target location. - Returns a download path if not. + tuple (str, str): Returns a tuple of (out_path, cache_path). + out_path is the final target output path. it may point to the NXF_SINGULARITY_CACHEDIR, if cache utilisation was set to 'amend'. + If cache utilisation was set to 'copy', it will point to the target folder, a subdirectory of the output directory. In the latter case, + cache_path may either be None (image is not yet cached locally) or point to the image in the NXF_SINGULARITY_CACHEDIR, so it will not be + downloaded from the web again, but directly copied from there. See get_singularity_images() for implementation. """ # Generate file paths @@ -1237,9 +1240,9 @@ def singularity_image_filenames(self, container): # if docker.registry / singularity.registry are set to empty strings at runtime, which can be included in the HPC config profiles easily. if self.registry_set: # Create a regex pattern from the set of registries - trim_pattern = "|".join(f"{re.escape(registry)}-?" for registry in self.registry_set) + trim_pattern = "|".join(f"^{re.escape(registry)}-?" for registry in self.registry_set) # Use the pattern to trim the string - out_name = re.sub(f"^({trim_pattern})", "", out_name) + out_name = re.sub(f"{trim_pattern}", "", out_name) # Full destination and cache paths out_path = os.path.abspath(os.path.join(self.outdir, "singularity-images", out_name)) From dceb7b3686fc2f38056cb5bbead658beb174fb9e Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Tue, 27 Feb 2024 12:49:00 +0100 Subject: [PATCH 11/12] Added a test for the so far untested singularity_image_filenames() function. --- tests/test_download.py | 62 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tests/test_download.py b/tests/test_download.py index 3a11f4d41a..d823040247 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -454,6 +454,68 @@ def test_singularity_pull_image_singularity_not_installed(self, tmp_dir, mock_ri "a-container", f"{tmp_dir}/anothercontainer.sif", None, "quay.io", mock_rich_progress ) + # + # Test for 'singularity_image_filenames' function + # + @with_temporary_folder + def test_singularity_image_filenames(self, tmp_path): + os.environ["NXF_SINGULARITY_CACHEDIR"] = f"{tmp_path}/cachedir" + + download_obj = DownloadWorkflow(pipeline="dummy", outdir=tmp_path) + download_obj.outdir = tmp_path + download_obj.container_cache_utilisation = "amend" + download_obj.registry_set = {"docker.io", "quay.io", "depot.galaxyproject.org"} + + ## Test phase I: Container not yet cached, should be amended to cache + # out_path: str, Path to cache + # cache_path: None + + result = download_obj.singularity_image_filenames( + "https://depot.galaxyproject.org/singularity/bbmap:38.93--he522d1c_0" + ) + + # Assert that the result is a tuple of length 2 + self.assertIsInstance(result, tuple) + self.assertEqual(len(result), 2) + + # Assert that the types of the elements are (str, None) + self.assertTrue(all((isinstance(element, str), element is None) for element in result)) + + # assert that the correct out_path is returned that points to the cache + assert result[0].endswith("/cachedir/singularity-bbmap-38.93--he522d1c_0.img") + + ## Test phase II: Test various container names + # out_path: str, Path to cache + # cache_path: None + result = download_obj.singularity_image_filenames( + "quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0" + ) + assert result[0].endswith( + "/cachedir/biocontainers-mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2-59cdd445419f14abac76b31dd0d71217994cbcc9-0.img" + ) + + result = download_obj.singularity_image_filenames("nf-core/ubuntu:20.04") + assert result[0].endswith("/cachedir/nf-core-ubuntu-20.04.img") + + ## Test phase III: Container wil lbe cached but also copied to out_path + # out_path: str, Path to cache + # cache_path: str, Path to cache + download_obj.container_cache_utilisation = "copy" + result = download_obj.singularity_image_filenames( + "https://depot.galaxyproject.org/singularity/bbmap:38.93--he522d1c_0" + ) + + self.assertTrue(all(isinstance(element, str) for element in result)) + assert result[0].endswith("/singularity-images/singularity-bbmap-38.93--he522d1c_0.img") + assert result[1].endswith("/cachedir/singularity-bbmap-38.93--he522d1c_0.img") + + ## Test phase IV: Expect an error if no NXF_SINGULARITY_CACHEDIR is defined + os.environ["NXF_SINGULARITY_CACHEDIR"] = "" + with self.assertRaises(FileNotFoundError): + download_obj.singularity_image_filenames( + "https://depot.galaxyproject.org/singularity/bbmap:38.93--he522d1c_0" + ) + # # Test for '--singularity-cache remote --singularity-cache-index'. Provide a list of containers already available in a remote location. # From e64f1a7e9ea6905926949ca4373b560358ad0cf2 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Tue, 27 Feb 2024 17:47:19 +0100 Subject: [PATCH 12/12] Added a few type hints and opened Pandoras box. --- nf_core/download.py | 60 +++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/nf_core/download.py b/nf_core/download.py index 31192676c0..d08e0ba40e 100644 --- a/nf_core/download.py +++ b/nf_core/download.py @@ -10,6 +10,7 @@ import tarfile import textwrap from datetime import datetime +from typing import List, Optional, Tuple from zipfile import ZipFile import git @@ -978,7 +979,7 @@ def prioritize_direct_download(self, container_list): d[k] = c return sorted(list(d.values())) - def gather_registries(self, workflow_directory): + def gather_registries(self, workflow_directory: str) -> None: """Fetch the registries from the pipeline config and CLI arguments and store them in a set. This is needed to symlink downloaded container images so Nextflow will find them. """ @@ -1002,7 +1003,7 @@ def gather_registries(self, workflow_directory): # add depot.galaxyproject.org to the set, because it is the default registry for singularity hardcoded in modules self.registry_set.add("depot.galaxyproject.org") - def symlink_singularity_images(self, image_out_path): + def symlink_singularity_images(self, image_out_path: str) -> None: """Create a symlink for each registry in the registry set that points to the image. We have dropped the explicit registries from the modules in favor of the configurable registries. Unfortunately, Nextflow still expects the registry to be part of the file name, so a symlink is needed. @@ -1042,7 +1043,7 @@ def symlink_singularity_images(self, image_out_path): finally: os.close(image_dir) - def get_singularity_images(self, current_revision=""): + def get_singularity_images(self, current_revision: str = "") -> None: """Loop through container names and download Singularity images""" if len(self.containers) == 0: @@ -1060,10 +1061,10 @@ def get_singularity_images(self, current_revision=""): ) # Organise containers based on what we need to do with them - containers_exist = [] - containers_cache = [] - containers_download = [] - containers_pull = [] + containers_exist: List[str] = [] + containers_cache: List[Tuple[str, str, Optional[str]]] = [] + containers_download: List[Tuple[str, str, Optional[str]]] = [] + containers_pull: List[Tuple[str, str, Optional[str]]] = [] for container in self.containers: # Fetch the output and cached filenames for this container out_path, cache_path = self.singularity_image_filenames(container) @@ -1086,16 +1087,16 @@ def get_singularity_images(self, current_revision=""): # We have a copy of this in the NXF_SINGULARITY_CACHE dir if cache_path and os.path.exists(cache_path): - containers_cache.append([container, out_path, cache_path]) + containers_cache.append((container, out_path, cache_path)) continue # Direct download within Python if container.startswith("http"): - containers_download.append([container, out_path, cache_path]) + containers_download.append((container, out_path, cache_path)) continue # Pull using singularity - containers_pull.append([container, out_path, cache_path]) + containers_pull.append((container, out_path, cache_path)) # Exit if we need to pull images and Singularity is not installed if len(containers_pull) > 0: @@ -1127,8 +1128,8 @@ def get_singularity_images(self, current_revision=""): # Kick off concurrent downloads future_downloads = [ - pool.submit(self.singularity_download_image, *container, progress) - for container in containers_download + pool.submit(self.singularity_download_image, *containers, progress) + for containers in containers_download ] # Make ctrl-c work with multi-threading @@ -1153,13 +1154,13 @@ def get_singularity_images(self, current_revision=""): # Re-raise exception on the main thread raise - for container in containers_pull: + for containers in containers_pull: progress.update(task, description="Pulling singularity images") # it is possible to try multiple registries / mirrors if multiple were specified. # Iteration happens over a copy of self.container_library[:], as I want to be able to remove failing registries for subsequent images. for library in self.container_library[:]: try: - self.singularity_pull_image(*container, library, progress) + self.singularity_pull_image(*containers, library, progress) # Pulling the image was successful, no ContainerError was raised, break the library loop break except ContainerError.ImageExistsError: @@ -1196,12 +1197,12 @@ def get_singularity_images(self, current_revision=""): # The else clause executes after the loop completes normally. # This means the library loop completed without breaking, indicating failure for all libraries (registries) log.error( - f"Not able to pull image of {container}. Service might be down or internet connection is dead." + f"Not able to pull image of {containers}. Service might be down or internet connection is dead." ) # Task should advance in any case. Failure to pull will not kill the download process. progress.update(task, advance=1) - def singularity_image_filenames(self, container): + def singularity_image_filenames(self, container: str) -> Tuple[str, Optional[str]]: """Check Singularity cache for image, copy to destination folder if found. Args: @@ -1258,7 +1259,7 @@ def singularity_image_filenames(self, container): return (out_path, cache_path) - def singularity_copy_cache_image(self, container, out_path, cache_path): + def singularity_copy_cache_image(self, container: str, out_path: str, cache_path: Optional[str]) -> None: """Copy Singularity image from NXF_SINGULARITY_CACHEDIR to target folder.""" # Copy to destination folder if we have a cached version if cache_path and os.path.exists(cache_path): @@ -1267,7 +1268,9 @@ def singularity_copy_cache_image(self, container, out_path, cache_path): # Create symlinks to ensure that the images are found even with different registries being used. self.symlink_singularity_images(out_path) - def singularity_download_image(self, container, out_path, cache_path, progress): + def singularity_download_image( + self, container: str, out_path: str, cache_path: Optional[str], progress: DownloadProgress + ) -> None: """Download a singularity image from the web. Use native Python to download the file. @@ -1314,7 +1317,6 @@ def singularity_download_image(self, container, out_path, cache_path, progress): # Rename partial filename to final filename os.rename(output_path_tmp, output_path) - output_path_tmp = None # Copy cached download if we are using the cache if cache_path: @@ -1339,8 +1341,12 @@ def singularity_download_image(self, container, out_path, cache_path, progress): os.remove(output_path) # Re-raise the caught exception raise + finally: + del output_path_tmp - def singularity_pull_image(self, container, out_path, cache_path, library, progress): + def singularity_pull_image( + self, container: str, out_path: str, cache_path: Optional[str], library: List[str], progress: DownloadProgress + ) -> None: """Pull a singularity image using ``singularity pull`` Attempt to use a local installation of singularity to pull the image. @@ -1355,6 +1361,11 @@ def singularity_pull_image(self, container, out_path, cache_path, library, progr """ output_path = cache_path or out_path + # where the output of 'singularity pull' is first generated before being copied to the NXF_SINGULARITY_CACHDIR. + # if not defined by the Singularity administrators, then use the temporary directory to avoid storing the images in the work directory. + if os.environ.get("SINGULARITY_CACHEDIR") is None: + os.environ["SINGULARITY_CACHEDIR"] = NFCORE_CACHE_DIR + # Sometimes, container still contain an explicit library specification, which # resulted in attempted pulls e.g. from docker://quay.io/quay.io/qiime2/core:2022.11 # Thus, if an explicit registry is specified, the provided -l value is ignored. @@ -1399,9 +1410,10 @@ def singularity_pull_image(self, container, out_path, cache_path, library, progr bufsize=1, ) as proc: lines = [] - for line in proc.stdout: - lines.append(line) - progress.update(task, current_log=line.strip()) + if proc.stdout is not None: + for line in proc.stdout: + lines.append(line) + progress.update(task, current_log=line.strip()) if lines: # something went wrong with the container retrieval @@ -1428,7 +1440,7 @@ def singularity_pull_image(self, container, out_path, cache_path, library, progr progress.remove_task(task) - def compress_download(self): + def compress_download(self) -> None: """Take the downloaded files and make a compressed .tar.gz archive.""" log.debug(f"Creating archive: {self.output_filename}")