diff --git a/scripts/clean_tests.sh b/scripts/clean_tests.sh
index f657c15..5a84e0b 100755
--- a/scripts/clean_tests.sh
+++ b/scripts/clean_tests.sh
@@ -38,7 +38,6 @@ directories=(
   "$dataset_multimodal_breast/tests/test005/"
   "$dicom_images_breast/data/meta/pre/"
   "$dicom_images_breast/data/meta/post/"
-  "$dicom_images_breast/data/mapping/"
   "$dicom_images_breast/data/logs/"
 )
diff --git a/scripts/move_back_to_unexplored.sh b/scripts/move_back_to_unexplored.sh
new file mode 100755
index 0000000..cd8ed4e
--- /dev/null
+++ b/scripts/move_back_to_unexplored.sh
@@ -0,0 +1,100 @@
+#!/bin/bash
+#
+# Author: Francisco Maria Calisto
+# Maintainer: Francisco Maria Calisto
+# Email: francisco.calisto@tecnico.ulisboa.pt
+# License: ACADEMIC & COMMERCIAL
+# Created Date: 2024-09-22
+# Revised Date: 2024-09-22  # Updated to reflect file limit via variable
+# Version: 1.2  # Incremented to reflect the variable-based file limit
+# Status: Development
+# Credits:
+#   - Carlos Santiago
+#   - Catarina Barata
+#   - Jacinto C. Nascimento
+#   - Diogo Araújo
+# Usage: ./move_back_to_unexplored.sh
+# Example: ./scripts/move_back_to_unexplored.sh
+# Description: This script moves a limited number of DICOM files from the "checking" folder
+# to the "unexplored" folder in the dataset-multimodal-breast repository. It includes error
+# handling, logging, and batch file processing with a configurable file limit.
+
+# Exit script on any command failure to ensure safe execution
+set -e
+
+# Define the home directory using the system's HOME environment variable
+home="$HOME"
+
+# Define the file limit for the number of DICOM files to move
+FILE_LIMIT=10  # Adjust this value to change the file limit
+
+# Define source and destination directories (using realpath for absolute paths)
+SRC_DIR="$(realpath "$home/Git/dataset-multimodal-breast/data/curation/checking")"
+DEST_DIR="$(realpath "$home/Git/dataset-multimodal-breast/data/curation/unexplored")"
+LOG_DIR="$(realpath "$home/Git/dataset-multimodal-breast/data/curation/logs")"
+
+# Ensure the logs directory exists, create it if necessary
+if [ ! -d "$LOG_DIR" ]; then
+  mkdir -p "$LOG_DIR"
+fi
+
+# Create a timestamp for the log file to avoid overwriting previous logs
+LOG_FILE="$LOG_DIR/move_back_$(date +'%Y%m%d_%H%M%S').log"
+
+# Function to log messages to both the terminal and log file
+log_message() {
+  echo "$1" | tee -a "$LOG_FILE"
+}
+
+# Function to validate directory existence
+# Arguments:
+#   $1: Directory path to validate
+#   $2: Directory name for logging purposes
+validate_directory() {
+  local dir="$1"
+  local dir_name="$2"
+
+  if [ ! -d "$dir" ]; then
+    log_message "Error: $dir_name directory $dir does not exist. Exiting."
+    exit 1
+  fi
+}
+
+# Function to move DICOM files from source to destination, respecting the file limit
+# Arguments:
+#   $1: Source directory
+#   $2: Destination directory
+#   $3: File limit
+move_files() {
+  local src="$1"
+  local dest="$2"
+  local limit="$3"
+  local count=0
+
+  log_message "Moving up to $limit DICOM files from $src to $dest..."
+
+  # Find and move only DICOM files (.dcm extension), stopping at the limit.
+  # Process substitution keeps the loop in the current shell, so $count survives;
+  # a "find | while" pipeline would run the loop in a subshell and always report 0.
+  while IFS= read -r file; do
+    if (( count >= limit )); then
+      break
+    fi
+    mv "$file" "$dest"
+    log_message "$(date): Moved $file"
+    count=$((count + 1))  # ((count++)) would return non-zero on 0 and trip set -e
+  done < <(find "$src" -type f -name "*.dcm")
+
+  log_message "Moved $count DICOM files from $src to $dest."
+}
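+
+# Optional sanity check: preview which files the next run would pick up, e.g.:
+#   find "$SRC_DIR" -type f -name "*.dcm" | head -n "$FILE_LIMIT"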
+
+# Main execution begins here
+
+# Validate that both source and destination directories exist
+validate_directory "$SRC_DIR" "Checking"
+validate_directory "$DEST_DIR" "Unexplored"
+
+# Move DICOM files from the "checking" folder to the "unexplored" folder, respecting the file limit
+move_files "$SRC_DIR" "$DEST_DIR" "$FILE_LIMIT"
+
+# Log completion message
+log_message "File move operation completed successfully. Logs are saved to $LOG_FILE."
+
+# End of script
\ No newline at end of file
diff --git a/scripts/move_files.sh b/scripts/move_files.sh
index 17b25b9..8d8fb25 100755
--- a/scripts/move_files.sh
+++ b/scripts/move_files.sh
@@ -5,8 +5,8 @@
 # Email: francisco.calisto@tecnico.ulisboa.pt
 # License: ACADEMIC & COMMERCIAL
 # Created Date: 2024-09-21
-# Revised Date: 2024-09-22  # Updated to reflect optimizations and improvements
-# Version: 1.5  # Incremented version to reflect new logging location
+# Revised Date: 2024-09-23  # Updated to reflect optimizations and improvements
+# Version: 1.6  # Incremented version to reflect additional logging and optimizations
 # Status: Development
 # Credits:
 #   - Carlos Santiago
@@ -19,59 +19,75 @@
 # inside the dataset-multimodal-breast repository. It handles large datasets by processing files in batches,
 # offers parallelism for speed, checks disk space, and logs errors in the curation/logs folder.
 
-# Exit script on any command failure
+# Exit script on any command failure to ensure safe execution
 set -e
 
-# Define home directory using the system's HOME environment variable
+# Define the home directory using the system's HOME environment variable
 home="$HOME"
 
-# Resolve the absolute paths for source, destination, and log directories
-SRC_DIR="$(realpath "$home/Git/dataset-multimodal-breast/data/curation/unexplored")"
-DEST_DIR="$(realpath "$home/Git/dataset-multimodal-breast/data/curation/checking")"
+# Log file with timestamp to prevent overwriting previous logs
+timestamp=$(date +"%Y%m%d_%H%M%S")
 LOG_DIR="$(realpath "$home/Git/dataset-multimodal-breast/data/curation/logs")"
+LOG_FILE="$LOG_DIR/move_files_$timestamp.log"
 
 # Ensure the logs directory exists, create if it doesn't
 if [ ! -d "$LOG_DIR" ]; then
   mkdir -p "$LOG_DIR"
 fi
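+# Each run now writes its own log; a hypothetical example name:
+#   move_files_20240923_153045.log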
-d "$LOG_DIR" ]; then mkdir -p "$LOG_DIR" fi -# Log file to capture details of file moves and errors -LOG_FILE="$LOG_DIR/move_files.log" +# Log the beginning of the script execution +echo "$(date): Starting move_files.sh script" >> "$LOG_FILE" + +# Define the absolute paths for source and destination directories +SRC_DIR="$(realpath "$home/Git/dataset-multimodal-breast/data/curation/unexplored")" +DEST_DIR="$(realpath "$home/Git/dataset-multimodal-breast/data/curation/checking")" + +# Log the source and destination directories +echo "$(date): Source directory: $SRC_DIR" >> "$LOG_FILE" +echo "$(date): Destination directory: $DEST_DIR" >> "$LOG_FILE" + +# Function to log both errors and standard messages +log_message() { + echo "$1" + echo "$(date): $1" >> "$LOG_FILE" +} -# Function to print error messages to stderr and log them +# Function to log errors, print to stderr, and log to the log file # Arguments: # $1: The error message to display and log -function print_error { +print_error() { echo "$1" >&2 # Print the error message to stderr - echo "$(date): $1" >> "$LOG_FILE" # Append the error message to the log file + echo "$(date): ERROR: $1" >> "$LOG_FILE" # Log the error message to the log file } -# Function to validate that a directory exists +# Function to validate the existence of a directory # Arguments: -# $1: The path of the directory to validate -# $2: A friendly name for the directory (e.g., "Source", "Destination") +# $1: The directory path +# $2: Friendly name for the directory (e.g., "Source", "Destination") validate_directory() { - local dir_path="$1" # Directory to check - local dir_name="$2" # Friendly name for logging and messages - - # Check if the directory exists and is a valid directory + local dir_path="$1" + local dir_name="$2" if [ ! -d "$dir_path" ]; then print_error "$dir_name directory $dir_path does not exist. Exiting." - exit 1 # Exit the script if the directory doesn't exist + exit 1 + else + log_message "$dir_name directory exists: $dir_path" fi } -# Function to check disk space before moving files +# Function to check if sufficient disk space is available before moving files # Arguments: # $1: The required minimum free space in kilobytes (e.g., 10485760 for 10GB) check_disk_space() { local required_space="$1" local available_space=$(df "$DEST_DIR" | awk 'NR==2 {print $4}') - # Check if there is enough available space + # Check if the available disk space is sufficient if (( available_space < required_space )); then print_error "Not enough disk space. Available: ${available_space}KB, Required: ${required_space}KB. Exiting." exit 1 + else + log_message "Sufficient disk space available: ${available_space}KB" fi } @@ -83,51 +99,53 @@ move_files_in_batches() { local src="$1" local dest="$2" local count=0 + local BATCH_SIZE=100 # Customize the batch size for optimal performance - echo "Moving files from $src to $dest in batches of $BATCH_SIZE..." + log_message "Moving files from $src to $dest in batches of $BATCH_SIZE..." # Find all files in the source directory and move them in batches find "$src" -type f | while IFS= read -r file; do if [ -f "$file" ]; then - mv "$file" "$dest" # Move file to the destination - echo "$(date): Moved $file" >> "$LOG_FILE" + mv "$file" "$dest" + log_message "Moved file: $file" ((count++)) - # Check if we've reached the batch size + + # Log progress every batch if (( count % BATCH_SIZE == 0 )); then - echo "Moved $count files so far..." 
 
-  # Check if there is enough available space
+  # Check if the available disk space is sufficient
   if (( available_space < required_space )); then
     print_error "Not enough disk space. Available: ${available_space}KB, Required: ${required_space}KB. Exiting."
     exit 1
+  else
+    log_message "Sufficient disk space available: ${available_space}KB"
   fi
 }
 
@@ -83,51 +99,53 @@ move_files_in_batches() {
   local src="$1"
   local dest="$2"
   local count=0
+  local BATCH_SIZE=100  # Customize the batch size for optimal performance
 
-  echo "Moving files from $src to $dest in batches of $BATCH_SIZE..."
+  log_message "Moving files from $src to $dest in batches of $BATCH_SIZE..."
 
-  # Find all files in the source directory and move them in batches
-  find "$src" -type f | while IFS= read -r file; do
+  # Find all files in the source directory and move them in batches.
+  # Process substitution keeps the loop in the current shell, so $count survives;
+  # a "find | while" pipeline would run the loop in a subshell and lose the total.
+  while IFS= read -r file; do
     if [ -f "$file" ]; then
-      mv "$file" "$dest"  # Move file to the destination
-      echo "$(date): Moved $file" >> "$LOG_FILE"
-      ((count++))
-      # Check if we've reached the batch size
+      mv "$file" "$dest"
+      log_message "Moved file: $file"
+      count=$((count + 1))  # ((count++)) would return non-zero on 0 and trip set -e
+
+      # Log progress every batch
       if (( count % BATCH_SIZE == 0 )); then
-        echo "Moved $count files so far..."
-        sleep 1  # Optional: Add a pause between batches to reduce system load
+        log_message "Moved $count files so far..."
+        sleep 1  # Add a short delay between batches to reduce system load
      fi
     fi
-  done
+  done < <(find "$src" -type f)
+  log_message "Finished moving files. Total files moved: $count"
 }
 
-# Function to check if the last operation (moving files) was successful
+# Function to check if the move operation was successful
+# (with set -e, a failed mv aborts the script earlier; this is a final confirmation)
 check_move_success() {
-  # $? holds the exit status of the last command (mv in this case)
   if [ $? -eq 0 ]; then
-    echo "Files moved successfully."
+    log_message "File move operation completed successfully."
   else
-    print_error "An error occurred while moving files."
-    exit 1  # Exit with an error status if something went wrong
+    print_error "An error occurred during the file move operation."
+    exit 1
   fi
 }
 
 # Main script execution begins here
 
-# Validate the existence of the source and destination directories
+# Validate the source and destination directories
 validate_directory "$SRC_DIR" "Source"
 validate_directory "$DEST_DIR" "Destination"
 
-# Check if there is enough disk space (assuming 10GB minimum required space)
+# Check for sufficient disk space (assuming a minimum of 10GB required space)
 check_disk_space 10485760  # 10GB in kilobytes
 
-# Move files in batches from the source to the destination
+# Move the files from the source to the destination in batches
 move_files_in_batches "$SRC_DIR" "$DEST_DIR"
 
 # Check if the move operation was successful
 check_move_success
 
-# Print a final message indicating that the script has completed successfully
-echo "Operation complete. Logs can be found in $LOG_FILE."
+# Final log message indicating that the script has completed
+log_message "Operation complete. Logs saved in $LOG_FILE."
 
 # End of script
\ No newline at end of file
diff --git a/src/validation/explorer.py b/src/validation/explorer.py
new file mode 100644
index 0000000..9851fbd
--- /dev/null
+++ b/src/validation/explorer.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+
+"""
+explorer.py:
+This script reads DICOM files from the "unexplored" folder, extracts the Patient ID from the DICOM metadata,
+and checks whether the Patient ID exists in the second column of the 'anonymized_patients_birads_curation.csv' file.
+If a match is found, the DICOM file is moved to the "checking" folder; otherwise, it stays in the "unexplored" folder.
+The script processes at most FILE_LIMIT files per run (10 by default; configurable below).
+"""
+
+__author__ = "Francisco Maria Calisto"
+__maintainer__ = "Francisco Maria Calisto"
+__email__ = "francisco.calisto@tecnico.ulisboa.pt"
+__license__ = "ACADEMIC & COMMERCIAL"
+__version__ = "0.1.3"  # Version increment to reflect further refinements
+__status__ = "Development"
+__credits__ = ["Carlos Santiago",
+               "Catarina Barata",
+               "Jacinto C. Nascimento",
+               "Diogo Araújo"]
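+
+# NOTE: requires pydicom (https://pydicom.github.io), e.g. installable via `pip install pydicom`.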
Nascimento", + "Diogo Araújo"] + +import os +import csv +import logging +import pydicom +import shutil + +# Configure detailed logging for debugging and tracking +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Limit the number of files to process (configurable) +FILE_LIMIT = 10 + +# File name for the CSV mapping of patients +csv_fn = "anonymized_patients_birads_curation.csv" + +# Directory paths (handled holistically to ensure cross-platform compatibility) +# The logic ensures the root_dir points to the correct absolute root folder +root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) + +# Define subdirectories relative to the root directory +unexplored_dir = os.path.join(root_dir, "dataset-multimodal-breast", "data", "curation", "unexplored") +checking_dir = os.path.join(root_dir, "dataset-multimodal-breast", "data", "curation", "checking") +csv_file = os.path.join(root_dir, "dataset-multimodal-breast", "data", "birads", csv_fn) + +# Ensure the "checking" directory exists; create it if it doesn't +os.makedirs(checking_dir, exist_ok=True) + +def load_patient_ids(csv_filepath): + """ + Load Patient IDs from the second column of the CSV file into a set for fast lookups. + + Args: + csv_filepath (str): Path to the CSV file. + + Returns: + set: A set containing Patient IDs. + """ + patient_ids = set() + try: + # Open the CSV file and read patient IDs from the second column + with open(csv_filepath, mode='r') as file: + reader = csv.reader(file) + next(reader) # Skip the header row + for row in reader: + if len(row) >= 2: # Ensure row has enough columns + # Add Patient ID from the second column and strip any extra spaces + patient_ids.add(row[1].strip()) + logging.info(f"Loaded {len(patient_ids)} Patient IDs from CSV.") + except Exception as e: + logging.error(f"Error reading CSV file {csv_filepath}: {e}") + return patient_ids + +def extract_patient_id(dicom_file): + """ + Extract the Patient ID from the DICOM file's metadata. + + Args: + dicom_file (str): Path to the DICOM file. + + Returns: + str: The Patient ID, or None if it cannot be found. + """ + try: + # Read the DICOM file's metadata and attempt to extract the Patient ID + ds = pydicom.dcmread(dicom_file) + return ds.PatientID if hasattr(ds, 'PatientID') else None # Extract PatientID if it exists + except Exception as e: + # Log if reading the DICOM file fails + logging.warning(f"Failed to read DICOM file {dicom_file}: {e}") + return None + +def move_file_to_checking(src_path, dest_dir): + """ + Move a DICOM file to the "checking" folder. + + Args: + src_path (str): Source path of the DICOM file. + dest_dir (str): Destination directory where the file will be moved. + """ + try: + # Move file from the unexplored directory to the checking directory + shutil.move(src_path, dest_dir) + logging.info(f"Moved file {src_path} to {dest_dir}") + except Exception as e: + # Log an error if the file move operation fails + logging.error(f"Failed to move file {src_path} to {dest_dir}: {e}") + +def process_dicom_files(unexplored_dir, checking_dir, patient_ids, file_limit): + """ + Process DICOM files from the "unexplored" folder, extracting Patient IDs and checking them against the CSV file. + If a match is found, move the file to the "checking" folder. Otherwise, leave it in "unexplored". + + Args: + unexplored_dir (str): Path to the "unexplored" folder. + checking_dir (str): Path to the "checking" folder. + patient_ids (set): Set of Patient IDs loaded from the CSV. 
+
+def load_patient_ids(csv_filepath):
+  """
+  Load Patient IDs from the second column of the CSV file into a set for fast lookups.
+
+  Args:
+    csv_filepath (str): Path to the CSV file.
+
+  Returns:
+    set: A set containing Patient IDs.
+  """
+  patient_ids = set()
+  try:
+    # Open the CSV file and read patient IDs from the second column
+    with open(csv_filepath, mode='r') as file:
+      reader = csv.reader(file)
+      next(reader)  # Skip the header row
+      for row in reader:
+        if len(row) >= 2:  # Ensure row has enough columns
+          # Add Patient ID from the second column and strip any extra spaces
+          patient_ids.add(row[1].strip())
+    logging.info(f"Loaded {len(patient_ids)} Patient IDs from CSV.")
+  except Exception as e:
+    logging.error(f"Error reading CSV file {csv_filepath}: {e}")
+  return patient_ids
+
+def extract_patient_id(dicom_file):
+  """
+  Extract the Patient ID from the DICOM file's metadata.
+
+  Args:
+    dicom_file (str): Path to the DICOM file.
+
+  Returns:
+    str: The Patient ID, or None if it cannot be found.
+  """
+  try:
+    # Read the DICOM file's metadata and attempt to extract the Patient ID
+    ds = pydicom.dcmread(dicom_file)
+    return ds.PatientID if hasattr(ds, 'PatientID') else None  # Extract PatientID if it exists
+  except Exception as e:
+    # Log if reading the DICOM file fails
+    logging.warning(f"Failed to read DICOM file {dicom_file}: {e}")
+    return None
+
+def move_file_to_checking(src_path, dest_dir):
+  """
+  Move a DICOM file to the "checking" folder.
+
+  Args:
+    src_path (str): Source path of the DICOM file.
+    dest_dir (str): Destination directory where the file will be moved.
+  """
+  try:
+    # Move file from the unexplored directory to the checking directory
+    shutil.move(src_path, dest_dir)
+    logging.info(f"Moved file {src_path} to {dest_dir}")
+  except Exception as e:
+    # Log an error if the file move operation fails
+    logging.error(f"Failed to move file {src_path} to {dest_dir}: {e}")
+
+def process_dicom_files(unexplored_dir, checking_dir, patient_ids, file_limit):
+  """
+  Process DICOM files from the "unexplored" folder, extracting Patient IDs and checking them against the CSV file.
+  If a match is found, move the file to the "checking" folder. Otherwise, leave it in "unexplored".
+
+  Args:
+    unexplored_dir (str): Path to the "unexplored" folder.
+    checking_dir (str): Path to the "checking" folder.
+    patient_ids (set): Set of Patient IDs loaded from the CSV.
+    file_limit (int): Maximum number of DICOM files to process.
+  """
+  processed_files = 0  # Track how many files have been processed
+  moved_files = 0  # Track how many files were moved
+
+  # Walk through each file in the unexplored directory
+  for root, _, files in os.walk(unexplored_dir):
+    for file in files:
+      if processed_files >= file_limit:  # Stop if the file limit is reached
+        logging.info(f"Reached the limit of {file_limit} files.")
+        logging.info(f"Total files moved to checking: {moved_files}")
+        return
+
+      # Full path of the DICOM file to process
+      dicom_file_path = os.path.join(root, file)
+      patient_id = extract_patient_id(dicom_file_path)  # Extract the Patient ID
+
+      if patient_id:
+        if patient_id in patient_ids:
+          # If the Patient ID exists in the CSV, move the file to the checking directory
+          move_file_to_checking(dicom_file_path, checking_dir)
+          moved_files += 1  # Track number of files moved
+        else:
+          logging.info(f"Patient ID {patient_id} not found in CSV. File remains in unexplored.")
+      else:
+        logging.warning(f"No Patient ID found in {dicom_file_path}")  # Log if no Patient ID is found
+
+      processed_files += 1  # Increment the count of processed files
+
+if __name__ == "__main__":
+  # Load Patient IDs from the specified CSV file
+  patient_ids = load_patient_ids(csv_file)
+
+  # Process DICOM files in the "unexplored" folder based on the loaded Patient IDs
+  process_dicom_files(unexplored_dir, checking_dir, patient_ids, FILE_LIMIT)
+
+  # Log when DICOM file exploration is complete
+  logging.info("DICOM file exploration complete.")
+
+# End of file
\ No newline at end of file
diff --git a/src/validation/identifier.py b/src/validation/identifier.py
index 809005a..6965395 100644
--- a/src/validation/identifier.py
+++ b/src/validation/identifier.py
@@ -16,7 +16,7 @@
 __maintainer__ = "Francisco Maria Calisto"
 __email__ = "francisco.calisto@tecnico.ulisboa.pt"
 __license__ = "ACADEMIC & COMMERCIAL"
-__version__ = "0.4.0"
+__version__ = "0.4.4"  # Version updated to reflect optimizations for file handling
 __status__ = "Development"
 __copyright__ = "Copyright 2024, Instituto Superior Técnico (IST)"
 __credits__ = ["Carlos Santiago",
@@ -33,43 +33,60 @@
 from urllib3.exceptions import NotOpenSSLWarning
 from multiprocessing import Pool, cpu_count
 
-# Set up logging
+# Configure detailed logging for debugging and tracking progress
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-# Suppress warnings
+# Suppress irrelevant warnings (e.g., SSL warnings from urllib3)
 warnings.filterwarnings("ignore", category=NotOpenSSLWarning)
 
-# Define paths and constants
-BATCH_SIZE = 1000  # Number of files to process in each batch
-NUM_WORKERS = max(1, cpu_count() - 1)  # Parallelize file processing across available CPU cores
+# Constants for batch size and parallel processing
+BATCH_SIZE = 500  # Reduced batch size to limit the number of open files
+NUM_WORKERS = max(1, cpu_count() // 2)  # Reduce the number of workers to control open files
 
-# Mapping file name
+# Key directories and paths, resolved relative to this file for cross-platform use
 mapping_fn = "mamo_patients_mapping_data.csv"
-
-# Directory paths
 root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
+
+# Directory paths for checking, raw, identified, and unsolvable files
 checking_dir = os.path.join(root_dir, "dataset-multimodal-breast", "data", "curation", "checking")
 identified_dir = os.path.join(root_dir, "dataset-multimodal-breast", "data", "curation", "identified")
 unsolvable_dir = os.path.join(root_dir, "dataset-multimodal-breast", "data", "curation", "unsolvable")
 raw_dir = os.path.join(root_dir, "dicom-images-breast", "known", "raw")
 mapping_csv = os.path.join(root_dir, "dicom-images-breast", "data", "mapping", mapping_fn)
 
-# Ensure necessary directories exist
-os.makedirs(unsolvable_dir, exist_ok=True)
+# Ensure directories for identified and unsolvable files exist
 os.makedirs(identified_dir, exist_ok=True)
+os.makedirs(unsolvable_dir, exist_ok=True)
 
 def normalize_string(s):
-  """Normalize string by stripping whitespace, lowering case, and removing special characters."""
+  """
+  Normalize a string by stripping whitespace, converting to lowercase,
+  and removing special characters for consistent comparison.
+
+  Args:
+    s (str): The string to normalize.
+
+  Returns:
+    str: The normalized string.
+  """
   return s.strip().lower().replace('\u200b', '')
 
 def load_mapping(csv_file):
-  """Load mapping of real_patient_id to anonymized_patient_id from CSV."""
+  """
+  Load the mapping of real_patient_id to anonymized_patient_id from a CSV file.
+
+  Args:
+    csv_file (str): Path to the mapping CSV file.
+
+  Returns:
+    dict: A dictionary mapping real_patient_id to anonymized_patient_id.
+  """
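+  # The mapping CSV is assumed to pair real and anonymized IDs, e.g. (hypothetical values):
+  #   real_patient_id,anonymized_patient_id
+  #   123456,AnonPatient001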
   logging.info(f"Loading mapping from {csv_file}")
   mapping = {}
   try:
     with open(csv_file, mode='r') as file:
       reader = csv.reader(file)
-      next(reader)  # Skip header
+      next(reader)  # Skip the header row
       for row in reader:
         if len(row) >= 2:
           real_id = normalize_string(row[0])
@@ -81,7 +98,15 @@ def load_mapping(csv_file):
   return mapping
 
 def load_sop_instance_uid_map(raw_path):
-  """Load SOP Instance UIDs from the raw directory into a dictionary for quick lookups."""
+  """
+  Load SOP Instance UIDs from the raw directory into a dictionary for fast lookups.
+
+  Args:
+    raw_path (str): Path to the raw DICOM folder.
+
+  Returns:
+    dict: A dictionary mapping SOP Instance UIDs to file paths.
+  """
   logging.info("Loading SOP Instance UIDs from raw directory...")
   sop_map = {}
   try:
@@ -98,16 +123,32 @@ def load_sop_instance_uid_map(raw_path):
   return sop_map
 
 def is_dicom_file(filepath):
-  """Check if a file is a DICOM file by attempting to read it."""
+  """
+  Check if a file is a valid DICOM file by attempting to read its metadata.
+
+  Args:
+    filepath (str): Path to the file.
+
+  Returns:
+    bool: True if the file is a valid DICOM file, False otherwise.
+  """
   try:
     pydicom.dcmread(filepath, stop_before_pixels=True)
     return True
   except Exception as e:
-    logging.warning(f"File {filepath} is not a DICOM file: {e}")
+    logging.warning(f"File {filepath} is not a valid DICOM file: {e}")
     return False
 
 def get_sop_instance_uid(dicom_file):
-  """Extract the SOP Instance UID from DICOM metadata."""
+  """
+  Extract the SOP Instance UID from DICOM metadata.
+
+  Args:
+    dicom_file (str): Path to the DICOM file.
+
+  Returns:
+    str: SOP Instance UID if found, None otherwise.
+  """
   try:
     dicom_data = pydicom.dcmread(dicom_file)
     return dicom_data.get("SOPInstanceUID", None)
@@ -116,7 +157,15 @@ def get_patient_id(dicom_file):
-  """Extract the Patient ID from DICOM metadata."""
+  """
+  Extract the Patient ID from DICOM metadata.
+
+  Args:
+    dicom_file (str): Path to the DICOM file.
+
+  Returns:
+    str: Normalized Patient ID if found, "Unknown" otherwise.
+ """ try: dicom_data = pydicom.dcmread(dicom_file) return normalize_string(dicom_data.get("PatientID", "Unknown")) @@ -125,7 +174,13 @@ def get_patient_id(dicom_file): return None def move_file(src_path, dest_path): - """Move the file from src_path to dest_path.""" + """ + Move a file from src_path to dest_path, creating directories if needed. + + Args: + src_path (str): The source file path. + dest_path (str): The destination file path. + """ if os.path.exists(src_path): os.makedirs(os.path.dirname(dest_path), exist_ok=True) try: @@ -137,14 +192,33 @@ def move_file(src_path, dest_path): logging.warning(f"File not found: {src_path}") def rename_file(file_name, new_patient_id): - """Rename the file with the new patient ID.""" + """ + Rename a file by replacing the Patient ID part of the filename with a new anonymized ID. + + Args: + file_name (str): The original filename. + new_patient_id (str): The new anonymized patient ID. + + Returns: + str: The updated filename with the anonymized ID. + """ parts = file_name.split('_') if len(parts) > 0: - parts[0] = new_patient_id # Replace the first part (anonymized_patient_id) + parts[0] = new_patient_id # Replace the first part with the anonymized_patient_id return '_'.join(parts) def process_file(file, checking_file_path, sop_map, identified_path, unsolvable_path, mapping): - """Process an individual file to match SOP Instance UID and update patient ID.""" + """ + Process a single DICOM file, checking for matches and updating the patient ID as necessary. + + Args: + file (str): Filename to process. + checking_file_path (str): Path to the file in the checking folder. + sop_map (dict): SOP Instance UID map. + identified_path (str): Path to the identified folder. + unsolvable_path (str): Path to the unsolvable folder. + mapping (dict): Mapping of real_patient_id to anonymized_patient_id. + """ if not is_dicom_file(checking_file_path): move_file(checking_file_path, os.path.join(unsolvable_path, file)) return @@ -170,13 +244,31 @@ def process_file(file, checking_file_path, sop_map, identified_path, unsolvable_ move_file(checking_file_path, os.path.join(unsolvable_path, file)) def batch_process_files(files_batch, sop_map, identified_path, unsolvable_path, mapping): - """Process a batch of DICOM files in parallel.""" + """ + Process a batch of DICOM files. + + Args: + files_batch (list): A batch of files to process. + sop_map (dict): SOP Instance UID map. + identified_path (str): Path to the identified folder. + unsolvable_path (str): Path to the unsolvable folder. + mapping (dict): Mapping of real_patient_id to anonymized_patient_id. + """ for file in files_batch: checking_file_path = os.path.join(checking_dir, file) process_file(file, checking_file_path, sop_map, identified_path, unsolvable_path, mapping) def process_checking_files_in_batches(checking_path, sop_map, identified_path, unsolvable_path, mapping): - """Process files in the checking directory in batches using parallel processing.""" + """ + Process DICOM files in batches using parallel processing. + + Args: + checking_path (str): Path to the checking folder. + sop_map (dict): SOP Instance UID map. + identified_path (str): Path to the identified folder. + unsolvable_path (str): Path to the unsolvable folder. + mapping (dict): Mapping of real_patient_id to anonymized_patient_id. 
+ """ all_files = [file for file in os.listdir(checking_path) if os.path.isfile(os.path.join(checking_path, file))] total_files = len(all_files) logging.info(f"Total DICOM files to process: {total_files}") @@ -184,7 +276,9 @@ def process_checking_files_in_batches(checking_path, sop_map, identified_path, u for i in range(0, total_files, BATCH_SIZE): batch = all_files[i:i + BATCH_SIZE] logging.info(f"Processing batch {i // BATCH_SIZE + 1} with {len(batch)} files.") - with Pool(processes=NUM_WORKERS) as pool: + + # Use multiprocessing for parallel processing with file batches + with Pool(processes=NUM_WORKERS, maxtasksperchild=100) as pool: pool.apply_async(batch_process_files, (batch, sop_map, identified_dir, unsolvable_dir, mapping)) if __name__ == '__main__':