diff --git a/LICENSE b/LICENSE index 46ad56d..82ede59 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021 Kaszanas +Copyright (c) 2021 Andrzej Białecki Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md new file mode 100644 index 0000000..d317d49 --- /dev/null +++ b/README.md @@ -0,0 +1,55 @@ +# SC2DatasetPreparator + +This repository contains tools which can be used in order to perform the following steps: + +1. Using ```src/directory_flattener.py```, flatten the directory structure and save the old directory tree to a mapping of ```{"replayUniqueHash": "whereItWasInOldStructure"}``` +2. Using ```src/sc2_replaypack_processor.py```, perform replaypack processing with https://github.com/Kaszanas/SC2InfoExtractorGo + +## Customization + +In order to specify different processing flags for https://github.com/Kaszanas/SC2InfoExtractorGo, please modify the ```src/sc2_replaypack_processor.py``` file directly. + +## Usage + +Before using this software, please install Python >= 3.7 and the packages listed in ```requirements.txt```. + +Please keep in mind that ```src/directory_flattener.py``` does not contain default flag values and can be customized with the following command line flags: + +``` +usage: directory_flattener.py [-h] [--input_path INPUT_PATH] + [--file_extension FILE_EXTENSION] + +Directory restructuring tool used in order to flatten the structure, map the +old structure to a separate file, and for later processing with other tools. + +optional arguments: + -h, --help show this help message and exit + --input_path INPUT_PATH + Please provide input path to the dataset that is going + to be processed. + --file_extension FILE_EXTENSION + Please provide a file extension for files that will be + moved and renamed. 
+```
+
+
+Please keep in mind that the ```src/sc2_replaypack_processor.py``` can be customized with the following command line flags (```--number_of_processes``` defaults to 12, the other flags do not contain default values):
+
+```
+Tool used for processing SC2 datasets. with
+https://github.com/Kaszanas/SC2InfoExtractorGo
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --input_dir INPUT_DIR
+                        Please provide input path to the directory containing
+                        the dataset that is going to be processed.
+  --output_dir OUTPUT_DIR
+                        Please provide an output directory for the resulting
+                        files.
+  --number_of_processes NUMBER_OF_PROCESSES
+                        Please provide the number of processes to be spawned
+                        for the dataset processing.
+```
+
+# Citation
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..bad16f5
Binary files /dev/null and b/requirements.txt differ
diff --git a/src/directory_flattener.py b/src/directory_flattener.py
new file mode 100644
index 0000000..ee165e5
--- /dev/null
+++ b/src/directory_flattener.py
@@ -0,0 +1,56 @@
+import os
+import argparse
+import uuid
+import json
+import glob
+import shutil
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Directory restructuring tool used in order to flatten the structure, map the old structure to a separate file, and for later processing with other tools.")
+    parser.add_argument("--input_path", help="Please provide input path to the dataset that is going to be processed.")
+    parser.add_argument("--file_extension", help="Please provide a file extension for files that will be moved and renamed.")
+    args = parser.parse_args()
+
+    # The flattened copy is stored in "<input_path>_processed". The directory
+    # is created up front so that the mapping file can always be written,
+    # even when no file matches the requested extension:
+    new_root_directory = args.input_path + "_processed"
+    os.makedirs(new_root_directory, exist_ok=True)
+    absolute_new_root = os.path.abspath(new_root_directory)
+
+    # Maps the generated unique name (without extension) to the path that the
+    # file had relative to the input directory:
+    dir_structure_mapping = {}
+
+    # Iterate over the supplied directory:
+    for root, directories, filenames in os.walk(args.input_path):
+        # Do not descend into the output directory. It is located inside of
+        # the input tree when --input_path ends with a path separator, and
+        # walking it would copy the already flattened files once more:
+        directories[:] = [
+            directory for directory in directories
+            if os.path.abspath(os.path.join(root, directory)) != absolute_new_root
+        ]
+
+        # Performing action for every file that was detected
+        for file in filenames:
+            if not file.endswith(args.file_extension):
+                continue
+
+            # Prepare relative paths:
+            relative_dir = os.path.relpath(root, args.input_path)
+            relative_file = os.path.join(relative_dir, file)
+
+            # Get unique filename:
+            unique_filename = uuid.uuid4().hex
+
+            # Copying files under the new name (the originals stay in place):
+            current_file = os.path.join(root, file)
+            new_path_and_filename = os.path.join(new_root_directory, unique_filename + args.file_extension)
+            shutil.copy(current_file, new_path_and_filename)
+
+            # Add to a mapping
+            dir_structure_mapping[unique_filename] = relative_file
+
+    with open(os.path.join(new_root_directory, "processed_mapping.json"), "w") as json_file:
+        json.dump(dir_structure_mapping, json_file)
\ No newline at end of file
diff --git a/src/sc2_replaypack_processor.py b/src/sc2_replaypack_processor.py
new file mode 100644
index 0000000..21f3ea9
--- /dev/null
+++ b/src/sc2_replaypack_processor.py
@@ -0,0 +1,79 @@
+import os
+import argparse
+import subprocess
+from tqdm import tqdm
+from multiprocessing import Pool
+
+
+# Localized maps file passed to the extractor. This path is machine specific,
+# please adjust it before running the tool in a different environment:
+LOCALIZED_MAPS_FILE = "F:\\Projects\\EsportDataset\\processed\\program\\new_maps_processed.json"
+
+
+def multiprocessing_scheduler(processing_arguments, number_of_processes=12):
+    """
+    Runs multiprocessing_client for every element of processing_arguments in a
+    pool of worker processes and waits until the pool finishes.
+
+    :param processing_arguments: Iterable of (input directory, output directory) tuples.
+    :param number_of_processes: Number of worker processes to spawn, defaults to 12.
+    """
+    with Pool(processes=number_of_processes) as pool:
+        # The results are not used, closing and joining the pool waits for the
+        # scheduled work to complete:
+        pool.imap_unordered(multiprocessing_client, processing_arguments)
+        pool.close()
+        pool.join()
+
+
+def multiprocessing_client(arguments:tuple):
+    """
+    Runs GoSC2Science.exe for a single input directory.
+
+    :param arguments: Tuple of (input directory, output directory). The output directory is also passed as the log directory.
+    """
+
+    directory, output_directory_filepath = arguments
+
+    subprocess.run(["GoSC2Science.exe",
+                    f"-input={directory}/",
+                    f"-output={output_directory_filepath}/",
+                    "-integrity_check=false",
+                    "-validity_check=false",
+                    "-number_of_packages=1",
+                    "-game_mode=0b1111111111",
+                    f"-localized_maps_file={LOCALIZED_MAPS_FILE}",
+                    "-perform_anonymization=false",
+                    "-localize_maps=true",
+                    "-with_multiprocessing=false",
+                    "-log_level=3",
+                    f"-log_dir={output_directory_filepath}/"])
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Tool used for processing SC2 datasets. with https://github.com/Kaszanas/SC2InfoExtractorGo")
+    parser.add_argument("--input_dir", help="Please provide input path to the directory containing the dataset that is going to be processed.")
+    parser.add_argument("--output_dir", help="Please provide an output directory for the resulting files.")
+    parser.add_argument("--number_of_processes", type=int, default=12, help="Please provide the number of processes to be spawned for the dataset processing.")
+    args = parser.parse_args()
+
+    # Create the main output directory:
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    multiprocessing_list = []
+    for directory, _, _ in tqdm(os.walk(args.input_dir)):
+
+        # Name of the walked directory. os.path is used instead of splitting
+        # on "\\" so that other path separators are handled as well:
+        output_directory_name = os.path.basename(os.path.normpath(directory))
+        if output_directory_name == "input":
+            continue
+
+        output_directory_filepath = os.path.join(args.output_dir, output_directory_name)
+
+        # Create the output subdirectories:
+        os.makedirs(output_directory_filepath, exist_ok=True)
+
+        multiprocessing_list.append((directory, output_directory_filepath))
+
+    multiprocessing_scheduler(multiprocessing_list, args.number_of_processes)
\ No newline at end of file