diff --git a/ariel/__init__.py b/ariel/__init__.py index 08f1d72..bf30417 100644 --- a/ariel/__init__.py +++ b/ariel/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. """Ariel library for for end-to-end video ad dubbing using AI.""" -__version__ = "0.0.16" +__version__ = "0.0.17" diff --git a/ariel/audio_processing.py b/ariel/audio_processing.py index 54cf961..2143bf9 100644 --- a/ariel/audio_processing.py +++ b/ariel/audio_processing.py @@ -220,6 +220,114 @@ def assemble_split_audio_file_paths(command: str) -> tuple[str, str]: return audio_vocals_file, audio_background_file +def execute_vocals_non_vocals_split( + *, audio_file: str, output_directory: str, device: str +) -> tuple[str, str]: + """Splits an audio file into vocal and non-vocal (background) components using Demucs. + + Args: + audio_file: The path to the input audio file. + output_directory: The directory where the separated audio files will be + saved. + device: The device to use for Demucs processing (e.g., "cuda" for GPU or + "cpu"). + + Returns: + A tuple containing the paths to the separated audio files: + - The path to the vocals audio file. + - The path to the background audio file. + """ + demucs_command = build_demucs_command( + audio_file=audio_file, + output_directory=output_directory, + device=device, + ) + audio_vocals_file, audio_background_file = assemble_split_audio_file_paths( + command=demucs_command + ) + if tf.io.gfile.exists(audio_vocals_file) and tf.io.gfile.exists( + audio_background_file + ): + logging.info( + "The DEMUCS command will not be executed, because the expected files" + f" {audio_vocals_file} and {audio_background_file} already exist." 
+ ) + else: + execute_demucs_command(command=demucs_command) + return audio_vocals_file, audio_background_file + + +def split_audio_track( + audio_file: str, + output_directory: str, + device: str, + voice_separation_rounds: int = 2, +) -> tuple[str, str]: + """Splits an audio track into vocal and non-vocal components, with optional iterative refinement. + + This function separates the vocals from the background music in an audio file. + It first checks if the separated files already exist to avoid redundant + processing. + If not, it uses the `execute_vocals_non_vocals_split` function to perform the + initial separation. + + To further refine the separation, it can iteratively apply the voice + separation + process to the background track. This helps to remove any residual vocal + traces + from the background. Finally, it cleans up temporary files generated by + Demucs. + + Args: + audio_file: The path to the input audio file. + output_directory: The directory to store the separated audio files. + device: The device to use for Demucs processing (e.g., "cuda" or "cpu"). + voice_separation_rounds: The number of times to iteratively apply voice + separation to the background track (default is 2). + + Returns: + A tuple containing the paths to the separated audio files: + - The path to the vocals audio file. + - The path to the background audio file. 
+ """ + demucs_command = build_demucs_command( + audio_file=audio_file, + output_directory=output_directory, + device=device, + ) + audio_vocals_file, _ = assemble_split_audio_file_paths(command=demucs_command) + file_extension = os.path.splitext(audio_vocals_file)[1] + vocals_path = os.path.join( + output_directory, AUDIO_PROCESSING, f"vocals{file_extension}" + ) + background_path = os.path.join( + output_directory, AUDIO_PROCESSING, f"no_vocals{file_extension}" + ) + if tf.io.gfile.exists(vocals_path) and tf.io.gfile.exists(background_path): + logging.info( + "The DEMUCS command will not be executed, because the expected files" + f" {vocals_path} and {background_path} already exist." + ) + return vocals_path, background_path + audio_vocals_file, audio_background_file = execute_vocals_non_vocals_split( + audio_file=audio_file, output_directory=output_directory, device=device + ) + tf.io.gfile.copy(audio_vocals_file, vocals_path) + tf.io.gfile.copy(audio_background_file, background_path) + if voice_separation_rounds > 1: + for _ in range(1, voice_separation_rounds): + _, audio_background_file = execute_vocals_non_vocals_split( + audio_file=background_path, + output_directory=output_directory, + device=device, + ) + tf.io.gfile.copy(audio_background_file, background_path, overwrite=True) + tf.io.gfile.rmtree( + os.path.join(output_directory, AUDIO_PROCESSING, "htdemucs") + ) + return vocals_path, background_path + + def create_pyannote_timestamps( *, audio_file: str, diff --git a/ariel/dubbing.py b/ariel/dubbing.py index 5dd1e84..11b9480 100644 --- a/ariel/dubbing.py +++ b/ariel/dubbing.py @@ -554,6 +554,7 @@ def __init__( adjust_speed: bool = False, vocals_volume_adjustment: float = 5.0, background_volume_adjustment: float = 0.0, + voice_separation_rounds: int = 2, clean_up: bool = True, pyannote_model: str = _DEFAULT_PYANNOTE_MODEL, gemini_model_name: str = _DEFAULT_GEMINI_MODEL, @@ -621,6 +622,9 @@ def __init__( adjusted. 
background_volume_adjustment: By how much the background audio volume should be adjusted. + voice_separation_rounds: The number of times the background audio file + should be processed for voice detection and removal. It helps with the + old voice artifacts being present in the dubbed ad. clean_up: Whether to delete intermediate files after dubbing. Only the final ouput and the utterance metadata will be kept. pyannote_model: Name of the PyAnnote diarization model. @@ -662,6 +666,7 @@ def __init__( self.adjust_speed = adjust_speed self.vocals_volume_adjustment = vocals_volume_adjustment self.background_volume_adjustment = background_volume_adjustment + self.voice_separation_rounds = voice_separation_rounds self.clean_up = clean_up self.pyannote_model = pyannote_model self.hugging_face_token = hugging_face_token @@ -910,23 +915,14 @@ def run_preprocessing(self) -> None: else: video_file = None audio_file = self.input_file - demucs_command = audio_processing.build_demucs_command( - audio_file=audio_file, - output_directory=self.output_directory, - device=self.device, - ) audio_vocals_file, audio_background_file = ( - audio_processing.assemble_split_audio_file_paths(command=demucs_command) + audio_processing.split_audio_track( + audio_file=audio_file, + output_directory=self.output_directory, + device=self.device, + voice_separation_rounds=self.voice_separation_rounds, + ) ) - if tf.io.gfile.exists(audio_vocals_file) and tf.io.gfile.exists( - audio_background_file - ): - logging.info( - "The DEMUCS command will not be executed, because the expected files" - f" {audio_vocals_file} and {audio_background_file} already exist." 
- ) - else: - audio_processing.execute_demucs_command(command=demucs_command) utterance_metadata = audio_processing.create_pyannote_timestamps( audio_file=audio_file, number_of_speakers=self.number_of_speakers, diff --git a/examples/dubbing_workflow.ipynb b/examples/dubbing_workflow.ipynb index 6eda4d6..de370a7 100644 --- a/examples/dubbing_workflow.ipynb +++ b/examples/dubbing_workflow.ipynb @@ -223,6 +223,10 @@ "vocals_volume_adjustment = \"5.0\" # @param {type:\"string\"}\n", "vocals_volume_adjustment = float(vocals_volume_adjustment)\n", "\n", + "# @markdown **voice_separation_rounds** The number of times the background audio file should be processed for voice detection and removal. It helps with the old voice artifacts being present in the dubbed ad.\n", + "voice_separation_rounds = \"2\" # @param [1, 2, 3, 4, 5]\n", + "voice_separation_rounds = int(voice_separation_rounds)\n", + "\n", "# @markdown **background_volume_adjustment** By how much the background audio volume should be adjusted.\n", "background_volume_adjustment = \"0.0\" # @param {type:\"string\"}\n", "background_volume_adjustment = float(background_volume_adjustment)\n", @@ -341,6 +345,7 @@ " adjust_speed=adjust_speed,\n", " vocals_volume_adjustment=vocals_volume_adjustment,\n", " background_volume_adjustment=background_volume_adjustment,\n", + " voice_separation_rounds=voice_separation_rounds,\n", " clean_up=clean_up,\n", " gemini_model_name=gemini_model_name,\n", " temperature=gemini_temperature,\n", diff --git a/main.py b/main.py index 332688d..92e4bb1 100644 --- a/main.py +++ b/main.py @@ -146,6 +146,13 @@ 0.0, "By how much the background audio volume should be adjusted.", ) +_VOICE_SEPARATION_ROUNDS = flags.DEFINE_integer( + "voice_separation_rounds", + 2, + "The number of times the background audio file" + " should be processed for voice detection and removal. 
It helps with" + " the old voice artifacts being present in the dubbed ad.", +) _CLEAN_UP = flags.DEFINE_bool( "clean_up", False, @@ -239,6 +246,7 @@ def main(argv: Sequence[str]) -> None: adjust_speed=_ADJUST_SPEED.value, vocals_volume_adjustment=_VOCALS_VOLUME_ADJUSTMENT.value, background_volume_adjustment=_BACKGROUND_VOLUME_ADJUSTMENT.value, + voice_separation_rounds=_VOICE_SEPARATION_ROUNDS.value, clean_up=_CLEAN_UP.value, gemini_model_name=_GEMINI_MODEL_NAME.value, temperature=_TEMPERATURE.value, diff --git a/tests/audio_processing_test.py b/tests/audio_processing_test.py index da58918..fc09602 100644 --- a/tests/audio_processing_test.py +++ b/tests/audio_processing_test.py @@ -249,6 +249,136 @@ def test_execute_command_error(self, mock_run): ) +class TestExecuteVocalNonVocalsSplit(absltest.TestCase): + + @mock.patch("ariel.audio_processing.execute_demucs_command") + @mock.patch("tensorflow.io.gfile.exists") + def test_execute_vocals_non_vocals_split_files_exist( + self, mock_exists, mock_execute_demucs_command + ): + mock_exists.side_effect = [True, True] + audio_file = "test.wav" + output_directory = "output_dir" + device = "cpu" + _, _ = audio_processing.execute_vocals_non_vocals_split( + audio_file=audio_file, output_directory=output_directory, device=device + ) + mock_execute_demucs_command.assert_not_called() + + @mock.patch("ariel.audio_processing.execute_demucs_command") + @mock.patch("tensorflow.io.gfile.exists") + def test_execute_vocals_non_vocals_split_files_dont_exist( + self, mock_exists, mock_execute_demucs_command + ): + mock_exists.side_effect = [False, False] + audio_file = "test.wav" + output_directory = "output_dir" + device = "cpu" + audio_processing.execute_vocals_non_vocals_split( + audio_file=audio_file, output_directory=output_directory, device=device + ) + mock_execute_demucs_command.assert_called_once() + + @mock.patch("ariel.audio_processing.build_demucs_command") + @mock.patch("ariel.audio_processing.execute_demucs_command") + 
@mock.patch("tensorflow.io.gfile.exists") + def test_execute_vocals_non_vocals_split_correct_command( + self, mock_exists, mock_execute_demucs_command, mock_build_demucs_command + ): + mock_exists.side_effect = [False, False] + audio_file = "test.wav" + output_directory = "output_dir" + device = "cpu" + expected_command = ( + "python -m demucs.separate -o 'output_dir/audio_processing' --device" + " cpu --shifts 10 --overlap 0.25 -j 0 --two-stems vocals 'test.wav'" + ) + mock_build_demucs_command.return_value = expected_command + audio_processing.execute_vocals_non_vocals_split( + audio_file=audio_file, output_directory=output_directory, device=device + ) + mock_build_demucs_command.assert_called_once_with( + audio_file=audio_file, output_directory=output_directory, device=device + ) + mock_execute_demucs_command.assert_called_once_with( + command=expected_command + ) + + +class TestSplitAudioTrack(absltest.TestCase): + + @patch("tensorflow.io.gfile.exists") + @patch("tensorflow.io.gfile.copy") + @patch("tensorflow.io.gfile.rmtree") + @patch("ariel.audio_processing.execute_vocals_non_vocals_split") + @patch("ariel.audio_processing.build_demucs_command") + @patch("ariel.audio_processing.assemble_split_audio_file_paths") + def test_split_audio_track( + self, + mock_assemble_split_paths, + mock_build_command, + mock_execute_split, + mock_rmtree, + mock_copy, + mock_exists, + ): + mock_execute_split.return_value = ("vocals_file.wav", "background_file.wav") + mock_build_command.return_value = "demucs_command" + mock_assemble_split_paths.return_value = ( + "vocals_file.wav", + "background_file.wav", + ) + mock_exists.side_effect = [False, False] + vocals_path, background_path = audio_processing.split_audio_track( + audio_file="input.wav", + output_directory="output_dir", + device="cpu", + voice_separation_rounds=2, + ) + self.assertEqual( + vocals_path, + os.path.join( + "output_dir", audio_processing.AUDIO_PROCESSING, "vocals.wav" + ), + ) + self.assertEqual( + 
background_path, + os.path.join( + "output_dir", audio_processing.AUDIO_PROCESSING, "no_vocals.wav" + ), + ) + + mock_build_command.assert_called_once_with( + audio_file="input.wav", output_directory="output_dir", device="cpu" + ) + mock_copy.assert_any_call("vocals_file.wav", vocals_path) + mock_copy.assert_any_call("background_file.wav", background_path) + mock_rmtree.assert_called_once_with( + os.path.join( + "output_dir", audio_processing.AUDIO_PROCESSING, "htdemucs" + ) + ) + + @patch("tensorflow.io.gfile.exists") + def test_split_audio_track_files_exist(self, mock_exists): + mock_exists.side_effect = [True, True] + vocals_path, background_path = audio_processing.split_audio_track( + audio_file="input.wav", output_directory="output_dir", device="cpu" + ) + self.assertEqual( + vocals_path, + os.path.join( + "output_dir", audio_processing.AUDIO_PROCESSING, "vocals.mp3" + ), + ) + self.assertEqual( + background_path, + os.path.join( + "output_dir", audio_processing.AUDIO_PROCESSING, "no_vocals.mp3" + ), + ) + + class CreatePyannoteTimestampsTest(absltest.TestCase): def test_create_timestamps_with_silence(self): @@ -456,6 +586,7 @@ def test_verify_modified_audio_chunk( "start": 1.0, "end": 2.0, "path": "wrong_chunk.mp3", + "for_dubbing": True, } result = audio_processing.verify_modified_audio_chunk( audio_file=audio_file_path, @@ -505,6 +636,7 @@ def test_insert_audio_at_timestamps(self): "start": 3.0, "end": 5.0, "dubbed_path": audio_chunk_path, + "for_dubbing": True, }] output_path = audio_processing.insert_audio_at_timestamps( utterance_metadata=utterance_metadata,