fix: Provided a mechanism to clean the background audio from vocals better to avoid the reported 'metallic voice' presence in the dubbed output.

Change-Id: I622eaa7429b535811d34e99280001131bc9e06ce
Kacper Krasowiak committed Oct 24, 2024
1 parent a6a1aa9 commit 9eb260e
Showing 6 changed files with 265 additions and 16 deletions.
2 changes: 1 addition & 1 deletion ariel/__init__.py
@@ -13,4 +13,4 @@
# limitations under the License.

"""Ariel library for for end-to-end video ad dubbing using AI."""
__version__ = "0.0.16"
__version__ = "0.0.17"
108 changes: 108 additions & 0 deletions ariel/audio_processing.py
@@ -220,6 +220,114 @@ def assemble_split_audio_file_paths(command: str) -> tuple[str, str]:
return audio_vocals_file, audio_background_file


def execute_vocals_non_vocals_split(
*, audio_file: str, output_directory: str, device: str
) -> tuple[str, str]:
"""Splits an audio file into vocal and non-vocal (background) components using Demucs.
Args:
audio_file: The path to the input audio file.
output_directory: The directory where the separated audio files will be
saved.
device: The device to use for Demucs processing (e.g., "cuda" for GPU or
"cpu").
Returns:
A tuple containing the paths to the separated audio files:
- The path to the vocals audio file.
- The path to the background audio file.
"""
demucs_command = build_demucs_command(
audio_file=audio_file,
output_directory=output_directory,
device=device,
)
audio_vocals_file, audio_background_file = assemble_split_audio_file_paths(
command=demucs_command
)
if tf.io.gfile.exists(audio_vocals_file) and tf.io.gfile.exists(
audio_background_file
):
logging.info(
"The DEMUCS command will not be executed, because the expected files"
f" {audio_vocals_file} and {audio_background_file} already exist."
)
else:
execute_demucs_command(command=demucs_command)
return audio_vocals_file, audio_background_file
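
For reference, a minimal usage sketch of this helper (illustrative only, not part of the commit). The input path and output directory are hypothetical placeholders, and torch is assumed to be importable, since Demucs already depends on it:

import torch

from ariel import audio_processing

# Prefer a GPU when one is available; Demucs separation is much slower on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
vocals_file, background_file = audio_processing.execute_vocals_non_vocals_split(
    audio_file="/tmp/ad_audio.mp3",          # placeholder input audio
    output_directory="/tmp/dubbing_output",  # placeholder working directory
    device=device,
)
print(vocals_file, background_file)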


def split_audio_track(
audio_file: str,
output_directory: str,
device: str,
voice_separation_rounds: int = 2,
) -> tuple[str, str]:
"""Splits an audio track into vocal and non-vocal components, with optional iterative refinement.
This function separates the vocals from the background music in an audio file.
It first checks if the separated files already exist to avoid redundant
processing.
If not, it uses the `execute_vocal_non_vocals_split` function to perform the
initial separation.
To further refine the separation, it can iteratively apply the voice
separation
process to the background track. This helps to remove any residual vocal
traces
from the background. Finally, it cleans up temporary files generated by
Demucs.
Args:
audio_file: The path to the input audio file.
output_directory: The directory to store the separated audio files.
device: The device to use for Demucs processing (e.g., "cuda" or "cpu").
voice_separation_rounds: The number of times to iteratively apply voice
separation to the background track (default is 2).
Returns:
A tuple containing the paths to the separated audio files:
- The path to the vocals audio file.
- The path to the background audio file.
"""
demucs_command = build_demucs_command(
audio_file=audio_file,
output_directory=output_directory,
device=device,
)
audio_vocals_file, _ = assemble_split_audio_file_paths(command=demucs_command)
file_extension = os.path.splitext(audio_vocals_file)[1]
vocals_path = os.path.join(
output_directory, AUDIO_PROCESSING, f"vocals{file_extension}"
)
background_path = os.path.join(
output_directory, AUDIO_PROCESSING, f"no_vocals{file_extension}"
)
if tf.io.gfile.exists(vocals_path) and tf.io.gfile.exists(background_path):
logging.info(
"The DEMUCS command will not be executed, because the expected files"
f" {vocals_path} and {background_path} already exist."
)
return vocals_path, background_path
audio_vocals_file, audio_background_file = execute_vocals_non_vocals_split(
audio_file=audio_file, output_directory=output_directory, device=device
)
tf.io.gfile.copy(audio_vocals_file, vocals_path)
tf.io.gfile.copy(audio_background_file, background_path)
if voice_separation_rounds > 1:
for _ in range(1, voice_separation_rounds):
_, audio_background_file = execute_vocals_non_vocals_split(
audio_file=background_path,
output_directory=output_directory,
device=device,
)
tf.io.gfile.copy(audio_background_file, background_path, overwrite=True)
tf.io.gfile.rmtree(
os.path.join(output_directory, AUDIO_PROCESSING, "htdemucs")
)
return vocals_path, background_path
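
A minimal sketch of calling the new entry point with an extra refinement round (illustrative only, not part of the commit); the paths are placeholders and three rounds is just an example value, not a recommended setting:

from ariel import audio_processing

vocals_path, background_path = audio_processing.split_audio_track(
    audio_file="/tmp/ad_audio.mp3",          # placeholder input audio
    output_directory="/tmp/dubbing_output",  # placeholder working directory
    device="cpu",
    # 3 means the initial split plus two extra Demucs passes over the background track.
    voice_separation_rounds=3,
)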


def create_pyannote_timestamps(
*,
audio_file: str,
26 changes: 11 additions & 15 deletions ariel/dubbing.py
@@ -554,6 +554,7 @@ def __init__(
adjust_speed: bool = False,
vocals_volume_adjustment: float = 5.0,
background_volume_adjustment: float = 0.0,
voice_separation_rounds: int = 2,
clean_up: bool = True,
pyannote_model: str = _DEFAULT_PYANNOTE_MODEL,
gemini_model_name: str = _DEFAULT_GEMINI_MODEL,
@@ -621,6 +622,9 @@ def __init__(
adjusted.
background_volume_adjustment: By how much the background audio volume
should be adjusted.
voice_separation_rounds: The number of times the background audio track
should be processed for vocal detection and removal. Additional rounds help
remove residual traces of the original voices from the dubbed ad.
clean_up: Whether to delete intermediate files after dubbing. Only the
final output and the utterance metadata will be kept.
pyannote_model: Name of the PyAnnote diarization model.
@@ -662,6 +666,7 @@ def __init__(
self.adjust_speed = adjust_speed
self.vocals_volume_adjustment = vocals_volume_adjustment
self.background_volume_adjustment = background_volume_adjustment
self.voice_separation_rounds = voice_separation_rounds
self.clean_up = clean_up
self.pyannote_model = pyannote_model
self.hugging_face_token = hugging_face_token
Expand Down Expand Up @@ -910,23 +915,14 @@ def run_preprocessing(self) -> None:
else:
video_file = None
audio_file = self.input_file
- demucs_command = audio_processing.build_demucs_command(
-     audio_file=audio_file,
-     output_directory=self.output_directory,
-     device=self.device,
- )
audio_vocals_file, audio_background_file = (
-     audio_processing.assemble_split_audio_file_paths(command=demucs_command)
+     audio_processing.split_audio_track(
+         audio_file=audio_file,
+         output_directory=self.output_directory,
+         device=self.device,
+         voice_separation_rounds=self.voice_separation_rounds,
+     )
)
- if tf.io.gfile.exists(audio_vocals_file) and tf.io.gfile.exists(
-     audio_background_file
- ):
-   logging.info(
-       "The DEMUCS command will not be executed, because the expected files"
-       f" {audio_vocals_file} and {audio_background_file} already exist."
-   )
- else:
-   audio_processing.execute_demucs_command(command=demucs_command)
utterance_metadata = audio_processing.create_pyannote_timestamps(
audio_file=audio_file,
number_of_speakers=self.number_of_speakers,
5 changes: 5 additions & 0 deletions examples/dubbing_workflow.ipynb
@@ -223,6 +223,10 @@
"vocals_volume_adjustment = \"5.0\" # @param {type:\"string\"}\n",
"vocals_volume_adjustment = float(vocals_volume_adjustment)\n",
"\n",
"# @markdown **voice_separation_rounds** The number of times the background audio file should be processed for voice detection and removal. It helps with the old voice artifacts being present in the dubbed ad.\n",
"voice_separation_rounds = \"2\" # @param [1, 2, 3, 4, 5]\n",
"voice_separation_rounds = int(voice_separation_rounds)\n",
"\n",
"# @markdown **background_volume_adjustment** By how much the background audio volume should be adjusted.\n",
"background_volume_adjustment = \"0.0\" # @param {type:\"string\"}\n",
"background_volume_adjustment = float(background_volume_adjustment)\n",
@@ -341,6 +345,7 @@
" adjust_speed=adjust_speed,\n",
" vocals_volume_adjustment=vocals_volume_adjustment,\n",
" background_volume_adjustment=background_volume_adjustment,\n",
" voice_separation_rounds=voice_separation_rounds,\n",
" clean_up=clean_up,\n",
" gemini_model_name=gemini_model_name,\n",
" temperature=gemini_temperature,\n",
8 changes: 8 additions & 0 deletions main.py
@@ -146,6 +146,13 @@
0.0,
"By how much the background audio volume should be adjusted.",
)
_VOICE_SEPARATION_ROUNDS = flags.DEFINE_integer(
"voice_separation_rounds",
2,
"The number of times the background audio track should be processed for"
" vocal detection and removal. Additional rounds help remove residual"
" traces of the original voices from the dubbed ad.",
)
_CLEAN_UP = flags.DEFINE_bool(
"clean_up",
False,
@@ -239,6 +246,7 @@ def main(argv: Sequence[str]) -> None:
adjust_speed=_ADJUST_SPEED.value,
vocals_volume_adjustment=_VOCALS_VOLUME_ADJUSTMENT.value,
background_volume_adjustment=_BACKGROUND_VOLUME_ADJUSTMENT.value,
voice_separation_rounds=_VOICE_SEPARATION_ROUNDS.value,
clean_up=_CLEAN_UP.value,
gemini_model_name=_GEMINI_MODEL_NAME.value,
temperature=_TEMPERATURE.value,
132 changes: 132 additions & 0 deletions tests/audio_processing_test.py
@@ -249,6 +249,136 @@ def test_execute_command_error(self, mock_run):
)


class TestExecuteVocalNonVocalsSplit(absltest.TestCase):

@mock.patch("ariel.audio_processing.execute_demucs_command")
@mock.patch("tensorflow.io.gfile.exists")
def test_execute_vocals_non_vocals_split_files_exist(
self, mock_exists, mock_execute_demucs_command
):
mock_exists.side_effect = [True, True]
audio_file = "test.wav"
output_directory = "output_dir"
device = "cpu"
_, _ = audio_processing.execute_vocals_non_vocals_split(
audio_file=audio_file, output_directory=output_directory, device=device
)
mock_execute_demucs_command.assert_not_called()

@mock.patch("ariel.audio_processing.execute_demucs_command")
@mock.patch("tensorflow.io.gfile.exists")
def test_execute_vocals_non_vocals_split_files_dont_exist(
self, mock_exists, mock_execute_demucs_command
):
mock_exists.side_effect = [False, False]
audio_file = "test.wav"
output_directory = "output_dir"
device = "cpu"
audio_processing.execute_vocals_non_vocals_split(
audio_file=audio_file, output_directory=output_directory, device=device
)
mock_execute_demucs_command.assert_called_once()

@mock.patch("ariel.audio_processing.build_demucs_command")
@mock.patch("ariel.audio_processing.execute_demucs_command")
@mock.patch("tensorflow.io.gfile.exists")
def test_execute_vocals_non_vocals_split_correct_command(
self, mock_exists, mock_execute_demucs_command, mock_build_demucs_command
):
mock_exists.side_effect = [False, False]
audio_file = "test.wav"
output_directory = "output_dir"
device = "cpu"
expected_command = (
"python -m demucs.separate -o 'output_dir/audio_processing' --device"
" cpu --shifts 10 --overlap 0.25 -j 0 --two-stems vocals 'test.wav'"
)
mock_build_demucs_command.return_value = expected_command
audio_processing.execute_vocals_non_vocals_split(
audio_file=audio_file, output_directory=output_directory, device=device
)
mock_build_demucs_command.assert_called_once_with(
audio_file=audio_file, output_directory=output_directory, device=device
)
mock_execute_demucs_command.assert_called_once_with(
command=expected_command
)


class TestSplitAudioTrack(absltest.TestCase):

@patch("tensorflow.io.gfile.exists")
@patch("tensorflow.io.gfile.copy")
@patch("tensorflow.io.gfile.rmtree")
@patch("ariel.audio_processing.execute_vocals_non_vocals_split")
@patch("ariel.audio_processing.build_demucs_command")
@patch("ariel.audio_processing.assemble_split_audio_file_paths")
def test_split_audio_track(
self,
mock_assemble_split_paths,
mock_build_command,
mock_execute_split,
mock_rmtree,
mock_copy,
mock_exists,
):
mock_execute_split.return_value = ("vocals_file.wav", "background_file.wav")
mock_build_command.return_value = "demucs_command"
mock_assemble_split_paths.return_value = (
"vocals_file.wav",
"background_file.wav",
)
mock_exists.side_effect = [False, False]
vocals_path, background_path = audio_processing.split_audio_track(
audio_file="input.wav",
output_directory="output_dir",
device="cpu",
voice_separation_rounds=2,
)
self.assertEqual(
vocals_path,
os.path.join(
"output_dir", audio_processing.AUDIO_PROCESSING, "vocals.wav"
),
)
self.assertEqual(
background_path,
os.path.join(
"output_dir", audio_processing.AUDIO_PROCESSING, "no_vocals.wav"
),
)

mock_build_command.assert_called_once_with(
audio_file="input.wav", output_directory="output_dir", device="cpu"
)
mock_copy.assert_any_call("vocals_file.wav", vocals_path)
mock_copy.assert_any_call("background_file.wav", background_path)
mock_rmtree.assert_called_once_with(
os.path.join(
"output_dir", audio_processing.AUDIO_PROCESSING, "htdemucs"
)
)

@patch("tensorflow.io.gfile.exists")
def test_split_audio_track_files_exist(self, mock_exists):
mock_exists.side_effect = [True, True]
vocals_path, background_path = audio_processing.split_audio_track(
audio_file="input.wav", output_directory="output_dir", device="cpu"
)
self.assertEqual(
vocals_path,
os.path.join(
"output_dir", audio_processing.AUDIO_PROCESSING, "vocals.mp3"
),
)
self.assertEqual(
background_path,
os.path.join(
"output_dir", audio_processing.AUDIO_PROCESSING, "no_vocals.mp3"
),
)


class CreatePyannoteTimestampsTest(absltest.TestCase):

def test_create_timestamps_with_silence(self):
@@ -456,6 +586,7 @@ def test_verify_modified_audio_chunk(
"start": 1.0,
"end": 2.0,
"path": "wrong_chunk.mp3",
"for_dubbing": True,
}
result = audio_processing.verify_modified_audio_chunk(
audio_file=audio_file_path,
@@ -505,6 +636,7 @@ def test_insert_audio_at_timestamps(self):
"start": 3.0,
"end": 5.0,
"dubbed_path": audio_chunk_path,
"for_dubbing": True,
}]
output_path = audio_processing.insert_audio_at_timestamps(
utterance_metadata=utterance_metadata,
