fix: Provided a mechanism to clean the background audio from vocals better to avoid the reported 'metallic voice' presence in the dubbed output.

Change-Id: I622eaa7429b535811d34e99280001131bc9e06ce
Kacper Krasowiak committed Oct 24, 2024
1 parent a6a1aa9 commit 9eb260e
Showing 6 changed files with 265 additions and 16 deletions.
2 changes: 1 addition & 1 deletion ariel/__init__.py
@@ -13,4 +13,4 @@
# limitations under the License.

"""Ariel library for for end-to-end video ad dubbing using AI."""
__version__ = "0.0.16"
__version__ = "0.0.17"
108 changes: 108 additions & 0 deletions ariel/audio_processing.py
@@ -220,6 +220,114 @@ def assemble_split_audio_file_paths(command: str) -> tuple[str, str]:
return audio_vocals_file, audio_background_file


def execute_vocals_non_vocals_split(
*, audio_file: str, output_directory: str, device: str
) -> tuple[str, str]:
"""Splits an audio file into vocal and non-vocal (background) components using Demucs.
Args:
audio_file: The path to the input audio file.
output_directory: The directory where the separated audio files will be
saved.
device: The device to use for Demucs processing (e.g., "cuda" for GPU or
"cpu").
Returns:
A tuple containing the paths to the separated audio files:
- The path to the vocals audio file.
- The path to the background audio file.
"""
demucs_command = build_demucs_command(
audio_file=audio_file,
output_directory=output_directory,
device=device,
)
audio_vocals_file, audio_background_file = assemble_split_audio_file_paths(
command=demucs_command
)
if tf.io.gfile.exists(audio_vocals_file) and tf.io.gfile.exists(
audio_background_file
):
logging.info(
"The DEMUCS command will not be executed, because the expected files"
f" {audio_vocals_file} and {audio_background_file} already exist."
)
else:
execute_demucs_command(command=demucs_command)
return audio_vocals_file, audio_background_file
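
For reference, a minimal usage sketch of this helper (illustrative only, not part of the commit). The input path and output directory are hypothetical placeholders, and torch is assumed to be importable, since Demucs already depends on it:

import torch

from ariel import audio_processing

# Prefer a GPU when one is available; Demucs separation is much slower on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
vocals_file, background_file = audio_processing.execute_vocals_non_vocals_split(
    audio_file="/tmp/ad_audio.mp3",          # placeholder input audio
    output_directory="/tmp/dubbing_output",  # placeholder working directory
    device=device,
)
print(vocals_file, background_file)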


def split_audio_track(
audio_file: str,
output_directory: str,
device: str,
voice_separation_rounds: int = 2,
) -> tuple[str, str]:
"""Splits an audio track into vocal and non-vocal components, with optional iterative refinement.
This function separates the vocals from the background music in an audio file.
It first checks if the separated files already exist to avoid redundant
processing.
If not, it uses the `execute_vocal_non_vocals_split` function to perform the
initial separation.
To further refine the separation, it can iteratively apply the voice
separation
process to the background track. This helps to remove any residual vocal
traces
from the background. Finally, it cleans up temporary files generated by
Demucs.
Args:
audio_file: The path to the input audio file.
output_directory: The directory to store the separated audio files.
device: The device to use for Demucs processing (e.g., "cuda" or "cpu").
voice_separation_rounds: The number of times to iteratively apply voice
separation to the background track (default is 2).
Returns:
A tuple containing the paths to the separated audio files:
- The path to the vocals audio file.
- The path to the background audio file.
"""
demucs_command = build_demucs_command(
audio_file=audio_file,
output_directory=output_directory,
device=device,
)
audio_vocals_file, _ = assemble_split_audio_file_paths(command=demucs_command)
file_extension = os.path.splitext(audio_vocals_file)[1]
vocals_path = os.path.join(
output_directory, AUDIO_PROCESSING, f"vocals{file_extension}"
)
background_path = os.path.join(
output_directory, AUDIO_PROCESSING, f"no_vocals{file_extension}"
)
if tf.io.gfile.exists(vocals_path) and tf.io.gfile.exists(background_path):
logging.info(
"The DEMUCS command will not be executed, because the expected files"
f" {vocals_path} and {background_path} already exist."
)
return vocals_path, background_path
audio_vocals_file, audio_background_file = execute_vocals_non_vocals_split(
audio_file=audio_file, output_directory=output_directory, device=device
)
tf.io.gfile.copy(audio_vocals_file, vocals_path)
tf.io.gfile.copy(audio_background_file, background_path)
if voice_separation_rounds > 1:
for _ in range(1, voice_separation_rounds):
_, audio_background_file = execute_vocals_non_vocals_split(
audio_file=background_path,
output_directory=output_directory,
device=device,
)
tf.io.gfile.copy(audio_background_file, background_path, overwrite=True)
tf.io.gfile.rmtree(
os.path.join(output_directory, AUDIO_PROCESSING, "htdemucs")
)
return vocals_path, background_path
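
A minimal sketch of calling the new entry point with an extra refinement round (illustrative only, not part of the commit); the paths are placeholders and three rounds is just an example value, not a recommended setting:

from ariel import audio_processing

vocals_path, background_path = audio_processing.split_audio_track(
    audio_file="/tmp/ad_audio.mp3",          # placeholder input audio
    output_directory="/tmp/dubbing_output",  # placeholder working directory
    device="cpu",
    # 3 means the initial split plus two extra Demucs passes over the background track.
    voice_separation_rounds=3,
)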


def create_pyannote_timestamps(
*,
audio_file: str,
26 changes: 11 additions & 15 deletions ariel/dubbing.py
@@ -554,6 +554,7 @@ def __init__(
adjust_speed: bool = False,
vocals_volume_adjustment: float = 5.0,
background_volume_adjustment: float = 0.0,
voice_separation_rounds: int = 2,
clean_up: bool = True,
pyannote_model: str = _DEFAULT_PYANNOTE_MODEL,
gemini_model_name: str = _DEFAULT_GEMINI_MODEL,
@@ -621,6 +622,9 @@ def __init__(
adjusted.
background_volume_adjustment: By how much the background audio volume
should be adjusted.
voice_separation_rounds: The number of times the background audio track
should be processed for vocal detection and removal. Additional rounds help
remove residual traces of the original voices from the dubbed ad.
clean_up: Whether to delete intermediate files after dubbing. Only the
final output and the utterance metadata will be kept.
pyannote_model: Name of the PyAnnote diarization model.
@@ -662,6 +666,7 @@ def __init__(
self.adjust_speed = adjust_speed
self.vocals_volume_adjustment = vocals_volume_adjustment
self.background_volume_adjustment = background_volume_adjustment
self.voice_separation_rounds = voice_separation_rounds
self.clean_up = clean_up
self.pyannote_model = pyannote_model
self.hugging_face_token = hugging_face_token
Expand Down Expand Up @@ -910,23 +915,14 @@ def run_preprocessing(self) -> None:
else:
video_file = None
audio_file = self.input_file
- demucs_command = audio_processing.build_demucs_command(
-     audio_file=audio_file,
-     output_directory=self.output_directory,
-     device=self.device,
- )
audio_vocals_file, audio_background_file = (
-     audio_processing.assemble_split_audio_file_paths(command=demucs_command)
+     audio_processing.split_audio_track(
+         audio_file=audio_file,
+         output_directory=self.output_directory,
+         device=self.device,
+         voice_separation_rounds=self.voice_separation_rounds,
+     )
)
- if tf.io.gfile.exists(audio_vocals_file) and tf.io.gfile.exists(
-     audio_background_file
- ):
-   logging.info(
-       "The DEMUCS command will not be executed, because the expected files"
-       f" {audio_vocals_file} and {audio_background_file} already exist."
-   )
- else:
-   audio_processing.execute_demucs_command(command=demucs_command)
utterance_metadata = audio_processing.create_pyannote_timestamps(
audio_file=audio_file,
number_of_speakers=self.number_of_speakers,
5 changes: 5 additions & 0 deletions examples/dubbing_workflow.ipynb
@@ -223,6 +223,10 @@
"vocals_volume_adjustment = \"5.0\" # @param {type:\"string\"}\n",
"vocals_volume_adjustment = float(vocals_volume_adjustment)\n",
"\n",
"# @markdown **voice_separation_rounds** The number of times the background audio file should be processed for voice detection and removal. It helps with the old voice artifacts being present in the dubbed ad.\n",
"voice_separation_rounds = \"2\" # @param [1, 2, 3, 4, 5]\n",
"voice_separation_rounds = int(voice_separation_rounds)\n",
"\n",
"# @markdown **background_volume_adjustment** By how much the background audio volume should be adjusted.\n",
"background_volume_adjustment = \"0.0\" # @param {type:\"string\"}\n",
"background_volume_adjustment = float(background_volume_adjustment)\n",
@@ -341,6 +345,7 @@
" adjust_speed=adjust_speed,\n",
" vocals_volume_adjustment=vocals_volume_adjustment,\n",
" background_volume_adjustment=background_volume_adjustment,\n",
" voice_separation_rounds=voice_separation_rounds,\n",
" clean_up=clean_up,\n",
" gemini_model_name=gemini_model_name,\n",
" temperature=gemini_temperature,\n",
8 changes: 8 additions & 0 deletions main.py
@@ -146,6 +146,13 @@
0.0,
"By how much the background audio volume should be adjusted.",
)
_VOICE_SEPARATION_ROUNDS = flags.DEFINE_integer(
"voice_separation_rounds",
2,
"The number of times the background audio track should be processed for"
" vocal detection and removal. Additional rounds help remove residual"
" traces of the original voices from the dubbed ad.",
)
_CLEAN_UP = flags.DEFINE_bool(
"clean_up",
False,
@@ -239,6 +246,7 @@ def main(argv: Sequence[str]) -> None:
adjust_speed=_ADJUST_SPEED.value,
vocals_volume_adjustment=_VOCALS_VOLUME_ADJUSTMENT.value,
background_volume_adjustment=_BACKGROUND_VOLUME_ADJUSTMENT.value,
voice_separation_rounds=_VOICE_SEPARATION_ROUNDS.value,
clean_up=_CLEAN_UP.value,
gemini_model_name=_GEMINI_MODEL_NAME.value,
temperature=_TEMPERATURE.value,
132 changes: 132 additions & 0 deletions tests/audio_processing_test.py
@@ -249,6 +249,136 @@ def test_execute_command_error(self, mock_run):
)


class TestExecuteVocalNonVocalsSplit(absltest.TestCase):

@mock.patch("ariel.audio_processing.execute_demucs_command")
@mock.patch("tensorflow.io.gfile.exists")
def test_execute_vocals_non_vocals_split_files_exist(
self, mock_exists, mock_execute_demucs_command
):
mock_exists.side_effect = [True, True]
audio_file = "test.wav"
output_directory = "output_dir"
device = "cpu"
_, _ = audio_processing.execute_vocals_non_vocals_split(
audio_file=audio_file, output_directory=output_directory, device=device
)
mock_execute_demucs_command.assert_not_called()

@mock.patch("ariel.audio_processing.execute_demucs_command")
@mock.patch("tensorflow.io.gfile.exists")
def test_execute_vocals_non_vocals_split_files_dont_exist(
self, mock_exists, mock_execute_demucs_command
):
mock_exists.side_effect = [False, False]
audio_file = "test.wav"
output_directory = "output_dir"
device = "cpu"
audio_processing.execute_vocals_non_vocals_split(
audio_file=audio_file, output_directory=output_directory, device=device
)
mock_execute_demucs_command.assert_called_once()

@mock.patch("ariel.audio_processing.build_demucs_command")
@mock.patch("ariel.audio_processing.execute_demucs_command")
@mock.patch("tensorflow.io.gfile.exists")
def test_execute_vocals_non_vocals_split_correct_command(
self, mock_exists, mock_execute_demucs_command, mock_build_demucs_command
):
mock_exists.side_effect = [False, False]
audio_file = "test.wav"
output_directory = "output_dir"
device = "cpu"
expected_command = (
"python -m demucs.separate -o 'output_dir/audio_processing' --device"
" cpu --shifts 10 --overlap 0.25 -j 0 --two-stems vocals 'test.wav'"
)
mock_build_demucs_command.return_value = expected_command
audio_processing.execute_vocals_non_vocals_split(
audio_file=audio_file, output_directory=output_directory, device=device
)
mock_build_demucs_command.assert_called_once_with(
audio_file=audio_file, output_directory=output_directory, device=device
)
mock_execute_demucs_command.assert_called_once_with(
command=expected_command
)


class TestSplitAudioTrack(absltest.TestCase):

@patch("tensorflow.io.gfile.exists")
@patch("tensorflow.io.gfile.copy")
@patch("tensorflow.io.gfile.rmtree")
@patch("ariel.audio_processing.execute_vocals_non_vocals_split")
@patch("ariel.audio_processing.build_demucs_command")
@patch("ariel.audio_processing.assemble_split_audio_file_paths")
def test_split_audio_track(
self,
mock_assemble_split_paths,
mock_build_command,
mock_execute_split,
mock_rmtree,
mock_copy,
mock_exists,
):
mock_execute_split.return_value = ("vocals_file.wav", "background_file.wav")
mock_build_command.return_value = "demucs_command"
mock_assemble_split_paths.return_value = (
"vocals_file.wav",
"background_file.wav",
)
mock_exists.side_effect = [False, False]
vocals_path, background_path = audio_processing.split_audio_track(
audio_file="input.wav",
output_directory="output_dir",
device="cpu",
voice_separation_rounds=2,
)
self.assertEqual(
vocals_path,
os.path.join(
"output_dir", audio_processing.AUDIO_PROCESSING, "vocals.wav"
),
)
self.assertEqual(
background_path,
os.path.join(
"output_dir", audio_processing.AUDIO_PROCESSING, "no_vocals.wav"
),
)

mock_build_command.assert_called_once_with(
audio_file="input.wav", output_directory="output_dir", device="cpu"
)
mock_copy.assert_any_call("vocals_file.wav", vocals_path)
mock_copy.assert_any_call("background_file.wav", background_path)
mock_rmtree.assert_called_once_with(
os.path.join(
"output_dir", audio_processing.AUDIO_PROCESSING, "htdemucs"
)
)

@patch("tensorflow.io.gfile.exists")
def test_split_audio_track_files_exist(self, mock_exists):
mock_exists.side_effect = [True, True]
vocals_path, background_path = audio_processing.split_audio_track(
audio_file="input.wav", output_directory="output_dir", device="cpu"
)
self.assertEqual(
vocals_path,
os.path.join(
"output_dir", audio_processing.AUDIO_PROCESSING, "vocals.mp3"
),
)
self.assertEqual(
background_path,
os.path.join(
"output_dir", audio_processing.AUDIO_PROCESSING, "no_vocals.mp3"
),
)


class CreatePyannoteTimestampsTest(absltest.TestCase):

def test_create_timestamps_with_silence(self):
@@ -456,6 +586,7 @@ def test_verify_modified_audio_chunk(
"start": 1.0,
"end": 2.0,
"path": "wrong_chunk.mp3",
"for_dubbing": True,
}
result = audio_processing.verify_modified_audio_chunk(
audio_file=audio_file_path,
@@ -505,6 +636,7 @@ def test_insert_audio_at_timestamps(self):
"start": 3.0,
"end": 5.0,
"dubbed_path": audio_chunk_path,
"for_dubbing": True,
}]
output_path = audio_processing.insert_audio_at_timestamps(
utterance_metadata=utterance_metadata,
