diff --git a/ariel/__init__.py b/ariel/__init__.py index 836e315..dc2ed2e 100644 --- a/ariel/__init__.py +++ b/ariel/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. """Ariel library for for end-to-end video ad dubbing using AI.""" -__version__ = "0.0.3" \ No newline at end of file +__version__ = "0.0.4" \ No newline at end of file diff --git a/ariel/dubbing.py b/ariel/dubbing.py index 7d67897..000e99b 100644 --- a/ariel/dubbing.py +++ b/ariel/dubbing.py @@ -87,6 +87,19 @@ " You translated it as: '{}'. The target language was: '{}'. The company" " asks you to modify this translation: '{}'" ) +_REQUIRED_KEYS: Final[set] = {"text", "start", "end"} +_COMPATIBLE_SSML_GENDERS: Final[set] = {"Male", "Female"} +_REQUIRED_GOOGLE_TTS_PARAMETERS: Final[set] = { + "pitch", + "speed", + "volume_gain_db", +} +_REQUIRED_ELEVENLABS_PARAMETERS: Final[set] = { + "stability", + "similarity_boost", + "style", + "use_speaker_boost", +} def is_video(*, input_file: str) -> bool: @@ -167,10 +180,10 @@ class PreprocessingArtifacts: vocals. """ - video_file: str + video_file: str | None audio_file: str - audio_vocals_file: str - audio_background_file: str + audio_vocals_file: str | None = None + audio_background_file: str | None = None @dataclasses.dataclass @@ -210,6 +223,175 @@ class ElevenLabsAccessError(Exception): pass +def _add_items_to_dictionaries( + *, + utterance_metadata: Sequence[Mapping[str, str | float]], + items: Sequence[str], + key: str | None = None, +): + """Adds items from a list to the utterance metadata. + + Args: + utterance_metadata: The list of the source mappings. + items: The list of items to add. + key: The key to use for the new item in each dictionary if items are not + dictionaries. + + Returns: + An updated sequence with utterance metadata. + + Raises: + ValueError: If the lengths of utterance metadata and items are not equal. + """ + utterance_metadata_length = len(utterance_metadata) + items_length = len(items) + if utterance_metadata_length != items_length: + raise ValueError( + f"The number of dictionaries for the key '{key}' and items must be" + f" equal. Received: {utterance_metadata_length} and" + f" {items_length} respectively." + ) + updated_utterance_metadata = [] + for dictionary, item in zip(utterance_metadata, items): + dictionary_copy = dictionary.copy() + if isinstance(item, dict): + dictionary_copy.update(item) + else: + dictionary_copy[key] = item + updated_utterance_metadata.append(dictionary_copy) + return updated_utterance_metadata + + +def _verify_dictionary( + *, + dictionary_to_verify: Sequence[Mapping[str, str | float]], + required_keys: set, +) -> None: + """Verifies the completeness of a dictionary. + + Args: + dictionary_to_verify: A sequence of dictionaries to verify. + required_keys: A set of strings representing the mandatory keys expected + in each parameter dictionary. + + Raises: + KeyError: If any dictionary within `dictionary_to_verify` is missing one + or more of the `required_keys`. + """ + for dictionary in dictionary_to_verify: + missing_keys = required_keys - set(dictionary.keys()) + if missing_keys: + raise KeyError( + f"Dictionary is missing keys: {missing_keys}. 
Problematic dictionary:" + f" {dictionary}" + ) + + +def assemble_utterance_metadata_for_dubbing_from_script( + *, + script_with_timestamps: Sequence[Mapping[str, str | float]], + assigned_voice: str | Sequence[str], + use_elevenlabs: bool = False, + google_text_to_speech_parameters: ( + Mapping[str, str | float] | Sequence[Mapping[str, str | float]] | None + ) = {"pitch": -5.0, "speed": 1.0, "volume_gain_db": 16.0}, + elevenlabs_text_to_speech_parameters: ( + Mapping[str, str | float] | Sequence[Mapping[str, str | float]] | None + ) = { + "stability": 0.5, + "similarity_boost": 0.75, + "style": 0.0, + "use_speaker_boost": True, + }, +): + """Assembles utterance metadata for dubbing based on a script with timestamps. + + This function takes a script with timestamps, voice assignments, and other + parameters to create a structured metadata representation suitable for dubbing + tasks. It validates the input data, adds necessary fields, and handles + different text-to-speech (TTS) configurations. + + Args: + script_with_timestamps: A sequence of dictionaries, each containing + information about an utterance in the script: - "text": The text of the + utterance. - "start": The start time of the utterance (in seconds). - + "end": The end time of the utterance (in seconds). + assigned_voice: The name of the assigned voice (or a list of names if each + utterance has a different voice). + use_elevenlabs: If True, use ElevenLabs TTS parameters; otherwise, use + Google TTS parameters. + google_text_to_speech_parameters: A dictionary or list of dictionaries + with Google TTS parameters (only used if `use_elevenlabs` is False). + elevenlabs_text_to_speech_parameters: A dictionary or list of dictionaries + with ElevenLabs TTS parameters (only used if `use_elevenlabs` is True). + + Returns: + A sequence of dictionaries, each containing enriched metadata for an + utterance: + - All keys from the original `script_with_timestamps` dictionaries. + - "for_dubbing": Always set to True. + - "assigned_voice": The assigned voice name. + - Additional TTS parameters based on the `use_elevenlabs` flag and the + corresponding parameter dictionaries. + + Raises: + KeyError: If a dictionary in `script_with_timestamps` is missing "text", + "start", or "end" keys. + KeyError: If the specified TTS parameter dictionary is missing required + keys. 
+ + Example: + ```python + script = [ + {"text": "Hello, world!", "start": 0.0, "end": 1.5}, + {"text": "This is a test.", "start": 2.0, "end": 3.8}, + ] + metadata = assemble_utterance_metadata_for_dubbing_from_script( + script_with_timestamps=script, + assigned_voice=["Alice", "Bob"], + use_elevenlabs=False, + google_text_to_speech_parameters=[{"pitch": -2.0}, {"speed": 0.9}], + ) + print(metadata) + ``` + """ + _verify_dictionary( + dictionary_to_verify=script_with_timestamps, required_keys=_REQUIRED_KEYS + ) + number_of_utterances = len(script_with_timestamps) + for_dubbing = [True] * number_of_utterances + utterance_metadata_with_for_dubbing = _add_items_to_dictionaries( + utterance_metadata=script_with_timestamps, + items=for_dubbing, + key="for_dubbing", + ) + if not isinstance(assigned_voice, list): + assigned_voice = [assigned_voice] * number_of_utterances + utterance_metadata_with_assigned_voice = _add_items_to_dictionaries( + utterance_metadata=utterance_metadata_with_for_dubbing, + items=assigned_voice, + key="assigned_voice", + ) + if use_elevenlabs: + text_to_speech_parameters = elevenlabs_text_to_speech_parameters + required_keys = _REQUIRED_ELEVENLABS_PARAMETERS + else: + text_to_speech_parameters = google_text_to_speech_parameters + required_keys = _REQUIRED_GOOGLE_TTS_PARAMETERS + if not isinstance(text_to_speech_parameters, list): + text_to_speech_parameters = [ + text_to_speech_parameters + ] * number_of_utterances + _verify_dictionary( + dictionary_to_verify=text_to_speech_parameters, + required_keys=required_keys, + ) + return _add_items_to_dictionaries( + utterance_metadata=utterance_metadata_with_assigned_voice, + items=text_to_speech_parameters, + ) + + class Dubber: """A class to manage the entire ad dubbing process.""" @@ -249,6 +431,7 @@ def __init__( elevenlabs_clone_voices: bool = False, elevenlabs_model: str = _DEFAULT_ELEVENLABS_MODEL, number_of_steps: int = _NUMBER_OF_STEPS, + with_verification: bool = True, ) -> None: """Initializes the Dubber class with various parameters for dubbing configuration. @@ -304,6 +487,8 @@ def __init__( elevenlabs_model: The ElevenLabs model to use in the Text-To-Speech process. number_of_steps: The total number of steps in the dubbing process. + with_verification: Whether a user wishes to verify, and optionally edit, + the utterance metadata in the dubbing process. """ self.input_file = input_file self.output_directory = output_directory @@ -337,6 +522,7 @@ def __init__( self.safety_settings = safety_settings self.utterance_metadata = None self._number_of_steps = number_of_steps + self.with_verification = with_verification self._rerun = False @functools.cached_property @@ -610,6 +796,25 @@ def run_preprocessing(self) -> None: logging.info("Completed preprocessing.") self.progress_bar.update() + def run_preprocessing_for_dubbing_from_script(self) -> None: + """Splits audio/video. + + Returns: + A named tuple containing paths and metadata of the processed files. + """ + if self.is_video: + video_file, audio_file = video_processing.split_audio_video( + video_file=self.input_file, output_directory=self.output_directory + ) + else: + video_file = None + audio_file = self.input_file + + self.preprocesing_output = PreprocessingArtifacts( + video_file=video_file, + audio_file=audio_file, + ) + def run_speech_to_text(self) -> None: """Transcribes audio, applies speaker diarization, and updates metadata with Gemini. 
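Editorial note on the script-based preprocessing step added above: because the vocal/background separation step is skipped, `PreprocessingArtifacts` is created with only `video_file` and `audio_file` set, and the two separated-track fields now default to `None`. A standalone sketch of that shape and of the fallback it enables (paths are hypothetical, and the dataclass is copied locally just so the snippet runs on its own):

```python
# Illustrative sketch only: the artifact shape produced by the new
# run_preprocessing_for_dubbing_from_script path, with hypothetical paths
# and a local copy of the updated dataclass so the snippet is standalone.
import dataclasses


@dataclasses.dataclass
class PreprocessingArtifacts:
  video_file: str | None
  audio_file: str
  audio_vocals_file: str | None = None
  audio_background_file: str | None = None


# Script-based dubbing skips vocal/background separation, so only the
# split video and audio paths are populated; the separated tracks stay None.
artifacts = PreprocessingArtifacts(
    video_file="output/ad_video.mp4",
    audio_file="output/ad_audio.mp3",
)

# Downstream steps fall back to the raw audio when no separated background
# track exists (this mirrors the run_postprocessing change further below).
background_audio_file = (
    artifacts.audio_background_file
    if artifacts.audio_background_file
    else artifacts.audio_file
)
assert background_audio_file == "output/ad_audio.mp3"
```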
@@ -1130,11 +1335,15 @@ def run_postprocessing(self) -> None: dubbed_audio_vocals_file = audio_processing.insert_audio_at_timestamps( utterance_metadata=self.utterance_metadata, - background_audio_file=self.preprocesing_output.audio_background_file, + background_audio_file=self.preprocesing_output.audio_background_file + if self.preprocesing_output.audio_background_file + else self.preprocesing_output.audio_file, output_directory=self.output_directory, ) dubbed_audio_file = audio_processing.merge_background_and_vocals( - background_audio_file=self.preprocesing_output.audio_background_file, + background_audio_file=self.preprocesing_output.audio_background_file + if self.preprocesing_output.audio_background_file + else self.preprocesing_output.audio_file, dubbed_vocals_audio_file=dubbed_audio_vocals_file, output_directory=self.output_directory, target_language=self.target_language, @@ -1225,7 +1434,8 @@ def dub_ad(self) -> PostprocessingArtifacts: self.run_speech_to_text() self.run_translation() self.run_configure_text_to_speech() - self._run_verify_utterance_metadata() + if self.with_verification: + self._run_verify_utterance_metadata() self.run_text_to_speech() self.run_save_utterance_metadata() self.run_postprocessing() @@ -1278,7 +1488,8 @@ def dub_ad_with_utterance_metadata( "The class utterance metadata was overwritten with the provided input." ) self._rerun = True - self._run_verify_utterance_metadata() + if self.with_verification: + self._run_verify_utterance_metadata() self.run_text_to_speech() self.run_postprocessing() logging.info("Dubbing process finished.") @@ -1313,10 +1524,90 @@ def dub_ad_with_different_language( self._rerun = True self.run_translation() self.run_configure_text_to_speech() - self._run_verify_utterance_metadata() + if self.with_verification: + self._run_verify_utterance_metadata() self.run_text_to_speech() self.run_save_utterance_metadata() self.run_postprocessing() + logging.info("Dubbing process finished.") + logging.info("Output files saved in: %s.", self.output_directory) + return self.postprocessing_output + + def dub_ad_from_script( + self, + *, + script_with_timestamps: Sequence[Mapping[str, str | float]], + assigned_voice: str | Sequence[str], + google_text_to_speech_parameters: ( + Mapping[str, str | float] | Sequence[Mapping[str, str | float]] + ) = {"pitch": -5.0, "speed": 1.0, "volume_gain_db": 16.0}, + elevenlabs_text_to_speech_parameters: ( + Mapping[str, str | float] | Sequence[Mapping[str, str | float]] + ) = { + "stability": 0.5, + "similarity_boost": 0.75, + "style": 0.0, + "use_speaker_boost": True, + }, + ) -> PostprocessingArtifacts: + """Orchestrates the complete ad dubbing process from a script with timestamps. + + This method takes a script with timestamps, assigns voices, and performs the + following steps: + + 1. Prepares utterance metadata for dubbing based on the script. + 2. Runs preprocessing steps on the script. + 3. Performs translation of the script if necessary. + 4. Verifies utterance metadata (optional). + 5. Synthesizes speech using either Google Text-to-Speech or ElevenLabs. + 6. Executes post-processing tasks on the synthesized speech. + + Args: + script_with_timestamps: A sequence of mappings detailing each + utterance's metadata. Each mapping should contain: * 'start', 'end': + Utterance start/end times in seconds (float). * 'text': The text + content of the utterance. + assigned_voice: The name of the assigned voice(s) for the utterances + (either a single string or a sequence of strings). 
+ google_text_to_speech_parameters: Parameters for Google Text-to-Speech + synthesis. + elevenlabs_text_to_speech_parameters: Parameters for ElevenLabs + Text-to-Speech synthesis. + + Returns: + PostprocessingArtifacts: An object containing the post-processed dubbing + results. + """ + + logging.info("Dubbing process from script starting...") + if self.use_elevenlabs and self.elevenlabs_clone_voices: + logging.warning( + "Voices won't be cloned when dubbing from script. You can only use" + " off-the-shelf voices (e.g. 'Charlie') from ElevenLabs." + ) + self.elevenlabs_clone_voices = False + self.utterance_metadata = assemble_utterance_metadata_for_dubbing_from_script( + script_with_timestamps=script_with_timestamps, + assigned_voice=assigned_voice, + use_elevenlabs=self.use_elevenlabs, + google_text_to_speech_parameters=google_text_to_speech_parameters, + elevenlabs_text_to_speech_parameters=elevenlabs_text_to_speech_parameters, + ) + self._rerun = True + self.run_preprocessing_for_dubbing_from_script() + if self.original_language != self.target_language: + self.run_translation() + else: + updated_utterance_metadata = [] + for utterance in self.utterance_metadata: + utterance_copy = utterance.copy() + utterance_copy["translated_text"] = utterance_copy["text"] + updated_utterance_metadata.append(utterance_copy) + self.utterance_metadata = updated_utterance_metadata + if self.with_verification: + self._run_verify_utterance_metadata() + self.run_text_to_speech() + self.run_postprocessing() if self.clean_up: self.run_clean_directory() logging.info("Dubbing process finished.") diff --git a/ariel/text_to_speech.py b/ariel/text_to_speech.py index 7e797b1..250f776 100644 --- a/ariel/text_to_speech.py +++ b/ariel/text_to_speech.py @@ -338,21 +338,19 @@ def convert_text_to_speech( def calculate_target_utterance_speed( *, - reference_file: str, + reference_length: float, dubbed_file: str, ) -> float: """Returns the ratio between the reference and target duration. Args: - reference_file: The path to the reference MP3 file. + reference_length: The reference length of an audio chunk. dubbed_file: The path to the dubbed MP3 file. """ - reference_audio = AudioSegment.from_file(reference_file) dubbed_audio = AudioSegment.from_file(dubbed_file) - reference_duration = reference_audio.duration_seconds dubbed_duration = dubbed_audio.duration_seconds - return dubbed_duration / reference_duration + return dubbed_duration / reference_length def elevenlabs_convert_text_to_speech( @@ -456,7 +454,7 @@ def elevenlabs_run_clone_voices( def adjust_audio_speed( *, - reference_file: str, + reference_length: float, dubbed_file: str, speed: float | None = None, chunk_size: int = _DEFAULT_CHUNK_SIZE, @@ -467,7 +465,7 @@ def adjust_audio_speed( is the same or shorter than the duration of the reference file. Args: - reference_file: The path to the reference MP3 file. + reference_length: The reference length of an audio chunk. dubbed_file: The path to the dubbed MP3 file. speed: The desired speed in seconds. If None it will be determined based on the duration of the reference_file and dubbed_file. 
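For context on the signature change above: the reference duration is now taken from the utterance timestamps (`end` minus `start`) rather than re-measured from a reference audio file, which is what allows dubbing from a script when no per-utterance source chunk exists on disk. A minimal sketch of the calculation with made-up numbers (the helper name and file name are hypothetical; pydub is the dependency already used in this module):

```python
# Sketch of the new reference_length-based speed calculation. The utterance
# values and the dubbed file name are hypothetical; the real call that would
# read an actual file is left commented out.
from pydub import AudioSegment


def target_utterance_speed(reference_length: float, dubbed_file: str) -> float:
  """Ratio of the dubbed clip's duration to the reference duration."""
  dubbed_duration = AudioSegment.from_file(dubbed_file).duration_seconds
  return dubbed_duration / reference_length


utterance = {"start": 2.0, "end": 5.2}
reference_length = utterance["end"] - utterance["start"]  # 3.2 s in the timeline

# If the synthesized clip were 4.0 s long, the ratio would be 4.0 / 3.2 = 1.25,
# i.e. the dubbed audio must be sped up ~1.25x to fit the original slot;
# ratios <= 1.0 leave the audio untouched in adjust_audio_speed.
# speed = target_utterance_speed(reference_length, "dubbed_chunk_2.0_5.2.mp3")
```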
@@ -478,7 +476,7 @@ def adjust_audio_speed( dubbed_audio = AudioSegment.from_file(dubbed_file) if not speed: speed = calculate_target_utterance_speed( - reference_file=reference_file, dubbed_file=dubbed_file + reference_length=reference_length, dubbed_file=dubbed_file ) if speed <= 1.0: return @@ -545,19 +543,28 @@ def dub_utterances( updated_utterance_metadata = [] for utterance in utterance_metadata: utterance_copy = utterance.copy() - if not utterance["for_dubbing"]: - dubbed_path = utterance["path"] + if not utterance_copy["for_dubbing"]: + try: + dubbed_path = utterance_copy["path"] + except KeyError: + dubbed_path = f"chunk_{utterance['start']}_{utterance['end']}.mp3" else: if elevenlabs_clone_voices: - assigned_voice = speaker_to_voices_mapping[utterance["speaker_id"]] + assigned_voice = speaker_to_voices_mapping[utterance_copy["speaker_id"]] else: - assigned_voice = utterance["assigned_voice"] - path = utterance["path"] - text = utterance["translated_text"] - base_filename = os.path.splitext(os.path.basename(path))[0] - output_filename = os.path.join( - output_directory, f"dubbed_{base_filename}.mp3" - ) + assigned_voice = utterance_copy["assigned_voice"] + reference_length = utterance_copy["end"] - utterance_copy["start"] + text = utterance_copy["translated_text"] + try: + path = utterance_copy["path"] + base_filename = os.path.splitext(os.path.basename(path))[0] + output_filename = os.path.join( + output_directory, f"dubbed_{base_filename}.mp3" + ) + except KeyError: + output_filename = os.path.join( + output_directory, f"dubbed_chunk_{utterance['start']}_{utterance['end']}.mp3" + ) if use_elevenlabs: dubbed_path = elevenlabs_convert_text_to_speech( client=client, @@ -565,10 +572,10 @@ def dub_utterances( assigned_elevenlabs_voice=assigned_voice, output_filename=output_filename, text=text, - stability=utterance["stability"], - similarity_boost=utterance["similarity_boost"], - style=utterance["style"], - use_speaker_boost=utterance["use_speaker_boost"], + stability=utterance_copy["stability"], + similarity_boost=utterance_copy["similarity_boost"], + style=utterance_copy["style"], + use_speaker_boost=utterance_copy["use_speaker_boost"], ) else: dubbed_path = convert_text_to_speech( @@ -577,21 +584,21 @@ def dub_utterances( target_language=target_language, output_filename=output_filename, text=text, - pitch=utterance["pitch"], - speed=utterance["speed"], - volume_gain_db=utterance["volume_gain_db"], + pitch=utterance_copy["pitch"], + speed=utterance_copy["speed"], + volume_gain_db=utterance_copy["volume_gain_db"], ) condition_one = adjust_speed and use_elevenlabs - assigned_voice = utterance.get("assigned_voice", None) + assigned_voice = utterance_copy.get("assigned_voice", None) assigned_voice = assigned_voice if assigned_voice else "" condition_two = adjust_speed and "Journey" in assigned_voice speed = calculate_target_utterance_speed( - reference_file=utterance["path"], dubbed_file=dubbed_path + reference_length=reference_length, dubbed_file=dubbed_path ) if condition_one or condition_two: chunk_size = utterance_copy.get("chunk_size", _DEFAULT_CHUNK_SIZE) adjust_audio_speed( - reference_file=utterance["path"], + reference_length=reference_length, dubbed_file=dubbed_path, chunk_size=chunk_size, ) @@ -604,9 +611,9 @@ def dub_utterances( target_language=target_language, output_filename=output_filename, text=text, - pitch=utterance["pitch"], + pitch=utterance_copy["pitch"], speed=speed, - volume_gain_db=utterance["volume_gain_db"], + volume_gain_db=utterance_copy["volume_gain_db"], ) 
utterance_copy["dubbed_path"] = dubbed_path updated_utterance_metadata.append(utterance_copy) diff --git a/examples/video_ad_dubbing_gtech_ads_ariel_demo.ipynb b/examples/video_ad_dubbing_gtech_ads_ariel_demo.ipynb index 227f64e..d17a588 100644 --- a/examples/video_ad_dubbing_gtech_ads_ariel_demo.ipynb +++ b/examples/video_ad_dubbing_gtech_ads_ariel_demo.ipynb @@ -518,7 +518,11 @@ "\n", "# @markdown **clean_up** The indicator if to remove all artifacts in the output directory except the dubbed video / audio file and the utterance metadata.\n", "clean_up = \"True\" # @param [\"True\", \"False\"]\n", - "clean_up = False if clean_up == \"False\" else True" + "clean_up = False if clean_up == \"False\" else True\n", + "\n", + "# @markdown **with_verification** The indicator whether a user wishes to verify, and optionally edit, the utterance metadata in the dubbing process.\n", + "with_verification = \"True\" # @param [\"True\", \"False\"]\n", + "with_verification = False if with_verification == \"False\" else True" ] }, { @@ -616,6 +620,7 @@ " use_elevenlabs=use_elevenlabs,\n", " elevenlabs_token=elevenlabs_token,\n", " elevenlabs_clone_voices=elevenlabs_clone_voices,\n", + " with_verification=with_verification,\n", " )\n", "dubber.dub_ad()" ] diff --git a/main.py b/main.py index 14cdb01..0b55c8a 100644 --- a/main.py +++ b/main.py @@ -152,7 +152,11 @@ False, "Whether to clone source voices. It requires using ElevenLabs API.", ) - +_WITH_VERIFICATION = flags.DEFINE_bool( + "with_verification", + True, + "Verify, and optionally edit, the utterance metadata in the dubbing process.", +) def main(argv: Sequence[str]) -> None: @@ -186,6 +190,7 @@ def main(argv: Sequence[str]) -> None: use_elevenlabs=_USE_ELEVENLABS.value, elevenlabs_token=_ELEVENLABS_TOKEN.value, elevenlabs_clone_voices=_ELEVENLABS_CLONE_VOICES.value, + with_verification=_WITH_VERIFICATION.value, ) dubber.dub_ad() diff --git a/tests/dubbing_test.py b/tests/dubbing_test.py index 5f8263d..c33e062 100644 --- a/tests/dubbing_test.py +++ b/tests/dubbing_test.py @@ -73,5 +73,96 @@ def test_nonexistent_file(self): dubbing.read_system_settings("nonexistent.txt") +class TestAssembleUtteranceMetadata(parameterized.TestCase): + + @parameterized.named_parameters( + ( + "Basic Case", + [ + {"text": "Hello there!", "start": 0.0, "end": 2.5}, + {"text": "How are you?", "start": 3.0, "end": 5.2}, + ], + "John Doe", + False, + {"pitch": -3.0, "speed": 1.2, "volume_gain_db": 10.0}, + None, + [ + { + "text": "Hello there!", + "start": 0.0, + "end": 2.5, + "for_dubbing": True, + "assigned_voice": "John Doe", + "pitch": -3.0, + "speed": 1.2, + "volume_gain_db": 10.0, + }, + { + "text": "How are you?", + "start": 3.0, + "end": 5.2, + "for_dubbing": True, + "assigned_voice": "John Doe", + "pitch": -3.0, + "speed": 1.2, + "volume_gain_db": 10.0, + }, + ], + ), + ( + "ElevenLabs Case", + [ + {"text": "This is for ElevenLabs", "start": 0.0, "end": 2.0}, + ], + "David", + True, + None, + { + "stability": 0.6, + "similarity_boost": 0.8, + "style": 0.2, + "use_speaker_boost": False, + }, + [{ + "text": "This is for ElevenLabs", + "start": 0.0, + "end": 2.0, + "for_dubbing": True, + "assigned_voice": "David", + "stability": 0.6, + "similarity_boost": 0.8, + "style": 0.2, + "use_speaker_boost": False, + }], + ), + ) + def test_assemble_utterance_metadata( + self, + script_with_timestamps, + assigned_voice, + use_elevenlabs, + google_text_to_speech_parameters, + elevenlabs_text_to_speech_parameters, + expected_output, + ): + result = 
dubbing.assemble_utterance_metadata_for_dubbing_from_script( + script_with_timestamps=script_with_timestamps, + assigned_voice=assigned_voice, + use_elevenlabs=use_elevenlabs, + google_text_to_speech_parameters=google_text_to_speech_parameters, + elevenlabs_text_to_speech_parameters=elevenlabs_text_to_speech_parameters, + ) + self.assertEqual(result, expected_output) + + def test_missing_key_raises_key_error(self): + with self.assertRaises(KeyError): + dubbing.assemble_utterance_metadata_for_dubbing_from_script( + script_with_timestamps=[ + {"text": "This is incomplete", "start": 1.0}, + ], + assigned_voice="Jane Smith", + ) + + if __name__ == "__main__": absltest.main()
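Taken together, these changes add a script-driven dubbing path alongside the existing flow. The sketch below shows how the new entry point might be called; it assumes a `Dubber` instance already configured as in the demo notebook (its constructor arguments are not repeated here to avoid guessing at the full signature), and the voice names and timings are placeholder values.

```python
# Hypothetical usage of the new Dubber.dub_ad_from_script entry point.
# `dubber` is assumed to be configured as in the notebook cell above
# (with with_verification=False to skip the interactive review step).
script_with_timestamps = [
    {"text": "Hello, world!", "start": 0.0, "end": 1.5},
    {"text": "This is a test.", "start": 2.0, "end": 3.8},
]

# When original_language equals target_language, the translation step is
# skipped and "translated_text" is simply copied from "text" (see the
# dub_ad_from_script body in this patch).
output = dubber.dub_ad_from_script(
    script_with_timestamps=script_with_timestamps,
    assigned_voice=["en-US-Journey-D", "en-US-Journey-F"],  # placeholder voices
)
print(output)  # PostprocessingArtifacts with paths to the dubbed output
```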