Add the Fern definition for the POST /infill/bytes endpoint and make a
whitespace-only change in fern/generators.yml.

Review notes:
1. Four request properties in infill.yml were typed as a bare `optional`,
   which has no type argument. The arguments are restored below from the
   field docs and from the `tts` import, which nothing else in the file
   references. Confirm the exact type names against tts.yml.
2. `left_audio` and `right_audio` are both declared as required `file`
   properties, while the endpoint docs say only one of them has to be
   provided. Left unchanged here; confirm which of the two is intended.
3. The generators.yml hunk only changes whitespace on the
   `extraDevDependencies:` line. The exact whitespace, and the indentation
   of the surrounding context lines, is reconstructed; use
   `git apply --ignore-whitespace` if the hunk does not match.

diff --git a/fern/definition/infill.yml b/fern/definition/infill.yml
new file mode 100644
index 0000000..9d71422
--- /dev/null
+++ b/fern/definition/infill.yml
@@ -0,0 +1,84 @@
+imports:
+  tts: ./tts.yml
+  voice_changer: ./voice-changer.yml
+
+service:
+  base-path: /infill
+  auth: true
+  endpoints:
+    bytes:
+      path: /bytes
+      method: POST
+      display-name: Infill (Bytes)
+      docs: |
+        Generate audio that smoothly connects two existing audio segments. This is useful for inserting new speech between existing speech segments while maintaining natural transitions.
+
+        At least one of `left_audio` or `right_audio` must be provided.
+      request:
+        name: InfillBytesRequest
+        body:
+          properties:
+            left_audio:
+              type: file
+            right_audio:
+              type: file
+            model_id[]:
+              type: string
+              docs: The ID of the model to use for generating audio
+            language[]:
+              type: string
+              docs: The language of the transcript
+            transcript[]:
+              type: string
+              docs: The infill text to generate
+            voice[id]:
+              type: string
+              docs: The ID of the voice to use for generating audio
+            output_format[container]:
+              type: voice_changer.OutputFormatContainer
+              docs: The format of the output audio
+            output_format[sample_rate]:
+              type: integer
+              docs: The sample rate of the output audio
+            output_format[encoding]:
+              type: optional<tts.RawEncoding>
+              docs: |
+                Required for `raw` and `wav` containers.
+            output_format[bit_rate]:
+              type: optional<integer>
+              docs: |
+                Required for `mp3` containers.
+            voice[__experimental_controls][speed]:
+              type: optional<tts.Speed>
+              docs: |
+                Either a number between -1.0 and 1.0 or a natural language description of speed.
+
+                If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the fastest speed.
+            voice[__experimental_controls][emotion][]:
+              type: optional<list<tts.Emotion>>
+              docs: |
+                An array of emotion:level tags.
+
+                Supported emotions are: anger, positivity, surprise, sadness, and curiosity.
+
+                Supported levels are: lowest, low, (omit), high, highest.
+      response: file
+      examples:
+        - name: MP3
+          request:
+            model_id[]: sonic-english
+            language[]: en
+            transcript[]: middle segment
+            voice[id]: 694f9389-aac1-45b6-b726-9d9369183238
+            output_format[container]: mp3
+            output_format[sample_rate]: 44100
+            output_format[bit_rate]: 128000
+        - name: WAV
+          request:
+            model_id[]: sonic-english
+            language[]: en
+            transcript[]: middle segment
+            voice[id]: 694f9389-aac1-45b6-b726-9d9369183238
+            output_format[container]: wav
+            output_format[sample_rate]: 44100
+            output_format[encoding]: pcm_f32le
diff --git a/fern/generators.yml b/fern/generators.yml
index 613a141..9c38928 100644
--- a/fern/generators.yml
+++ b/fern/generators.yml
@@ -40,6 +40,6 @@ groups:
             emittery: "^0.13.1"
             human-id: "^4.1.1"
             ws: "^8.15.13"
-          extraDevDependencies:
+          extraDevDependencies:
             "@types/ws": "^8.5.13"
         smart-casing: true