Skip to content
This repository has been archived by the owner on Dec 19, 2023. It is now read-only.

Commit

Permalink
feat: support MULAW and MP3_64_KBPS audio encoding and support timepo…
Browse files Browse the repository at this point in the history
…inting via SSML <mark> tag

PiperOrigin-RevId: 323424211

Source-Author: Google APIs <noreply@google.com>
Source-Date: Mon Jul 27 13:05:41 2020 -0700
Source-Repo: googleapis/googleapis
Source-Sha: a94df49e8f208649f2f5cb39a84668c6a3434ce8
Source-Link: googleapis/googleapis@a94df49
  • Loading branch information
yoshi-automation authored Jul 30, 2020
1 parent 4a3163b commit 8826c87
Show file tree
Hide file tree
Showing 7 changed files with 580 additions and 10 deletions.
39 changes: 37 additions & 2 deletions protos/google/cloud/texttospeech/v1beta1/cloud_tts.proto
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

Expand Down Expand Up @@ -83,7 +82,7 @@ enum SsmlVoiceGender {
// A female voice.
FEMALE = 2;

// A gender-neutral voice.
// A gender-neutral voice. This voice is not yet supported.
NEUTRAL = 3;
}

Expand All @@ -100,11 +99,18 @@ enum AudioEncoding {
// MP3 audio at 32kbps.
MP3 = 2;

// MP3 audio at 64kbps.
MP3_64_KBPS = 4;

// Opus encoded audio wrapped in an ogg container. The result will be a
// file which can be played natively on Android, and in browsers (at least
// Chrome and Firefox). The quality of the encoding is considerably higher
// than MP3 while using approximately the same bitrate.
OGG_OPUS = 3;

// 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
// Audio content returned as MULAW also contains a WAV header.
MULAW = 5;
}

// The message returned to the client by the `ListVoices` method.
Expand Down Expand Up @@ -132,6 +138,15 @@ message Voice {

// The top-level message sent by the client for the `SynthesizeSpeech` method.
message SynthesizeSpeechRequest {
// The type of timepoint information that is returned in the response.
// Values of this enum are carried by the `enable_time_pointing` field of
// `SynthesizeSpeechRequest`.
enum TimepointType {
// Not specified. No timepoint information will be returned. This is the
// zero value, i.e. the default when no other value is set.
TIMEPOINT_TYPE_UNSPECIFIED = 0;

// Timepoint information of `<mark>` tags in SSML input will be returned.
SSML_MARK = 1;
}

// Required. The Synthesizer requires either plain text or SSML as input.
SynthesisInput input = 1 [(google.api.field_behavior) = REQUIRED];

Expand All @@ -140,6 +155,9 @@ message SynthesizeSpeechRequest {

// Required. The configuration of the synthesized audio.
AudioConfig audio_config = 3 [(google.api.field_behavior) = REQUIRED];

// Which types of timepoints, if any, should be returned in the response.
repeated TimepointType enable_time_pointing = 4;
}

// Contains text input to be synthesized. Either `text` or `ssml` must be
Expand Down Expand Up @@ -251,4 +269,21 @@ message SynthesizeSpeechResponse {
// with all bytes fields, protobuffers use a pure binary representation,
// whereas JSON representations use base64.
bytes audio_content = 1;

// Links between positions in the original request input and the corresponding
// times in the output audio. Only supported via `<mark>` tags in SSML input.
repeated Timepoint timepoints = 2;

// The audio metadata of `audio_content`.
AudioConfig audio_config = 4;
}

// A mapping between a certain point in the input text and the corresponding
// time in the output audio. Instances are returned in the repeated
// `timepoints` field of `SynthesizeSpeechResponse`.
message Timepoint {
// Timepoint name as received from the client within the `<mark>` tag.
// NOTE(review): the field numbers in this message are 4 and 3; numbers 1 and
// 2 are not declared here. Presumably intentional -- confirm before assigning
// them to new fields.
string mark_name = 4;

// Time offset in seconds from the start of the synthesized audio.
double time_seconds = 3;
}
127 changes: 126 additions & 1 deletion protos/protos.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 8826c87

Please sign in to comment.