feat: support MULAW and MP3_64_KBPS audio encoding and support timepointing via SSML <mark> tag

PiperOrigin-RevId: 323424211
Source-Author: Google APIs <noreply@google.com>
Source-Date: Mon Jul 27 13:05:41 2020 -0700
Source-Repo: googleapis/googleapis
Source-Sha: a94df49e8f208649f2f5cb39a84668c6a3434ce8
Source-Link: googleapis/googleapis@a94df49
googleapis · Jul 30, 2020 · 8826c87 · 8826c87
1 parent 4a3163b
commit 8826c87
Show file tree

Hide file tree

Showing 7 changed files with 580 additions and 10 deletions.
diff --git a/protos/google/cloud/texttospeech/v1beta1/cloud_tts.proto b/protos/google/cloud/texttospeech/v1beta1/cloud_tts.proto
@@ -11,7 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-//
 
 syntax = "proto3";
 
@@ -83,7 +82,7 @@ enum SsmlVoiceGender {
   // A female voice.
   FEMALE = 2;
 
-  // A gender-neutral voice.
+  // A gender-neutral voice. This voice is not yet supported.
   NEUTRAL = 3;
 }
 
@@ -100,11 +99,18 @@ enum AudioEncoding {
   // MP3 audio at 32kbps.
   MP3 = 2;
 
+  // MP3 at 64kbps.
+  MP3_64_KBPS = 4;
+
   // Opus encoded audio wrapped in an ogg container. The result will be a
   // file which can be played natively on Android, and in browsers (at least
   // Chrome and Firefox). The quality of the encoding is considerably higher
   // than MP3 while using approximately the same bitrate.
   OGG_OPUS = 3;
+
+  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
+  // Audio content returned as MULAW also contains a WAV header.
+  MULAW = 5;
 }
 
 // The message returned to the client by the `ListVoices` method.
@@ -132,6 +138,15 @@ message Voice {
 
 // The top-level message sent by the client for the `SynthesizeSpeech` method.
 message SynthesizeSpeechRequest {
+  // The type of timepoint information that is returned in the response.
+  enum TimepointType {
+    // Not specified. No timepoint information will be returned.
+    TIMEPOINT_TYPE_UNSPECIFIED = 0;
+
+    // Timepoint information of `<mark>` tags in SSML input will be returned.
+    SSML_MARK = 1;
+  }
+
   // Required. The Synthesizer requires either plain text or SSML as input.
   SynthesisInput input = 1 [(google.api.field_behavior) = REQUIRED];
 
@@ -140,6 +155,9 @@ message SynthesizeSpeechRequest {
 
   // Required. The configuration of the synthesized audio.
   AudioConfig audio_config = 3 [(google.api.field_behavior) = REQUIRED];
+
+  // Whether and what timepoints should be returned in the response.
+  repeated TimepointType enable_time_pointing = 4;
 }
 
 // Contains text input to be synthesized. Either `text` or `ssml` must be
@@ -251,4 +269,21 @@ message SynthesizeSpeechResponse {
   // with all bytes fields, protobuffers use a pure binary representation,
   // whereas JSON representations use base64.
   bytes audio_content = 1;
+
+  // A link between a position in the original request input and a corresponding
+  // time in the output audio. It's only supported via `<mark>` of SSML input.
+  repeated Timepoint timepoints = 2;
+
+  // The audio metadata of `audio_content`.
+  AudioConfig audio_config = 4;
+}
+
+// This contains a mapping between a certain point in the input text and a
+// corresponding time in the output audio.
+message Timepoint {
+  // Timepoint name as received from the client within `<mark>` tag.
+  string mark_name = 4;
+
+  // Time offset in seconds from the start of the synthesized audio.
+  double time_seconds = 3;
 }
diff --git a/protos/protos.d.ts b/protos/protos.d.ts