app.py
import os

import gradio as gr
import openai
from dotenv import load_dotenv
from google.cloud import texttospeech
from google.oauth2 import service_account
from pydub import AudioSegment

load_dotenv()
# Credentials: the OpenAI key comes from the environment (.env); Google Cloud
# TTS authenticates with a local service-account JSON file.
openai.api_key = os.getenv("OPENAI_API_KEY")
credentials = service_account.Credentials.from_service_account_file('my-credentials.json')
tts_client = texttospeech.TextToSpeechClient(credentials=credentials)
conversation = []

# Map each UI language choice to a (language_code, voice_name) pair for Google
# Cloud TTS. These must be tuples, not sets: sets are unordered, so unpacking
# a set would assign the two values unpredictably.
language_dict = {
    "English": ("en-US", "en-US-Standard-H"),
    "Japanese": ("ja-JP", "ja-JP-Neural2-B"),
    "Korean": ("ko-KR", "ko-KR-Neural2-A"),
}
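
# A minimal helper sketch (not called by the app): voice names available for a
# locale can be listed with the documented TextToSpeechClient.list_voices()
# API, e.g. to extend language_dict with more languages.
def list_voice_names(language_code):
    """Return the Google Cloud TTS voice names for a given language code."""
    response = tts_client.list_voices(language_code=language_code)
    return [voice.name for voice in response.voices]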
def clear_conversation():
    global conversation  # rebind the module-level history, not a local name
    conversation = []
    return "Conversation has been renewed!"
def transcribe(audio, language, role):
    global conversation
    # Whisper API: convert the recorded WAV to MP3, then transcribe it.
    AudioSegment.from_wav(audio).export("audio.mp3", format="mp3")
    with open("audio.mp3", "rb") as audio_file_mp3:
        transcript = openai.Audio.transcribe("whisper-1", audio_file_mp3)
    print(transcript)
    # Prepend the role-play instruction once; later turns already have it.
    # language and role arrive as lists from the CheckboxGroup inputs.
    if not conversation:
        conversation.insert(0, {"role": "system", "content": f"Role-play in {language[0]} as a {role[0]}."})
    conversation += [{"role": "user", "content": transcript["text"]}]
    # ChatGPT API
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation)
    chatgpt_response = response['choices'][0]['message']['content']
    print(chatgpt_response)
    conversation += [{"role": "assistant", "content": chatgpt_response}]
    # Set the language
    language_code, voice_name = language_dict[language[0]]
    # Google Cloud text-to-speech
    input_text = texttospeech.SynthesisInput(text=chatgpt_response)
    # Note: the voice can also be specified by name; available names can be
    # retrieved with tts_client.list_voices() (see list_voice_names above).
    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name)
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
    response = tts_client.synthesize_speech(
        request={"input": input_text, "voice": voice, "audio_config": audio_config}
    )
    # The response's audio_content is binary MP3 data.
    with open("output.mp3", "wb") as out:
        out.write(response.audio_content)
    print('Audio content written to file "output.mp3"')
    return "output.mp3"
bot = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.CheckboxGroup(["English", "Japanese", "Korean"], label="Language", info="Which language do you practice?"),
        gr.CheckboxGroup(["Teacher", "Clerk", "Friend"], label="Role", info="ChatGPT would be..."),
    ],
    outputs="audio")
bot.launch()
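
# Note: clear_conversation is never exposed in the UI. One way to wire it up
# (a sketch, assuming the same Gradio 3.x API) is a second Interface combined
# via gr.TabbedInterface, launched instead of bot.launch():
#
#   reset = gr.Interface(fn=clear_conversation, inputs=None, outputs="text")
#   gr.TabbedInterface([bot, reset], ["Practice", "Reset"]).launch()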