import os

import numpy as np
import soundfile
import gradio as gr
import azure.cognitiveservices.speech as speechsdk
import openai

# Credentials are read from environment variables so no secrets live in the source.
openai.api_key = os.environ.get("OPENAI_API_KEY")
speech_key = os.environ.get("SPEECH_KEY")

def ms_tts(text, filename):
    """Synthesize `text` into a WAV file with the Azure Speech SDK."""
    # NOTE: this uses the 'eastasia' region while ms_asr below uses 'eastus';
    # each must match the region of the Speech resource the key belongs to.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region="eastasia")
    audio_config = speechsdk.audio.AudioOutputConfig(filename=filename)

    # Child-friendly Chinese neural voice, matching the bot's "Xiaomeng" persona.
    speech_config.speech_synthesis_voice_name = "zh-CN-XiaomengNeural"

    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    # speak_text_async returns a future; .get() blocks until synthesis completes.
    speech_synthesizer.speak_text_async(text).get()

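# Minimal smoke test for ms_tts (illustrative; the text and output path are
# assumptions, not part of the original script):
#
#   ms_tts("你好,我是小萌!", "./tts_test.wav")
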
def ms_asr(filename):
    """Transcribe a WAV file to Chinese text with the Azure Speech SDK."""
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region="eastus")
    speech_config.speech_recognition_language = "zh-CN"

    audio_config = speechsdk.audio.AudioConfig(filename=filename)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    # recognize_once_async handles a single utterance and returns a future.
    speech_recognition_result = speech_recognizer.recognize_once_async().get()

    if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(speech_recognition_result.text))
    elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
    elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_recognition_result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Did you set the speech resource key and region values?")

    return speech_recognition_result.text

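# Illustrative ms_asr check (assumes a 16 kHz mono ./input.wav already exists):
#
#   text = ms_asr("./input.wav")
#   print(text)
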
class Conversation:
    """Rolling ChatGPT conversation bounded to `num_of_round` exchanges."""

    def __init__(self, prompt, num_of_round):
        self.prompt = prompt
        self.num_of_round = num_of_round
        self.messages = []
        self.messages.append({"role": "system", "content": self.prompt})

    def ask(self, question):
        try:
            self.messages.append({"role": "user", "content": question})
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=self.messages,
                temperature=0,
                max_tokens=2048,
                top_p=1,
            )
        except Exception as e:
            print(e)
            return e

        message = response["choices"][0]["message"]["content"]
        self.messages.append({"role": "assistant", "content": message})

        # Keep the system prompt plus at most num_of_round question/answer
        # pairs; once the window is exceeded, drop the oldest pair.
        if len(self.messages) > self.num_of_round * 2 + 1:
            del self.messages[1:3]
        return message

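# Illustration of the rolling window above (names and questions are hypothetical):
#
#   demo_conv = Conversation("You are a helpful assistant.", 2)
#   demo_conv.ask("你好")      # messages: system + 1 pair
#   demo_conv.ask("再见")      # messages: system + 2 pairs
#   demo_conv.ask("讲个故事")  # oldest pair is evicted after this answer is stored
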
prompt = """You are an educational conversation bot. Your name is Xiaomeng. You can speak English and Chinese. You were developed by the AIOT CTO organization.
你是一个教育对话机器人。你叫小萌。你可以说英文和中文。你是由智能物联首席技术官组织开发的。
你的任务包括:
- 英语口语对话
- 知识问答
- 讲故事

知识问答任务要求:
- 回答必须保证是正确的
- 回答需要采用逐步推理的方式
- 如果你不确定答案,你可以说“我不知道”或“我不确定”

合规要求:
- 用户为年龄大于3岁且小于15岁的儿童
- 知识问答任务中回答的内容和形式必须适合年龄大于3岁且小于15岁的儿童
- 英语口语对话任务中回答的内容和形式必须适合年龄大于3岁且小于15岁的儿童
- 讲故事任务中回答的内容和形式必须适合年龄大于3岁且小于15岁的儿童
- 全部对话的内容和形式必须适合年龄大于3岁且小于15岁的儿童
- 需要对全部对话进行分析,当检测到用户不是年龄大于3岁且小于15岁的儿童时,回答“我猜和我对话的一定是一位机灵可爱的小朋友^_^ I guess I'm talking to a clever and cute kid ^_^”

语言要求:
- 只支持用中文或英文回答
- 当用户用中文提问时,用中文回答
- 当用户用英文提问时,用英文回答
- 当用户用中文和英文混合语言提问时,可以用中文回答,也可以用英文回答,还可以用中文和英文混合语言回答
- 当用户用其它语言提问时,回答“抱歉,我只说中文或英文!Sorry, I only speak Chinese or English!”
- 当用户要求用中文回答时,用中文回答
- 当用户要求用英文回答时,用英文回答
- 当用户要求回答的语言既不是中文也不是英文时,回答“抱歉,我只说中文或英文!Sorry, I only speak Chinese or English!”

回答长度要求:
- 知识问答任务中回答长度小于50字
- 英语口语对话任务中回答长度小于20字
- 讲故事任务中回答长度小于100字
"""
conv = Conversation(prompt, 20)


def predict(input, history=[]):
    """Ask the bot and return (answer, (user, bot) pairs, flat history)."""
    history.append(input)
    response = conv.ask(input)
    history.append(response)
    # Pair the flat history into (user, bot) tuples for the Chatbot widget.
    responses = [(u, b) for u, b in zip(history[::2], history[1::2])]
    return response, responses, history

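# For example, after two turns history == [q1, a1, q2, a2], so the Chatbot
# widget receives [(q1, a1), (q2, a2)] (the names q1/a1 are illustrative).
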
def main(audio, history=[]):
    # Gradio delivers microphone input as a (sample_rate, numpy array) tuple.
    s, y = audio

    print(s)
    assert s in [48000, 16000]
    if s == 48000:
        # Crude 48 kHz -> 16 kHz conversion: peak-normalize into int16 range,
        # then keep every third sample.
        y = (y / max(np.max(y), 1) * 32767)[::3].astype("int16")
    soundfile.write("./input.wav", y, 16000)

    # Transcribe the recorded question with Azure ASR.
    wav_res = ms_asr("./input.wav")
    print("You said : ", wav_res)

    answer, his_list, history = predict(wav_res, history)
    print("answer: ", answer)

    # Synthesize the answer so the Audio widget can play it back.
    path1 = "./output.wav"
    ms_tts(answer, path1)
    return his_list, history, path1

with gr.Blocks() as demo:
    state = gr.State([])
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="ChatBox")
            out_voice = gr.Audio(label="audio")
        with gr.Column(scale=4):
            # Microphone input; gr.Audio(source="microphone") is the portable
            # spelling of the gr.Mic shorthand across Gradio 3.x releases.
            mic = gr.Audio(source="microphone", label="input")
            button = gr.Button("Generate")
    button.click(main, [mic, state], [chatbot, state, out_voice])

demo.queue().launch()