import os

import gradio as gr
import openai
import soundfile
from dotenv import load_dotenv
from espnet2.bin.tts_inference import Text2Speech

# Load the OpenAI API key from a .env file.
load_dotenv()
openai.api_key = os.getenv('api_key')


def tts(text):
    """Synthesize speech for `text` and return the path to the saved WAV file."""
    # Note: this reloads the pretrained VITS model on every call; cache it
    # at module level if latency matters.
    text2speech = Text2Speech.from_pretrained("kan-bayashi/ljspeech_vits")
    speech = text2speech(text)["wav"].numpy()
    # Save the waveform next to this script so the returned path is valid
    # regardless of the current working directory.
    out_path = os.path.join(os.path.dirname(__file__), "example_TTS.wav")
    soundfile.write(out_path, speech, 22050, 'PCM_24')
    return out_path


def transcribe(audio):
    """Transcribe the recorded audio file with Whisper and return the text."""
    with open(audio, "rb") as file:
        transcription = openai.Audio.transcribe("whisper-1", file=file)
    return transcription['text']


def conversation(audio):
    """Transcribe the user's speech, get a chat reply, and speak it back."""
    req = transcribe(audio)
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": req}],
    )
    reply = completion['choices'][0]['message']['content'].strip()
    fin_text = 'You: ' + req + '\n' + 'AI: ' + reply
    return fin_text, tts(reply)


def generate_image(audio):
    """Transcribe the user's speech and generate an image from the text."""
    text = transcribe(audio)
    response = openai.Image.create(prompt=text, n=1, size="1024x1024")
    return response['data'][0]['url']


with gr.Blocks() as demo:
    with gr.Tab("Start a conversation"):
        with gr.Row():
            audio_input_conv = gr.Audio(source="microphone", type="filepath")
            text_output_conv = gr.Textbox(lines=10)
            audio_output_conv = gr.Audio()
        with gr.Row():
            clear_button_conv = gr.Button("Clear")
            submit_button_conv = gr.Button("Submit")

    with gr.Tab("Generate image"):
        with gr.Row():
            audio_input_img = gr.Audio(source="microphone", type="filepath")
            image_output_img = gr.Image()
        with gr.Row():
            clear_button_img = gr.Button("Clear")
            submit_button_img = gr.Button("Submit")

    with gr.Accordion("How to use"):
        gr.Markdown("Record and submit your voice to talk to the AI or to generate an image!")

    submit_button_conv.click(conversation, inputs=audio_input_conv,
                             outputs=[text_output_conv, audio_output_conv])
    clear_button_conv.click(lambda: None, None, audio_input_conv, queue=False)

    submit_button_img.click(generate_image, inputs=audio_input_img,
                            outputs=image_output_img)
    # Clear both the recording and the generated image with one click.
    clear_button_img.click(lambda: (None, None), None,
                           [audio_input_img, image_output_img], queue=False)

demo.launch()
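
# Setup sketch (the exact package set is an assumption about your environment,
# not something the script itself pins): the script targets the pre-1.0
# `openai` SDK (`openai.Audio` / `openai.ChatCompletion` / `openai.Image`) and
# Gradio 3.x (`gr.Audio(source=...)`), and `espnet_model_zoo` is what lets
# Text2Speech.from_pretrained download "kan-bayashi/ljspeech_vits" by tag.
# Something along these lines should get the demo running:
#
#   pip install "gradio<4" "openai<1" python-dotenv soundfile espnet espnet_model_zoo
#
# The .env file read by load_dotenv() must define the key name used above:
#
#   api_key=<your OpenAI API key>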