"""Polish ASR BIGOS workspace.

A Gradio voicebot playground: record speech, transcribe it with a local
Whisper model, forward the transcript to ChatGPT, and (optionally)
synthesize the reply to speech.
"""

import gradio as gr
import whisper
import numpy as np
import openai


def greet(name):
    """Return a toy greeting for *name* (demo helper, not wired to the UI)."""
    return "Hello " + name + "!!"


# Stylesheet that themes the Blocks UI (must exist next to this script).
with open('app.css', 'r') as f:
    css_file = f.read()

markdown = """
# Polish ASR BIGOS workspace
"""


def whisper_model_change(radio_whisper_model):
    """Load and return the Whisper model named by the radio selection."""
    whisper_model = whisper.load_model(radio_whisper_model)
    return whisper_model


def prompt_gpt(input_text):
    """Send *input_text* to ChatGPT (gpt-3.5-turbo) and return the reply text.

    An empty/None transcript is not appended, so the request then carries
    only the system prompt.
    """
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    if input_text:
        messages.append(
            {"role": "user", "content": input_text},
        )
    chat_completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    reply = chat_completion.choices[0].message.content
    return reply


def process_pipeline(audio):
    """Full demo pipeline: ASR -> ChatGPT -> TTS. Returns the mp3 file path.

    BUG FIX: the original called ``transcribe(audio)`` with one argument,
    but ``transcribe`` requires four; pass the same defaults the UI state
    starts with ("en" / "base", model loaded lazily).
    """
    asr_out = transcribe(audio, "en", None, "base")
    gpt_out = prompt_gpt(asr_out)
    tts_out = synthesize_speech(gpt_out)
    return tts_out


def transcribe(audio, language, whisper_model, whisper_model_type):
    """Transcribe the audio file at path *audio* with a local Whisper model.

    Lazily initializes the model from *whisper_model_type* when
    *whisper_model* is falsy (e.g. the gr.State has not been set yet).
    Returns the decoded transcript text.
    """
    if not whisper_model:
        whisper_model = init_whisper_model(whisper_model_type)
    print(f"Transcribing {audio} for language {language} and model {whisper_model_type}")
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio)
    # fp16=False keeps decoding CPU-friendly (no half-precision warnings).
    options = whisper.DecodingOptions(language=language, without_timestamps=True, fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    result_text = result.text
    return result_text


def init_whisper_model(whisper_model_type):
    """Download/load and return the Whisper model of the given size."""
    print("Initializing whisper model")
    print(whisper_model_type)
    whisper_model = whisper.load_model(whisper_model_type)
    return whisper_model


def synthesize_speech(text, language="en"):
    """Synthesize *text* to speech and return the path of the saved mp3.

    BUG FIX: the original body referenced undefined names ``out_result``
    and ``lang``; use the *text* argument and a backward-compatible
    *language* parameter instead.
    NOTE(review): ``gTTS`` is called but never imported anywhere in this
    file — confirm the ``gtts`` dependency is installed and add
    ``from gtts import gTTS`` before wiring this into the UI.
    """
    audioobj = gTTS(text=text, lang=language, slow=False)
    audioobj.save("Temp.mp3")
    return "Temp.mp3"


block = gr.Blocks(css=css_file)

with block:
    # Session state: decoding language, selected model size, loaded model.
    language = gr.State("en")
    whisper_model_type = gr.State("base")
    whisper_model = gr.State()

    # State handling callbacks -------------------------------------------
    def change_language(choice):
        """Map the radio label to a Whisper language code ("pl"/"en")."""
        if choice == "Polish":
            language = "pl"
            print("Switching to Polish")
            print("language")
            print(language)
        elif choice == "English":
            language = "en"
            print("Switching to English")
            print("language")
            print(language)
        return language

    def change_whisper_model(choice):
        """Load the newly selected Whisper model and update both state vars."""
        whisper_model_type = choice
        print("Switching Whisper model")
        print(whisper_model_type)
        whisper_model = init_whisper_model(whisper_model_type)
        return [whisper_model_type, whisper_model]

    gr.Markdown(markdown)
    with gr.Tabs():
        with gr.TabItem('Voicebot playground'):
            with gr.Box():
                # NOTE(review): the original HTML literal was garbled in the
                # source; reconstructed as a minimal label for the key field.
                gr.HTML("API Key:")
                # API key textbox (password-style via the #pw CSS rule).
                api_key = gr.Textbox(label="", elem_id="pw")
                radio_lang = gr.Radio(
                    ["Polish", "English"],
                    label="Language",
                    info="If none selected, English is used",
                )
                radio_whisper_model = gr.Radio(
                    ["tiny", "base", "small", "medium", "large"],
                    label="Whisper ASR model (local)",
                    info="Larger models are better, but slower. Default - base",
                )
                mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
                out_asr = gr.Textbox(placeholder="ASR output", lines=5, max_lines=10, show_label=False)
                out_gpt = gr.Textbox(placeholder="ChatGPT output", lines=10, max_lines=25, show_label=False)
                button_transcribe = gr.Button("Transcribe")
                button_prompt_gpt = gr.Button("Prompt ChatGPT")

                # Wire UI events to the callbacks above.
                button_transcribe.click(
                    transcribe,
                    inputs=[mic_recording, language, whisper_model, whisper_model_type],
                    outputs=out_asr,
                )
                button_prompt_gpt.click(prompt_gpt, inputs=out_asr, outputs=out_gpt)
                radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)
                radio_whisper_model.change(
                    fn=change_whisper_model,
                    inputs=radio_whisper_model,
                    outputs=[whisper_model_type, whisper_model],
                )

block.launch()