mj-new committed on
Commit
0587641
1 Parent(s): 81fdfb6

Baseline local whisper model and language selection

Browse files
Files changed (2) hide show
  1. app.css +4 -0
  2. app.py +111 -3
app.css CHANGED
@@ -36,3 +36,7 @@ text-align: left;
36
  thead tr {
37
  text-align: left;
38
  }
 
 
 
 
 
36
  thead tr {
37
  text-align: left;
38
  }
39
+
40
+ #pw {
41
+ -webkit-text-security: disc;
42
+ }
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
- #import whisper
3
  import numpy as np
 
4
 
5
  def greet(name):
6
  return "Hello " + name + "!!"
@@ -11,13 +12,120 @@ with open('app.css','r') as f:
11
  markdown="""
12
  # Polish ASR BIGOS workspace
13
  """
 
 
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  block = gr.Blocks(css=css_file)
16
  with block:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  gr.Markdown(markdown)
18
  with gr.Tabs():
19
  with gr.TabItem('Voicebot playground'):
20
- record = gr.Audio(source="microphone", label='Record your voice')
21
- save = gr.Button("Submit")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  block.launch()
 
1
  import gradio as gr
2
+ import whisper
3
  import numpy as np
4
+ import openai
5
 
6
  def greet(name):
7
  return "Hello " + name + "!!"
 
12
  markdown="""
13
  # Polish ASR BIGOS workspace
14
  """
15
+ def whisper_model_change(radio_whisper_model):
16
+ whisper_model = whisper.load_model(radio_whisper_model)
17
+ return(whisper_model)
18
 
19
+ def prompt_gpt(input_text):
20
+ messages = [
21
+ {"role": "system", "content": "You are a helpful assistant."}]
22
+
23
+ if input_text:
24
+ messages.append(
25
+ {"role": "user", "content": input_text},
26
+ )
27
+ chat_completion = openai.ChatCompletion.create(
28
+ model="gpt-3.5-turbo", messages=messages
29
+ )
30
+
31
+ reply = chat_completion.choices[0].message.content
32
+ return reply
33
+
34
+ def process_pipeline(audio):
35
+ asr_out = transcribe(audio)
36
+ gpt_out = prompt_gpt(asr_out)
37
+ tts_out = synthesize_speech(gpt_out)
38
+ return(tts_out)
39
+
40
+ def transcribe(audio, language, whisper_model, whisper_model_type):
41
+ if not whisper_model:
42
+ whisper_model=init_whisper_model(whisper_model_type)
43
+
44
+ print(f"Transcribing {audio} for language {language} and model {whisper_model_type}")
45
+ audio = whisper.load_audio(audio)
46
+ audio = whisper.pad_or_trim(audio)
47
+
48
+ mel = whisper.log_mel_spectrogram(audio)
49
+
50
+ options = whisper.DecodingOptions(language=language, without_timestamps=True, fp16=False)
51
+ result = whisper.decode(whisper_model, mel, options)
52
+ result_text = result.text
53
+ return result_text
54
+
55
+ def init_whisper_model(whisper_model_type):
56
+ print("Initializing whisper model")
57
+ print(whisper_model_type)
58
+ whisper_model = whisper.load_model(whisper_model_type)
59
+ return whisper_model
60
+
61
+ def synthesize_speech(text):
62
+ audioobj = gTTS(text = out_result,
63
+ lang = lang,
64
+ slow = False)
65
+
66
+ audioobj.save("Temp.mp3")
67
+ return("Temp.mp3")
68
+
69
  block = gr.Blocks(css=css_file)
70
  with block:
71
+
72
+ #state variables
73
+ language = gr.State("en")
74
+ whisper_model_type = gr.State("base")
75
+ whisper_model = gr.State()
76
+
77
+ # state handling functions
78
+ def change_language(choice):
79
+ if choice == "Polish":
80
+ language="pl"
81
+ print("Switching to Polish")
82
+ print("language")
83
+ print(language)
84
+ elif choice == "English":
85
+ language="en"
86
+ print("Switching to English")
87
+ print("language")
88
+ print(language)
89
+ return(language)
90
+
91
+ def change_whisper_model(choice):
92
+ whisper_model_type = choice
93
+ print("Switching Whisper model")
94
+ print(whisper_model_type)
95
+ whisper_model = init_whisper_model(whisper_model_type)
96
+ return [whisper_model_type, whisper_model]
97
+
98
  gr.Markdown(markdown)
99
  with gr.Tabs():
100
  with gr.TabItem('Voicebot playground'):
101
+ with gr.Box():
102
+ gr.HTML("<p class=\"apikey\">API Key:</p>")
103
+ # API key textbox (password-style)
104
+ api_key = gr.Textbox(label="", elem_id="pw")
105
+
106
+ radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none selected, English is used")
107
+ #radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but costs money")
108
+ #radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for specific service")
109
+ radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are better, but slower. Default - base")
110
+
111
+ mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
112
+
113
+ out_asr = gr.Textbox(placeholder="ASR output",
114
+ lines=5,
115
+ max_lines=10,
116
+ show_label=False)
117
+ out_gpt = gr.Textbox(placeholder="ChatGPT output",
118
+ lines=10,
119
+ max_lines=25,
120
+ show_label=False)
121
+
122
+ button_transcribe = gr.Button("Transcribe")
123
+ button_prompt_gpt = gr.Button("Prompt ChatGPT")
124
+
125
+ button_transcribe.click(transcribe, inputs=[mic_recording,language, whisper_model,whisper_model_type], outputs=out_asr)
126
+ button_prompt_gpt.click(prompt_gpt, inputs=out_asr, outputs=out_gpt)
127
+
128
+ radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)
129
+ radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])
130
 
131
  block.launch()