yuangongfdu committed
Commit a24b835
Parent: 3af9b1f

Update app.py

Files changed (1): app.py (+10 -8)
app.py CHANGED
@@ -6,7 +6,7 @@ text = "[Github]"
 paper_link = "https://arxiv.org/pdf/2307.03183.pdf"
 paper_text = "[Paper]"
 
-model = whisper.load_model("tiny")
+model = whisper.load_model('large-v1')
 print('model loaded')
 
 def round_time_resolution(time_resolution):
@@ -16,9 +16,11 @@ def round_time_resolution(time_resolution):
     return rounded_time_resolution
 
 def predict(audio_path_m, audio_path_t, time_resolution):
-    return asr_output, at_output
-    if (audio_path_m is None) != (audio_path_t is None):
-        return "Please only upload one recording, either upload it or record using microphone.", "Please only upload one recording, either upload it or record using microphone."
+    # print(audio_path_m, audio_path_t)
+    # print(type(audio_path_m), type(audio_path_t))
+    #return audio_path_m, audio_path_t
+    if ((audio_path_m is None) != (audio_path_t is None)) == False:
+        return "Please upload and only upload one recording, either upload the audio file or record using microphone.", "Please upload and only upload one recording, either upload the audio file or record using microphone."
     else:
         audio_path = audio_path_m or audio_path_t
         audio_tagging_time_resolution = round_time_resolution(time_resolution)
@@ -26,19 +28,19 @@ def predict(audio_path_m, audio_path_t, time_resolution):
         audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=5, p_threshold=-1, include_class_list=list(range(527)))
         asr_output = ""
         for segment in result['segments']:
-            asr_output = asr_output + str(segment['start']) + 's-' + str(segment['end']) + 's: ' + segment['text'] + '\n'
+            asr_output = asr_output + format(segment['start'], ".1f") + 's-' + format(segment['end'], ".1f") + 's: ' + segment['text'] + '\n'
         at_output = ""
         for segment in audio_tag_result:
             print(segment)
-            at_output = at_output + str(segment['time']['start']) + 's-' + str(segment['time']['end']) + 's: ' + ','.join([x[0] for x in segment['audio tags']]) + '\n'
+            at_output = at_output + format(segment['time']['start'], ".1f") + 's-' + format(segment['time']['end'], ".1f") + 's: ' + ','.join([x[0] for x in segment['audio tags']]) + '\n'
         print(at_output)
         return asr_output, at_output
 
 iface = gr.Interface(fn=predict,
-                     inputs=[gr.Audio(type="filepath", source='microphone'), gr.Audio(type="filepath"), gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
+                     inputs=[gr.Audio(type="filepath", source='microphone', label='Please either upload an audio file or record using the microphone.', show_label=True), gr.Audio(type="filepath"), gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
                      outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],
                      cache_examples=True,
                      title="Quick Demo of Whisper-AT",
                      description="We are glad to introduce Whisper-AT - A new joint audio tagging and speech recognition model. It outputs background sound labels in addition to text." + f"<a href='{paper_link}'>{paper_text}</a> " + f"<a href='{link}'>{text}</a> <br>" +
-                                 "Whisper-AT is authored by Yuan Gong, Sameer Khurana, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab).")
+                                 "Whisper-AT is authored by Yuan Gong, Sameer Khurana, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab). It is an Interspeech 2023 paper.")
 iface.launch(debug=True)
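
A note on the validation change in the second hunk: as shown in the old version, predict began with a stray "return asr_output, at_output", which would have raised a NameError since neither variable exists at that point, and the old check "(audio_path_m is None) != (audio_path_t is None)" fired exactly when one input was provided, which is the valid case. The new condition compares that same exclusive-or to False, so the error branch now triggers only when both or neither input is given. A minimal sketch of the truth table (exactly_one is an illustrative name, not a function in app.py):

# Sketch of the exclusive-or check used by this commit; exactly_one is an
# illustrative helper, not part of app.py.
def exactly_one(audio_path_m, audio_path_t):
    # True only when exactly one of the two inputs is non-None.
    return (audio_path_m is None) != (audio_path_t is None)

for mic, upload in [(None, None), ('m.wav', None), (None, 'u.wav'), ('m.wav', 'u.wav')]:
    print((mic, upload), 'accept' if exactly_one(mic, upload) else 'reject')
# (None, None) reject
# ('m.wav', None) accept
# (None, 'u.wav') accept
# ('m.wav', 'u.wav') reject

Comparing to False works, but "if not exactly_one(...)" or the equivalent "(audio_path_m is None) == (audio_path_t is None)" would be the more idiomatic spelling.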
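The str(...) to format(..., ".1f") swap in the third hunk tidies the displayed timestamps: segment start and end times are floats, and str() prints their full precision. A quick illustration with a made-up value:

# Floats often carry arithmetic noise; format(x, '.1f') trims it for display.
t = 0.1 + 0.2
print(str(t) + 's')            # 0.30000000000000004s
print(format(t, '.1f') + 's')  # 0.3s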
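The body of round_time_resolution sits between the hunks and is not visible in this diff; given the Textbox label (which, unchanged by this commit, still reads "Must be must be"), it presumably snaps the user-supplied value to a multiple of 0.4 seconds. A hypothetical reconstruction, not the repository's actual code:

# Hypothetical sketch of the elided helper; the real body is not shown in
# this diff. float() is assumed because the Gradio Textbox passes a string.
def round_time_resolution(time_resolution):
    rounded_time_resolution = round(float(time_resolution) / 0.4) * 0.4
    return rounded_time_resolution

print(round_time_resolution('10'))  # 10.0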