ToletiSri committed on
Commit 6d2bd3e
1 Parent(s): e71d38b

Update app.py

Files changed (1)
app.py +24 -11
app.py CHANGED
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, WhisperProcessor, WhisperForConditionalGeneration
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


 class _MLPVectorProjector(nn.Module):
@@ -28,9 +28,16 @@ phi2_text = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=T
 tokenizer_text = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

 ## Audio model
-processor_audio = WhisperProcessor.from_pretrained("openai/whisper-small")
-model_audio = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
-model_audio.config.forced_decoder_ids = None
+model_name_audio = "openai/whisper-small"
+#processor_audio = WhisperProcessor.from_pretrained("openai/whisper-small")
+#model_audio = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
+#model_audio.config.forced_decoder_ids = None
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=model_name_audio,
+    chunk_length_s=30,
+    device="cpu",
+)

 ## image model

@@ -60,15 +67,21 @@ def imageMode(image, question):
     return "In progress"

 def audioMode(audio):
-    #print('---------type of audio--------------')
+    if audio is None:
+        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+
+    print('---------type of audio--------------')
+    print(type(audio))
+    print(audio)
+    text = pipe(audio, batch_size=8, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
     #sampling_rate = audio[0]
-    audio_array = audio[1]
+    #audio_array = audio[1]
     #print(sampling_rate)
     #print(audio_array)
-    input_features = processor_audio(audio_array, sampling_rate=16000, return_tensors="pt").input_features
-    predicted_ids = model_audio.generate(input_features)
-    transcription = processor_audio.batch_decode(predicted_ids, skip_special_tokens=True)
-    return transcription[0]
+    #input_features = processor_audio(audio_array, sampling_rate=16000, return_tensors="pt").input_features
+    #predicted_ids = model_audio.generate(input_features)
+    #transcription = processor_audio.batch_decode(predicted_ids, skip_special_tokens=True)
+    return text


 interface_title = "TSAI-ERA-V1 - Capstone - Multimodal GPT Demo"
@@ -89,7 +102,7 @@ with gr.Blocks() as demo:
         image_text_output = gr.Textbox(label="Answer")

     with gr.Tab("Audio mode"):
-        audio_input = gr.Audio()
+        audio_input = gr.Audio(type="filepath", optional=True)
         audio_button = gr.Button("Submit")
         audio_text_output = gr.Textbox(label="Chat GPT like text")

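For context on the change: the commit replaces the manual WhisperProcessor / WhisperForConditionalGeneration path with the transformers automatic-speech-recognition pipeline, and switches the Gradio audio input to type="filepath", so audioMode() now receives a path string it can hand straight to the pipeline. Below is a minimal standalone sketch of that flow under those assumptions; the file name sample.wav is only an illustrative placeholder, not part of the commit.

# Minimal sketch of the new transcription path introduced by this commit.
# Assumes transformers is installed; "sample.wav" is a hypothetical local audio file.
from transformers import pipeline

pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    chunk_length_s=30,   # split long recordings into 30-second chunks
    device="cpu",
)

# gr.Audio(type="filepath") passes the handler a path string like this one.
result = pipe(
    "sample.wav",
    batch_size=8,
    generate_kwargs={"task": "transcribe"},
    return_timestamps=True,
)
print(result["text"])   # plain transcription, as returned by audioMode()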