MusIre committed
Commit 76264a6
1 Parent(s): 7f2e93f

Update app.py

Files changed (1)
  1. app.py +31 -18
app.py CHANGED
@@ -1,28 +1,41 @@
  import subprocess
- subprocess.run(["pip", "install", "-U", "pip"])
- subprocess.run(["pip", "install", "-U", "gradio"])
- subprocess.run(["pip", "install", "whisper"])
+ subprocess.run(["pip", "install", "gradio", "--upgrade"])
+ subprocess.run(["pip", "install", "transformers"])
+ subprocess.run(["pip", "install", "torchaudio", "--upgrade"])

+ import numpy as np
+ import torch
+ import torchaudio
  import gradio as gr
- import whisper
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration

- def transcribe_audio(audio_file):
-     model = whisper.load_model("base")
-     result = model.transcribe(audio_file)
-     return result["text"]
-
-
- def main():
-     audio_input = gr.Audio(source="upload", type="file")
-     output_text = gr.Textbox()
-
-     iface = gr.Interface(fn=transcribe_audio, inputs=audio_input,
-                          outputs=output_text, title="Audio Transcription App",
-                          description="Upload an audio file and hit the 'Submit' button")
-
-     iface.launch()
-
- if __name__ == '__main__':
-     main()
+ # Load the Whisper ASR model and processor; the feature extractor already
+ # defaults to the 16 kHz rate Whisper was trained on, so no sampling_rate
+ # override is needed here
+ model_name = "openai/whisper-small"
+ processor = WhisperProcessor.from_pretrained(model_name)
+ model = WhisperForConditionalGeneration.from_pretrained(model_name)
+ forced_decoder_ids = processor.get_decoder_prompt_ids(language="italian", task="transcribe")
+
+
+ def transcribe_audio(input_audio):
+     if input_audio is None:
+         # Nothing was recorded
+         return ""
+
+     # Gradio delivers microphone audio as a (sample_rate, numpy array) tuple
+     sample_rate, data = input_audio
+
+     # Scale integer PCM to [-1, 1] floats and downmix stereo to mono
+     if np.issubdtype(data.dtype, np.integer):
+         data = data.astype(np.float32) / np.iinfo(data.dtype).max
+     else:
+         data = data.astype(np.float32)
+     if data.ndim > 1:
+         data = data.mean(axis=1)
+
+     # Resample to 16 kHz before feature extraction
+     waveform = torchaudio.functional.resample(
+         torch.from_numpy(data), orig_freq=sample_rate, new_freq=16_000
+     )
+     input_features = processor(waveform.numpy(), sampling_rate=16_000,
+                                return_tensors="pt").input_features
+
+     # Generate token ids, forcing Italian transcription
+     predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
+
+     # Decode token ids to text
+     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+     return transcription[0]
+
+
+ audio_input = gr.Audio(sources=["microphone"])
+ gr.Interface(fn=transcribe_audio, inputs=audio_input, outputs="text").launch()
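A quick way to sanity-check the updated pipeline without opening the browser UI is to call the new transcribe_audio directly with a synthetic (sample_rate, numpy array) tuple, the same shape Gradio's microphone component passes in. This is a minimal sketch, assuming it is placed in the same module as app.py above, just before the launch() call; the 440 Hz tone is a placeholder input, so the transcription text is meaningless. It only confirms that preprocessing, the processor, and model.generate run end to end.

import numpy as np

# Illustrative smoke test (not part of the commit): a one-second 440 Hz sine
# at 44.1 kHz, packaged as the (sample_rate, array) tuple gr.Audio produces.
sample_rate = 44_100
t = np.linspace(0.0, 1.0, sample_rate, endpoint=False)
tone = (0.1 * np.sin(2.0 * np.pi * 440.0 * t)).astype(np.float32)

print(transcribe_audio((sample_rate, tone)))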