MusIre committed on
Commit
b5128f7
1 Parent(s): 2df363a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -0
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import subprocess
import sys

# Install/upgrade the runtime dependencies before they are imported further
# down. ``sys.executable -m pip`` is used instead of a bare ``pip`` so the
# packages land in the environment of the interpreter running this script,
# not whichever ``pip`` happens to be first on PATH.
# Return codes are deliberately not checked (same as before): a failed
# upgrade, e.g. with no network, falls through to whatever is installed.
for pip_args in (
    ["gradio", "--upgrade"],
    ["transformers"],
    ["torchaudio", "--upgrade"],
):
    subprocess.run([sys.executable, "-m", "pip", "install", *pip_args])
5
+
6
+ import gradio as gr
7
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
8
+ import torchaudio
9
+ import torch
10
+
11
# The processor and the CTC model are loaded from the same pretrained
# checkpoint identifier, so it is spelled out once.
_CHECKPOINT = "jonatasgrosman/wav2vec2-large-xlsr-53-italian"
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)
14
+
15
+ # Function to perform ASR on audio data
16
def transcribe_audio(audio_data):
    """Transcribe one audio clip with the module-level Wav2Vec2 CTC model.

    Args:
        audio_data: A ``(sample_rate, waveform)`` tuple. ``waveform`` may be
            a NumPy array or a ``torch.Tensor`` (anything ``torch.as_tensor``
            accepts), either 1-D (mono) or 2-D (multi-channel). Gradio's
            ``Audio`` component is expected to deliver a NumPy array -- see
            the Gradio documentation.

    Returns:
        The decoded transcription string on success,
        ``"Invalid audio data format."`` when the input cannot be read as a
        non-empty 1-D/2-D waveform with a positive sample rate, or
        ``"An error occurred: <message>"`` when preprocessing or inference
        raises.
    """
    invalid = "Invalid audio data format."
    print("Received audio data:", audio_data)  # Debug print

    # Expect exactly a (sample_rate, waveform) pair.
    if not isinstance(audio_data, tuple) or len(audio_data) != 2:
        return invalid

    sample_rate, waveform = audio_data
    if waveform is None:
        return invalid

    try:
        sample_rate = int(sample_rate)
    except (TypeError, ValueError):
        return invalid
    if sample_rate <= 0:
        return invalid

    # Accept NumPy arrays as well as tensors; the previous tensor-only check
    # rejected NumPy input outright.
    try:
        samples = torch.as_tensor(waveform)
    except (TypeError, ValueError, RuntimeError):
        return invalid
    if samples.dim() not in (1, 2) or samples.numel() == 0:
        return invalid

    try:
        # Integer PCM is scaled by the dtype's maximum so it lands in
        # roughly [-1, 1]; floating-point input is only cast.
        if samples.is_floating_point():
            samples = samples.to(torch.float32)
        else:
            full_scale = float(torch.iinfo(samples.dtype).max)
            samples = samples.to(torch.float32) / full_scale

        # Down-mix to mono. The layout may be (samples, channels) or
        # (channels, samples), so the shorter axis is taken to be the
        # channel axis -- a heuristic; TODO confirm against the caller.
        if samples.dim() == 2:
            channel_axis = 0 if samples.shape[0] <= samples.shape[1] else 1
            samples = samples.mean(dim=channel_axis)

        # Resample to the rate the feature extractor was configured for
        # (the old code resampled to a hard-coded 100000 Hz).
        target_rate = processor.feature_extractor.sampling_rate
        if sample_rate != target_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, target_rate)
            samples = resampler(samples)

        # Fixed +5 dB boost, kept from the original implementation.
        samples = torchaudio.functional.gain(samples, gain_db=5.0)

        # Passing sampling_rate lets the processor verify the rate matches.
        input_values = processor(
            samples.numpy(), sampling_rate=target_rate, return_tensors="pt"
        ).input_values

        # Perform ASR without tracking gradients.
        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy CTC decoding: most likely token per frame, then collapse.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)

        return transcription[0]

    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"An error occurred: {str(e)}"
49
+
50
# Build the web UI: an audio input restricted to the microphone source,
# wired to transcribe_audio, with a plain text output; then start serving.
audio_input = gr.Audio(sources=["microphone"])
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs="text",
)
demo.launch()