patrickvonplaten committed on
Commit
62a78c6
1 Parent(s): 63914a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -23
app.py CHANGED
@@ -1,29 +1,29 @@
1
- import torch
2
- from transformers import SpeechEncoderDecoder, Wav2Vec2Processor
3
  import gradio as gr
4
- import scipy.signal as sps
 
 
 
 
5
 
6
- def read_file(wav):
7
- sample_rate, signal = wav
8
- signal = signal.mean(-1)
9
- number_of_samples = round(len(signal) * float(16000) / sample_rate)
10
- resampled_signal = sps.resample(signal, number_of_samples)
11
- return resampled_signal
 
12
 
13
- def parse_transcription(wav_file):
14
- speech = read_file(wav_file)
15
- input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
16
- logits = model(input_values).logits
17
- predicted_ids = torch.argmax(logits, dim=-1)
18
- transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
19
- return transcription
20
 
 
21
 
22
- processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15", use_auth_token="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")
23
- model = SpeechEncoderDecoder.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15", use_auth_token="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")
24
 
25
-
26
- #input_ = gr.inputs.Audio(source="microphone", type="file")
27
- input_ = gr.inputs.Audio(source="microphone", type="numpy")
28
- gr.Interface(parse_transcription, inputs = input_, outputs="text",
29
- analytics_enabled=False, show_tips=False, enable_queue=True).launch(inline=False);
 
 
 
 
1
  import gradio as gr
2
+ import librosa
3
+ from transformers import Wav2Vec2Processor, SpeechEncoderDecoder
4
+
5
import os

# NOTE(security): a hard-coded org API token was committed here — it is leaked
# and should be revoked.  Prefer supplying it via the HF_TOKEN environment
# variable; the original literal is kept only as a backward-compatible fallback.
_AUTH_TOKEN = os.environ.get("HF_TOKEN", "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")

# Processor handles both feature extraction (audio -> input tensors) and
# decoding (token ids -> text); the model is a speech-encoder/decoder checkpoint.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15", use_auth_token=_AUTH_TOKEN)
model = SpeechEncoderDecoder.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15", use_auth_token=_AUTH_TOKEN)
7
 
8
def process_audio_file(file):
    """Load an audio file and convert it to model-ready input tensors.

    Parameters
    ----------
    file : str
        Path to the audio file recorded by the Gradio microphone widget.

    Returns
    -------
    torch.Tensor
        The ``input_values`` tensor produced by the Wav2Vec2 processor.
    """
    # Resample at load time: sr=16000 guarantees the 16 kHz rate the
    # wav2vec2 checkpoint expects, replacing the manual `if sr != 16000:
    # librosa.resample(...)` step (whose positional-argument form is
    # deprecated in recent librosa).  Also drops the leftover debug print.
    data, sr = librosa.load(file, sr=16000)
    input_values = processor(data, sampling_rate=sr, return_tensors="pt").input_values
    return input_values
15
 
16
def transcribe(file):
    """Transcribe the audio stored at *file* and return the decoded text."""
    # Turn the raw recording into processor features for the model.
    features = process_audio_file(file)
    # Greedy search: a single beam, generation capped at 30 tokens.
    generated_ids = model.generate(features, num_beams=1, max_length=30)
    # batch_decode yields one string per sequence; only one input was sent.
    texts = processor.batch_decode(generated_ids)
    return texts[0]
23
 
24
# BUG FIX: the original passed `fn=speech_recognize`, a name never defined
# anywhere in this file — constructing the Interface raised a NameError.
# The callable defined above is `transcribe`.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.inputs.Audio(source="microphone", type='filepath'),
    outputs="text",
)
iface.launch()