patrickvonplaten committed on
Commit
dd689bb
1 Parent(s): 0566261

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -5
app.py CHANGED
@@ -1,7 +1,49 @@
 
 
 
1
  import gradio as gr
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
-
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import scipy.signal as sps
import soundfile as sf
import sox
import torch
# NOTE(review): transformers exports this class as `SpeechEncoderDecoderModel`;
# the alias keeps the name this file already uses importable.
from transformers import SpeechEncoderDecoderModel as SpeechEncoderDecoder
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
7
 
8
def convert(inputfile, outfile):
    """Transcode ``inputfile`` into a mono, 16 kHz, 16-bit signed-integer WAV.

    Args:
        inputfile: Path of the audio file to read.
        outfile: Path the converted WAV file is written to.
    """
    output_format = {
        "file_type": "wav",
        "rate": 16000,
        "bits": 16,
        "channels": 1,
        "encoding": "signed-integer",
    }
    transformer = sox.Transformer()
    transformer.set_output_format(**output_format)
    transformer.build(inputfile, outfile)
15
+
16
def read_file(wav):
    """Convert a recorded clip to a single-channel signal sampled at 16 kHz.

    Args:
        wav: ``(sample_rate, signal)`` pair. ``signal`` is a NumPy array that
            is either 1-D (mono) or 2-D with the channels on the last axis.

    Returns:
        1-D array holding the signal resampled to 16 kHz. An empty clip is
        returned unchanged.
    """
    sample_rate, signal = wav
    # Down-mix only when a channel axis exists: averaging a mono (1-D) clip
    # over its last axis would collapse it to a scalar and break len() below.
    if signal.ndim > 1:
        signal = signal.mean(-1)
    if signal.size == 0:
        # Nothing to resample.
        return signal
    number_of_samples = round(len(signal) * float(16000) / sample_rate)
    return sps.resample(signal, number_of_samples)
22
+
23
def parse_transcription(wav_file):
    """Transcribe a recorded clip with the module-level ``processor`` and ``model``.

    Args:
        wav_file: ``(sample_rate, signal)`` pair; it is handed unchanged to
            ``read_file``, which returns the 16 kHz signal fed to the model.

    Returns:
        The decoded transcription.
    """
    speech = read_file(wav_file)
    input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy decoding: keep the highest-scoring id along the last axis.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
36
+
37
+
38
# NOTE(review): a "facebook/wav2vec2-xls-r-300m-en-to-15" processor/model pair
# used to be loaded here and was overwritten straight away by the assignments
# below, so only this checkpoint was ever used. The redundant load is removed.
MODEL_ID = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
44
+
45
+
46
# Microphone input; type="numpy" so the callback receives the value that
# read_file unpacks as (sample_rate, signal).
input_ = gr.inputs.Audio(source="microphone", type="numpy")

gr.Interface(
    fn=parse_transcription,
    inputs=input_,
    outputs="text",
    analytics_enabled=False,
    show_tips=False,
    enable_queue=True,
).launch(inline=False)