Harveenchadha committed
Commit 38368a5
1 Parent(s): 98c1a75

Update app.py

Files changed (1): app.py (+13, -3)
app.py CHANGED

@@ -3,7 +3,7 @@ import torch
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import gradio as gr
 import sox
-
+import scipy.signal as sps
 
 
 def convert(inputfile, outfile):
@@ -13,12 +13,20 @@ def convert(inputfile, outfile):
     )
     sox_tfm.build(inputfile, outfile)
 
+def read_file(wav):
+    sample_rate, signal = wav_file
+    signal = signal.mean(-1)
+    number_of_samples = round(len(signal) * float(16000) / sample_rate)
+    resampled_signal = sps.resample(signal, number_of_samples)
+    return resampled_signal
 
 
 def parse_transcription(wav_file):
-    filename = wav_file.name.split('.')[0]
+    '''filename = wav_file.name.split('.')[0]
     convert(wav_file.name, filename + "16k.wav")
     speech, _ = sf.read(filename + "16k.wav")
+    '''
+    speech = read_file(wav_file)
     input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
 
     logits = model(input_values).logits
@@ -36,6 +44,8 @@ model = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-hi
 processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
 model = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
 
-input_ = gr.inputs.Audio(source="microphone", type="file")
+#input_ = gr.inputs.Audio(source="microphone", type="file")
+input_ = gr.inputs.Audio(source="microphone", type="numpy")
+
 gr.Interface(parse_transcription, inputs = input_, outputs="text",
              analytics_enabled=False, show_tips=False, enable_queue=True).launch(inline=False);
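
A note on the committed read_file: with type="numpy", Gradio hands the callback a (sample_rate, data) tuple of the raw recording, which is what this helper unpacks. As committed it has two bugs, though: the parameter is named wav while the body reads wav_file (a NameError at call time), and signal.mean(-1) averages a mono recording down to a single scalar instead of leaving it untouched. A minimal corrected sketch, assuming int16 microphone samples and the model's 16 kHz input rate (TARGET_SR is an illustrative constant, not from the commit):

import numpy as np
import scipy.signal as sps

TARGET_SR = 16000  # wav2vec2 checkpoints expect 16 kHz audio

def read_file(wav):
    # Gradio's type="numpy" microphone input is a (sample_rate, data) tuple.
    sample_rate, signal = wav
    signal = signal.astype(np.float32)
    if signal.ndim > 1:
        # Stereo capture: average the channels down to mono.
        signal = signal.mean(-1)
    # FFT-based resampling to the target rate.
    number_of_samples = round(len(signal) * TARGET_SR / sample_rate)
    return sps.resample(signal, number_of_samples)

Dropping the sox round trip through a temporary 16k.wav file is presumably the point of the change, which would explain why the old convert() path is fenced off in a docstring rather than deleted: the numpy input keeps the whole pipeline in memory.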