patrickvonplaten committed
Commit: 9900ba7
1 Parent(s): 04e8018

Update app.py

Files changed (1):
  1. app.py  +3 -23

app.py CHANGED
@@ -1,18 +1,8 @@
-import soundfile as sf
 import torch
 from transformers import SpeechEncoderDecoder, Wav2Vec2Processor
 import gradio as gr
 import scipy.signal as sps
-import sox
 
-def convert(inputfile, outfile):
-    sox_tfm = sox.Transformer()
-    sox_tfm.set_output_format(
-        file_type="wav", channels=1, encoding="signed-integer", rate=16000, bits=16
-    )
-    #print(this is not done)
-    sox_tfm.build(inputfile, outfile)
-
 def read_file(wav):
     sample_rate, signal = wav
     signal = signal.mean(-1)
@@ -21,12 +11,6 @@ def read_file(wav):
     return resampled_signal
 
 def parse_transcription(wav_file):
-    '''
-    filename = wav_file.name.split('.')[0]
-    convert(wav_file.name, filename + "16k.wav")
-    speech, _ = sf.read(filename + "16k.wav")
-
-    '''
     speech = read_file(wav_file)
     input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
     logits = model(input_values).logits
@@ -35,14 +19,10 @@ def parse_transcription(wav_file):
     return transcription
 
 
-processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
-model = SpeechEncoderDecoder.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
-
-
-processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
-model = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
-
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15", use_auth_token="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")
+model = SpeechEncoderDecoder.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15", use_auth_token="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")
 
+
 #input_ = gr.inputs.Audio(source="microphone", type="file")
 input_ = gr.inputs.Audio(source="microphone", type="numpy")
 gr.Interface(parse_transcription, inputs = input_, outputs="text",
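
The hunks show only the first and last lines of read_file; the resampling step in between (old lines 19-20) falls outside the diff context. A minimal sketch of how that body plausibly completes, assuming the audio is resampled to 16 kHz with scipy.signal.resample; the num_samples arithmetic is an assumption, not taken from the commit:

import scipy.signal as sps

def read_file(wav):
    # Gradio "numpy" audio arrives as a (sample_rate, samples) tuple.
    sample_rate, signal = wav
    # Average the channels to mono (assumes a multi-channel recording).
    signal = signal.mean(-1)
    # Assumed body of the elided lines: resample to the 16 kHz rate the
    # Wav2Vec2 processor expects, using FFT-based scipy.signal.resample.
    num_samples = round(len(signal) * 16_000 / sample_rate)
    resampled_signal = sps.resample(signal, num_samples)
    return resampled_signal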
 
 
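The three lines this commit adds pass use_auth_token to from_pretrained, presumably because the facebook/wav2vec2-xls-r-300m-en-to-15 checkpoint required authentication at the time. A minimal sketch of the same call with the token read from an environment variable instead of a hard-coded literal; the HF_API_TOKEN variable name is an illustrative assumption, not something from the commit:

import os
from transformers import Wav2Vec2Processor

# Illustrative only: fetch the Hub token from the environment rather than
# embedding it in the source. HF_API_TOKEN is an assumed variable name.
hub_token = os.environ.get("HF_API_TOKEN")

processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-xls-r-300m-en-to-15",
    use_auth_token=hub_token,
)
# The model's from_pretrained call accepts the same use_auth_token keyword.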