patrickvonplaten committed
Commit 00349e4 (1 parent: 5b8b578)

Update app.py

Files changed (1)
  1. app.py +16 -6
app.py CHANGED
@@ -2,9 +2,11 @@ import gradio as gr
 import librosa
 from transformers import AutoFeatureExtractor, AutoTokenizer, SpeechEncoderDecoderModel
 
-feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
-tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15", use_fast=False)
-model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
+# feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
+# tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15", use_fast=False)
+# model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
+
+feature_extractor = tokenizer = model = None
 
 def process_audio_file(file):
     data, sr = librosa.load(file)
@@ -14,11 +16,18 @@ def process_audio_file(file):
     input_values = feature_extractor(data, return_tensors="pt").input_values
     return input_values
 
-def transcribe(file, target_language):
+def transcribe(file_mic, file_upload, target_language):
 
     target_code = target_language.split("(")[-1].split(")")[0]
     forced_bos_token_id = MAPPING[target_code]
-
+
+    if file_mic is not None and file_upload is not None:
+        print("Warning: You've uploaded an audio file and used the microphone. The recorded file from the microphone will be used and the uploaded audio will be discarded.")
+    elif file_mic is None and file_upload is None:
+        raise ValueError("You have to either use the microphone or upload an audio file")
+
+    file = file_mic or file_upload
+
     input_values = process_audio_file(file)
 
     sequences = model.generate(input_values, forced_bos_token_id=forced_bos_token_id)
@@ -65,7 +74,8 @@ MAPPING = {
 iface = gr.Interface(
     fn=transcribe,
     inputs=[
-        gr.inputs.Audio(source="microphone", type='filepath'),
+        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
+        gr.inputs.Audio(source="upload", type='filepath', optional=True),
         gr.inputs.Dropdown(target_language),
     ],
     outputs="text",
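The feature_extractor = tokenizer = model = None stub lets app.py import without downloading the checkpoint, which makes it easy to exercise the new microphone-vs-upload precedence rule in isolation. A quick self-contained check of that rule (resolve_input is a hypothetical helper for illustration, not a function in app.py):

    def resolve_input(file_mic, file_upload):
        # Same precedence as transcribe(): the microphone recording wins.
        if file_mic is not None and file_upload is not None:
            print("Warning: both inputs were provided; using the microphone recording.")
        elif file_mic is None and file_upload is None:
            raise ValueError("You have to either use the microphone or upload an audio file")
        return file_mic or file_upload

    assert resolve_input("mic.wav", None) == "mic.wav"
    assert resolve_input(None, "upload.wav") == "upload.wav"
    assert resolve_input("mic.wav", "upload.wav") == "mic.wav"

With optional=True on both Audio inputs, Gradio passes None for whichever input the user leaves empty, which is why transcribe needs both None checks before picking a file.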
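For context, a minimal end-to-end sketch of the updated two-input flow with the checkpoint loading restored (the commented-out lines uncommented again). The MAPPING dict and the decoding step sit outside the hunks shown above, so the single MAPPING entry and the batch_decode call below are assumptions for illustration, not the Space's actual code; process_audio_file is inlined only to keep the sketch short:

    import librosa
    from transformers import AutoFeatureExtractor, AutoTokenizer, SpeechEncoderDecoderModel

    CKPT = "facebook/wav2vec2-xls-r-300m-en-to-15"
    feature_extractor = AutoFeatureExtractor.from_pretrained(CKPT)
    tokenizer = AutoTokenizer.from_pretrained(CKPT, use_fast=False)
    model = SpeechEncoderDecoderModel.from_pretrained(CKPT)

    # Assumed MAPPING entry: the real dict in app.py maps all 15
    # target-language codes to their forced BOS token ids.
    MAPPING = {"de": tokenizer.convert_tokens_to_ids("de_DE")}

    def transcribe(file_mic, file_upload, target_language="German (de)"):
        target_code = target_language.split("(")[-1].split(")")[0]
        forced_bos_token_id = MAPPING[target_code]
        file = file_mic or file_upload  # microphone takes precedence
        data, sr = librosa.load(file, sr=16_000)  # XLS-R expects 16 kHz audio
        input_values = feature_extractor(data, sampling_rate=sr, return_tensors="pt").input_values
        sequences = model.generate(input_values, forced_bos_token_id=forced_bos_token_id)
        # Decoding is not part of this diff; batch_decode is assumed here.
        return tokenizer.batch_decode(sequences, skip_special_tokens=True)[0]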