vumichien committed on
Commit
f481a94
1 Parent(s): 40989d6
Files changed (1) hide show
  1. app.py +54 -0
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
4
+ import torch
5
# config
# Hub id of the fine-tuned checkpoint. It must be plain ASCII: the previous
# value spelled "hiragana" with a non-ASCII "ỉ" (U+1EC9), which does not match
# the repository name and makes from_pretrained fail to resolve it.
model_name = "vumichien/wav2vec2-large-xlsr-japanese-hiragana"
# Processor and model are loaded once at import time and shared by all requests.
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
9
+
10
+
11
def process_audio_file(file):
    """Load an audio file and turn it into inputs for the speech model.

    Args:
        file: Path of the audio file to read (passed straight to
            ``librosa.load``).

    Returns:
        The object returned by ``processor`` for the decoded waveform,
        requested as PyTorch tensors (``return_tensors="pt"``) with padding.
    """
    target_sr = 16000
    # Ask librosa to resample while decoding instead of loading at its default
    # rate and resampling in a second pass. This also avoids the positional
    # ``librosa.resample(data, sr, 16000)`` call, which newer librosa releases
    # reject because the sample-rate arguments are keyword-only.
    data, _ = librosa.load(file, sr=target_sr)
    inputs = processor(data, sampling_rate=target_sr, return_tensors="pt", padding=True)
    return inputs
18
+
19
+
20
def transcribe(file_mic, file_upload):
    """Transcribe speech from a microphone recording or an uploaded file.

    Args:
        file_mic: Path of the microphone recording, or ``None`` if unused.
        file_upload: Path of the uploaded audio file, or ``None`` if unused.

    Returns:
        The decoded transcription as a string. When both inputs are given the
        microphone recording wins and the text is prefixed with a warning.
        When neither is given an error message is returned instead.
    """
    if file_mic is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if file_mic is not None and file_upload is not None:
        warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the " \
                      "microphone will be used and the uploaded audio will be discarded.\n "
    # The microphone recording takes precedence whenever it is present.
    file = file_mic if file_mic is not None else file_upload

    inputs = process_audio_file(file)

    # Keep every tensor on the device the model actually lives on. Previously
    # only the attention mask was sent to "cuda" while the model and the input
    # values stayed put, which fails on CPU-only hosts and mismatches devices
    # otherwise.
    device = model.device
    input_values = inputs.input_values.to(device)
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        output_logit = model(input_values, attention_mask=attention_mask).logits
    pred_ids = torch.argmax(output_logit, dim=-1)

    # batch_decode returns a list with one string per batch item; a single
    # file is processed here, so take its only entry before concatenating
    # (str + list would raise TypeError).
    return warn_output + processor.batch_decode(pred_ids)[0]
37
+
38
+
39
# Two optional audio sources; `transcribe` receives them in this order
# (microphone recording first, uploaded file second), each as a file path.
mic_input = gr.inputs.Audio(source="microphone", type='filepath', optional=True)
upload_input = gr.inputs.Audio(source="upload", type='filepath', optional=True)

# NOTE(review): the article link below points at
# facebook/wav2vec2-xls-r-1b-en-to-15, not at the checkpoint in `model_name`;
# looks like a leftover from another demo - confirm before changing.
article_html = "<p style='text-align: center'><a href='https://huggingface.co/facebook/wav2vec2-xls-r-1b-en-to-15' target='_blank'>Click to learn more about XLS-R-1B-EN-15 </a> | <a href='https://arxiv.org/abs/2111.09296' target='_blank'> With 🎙️ from Facebook XLS-R </a></p>"

# Build the web UI around `transcribe`, with plain-text output.
iface = gr.Interface(
    fn=transcribe,
    inputs=[mic_input, upload_input],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Transcribe Japanese audio to Hiragana",
    description="A simple interface to transcribe from spoken Japanese to Hiragana.",
    article=article_html,
    enable_queue=True,
    allow_flagging=False,
)

# Start serving the interface.
iface.launch()