Spaces:

Detomo
/

voice-japanese

Build error

App Files Files Community

vumichien committed on Jan 26, 2022

Commit

f481a94

•

1 Parent(s): 40989d6

update

Browse files

Files changed (1) hide show

app.py +54 -0

app.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import gradio as gr
+import librosa
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+import torch
+# config
+model_name = "vumichien/wav2vec2-large-xlsr-japanese-hỉragana"
+processor = Wav2Vec2Processor.from_pretrained(model_name)
+model = Wav2Vec2ForCTC.from_pretrained(model_name)
+def process_audio_file(file):
+    data, sr = librosa.load(file)
+    if sr != 16000:
+        data = librosa.resample(data, sr, 16000).squeeze()
+    print(data.shape)
+    inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
+    return inputs
+def transcribe(file_mic, file_upload):
+    warn_output = ""
+    if (file_mic is not None) and (file_upload is not None):
+        warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the " \
+                      "microphone will be used and the uploaded audio will be discarded.\n "
+        file = file_mic
+    elif (file_mic is None) and (file_upload is None):
+        return "ERROR: You have to either use the microphone or upload an audio file"
+    elif file_mic is not None:
+        file = file_mic
+    else:
+        file = file_upload
+    inputs = process_audio_file(file)
+    with torch.no_grad():
+        output_logit = model(inputs.input_values, attention_mask=inputs.attention_mask.to("cuda")).logits
+    pred_ids = torch.argmax(output_logit, dim=-1)
+    return warn_output + processor.batch_decode(pred_ids)
+iface = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
+        gr.inputs.Audio(source="upload", type='filepath', optional=True),
+    ],
+    outputs="text",
+    layout="horizontal",
+    theme="huggingface",
+    title="Transcribe Japanese audio to Hiragana",
+    description="A simple interface to transcribe from spoken Japanese to Hiragana.",
+    article="<p style='text-align: center'><a href='https://huggingface.co/facebook/wav2vec2-xls-r-1b-en-to-15' target='_blank'>Click to learn more about XLS-R-1B-EN-15 </a> | <a href='https://arxiv.org/abs/2111.09296' target='_blank'> With 🎙️ from Facebook XLS-R </a></p>",
+    enable_queue=True,
+    allow_flagging=False,
+)
+iface.launch()