vumichien committed
Commit 08f9ba3
1 Parent(s): f481a94
app.py CHANGED
@@ -2,8 +2,9 @@ import gradio as gr
 import librosa
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import torch
+
 # config
-model_name = "vumichien/wav2vec2-large-xlsr-japanese-hỉragana"
+model_name = "vumichien/wav2vec2-large-xlsr-japanese-hiragana"
 processor = Wav2Vec2Processor.from_pretrained(model_name)
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
 
@@ -11,44 +12,37 @@ model = Wav2Vec2ForCTC.from_pretrained(model_name)
 def process_audio_file(file):
     data, sr = librosa.load(file)
     if sr != 16000:
-        data = librosa.resample(data, sr, 16000).squeeze()
+        data = librosa.resample(data, sr, 16000)
     print(data.shape)
     inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
     return inputs
 
 
-def transcribe(file_mic, file_upload):
-    warn_output = ""
-    if (file_mic is not None) and (file_upload is not None):
-        warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the " \
-                      "microphone will be used and the uploaded audio will be discarded.\n "
-        file = file_mic
-    elif (file_mic is None) and (file_upload is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
-    elif file_mic is not None:
-        file = file_mic
-    else:
-        file = file_upload
+def transcribe(file):
     inputs = process_audio_file(file)
     with torch.no_grad():
-        output_logit = model(inputs.input_values, attention_mask=inputs.attention_mask.to("cuda")).logits
+        output_logit = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
     pred_ids = torch.argmax(output_logit, dim=-1)
-    return warn_output + processor.batch_decode(pred_ids)
+    return processor.batch_decode(pred_ids)[0]
 
 
+description = "A simple interface to transcribe from spoken Japanese to Hiragana."
+article = "<p style='text-align: center'><a @2022 Detomo </a></p>"
+inputs = [gr.inputs.Audio(source="microphone", type='filepath', optional=True)
+          ]
+examples = [["samples/BASIC5000_0001.wav"],
+            ["samples/BASIC5000_0005.wav"]
+           ]
 iface = gr.Interface(
     fn=transcribe,
-    inputs=[
-        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
-        gr.inputs.Audio(source="upload", type='filepath', optional=True),
-    ],
+    inputs=inputs,
     outputs="text",
     layout="horizontal",
     theme="huggingface",
     title="Transcribe Japanese audio to Hiragana",
-    description="A simple interface to transcribe from spoken Japanese to Hiragana.",
-    article="<p style='text-align: center'><a href='https://huggingface.co/facebook/wav2vec2-xls-r-1b-en-to-15' target='_blank'>Click to learn more about XLS-R-1B-EN-15 </a> | <a href='https://arxiv.org/abs/2111.09296' target='_blank'> With 🎙️ from Facebook XLS-R </a></p>",
-    enable_queue=True,
-    allow_flagging=False,
+    description=description,
+    article=article,
+    allow_flagging='never',
+    examples=examples
 )
-iface.launch()
+iface.launch(enable_queue=True, share=True)
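
This revision narrows transcribe() to a single audio input, removes the .to("cuda") call that moved only the attention mask to GPU while the model and input values stayed on CPU, and returns just the first decoded string. A standalone sketch of the same inference path, useful for smoke-testing outside Gradio (it assumes the pinned dependencies below are installed and the bundled sample clip is available; transcribe_file is an illustrative name, not part of the commit):

# Sketch of the inference path introduced in this commit, without the Gradio wrapper.
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model_name = "vumichien/wav2vec2-large-xlsr-japanese-hiragana"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

def transcribe_file(path):
    # librosa.load decodes to mono float32 at 22.05 kHz by default, so resampling is usually needed
    data, sr = librosa.load(path)
    if sr != 16000:
        data = librosa.resample(data, sr, 16000)
    inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    pred_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
    return processor.batch_decode(pred_ids)[0]

print(transcribe_file("samples/BASIC5000_0001.wav"))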
requirements.txt ADDED
@@ -0,0 +1,4 @@
+gradio~=2.7.5.2
+librosa~=0.8.1
+torch~=1.10.1
+transformers~=4.15.0
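
The librosa~=0.8.1 pin matters for the positional librosa.resample(data, sr, 16000) call in app.py: newer librosa releases expect the sample rates as keyword arguments. If the pin is ever relaxed, a keyword-based call should keep working across versions (a defensive sketch, not something this commit does; to_16k is an illustrative helper name):

import librosa

def to_16k(data, sr):
    # Keyword arguments are accepted by librosa 0.8.x and required by newer releases.
    if sr != 16000:
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)
    return data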
samples/BASIC5000_0001.wav ADDED
Binary file (306 kB).
 
samples/BASIC5000_0005.wav ADDED
Binary file (354 kB).