Build error
update
- app.py +19 -25
- requirements.txt +4 -0
- samples/BASIC5000_0001.wav +0 -0
- samples/BASIC5000_0005.wav +0 -0
app.py
CHANGED
@@ -2,8 +2,9 @@ import gradio as gr
 import librosa
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import torch
+
 # config
-model_name = "vumichien/wav2vec2-large-xlsr-japanese-
+model_name = "vumichien/wav2vec2-large-xlsr-japanese-hiragana"
 processor = Wav2Vec2Processor.from_pretrained(model_name)
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
@@ -11,44 +12,37 @@ model = Wav2Vec2ForCTC.from_pretrained(model_name)
 def process_audio_file(file):
     data, sr = librosa.load(file)
     if sr != 16000:
-        data = librosa.resample(data, sr, 16000)
+        data = librosa.resample(data, sr, 16000)
     print(data.shape)
     inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
     return inputs


-def transcribe(file_mic, file_upload):
-    warn_output = ""
-    if (file_mic is not None) and (file_upload is not None):
-        warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the " \
-                      "microphone will be used and the uploaded audio will be discarded.\n "
-        file = file_mic
-    elif (file_mic is None) and (file_upload is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
-    elif file_mic is not None:
-        file = file_mic
-    else:
-        file = file_upload
+def transcribe(file):
     inputs = process_audio_file(file)
     with torch.no_grad():
-        output_logit = model(inputs.input_values, attention_mask=inputs.attention_mask
+        output_logit = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
     pred_ids = torch.argmax(output_logit, dim=-1)
-    return
+    return processor.batch_decode(pred_ids)[0]


+description = "A simple interface to transcribe from spoken Japanese to Hiragana."
+article = "<p style='text-align: center'><a @2022 Detomo </a></p>"
+inputs = [gr.inputs.Audio(source="microphone", type='filepath', optional=True)
+          ]
+examples = [["samples/BASIC5000_0001.wav"],
+            ["samples/BASIC5000_0005.wav"]
+            ]
 iface = gr.Interface(
     fn=transcribe,
-    inputs=[
-        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
-        gr.inputs.Audio(source="upload", type='filepath', optional=True),
-    ],
+    inputs=inputs,
     outputs="text",
     layout="horizontal",
     theme="huggingface",
     title="Transcribe Japanese audio to Hiragana",
-    description=
-    article=
-
-
+    description=description,
+    article=article,
+    allow_flagging='never',
+    examples=examples
 )
-iface.launch()
+iface.launch(enable_queue=True, share=True)
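For reference, the inference path that the updated app.py wires into Gradio can be exercised on its own: load audio at 16 kHz, run the CTC model, take the argmax of the logits, and decode. The sketch below is illustrative, not part of this commit; the standalone transcribe_file helper is hypothetical, and it folds the explicit resample step into librosa.load(..., sr=16000), which resamples at load time.

import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model_name = "vumichien/wav2vec2-large-xlsr-japanese-hiragana"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

def transcribe_file(path):
    # hypothetical helper: same pipeline as app.py's transcribe(), minus Gradio
    data, _ = librosa.load(path, sr=16000)  # librosa resamples at load time
    inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    pred_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
    return processor.batch_decode(pred_ids)[0]

print(transcribe_file("samples/BASIC5000_0001.wav"))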
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+gradio~=2.7.5.2
+librosa~=0.8.1
+torch~=1.10.1
+transformers~=4.15.0
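All four pins use ~=, pip's compatible-release operator: librosa~=0.8.1, for example, allows any 0.8.x release at or above 0.8.1 but excludes 0.9.0. The librosa pin matters here, since newer librosa releases (0.10 and later) made the resample arguments keyword-only, so the positional call in app.py would need rewriting, roughly as below (a sketch, assuming librosa >= 0.10):

# keyword-only form required by librosa >= 0.10;
# the pinned 0.8.1 still accepts the positional form used in app.py
data = librosa.resample(data, orig_sr=sr, target_sr=16000)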
samples/BASIC5000_0001.wav
ADDED
Binary file (306 kB).
samples/BASIC5000_0005.wav
ADDED
Binary file (354 kB).