Update app.py
app.py
CHANGED
@@ -3,30 +3,20 @@ import soundfile as sf
 import numpy as np
 import torch, torchaudio
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-from datasets import load_dataset, Audio
-import matplotlib.pyplot as plt
 
-
-
+MODEL_IS="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
+MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
 
 torch.random.manual_seed(0)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-
-
-
-
-#ds = load_dataset("language-and-voice-lab/samromur_asr",split='train',streaming=True)
-#ds = load_dataset("language-and-voice-lab/samromur_asr",split='test')
-#ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
-
-def show_ex(exnum):
-    #return(ds['audio_id'][exnum])
-    return(exnum)
-
+model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
+processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
+model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
+processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)
 
-def recc(a_f):
-    wav, sr = sf.read(a_f)
+def recc(audio_file,model,processor):
+    wav, sr = sf.read(audio_file, dtype=np.float32)
     if len(wav.shape) == 2:
         wav = wav.mean(1)
     if sr != 16000:
@@ -34,31 +24,42 @@ def recc(a_f):
         wav = signal.resample(wav, wlen)
 
     with torch.inference_mode():
-        #wav = torch.from_numpy(wav).unsqueeze(0)
-        #if torch.cuda.is_available():
-        #    wav = wav.cuda()
         input_values = processor(wav,sampling_rate=16000).input_values[0]
         input_values = torch.tensor(input_values, device=device).unsqueeze(0)
         logits = model(input_values).logits
         pred_ids = torch.argmax(logits, dim=-1)
-        #pred_ids= pred_ids[0].cpu().detach()
         xcp = processor.batch_decode(pred_ids)
         return xcp
 
 
 bl = gr.Blocks()
 with bl:
-    audio_file = gr.Audio(type="filepath")
-    text_button = gr.Button("Recognise")
-    text_output = gr.Textbox()
-
-    text_button.click(recc, inputs=audio_file, outputs=text_output)
 
+    gr.Markdown(
+        """
+        # W2V2 speech recognition
+        Upload a file for recognition with
+        https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h
+        or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h
 
-
+        - For some reason, the huggingface 'Hosted inference API' on the model page does not work, but this demo does.
+        - There is no language model (yet), so it can generate non-words.
+        """
+    )
+
+    with gr.Tabs():
+        with gr.TabItem("Icelandic"):
+            with gr.Row():
+                audio_file = gr.Audio(type="filepath")
+                text_output = gr.Textbox()
+            text_button = gr.Button("Recognise")
+            text_button.click(recc, inputs=[audio_file,model_is,processor_is], outputs=text_output)
+        with gr.TabItem("Faroese"):
+            with gr.Row():
+                audio_file = gr.Audio(type="filepath")
+                text_output = gr.Textbox()
+            text_button = gr.Button("Recognise")
+            text_button.click(recc, inputs=[audio_file,model_fo,processor_fo], outputs=text_output)
 
-
-#https://huggingface.co/spaces/pplonski/deploy-mercury
-#https://discuss.huggingface.co/t/deploy-interactive-jupyter-notebook-on-spaces-with-mercury/17000
-#https://huggingface.co/docs/transformers/notebooks
+bl.launch()
 
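Note: the unchanged line between the two hunks (old line 33 / new line 23), which computes `wlen` for `signal.resample(wav, wlen)`, is elided by the diff, so its exact form is not visible here. A minimal sketch of what such a target-length computation typically looks like when resampling to 16 kHz with `scipy.signal.resample` (an assumption, not the committed line):

    from scipy import signal

    # Assumed shape of the elided line: number of output samples at 16 kHz.
    wlen = int(wav.shape[0] / sr * 16000)
    wav = signal.resample(wav, wlen)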
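For context, `recc` performs plain greedy CTC decoding: the processor turns raw 16 kHz samples into `input_values`, the model returns per-frame logits, `argmax` selects one token per frame, and `processor.batch_decode` collapses repeats and strips CTC blanks. The same path outside Gradio, as a self-contained sketch (the audio file name is a placeholder):

    import soundfile as sf
    import torch
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

    MODEL_IS = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
    processor = Wav2Vec2Processor.from_pretrained(MODEL_IS)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_IS)

    # "sample_16k.wav" stands in for any mono 16 kHz recording.
    wav, sr = sf.read("sample_16k.wav", dtype="float32")
    inputs = processor(wav, sampling_rate=16000, return_tensors="pt")
    with torch.inference_mode():
        logits = model(inputs.input_values).logits
    print(processor.batch_decode(torch.argmax(logits, dim=-1)))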
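One caveat in the new wiring: Gradio event `inputs` are expected to be UI components, so putting `model_is`/`processor_is` (plain Python objects) in the `inputs` list will likely raise an error rather than forward them to `recc`. A common pattern is to bind the model and processor up front, e.g. with `functools.partial`; a sketch under that assumption, not part of this commit:

    from functools import partial

    # Only the audio component stays an event input; the model and
    # processor for this tab are captured by the partial.
    text_button.click(partial(recc, model=model_is, processor=processor_is),
                      inputs=audio_file, outputs=text_output)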