import gradio as gr
import soundfile as sf
import numpy as np
import torch
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

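# Wav2Vec2-XLSR-53 CTC checkpoints fine-tuned on Icelandic (1000 h) and Faroese (100 h) speech.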
MODEL_IS="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"

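# Fix the random seed and prefer the GPU when CUDA is available.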
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load both models and their processors once at startup, so each request only runs inference.
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)

def readwav(a_f):
    """Read an audio file, downmix stereo to mono, and resample to 16 kHz."""
    wav, sr = sf.read(a_f, dtype=np.float32)
    if wav.ndim == 2:  # stereo: average the two channels
        wav = wav.mean(1)
    if sr != 16000:  # the wav2vec2 models expect 16 kHz input
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav

def recognise(audio_file, processor, model):
    """Greedy CTC decoding: argmax over the logits, then ids back to text."""
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        # batch_decode returns a list of strings; we transcribe one file at a time.
        return processor.batch_decode(pred_ids)[0]

def recis(audio_file):
    return recognise(audio_file, processor_is, model_is)

def recfo(audio_file):
    return recognise(audio_file, processor_fo, model_fo)

bl = gr.Blocks()
with bl:

    gr.Markdown(
        """
    # W2V2 speech recognition
    Upload a file to transcribe it with
    https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h
    or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h

    - For some reason, the Hugging Face 'Hosted inference API' on the model pages does not work, but this demo does.
    - There is no language model (yet), so the output can contain non-words.
    """
    )

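    # One tab per language; each tab's button is wired to the matching recogniser.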
    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                text_output = gr.Textbox()
            text_button = gr.Button("Recognise")
            text_button.click(recis, inputs=audio_file, outputs=text_output)
        with gr.TabItem("Faroese"):
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                text_output = gr.Textbox()
            text_button = gr.Button("Recognise")
            text_button.click(recfo, inputs=audio_file, outputs=text_output)

bl.launch()
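
# A minimal sketch of calling the recognisers directly, without the UI
# (assumes a local audio file; "sample.wav" is a hypothetical path):
#
#     print(recis("sample.wav"))   # Icelandic transcript
#     print(recfo("sample.wav"))   # Faroese transcript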