import gradio as gr
import soundfile as sf
import numpy as np
import torch
import ast  # used in dec() to parse token ids back out of the Textbox string
from scipy import signal  # recc() uses signal.resample
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset, Audio


MODEL_NAME = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"

torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME).to(device)
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)  # needed: wraps the feature extractor and the CTC tokenizer used for decoding


#ds = load_dataset("language-and-voice-lab/samromur_asr",split='train',streaming=True)
#ds = load_dataset("language-and-voice-lab/samromur_asr",split='test')
#ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

def show_ex(exnum):
    # placeholder for browsing dataset examples; just echoes the index while the dataset load above is commented out
    #return(ds['audio_id'][exnum])
    return exnum

def recc(a_f):
    # load the uploaded audio, downmix to mono, and resample to 16 kHz if needed
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)

    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        pred_ids = pred_ids[0].cpu().detach()
    # return the raw CTC token ids as a plain list so the Textbox shows a string that dec() can parse back
    return pred_ids.tolist()

def dec(pids):
    # the Textbox hands the ids back as their string repr, e.g. "[12, 0, 34]", so parse first
    ids = ast.literal_eval(pids) if isinstance(pids, str) else pids
    xcp = processor.decode(ids)
    return xcp

bl = gr.Blocks()
with bl:
    audio_file = gr.Audio(type="filepath")
    text_button = gr.Button("Recognise")
    text_output = gr.Textbox()
    
    text_button.click(recc, inputs=audio_file, outputs=text_output)

    text_button2 = gr.Button("Dec")
    text_output2 = gr.Textbox()
    text_button2.click(dec, inputs=text_output, outputs=text_output2)


bl.launch()

#https://mercury-docs.readthedocs.io/en/latest/deploy/hugging-face-spaces/
#https://huggingface.co/spaces/pplonski/deploy-mercury
#https://discuss.huggingface.co/t/deploy-interactive-jupyter-notebook-on-spaces-with-mercury/17000
#https://huggingface.co/docs/transformers/notebooks