File size: 2,507 Bytes
5d65b51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
import subprocess
import sys

# Packages this demo installs on start-up, in installation order.
_RUNTIME_PACKAGES = (
    "transformers",
    "https://github.com/kpu/kenlm/archive/master.zip",
    "pyctcdecode",
    "gradio",
    "librosa",
)


def _pip_install(package):
    """Install *package* using the pip of the running interpreter.

    Args:
        package: A requirement name or archive URL accepted by ``pip install``.

    Returns:
        The exit status of the pip process (0 means success).
    """
    # An argument list (no shell) avoids quoting issues, and running pip through
    # ``sys.executable -m pip`` targets the environment that will do the imports
    # instead of whichever ``pip`` happens to be first on PATH.
    status = subprocess.run(
        [sys.executable, "-m", "pip", "install", package],
        check=False,
    ).returncode
    if status != 0:
        # Stay best-effort (the package may already be installed), but make the
        # failure visible instead of discarding the exit status.
        print(
            f"WARNING: 'pip install {package}' exited with status {status}",
            file=sys.stderr,
        )
    return status


for _package in _RUNTIME_PACKAGES:
    _pip_install(_package)

import gradio as gr
import librosa
import torch

from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC
from transformers import Wav2Vec2ProcessorWithLM

# Hugging Face Hub repository that holds the model, processor and tokenizer.
repo_name = "aiface/vietnamese_s2t"

# Access token for the Hub, read from the environment so that no credential is
# stored in the source. It is None when HF_TOKEN is unset; from_pretrained is
# then called with token=None (presumably falling back to the library's default
# credential lookup -- confirm for private repositories).
# NOTE: the token that was previously hard-coded here was published with this
# file and should be revoked.
_hf_token = os.environ.get("HF_TOKEN")

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Processor with language-model decoding; used by transcribe() for both
# feature preparation and batch_decode.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(repo_name, token=_hf_token)
# CTC acoustic model, moved to the selected device.
model = Wav2Vec2ForCTC.from_pretrained(repo_name, token=_hf_token).to(device)
# Loaded for completeness; not referenced by the functions visible in this file.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(repo_name, token=_hf_token)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_name, token=_hf_token)

def process_audio_file(file, sampling_rate=16000):
    """Load an audio file and return its samples at ``sampling_rate`` Hz.

    Args:
        file: The audio source handed to ``librosa.load`` (the Gradio inputs
            in this file are configured with ``type='filepath'``).
        sampling_rate: Value passed to ``librosa.load`` as ``sr``. Defaults to
            16000, the rate transcribe() declares to the processor.

    Returns:
        The audio data returned by ``librosa.load``; the sampling rate that
        librosa returns alongside it is discarded.
    """
    waveform, _ = librosa.load(file, sr=sampling_rate)
    return waveform

def transcribe(file_mic, file_upload):
    """Transcribe one audio input, preferring the microphone recording.

    Args:
        file_mic: Audio from the microphone input, or None if nothing was
            recorded.
        file_upload: Audio from the upload input, or None if nothing was
            uploaded.

    Returns:
        The decoded text as a string. When both inputs are provided the text
        is prefixed with a warning line and the microphone recording is used.
        When neither is provided an error message string is returned instead.
    """
    # Guard clause: nothing to transcribe.
    if file_mic is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if file_mic is not None and file_upload is not None:
        warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"

    # The microphone recording wins whenever it is present.
    audio_path = file_mic if file_mic is not None else file_upload

    input_values = process_audio_file(audio_path)
    input_dict = processor(input_values, sampling_rate=16_000, return_tensors="pt", padding=True)

    # Inference only: disable autograd so no gradient graph is built or kept
    # alive for the forward pass.
    with torch.no_grad():
        logits = model(input_dict.input_values.to(device)).logits

    # The LM-backed processor decodes from the raw logits as a NumPy array.
    pres = processor.batch_decode(logits.cpu().numpy()).text

    return warn_output + str(pres[0])

# Build the web UI: two optional audio inputs (microphone first, upload second,
# matching the parameter order of transcribe) and a plain-text output.
# NOTE(review): gr.inputs.Audio, optional=, layout= and theme="huggingface" look
# like legacy Gradio API, while the gradio install in this file is unpinned --
# confirm the Gradio version this app actually runs against.
_audio_sources = ("microphone", "upload")
_interface_options = dict(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source=_source, type='filepath', optional=True)
        for _source in _audio_sources
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Speech to text MMS With Language Model",
    description="Demo đơn giản speech to text",
)
iface = gr.Interface(**_interface_options)

# Start the server and request a public share link.
iface.launch(share=True)