Spaces:
Runtime error
Runtime error
File size: 2,360 Bytes
3c7858a 07459bd 08132ae a7ec155 022f864 d4083a0 07459bd df2191f d4083a0 df3f33b 07459bd ec875d7 07459bd 92776f9 df2191f 8f928e3 df2191f 09cbba9 8f928e3 df2191f 8f928e3 bff0c13 df2191f 022f864 bff0c13 0d69e89 df2191f c900926 2373938 ebcd408 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# from: https://gradio.app/real_time_speech_recognition/
from transformers import pipeline, Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
import torch
import pyctcdecode
import kenlm
import gradio as gr
import librosa
import os
import time
# Load the acoustic model, its CTC tokenizer and the language-model-backed
# processor from the Hugging Face Hub.
# The access token is read from the environment; it is None when
# HUGGING_FACE_HUB_TOKEN is not set.
token_key = os.environ.get("HUGGING_FACE_HUB_TOKEN")
# Alternative checkpoint: "unilux/wav2vec-xls-r-Luxembourgish20-with-LM"
model_name = "unilux/wav2vec-xlsr-300m-Luxembourgish-with-LM"
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name, use_auth_token=token_key)
model = Wav2Vec2ForCTC.from_pretrained(model_name, use_auth_token=token_key)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name, use_auth_token=token_key)

# Run on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Speech-recognition pipeline built from the pieces loaded above; the decoder
# taken from the processor is what enables language-model decoding.
# The device is passed explicitly (0 = first CUDA device, -1 = CPU) so the
# pipeline places its input tensors on the same device the model was moved to
# instead of relying on the pipeline's version-dependent default.
p = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
    device=0 if device == "cuda" else -1,
    use_auth_token=token_key,
)
def load_data(input_file):
    """Read an audio file as 16 kHz mono and trim it with librosa.

    ``input_file`` is handed to ``librosa.load`` with ``sr=16_000`` and
    ``mono=True``, so the signal is resampled to 16 kHz on load. The result
    is then passed through ``librosa.effects.trim`` with ``top_db=10``.

    Returns the trimmed signal, i.e. the first element of the tuple that
    ``librosa.effects.trim`` returns.
    """
    target_rate = 16_000
    # The second value returned by librosa.load (the sample rate) is not needed.
    waveform, _ = librosa.load(input_file, sr=target_rate, mono=True)
    # trim returns (trimmed_signal, interval); only the signal is kept.
    trimmed, _ = librosa.effects.trim(waveform, top_db=10)
    return trimmed
def asr_pipe(input_file):
    """Transcribe a recorded audio file with the speech-recognition pipeline.

    Args:
        input_file: Path of the audio file to transcribe. May be ``None`` or
            empty, because the audio input of the interface is optional.

    Returns:
        The recognised text, or an empty string when no recording was given.
    """
    # The audio input is declared optional, so a submission without a
    # recording arrives here as None; return an empty transcription instead
    # of failing inside the pipeline.
    if not input_file:
        return ""
    # The file path is passed straight to the pipeline, which reads the file
    # itself. The previous extra load_data() call decoded and trimmed the
    # audio and then discarded the result, so it has been dropped.
    # Long recordings are processed in 3 s chunks with 1 s of stride on each
    # side.
    result = p(input_file, chunk_length_s=3, stride_length_s=(1, 1))
    return result["text"]
# Build the Gradio demo: microphone recording in, recognised text out.
# The recording is handed to asr_pipe as a file path; it is optional.
microphone_input = gr.inputs.Audio(
    source="microphone",
    type="filepath",
    optional=True,
    label="Hei kënnt Dir Är Sprooch iwwert de Mikro ophuelen",
)
transcript_output = gr.outputs.Textbox(label="Erkannten Text")

# Audio files offered as clickable examples in the interface.
example_recordings = [
    ["ChamberMeisch.wav"],
    ["Chamber_Fayot_2005.wav"],
    ["Erlieft-a-Verzielt.wav"],
    ["Schnessen_Beispill.wav"],
]

demo = gr.Interface(
    asr_pipe,
    inputs=microphone_input,
    outputs=transcript_output,
    title="Sproocherkennung fir d'Lëtzebuergescht @uni.lu",
    description="Dës App convertéiert Är geschwate Sprooch an de (méi oder manner richtegen ;-)) Text!",
    examples=example_recordings,
    theme="default",
)
demo.launch()
|