# from: https://gradio.app/real_time_speech_recognition/

from transformers import pipeline, Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
import torch
import pyctcdecode  # needed by Wav2Vec2ProcessorWithLM for CTC beam-search decoding
import kenlm        # needed by Wav2Vec2ProcessorWithLM for the n-gram language model
import gradio as gr
import librosa
import os

# Load the model, the tokenizer and the LM-boosted processor
token_key = os.environ.get("HUGGING_FACE_HUB_TOKEN")
#model_name = "unilux/wav2vec-xls-r-Luxembourgish20-with-LM"
model_name = "unilux/wav2vec-xlsr-300m-Luxembourgish-with-LM"
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name, token=token_key)
model = Wav2Vec2ForCTC.from_pretrained(model_name, token=token_key)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name, token=token_key)

# Run on GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

p = pipeline("automatic-speech-recognition",
             model=model,
             tokenizer=tokenizer,
             feature_extractor=processor.feature_extractor,
             decoder=processor.decoder,
             device=device)
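
# Quick sanity check of the pipeline (a sketch; assumes one of the sample .wav
# files listed in the examples below sits next to this script):
# print(p("ChamberMeisch.wav", chunk_length_s=3, stride_length_s=(1, 1))["text"])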

# Simpler alternative: let the pipeline pull model, tokenizer and decoder
# from the Hub by name:
#p = pipeline("automatic-speech-recognition", model=model_name, token=token_key)

def load_data(input_file):
  """Resample the speech input to 16 kHz mono and trim leading/trailing silence."""
  sampling_rate = 16_000
  # read the file and resample to 16 kHz mono
  speech, _ = librosa.load(input_file, sr=sampling_rate, mono=True)
  # trim leading/trailing silence (anything more than 10 dB below the peak)
  speech, _ = librosa.effects.trim(speech, top_db=10)
  return speech
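
# Example (with one of the sample files shipped alongside this script):
# speech = load_data("Schnessen_Beispill.wav")   # 1-D numpy array at 16 kHz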
    
def asr_pipe(input_file):
  # preprocess the recording, then decode it in 3 s chunks with 1 s of
  # left/right context so chunk boundaries do not cut words apart
  speech = load_data(input_file)
  transcription = p({"sampling_rate": 16_000, "raw": speech},
                    chunk_length_s=3, stride_length_s=(1, 1))["text"]
  return transcription
    
# UI strings are Luxembourgish:
#   title:       "Speech recognition for Luxembourgish @uni.lu"
#   input label: "Here you can record your speech with the microphone"
#   output:      "Recognized text"
#   description: "This app converts your spoken language into (more or less correct ;-)) text!"
gr.Interface(asr_pipe,
             inputs=gr.Audio(sources=["microphone"], type="filepath",
                             label="Hei kënnt Dir Är Sprooch iwwert de Mikro ophuelen"),
             outputs=gr.Textbox(label="Erkannten Text"),
             title="Sproocherkennung fir d'Lëtzebuergescht @uni.lu",
             description="Dës App convertéiert Är geschwate Sprooch an de (méi oder manner richtegen ;-)) Text!",
             examples=[["ChamberMeisch.wav"], ["Chamber_Fayot_2005.wav"],
                       ["Erlieft-a-Verzielt.wav"], ["Schnessen_Beispill.wav"]],
             theme="default").launch()