File size: 2,914 Bytes
577c3f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f2a1b8
 
577c3f8
941714c
577c3f8
941714c
577c3f8
 
 
 
 
 
 
 
7f2a1b8
 
577c3f8
 
 
 
 
 
 
 
 
 
 
 
 
7f2a1b8
577c3f8
 
 
 
236ae01
577c3f8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import nltk
import librosa
import torch
import kenlm
import gradio as gr
from pyctcdecode import build_ctcdecoder
from transformers import Wav2Vec2Processor,Wav2Vec2ProcessorWithLM,Wav2Vec2ForCTC

# Fetch the NLTK "punkt" resource once at import time. fix_transcription_casing
# calls nltk.sent_tokenize, which presumably relies on this data being present.
nltk.download("punkt")

def return_processor_and_model(model_name):
    """Load the plain Wav2Vec2 processor and CTC model for ``model_name``.

    Both objects are created with ``from_pretrained`` on every call; nothing
    is cached by this function.

    Args:
        model_name: Identifier passed unchanged to ``from_pretrained``.

    Returns:
        A ``(processor, model)`` tuple.
    """
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    return processor, model

def return_processor_and_modelWithLM(model_name):
    """Load the LM-enabled Wav2Vec2 processor and CTC model for ``model_name``.

    Same as ``return_processor_and_model`` except that the processor is a
    ``Wav2Vec2ProcessorWithLM``. Both objects are created with
    ``from_pretrained`` on every call; nothing is cached by this function.

    Args:
        model_name: Identifier passed unchanged to ``from_pretrained``.

    Returns:
        A ``(processor, model)`` tuple.
    """
    processor_with_lm = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
    ctc_model = Wav2Vec2ForCTC.from_pretrained(model_name)
    return processor_with_lm, ctc_model

def load_and_fix_data(input_file):
  """Load an audio file as a mono waveform sampled at 16 kHz.

  Args:
    input_file: Path (or file-like object) accepted by ``librosa.load``.

  Returns:
    The 1-D waveform array returned by ``librosa.load``, at 16 000 Hz.
  """
  target_sample_rate = 16000
  # Ask librosa for the target rate and a mono down-mix directly.
  # With its defaults (sr=22050, mono=True) the audio was first resampled to
  # 22.05 kHz and then resampled again to 16 kHz, the manual stereo branch
  # could never run, and the positional ``librosa.resample`` call fails on
  # librosa versions where ``orig_sr``/``target_sr`` are keyword-only.
  speech, _ = librosa.load(input_file, sr=target_sample_rate, mono=True)
  return speech

def fix_transcription_casing(input_sentence):
  """Capitalize the first character of every sentence in ``input_sentence``.

  The text is split with ``nltk.sent_tokenize``; the resulting sentences are
  re-joined with single spaces.

  Args:
    input_sentence: Text to re-case.

  Returns:
    The re-cased text as one string.
  """
  recased = []
  for sentence in nltk.sent_tokenize(input_sentence):
    leading = sentence[0]
    # count=1 limits the replacement to the first occurrence, which is the
    # leading character itself.
    recased.append(sentence.replace(leading, leading.capitalize(), 1))
  return ' '.join(recased)
  

def predict_and_ctc_lm_decode(input_file, model_name):
  """Transcribe ``input_file`` using the LM-enabled processor's ``decode``.

  Args:
    input_file: Audio file handed to ``load_and_fix_data``.
    model_name: Model identifier handed to ``return_processor_and_modelWithLM``.

  Returns:
    The lower-cased transcription after ``fix_transcription_casing``.
  """
  lm_processor, ctc_model = return_processor_and_modelWithLM(model_name)
  waveform = load_and_fix_data(input_file)

  features = processor_inputs = lm_processor(waveform, return_tensors="pt", sampling_rate=16000)
  with torch.no_grad():
    model_output = ctc_model(processor_inputs.input_values)
    # Convert to NumPy and take the first (only) item of the batch before decoding.
    frame_logits = model_output.logits.cpu().detach().numpy()[0]

  decoded = lm_processor.decode(frame_logits)
  return fix_transcription_casing(decoded.text.lower())

def predict_and_greedy_decode(input_file, model_name):
  """Transcribe ``input_file`` by taking the arg-max token at every frame.

  Args:
    input_file: Audio file handed to ``load_and_fix_data``.
    model_name: Model identifier handed to ``return_processor_and_model``.

  Returns:
    The lower-cased transcription after ``fix_transcription_casing``.
  """
  plain_processor, ctc_model = return_processor_and_model(model_name)
  waveform = load_and_fix_data(input_file)

  processor_inputs = plain_processor(waveform, return_tensors="pt", sampling_rate=16000)
  with torch.no_grad():
    frame_logits = ctc_model(processor_inputs.input_values).logits

  # Greedy choice: highest-scoring id along the last dimension.
  best_ids = torch.argmax(frame_logits, dim=-1)
  decoded_batch = plain_processor.batch_decode(best_ids)

  first_transcript = decoded_batch[0]
  return fix_transcription_casing(first_transcript.lower())

def return_all_predictions(input_file, model_name):
  """Run both decoders on the same audio file and model.

  Args:
    input_file: Audio file to transcribe.
    model_name: Model identifier forwarded to both decoders.

  Returns:
    A ``(lm_decoded_text, greedy_decoded_text)`` tuple, in that order.
  """
  lm_decoded_text = predict_and_ctc_lm_decode(input_file, model_name)
  greedy_decoded_text = predict_and_greedy_decode(input_file, model_name)
  return lm_decoded_text, greedy_decoded_text


# Input widgets: an audio recorder/uploader that yields a file path, and a
# dropdown to pick which checkpoint to run.
audio_input = gr.inputs.Audio(source="microphone", type="filepath", label="Record/ Drop audio")
model_choice = gr.inputs.Dropdown(
    ["LuisG07/wav2vec2-large-xlsr-53-spanish", "jonatasgrosman/wav2vec2-xls-r-1b-spanish"],
    label="Model Name",
)

# Output widgets, in the same order as the tuple returned by
# return_all_predictions.
lm_output = gr.outputs.Textbox(label="Beam CTC decoding w/ LM")
greedy_output = gr.outputs.Textbox(label="Greedy decoding")

# Example rows are (audio file, model name) pairs matching the inputs above.
example_rows = [
    ["test1.wav", "LuisG07/wav2vec2-large-xlsr-53-spanish"],
    ["test2.wav", "LuisG07/wav2vec2-large-xlsr-53-spanish"],
]

demo = gr.Interface(
    return_all_predictions,
    inputs=[audio_input, model_choice],
    outputs=[lm_output, greedy_output],
    title="ASR using Wav2Vec2 & pyctcdecode in spanish",
    description="Comparing greedy decoder with beam search CTC decoder, record/ drop your audio!",
    layout="horizontal",
    examples=example_rows,
    theme="huggingface",
    enable_queue=True,
)
demo.launch()