import string

import soundfile
import librosa  # used to resample uploads that are not at the model's rate
import gradio as gr

from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text


# Model configuration: a multilingual (52-language) ESPnet2 ASR model from the
# ESPnet model zoo. `tag` must be defined before download_and_unpack(tag) is
# called below.
lang = 'multilingual'
fs = 16000  # the model expects 16 kHz audio
tag = 'ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best'

d = ModelDownloader()
speech2text = Speech2Text(
    **d.download_and_unpack(tag),
    device="cpu",
    minlenratio=0.0,
    maxlenratio=0.0,  # 0.0 = derive the length limits from the input
    ctc_weight=0.3,   # joint CTC/attention decoding
    beam_size=10,
    batch_size=0,
    nbest=1
)


def text_normalizer(text):
    # Uppercase and strip punctuation so hypotheses are comparable across
    # languages and with normalized references.
    text = text.upper()
    return text.translate(str.maketrans('', '', string.punctuation))
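
# A quick illustrative check of the normalizer (example values, not part of
# the original app):
#   >>> text_normalizer("Hello, world!")
#   'HELLO WORLD'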


def inference(audio):
    speech, rate = soundfile.read(audio.name)
    if speech.ndim > 1:
        speech = speech.mean(axis=1)  # downmix stereo uploads to mono
    if rate != fs:
        # Resample instead of rejecting audio that is not 16 kHz.
        speech = librosa.resample(speech, orig_sr=rate, target_sr=fs)
    nbests = speech2text(speech)
    text, *_ = nbests[0]  # each n-best entry starts with the hypothesis text
    return text_normalizer(text)
  
inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Textbox(label="Output Text")

title = "wav2vec 2.0"
description = "demo for Facebook AI wav2vec 2.0 using Hugging Face transformers. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2006.11477'>wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations</a> | <a href='https://github.com/pytorch/fairseq'>Github Repo</a> | <a href='https://huggingface.co/facebook/wav2vec2-base-960h'>Hugging Face model</a></p>"
examples = [
    ["poem.wav"]
]
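
# poem.wav is expected to sit next to this script; Gradio loads it when the
# example row is clicked.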

gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()
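
# When running locally, launch() serves the demo at http://127.0.0.1:7860 by
# default; pass share=True to launch() for a temporary public link.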