import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
import gradio as gr
import subprocess

def read_file_and_process(wav_file):
    # Resample the recording to 16 kHz mono, then build padded model inputs.
    filename = wav_file.split('.')[0]
    filename_16k = filename + "16k.wav"
    resampler(wav_file, filename_16k)
    speech, _ = sf.read(filename_16k)
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    return inputs

def resampler(input_file_path, output_file_path):
    # Convert the input audio to 16 kHz, mono, 16-bit PCM with ffmpeg.
    command = (
        f"ffmpeg -hide_banner -loglevel panic -i {input_file_path} -ar 16000 -ac 1 -bits_per_raw_sample 16 -vn "
        f"{output_file_path}"
    )
    subprocess.call(command, shell=True)
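# Note: the resampler shells out to ffmpeg, so the binary must be available on PATH.
# A quick sanity check one could run beforehand (a small sketch, not part of the app):
# subprocess.run(["ffmpeg", "-version"], check=True, stdout=subprocess.DEVNULL)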

def parse_transcription_with_lm(logits):
    # Beam-search decode with the processor's language model, then strip the <s> token.
    result = processor_with_LM.batch_decode(logits.cpu().numpy())
    text = result.text
    transcription = text[0].replace('<s>', '')
    return transcription

def parse_transcription(logits):
    # Greedy (argmax) CTC decoding without a language model.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription

def parse(wav_file, applyLM):
    # Run the acoustic model on the recording and decode with or without the LM.
    input_values = read_file_and_process(wav_file)
    with torch.no_grad():
        logits = model(**input_values).logits
    if applyLM:
        return parse_transcription_with_lm(logits)
    else:
        return parse_transcription(logits)

model_id = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
processor = Wav2Vec2Processor.from_pretrained(model_id)
processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

# Gradio UI: microphone input, a text box for the transcription, and an LM toggle.
input_ = gr.Audio(source="microphone", type="filepath")
txtbox = gr.Textbox(
    label="Output from model will appear here:",
    lines=5,
)
chkbox = gr.Checkbox(label="Apply LM", value=False)

gr.Interface(parse, inputs=[input_, chkbox], outputs=txtbox,
             streaming=True, interactive=True,
             analytics_enabled=False, show_tips=False,
             enable_queue=True).launch(inline=False)