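"""Gradio demo for Hindi speech recognition with the Vakyansh wav2vec2 model.

Audio recorded from the microphone is converted to 16 kHz mono WAV and decoded
with CTC, either greedily or with an n-gram language model.
"""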
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
import gradio as gr
import scipy.signal as sps
import sox
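
# Convert arbitrary input audio to 16 kHz, 16-bit, mono signed-integer WAV via sox.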
def convert(inputfile, outfile):
    sox_tfm = sox.Transformer()
    sox_tfm.set_output_format(
        file_type="wav", channels=1, encoding="signed-integer", rate=16000, bits=16
    )
    sox_tfm.build(inputfile, outfile)
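
# Alternative loader for Gradio's numpy input: downmix to mono and resample to 16 kHz with scipy.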
def read_file(wav):
    sample_rate, signal = wav
    if signal.ndim > 1:
        signal = signal.mean(-1)  # average channels to mono
    number_of_samples = round(len(signal) * 16000 / sample_rate)
    resampled_signal = sps.resample(signal, number_of_samples)
    return resampled_signal
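
# Transcribe a WAV file with beam-search decoding through the language model.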
def parse_transcription_with_lm(wav_file):
    speech = convert_file(wav_file)
    inputs = processor_with_LM(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    result = processor_with_LM.batch_decode(logits.cpu().numpy())
    transcription = result.text[0]
    return transcription
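
# Convert the recorded file to 16 kHz WAV on disk and load it as a float array.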
def convert_file(wav_file):
    filename = wav_file.split('.')[0]
    convert(wav_file, filename + "16k.wav")
    speech, _ = sf.read(filename + "16k.wav")
    return speech
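
# Transcribe a WAV file: greedy (argmax) CTC decoding by default, LM decoding when requested.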
def parse_transcription(wav_file, apply_lm):
    # The "Apply LM" checkbox arrives as the second input from the Gradio interface.
    if apply_lm:
        return parse_transcription_with_lm(wav_file)
    speech = convert_file(wav_file)
    input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
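
# Load the Vakyansh Hindi wav2vec2 checkpoint and its processors (plain CTC and LM-boosted).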
model_id = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
processor = Wav2Vec2Processor.from_pretrained(model_id)
processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
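
# Gradio UI: microphone input, an "Apply LM" toggle, and a text box for the transcription.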
input_ = gr.Audio(source="microphone", type="filepath")
#input_ = gr.inputs.Audio(source="microphone", type="numpy")
txtbox = gr.Textbox(
label="Output from model will appear here:",
lines=5
)
chkbox = gr.Checkbox(label="Apply LM", value=False)
gr.Interface(parse_transcription, inputs=[input_, chkbox], outputs=txtbox,
             analytics_enabled=False, show_tips=False, enable_queue=True).launch(inline=False)