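"""Gradio demo: Hindi speech-to-text with the Vakyansh wav2vec2 model.

The app records microphone audio, resamples it to 16 kHz mono with ffmpeg,
runs Wav2Vec2ForCTC, and decodes the logits either greedily or with a
KenLM language model via Wav2Vec2ProcessorWithLM.
"""
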
import os
import subprocess

import gradio as gr
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM


def read_file_and_process(wav_file):
    # Derive the resampled file's name next to the input file;
    # os.path.splitext handles paths that contain extra dots.
    filename = os.path.splitext(wav_file)[0]
    filename_16k = filename + "16k.wav"
    resampler(wav_file, filename_16k)
    speech, _ = sf.read(filename_16k)
    # Pad and convert the waveform into model-ready PyTorch tensors.
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    return inputs


def resampler(input_file_path, output_file_path):
    # Downmix to mono and resample to 16 kHz, the rate the model expects.
    # Passing the arguments as a list avoids shell-quoting issues with file
    # names that contain spaces; -y overwrites a stale output file.
    command = [
        "ffmpeg", "-hide_banner", "-loglevel", "panic", "-y",
        "-i", input_file_path,
        "-ar", "16000", "-ac", "1", "-bits_per_raw_sample", "16", "-vn",
        output_file_path,
    ]
    subprocess.call(command)


def parse_transcription_with_lm(logits):
    # Beam-search decode with the bundled KenLM language model; batch_decode
    # expects raw logits as a numpy array and returns the decoded texts.
    result = processor_with_LM.batch_decode(logits.cpu().numpy())
    text = result.text
    # Drop any leftover <s> tokens from the decoded string.
    transcription = text[0].replace('<s>', '')
    return transcription

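# Greedy (argmax) CTC decoding without a language model.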
def parse_transcription(logits):
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription

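# Gradio callback: transcribe one recorded clip, with or without the LM.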
def parse(wav_file, applyLM):
    input_values = read_file_and_process(wav_file)
    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**input_values).logits

    if applyLM:
        return parse_transcription_with_lm(logits)
    else:
        return parse_transcription(logits)

    
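# Load the Vakyansh Hindi checkpoint once at startup; the with-LM processor
# bundles the tokenizer with a KenLM beam-search decoder.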
model_id = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
processor = Wav2Vec2Processor.from_pretrained(model_id)
processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

    
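# UI components (older Gradio API: `source=` was renamed `sources=` in Gradio 4).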
input_ = gr.Audio(source="microphone", type="filepath")
txtbox = gr.Textbox(label="Output from model will appear here:", lines=5)
chkbox = gr.Checkbox(label="Apply LM", value=False)


gr.Interface(
    parse,
    inputs=[input_, chkbox],
    outputs=txtbox,
    analytics_enabled=False,
    show_tips=False,
    enable_queue=True,
).launch(inline=False)
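# Note: inline=False opens the app in its own browser tab rather than
# embedding it in a notebook output cell.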