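"""Gradio demo: Russian Oral History (RuOH) speech recognition with wav2vec2,
with optional speaker diarization via pyannote."""
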
import os

import gradio as gr
import soundfile as sf
import torch
from pyannote.audio import Pipeline
from transformers import pipeline, Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC

# pyannote/speaker-diarization-3.1 is a gated model, so HF_TOKEN must be set
# in the Space's environment.
HF_TOKEN = os.environ.get("HF_TOKEN")

model_name = "Mihaj/wav2vec2-large-xls-r-300m-ruOH-alphav"
bond005_model = "bond005/wav2vec2-large-ru-golos-with-lm"

# Pair the fine-tuned acoustic model with bond005's processor, whose n-gram
# decoder provides the language-model ("LM") decoding mode used below.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(bond005_model)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
)

pipeline_dia = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HF_TOKEN,
)
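
# Optional GPU placement (an addition, not in the original app): pyannote 3.x
# pipelines stay on CPU unless moved explicitly; the ASR pipe could likewise
# take device=0 in the pipeline(...) call above.
if torch.cuda.is_available():
    pipeline_dia.to(torch.device("cuda"))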


def transcribe(diarise, processing, audio):
    """Transcribe an uploaded file, optionally splitting it by speaker first."""
    y, sr = sf.read(audio)
    # Downmix multichannel audio: the ASR pipeline expects a 1-D waveform.
    if y.ndim > 1:
        y = y.mean(axis=1)
    if diarise:
        dia = pipeline_dia(audio)
        lines = []
        # to_lab() emits one "start end label" line per speaker turn (seconds).
        for line in dia.to_lab().split("\n"):
            if line.strip() == "":
                continue
            start_s, end_s, label = line.split()
            start = int(float(start_s) * sr)
            end = int(float(end_s) * sr)
            # Passing the sampling rate lets the pipeline resample if the
            # upload is not at the model's expected 16 kHz.
            segment = {"raw": y[start:end], "sampling_rate": sr}
            if processing == "LM":
                trans = pipe(segment)["text"]
            else:
                trans = pipe(segment, chunk_length_s=10, stride_length_s=(4, 2))["text"]
            lines.append(f"{start_s} {end_s} {label} : {trans}")
        text = "\n".join(lines)
    else:
        inputs = {"raw": y, "sampling_rate": sr}
        if processing == "LM":
            res = pipe(inputs)
        else:
            res = pipe(inputs, chunk_length_s=10, stride_length_s=(4, 2))
        text = res["text"]
    return text
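
# Direct (non-UI) usage sketch; "interview.wav" is a hypothetical local file:
#   print(transcribe(diarise=True, processing="LM", audio="interview.wav"))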


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Checkbox(label="Diarise", info="Do you need to separate speakers?"),
        gr.Radio(
            ["LM", "Striding"],
            label="Processing",
            info=(
                "Choose striding or language-model processing. Striding can "
                "handle longer files (over 10 s), while LM gives higher accuracy."
            ),
        ),
        gr.Audio(type="filepath"),
    ],
    outputs="text",
    title="Wav2Vec2 RuOH",
    description=(
        "Real-time demo of Russian Oral History recognition using a fine-tuned "
        "Wav2Vec2 large model and the language model from bond005: "
        "https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm"
    ),
)
iface.launch()