|
import gradio as gr |
|
from transformers import pipeline, Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC |
|
import os |
|
import soundfile as sf |
|
from pyannote.audio import Pipeline |
|
import torch |
|
# Hugging Face access token, needed for the gated pyannote diarisation model.
HF_TOKEN = os.environ.get("HF_TOKEN")



# Acoustic model fine-tuned for Russian Oral History recognition.
model_name = "Mihaj/wav2vec2-large-xls-r-300m-ruOH-alphav"

# bond005's Golos model — only its processor (tokenizer + LM decoder) is used here.
bond005_model = "bond005/wav2vec2-large-ru-golos-with-lm"

# Processor with an n-gram language model for beam-search decoding.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(bond005_model)

model = Wav2Vec2ForCTC.from_pretrained(model_name)

# ASR pipeline combining the ruOH acoustic model with the Golos LM decoder.
# NOTE(review): the full processor is passed as `tokenizer` — presumably the
# pipeline extracts the tokenizer from it; verify against transformers docs.
pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=processor, feature_extractor=processor.feature_extractor, decoder=processor.decoder)




# Speaker-diarisation pipeline (who spoke when); requires HF_TOKEN for access.
pipeline_dia = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",

                                        use_auth_token=HF_TOKEN)
|
|
|
def transcribe(diarise, processing, audio):
    """Transcribe an audio file, optionally splitting it by speaker first.

    Parameters
    ----------
    diarise : bool
        When True, run pyannote speaker diarisation and transcribe each
        speaker segment separately.
    processing : str
        "LM" decodes a clip in one pass with the language model; any other
        value (the UI offers "Striding") uses chunked decoding with striding,
        which supports clips longer than ~10 seconds.
    audio : str
        Path to the input audio file (Gradio ``type="filepath"``).

    Returns
    -------
    str
        The transcript. With diarisation, one
        ``"<start> <end> <speaker> : <text>"`` line per segment.
    """
    # y: sample array (frames[, channels]), sr: sample rate in Hz.
    # NOTE(review): assumes mono input — stereo would pass a 2-D array to
    # the ASR pipeline; confirm upstream audio is single-channel.
    y, sr = sf.read(audio)

    def _decode(samples):
        # Single choice point for LM vs. striding, instead of duplicating
        # the branch in both the diarised and non-diarised paths.
        if processing == "LM":
            return pipe(samples)["text"]
        return pipe(samples, chunk_length_s=10, stride_length_s=(4, 2))["text"]

    if not diarise:
        return _decode(y)

    # Diarise, then transcribe each labelled segment on its own.
    lines = []
    for line in pipeline_dia(audio).to_lab().split("\n"):
        if line.strip() == "":
            continue
        # .lab format: "<start_seconds> <end_seconds> <speaker_label>"
        res = line.split(" ")
        start = int(float(res[0]) * sr)  # seconds -> sample index
        end = int(float(res[1]) * sr)
        lines.append(f"{res[0]} {res[1]} {res[2]} : {_decode(y[start:end])}")
    return "\n".join(lines)
|
|
|
# Gradio UI: a checkbox (diarise), a radio (decoding strategy), and an audio
# upload passed to `transcribe` as a file path; output is plain text.
iface = gr.Interface(

    fn=transcribe,

    inputs=[gr.Checkbox(label="Diarise", info="Do you need to separate speakers?"), gr.Radio(["LM", "Striding"], label="Processing", info="You can choose Striding or Language Model processing. Striding allows processing of longer files (above 10 sec), while LM shows higher accuracy."), gr.Audio(type="filepath")],

    outputs="text",

    title="Wav2Vec2 RuOH",

    description=r"Realtime demo for Russian Oral History recognition using a fine-tuned Wav2Vec large model from bond005. https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm",

)



# Start the Gradio server (blocks until the app is stopped).
iface.launch()