import os

import gradio as gr
import soundfile as sf
import torch
from pyannote.audio import Pipeline
from transformers import pipeline, Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC

# A Hugging Face token is required to load the gated pyannote diarization pipeline.
HF_TOKEN = os.environ.get("HF_TOKEN")

model_name = "Mihaj/wav2vec2-large-xls-r-300m-ruOH-alphav"
bond005_model = "bond005/wav2vec2-large-ru-golos-with-lm"

# The processor (tokenizer + feature extractor + n-gram LM decoder) comes from the
# bond005 checkpoint, while the acoustic model is the fine-tuned RuOH checkpoint.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(bond005_model)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
)

pipeline_dia = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN
)


def transcribe(diarise, processing, audio):
    # The audio is assumed to already be at the model's expected sampling rate (16 kHz).
    y, sr = sf.read(audio)
    if diarise:
        # Run speaker diarization, then transcribe each speaker turn separately.
        dia = pipeline_dia(audio)
        lines = []
        for line in dia.to_lab().split("\n"):
            if line.strip() == "":
                continue
            # Each .lab line has the form "start end label" (times in seconds).
            start_s, end_s, label = line.split(" ")
            start = int(float(start_s) * sr)
            end = int(float(end_s) * sr)
            if processing == "LM":
                trans = pipe(y[start:end])["text"]
            else:
                trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
            lines.append(f"{start_s} {end_s} {label} : {trans}")
        text = "\n".join(lines)
    else:
        if processing == "LM":
            res = pipe(y)
        else:
            res = pipe(y, chunk_length_s=10, stride_length_s=(4, 2))
        text = res["text"]
    return text


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Checkbox(label="Diarise", info="Do you need to separate speakers?"),
        gr.Radio(
            ["LM", "Striding"],
            label="Processing",
            info=(
                "You can choose Striding or Language Model processing. Striding allows "
                "processing of longer files (above 10 sec), while LM shows higher accuracy."
            ),
        ),
        gr.Audio(type="filepath"),
    ],
    outputs="text",
    title="Wav2Vec2 RuOH",
    description=(
        "Realtime demo for Russian Oral History recognition using a fine-tuned Wav2Vec2 "
        "large model from bond005. "
        "https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm"
    ),
)

iface.launch()
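
# Usage sketch (assumptions: this file is saved as app.py, and a valid Hugging Face token
# is exported so the gated pyannote/speaker-diarization-3.1 pipeline can be downloaded):
#
#   HF_TOKEN=hf_xxx python app.py
#
# transcribe() can also be called directly for quick checks, e.g.
# transcribe(False, "LM", "sample.wav"), where "sample.wav" is a hypothetical path to a
# 16 kHz mono WAV file.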