|
import gradio as gr |
|
from transformers import pipeline, Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC |
|
import os |
|
import soundfile as sf |
|
from pyannote.audio import Pipeline |
|
import torch |
|
# Hugging Face access token, needed for the gated pyannote diarisation model.
HF_TOKEN = os.environ.get("HF_TOKEN")



# Acoustic model fine-tuned for Russian Oral History recognition.
model_name = "Mihaj/wav2vec2-large-xls-r-300m-ruOH-alphav"

# bond005's Golos model — only its processor (tokenizer + LM decoder) is used here.
bond005_model = "bond005/wav2vec2-large-ru-golos-with-lm"

# Processor with an n-gram language model for beam-search decoding.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(bond005_model)

model = Wav2Vec2ForCTC.from_pretrained(model_name)

# ASR pipeline combining the ruOH acoustic model with the Golos LM decoder.
# NOTE(review): the full processor is passed as `tokenizer` — presumably the
# pipeline extracts the tokenizer from it; verify against transformers docs.
pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=processor, feature_extractor=processor.feature_extractor, decoder=processor.decoder)




# Speaker-diarisation pipeline (who spoke when); requires HF_TOKEN for access.
pipeline_dia = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",

                                        use_auth_token=HF_TOKEN)
|
|
|
def transcribe(diarise, processing, audio):
    """Transcribe an audio file, optionally splitting it by speaker first.

    Parameters
    ----------
    diarise : bool
        When True, run pyannote speaker diarisation and transcribe each
        speaker segment separately.
    processing : str
        "LM" decodes a clip in one pass with the language model; any other
        value (the UI offers "Striding") uses chunked decoding with striding,
        which supports clips longer than ~10 seconds.
    audio : str
        Path to the input audio file (Gradio ``type="filepath"``).

    Returns
    -------
    str
        The transcript. With diarisation, one
        ``"<start> <end> <speaker> : <text>"`` line per segment.
    """
    # y: sample array (frames[, channels]), sr: sample rate in Hz.
    # NOTE(review): assumes mono input — stereo would pass a 2-D array to
    # the ASR pipeline; confirm upstream audio is single-channel.
    y, sr = sf.read(audio)

    def _decode(samples):
        # Single choice point for LM vs. striding, instead of duplicating
        # the branch in both the diarised and non-diarised paths.
        if processing == "LM":
            return pipe(samples)["text"]
        return pipe(samples, chunk_length_s=10, stride_length_s=(4, 2))["text"]

    if not diarise:
        return _decode(y)

    # Diarise, then transcribe each labelled segment on its own.
    lines = []
    for line in pipeline_dia(audio).to_lab().split("\n"):
        if line.strip() == "":
            continue
        # .lab format: "<start_seconds> <end_seconds> <speaker_label>"
        res = line.split(" ")
        start = int(float(res[0]) * sr)  # seconds -> sample index
        end = int(float(res[1]) * sr)
        lines.append(f"{res[0]} {res[1]} {res[2]} : {_decode(y[start:end])}")
    return "\n".join(lines)
|
|
|
# Gradio UI: a checkbox (diarise), a radio (decoding strategy), and an audio
# upload passed to `transcribe` as a file path; output is plain text.
iface = gr.Interface(

    fn=transcribe,

    inputs=[gr.Checkbox(label="Diarise", info="Do you need to separate speakers?"), gr.Radio(["LM", "Striding"], label="Processing", info="You can choose Striding or Language Model processing. Striding allows processing of longer files (above 10 sec), while LM shows higher accuracy."), gr.Audio(type="filepath")],

    outputs="text",

    title="Wav2Vec2 RuOH",

    description=r"Realtime demo for Russian Oral History recognition using a fine-tuned Wav2Vec large model from bond005. https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm",

)



# Start the Gradio server (blocks until the app is stopped).
iface.launch()