whisper-lule-sami-demo

Running on Zero

App Files Files Community

whisper-lule-sami-demo / app.py

versae

Update app.py

cff1674 verified 19 days ago

raw

history blame contribute delete

3.92 kB

	import html
	import os
	import tempfile
	import time
	from urllib.parse import parse_qs, urlparse

	import gradio as gr
	import pytube as pt
	import spaces
	import torch
	from huggingface_hub import model_info
	from transformers import pipeline

	# Checkpoint to load and an optional revision of it; both can be overridden
	# through environment variables.
	MODEL_NAME = os.environ.get("MODEL_NAME", "NbAiLab/salmon-whisper-large-smj-lr7e-5")
	MODEL_VERSION = os.environ.get("MODEL_VERSION", None)
	# Language code handed to the tokenizer when building the decoder prompt in `pipe`.
	lang = "fi"

	# SHARE counts as enabled when its first character is t, y or 1 (any case).
	# Slicing with [:1] instead of indexing [0] keeps an empty SHARE value from
	# raising IndexError, and the set lookup avoids the substring test matching "".
	_share_flag = os.environ.get("SHARE", "False").strip().lower()
	share = (_share_flag[:1] in {"t", "y", "1"}) or None
	# Falls back to True when AUTH_TOKEN is unset or empty.
	auth_token = os.environ.get("AUTH_TOKEN") or True
	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	print(f"Using device: {device}")

	@spaces.GPU(duration=120)
	def pipe(file, return_timestamps=False):
	    """Run speech recognition on ``file`` and return the raw pipeline output.

	    A new ``transformers`` pipeline is constructed on every call from the
	    module-level ``MODEL_NAME``, ``MODEL_VERSION``, ``device`` and
	    ``auth_token`` settings.

	    Args:
	        file: Audio input forwarded unchanged to the pipeline.
	        return_timestamps: Forwarded to the pipeline call; when false, the
	            decoder prompt is built with ``no_timestamps=True``.

	    Returns:
	        Whatever the pipeline returns; callers in this file read its
	        ``"text"`` and ``"chunks"`` entries.
	    """
	    pipeline_options = {
	        "task": "automatic-speech-recognition",
	        "model": MODEL_NAME,
	        "revision": MODEL_VERSION,
	        "chunk_length_s": 30,
	        "device": device,
	        "token": auth_token,
	    }
	    recognizer = pipeline(**pipeline_options)

	    # Force the decoder prompt to the configured language and the
	    # "transcribe" task before running inference.
	    prompt_ids = recognizer.tokenizer.get_decoder_prompt_ids(
	        language=lang,
	        task="transcribe",
	        no_timestamps=not return_timestamps,
	    )
	    recognizer.model.config.forced_decoder_ids = prompt_ids
	    # Left disabled, as in the original:
	    # asr.model.config.no_timestamps_token_id = asr.tokenizer.encode("<\|notimestamps\|>", add_special_tokens=False)[0]
	    return recognizer(file, return_timestamps=return_timestamps)

	def _format_timestamp(seconds):
	    """Render a chunk boundary as HH:MM:SS, or ``??:??:??`` when it is None."""
	    if seconds is None:
	        return "??:??:??"
	    return time.strftime("%H:%M:%S", time.gmtime(seconds))


	def transcribe(file, return_timestamps=False):
	    """Transcribe ``file`` and return the result as a single string.

	    Args:
	        file: Audio input passed through to ``pipe``.
	        return_timestamps: When false, return the pipeline's ``"text"``
	            entry. When true, return one line per chunk in the form
	            ``[HH:MM:SS -> HH:MM:SS] text``, joined with newlines.

	    Returns:
	        The transcription text.

	    Note:
	        The timestamp branch needs the ``time`` module, which must be
	        imported at the top of the file; without it this branch raises
	        NameError.
	    """
	    if not return_timestamps:
	        return pipe(file)["text"]

	    chunks = pipe(file, return_timestamps=True)["chunks"]
	    lines = []
	    for chunk in chunks:
	        # Either boundary may be None; it is then shown as ??:??:??.
	        start_time = _format_timestamp(chunk["timestamp"][0])
	        end_time = _format_timestamp(chunk["timestamp"][1])
	        lines.append(f"[{start_time} -> {end_time}] {chunk['text']}")
	    return "\n".join(lines)


	def _return_yt_html_embed(yt_url):
	    """Build the HTML snippet that embeds the YouTube video at ``yt_url``.

	    The video id is taken, in order of preference, from the ``v`` query
	    parameter, from the path of a ``youtu.be`` short link, or from the
	    second path segment of ``/embed/``, ``/shorts/``, ``/live/`` and ``/v/``
	    URLs. If none of these match, everything after the last ``?v=`` is used,
	    as before. The id is HTML-escaped because it comes from user input and
	    is placed inside an attribute value.

	    Args:
	        yt_url: URL string entered by the user.

	    Returns:
	        A ``<center><iframe ...>`` HTML string pointing at the embed URL.
	    """
	    parsed = urlparse(yt_url)
	    host = parsed.hostname or ""
	    path_parts = [part for part in parsed.path.split("/") if part]

	    video_id = ""
	    query_ids = parse_qs(parsed.query).get("v")
	    if query_ids:
	        # Standard watch URL; extra parameters such as &t=42s are ignored.
	        video_id = query_ids[0]
	    elif host == "youtu.be" or host.endswith(".youtu.be"):
	        video_id = path_parts[0] if path_parts else ""
	    elif len(path_parts) >= 2 and path_parts[0] in {"embed", "shorts", "live", "v"}:
	        video_id = path_parts[1]

	    if not video_id:
	        # Legacy behavior for inputs that are not recognised above.
	        video_id = yt_url.split("?v=")[-1]

	    video_id = html.escape(video_id, quote=True)
	    HTML_str = (
	        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
	        " </center>"
	    )
	    return HTML_str


	def yt_transcribe(yt_url, return_timestamps=False):
	    """Download the audio of a YouTube video and transcribe it.

	    The first audio-only stream is downloaded into a temporary directory
	    created for this call, so concurrent requests no longer overwrite a
	    shared ``audio.mp3`` and the file is removed afterwards.

	    Args:
	        yt_url: URL of the YouTube video.
	        return_timestamps: Forwarded to ``transcribe``.

	    Returns:
	        A tuple ``(html_embed_str, text)``: the embed HTML for the video and
	        the transcription.
	    """
	    yt = pt.YouTube(yt_url)
	    html_embed_str = _return_yt_html_embed(yt_url)
	    stream = yt.streams.filter(only_audio=True)[0]

	    with tempfile.TemporaryDirectory() as workdir:
	        # download() returns the path it actually wrote; use that path
	        # rather than rebuilding it from the arguments.
	        audio_path = stream.download(output_path=workdir, filename="audio.mp3")
	        # Transcribe inside the block: the file is deleted when it exits.
	        text = transcribe(audio_path, return_timestamps=return_timestamps)

	    return html_embed_str, text


	# Top-level container; the tabbed interface is attached to it further down.
	demo = gr.Blocks()

	# Tab for transcribing an uploaded or microphone-recorded audio file.
	# Only the audio path is wired in, so `transcribe` runs with its default
	# return_timestamps=False.
	mf_transcribe = gr.Interface(
	    fn=transcribe,
	    inputs=[
	        gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
	        # gr.components.Checkbox(label="Return timestamps"),
	    ],
	    outputs="text",
	    theme="huggingface",
	    title="Whisper Demo: Transcribe Audio",
	    description=(
	        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned"
	        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
	        " of arbitrary length."
	    ),
	    allow_flagging="never",
	)

	# Tab for transcribing a YouTube video from its URL.
	# NOTE: this rebinds the module-level name `yt_transcribe` from the function
	# to the Interface. `fn=` is evaluated before the assignment, so it still
	# receives the function.
	yt_transcribe = gr.Interface(
	    fn=yt_transcribe,
	    inputs=[
	        gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
	        # gr.components.Checkbox(label="Return timestamps"),
	    ],
	    examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
	    outputs=["html", "text"],
	    theme="huggingface",
	    title="Whisper Demo: Transcribe YouTube",
	    description=(
	        "Transcribe long-form YouTube videos with the click of a button! Demo uses the fine-tuned checkpoint:"
	        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
	        " arbitrary length."
	    ),
	    allow_flagging="never",
	)

	# Mount both interfaces as tabs inside the Blocks container.
	with demo:
	    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

	# Enable the queue before starting the server: queue() must be called on the
	# Blocks object, not on what launch() returns. `share` comes from the SHARE
	# environment variable instead of being hard-coded to True.
	demo.queue().launch(share=share)