import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

model_id = "lyhourt/whisper-small-clean_6-v4"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,  # the pipeline splits long inputs into 30-second windows
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


def transcribe(audio_path):
    # Load the uploaded file and collapse multi-channel audio to mono.
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = waveform.mean(dim=0)

    # Split the audio into 30-second chunks so very long files are
    # transcribed piece by piece.
    chunk_length = 30 * sample_rate
    chunks = [
        waveform[i : i + chunk_length]
        for i in range(0, waveform.size(0), chunk_length)
    ]

    texts = []
    for chunk in chunks:
        # The pipeline expects a NumPy array plus its sampling rate; it
        # resamples to the model's expected 16 kHz and moves the data to
        # the right device itself, so the chunk stays on the CPU here.
        result = pipe({"raw": chunk.numpy(), "sampling_rate": sample_rate})
        texts.append(result["text"])

    # Concatenate the per-chunk transcripts.
    return " ".join(texts)


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs="text",
    title="Whisper Small Hungarian",
    description="Demo for Hungarian speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()
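
# A minimal sketch of an alternative transcribe(), assuming the default
# behaviour of transformers' ASR pipeline: given a file path it loads,
# resamples, and windows the audio itself (per chunk_length_s=30 above),
# so the manual chunking loop can be dropped entirely.
#
# def transcribe(audio_path):
#     return pipe(audio_path)["text"]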