"""Gradio demo that transcribes an uploaded audio file with Distil-Whisper."""

import os

import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Run on the first CUDA device in half precision when one is available;
# otherwise fall back to CPU with full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "distil-whisper/distil-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=25,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
)


def speech_to_text(audio_file):
    """Transcribe an audio file and return the recognized text.

    Args:
        audio_file: Path to the audio file, or an object exposing the path
            via a ``name`` attribute (some Gradio versions pass a temp-file
            wrapper for ``inputs="file"`` -- TODO confirm for the installed
            version).

    Returns:
        The transcription string on success, or a string starting with
        ``"Error: "`` describing what went wrong.
    """
    if audio_file is None:
        # Nothing was uploaded; report it instead of failing inside torchaudio.
        return "Error: no audio file provided"

    # Only unwrap non-path objects: ``pathlib.Path.name`` is just the final
    # path component, so path-like inputs must be passed through untouched.
    if isinstance(audio_file, (str, os.PathLike)):
        audio_path = audio_file
    else:
        audio_path = getattr(audio_file, "name", audio_file)

    # Broad catch is deliberate: this is the UI boundary, and any failure
    # (decoding, resampling, inference) should be reported to the user.
    try:
        waveform, sample_rate = torchaudio.load(audio_path)

        # Down-mix multi-channel audio to a single channel.
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # A bare array is assumed by the pipeline to already be at the
        # feature extractor's rate, so resample explicitly when it is not.
        target_rate = processor.feature_extractor.sampling_rate
        if sample_rate != target_rate:
            waveform = torchaudio.functional.resample(
                waveform, sample_rate, target_rate
            )

        print("pass to pipe")
        result = pipe(
            {"raw": waveform[0].numpy(), "sampling_rate": target_rate}
        )
        print("result", result)
        return result["text"]
    except Exception as e:
        print(f"Error: {str(e)}")
        # Return the message so the failure is visible in the output box
        # rather than showing an empty result.
        return f"Error: {e}"


iface = gr.Interface(
    fn=speech_to_text,
    inputs="file",
    outputs="text",
    title="Speech-to-Text",
)

if __name__ == "__main__":
    iface.launch()