"""Gradio demo that transcribes uploaded or recorded audio with Distil-Whisper."""

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Set up device and data type for torch based on GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f"Using device: {device}, torch_dtype: {torch_dtype}")

# Correct the model_id if using from the Hugging Face Model Hub
model_id = "distil-whisper/distil-large-v3"

# Clips longer than this many seconds are transcribed in chunks.
LONG_AUDIO_THRESHOLD_S = 30
# Chunk length (seconds) and batch size used for long clips.
CHUNK_LENGTH_S = 15
CHUNK_BATCH_SIZE = 2

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
processor = AutoProcessor.from_pretrained(model_id)
model.to(device)
print(f"Model and processor loaded successfully: {model_id}")

# Build the pipeline once at start-up instead of on every request; chunking
# options are supplied per call in transcribe_speech().
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
)


def _to_mono_float32(samples):
    """Return *samples* as a 1-D, peak-normalised float32 waveform.

    Args:
        samples: NumPy array of shape ``(n,)`` or ``(n, channels)`` as
            delivered by ``gr.Audio(type="numpy")``.

    Returns:
        A 1-D float32 array scaled so its largest absolute value is 1.0
        (left unscaled when the input is pure silence).
    """
    if samples.ndim > 1:
        # The ASR pipeline only accepts single-channel audio: downmix.
        samples = samples.mean(axis=1)
    samples = samples.astype("float32")
    peak = float(abs(samples).max())
    if peak > 0:  # avoid dividing by zero on silent input
        samples = samples / peak
    return samples


def transcribe_speech(file_info):
    """Transcribe one audio clip coming from the Gradio Audio component.

    Args:
        file_info: ``(sample_rate, samples)`` tuple produced by
            ``gr.Audio(type="numpy")``, or ``None`` when no audio is present
            (for example after the user clears the component).

    Returns:
        The transcription text, or an empty string when there is no audio.
    """
    if file_info is None:
        return ""

    sampling_rate, samples = file_info
    if sampling_rate <= 0 or samples.size == 0:
        return ""

    # Check audio length to decide on chunking
    audio_length_seconds = samples.shape[0] / sampling_rate
    call_kwargs = {}
    if audio_length_seconds > LONG_AUDIO_THRESHOLD_S:
        call_kwargs = {
            "chunk_length_s": CHUNK_LENGTH_S,
            "batch_size": CHUNK_BATCH_SIZE,
        }

    # Passing the sampling rate alongside the raw waveform lets the pipeline
    # resample to the rate the model expects.
    # NOTE(review): resampling inside the pipeline appears to rely on
    # torchaudio being installed -- confirm for the deployment environment.
    result = asr_pipeline(
        {"sampling_rate": sampling_rate, "raw": _to_mono_float32(samples)},
        **call_kwargs,
    )
    return result["text"]


with gr.Blocks() as demo:
    with gr.Tab("Transcribe Audio"):
        with gr.Row():
            audio_input = gr.Audio(label="Upload audio file or record", type="numpy")
        with gr.Row():
            audio_output = gr.Textbox(label="Transcription")

    # Re-run transcription whenever the audio value changes (upload, finished
    # recording, or clear -- the latter yields an empty transcription).
    audio_input.change(
        transcribe_speech,
        inputs=[audio_input],
        outputs=[audio_output],
    )

if __name__ == "__main__":
    # Guarded so importing/reloading this module does not start a server.
    demo.launch(share=True)