import gradio as gr
import librosa
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# Set up device and data type for torch based on GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f"Using device: {device}, torch_dtype: {torch_dtype}")
# Distil-Whisper checkpoint from the Hugging Face Model Hub
model_id = "distil-whisper/distil-large-v3"
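# distil-large-v3 is a distilled, English-only variant of Whisper large-v3,
# markedly faster at inference with only a small accuracy trade-off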
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
print(f"Model and processor loaded successfully: {model_id}")
def transcribe_speech(filepath):
    # With type="filepath", gr.Audio passes the path of the recording (or None)
    if filepath is None:
        return "No audio provided."
    # Load audio at the rate the model's feature extractor expects (16 kHz for Whisper)
    audio, sampling_rate = librosa.load(filepath, sr=processor.feature_extractor.sampling_rate)
    # Check audio length to decide on chunking
    audio_length_seconds = len(audio) / sampling_rate
    # Whisper decodes at most 30 s per pass, so longer clips need chunked decoding
    if audio_length_seconds > 30:
        chunk_length_s = 15
        batch_size = 2
    else:
        chunk_length_s = None
        batch_size = 1
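    # With chunk_length_s set, the pipeline splits the audio into overlapping
    # chunks, transcribes them in batches, and stitches the text back together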
    # Build the pipeline directly from the already-loaded model and processor
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        feature_extractor=processor.feature_extractor,
        tokenizer=processor.tokenizer,
        device=device,
        max_new_tokens=128,
        chunk_length_s=chunk_length_s,
        batch_size=batch_size,
        torch_dtype=torch_dtype,
    )
    # The pipeline runs feature extraction itself, so pass it the raw 16 kHz samples
    result = pipe(audio)
    return result["text"]
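# A quick local check (with a hypothetical file name) would be:
#   print(transcribe_speech("sample.wav"))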
with gr.Blocks() as demo:
    with gr.Tab("Transcribe Audio"):
        with gr.Row():
            audio_input = gr.Audio(label="Upload audio file or record", type="filepath")
        with gr.Row():
            audio_output = gr.Textbox(label="Transcription")
        with gr.Row():
            transcribe_button = gr.Button("Transcribe")
        # Run transcription when the button is clicked
        transcribe_button.click(transcribe_speech, inputs=[audio_input], outputs=[audio_output])

demo.launch(share=True)