# NOTE(review): the lines below are Hugging Face web-page residue captured by
# scraping (uploader, commit, "raw/history/blame", file size) — not Python code.
# Kept as a comment so the file parses:
#   shijunju's picture — Update app.py — b0c7e3c verified — raw / history / blame — 2.09 kB
import gradio as gr
import torch
from transformers import pipeline
import os
# Hugging Face Hub access token (required only if the model repo is private;
# None is fine for public models).
api_key = os.getenv("HF_TOKEN")

# Initialize the fine-tuned Whisper ASR pipeline once at import time.
whisper_asr_trained = pipeline(
    "automatic-speech-recognition",
    model="shijunju/whisper_turbo_1202",
    chunk_length_s=30,  # do not make it over 30 (Whisper's maximum context window)
    token=api_key,  # fix: the token was previously fetched but never used
    device="cuda" if torch.cuda.is_available() else "cpu",
)
def transcribe_speech(filepath):
    """Transcribe an audio file with the Whisper pipeline and save the text.

    Parameters:
        filepath: Path to the uploaded audio file, or None when nothing
            was uploaded.

    Returns:
        A ``(transcribed_text, text_file_path)`` tuple on success, or
        ``(error_message, None)`` when no file was given or transcription
        failed.
    """
    if filepath is None:
        return "No audio found, please upload a file.", None
    try:
        # return_timestamps=True makes the pipeline report per-chunk
        # timestamps, needed for audio longer than one chunk.
        output = whisper_asr_trained(filepath, return_timestamps=True)
        # Format each chunk as "(start, end)\n<text>", separated by blank lines.
        if isinstance(output, dict) and "chunks" in output:
            transcribed_text = "\n\n".join(
                str(chunk["timestamp"]) + "\n" + chunk["text"]
                for chunk in output["chunks"]
            )
        else:
            transcribed_text = output.get("text", str(output))
        # Persist the transcription so the UI can offer it as a download.
        # encoding="utf-8" fixes a crash on non-ASCII transcriptions under
        # locales whose default encoding cannot represent them (e.g. cp1252).
        text_file_path = "transcription.txt"
        with open(text_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(transcribed_text)
        return transcribed_text, text_file_path
    except Exception as e:  # boundary handler: surface the error in the UI
        return f"Error during transcription: {str(e)}", None
# Assemble the Gradio UI: upload controls on the top row, results below.
with gr.Blocks() as demo:
    gr.Markdown(
        "## Finetuned Whisper Turbo Speech to Text (CPU, each run takes about 2 min)"
    )

    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload MP3 or WAV File")
        transcribe_button = gr.Button("Transcribe Audio")

    with gr.Row():
        output_textbox = gr.Textbox(label="Transcribed Text", lines=10)
        download_link = gr.File(label="Download Transcription")

    # Wire the button directly to the transcription function: it already
    # returns a (text, file_path) pair matching the two output components.
    transcribe_button.click(
        fn=transcribe_speech,
        inputs=audio_input,
        outputs=[output_textbox, download_link],
    )

# Launch the Gradio app
demo.launch()