import os

import gradio as gr
import torch
from transformers import pipeline
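
# Read the Hugging Face access token from the environment; it is only needed
# when the model repo is private or gated.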
api_key = os.getenv("HF_TOKEN")
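
# Build the ASR pipeline once at startup; chunk_length_s=30 splits long audio
# into 30-second windows so recordings longer than Whisper's context still fit.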
whisper_asr_trained = pipeline(
    "automatic-speech-recognition",
    model="shijunju/whisper_turbo_1202",
    chunk_length_s=30,
    token=api_key,  # authenticates Hub downloads for private/gated repos
    device="cuda" if torch.cuda.is_available() else "cpu",
)


def transcribe_speech(filepath):
    """Transcribe the uploaded audio and return (transcript_text, txt_file_path)."""
    if filepath is None:
        return "No audio found, please upload a file.", None

    try:
        output = whisper_asr_trained(filepath, return_timestamps=True)
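
        # With return_timestamps=True the pipeline returns a dict like
        # {"text": ..., "chunks": [{"timestamp": (start, end), "text": ...}, ...]}.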
        if isinstance(output, dict) and "chunks" in output:
            transcribed_text = "\n\n".join(
                f"{chunk['timestamp']}\n{chunk['text']}" for chunk in output["chunks"]
            )
        elif isinstance(output, dict):
            transcribed_text = output.get("text", str(output))
        else:
            transcribed_text = str(output)

        # Write the transcript to disk so Gradio can offer it as a download.
        text_file_path = "transcription.txt"
        with open(text_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(transcribed_text)

        return transcribed_text, text_file_path
    except Exception as e:
        return f"Error during transcription: {e}", None
with gr.Blocks() as demo:
    gr.Markdown("## Fine-tuned Whisper Turbo Speech-to-Text (CPU, each run takes about 2 min)")

    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload MP3 or WAV File")
        transcribe_button = gr.Button("Transcribe Audio")

    with gr.Row():
        output_textbox = gr.Textbox(label="Transcribed Text", lines=10)
        download_link = gr.File(label="Download Transcription")
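
    # Bind the button directly to the transcription function; it returns the
    # transcript text and the saved file path for the two outputs below.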
    transcribe_button.click(
        fn=transcribe_speech,
        inputs=audio_input,
        outputs=[output_textbox, download_link],
    )
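
# On Hugging Face Spaces, plain launch() is sufficient; when running locally,
# demo.launch(share=True) would also create a temporary public URL.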

demo.launch()