# speech/app.py
import os
import gradio as gr
from transformers import pipeline
import numpy as np
import time
from typing import Optional, Tuple
import logging
import torch
# Create a logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Check that all required environment variables are set; fail fast otherwise.
required_variables = ["HF_TOKEN", "PASSWORD", "MODEL_NAME"]
for required_variable in required_variables:
    if not os.environ.get(required_variable):
        message = (
            f"Environment variable {required_variable} is not set. "
            "Please set it before running the application."
        )
        logger.error(message)
        raise ValueError(message)
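# Example configuration (hypothetical values; set these in the Space settings
# or your local shell before launching):
#   export HF_TOKEN=hf_xxx
#   export PASSWORD=change-me
#   export MODEL_NAME=openai/whisper-small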
# Create the transcription pipeline.
model_name = os.environ["MODEL_NAME"]
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Loading model {model_name} on device {device}...")
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device=device,
)
logger.info("Model loaded successfully.")
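# Note: HF_TOKEN is validated above but never passed to pipeline() explicitly.
# Recent huggingface_hub versions pick the token up from the environment
# automatically when downloading gated or private models; to be explicit, one
# could pass token=os.environ["HF_TOKEN"] to the pipeline() call (assuming a
# transformers version that supports the token keyword argument).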
# Start the app.
def main():
    interface = create_interface()
    interface.launch()
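# launch() uses Gradio's defaults, which are fine on Hugging Face Spaces; in a
# custom container one might instead need, for example,
# interface.launch(server_name="0.0.0.0", server_port=7860).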
# Create the Gradio interface for the Whisper transcription service.
def create_interface():
    # The UI is a block of Gradio components.
    with gr.Blocks() as interface:
        # Title.
        gr.Markdown("# Whisper Speech Transcription")
        # Row for the password input.
        with gr.Row():
            with gr.Column(scale=2):
                password_input = gr.Textbox(
                    label="Enter Password",
                    placeholder="Enter the password to access the transcription service",
                    type="password",
                )
        # Row for the audio input.
        with gr.Row():
            with gr.Column(scale=2):
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="numpy",
                    label="Record or Upload Audio",
                )
        # Row for the transcription button.
        with gr.Row():
            transcribe_button = gr.Button("Transcribe", variant="primary")
        # Row for the transcription output.
        with gr.Row():
            output_text = gr.Textbox(
                label="Transcription Output",
                placeholder="Transcription will appear here...",
                lines=5,
            )
        # Status message for the transcription time.
        status_text = gr.Textbox(
            label="Status",
            placeholder="Transcription status will appear here...",
            lines=1,
            interactive=False,
        )
        # Set up the transcribe button click event.
        transcribe_button.click(
            fn=transcribe_audio,
            inputs=[audio_input, password_input],
            outputs=[output_text, status_text],
        )
        # Also transcribe when audio is recorded or uploaded.
        audio_input.change(
            fn=transcribe_audio,
            inputs=[audio_input, password_input],
            outputs=[output_text, status_text],
        )
    return interface
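# Note: audio_input.change also fires when the recording is cleared, in which
# case Gradio passes None as the audio value; transcribe_audio below guards
# against that.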
def transcribe_audio(audio: Tuple[int, np.ndarray], password: Optional[str] = None) -> Tuple[str, str]:
    # If the password is wrong, return an error message.
    if password != os.environ.get("PASSWORD"):
        return "Incorrect password. Please try again.", ""
    # If there is no audio, return an error message.
    if audio is None:
        return "No audio detected. Please record some audio.", ""
    # Start measuring the time.
    start_time = time.time()
    # Unpack the audio into sampling rate and samples.
    sr, y = audio
    # Convert to mono if stereo.
    if y.ndim > 1:
        logger.debug(f"Converting {y.shape[1]} channels to mono")
        y = y.mean(axis=1)
    # Normalize the audio to the [-1, 1] range.
    y = y.astype(np.float32)
    max_abs = np.max(np.abs(y))
    if max_abs > 0:  # Avoid division by zero on silent input.
        y /= max_abs
    logger.info(f"Processing audio: {sr}Hz, {len(y)} samples (~{len(y)/sr:.2f}s)")
    # Run the transcription in 30-second chunks with a 6-second left stride.
    result = transcriber(
        {"sampling_rate": sr, "raw": y},
        chunk_length_s=30,
        stride_length_s=[6, 0],
    )
    logger.info("Transcription completed.")
    # Calculate the elapsed time.
    elapsed_time = time.time() - start_time
    audio_time = len(y) / sr
    status_string = (
        f"Transcription took {elapsed_time:.2f}s for {audio_time:.2f}s "
        f"of audio with model {model_name}."
    )
    return result["text"], status_string
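# A minimal smoke test (hypothetical helper, not wired into the app): it calls
# the handler directly with one second of silence to illustrate the expected
# (sampling_rate, samples) input shape.
def _smoke_test() -> None:
    sr = 16000
    silence = np.zeros(sr, dtype=np.float32)
    text, status = transcribe_audio((sr, silence), password=os.environ["PASSWORD"])
    logger.info(f"Smoke test result: {text!r} ({status})")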
# Entrypoint.
if __name__ == "__main__":
    main()