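"""Gradio Space for Whisper speech transcription.

Loads an automatic-speech-recognition pipeline for the model named by
MODEL_NAME and serves a password-protected transcription UI. The
HF_TOKEN, PASSWORD, and MODEL_NAME environment variables must be set
(see the startup check below).
"""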
import logging
import os
import time
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import torch
from transformers import pipeline
# Configure logging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Fail fast if any required environment variable is missing or empty.
required_variables = ["HF_TOKEN", "PASSWORD", "MODEL_NAME"]
for required_variable in required_variables:
    if not os.environ.get(required_variable):
        message = (
            f"Environment variable {required_variable} is not set. "
            "Please set it before running the application."
        )
        logger.error(message)
        raise ValueError(message)
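# A minimal sketch of setting these variables for a local run; the values
# below are placeholders, not ones used by this Space:
#   export HF_TOKEN=hf_xxx
#   export PASSWORD=change-me
#   export MODEL_NAME=openai/whisper-small
#   python app.py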
# Create the transcription pipeline.
model_name = os.environ["MODEL_NAME"]
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Loading model {model_name} on device {device}...")
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device=device,
)
logger.info("Model loaded successfully.")
# Start the app.
def main():
    interface = create_interface()
    interface.launch()
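# launch() accepts further options; for example, a temporary public link
# (a sketch, not used by this Space):
#   interface.launch(share=True)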
# Create the Gradio interface for the Whisper transcription service.
def create_interface():
    # The UI is a block of Gradio components.
    with gr.Blocks() as interface:
        # Title.
        gr.Markdown("# Whisper Speech Transcription")
        # Row for the password input.
        with gr.Row():
            with gr.Column(scale=2):
                password_input = gr.Textbox(
                    label="Enter Password",
                    placeholder="Enter the password to access the transcription service",
                    type="password",
                )
        # Row for the audio input.
        with gr.Row():
            with gr.Column(scale=2):
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="numpy",
                    label="Record or Upload Audio",
                )
        # Row for the transcription button.
        with gr.Row():
            transcribe_button = gr.Button("Transcribe", variant="primary")
        # Row for the transcription output.
        with gr.Row():
            output_text = gr.Textbox(
                label="Transcription Output",
                placeholder="Transcription will appear here...",
                lines=5,
            )
        # Status message for transcription time.
        status_text = gr.Textbox(
            label="Status",
            placeholder="Transcription status will appear here...",
            lines=1,
            interactive=False,
        )
        # Set up the transcribe button click event.
        transcribe_button.click(
            fn=transcribe_audio,
            inputs=[audio_input, password_input],
            outputs=[output_text, status_text],
        )
        # Also transcribe when audio is recorded/uploaded.
        audio_input.change(
            fn=transcribe_audio,
            inputs=[audio_input, password_input],
            outputs=[output_text, status_text],
        )
    return interface
def transcribe_audio(
    audio: Tuple[int, np.ndarray], password: Optional[str] = None
) -> Tuple[str, str]:
    # If the password is wrong, return an error message.
    if password != os.environ.get("PASSWORD"):
        return "Incorrect password. Please try again.", ""
    # If there is no audio, return an error message.
    if audio is None:
        return "No audio detected. Please record some audio.", ""
    # Start measuring the time.
    start_time = time.time()
    # Unpack the sampling rate and samples.
    sr, y = audio
    # Convert to mono if stereo.
    if y.ndim > 1:
        logger.debug(f"Converting {y.shape[1]} channels to mono")
        y = y.mean(axis=1)
    # Normalize audio to [-1, 1].
    y = y.astype(np.float32)
    max_abs = np.max(np.abs(y))
    if max_abs > 0:  # Avoid division by zero.
        y /= max_abs
    logger.info(f"Processing audio: {sr} Hz, {len(y)} samples (~{len(y)/sr:.2f}s)")
    # Run chunked long-form transcription: 30 s windows with a 6 s left stride.
    result = transcriber(
        {"sampling_rate": sr, "raw": y},
        chunk_length_s=30,
        stride_length_s=[6, 0],
    )
    logger.info("Transcription completed.")
    # Calculate elapsed time.
    elapsed_time = time.time() - start_time
    audio_time = len(y) / sr
    status_string = (
        f"Transcription took {elapsed_time:.2f}s for {audio_time:.2f}s "
        f"of audio with model {model_name}."
    )
    return result["text"], status_string
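# Quick manual check of transcribe_audio (a sketch; feeds one second of
# silence at 16 kHz with the configured PASSWORD):
#   silence = np.zeros(16000, dtype=np.float32)
#   text, status = transcribe_audio((16000, silence), os.environ["PASSWORD"])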
# Entrypoint.
if __name__ == "__main__":
    main()