# speech/app.py
import os
import gradio as gr
from transformers import pipeline
import numpy as np
import time
from typing import Optional, Tuple
import logging
import torch
# Create a logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Check that all required environment variables are set; fail fast otherwise.
required_variables = ["HF_TOKEN", "PASSWORD", "MODEL_NAME"]
for required_variable in required_variables:
    if not os.environ.get(required_variable):
        message = (
            f"Environment variable {required_variable} is not set. "
            "Please set it before running the application."
        )
        logger.error(message)
        raise ValueError(message)
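# Example configuration (hypothetical values; set these in the Space settings
# or your local shell before launching):
#   export HF_TOKEN=hf_xxx
#   export PASSWORD=change-me
#   export MODEL_NAME=openai/whisper-small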
# Create the transcription pipeline.
model_name = os.environ["MODEL_NAME"]
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Loading model {model_name} on device {device}...")
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device=device,
)
logger.info("Model loaded successfully.")
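# Note: HF_TOKEN is validated above but never passed to pipeline() explicitly.
# Recent huggingface_hub versions pick the token up from the environment
# automatically when downloading gated or private models; to be explicit, one
# could pass token=os.environ["HF_TOKEN"] to the pipeline() call (assuming a
# transformers version that supports the token keyword argument).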
# Start the app.
def main():
    interface = create_interface()
    interface.launch()
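# launch() uses Gradio's defaults, which are fine on Hugging Face Spaces; in a
# custom container one might instead need, for example,
# interface.launch(server_name="0.0.0.0", server_port=7860).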
# Create the Gradio interface for the Whisper transcription service.
def create_interface():
    # The UI is a block of Gradio components.
    with gr.Blocks() as interface:
        # Title.
        gr.Markdown("# Whisper Speech Transcription")
        # Row for the password input.
        with gr.Row():
            with gr.Column(scale=2):
                password_input = gr.Textbox(
                    label="Enter Password",
                    placeholder="Enter the password to access the transcription service",
                    type="password",
                )
        # Row for the audio input.
        with gr.Row():
            with gr.Column(scale=2):
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="numpy",
                    label="Record or Upload Audio",
                )
        # Row for the transcription button.
        with gr.Row():
            transcribe_button = gr.Button("Transcribe", variant="primary")
        # Row for the transcription output.
        with gr.Row():
            output_text = gr.Textbox(
                label="Transcription Output",
                placeholder="Transcription will appear here...",
                lines=5,
            )
        # Status message for the transcription time.
        status_text = gr.Textbox(
            label="Status",
            placeholder="Transcription status will appear here...",
            lines=1,
            interactive=False,
        )
        # Set up the transcribe button click event.
        transcribe_button.click(
            fn=transcribe_audio,
            inputs=[audio_input, password_input],
            outputs=[output_text, status_text],
        )
        # Also transcribe when audio is recorded or uploaded.
        audio_input.change(
            fn=transcribe_audio,
            inputs=[audio_input, password_input],
            outputs=[output_text, status_text],
        )
    return interface
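# Note: audio_input.change also fires when the recording is cleared, in which
# case Gradio passes None as the audio value; transcribe_audio below guards
# against that.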
def transcribe_audio(audio: Tuple[int, np.ndarray], password: Optional[str] = None) -> Tuple[str, str]:
    # If the password is wrong, return an error message.
    if password != os.environ.get("PASSWORD"):
        return "Incorrect password. Please try again.", ""
    # If there is no audio, return an error message.
    if audio is None:
        return "No audio detected. Please record some audio.", ""
    # Start measuring the time.
    start_time = time.time()
    # Unpack the audio into sampling rate and samples.
    sr, y = audio
    # Convert to mono if stereo.
    if y.ndim > 1:
        logger.debug(f"Converting {y.shape[1]} channels to mono")
        y = y.mean(axis=1)
    # Normalize the audio to the [-1, 1] range.
    y = y.astype(np.float32)
    max_abs = np.max(np.abs(y))
    if max_abs > 0:  # Avoid division by zero on silent input.
        y /= max_abs
    logger.info(f"Processing audio: {sr}Hz, {len(y)} samples (~{len(y)/sr:.2f}s)")
    # Run the transcription in 30-second chunks with a 6-second left stride.
    result = transcriber(
        {"sampling_rate": sr, "raw": y},
        chunk_length_s=30,
        stride_length_s=[6, 0],
    )
    logger.info("Transcription completed.")
    # Calculate the elapsed time.
    elapsed_time = time.time() - start_time
    audio_time = len(y) / sr
    status_string = (
        f"Transcription took {elapsed_time:.2f}s for {audio_time:.2f}s "
        f"of audio with model {model_name}."
    )
    return result["text"], status_string
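# A minimal smoke test (hypothetical helper, not wired into the app): it calls
# the handler directly with one second of silence to illustrate the expected
# (sampling_rate, samples) input shape.
def _smoke_test() -> None:
    sr = 16000
    silence = np.zeros(sr, dtype=np.float32)
    text, status = transcribe_audio((sr, silence), password=os.environ["PASSWORD"])
    logger.info(f"Smoke test result: {text!r} ({status})")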
# Entrypoint.
if __name__ == "__main__":
    main()