# Page header captured along with this file: "Spaces: Sleeping" (not part of the program).
# web/app.py
# Gradio-based web UI for VocalPrint AI (refactored to use the shared CLI logic).
import gradio as gr | |
import os | |
import tempfile | |
import whisper | |
import torch | |
import json | |
import sys | |
# Ensure the parent directory is on sys.path so the `core` package below can be imported.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) | |
from core.processor import ( | |
download_video, | |
extract_audio, | |
transcribe, | |
classify_accent, | |
compute_fluency | |
) | |
# Name of the Whisper checkpoint handed to whisper.load_model().
_WHISPER_MODEL_SIZE = "small"

# Loaded once at import time; process_video() passes this same instance to
# transcribe() on every call instead of reloading the model per request.
whisper_model = whisper.load_model(_WHISPER_MODEL_SIZE)
def process_video(url):
    """Run the accent / fluency analysis pipeline for one video URL.

    The video is downloaded into a scratch directory, its audio is extracted
    to ``audio.wav`` there, and the audio is passed to ``transcribe``,
    ``classify_accent`` and ``compute_fluency``. The scratch directory (and
    the media files inside it) is removed before the function returns, on
    success and on failure alike.

    Args:
        url: Address of the video to analyze, as entered in the UI textbox.

    Returns:
        A 6-tuple in the order expected by the interface outputs:
        ``(top_accent, "<confidence>%", fluency, language,
        transcript[:500], rows)`` where ``rows`` is a list of
        ``[accent, "<confidence>%"]`` pairs built from the ``top3`` result of
        ``classify_accent``.

        If any step raises, the exception is not propagated; instead
        ``("Error", "-", "-", "-", str(exc), [])`` is returned so the message
        shows up in the transcript field of the UI.
    """
    try:
        # TemporaryDirectory deletes the downloaded video and the extracted
        # audio when the block exits, including when a step below raises.
        with tempfile.TemporaryDirectory() as temp_dir:
            audio_path = os.path.join(temp_dir, "audio.wav")
            download_video(url, temp_dir)

            # download_video is only given the directory, so the resulting
            # file is located by its extension rather than by a fixed name.
            video_file = next(
                (name for name in os.listdir(temp_dir) if name.endswith(".mp4")),
                None,
            )
            if not video_file:
                raise FileNotFoundError("No .mp4 file found")

            extract_audio(os.path.join(temp_dir, video_file), audio_path)
            transcript, segments, language = transcribe(audio_path, whisper_model)
            top_accent, confidence, top3 = classify_accent(audio_path)
            fluency = compute_fluency(segments)

        # Format the top3 predictions as rows for the dataframe display.
        top3_formatted = [
            [item["accent"], f"{item['confidence']}%"] for item in top3
        ]
        return (
            top_accent,
            f"{confidence}%",
            fluency,
            language,
            transcript[:500],
            top3_formatted,
        )
    except Exception as e:
        # UI boundary: report the failure in the output fields instead of
        # letting the exception escape to the caller.
        return ("Error", "-", "-", "-", str(e), [])
# Input widget: one textbox that receives the video URL.
_url_input = gr.Textbox(
    label="Public Video URL (YouTube, Loom, MP4)",
    placeholder="https://...",
)

# Output widgets, listed in the same order as the tuple returned by
# process_video: five text fields followed by the top-3 table.
_result_outputs = [
    gr.Textbox(label=caption)
    for caption in (
        "Detected Accent",
        "Confidence (%)",
        "Fluency Score (0–100)",
        "Language Detected by Whisper",
        "Transcript Sample (first 500 chars)",
    )
]
_result_outputs.append(
    gr.Dataframe(
        headers=["Accent", "Confidence"],
        label="Top 3 Accent Predictions",
    )
)

# The Gradio interface wiring process_video to the widgets above.
iface = gr.Interface(
    fn=process_video,
    inputs=_url_input,
    outputs=_result_outputs,
    title="VocalPrint AI",
    description="Analyze English speech from a public video link to detect accent, fluency, and transcription.",
    allow_flagging="never",
    theme="default",
)

if __name__ == "__main__":
    # Launch the UI only when this file is executed as a script, not when
    # it is imported.
    iface.launch()