# web/app.py
# Gradio-based web UI for VocalPrint AI (Refactored to use shared CLI logic)
import gradio as gr
import os
import tempfile
import whisper
import sys
# Ensure the repo root is on sys.path so core.processor resolves when this file is run directly from web/
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from core.processor import (
    download_video,
    extract_audio,
    transcribe,
    classify_accent,
    compute_fluency
)
# Load Whisper model once
whisper_model = whisper.load_model("small")
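# The "small" checkpoint trades some accuracy for speed; a larger checkpoint
# can be swapped in, or the device pinned explicitly. A sketch, relying on
# load_model's optional device argument:
#
#   whisper_model = whisper.load_model("medium", device="cuda")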
def process_video(url):
    try:
        temp_dir = tempfile.mkdtemp()
        audio_path = os.path.join(temp_dir, "audio.wav")

        # Download the video, then locate the resulting .mp4 (the downloader
        # chooses the filename itself)
        download_video(url, temp_dir)
        video_file = next((f for f in os.listdir(temp_dir) if f.endswith(".mp4")), None)
        if not video_file:
            raise FileNotFoundError("No .mp4 file found in the download directory")

        # Run the shared CLI pipeline: audio extraction, transcription,
        # accent classification, and fluency scoring
        extract_audio(os.path.join(temp_dir, video_file), audio_path)
        transcript, segments, language = transcribe(audio_path, whisper_model)
        top_accent, confidence, top3 = classify_accent(audio_path)
        fluency = compute_fluency(segments)

        # Format the top-3 predictions for the dataframe display
        top3_formatted = [[item["accent"], f"{item['confidence']}%"] for item in top3]

        return (
            top_accent,
            f"{confidence}%",
            fluency,
            language,
            transcript[:500],
            top3_formatted
        )
    except Exception as e:
        # Surface the error message in the transcript field so the UI stays usable
        return ("Error", "-", "-", "-", str(e), [])
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Textbox(label="Public Video URL (YouTube, Loom, MP4)", placeholder="https://..."),
    outputs=[
        gr.Textbox(label="Detected Accent"),
        gr.Textbox(label="Confidence (%)"),
        gr.Textbox(label="Fluency Score (0–100)"),
        gr.Textbox(label="Language Detected by Whisper"),
        gr.Textbox(label="Transcript Sample (first 500 chars)"),
        gr.Dataframe(headers=["Accent", "Confidence"], label="Top 3 Accent Predictions")
    ],
    title="VocalPrint AI",
    description="Analyze English speech from a public video link: detect the accent, score fluency, and transcribe the audio.",
    allow_flagging="never",
    theme="default"
)
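# launch() serves the UI locally and blocks; passing share=True would also
# request a temporary public gradio.live URL, handy for quick demos:
#
#   iface.launch(share=True)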
if __name__ == "__main__":
    iface.launch()