# lip_reader / app / app.py
# Uploaded by omm7 via huggingface_hub (commit 84b62a9, verified)
from __future__ import annotations
from pathlib import Path
import subprocess
import tempfile
import imageio
import numpy as np
import streamlit as st
import tensorflow as tf
from modelutil import load_model
from utils import load_data, num_to_char
# ── Page config ───────────────────────────────────────────────────────────────
st.set_page_config(
page_title="LipNet - Silent Speech Recognition",
page_icon="πŸ‘„",
layout="wide",
)
# ── Custom CSS ────────────────────────────────────────────────────────────────
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=Space+Mono&display=swap');
html, body, [class*="css"] {
font-family: 'Syne', sans-serif;
background-color: #07070f;
color: #e2e2f0;
}
.stApp { background-color: #07070f; }
[data-testid="stSidebar"] {
background-color: #0f0f1c !important;
border-right: 1px solid #1e1e32;
}
[data-testid="stSidebar"] * { color: #9ca3af !important; }
h1 {
font-weight: 800 !important;
background: linear-gradient(135deg, #f0f0ff, #c084fc, #818cf8);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
letter-spacing: -0.03em;
}
h2, h3 { color: #c084fc !important; font-weight: 700 !important; }
.stAlert { border-radius: 10px !important; }
[data-testid="stInfo"] {
background: #0f0f1c !important;
border: 1px solid #2d2d4e !important;
color: #a5b4fc !important;
font-family: 'Space Mono', monospace;
font-size: 0.82rem;
}
[data-testid="stSuccess"] {
background: #0a1a14 !important;
border: 1px solid #1a3330 !important;
color: #34d399 !important;
font-family: 'Space Mono', monospace;
font-size: 1.1rem;
}
code, pre {
font-family: 'Space Mono', monospace !important;
background: #0a0a16 !important;
color: #a5b4fc !important;
border-radius: 8px !important;
font-size: 0.8rem !important;
}
[data-testid="stSelectbox"] label { color: #6b7280 !important; font-size: 0.8rem; letter-spacing: 0.1em; text-transform: uppercase; }
hr { border-color: #1a1a2e !important; }
</style>
""", unsafe_allow_html=True)
# ── Sidebar ───────────────────────────────────────────────────────────────────
# Static reference panel: model architecture summary and dataset facts.
# All content is hand-written HTML snippets, hence unsafe_allow_html=True.
with st.sidebar:
    st.markdown("## πŸ‘„ LipNet")
    st.markdown(
        "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
        "letter-spacing:0.1em;'>SILENT SPEECH RECOGNITION</p>",
        unsafe_allow_html=True,
    )
    st.divider()
    # Layer-by-layer summary of the network (matches modelutil.load_model).
    st.markdown("**Architecture**")
    st.markdown("""
<p style='font-family:Space Mono,monospace;font-size:0.72rem;line-height:2;color:#4b5563;'>
Conv3D(128) ↓<br>
Conv3D(256) ↓<br>
Conv3D(75) ↓<br>
Reshape ↓<br>
BiLSTM(128) ↓<br>
BiLSTM(128) ↓<br>
Dense(41) + CTC
</p>
""", unsafe_allow_html=True)
    st.divider()
    # Dataset provenance: single-speaker subset of the GRID corpus.
    st.markdown("**Dataset**")
    st.markdown(
        "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
        "line-height:2;'>GRID Corpus Β· Speaker S1<br>500 videos<br>"
        "450 train / 50 test<br>Vocab: a–z 1–9 ' ? ! (space)</p>",
        unsafe_allow_html=True,
    )
    st.divider()
    st.caption("No audio. Lips only.")
# ── Title ─────────────────────────────────────────────────────────────────────
st.title("LipNet - Silent Speech Recognition")
st.markdown(
    "<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;"
    "letter-spacing:0.15em;margin-top:-1rem;'>CONV3D + BILSTM + CTC Β· NO AUDIO REQUIRED</p>",
    unsafe_allow_html=True,
)
st.divider()
# ── Data paths ────────────────────────────────────────────────────────────────
# Resolve relative to this file so the app works regardless of the CWD it is
# launched from (Streamlit Cloud, HF Spaces, local `streamlit run`).
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / 'data' / 's1'
# sorted() consumes the generator directly; no intermediate list is needed.
options = sorted(item.name for item in DATA_DIR.glob('*.mpg'))
if not options:
    # Fail fast with actionable guidance instead of rendering an empty selectbox.
    st.error(f"No `.mpg` videos found in `{DATA_DIR}`. Make sure `data/s1/` is populated.")
    st.stop()
selected_video = st.selectbox("**Choose a video**", options)
file_path = DATA_DIR / selected_video
st.divider()
# ── Load model (cached) ───────────────────────────────────────────────────────
@st.cache_resource(show_spinner="Loading LipNet model...")
def get_model():
    """Build the LipNet model once per process; Streamlit caches the object."""
    lipnet = load_model()
    return lipnet


model = get_model()
# ── Load frames + alignment (cached per video) ────────────────────────────────
@st.cache_data(show_spinner="Processing video...")
def get_video_data(path: str):
    """Load preprocessed frames and alignment tokens for one video.

    Returns a (frames, tokens, label) tuple where label is the decoded
    ground-truth transcript string. Cached by Streamlit per path value.
    """
    frames, tokens = load_data(tf.convert_to_tensor(path))
    joined = tf.strings.reduce_join(num_to_char(tokens))
    label = joined.numpy().decode('utf-8')
    return frames, tokens, label


video_tensor, annotations, ground_truth = get_video_data(str(file_path))
# ── Two-column layout ─────────────────────────────────────────────────────────
col1, col2 = st.columns(2, gap="large")
# ── Column 1: Video preview + Ground truth ────────────────────────────────────
with col1:
    st.markdown("### πŸ“Ή Original Video")
    st.info("Video converted to mp4 for browser playback")
    # Create the temp file, then close our handle immediately: ffmpeg re-opens
    # the path itself, and holding the handle open while ffmpeg writes to it
    # fails on Windows (file locked by the open descriptor).
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    output_path = Path(tmp.name)
    try:
        # Browsers cannot play raw GRID .mpg clips; transcode to H.264 mp4.
        subprocess.run(
            ["ffmpeg", "-i", str(file_path), "-vcodec", "libx264",
             "-crf", "23", str(output_path), "-y"],
            check=True, capture_output=True, text=True,
        )
        st.video(output_path.read_bytes())
    except subprocess.CalledProcessError as exc:
        # Surface ffmpeg's stderr so the failure is debuggable from the UI.
        st.error("ffmpeg conversion failed.")
        st.code(exc.stderr or "No error output.")
    finally:
        # Remove the transcoded copy; its bytes were already handed to st.video.
        if output_path.exists():
            output_path.unlink()
# ── Column 2: Model inference ─────────────────────────────────────────────────
with col2:
    st.markdown("### 🧠 Model Inference")
    # ── Mouth crop GIF ────────────────────────────────────────────────────────
    st.info("Mouth crop - what the model actually sees (grayscale Β· normalized)")
    # Reserve a temp path for the GIF; close the handle so imageio can write it
    # on all platforms (Windows locks files with open descriptors).
    gif_file = tempfile.NamedTemporaryFile(suffix=".gif", delete=False)
    gif_file.close()
    gif_path = Path(gif_file.name)
    try:
        frames_np = video_tensor.numpy()
        gif_frames = []
        for frame in frames_np:
            g = frame[:, :, 0]
            # Min-max normalize each frame to [0, 1]; epsilon guards flat frames.
            g = (g - g.min()) / max(g.max() - g.min(), 1e-8)
            # Replicate the grayscale channel to RGB purely in numpy — no need
            # to round-trip through a TF op for this.
            rgb = (255 * np.stack([g, g, g], axis=-1)).astype("uint8")
            gif_frames.append(rgb)
        imageio.mimsave(str(gif_path), gif_frames, fps=10, loop=0)
        st.image(str(gif_path), width=400)
    finally:
        if gif_path.exists():
            gif_path.unlink()
    st.divider()
    # ── Raw tokens ────────────────────────────────────────────────────────────
    st.info("Raw CTC token indices from model output")
    yhat = model.predict(tf.expand_dims(video_tensor, axis=0), verbose=0)
    # Use the actual number of timesteps from the prediction rather than a
    # hard-coded 75, so decoding also works for clips of other lengths.
    decoded = tf.keras.backend.ctc_decode(
        yhat, input_length=[yhat.shape[1]], greedy=True
    )[0][0].numpy()
    st.code(str(decoded[0].tolist()), language=None)
    # ── Ground truth (moved here) ─────────────────────────────────────────────
    st.divider()
    st.info("Ground truth label (from `.align` file)")
    st.code(ground_truth, language=None)
    st.divider()
    # ── Final prediction ──────────────────────────────────────────────────────
    prediction = tf.strings.reduce_join(
        num_to_char(decoded[0])
    ).numpy().decode('utf-8').strip()
    st.success(f"**Prediction:** {prediction}")
    # ── Confidence ────────────────────────────────────────────────────────────
    # Mean of the per-timestep max softmax probabilities, as a percentage.
    confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
    st.markdown(
        f"<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;'>"
        f"AVG CONFIDENCE Β· <span style='color:#34d399'>{confidence:.1f}%</span></p>",
        unsafe_allow_html=True,
    )