Update app.py
app.py
CHANGED
@@ -2,118 +2,110 @@ import gradio as gr
import os
import random
import soundfile as sf
from transformers import pipeline
from datasets import load_dataset
from gradio_client import Client
from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META

-# 1.
-# Running locally ensures the user gets an immediate baseline result
whisper_asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

-# 2. Setup Private Backend Connection
HF_TOKEN = os.getenv("HF_TOKEN")
-#
-PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"

-def
-    """
    try:
        if "UA" in speaker_id:
-
-
-
-
-

-        #
-

-        audio_path = "
        sf.write(audio_path, sample["audio"]["array"], sample["audio"]["sampling_rate"])

        return audio_path, sample["text"], SPEAKER_META[speaker_id]
    except Exception as e:
-        return None, f"Error

-def
-    if
-        return "Please provide audio.", "", ""

    # A. Local Whisper Inference
-    w_raw = whisper_asr(
-    w_norm = w_raw

-    # B. Call Private Backend
-    # This keeps your specific stacking, Allosaurus, and Gemma logic secret
    try:
        client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
-        #
-        res_5k, res_10k = client.predict(
    except Exception as e:
-        res_5k = "Backend Connection Required"
-        res_10k = f"Error: {e}"

    return w_raw, res_5k, res_10k

-# UI
-with gr.Blocks(theme=gr.themes.
    gr.Markdown("# ⚕️ Torgo DSR Lab")
-    gr.Markdown("### Neural Reconstruction and Correction for

-    with gr.Tab("🔬
-        gr.Markdown("Select a speaker from the Torgo or UA-Speech datasets to compare standard ASR with our reconstruction layer.")
-
        with gr.Row():
            with gr.Column(scale=1):
-
-
                gr.Markdown("---")
-                audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Audio
-
            with gr.Column(scale=2):
-
-
-
-

-
-                gr.Markdown("#### Reconstruction Comparison")
-                w_out = gr.Textbox(label="Whisper Tiny Baseline (Uncorrected)")
-                with gr.Row():
-                    out_5k = gr.Textbox(label="5K Pure Model (Acoustic Expert)")
-                    out_10k = gr.Textbox(label="10K Triple-Mix Model (Linguistic Assistant)")
-
-        run_btn = gr.Button("🚀 Run Reconstruction Layer", variant="primary")

-    with gr.Tab("📊 Research
-        gr.Markdown("
-        gr.Markdown("
        gr.DataFrame(get_indomain_breakdown())

-        gr.Markdown("
-        gr.Markdown("
        gr.DataFrame(get_experimental_summary())
-
-        gr.Markdown("""
-        ### Key Scientific Findings
-        * **Severity Correlation:** Standard ASR performance drops significantly as severity increases. Our models provide the highest relative gain (+100%) in the 'Severe' category.
-        * **The Acoustic Floor:** The **5K Pure Model** (trained only on real data) provides the highest raw accuracy, proving that real-world articulatory distortions are essential for model grounding.
-        * **Linguistic Fluency:** The **10K Triple-Mix Model** incorporates synthetic data to provide grammatically structured output, making it more suitable for assistant-based communication.
-        * **Transfer Ability:** Our zero-shot tests on **UA-Speech (F02)** prove that the model has learned a generalized phonetic dictionary, outperforming Whisper on a completely foreign dataset.
-        """)

    # Event Logic
-    load_btn.click(
-
-        inputs=speaker_drop,
-        outputs=[audio_input, gt_display, meta_display]
-    )
-
-    run_btn.click(
-        run_lab_comparison,
-        inputs=audio_input,
-        outputs=[w_out, out_5k, out_10k]
-    )

demo.launch()

import os
import random
import soundfile as sf
+import re
from transformers import pipeline
from datasets import load_dataset
from gradio_client import Client
from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META

+# 1. Initialize Local Whisper (Baseline)
whisper_asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

+# 2. Setup Private Backend Connection (Hidden logic)
HF_TOKEN = os.getenv("HF_TOKEN")
+PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"  # Update with your private Space name

+def normalize_text(text):
+    """Simple normalization for comparison: lowercase and strip punctuation."""
+    return re.sub(r'[^\w\s]', '', text).lower().strip()
+
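As a quick illustration of what the normalization above produces (the sample strings are made up, not from the commit):

print(normalize_text("The COW, went home!"))  # -> "the cow went home"
# Case and punctuation no longer block an exact match:
assert normalize_text("The COW, went home!") == normalize_text("the cow went home")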
+def get_sample(speaker_id):
+    """Accesses HF Datasets via Streaming to get a sample for the UI."""
    try:
        if "UA" in speaker_id:
+            # Note: UA-Speech ID logic (Speaker F02)
+            path = "ngdiana/uaspeech_severity_high"
+            actual_spk = "F02"
+        else:
+            path = "unsw-cse/torgo"
+            actual_spk = speaker_id
+
+        # Stream dataset to avoid huge downloads
+        ds = load_dataset(path, split="test", streaming=True)
+        # Filter for the chosen speaker
+        speaker_ds = ds.filter(lambda x: x["speaker_id"] == actual_spk)

+        # Take a small buffer and pick a random sample
+        samples = list(speaker_ds.take(20))
+        sample = random.choice(samples)

+        audio_path = "sample_audio.wav"
        sf.write(audio_path, sample["audio"]["array"], sample["audio"]["sampling_rate"])

        return audio_path, sample["text"], SPEAKER_META[speaker_id]
    except Exception as e:
+        return None, f"Error accessing dataset: {e}", None

+def run_correction(audio_path, gt_text):
+    if audio_path is None: return "No audio input", "", ""

    # A. Local Whisper Inference
+    w_raw = whisper_asr(audio_path)["text"]
+    w_norm = normalize_text(w_raw)

+    # B. Call Private Backend for the 5K and 10K results
    try:
        client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
+        # Private app receives audio + normalized whisper, returns (5k_pred, 10k_pred)
+        res_5k, res_10k = client.predict(audio_path, w_norm, api_name="/predict_dsr_dual")
    except Exception as e:
+        res_5k, res_10k = "Backend Connection Required", f"Details: {e}"

    return w_raw, res_5k, res_10k
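The private Space's code is not part of this commit; below is a minimal sketch of a compatible backend, assuming it exposes a /predict_dsr_dual endpoint that takes (audio_path, whisper_hint) and returns two strings. The model logic is a placeholder, not the actual reconstruction stack:

import gradio as gr

def predict_dsr_dual(audio_path, whisper_hint):
    # Placeholder: the real Space would run the 5K and 10K reconstruction models here.
    return f"5K prediction for {audio_path}", f"10K prediction guided by '{whisper_hint}'"

with gr.Blocks() as backend:
    audio_in = gr.Audio(type="filepath")
    hint_in = gr.Textbox()
    pred_5k, pred_10k = gr.Textbox(), gr.Textbox()
    gr.Button("Run").click(
        predict_dsr_dual,
        inputs=[audio_in, hint_in],
        outputs=[pred_5k, pred_10k],
        api_name="predict_dsr_dual",  # reachable from gradio_client as /predict_dsr_dual
    )

backend.launch()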

+# UI Layout
+with gr.Blocks(theme=gr.themes.Default(), title="Torgo DSR Lab") as demo:
    gr.Markdown("# ⚕️ Torgo DSR Lab")
+    gr.Markdown("### Neural Reconstruction and ASR Correction for Torgo and UA-Speech")

+    with gr.Tab("🔬 Laboratory"):
        with gr.Row():
            with gr.Column(scale=1):
+                gr.Markdown("#### 1. Dataset Explorer")
+                spk_input = gr.Dropdown(list(SPEAKER_META.keys()), label="Select Speaker Profile")
+                load_btn = gr.Button("🎲 Load Random Dataset Sample")
                gr.Markdown("---")
+                audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Audio")
+
            with gr.Column(scale=2):
+                gr.Markdown("#### 2. Metadata & Ground Truth")
+                gt_box = gr.Textbox(label="Ground Truth (Human Label)", interactive=False)
+                meta_box = gr.JSON(label="Speaker Characteristics")
+
+                gr.Markdown("#### 3. Comparison Results")
+                w_out = gr.Textbox(label="Whisper Tiny Baseline (Raw Transcript)")
+                with gr.Row():
+                    out_5k = gr.Textbox(label="5K Pure Model (Acoustic Focus)")
+                    out_10k = gr.Textbox(label="10K Triple-Mix Model (Linguistic Focus)")

+        run_btn = gr.Button("🚀 Run Correction Layer", variant="primary")

+    with gr.Tab("📊 Research Statistics"):
+        gr.Markdown("# 🔬 Evaluation Metrics")
+        gr.Markdown("""
+        **Metric:** Exact Match Accuracy.
+        Calculated by comparing the **normalized prediction** (lowercase, no punctuation) against the **normalized ground truth**.
+        """)
+
+        gr.Markdown("### 1. In-Domain Torgo Breakdown (By Speaker)")
        gr.DataFrame(get_indomain_breakdown())

+        gr.Markdown("### 2. Experimental Milestone Summary")
+        gr.Markdown("_Note: The 10K model was utilized to test generalization via LOSO on unseen speaker F01._")
        gr.DataFrame(get_experimental_summary())

    # Event Logic
+    load_btn.click(get_sample, inputs=spk_input, outputs=[audio_input, gt_box, meta_box])
+    run_btn.click(run_correction, inputs=[audio_input, gt_box], outputs=[w_out, out_5k, out_10k])
demo.launch()