Torgo-DSR-Lab / stats_data.py
st192011's picture
Update stats_data.py
f310c2d verified
import pandas as pd
# Speaker metadata used by the UI and for filtering.
# Each row is (speaker id, gender, severity, source dataset); the mapping below
# expands every row into a {"Gender", "Severity", "Dataset"} record keyed by id.
SPEAKER_META = {
    speaker_id: {"Gender": gender, "Severity": severity, "Dataset": dataset}
    for speaker_id, gender, severity, dataset in (
        ("F01", "Female", "Severe", "Torgo"),
        ("F03", "Female", "Mild", "Torgo"),
        ("F04", "Female", "Mild", "Torgo"),
        ("M01", "Male", "Moderate", "Torgo"),
        ("M02", "Male", "Mild", "Torgo"),
        ("M03", "Male", "Mild", "Torgo"),
        ("M04", "Male", "Moderate", "Torgo"),
        ("M05", "Male", "Severe", "Torgo"),
        ("F02 (UA)", "Female", "Severe", "UA-Speech"),
    )
}
def get_indomain_breakdown():
    """Build the per-speaker results table for the seen Torgo speakers.

    Returns:
        A DataFrame with one row per speaker and the columns "Speaker",
        "Severity", "Whisper Tiny", "5K Pure Model", "10K Triple-Mix" and
        "Relative Gain (Best)". All score columns are returned as display
        strings ending in "%"; the gain column is additionally prefixed
        with "+".
    """
    table = pd.DataFrame(
        {
            "Speaker": ["M05", "F01", "M01", "M04", "M02", "M03", "F03", "F04"],
            "Severity": ["Severe", "Severe", "Moderate", "Moderate", "Mild", "Mild", "Mild", "Mild"],
            "Whisper Tiny": [12.1, 12.6, 32.7, 31.8, 62.1, 58.4, 61.2, 59.1],
            "5K Pure Model": [33.1, 34.2, 47.2, 45.6, 84.5, 81.8, 83.5, 82.8],
            # Already display-formatted; one entry carries a "(LOSO)" annotation.
            "10K Triple-Mix": ["25.4%", "24.1% (LOSO)", "44.1%", "41.2%", "79.1%", "77.5%", "79.0%", "78.2%"],
        }
    )

    # Relative gain is measured for the 5K model against the Whisper Tiny
    # baseline, while both columns are still numeric.
    baseline = table["Whisper Tiny"]
    improvement = table["5K Pure Model"] - baseline
    gain_pct = ((improvement / baseline) * 100).round(1)
    table["Relative Gain (Best)"] = "+" + gain_pct.astype(str) + "%"

    # Turn the numeric score columns into display strings.
    for column in ("Whisper Tiny", "5K Pure Model"):
        table[column] = table[column].astype(str) + "%"

    return table
def get_experimental_summary():
    """Build the summary table for the three primary research conditions.

    The relative gain is derived from the table itself: for every condition
    the highest measured score among the fine-tuned model columns is compared
    with the "Whisper Tiny" baseline. Entries marked "N/A" are skipped when
    picking the best score.

    Returns:
        A DataFrame with the columns "Condition", "Whisper Tiny",
        "5K Pure Model", "10K Triple-Mix" and "Relative Gain (Best)".
        Scores are display strings with two decimals and a trailing "%"
        ("N/A" is kept verbatim); gains are signed strings with one decimal,
        e.g. "+41.6%".
    """
    baseline_name = "Whisper Tiny"
    candidate_names = ("5K Pure Model", "10K Triple-Mix")
    missing = "N/A"

    data = {
        "Condition": ["In-Domain (Seen Torgo)", "LOSO (Unseen Torgo F01)", "Zero-Shot (UA-Speech F02)"],
        "Whisper Tiny": [41.50, 12.38, 4.33],
        "5K Pure Model": [58.77, missing, 6.19],
        "10K Triple-Mix": [54.67, 24.76, 5.98],
    }

    def as_percent(value):
        # Fixed two decimals keeps the column visually aligned
        # (41.50% rather than 41.5%); the placeholder passes through as-is.
        return value if value == missing else f"{value:.2f}%"

    def relative_gain(baseline, candidates):
        measured = [score for score in candidates if score != missing]
        if not measured:
            return missing
        change = (max(measured) - baseline) / baseline * 100
        # "+" in the format spec emits the correct sign for gains and losses.
        return f"{change:+.1f}%"

    # Compute gains from the raw numbers before anything is turned into text,
    # so the gain column can never drift out of sync with the table values.
    gains = [
        relative_gain(baseline, candidates)
        for baseline, *candidates in zip(
            data[baseline_name], *(data[name] for name in candidate_names)
        )
    ]

    df = pd.DataFrame({"Condition": data["Condition"]})
    for name in (baseline_name, *candidate_names):
        df[name] = [as_percent(value) for value in data[name]]
    df["Relative Gain (Best)"] = gains
    return df