|
import json, glob, os |
|
import pandas as pd |
|
from typing import Dict, List |
|
|
|
def _read_rows(results_dir: str) -> pd.DataFrame:
    """Load every ``*.json`` result file in ``results_dir`` into one flat table.

    Each file is parsed as a JSON object whose values are result records
    (dicts); the keys of that object are ignored. One output row is produced
    per record.

    Args:
        results_dir: Directory scanned (non-recursively) for ``*.json`` files.

    Returns:
        A DataFrame with the columns ``Model``, ``Corpus``, ``Split``, ``WER``,
        ``CER``, ``Domain`` and ``Evaluated``. Fields missing from a record
        become ``None``, except ``Domain`` which defaults to ``""``. When no
        records are found the frame has these columns but zero rows, so
        ``.empty`` is still ``True``.
    """
    columns = ["Model", "Corpus", "Split", "WER", "CER", "Domain", "Evaluated"]
    rows = []
    # glob returns paths in filesystem-dependent order; sorting keeps the row
    # order reproducible across machines and runs.
    for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))):
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        rows.extend(
            {
                "Model": rec.get("model"),
                "Corpus": rec.get("corpus"),
                "Split": rec.get("split"),
                "WER": rec.get("wer"),
                "CER": rec.get("cer"),
                "Domain": rec.get("domain", ""),
                "Evaluated": rec.get("evaluated_at"),
            }
            for rec in data.values()
        )
    return pd.DataFrame(rows, columns=columns)
|
|
|
def per_dataset_tables(results_dir: str) -> Dict[str, pd.DataFrame]:
    """Build one ranking table per corpus from the results in ``results_dir``.

    Returns:
        A mapping ``{corpus_name: DataFrame}`` where each frame keeps only the
        ``Model``, ``WER`` and ``CER`` columns, is ordered by ascending WER and
        carries a fresh 0-based index. An empty dict is returned when no rows
        were read.
    """
    all_rows = _read_rows(results_dir)
    if all_rows.empty:
        return {}

    wanted = ["Model", "WER", "CER"]

    def _ranked(group: pd.DataFrame) -> pd.DataFrame:
        # Best (lowest) WER first, then trim to the published columns.
        ordered = group.sort_values("WER", ascending=True)
        trimmed = ordered.loc[:, wanted]
        return trimmed.reset_index(drop=True)

    return {
        corpus_name: _ranked(group)
        for corpus_name, group in all_rows.groupby("Corpus")
    }
|
|
|
def overall_table(results_dir: str) -> pd.DataFrame:
    """Return one row per model with its mean error rates.

    Columns of the returned frame:
      - ``Model``
      - ``Avg WER (all)`` / ``Avg CER (all)``: mean over every record of the
        model.
      - ``Avg WER (excl in)`` / ``Avg CER (excl in)``: mean over the records
        whose ``Domain`` is not ``"in"``; NaN for a model that has no such
        record.

    Rows are ordered by ascending ``Avg WER (all)`` (NaN last) and the index
    is renumbered from 0 in that order, as ``per_dataset_tables`` does for its
    tables. A completely empty DataFrame is returned when no records were
    read.
    """
    df = _read_rows(results_dir)
    if df.empty:
        return pd.DataFrame()

    metrics = ["WER", "CER"]
    # Missing or non-numeric metrics become NaN so that the means below are
    # always defined and the sort column is guaranteed to exist.
    df[metrics] = df[metrics].apply(pd.to_numeric, errors="coerce")

    agg_all = (
        df.groupby("Model")[metrics]
        .mean()
        .rename(columns={"WER": "Avg WER (all)", "CER": "Avg CER (all)"})
    )

    # Re-indexing on the full model list handles both "some models have only
    # in-domain results" and "every record is in-domain" uniformly: the
    # missing entries are filled with float NaN.
    agg_out = (
        df[df["Domain"] != "in"]
        .groupby("Model")[metrics]
        .mean()
        .rename(columns={"WER": "Avg WER (excl in)", "CER": "Avg CER (excl in)"})
        .reindex(agg_all.index)
    )

    merged = pd.concat([agg_all, agg_out], axis=1)
    # Sort while "Model" is still the index, then reset it, so the returned
    # frame has a clean 0..n-1 index in ranking order.
    return merged.sort_values("Avg WER (all)", ascending=True).reset_index()
|
|
|
def list_corpora(results_dir: str) -> List[str]:
    """Return the sorted, de-duplicated corpus names found in ``results_dir``.

    Records that carry no corpus name are ignored. An empty list is returned
    when no records were read.
    """
    df = _read_rows(results_dir)
    if df.empty:
        return []
    # A record without "corpus" is stored as None; sorting None together with
    # strings raises TypeError. Dropping them also keeps this list in line
    # with per_dataset_tables, whose groupby skips missing corpus keys.
    return sorted(df["Corpus"].dropna().unique().tolist())
|
|