Spaces:
Sleeping
Sleeping
| # check_eval_coverage.py | |
| import argparse, json, re | |
| from pathlib import Path | |
| import pandas as pd | |
def normalize_video_id(s: str) -> str:
    """Normalize a video_id coming from CSV/JSON/URL into a repo-relative path.

    Handles: None (returns ""), surrounding whitespace, full Hugging Face-style
    URLs (keeps only the part after /resolve/<branch>/), Windows backslash
    separators, and leading slashes.

    Args:
        s: Raw id/url value (may be None or any str-able object).

    Returns:
        Repo-relative path such as "Model/Action/file.mp4", or "" for None.
    """
    if s is None:
        return ""
    s = str(s).strip()
    # If it is a URL, keep only the repo-relative part after /resolve/<branch>/
    m = re.search(r"/resolve/[^/]+/(.+)$", s)
    if m:
        s = m.group(1)
    # Normalize Windows backslashes BEFORE stripping leading separators.
    # (The original order did lstrip("/") first, so a leading "\" became a
    # leading "/" that was never removed, breaking set comparisons.)
    s = s.replace("\\", "/")
    s = s.lstrip("/")
    return s
def action_from_video_id(video_id: str) -> str:
    """Return the action segment of a video id shaped like "Model/Action/file.mp4".

    Falls back to "UNKNOWN" when the normalized id has fewer than two
    path segments.
    """
    segments = normalize_video_id(video_id).split("/")
    return segments[1] if len(segments) >= 2 else "UNKNOWN"
def load_expected_ids(json_path: Path):
    """Read the videos JSON (a list of dicts) and return normalized video ids.

    Each entry's "id" is preferred, falling back to "url"; values are run
    through normalize_video_id and blank results are dropped.
    """
    entries = json.loads(json_path.read_text(encoding="utf-8"))
    normalized = [
        normalize_video_id(entry.get("id") or entry.get("url") or "")
        for entry in entries
    ]
    # Drop entries that had neither a usable id nor url.
    return [vid for vid in normalized if vid]
def main():
    """CLI entry point: report rating coverage for one participant.

    Compares the expected video ids (from --json) against the rows rated by
    --pid in --csv, printing missing / extra / duplicate videos plus per-action
    counts, and optionally saving the lists to text files (--save_lists).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--json", required=True, help="videos json (list of dicts with id/url)")
    parser.add_argument("--csv", required=True, help="results_extend.csv")
    parser.add_argument("--pid", default="YS", help="participant_id to check")
    parser.add_argument("--save_lists", action="store_true", help="save missing/extra to txt")
    args = parser.parse_args()

    expected_list = load_expected_ids(Path(args.json))
    expected_set = set(expected_list)

    df = pd.read_csv(Path(args.csv))
    # Guard against stray whitespace in the CSV header names.
    df.columns = [col.strip() for col in df.columns]
    if "participant_id" not in df.columns or "video_id" not in df.columns:
        raise SystemExit(f"CSV columns must include participant_id and video_id. Got: {list(df.columns)}")

    # Rows belonging to the requested participant, with normalized video ids.
    pid_mask = df["participant_id"].astype(str).str.strip() == args.pid
    df_pid = df[pid_mask].copy()
    df_pid["video_id_norm"] = df_pid["video_id"].map(normalize_video_id)
    rated_list = [vid for vid in df_pid["video_id_norm"].tolist() if vid]
    rated_set = set(rated_list)

    missing = sorted(expected_set - rated_set)
    extra = sorted(rated_set - expected_set)

    # Videos this participant saved more than once.
    rated_counts = pd.Series(rated_list).value_counts()
    dup_counts = rated_counts[rated_counts > 1].sort_values(ascending=False)

    print("\n=== SUMMARY ===")
    print(f"PID: {args.pid}")
    print(f"Expected videos (from JSON): {len(expected_list)} (unique={len(expected_set)})")
    print(f"Rated rows in CSV (for PID): {len(df_pid)}")
    print(f"Rated unique videos: {len(rated_set)}")
    print(f"Missing (expected - rated): {len(missing)}")
    print(f"Extra (rated - expected): {len(extra)}")
    print(f"Duplicate-rated videos: {len(dup_counts)}")

    # Per-action progress: expected count vs rated row count.
    exp_actions = pd.Series([action_from_video_id(vid) for vid in expected_list]).value_counts()
    rated_actions = pd.Series([action_from_video_id(vid) for vid in rated_list]).value_counts()
    action_table = (
        pd.DataFrame({"expected": exp_actions, "rated_rows": rated_actions})
        .fillna(0)
        .astype(int)
        .sort_values(["expected", "rated_rows"], ascending=False)
    )
    print("\n=== ACTION COUNTS (expected vs rated rows) ===")
    print(action_table.to_string())

    if missing:
        print("\n=== MISSING (first 50) ===")
        print("\n".join(missing[:50]))
    if extra:
        print("\n=== EXTRA (first 50) ===")
        print("\n".join(extra[:50]))
    if len(dup_counts) > 0:
        print("\n=== DUPLICATES (top 50) ===")
        print(dup_counts.head(50).to_string())

    if args.save_lists:
        Path("missing.txt").write_text("\n".join(missing) + ("\n" if missing else ""), encoding="utf-8")
        Path("extra.txt").write_text("\n".join(extra) + ("\n" if extra else ""), encoding="utf-8")
        Path("duplicates.txt").write_text(dup_counts.to_string() + "\n", encoding="utf-8")
        print("\nSaved: missing.txt, extra.txt, duplicates.txt")
# Script entry point: run the coverage check only when executed directly.
if __name__ == "__main__":
    main()