|
|
import json |
|
|
from datetime import datetime |
|
|
from pathlib import Path |
|
|
from huggingface_hub import snapshot_download |
|
|
import tqdm.auto as tqdm |
|
|
from typing import Any, Dict, List, Tuple |
|
|
from collections import defaultdict |
|
|
from metric import _metric |
|
|
import os |
|
|
import pandas as pd |
|
|
|
|
|
# Enable the accelerated hf_transfer download backend for the Hugging Face hub.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
# Fail hub requests that stall for more than 20 seconds.
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "20"
# Local directory where competition datasets are cached (overridable via env).
COMP_CACHE = os.environ.get("COMP_CACHE", "./competition_cache")
|
|
|
|
|
|
|
|
def download_competition_data(competition_names: List[str]) -> None:
    """Mirror each competition dataset repo into the local cache directory.

    Args:
        competition_names: Hugging Face dataset repo ids to download.
    """
    hf_token = os.environ.get("HF_TOKEN")
    for name in tqdm.tqdm(competition_names):
        target_dir = os.path.join(COMP_CACHE, name)
        # Submission logs are large and unused downstream, so skip them.
        snapshot_download(
            repo_id=name,
            local_dir=target_dir,
            repo_type="dataset",
            token=hf_token,
            ignore_patterns="submission_logs/*",
        )
|
|
|
|
|
|
|
|
# Numeric status codes stored in submission_info files -> readable labels.
STATUS_MAP = {0: "PENDING", 1: "QUEUED", 2: "PROCESSING", 3: "SUCCESS", 4: "FAILED"}

# Ensure output directories exist before any results are written.
# NOTE(review): these use the literal "competition_cache" rather than
# COMP_CACHE — confirm the env override is meant to affect downloads only.
os.makedirs(Path("competition_cache") / "cached_results", exist_ok=True)
os.makedirs(Path("competition_cache") / "cached_results" / "by_team", exist_ok=True)
|
|
|
|
|
|
|
|
def load_teams(competition_space_path: Path) -> pd.DataFrame:
    """Read teams.json from the competition space; one row per team id."""
    teams_path = Path(competition_space_path) / "teams.json"
    # teams.json maps team_id -> record, so transpose to get rows per team.
    return pd.read_json(teams_path).T
|
|
|
|
|
|
|
|
def json_to_dataframe(data, extra_column_name=None, extra_column_value=None):
    """Flatten submission records into a DataFrame.

    Each entry's ``*_score`` dicts are dropped and replaced by ``*_time``
    columns holding the score's ``total_time`` (-1 when absent).  Optionally
    tags every row with a constant extra column (e.g. the team id).

    Args:
        data: Iterable of submission dicts (submission_info records).
        extra_column_name: Optional name of a constant column to add.
        extra_column_value: Value stored in the extra column for every row.

    Returns:
        pd.DataFrame with one row per entry in ``data``.
    """
    flat_data = []
    for entry in data:
        original_flat_entry = {**entry}
        # Keep everything except the nested *_score dicts ...
        flat_entry = {k: v for k, v in original_flat_entry.items() if "score" not in k}
        # ... which are reduced to their total_time ("public_score" -> "public_time").
        times = {
            k.replace("score", "time"): v.get("total_time", -1)
            for k, v in original_flat_entry.items()
            if "score" in k
        }
        flat_entry.update(times)
        if extra_column_name:
            flat_entry[extra_column_name] = extra_column_value
        flat_data.append(flat_entry)
    return pd.DataFrame(flat_data)
|
|
|
|
|
|
|
|
def load_submission_map(competition_space_path: Path) -> Tuple[Dict[str, str], pd.DataFrame]:
    """Load every team's submission_info JSON file.

    Args:
        competition_space_path: Root of a cached competition space.

    Returns:
        A tuple of:
          * mapping of submission_id -> member id who submitted it,
          * DataFrame summarising all submissions (one row each) with a
            human-readable ``status_reason`` column derived from STATUS_MAP.

    Raises:
        ValueError: if no submission_info files exist (empty concat).
    """
    submission_info_files = list((Path(competition_space_path) / "submission_info").glob("*.json"))

    team_submissions: Dict[str, str] = {}
    submission_summaries: List[pd.DataFrame] = []
    for file in submission_info_files:
        with open(file, "r") as fn:
            json_data = json.load(fn)
        submission_summaries.append(
            json_to_dataframe(
                data=json_data["submissions"], extra_column_name="team_id", extra_column_value=json_data["id"]
            )
        )
        # Reuse the already-parsed JSON instead of re-reading the same file
        # with pd.read_json (the original parsed every file twice).
        for submission in json_data["submissions"]:
            team_submissions[submission["submission_id"]] = submission["submitted_by"]
    submission_summary = pd.concat(submission_summaries, axis=0)
    submission_summary["status_reason"] = submission_summary["status"].apply(lambda x: STATUS_MAP[x])
    return team_submissions, submission_summary
|
|
|
|
|
|
|
|
def get_member_to_team_map(teams: pd.DataFrame, team_submissions: Dict[str, str]) -> Dict[str, str]:
    """Map each submitting member id to the id of the team they belong to.

    Args:
        teams: Teams table with ``id`` and ``members`` (list) columns.
        team_submissions: Mapping of submission_id -> member id.

    Returns:
        Mapping of member id -> team id (first team listing that member).
    """
    member_map: Dict[str, str] = {}
    for member_id in team_submissions.values():
        is_member = teams.members.apply(lambda row_members: member_id in row_members)
        member_map[member_id] = teams[is_member].id.values[0]
    return member_map
|
|
|
|
|
|
|
|
def load_submissions(competition_space_path: Path) -> Dict[str, Dict[str, pd.DataFrame]]:
    """Load every submission CSV, grouped as {team_id: {submission_id: df}}.

    File names follow ``<team_id>-<submission_id>.csv`` where the team id is
    the first five dash-separated tokens and the submission id is the rest.
    Each CSV is indexed by its ``id`` column.

    Args:
        competition_space_path: Root of a cached competition space.

    Returns:
        Nested mapping team_id -> submission_id -> submission DataFrame.
    """
    submissions: Dict[str, Dict[str, pd.DataFrame]] = defaultdict(dict)
    for file in (Path(competition_space_path) / "submissions").glob("*.csv"):
        # Path.stem is platform-independent; the original split on "/" and
        # would break on Windows paths.
        name_parts = file.stem.split("-")
        team_id = "-".join(name_parts[:5])
        sub_id = "-".join(name_parts[5:])
        submissions[team_id][sub_id] = pd.read_csv(file).set_index("id")
    return submissions
|
|
|
|
|
|
|
|
def compute_metric_per_team(
    solution_df: pd.DataFrame,
    team_submissions: Dict[str, pd.DataFrame],
    submission_summaries: pd.DataFrame,
    score_split: str = "source",
) -> Dict[str, Any]:
    """Score every submission of one team against the solution.

    Args:
        solution_df: Ground-truth solution indexed by sample id.
        team_submissions: Mapping of submission_id -> submission DataFrame.
        submission_summaries: Summary rows for this team (supplies the
            ``selected`` flag per submission).
        score_split: Solution column used to split scores; ``"source"`` also
            scores all samples combined (``use_all``).

    Returns:
        Mapping of submission_id -> metric dict (each split tagged with the
        submission's ``selected`` flag).  Submissions whose scoring raises
        are skipped with a log line rather than aborting the whole team.
    """
    results: Dict[str, Any] = {}
    for submission_id, submission in team_submissions.items():
        # Look up the submission's "selected" flag directly instead of the
        # original query/filter/reset_index/to_dict chain; the string "False"
        # default mirrors the original fallback when the row/column is absent.
        sel_rows = submission_summaries[submission_summaries["submission_id"] == submission_id]
        if "selected" in sel_rows.columns and len(sel_rows):
            selected = sel_rows["selected"].iloc[0]
        else:
            selected = "False"
        try:
            results[submission_id] = _metric(
                solution_df=solution_df,
                submission_df=submission,
                score_name=score_split,
                use_all=score_split == "source",
            )
            for key in (current_results := results[submission_id]):
                current_results[key]["selected"] = selected
        except Exception as e:
            # Best-effort: one failing submission must not kill the run.
            print("SKIPPING: ", submission_id, e)
    return results
|
|
|
|
|
|
|
|
def prep_public(public_results: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of a public score dict without plot-only fields.

    Drops ``proportion``, ``roc`` and ``original_source``; everything else is
    passed through unchanged.
    """
    excluded = ("proportion", "roc", "original_source")
    return {key: value for key, value in public_results.items() if key not in excluded}
|
|
|
|
|
|
|
|
def prep_private(private_results: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of a private score dict without plot-only fields.

    Drops ``proportion``, ``roc`` and ``anon_source``; everything else is
    passed through unchanged.
    """
    excluded = ("proportion", "roc", "anon_source")
    return {key: value for key, value in private_results.items() if key not in excluded}
|
|
|
|
|
|
|
|
def extract_roc(results: Dict[str, Any]) -> Dict[str, Any]:
    """Pull the ROC curve arrays plus the AUC out of a score dict.

    The nested ``roc`` dict (tpr/fpr/threshold arrays) is flattened to the
    top level; ``auc`` is copied through; everything else is discarded.
    """
    extracted: Dict[str, Any] = {}
    for name, payload in results.items():
        if name == "roc":
            extracted.update(payload)
        elif name == "auc":
            extracted[name] = payload
    return extracted
|
|
|
|
|
|
|
|
def add_custom_submission(path_to_cache, path_to_subfile, threshold=0):
    """Inject a hand-made submission into a cached competition space.

    Reads a raw score file (columns ``ID`` and ``Score``), thresholds the
    scores into ``generated``/``real`` predictions, registers a synthetic
    "insiders" team plus a submission-info record, and writes the submission
    CSV where the normal scoring pipeline will pick it up.

    Args:
        path_to_cache: Root of the cached competition space to modify.
        path_to_subfile: CSV file with ``ID`` and ``Score`` columns.
        threshold: Scores >= threshold are labelled "generated".
    """
    # Uses the module-level pandas/json imports; the original re-imported
    # them locally for no reason.
    data = pd.read_csv(path_to_subfile)
    data["id"] = data["ID"]
    data["score"] = data["Score"]
    data["pred"] = data["score"].apply(lambda a: "generated" if a >= threshold else "real")

    team_id = "insiders-id-1-2-3"
    team_name = "insiders"
    # Encode the threshold into the id so each threshold yields a distinct submission.
    submission_id = f"sub{threshold}".replace(".", "")

    # Register (or overwrite) the synthetic team.  Context managers replace
    # the original json.load(open(...)) calls that leaked file handles.
    with open(path_to_cache + "/teams.json") as f:
        teams = json.load(f)
    teams[team_id] = {"id": team_id, "name": team_name, "members": ["na"], "leader": "na"}

    with open(path_to_cache + "/teams.json", "w") as f:
        json.dump(teams, f, indent=4)

    submission_info_file = path_to_cache + f"/submission_info/{team_id}.json"

    if os.path.exists(submission_info_file):
        with open(submission_info_file) as f:
            temp = json.load(f)
    else:
        temp = {"id": team_id, "submissions": []}

    temp["submissions"].append(
        {
            "datetime": "2025-09-22 14:42:14",
            "submission_id": submission_id,
            "submission_comment": "",
            "submission_repo": "",
            "space_id": "",
            "submitted_by": "na",
            "status": 3,  # 3 == SUCCESS so the pipeline will score it
            "selected": True,
            "public_score": {},
            "private_score": {},
        }
    )

    with open(submission_info_file, "w") as f:
        json.dump(temp, f)

    data.loc[:, ["id", "pred", "score"]].to_csv(
        path_to_cache + f"/submissions/{team_id}-{submission_id}.csv", index=False
    )
|
|
|
|
|
|
|
|
def create_custom_subs():
    """Add one 'insiders' submission per threshold on a fixed score grid."""
    import numpy as np

    cache_dir = "competition_cache/safe-challenge/video-challenge-task-1-config"
    score_file = "competition_cache/custom/Scores-DSRI-brian.txt"
    # Ten evenly spaced cutoffs over the raw score range [-6, 0].
    for cutoff in np.linspace(-6, 0, 10):
        add_custom_submission(path_to_cache=cache_dir, path_to_subfile=score_file, threshold=cutoff)
|
|
|
|
|
|
|
|
def save_by_team(df: pd.DataFrame, save_path_base: str) -> None:
    """Write one CSV per team under competition_cache/cached_results/by_team/.

    Args:
        df: Results table containing a ``team`` column.
        save_path_base: File name used inside each team's directory.
    """
    frame = df.copy()
    by_team_root = "competition_cache/cached_results/by_team"
    for team in frame["team"].unique():
        team_dir = f"{by_team_root}/{team}"
        os.makedirs(team_dir, exist_ok=True)
        team_rows = frame[frame["team"] == team].copy()
        team_rows.to_csv(f"{team_dir}/{save_path_base}", index=False)
|
|
|
|
|
|
|
|
if __name__ == "__main__":

    # Competition spaces (HF dataset repos) to process in this run.
    spaces: List[str] = [
        "safe-challenge/video-challenge-pilot-config",
        "safe-challenge/video-challenge-task-1-config",
        "safe-challenge/video-challenge-task-2-config",
    ]
    download_competition_data(competition_names=spaces)

    # Optionally inject the hand-made "insiders" submissions before scoring.
    if os.environ.get("MAKE_CUSTOM"):
        print("adding custom subs")
        create_custom_subs()

    for space in spaces:
        local_dir = Path("competition_cache") / space

        # Load team roster, submission metadata and the submission CSVs.
        teams = load_teams(competition_space_path=local_dir)
        team_submissions, submission_summaries = load_submission_map(competition_space_path=local_dir)
        member_map = get_member_to_team_map(teams=teams, team_submissions=team_submissions)
        submissions = load_submissions(competition_space_path=local_dir)

        solutions_df = pd.read_csv(local_dir / "solution.csv").set_index("id")

        # Optional map.json remaps solution columns; best-effort.
        try:
            with open(local_dir / "map.json", "r") as fn:
                space_map = json.load(fn)
            for df_col, df_map in space_map.items():
                solutions_df[df_col] = solutions_df[df_col].map(df_map)
        except Exception as e:
            print("NO MAP FOUND.")
            pass

        # Anonymise categories: merge the camera categories, then map every
        # real_/generated_-stripped category name to a shared c_XX code.
        # Best-effort — spaces without a category column skip category scoring.
        prep_categories = False
        try:
            categories = {}
            for category in solutions_df["category"].unique():
                if category.replace("real_", "").replace("generated_", "") not in categories:
                    categories[category.replace("real_", "").replace("generated_", "")] = f"c_{len(categories):02d}"
            solutions_df.loc[solutions_df["category"] == "real_camera", "category"] = "camera"
            solutions_df.loc[solutions_df["category"] == "generated_camera", "category"] = "camera"
            solutions_df["category_og"] = solutions_df["category"].copy()
            solutions_df["category"] = solutions_df["category_og"].map(categories)
            prep_categories = True
        except Exception as e:
            print(f"CATEGORIES NOT UPDATED.")
            pass
        solutions_df.to_csv(local_dir / "solution-processed.csv", index=False)

        # Score per "source" split always; per "category" only when available.
        if prep_categories:
            scores = ["source", "category"]
        else:
            scores = ["source"]
        for score_name in scores:

            public, private, private_only, rocs = [], [], [], []

            # Only successfully processed submissions are scored, per team.
            for team_id, submission_set_ids in submission_summaries.query("status_reason=='SUCCESS'").groupby(
                "team_id"
            )["submission_id"]:

                submission_set = submissions[team_id]
                submission_set_ids_from_csvs = set(submission_set.keys())
                submission_set_ids = set(submission_set_ids)
                union = submission_set_ids | submission_set_ids_from_csvs

                # Reconcile the summary file against the CSVs actually present.
                if not (submission_set_ids.issubset(submission_set_ids_from_csvs)):
                    missing = union - submission_set_ids_from_csvs
                    print(f"not all submission csv files found for {team_id}, missing {len(missing)}")

                # Drop CSVs with no matching summary row.
                if submission_set_ids != submission_set_ids_from_csvs:
                    extra = union - submission_set_ids
                    print(f"extra {len(extra)} submissions in csvs than in summary file for team {team_id}")
                    print(f"dropping {extra}")
                    for submission_id in extra:
                        submission_set.pop(submission_id)

                results = compute_metric_per_team(
                    solution_df=solutions_df,
                    team_submissions=submission_set,
                    submission_summaries=submission_summaries.query(f'team_id=="{team_id}"'),
                    score_split=score_name,
                )
                # Split the raw metric output into the three leaderboards.
                public_results = {
                    key: prep_public(value["public_score"]) for key, value in results.items() if key in team_submissions
                }
                private_results = {
                    key: prep_private(value["private_score"])
                    for key, value in results.items()
                    if key in team_submissions
                }
                private_only_results = {
                    key: prep_private(value["private_only_score"])
                    for key, value in results.items()
                    if key in team_submissions
                }

                # Attach wall-clock runtimes from the submission summaries.
                public_times = {
                    x["submission_id"]: x["public_time"]
                    for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
                        ["submission_id", "public_time"]
                    ].to_dict(orient="records")
                }
                private_times = {
                    x["submission_id"]: x["private_time"]
                    for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
                        ["submission_id", "private_time"]
                    ].to_dict(orient="records")
                }
                # private_time covers the full run, so private-only is the difference.
                private_only_times = {
                    x["submission_id"]: x["private_time"] - x["public_time"]
                    for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
                        ["submission_id", "private_time", "public_time"]
                    ].to_dict(orient="records")
                }
                for key in public_results.keys():
                    public_results[key]["total_time"] = public_times[key]
                for key in private_results.keys():
                    private_results[key]["total_time"] = private_times[key]
                for key in private_only_results.keys():
                    private_only_results[key]["total_time"] = private_only_times[key]

                # ROC curves come from the private split only.
                roc_results = {
                    key: extract_roc(value["private_score"])
                    for key, value in results.items()
                    if key in team_submissions
                }
                roc_df = pd.json_normalize(roc_results.values())
                if len(roc_df) != 0:
                    roc_df.insert(loc=0, column="submission_id", value=roc_results.keys())
                    roc_df.insert(
                        loc=0,
                        column="team",
                        value=[
                            teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
                            for submission_id in roc_results.keys()
                        ],
                    )
                    # NOTE(review): submission_repo/datetime below are looked up
                    # by TEAM (first summary row of the team), not by submission
                    # id as the public/private tables do — confirm intended.
                    roc_df.insert(
                        loc=0,
                        column="submission_repo",
                        value=[
                            submission_summaries[
                                submission_summaries.team_id == member_map[team_submissions[submission_id]]
                            ].submission_repo.values[0]
                            for submission_id in roc_results.keys()
                        ],
                    )
                    roc_df.insert(
                        loc=0,
                        column="datetime",
                        value=[
                            submission_summaries[
                                submission_summaries.team_id == member_map[team_submissions[submission_id]]
                            ].datetime.values[0]
                            for submission_id in roc_results.keys()
                        ],
                    )
                    roc_df["label"] = roc_df.apply(
                        lambda x: f"AUC: {round(x['auc'], 2)} - {x['team']} - {x['submission_repo']}", axis=1
                    )
                    rocs.append(roc_df)

                # Public leaderboard rows for this team.
                public_df = pd.json_normalize(public_results.values())
                public_df.insert(
                    loc=0,
                    column="submission_id",
                    value=list(public_results.keys()),
                )
                public_df.insert(
                    loc=0,
                    column="team",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
                        for submission_id in public_results.keys()
                    ],
                )
                public_df.insert(
                    loc=0,
                    column="team_id",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
                        for submission_id in public_results.keys()
                    ],
                )
                public_df.insert(
                    loc=0,
                    column="datetime",
                    value=[
                        submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
                        for submission_id in public_results.keys()
                    ],
                )
                public.append(public_df)

                # Private leaderboard rows for this team.
                private_df = pd.json_normalize(private_results.values())
                private_df.insert(
                    loc=0,
                    column="submission_id",
                    value=list(private_results.keys()),
                )
                private_df.insert(
                    loc=0,
                    column="team",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
                        for submission_id in private_results.keys()
                    ],
                )
                private_df.insert(
                    loc=0,
                    column="team_id",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
                        for submission_id in private_results.keys()
                    ],
                )
                private_df.insert(
                    loc=0,
                    column="datetime",
                    value=[
                        submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
                        for submission_id in private_results.keys()
                    ],
                )
                private.append(private_df)

                # Private-only (private minus public time/score) rows.
                private_only_df = pd.json_normalize(private_only_results.values())
                private_only_df.insert(
                    loc=0,
                    column="submission_id",
                    value=list(private_only_results.keys()),
                )
                private_only_df.insert(
                    loc=0,
                    column="team",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
                        for submission_id in private_only_results.keys()
                    ],
                )
                private_only_df.insert(
                    loc=0,
                    column="team_id",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
                        for submission_id in private_only_results.keys()
                    ],
                )
                private_only_df.insert(
                    loc=0,
                    column="datetime",
                    value=[
                        submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
                        for submission_id in private_only_results.keys()
                    ],
                )
                private_only.append(private_only_df)

            # Concatenate all teams and rank by balanced accuracy.
            public = pd.concat(public, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
            private = pd.concat(private, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
            private_only = pd.concat(private_only, axis=0, ignore_index=True).sort_values(
                by="balanced_accuracy", ascending=False
            )
            # Explode to one row per (tpr, fpr, threshold) point for plotting.
            rocs = pd.concat(rocs, axis=0, ignore_index=True).explode(["tpr", "fpr", "threshold"], ignore_index=True)
            public.to_csv(
                Path("competition_cache")
                / "cached_results"
                / f"{str(local_dir).split('/')[-1]}_{score_name}_public_score.csv",
                index=False,
            )
            private.to_csv(
                Path("competition_cache")
                / "cached_results"
                / f"{str(local_dir).split('/')[-1]}_{score_name}_private_score.csv",
                index=False,
            )
            private_only.to_csv(
                Path("competition_cache")
                / "cached_results"
                / f"{str(local_dir).split('/')[-1]}_{score_name}_private_only_score.csv",
                index=False,
            )
            save_by_team(df=public, save_path_base=f"{str(local_dir).split('/')[-1]}_{score_name}_public.csv")
            save_by_team(df=private, save_path_base=f"{str(local_dir).split('/')[-1]}_{score_name}_private.csv")

            rocs.to_csv(
                Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_{score_name}_rocs.csv",
                index=False,
            )

            # Annotate and dump the full submission summary table as well.
            submission_summaries["team"] = submission_summaries["team_id"].apply(lambda a: teams.loc[a, "name"])

            submission_summaries.to_csv(
                Path("competition_cache")
                / "cached_results"
                / f"{str(local_dir).split('/')[-1]}_{score_name}_submissions.csv",
                index=False,
            )

    # Record the refresh timestamp in US/Eastern.  NOTE(review): this
    # `import datetime` shadows the module-level `from datetime import
    # datetime` (which is otherwise unused here).
    import datetime
    import pytz

    est_timezone = pytz.timezone("US/Eastern")
    current_time_est = datetime.datetime.now(est_timezone)

    formatted_time = current_time_est.strftime("%Y-%m-%d %H:%M:%S %Z")

    formatted = f"Updated on {formatted_time}"
    with open("competition_cache/updated.txt", "w") as file:
        file.write(formatted)
|
|
|