import json
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd

import metric  # local module providing the competition metric

_metric = metric._metric


# Load one team's submission-info JSON and return its submissions as a
# DataFrame tagged with the team id.
def get_submission(f):
    with open(f) as fh:
        submission_info = json.load(fh)
    submissions = pd.DataFrame(submission_info["submissions"])
    submissions["team_id"] = submission_info["id"]
    return submissions


# def get_submissions_file(f):
#     submission_df = pd.read_csv(f).set_index("id")
#     if isinstance(submission_df.iloc[0]["score"], str):
#         submission_df.loc[:, "score"] = submission_df.loc[:, "score"].apply(
#             lambda a: json.loads(re.sub(r"\b(\d+)\.(?!\d)", r"\1.0", a))[0]
#             if isinstance(a, str)
#             else float("nan")
#         )
#     return submission_df


# Read a submission CSV. If scores were serialized as JSON strings, parse them
# back into floats; the regex rewrites bare trailing decimals such as "3." to
# "3.0" so that json.loads accepts them.
def get_submissions_file(f):
    submission_df = pd.read_csv(f).set_index("id")
    if isinstance(submission_df.iloc[0]["score"], str):
        submission_df.loc[:, "score"] = submission_df.loc[:, "score"].apply(
            lambda a: float(
                np.array(json.loads(re.sub(r"\b(\d+)\.(?!\d)", r"\1.0", a))).squeeze()
                if isinstance(a, str)
                else float("nan")
            )
        )
    return submission_df


# Collect all submissions for a task directory, attach team names, and resolve
# the path to each submission file.
def load_results(local_dir):
    team_file_name = "teams.json"
    team_info = pd.read_json(Path(local_dir) / team_file_name).T
    team_info.loc["baselines", "name"] = "baselines"
    submission_info_dir = "submission_info"
    submission_info_files = list((Path(local_dir) / submission_info_dir).glob("*.json"))
    # submission_info_files += ["baselines/baselines.json"]
    submissions = pd.concat(
        [get_submission(f) for f in submission_info_files], ignore_index=True
    )
    submissions.loc[:, "team"] = team_info.loc[
        submissions["team_id"].values, "name"
    ].values
    submissions["submission_files"] = submissions.apply(
        lambda a: (
            str(
                Path(local_dir)
                / "submissions"
                / (a["team_id"] + "-" + a["submission_id"] + ".csv")
            )
            if a["team_id"] != "baselines"
            else str(
                Path("baselines") / (a["team_id"] + "-" + a["submission_id"] + ".csv")
            )
        ),
        axis=1,
    )
    submissions = submissions.drop(columns=["public_score", "private_score"])
    submissions["submission"] = (
        submissions["team"] + " - " + submissions["submission_repo"]
    )
    return submissions


# Score every successful submission against solution.csv and assemble the
# private/public leaderboards.
def compute_metrics(submissions, local_dir, admin=True):
    submissions = submissions.query("status==3.0")  # status 3 == SUCCESS (see status_lookup)
    # if not admin:
    #     selected_by_team = submissions.groupby("team")["selected"].sum()
    #     teams_no_selected = selected_by_team.index[selected_by_team == 0]
    #     submissions.loc[submissions.team.isin(teams_no_selected), "selected"] = True
    #     submissions = submissions.query("selected")
    solution_df = pd.read_csv(Path(local_dir) / "solution.csv").set_index("id")
    results = {"private_score": [], "public_score": []}
    fields = ["team_id", "team", "submission_id", "submission_repo"]
    for _, row in submissions.iterrows():
        # r = pd.read_csv(row["submission_files"]).set_index("id")
        r = get_submissions_file(row["submission_files"])
        scores = _metric(
            solution_df,
            r,
            mode="detailed",
            admin=admin,
            additional_columns=(
                ["augmentation"] if "augmentation" in solution_df.columns else None
            ),
        )
        for m in ["private_score", "public_score"]:
            for f in fields:
                scores[m][f] = row[f]
            scores[m]["submission"] = f"{row.team} - {row.submission_repo}"
            scores[m] = pd.Series(scores[m]).to_frame().T
            results[m].append(scores[m])
    for m in ["private_score", "public_score"]:
        temp = pd.concat(results[m], ignore_index=True).T
        temp.index.name = "metric"
        temp = temp.reset_index()
        # def parse(s):
        #     if any(p in s for p in ["generated", "pristine"]):
        #         s = s.split("_")
        #         return pd.Series(dict(pred=s[0], source="_".join(s[1:])))
        #     else:
        #         return pd.Series(dict(pred=s, source=None))
        # temp = pd.concat([temp, temp["metric"].apply(parse)], axis=1)
        # results[m] = temp.set_index(["pred", "source"])
        # results[m] = results[m].drop(columns=["metric"]).T
        results[m] = (
            temp.set_index("metric")
            .T.sort_values("balanced_accuracy", ascending=False)
            .drop_duplicates(subset=["team", "submission_repo"])
        )
        if not admin:
            # only show each team's top submission
            results[m] = (
                results[m]
                .sort_values(["team", "balanced_accuracy"], ascending=False)
                .drop_duplicates(subset=["team"])
                .sort_values("balanced_accuracy", ascending=False)
            )
        results[m] = results[m].set_index("submission" if admin else "team")
    # Merge the private-score metrics back onto the submissions table.
    fields_to_merge = [
        "generated_accuracy",
        "pristine_accuracy",
        "balanced_accuracy",
        "total_time",
        "fail_rate",
    ]
    submissions = pd.concat(
        [
            submissions.set_index("submission_id"),
            results["private_score"]
            .reset_index()
            .set_index("submission_id")
            .loc[:, fields_to_merge],
        ],
        axis=1,
    ).reset_index()
    return results, submissions


status_lookup = "NA,QUEUED,PROCESSING,SUCCESS,FAILED".split(",")


# Build the public leaderboard CSV and the submission log for one task.
def process_data(path, save_path):
    submissions = load_results(path)
    submissions["datetime"] = pd.DatetimeIndex(submissions["datetime"])
    submissions["date"] = submissions["datetime"].dt.date
    submissions["status_reason"] = (
        submissions["status"].astype(int).apply(lambda a: status_lookup[a])
    )
    submissions.loc[
        :, ["submission_id", "datetime", "date", "status", "status_reason"]
    ].to_csv(save_path + "_submissions.csv")
    results, submissions = compute_metrics(submissions, path, admin=False)
    cols_to_drop = ["team_id", "submission_id", "submission_repo", "submission"]
    results["public_score"].drop(columns=cols_to_drop).to_csv(save_path + ".csv")


if __name__ == "__main__":
    path_to_cache = os.environ.get("COMP_CACHE", "../competition_cache")
    process_data(os.path.join(path_to_cache, "temp_task1"), "task1")
    process_data(os.path.join(path_to_cache, "temp_task2"), "task2")
    process_data(os.path.join(path_to_cache, "temp_task3"), "task3")
    process_data(os.path.join(path_to_cache, "temp_practice"), "practice")

    # from datetime import date
    # # Get today's date
    # today = date.today()
    # # Print date in YYYY-MM-DD format
    # print("Today's date:", today)

    from datetime import datetime

    import pytz

    # Record when the leaderboards were last refreshed, in US/Eastern time.
    est = pytz.timezone("US/Eastern")
    est_time = datetime.now(est)
    today = f"Updated on {est_time.strftime('%Y-%m-%d %H:%M:%S')} EST"
    with open("updated.txt", "w") as f:
        f.write(today)
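
# Expected cache layout, inferred from load_results/compute_metrics above
# (adjust to the actual competition setup); per task directory:
#   <COMP_CACHE>/temp_task1/teams.json
#   <COMP_CACHE>/temp_task1/solution.csv
#   <COMP_CACHE>/temp_task1/submission_info/        (one JSON file per team)
#   <COMP_CACHE>/temp_task1/submissions/<team_id>-<submission_id>.csv
# For each task, process_data() writes <task>.csv (public leaderboard) and
# <task>_submissions.csv (submission log) into the working directory, and the
# __main__ block records the refresh time in updated.txt.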