# Evaluate a given training run name at the given steps, taking into account the chaining of the training runs
import sys
import pickle
import wandb
import argparse
import os
from src.utils.paths import get_path
from src.utils.wandb_utils import get_run_initial_steps, get_run_step_direct, get_run_step_ckpt, get_steps_from_file, get_run_by_name

parser = argparse.ArgumentParser()
parser.add_argument("--tag", "-tag", type=str, required=False, default="")
parser.add_argument("--input", "-input", type=str, required=False, default="Feb26_2025_E1000_N500_noPartonFilter_C_F")  # e.g. --input Feb26_2025_E1000_N500_full
parser.add_argument("--clustering-suffix", "-c", type=str, required=False, default="")  # e.g. -c MinSamples0
parser.add_argument("--no-submit", "-ns", action="store_true")  # only write the slurm file, do not submit the job
parser.add_argument("--submit-AKX", "-AKX", action="store_true")
parser.add_argument("--submit-AK8", "-AK8", action="store_true")
parser.add_argument("--parton-level", "-pl", action="store_true")  # to be used together with 'fastjet_jets' and --submit-AKX
parser.add_argument("--gen-level", "-gl", action="store_true")
parser.add_argument("--overwrite", "-ow", action="store_true")  # re-run the evaluation even if results already exist
parser.add_argument("--pt-cutoff-jet", "-pt", type=float, default=100.0, help="pt cutoff for what is considered a jet")
parser.add_argument("--high-eta-only", "-he", action="store_true", help="Only evaluate high-eta jets (eta > 1.5)")
parser.add_argument("--low-eta-only", "-le", action="store_true", help="Only evaluate low-eta jets (eta < 1.5)")
parser.add_argument("--ds-cap", "-ds", type=int, default=10000, help="dataset cap")
args = parser.parse_args()
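# Example invocations (the script filename below is illustrative -- use this file's actual name in the repo):
#   python submit_batch_eval.py --tag my_eval_tag                 # one job per finished wandb run with this tag (model_jets)
#   python submit_batch_eval.py --tag my_eval_tag -AK8 -ns        # only write the AK8 / gen-jet slurm file, do not sbatch it
#   python submit_batch_eval.py --tag my_eval_tag -AKX -pl        # fastjet (AKX) evaluation at parton level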
api = wandb.Api()
DSCAP = args.ds_cap


def get_eval_run_names(tag):
    # from the api, get all the runs with the tag that are finished
    runs = api.runs(
        path="fcc_ml/svj_clustering",
        filters={"tags": {"$in": [tag.strip()]}}
    )
    return [run.name for run in runs if run.state == "finished"], [run.config for run in runs if run.state == "finished"]
def get_log_number(tag):
    numbers = set()
    for file in os.listdir("jobs/slurm_files"):
        if tag in file:
            numbers.add(int(file.split("_")[-1].split(".")[0]))
    if len(numbers) == 0:
        return 0
    return max(numbers) + 1
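# The three helpers below render the text of a slurm batch script. Each job sources env.sh and runs
# scripts.analysis.count_matched_quarks inside the gkrz/lgatr:v3 Singularity image; they differ only in
# which jet collection is evaluated (AKX fastjet jets, AK8 and gen jets, or a trained run's model_jets).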
def get_slurm_file_text_AKX(tag, log_number):
    bindings = "-B /t3home/gkrzmanc/ -B /work/gkrzmanc/"
    partition = "standard"
    account = "t3"
    d = "jobs/logs/{}".format(tag)
    err = d + "_{}_CPUerr.txt".format(log_number)
    log = d + "_{}_CPUlog.txt".format(log_number)
    suffix_pl = "--parton-level" if args.parton_level else ""
    suffix_gl = "--gen-level" if args.gen_level else ""
    pl_folder = "_PL" if args.parton_level else ""
    gl_folder = "_GL" if args.gen_level else ""
    if args.pt_cutoff_jet != 100.0:
        pt_cutoff_suffix = f"_pt_{args.pt_cutoff_jet}"
        pt_cutoff_suffix_cmd = " --pt-jet-cutoff {}".format(args.pt_cutoff_jet)
    else:
        pt_cutoff_suffix = ""
        pt_cutoff_suffix_cmd = ""
    if args.high_eta_only:
        pt_cutoff_suffix += "_high_eta"
        pt_cutoff_suffix_cmd += " --high-eta-only"
    elif args.low_eta_only:
        pt_cutoff_suffix += "_low_eta"
        pt_cutoff_suffix_cmd += " --low-eta-only"
    file = f"""#!/bin/bash
#SBATCH --partition={partition} # Specify the partition
#SBATCH --account={account} # Specify the account
#SBATCH --mem=25000 # Request 25 GB of memory
#SBATCH --time=06:00:00 # Set the time limit to 6 hours
#SBATCH --job-name=SVJan_AKX{pl_folder}{gl_folder}_{str(log_number)} # Name the job
#SBATCH --error={err} # Redirect stderr to a log file
#SBATCH --output={log} # Redirect stdout to a log file
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=gkrzmanc@student.ethz.ch
source env.sh
export APPTAINER_TMPDIR=/work/gkrzmanc/singularity_tmp
export APPTAINER_CACHEDIR=/work/gkrzmanc/singularity_cache
nvidia-smi
srun singularity exec {bindings} docker://gkrz/lgatr:v3 python -m scripts.analysis.count_matched_quarks --input {args.input} --output {args.input}/batch_eval_2k/{tag}{pt_cutoff_suffix}/AKX{pl_folder}{gl_folder} --jets-object fastjet_jets {suffix_pl} {suffix_gl} --dataset-cap {DSCAP} {pt_cutoff_suffix_cmd}
"""
    return file
def get_slurm_file_text_AK(tag, log_number):
    bindings = "-B /t3home/gkrzmanc/ -B /work/gkrzmanc/"
    partition = "standard"
    account = "t3"
    d = "jobs/logs/{}".format(tag)
    err = d + "_{}_CPUerr.txt".format(log_number)
    log = d + "_{}_CPUlog.txt".format(log_number)
    file = f"""#!/bin/bash
#SBATCH --partition={partition} # Specify the partition
#SBATCH --account={account} # Specify the account
#SBATCH --mem=25000 # Request 25 GB of memory
#SBATCH --time=02:00:00 # Set the time limit to 2 hours
#SBATCH --job-name=SVJan # Name the job
#SBATCH --error={err} # Redirect stderr to a log file
#SBATCH --output={log} # Redirect stdout to a log file
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=gkrzmanc@student.ethz.ch
source env.sh
export APPTAINER_TMPDIR=/work/gkrzmanc/singularity_tmp
export APPTAINER_CACHEDIR=/work/gkrzmanc/singularity_cache
nvidia-smi
srun singularity exec {bindings} docker://gkrz/lgatr:v3 python -m scripts.analysis.count_matched_quarks --input {args.input} --output {args.input}/batch_eval_2k/{tag}/AK8 --dataset-cap 1500
srun singularity exec {bindings} docker://gkrz/lgatr:v3 python -m scripts.analysis.count_matched_quarks --input {args.input} --output {args.input}/batch_eval_2k/{tag}/AK8_GenJets --jets-object genjets --dataset-cap {DSCAP}
"""
    return file
def get_slurm_file_text(tag, eval_job_name, log_number, aug_suffix=""):
    bindings = "-B /t3home/gkrzmanc/ -B /work/gkrzmanc/ -B /pnfs/psi.ch/cms/trivcat/store/user/gkrzmanc/ "
    partition = "standard"
    account = "t3"
    d = "jobs/logs/{}".format(tag)
    err = d + "_{}_CPUerr.txt".format(log_number)
    log = d + "_{}_CPUlog.txt".format(log_number)
    clust_suffix = ""
    if args.clustering_suffix != "":
        clust_suffix = f" --clustering-suffix {args.clustering_suffix}"
    pt_cutoff_suffix_cmd = f" --pt-jet-cutoff {args.pt_cutoff_jet}"
    pt_cutoff_suffix = ""
    if args.pt_cutoff_jet != 100.0:
        pt_cutoff_suffix = f"_pt_{args.pt_cutoff_jet}"
    if args.high_eta_only:
        pt_cutoff_suffix += "_high_eta"
        # aug_suffix += " --high-eta-only"
    elif args.low_eta_only:
        pt_cutoff_suffix += "_low_eta"
        # aug_suffix += " --low-eta-only"
    file = f"""#!/bin/bash
#SBATCH --partition={partition} # Specify the partition
#SBATCH --account={account} # Specify the account
#SBATCH --mem=25000 # Request 25 GB of memory
#SBATCH --time=02:00:00 # Set the time limit to 2 hours
#SBATCH --job-name=SVJ_CPU_{eval_job_name}_{str(log_number)} # Name the job
#SBATCH --error={err} # Redirect stderr to a log file
#SBATCH --output={log} # Redirect stdout to a log file
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=gkrzmanc@student.ethz.ch
source env.sh
export APPTAINER_TMPDIR=/work/gkrzmanc/singularity_tmp
export APPTAINER_CACHEDIR=/work/gkrzmanc/singularity_cache
nvidia-smi
srun singularity exec {bindings} docker://gkrz/lgatr:v3 python -m scripts.analysis.count_matched_quarks --input {args.input} --output {args.input}/batch_eval_2k/{tag}{pt_cutoff_suffix}/{eval_job_name}{args.clustering_suffix} --eval-dir train/{eval_job_name} --jets-object model_jets --dataset-cap {DSCAP} {aug_suffix} {clust_suffix} {pt_cutoff_suffix_cmd}
"""
    return file
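# Submission flow (one of three paths below, depending on the flags):
#   -AK8 / --submit-AK8 : write one slurm file that evaluates AK8 jets and gen jets, optionally sbatch it, then exit.
#   -AKX / --submit-AKX : write one slurm file that evaluates fastjet (AKX) jets, optionally sbatch it, then exit.
#   default             : write one slurm file per finished wandb run with the given tag, evaluating that run's model_jets.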
runs, run_config = get_eval_run_names(args.tag)
print("RUNS:", runs)

if args.submit_AK8:
    # Submit the AK8 / gen-jet evaluation job as well
    if not os.path.exists("jobs/slurm_files"):
        os.makedirs("jobs/slurm_files")
    if not os.path.exists("jobs/logs"):
        os.makedirs("jobs/logs")
    log_number = get_log_number(args.tag)
    slurm_file_text = get_slurm_file_text_AK(args.tag, log_number)
    # write the file to jobs/slurm_files
    with open("jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number), "w") as f:
        f.write(slurm_file_text)
    print("Wrote file to jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))
    if not args.no_submit:
        os.system("sbatch jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))
        print("---- Submitted AK8 run -----")
    sys.exit(0)
def extract_n_events(filename):
    # Read the number of processed events from a results file; return -1 if missing or unreadable
    if not os.path.exists(filename):
        return -1
    content = open(filename).read().strip()
    try:
        return int(content)
    except ValueError:
        return -1
if args.submit_AKX:
    # Submit the AKX (fastjet) evaluation job as well
    if not os.path.exists("jobs/slurm_files"):
        os.makedirs("jobs/slurm_files")
    if not os.path.exists("jobs/logs"):
        os.makedirs("jobs/logs")
    log_number = get_log_number(args.tag)
    slurm_file_text = get_slurm_file_text_AKX(args.tag, log_number)
    # write the file to jobs/slurm_files
    with open("jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number), "w") as f:
        f.write(slurm_file_text)
    print("Wrote file to jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))
    if not args.no_submit:
        os.system("sbatch jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))
        print("---- Submitted AKX run -----")
    sys.exit(0)
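# Per-run evaluation: for every finished wandb run with the given tag, write one slurm file that evaluates
# the run's model_jets with count_matched_quarks. Runs whose results already exist (a positive count in
# count_matched_quarks/n_events.txt) are skipped unless --overwrite is given; the run config is pickled
# next to the results, and the job is submitted unless --no-submit is set.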
for i, run in enumerate(runs):
    # if get_run_by_name(run).state != "finished":
    #     print("Run not finished (failed or still in progress) - skipping", run)
    #     continue
    conf = get_run_by_name(run).config
    if (conf.get("parton_level") or conf.get("gen_level")) and args.pt_cutoff_jet != 100.0:
        print("Skipping run", run, "because it is parton level or gen level and the pt cutoff is not 100.0")
        continue
    aug_soft_p = conf.get("augment_soft_particles", False)
    if aug_soft_p:
        aug_suffix = "-aug-soft"
    else:
        aug_suffix = ""
    if not os.path.exists("jobs/slurm_files"):
        os.makedirs("jobs/slurm_files")
    if not os.path.exists("jobs/logs"):
        os.makedirs("jobs/logs")
    log_number = get_log_number(args.tag)
    pt_cutoff_suffix = ""
    if args.pt_cutoff_jet != 100.0:
        pt_cutoff_suffix = f"_pt_{args.pt_cutoff_jet}"
    if args.high_eta_only:
        pt_cutoff_suffix += "_high_eta"
        aug_suffix += " --high-eta-only"
    elif args.low_eta_only:
        pt_cutoff_suffix += "_low_eta"
        aug_suffix += " --low-eta-only"
    slurm_file_text = get_slurm_file_text(args.tag, run, log_number, aug_suffix)
    rel_path_save = f"{args.input}/batch_eval_2k/{args.tag}{pt_cutoff_suffix}/{run}{args.clustering_suffix}"
    rel_path_save = get_path(rel_path_save, "results")
    if not os.path.exists(rel_path_save):
        os.makedirs(rel_path_save)
    # if evaluated(rel_path_save):
    n_events = extract_n_events(os.path.join(rel_path_save, "count_matched_quarks", "n_events.txt"))
    if os.path.exists(os.path.join(rel_path_save, "count_matched_quarks", "n_events.txt")) and not args.overwrite and n_events > 0:
        print("Skipping", run, "because this file exists:", os.path.join(rel_path_save, "count_matched_quarks", "n_events.txt"))
        continue
    else:
        print("Evaluating", run)
    # save the run config here
    with open(f"{rel_path_save}/run_config.pkl", "wb") as f:
        pickle.dump(run_config[i], f)
    # write the file to jobs/slurm_files
    with open("jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number), "w") as f:
        f.write(slurm_file_text)
    print("Wrote file to jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))
    if not args.no_submit:
        os.system("sbatch jobs/slurm_files/evalCPU_{}_{}.slurm".format(args.tag, log_number))