from google.cloud import storage
import pandas as pd
import json
import re

# Create a storage client and open the bucket holding the evaluation results
client = storage.Client()
bucket_name = "nb-t5x-us-central2"
bucket = client.bucket(bucket_name)

checkpoints = [
    "exp1-t5-base-ul2-engvoc",
    "exp2-t5-base-ul2-scandvoc",
    "exp3-t5-base-span-engvoc",
    "exp4-t5-base-span-scandvoc",
    "exp5-t5-base-ul2-scandvoc-full",
    "exp6-t5-base-span-scandvoc-full",
    "exp7-t5-base-ul2-511-scandvoc",
    "exp8-t5-base-span-511-scandvoc",
    "exp9-t5-base-ul2-mt5voc",
    "exp10-t5-base-span-mt5voc",
    "exp11-t5-base-ul2-511-scandvoc-full",
    "exp12-t5-base-span-511-scandvoc-full",
    "exp13-t5-base-ul2-mt5voc-full",
    "exp14-t5-base-span-mt5voc-full",
    "exp15-t5-base-ul2-511-scandvoc-full-scratch",
    "exp16-t5-base-span-511-scandvoc-full-scratch",
    "exp17-t5-small-ul2-mt5voc-scratch",
    "exp18-t5-small-span-mt5voc-scratch",
    "exp19-t5-small-ul2-mt5voc",
    "exp20-t5-small-span-mt5voc",
    "exp21-t5-small-ul2-mt5voc-full",
    "exp22-t5-small-span-mt5voc-full",
]
start = [
    "100000", "200000", "300000", "400000", "500000",
    "1000000", "1100000", "1184000", "1200000", "1204000",
    "1284000", "1300000", "1384000", "1400000", "1484000", "1500000",
]
iterations = ["1", "2", "3", "4", "5"]

# Build the expected path for every (iteration, checkpoint, start-step)
# combination; the metrics file name depends on the checkpoint's vocabulary
file_names = []
for i in iterations:
    for c in checkpoints:
        for s in start:
            if "scand" in c:
                name = f'finetuned/ul2test/eval_political_v{i}_{c}_{s}/inference_eval/parliament_max300_scand-metrics.jsonl'
            elif "mt5" in c:
                name = f'finetuned/ul2test/eval_political_v{i}_{c}_{s}/inference_eval/parliament_max300_mt5-metrics.jsonl'
            else:
                name = f'finetuned/ul2test/eval_political_v{i}_{c}_{s}/inference_eval/parliament_max300-metrics.jsonl'
            file_names.append(name)

# Download every metrics file that exists and collect its JSONL records
file_contents = []
downloaded = 0
not_downloaded = 0

for file_name in file_names:
    blob = bucket.get_blob(file_name)
    print(f'gs://{bucket_name}/{file_name}')
    if not blob:
        not_downloaded += 1
        continue
    downloaded += 1
    content = blob.download_as_text()
    # Each non-empty line is one JSON record; enrich it with metadata
    # parsed out of the file path
    for line in content.split("\n"):
        if not line:
            continue
        data = json.loads(line)
        data['base_file_name'] = file_name
        pretraining_steps = re.search(
            r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)", file_name
        ).group(2)
        data['pretraining_steps'] = int(pretraining_steps)
        data['finetuning_steps'] = data['step'] - int(pretraining_steps)
        data['vocab'] = re.search(r"-(\w+?)voc", file_name).group(1)
        data['experiment'] = re.search(r"_exp(\w+?)-", file_name).group(1)
        data['version'] = re.search(r"_v(\w+?)_exp", file_name).group(1)
        data['experiment_name'] = re.search(r"exp\d+-(.*?)_", file_name).group(1)
        file_contents.append(data)

print(f"\nIn total, {downloaded} files downloaded, {not_downloaded} files not found")

df = pd.json_normalize(file_contents)
df = df.drop_duplicates(subset=['step', 'experiment', 'version']).reset_index(drop=True)

# Average accuracy and f1_macro across versions at 5000 finetuning steps,
# keeping track of how many runs went into each average
only_5000 = df[df["finetuning_steps"] == 5000]
grouped_at_5000 = only_5000[
    ["experiment_name", "experiment", "pretraining_steps", "accuracy", "f1_macro"]
].groupby(["experiment", "experiment_name", "pretraining_steps"])
average_at_5000 = grouped_at_5000.mean().reset_index()
average_at_5000 = average_at_5000.assign(num_experiments=grouped_at_5000.size().values)
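# An equivalent single-pass formulation (illustrative sketch only, not part of
# the original pipeline): pandas named aggregation computes the two means and
# the group size in one groupby.agg call. `average_at_5000_alt` is a
# hypothetical name used only for this example.
average_at_5000_alt = (
    only_5000.groupby(["experiment", "experiment_name", "pretraining_steps"])
    .agg(
        accuracy=("accuracy", "mean"),
        f1_macro=("f1_macro", "mean"),
        num_experiments=("accuracy", "size"),
    )
    .reset_index()
)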
"f1_macro"]].groupby(["experiment","experiment_name","pretraining_steps"]) average_at_3000 = grouped.mean().reset_index() average_at_3000 = average_at_3000.assign(rows_count=grouped.size().values) #print(average_at_3000.to_string(index=False)) print(average_at_5000.to_string(index=False)) print("\nNot complete:") uncomplete = average_at_5000[average_at_5000['num_experiments'] != 5] print(uncomplete) df.to_json("stats/all.jsonl", orient="records", lines=True) df.to_csv("stats/all.csv", index=False) only_5000.to_json("stats/only_5000.jsonl", orient="records", lines=True) only_5000.to_csv("stats/only_5000.csv", index=False) average_at_5000.to_json("stats/average_at_5000.jsonl", orient="records", lines=True) average_at_5000.to_csv("stats/average_at_5000.csv", index=False) print(f"Files exported to stats")