from google.cloud import storage
import pandas as pd
import json
import re
import sys
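import os  # used below to make sure the "stats" output directory exists before exporting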
# Create a storage client
client = storage.Client()
# Get the bucket
bucket_name = "nb-t5x-us-central2"
bucket = client.bucket(bucket_name)
checkpoints=["exp1-t5-base-ul2-engvoc","exp2-t5-base-ul2-scandvoc","exp3-t5-base-span-engvoc","exp4-t5-base-span-scandvoc","exp5-t5-base-ul2-scandvoc-full","exp6-t5-base-span-scandvoc-full","exp7-t5-base-ul2-511-scandvoc","exp8-t5-base-span-511-scandvoc","exp9-t5-base-ul2-mt5voc","exp10-t5-base-span-mt5voc","exp11-t5-base-ul2-511-scandvoc-full","exp12-t5-base-span-511-scandvoc-full","exp13-t5-base-ul2-mt5voc-full","exp14-t5-base-span-mt5voc-full","exp15-t5-base-ul2-511-scandvoc-full-scratch","exp16-t5-base-span-511-scandvoc-full-scratch","exp17-t5-small-ul2-mt5voc-scratch","exp18-t5-small-span-mt5voc-scratch","exp19-t5-small-ul2-mt5voc","exp20-t5-small-span-mt5voc","exp21-t5-small-ul2-mt5voc-full","exp22-t5-small-span-mt5voc-full"]
start=["100000","200000","300000","400000","500000","1000000","1100000","1184000","1200000","1204000","1284000","1300000","1384000","1400000","1484000","1500000"]
iterations=["1","2","3","4","5"]
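# Build the GCS object path for every (iteration, checkpoint, pretraining-step) combination.
# The metrics file suffix depends on which vocabulary the checkpoint uses.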
file_names = []
for i in iterations:
    for c in checkpoints:
        for s in start:
            if "scand" in c:
                name = f'finetuned/ul2test/eval_political_v{i}_{c}_{s}/inference_eval/parliament_max300_scand-metrics.jsonl'
            elif "mt5" in c:
                name = f'finetuned/ul2test/eval_political_v{i}_{c}_{s}/inference_eval/parliament_max300_mt5-metrics.jsonl'
            else:
                name = f'finetuned/ul2test/eval_political_v{i}_{c}_{s}/inference_eval/parliament_max300-metrics.jsonl'
            file_names.append(name)
# List to store the parsed JSON records from every metrics file
file_contents = []
downloaded = 0
not_downloaded = 0
#print(file_names)
#print(bucket)
#sys.exit(-1)
#iterate over the files
for file_name in file_names:
    # Fetch the blob; get_blob() returns None if the object does not exist
    blob = bucket.get_blob(file_name)
    print(f'gs://{bucket_name}/{file_name}')
    if not blob:
        #print(f"Unable to download {file_name}")
        not_downloaded += 1
        continue
    else:
        downloaded += 1
    content = blob.download_as_string().decode("utf-8")
    #print(file_name)
    #print(content)
    # Split the content by newline; each non-empty line is one JSON record
    lines = content.split("\n")
    # Iterate over the lines and enrich each record with metadata parsed from the file path
    for n, line in enumerate(lines):
        if not line:
            continue
        #print(line)
        #print(file_name)
        data = json.loads(line)
        data['base_file_name'] = file_name
        # The number between the vocabulary suffix and the next "/" is the pretraining step count
        pretraining_steps = re.search(r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)", file_name).group(2)
        data['pretraining_steps'] = int(pretraining_steps)
        data['finetuning_steps'] = data['step'] - int(pretraining_steps)
        data['vocab'] = re.search(r"-(\w+?)voc", file_name).group(1)
        data['experiment'] = re.search(r"_exp(\w+?)-", file_name).group(1)
        data['version'] = re.search(r"_v(\w+?)_exp", file_name).group(1)
        data['experiment_name'] = re.search(r"exp\d+-(.*?)_", file_name).group(1)
        file_contents.append(data)
print(f"\nTotally {downloaded} files downloaded, {not_downloaded} files not downloaded")
df = pd.json_normalize(file_contents)
df = df.drop_duplicates(subset=['step','experiment','version']).reset_index()
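
# Average accuracy and f1_macro at exactly 5000 finetuning steps,
# grouped per experiment and pretraining checkpoint (averaged across the repeated runs)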
only_5000 = df[df["finetuning_steps"] == 5000]
grouped_at_5000 = only_5000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro"]].groupby(["experiment","experiment_name","pretraining_steps"])
average_at_5000 = grouped_at_5000.mean().reset_index()
average_at_5000 = average_at_5000.assign(num_experiments=grouped_at_5000.size().values)
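
# Same aggregation at 3000 finetuning steps (computed for reference; its printout is commented out below)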
only_3000 = df[df["finetuning_steps"] == 3000]
grouped = only_3000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro"]].groupby(["experiment","experiment_name","pretraining_steps"])
average_at_3000 = grouped.mean().reset_index()
average_at_3000 = average_at_3000.assign(rows_count=grouped.size().values)
#print(average_at_3000.to_string(index=False))
print(average_at_5000.to_string(index=False))
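
# Report experiments that do not yet have results from all 5 repetitions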
print("\nNot complete:")
uncomplete = average_at_5000[average_at_5000['num_experiments'] != 5]
print(uncomplete)
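
# Write all results to disk. The os.makedirs() call is an added safeguard (it assumes the
# script should create a local "stats/" directory if one is not already present).
os.makedirs("stats", exist_ok=True)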
df.to_json("stats/all.jsonl", orient="records", lines=True)
df.to_csv("stats/all.csv", index=False)
only_5000.to_json("stats/only_5000.jsonl", orient="records", lines=True)
only_5000.to_csv("stats/only_5000.csv", index=False)
average_at_5000.to_json("stats/average_at_5000.jsonl", orient="records", lines=True)
average_at_5000.to_csv("stats/average_at_5000.csv", index=False)
print(f"Files exported to stats")