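"""Collect finetuning evaluation metrics from Google Cloud Storage.

The script builds the expected paths of the parliament_max300*-metrics.jsonl
files for every (version, checkpoint, pretraining-step) combination, downloads
those that exist from the nb-t5x-us-central2 bucket, enriches each JSON record
with metadata parsed from the file path, and exports the raw records as well as
averages (accuracy and macro-F1 over the finetuning repetitions) to a local
stats/ directory.

Authentication uses Application Default Credentials, since storage.Client() is
created without explicit arguments.
"""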
from google.cloud import storage
import pandas as pd
import json
import re
import sys

# Create a storage client
client = storage.Client()

# Get the bucket
bucket_name = "nb-t5x-us-central2"
bucket = client.bucket(bucket_name)


checkpoints=["exp1-t5-base-ul2-engvoc","exp2-t5-base-ul2-scandvoc","exp3-t5-base-span-engvoc","exp4-t5-base-span-scandvoc","exp5-t5-base-ul2-scandvoc-full","exp6-t5-base-span-scandvoc-full","exp7-t5-base-ul2-511-scandvoc","exp8-t5-base-span-511-scandvoc","exp9-t5-base-ul2-mt5voc","exp10-t5-base-span-mt5voc","exp11-t5-base-ul2-511-scandvoc-full","exp12-t5-base-span-511-scandvoc-full","exp13-t5-base-ul2-mt5voc-full","exp14-t5-base-span-mt5voc-full","exp15-t5-base-ul2-511-scandvoc-full-scratch","exp16-t5-base-span-511-scandvoc-full-scratch","exp17-t5-small-ul2-mt5voc-scratch","exp18-t5-small-span-mt5voc-scratch","exp19-t5-small-ul2-mt5voc","exp20-t5-small-span-mt5voc","exp21-t5-small-ul2-mt5voc-full","exp22-t5-small-span-mt5voc-full"] 
#checkpoints=["exp19-t5-small-ul2-mt5voc"]

start=["100000","200000","300000","400000","500000","1000000","1100000","1184000","1200000","1204000","1284000","1300000","1384000","1400000","1484000","1500000"]

iterations=["1","2","3","4","5"]
file_names=[]

# Build the expected GCS path for every (version, checkpoint, pretraining-step)
# combination; the metrics file name depends on the vocabulary of the checkpoint.
for i in iterations:
    for c in checkpoints:
        for s in start:
            if "scand" in c:
                name = f'finetuned/ul2test/eval_political_v{i}_{c}_{s}/inference_eval/parliament_max300_scand-metrics.jsonl'
            elif "mt5" in c:
                name = f'finetuned/ul2test/eval_political_v{i}_{c}_{s}/inference_eval/parliament_max300_mt5-metrics.jsonl'
            else:
                name = f'finetuned/ul2test/eval_political_v{i}_{c}_{s}/inference_eval/parliament_max300-metrics.jsonl'
            file_names.append(name)
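# Example of a generated path:
#   finetuned/ul2test/eval_political_v1_exp1-t5-base-ul2-engvoc_100000/inference_eval/parliament_max300-metrics.jsonl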


# All parsed metric records, one dict per JSON line read from the metric files
file_contents = []

downloaded = 0
not_downloaded = 0

#print(file_names)
#print(bucket)
#sys.exit(-1)
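# Optional alternative (untested sketch, not part of the original script): instead
# of probing every candidate path with get_blob(), the existing objects could be
# listed once with a prefix filter and intersected with the expected names:
#
#     existing = {b.name for b in client.list_blobs(bucket_name, prefix="finetuned/ul2test/")}
#     file_names = [n for n in file_names if n in existing]
#
# This replaces one GET request per missing file with a single listing call.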

# Fetch each expected metrics file from the bucket
for file_name in file_names:
    # get_blob() returns None when the object does not exist
    blob = bucket.get_blob(file_name)
    print(f'gs://{bucket_name}/{file_name}')

    if not blob:
        # No evaluation output exists (yet) for this combination
        not_downloaded += 1
        continue
    downloaded += 1

    content = blob.download_as_string().decode("utf-8")
    
    #print(file_name)
    #print(content)

    # Split the content by newline
    lines = content.split("\n")
    
    # Each non-empty line is one JSON record of evaluation metrics
    for line in lines:
        if not line:
            continue
        #print(line)
        #print(file_name)
        data = json.loads(line)
        data['base_file_name'] = file_name
        # Recover experiment metadata from the file path itself
        pretraining_steps = re.search(r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)", file_name).group(2)
        data['pretraining_steps'] = int(pretraining_steps)
        # 'step' in the metrics record is the absolute training step, so the
        # finetuning progress is the difference to the pretraining checkpoint
        data['finetuning_steps'] = data['step'] - int(pretraining_steps)
        data['vocab'] = re.search(r"-(\w+?)voc", file_name).group(1)        # eng / scand / mt5
        data['experiment'] = re.search(r"_exp(\w+?)-", file_name).group(1)  # experiment number
        data['version'] = re.search(r"_v(\w+?)_exp", file_name).group(1)    # finetuning repetition
        data['experiment_name'] = re.search(r"exp\d+-(.*?)_", file_name).group(1)
        file_contents.append(data)

print(f"\nTotally {downloaded} files downloaded, {not_downloaded} files not downloaded")

# Flatten all records into one dataframe and drop duplicate evaluations of the
# same (step, experiment, version) combination
df = pd.json_normalize(file_contents)
df = df.drop_duplicates(subset=['step','experiment','version']).reset_index(drop=True)

# Average accuracy and macro-F1 over the finetuning repetitions at 5000
# finetuning steps, keeping track of how many repetitions each average covers
only_5000 = df[df["finetuning_steps"] == 5000]
grouped_at_5000 = only_5000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro"]].groupby(["experiment","experiment_name","pretraining_steps"])
average_at_5000 = grouped_at_5000.mean().reset_index()
average_at_5000 = average_at_5000.assign(num_experiments=grouped_at_5000.size().values)

# The same aggregation at 3000 finetuning steps
only_3000 = df[df["finetuning_steps"] == 3000]
grouped_at_3000 = only_3000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro"]].groupby(["experiment","experiment_name","pretraining_steps"])
average_at_3000 = grouped_at_3000.mean().reset_index()
average_at_3000 = average_at_3000.assign(num_experiments=grouped_at_3000.size().values)

#print(average_at_3000.to_string(index=False))
print(average_at_5000.to_string(index=False))

print("\nNot complete:")
uncomplete = average_at_5000[average_at_5000['num_experiments'] != 5]
print(uncomplete)
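# Note: the exports below assume a local "stats/" directory already exists;
# pandas will not create missing directories.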

df.to_json("stats/all.jsonl", orient="records", lines=True)
df.to_csv("stats/all.csv", index=False)

only_5000.to_json("stats/only_5000.jsonl", orient="records", lines=True)
only_5000.to_csv("stats/only_5000.csv", index=False)

average_at_5000.to_json("stats/average_at_5000.jsonl", orient="records", lines=True)
average_at_5000.to_csv("stats/average_at_5000.csv", index=False)


print(f"Files exported to stats")