pere
/

t5-nynorsk-norbench

Model card Files Files and versions Community

t5-nynorsk-norbench / generate_stats.py

pere

testing

bcc84fc over 1 year ago

raw

history blame

5.32 kB

	import json
	import os
	import re
	import sys

	import pandas as pd
	from google.cloud import storage

	# Name of the GCS bucket that holds the evaluation output.
	bucket_name = "nb-t5x-us-central2"

	# Storage client plus a handle to that bucket; the handle is what the
	# download loop further down asks for each metrics file.
	client = storage.Client()
	bucket = client.bucket(bucket_name)


	# Pre-training experiments whose fine-tuning evaluations are collected.
	# The name encodes the vocabulary ("engvoc", "scandvoc", "mt5voc"), which
	# decides which metrics file the evaluation wrote (see the loop below).
	checkpoints = [
	    "exp1-t5-base-ul2-engvoc",
	    "exp2-t5-base-ul2-scandvoc",
	    "exp3-t5-base-span-engvoc",
	    "exp4-t5-base-span-scandvoc",
	    "exp5-t5-base-ul2-scandvoc-full",
	    "exp6-t5-base-span-scandvoc-full",
	    "exp7-t5-base-ul2-511-scandvoc",
	    "exp8-t5-base-span-511-scandvoc",
	    "exp9-t5-base-ul2-mt5voc",
	    "exp10-t5-base-span-mt5voc",
	    "exp11-t5-base-ul2-511-scandvoc-full",
	    "exp12-t5-base-span-511-scandvoc-full",
	    "exp13-t5-base-ul2-mt5voc-full",
	    "exp14-t5-base-span-mt5voc-full",
	    "exp15-t5-base-ul2-511-scandvoc-full-scratch",
	    "exp16-t5-base-span-511-scandvoc-full-scratch",
	    "exp17-t5-small-ul2-mt5voc-scratch",
	    "exp18-t5-small-span-mt5voc-scratch",
	    "exp19-t5-small-ul2-mt5voc",
	    "exp20-t5-small-span-mt5voc",
	    "exp21-t5-small-ul2-mt5voc-full",
	    "exp22-t5-small-span-mt5voc-full",
	]

	# Step counts that appear as the trailing "_<steps>" part of each
	# evaluation directory name; parsed later as the pre-training step count.
	start = [
	    "100000", "200000", "300000", "400000", "500000",
	    "1000000", "1100000", "1184000", "1200000", "1204000",
	    "1284000", "1300000", "1384000", "1400000", "1484000", "1500000",
	]

	# Version numbers of the repeated fine-tuning runs ("v1" .. "v5").
	iterations = ["1", "2", "3", "4", "5"]

	# One candidate metrics path per (version, checkpoint, step) combination,
	# ordered version-major, then checkpoint, then step.
	file_names = []
	for i in iterations:
	    for c in checkpoints:
	        # The metrics file name depends only on the checkpoint's
	        # vocabulary, so pick it once per checkpoint.
	        if "scand" in c:
	            metrics_file = "translate_long_scand-metrics.jsonl"
	        elif "mt5" in c:
	            metrics_file = "translate_long_mt5-metrics.jsonl"
	        else:
	            metrics_file = "translate_long-metrics.jsonl"
	        for s in start:
	            file_names.append(
	                f"finetuned/ul2test/eval_nynorsk_v{i}_{c}_{s}/inference_eval/{metrics_file}"
	            )


	# Parsed metric records: one dict per non-empty JSONL line, across all
	# metrics files that exist in the bucket.
	file_contents = []

	downloaded = 0
	not_downloaded = 0

	for file_name in file_names:
	    blob = bucket.get_blob(file_name)
	    print(f'gs://{bucket_name}/{file_name}')

	    # A falsy result is treated as "this run has no metrics file";
	    # count it and move on rather than failing the whole report.
	    if not blob:
	        not_downloaded += 1
	        continue
	    downloaded += 1

	    # All run metadata is encoded in the path, so parse it once per file
	    # instead of once per line.
	    # The alternation must use a bare "|": an escaped "\|" matches a
	    # literal pipe, which never occurs in these paths, so re.search()
	    # would return None and .group() would raise.
	    pretraining_steps = int(
	        re.search(
	            r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)",
	            file_name,
	        ).group(2)
	    )
	    vocab = re.search(r"-(\w+?)voc", file_name).group(1)
	    experiment = re.search(r"_exp(\w+?)-", file_name).group(1)
	    version = re.search(r"_v(\w+?)_exp", file_name).group(1)
	    experiment_name = re.search(r"exp\d+-(.*?)_", file_name).group(1)

	    content = blob.download_as_bytes().decode("utf-8")

	    # The file is JSON Lines: one JSON object per line, blank lines skipped.
	    for line in content.split("\n"):
	        if not line:
	            continue
	        data = json.loads(line)
	        data['base_file_name'] = file_name
	        data['pretraining_steps'] = pretraining_steps
	        # Assumes the logged "step" includes the pre-training steps, so
	        # the difference is the number of fine-tuning steps.
	        data['finetuning_steps'] = data['step'] - pretraining_steps
	        data['vocab'] = vocab
	        data['experiment'] = experiment
	        data['version'] = version
	        data['experiment_name'] = experiment_name
	        file_contents.append(data)

	print(f"\nTotally {downloaded} files downloaded, {not_downloaded} files not downloaded")

	df = pd.json_normalize(file_contents)

	# With no records there is no "finetuning_steps" column, and the filters
	# below would fail with an opaque KeyError; stop with a clear message.
	if df.empty:
	    sys.exit("No metrics records were downloaded; nothing to summarise.")


	def _mean_per_run(rows):
	    """Average accuracy, f1_macro and bleu per pre-training run.

	    Groups *rows* by (experiment, experiment_name, pretraining_steps) and
	    returns one row per group holding the mean of each metric plus a
	    ``num_experiments`` column with the number of rows in that group.
	    """
	    grouped = rows[
	        ["experiment_name", "experiment", "pretraining_steps", "accuracy", "f1_macro", "bleu"]
	    ].groupby(["experiment", "experiment_name", "pretraining_steps"])
	    # mean() and size() iterate the groups in the same order, so the
	    # counts line up with the averaged rows positionally.
	    return grouped.mean().reset_index().assign(num_experiments=grouped.size().values)


	only_5000 = df[df["finetuning_steps"] == 5000]
	average_at_5000 = _mean_per_run(only_5000)

	# Same summary after 3000 fine-tuning steps; kept for ad-hoc inspection,
	# it is neither printed nor exported.
	only_3000 = df[df["finetuning_steps"] == 3000]
	average_at_3000 = _mean_per_run(only_3000)

	print(average_at_5000.to_string(index=False))

	# Each version in `iterations` is expected to contribute one row per
	# group, so a different count marks a run with missing or extra results.
	print("\nNot complete:")
	uncomplete = average_at_5000[average_at_5000['num_experiments'] != len(iterations)]
	print(uncomplete)


	# Write every table as both JSON Lines and CSV under stats/.
	# Create the directory first: pandas refuses to write into a directory
	# that does not exist.
	os.makedirs("stats", exist_ok=True)

	for stem, frame in (
	    ("all", df),
	    ("only_5000", only_5000),
	    ("average_at_5000", average_at_5000),
	):
	    frame.to_json(f"stats/{stem}.jsonl", orient="records", lines=True)
	    frame.to_csv(f"stats/{stem}.csv", index=False)


	print("Files exported to stats")