giskard-evaluator

Running

App Files Files Community

200

giskard-evaluator / cicd /scan_retrieved.py

inoki-giskard

Init cicd with commit 9bf277b

b6a7e2b 12 months ago

raw

history blame

4.94 kB

	import argparse
	import os
	import subprocess
	from ast import literal_eval
	from string import Template

	import pandas as pd


	def model_has_dataset(model):
	    """Tell whether *model* declares at least one dataset tag.

	    Args:
	        model: Any object exposing an iterable ``tags`` attribute of strings.

	    Returns:
	        ``True`` if some tag begins with ``"dataset:"``, ``False`` otherwise
	        (including when ``tags`` is empty).
	    """
	    return any(label.startswith("dataset:") for label in model.tags)


	# Bookkeeping file listing every (model, dataset) pair that was already
	# attempted. It is read at start-up and rewritten after each scan, so a batch
	# that was interrupted can be resumed without redoing finished work.
	_TO_BE_SKIPPED_FILE_PATH = ".models_and_datasets_to_be_skipped.csv"

	# Location of the artefact produced for one scan; ``suffix`` is "html" for the
	# report written by cli.py and "error" for the failure log written here.
	_RESULT_PATH_TEMPLATE = Template(
	    "${output_path}/${model_name}__default_scan_with__${dataset_name}.${suffix}"
	)

	# Models that are scanned on a split other than the default "validation".
	_DATASET_SPLIT_EXCEPTIONS = {"facebook/bart-large-mnli": "validation_matched"}

	# Datasets that are scanned with an explicit configuration name.
	_DATASET_CONFIG_EXCEPTIONS = {"tweet_eval": "sentiment"}


	def _parse_args(argv=None):
	    """Parse the command line (``sys.argv[1:]`` when *argv* is ``None``)."""
	    parser = argparse.ArgumentParser(
	        prog="Giskard Batch Scanner", description="Scan Retrieved HF models."
	    )
	    parser.add_argument(
	        "--data_path",
	        help="Path to retrieved models in csv format (need to run retrieve.py first).",
	        required=True,
	    )
	    parser.add_argument(
	        "--first_Nmodels",
	        help="Number of models to be scanned from the sorted list of models available.",
	        type=int,  # validated by argparse instead of a bare int() later on
	        required=True,
	    )
	    parser.add_argument(
	        "--output_path",
	        help="Path of dir to save all the reports",
	        required=True,
	    )
	    return parser.parse_args(argv)


	def _result_path(output_path, model, dataset, suffix):
	    """Return the report/log path for one (model, dataset) pair."""
	    # "/" in hub ids would otherwise be interpreted as a directory separator.
	    return _RESULT_PATH_TEMPLATE.substitute(
	        output_path=output_path,
	        model_name=model.replace("/", "--"),
	        dataset_name=dataset.replace("/", "--"),
	        suffix=suffix,
	    )


	def _already_recorded(df_to_be_skipped, model, dataset):
	    """Return True if the pair is present in the skip table (None = no table)."""
	    # The None test must come first: on a first run there is no skip file yet.
	    if df_to_be_skipped is None:
	        return False
	    same_pair = (df_to_be_skipped["model"] == model) & (
	        df_to_be_skipped["dataset"] == dataset
	    )
	    return bool(same_pair.any())


	def _record_status(df_to_be_skipped, model, dataset, status):
	    """Append one row to the skip table, persist it, and return the new table."""
	    new_row = pd.DataFrame(
	        {"model": model, "dataset": dataset, "status": status}, index=[0]
	    )
	    frames = [frame for frame in (df_to_be_skipped, new_row) if frame is not None]
	    updated = pd.concat(frames, ignore_index=True)
	    updated.to_csv(_TO_BE_SKIPPED_FILE_PATH, index=False)
	    return updated


	def _build_command(model, dataset, report_path):
	    """Return the argv list that runs cli.py for one (model, dataset) pair."""
	    # An argv list (no shell) keeps values read from the CSV from being
	    # interpreted by a shell.
	    return [
	        "python",
	        "cli.py",
	        "--loader",
	        "huggingface",
	        "--model",
	        model,
	        "--dataset",
	        dataset,
	        "--dataset_split",
	        _DATASET_SPLIT_EXCEPTIONS.get(model, "validation"),
	        "--dataset_config",
	        # NOTE(review): without an override this passes the literal string
	        # "None", exactly as the previous string template did - confirm that
	        # cli.py expects that.
	        str(_DATASET_CONFIG_EXCEPTIONS.get(dataset)),
	        "--output",
	        report_path,
	    ]


	def main(argv=None):
	    """Scan the first N retrieved models with cli.py and record each outcome.

	    For every row of the CSV given by ``--data_path`` (up to
	    ``--first_Nmodels``), the first entry of the row's ``datasets`` list is
	    paired with ``modelId`` and cli.py is run as a subprocess. The outcome
	    ("done" or "error") is appended to the skip file so the pair is not
	    attempted again on the next run; failures also get a ``.error`` log next
	    to where the report would have been written.

	    Args:
	        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
	    """
	    args = _parse_args(argv)

	    df = pd.read_csv(args.data_path)

	    df_to_be_skipped = None
	    if os.path.exists(_TO_BE_SKIPPED_FILE_PATH):
	        df_to_be_skipped = pd.read_csv(_TO_BE_SKIPPED_FILE_PATH)

	    os.makedirs(args.output_path, exist_ok=True)

	    # Never index past the end of the CSV when N exceeds the number of rows.
	    n_models = min(args.first_Nmodels, len(df))

	    for i in range(n_models):
	        row = df.iloc[i]
	        model = row.modelId
	        dataset = literal_eval(row.datasets)[0]

	        message = f"{model} with {dataset}"

	        if _already_recorded(df_to_be_skipped, model, dataset):
	            print(f"[{i}] ==== ⏩ skipping {message} ====")
	            continue

	        print(f"[{i}] ==== 🔍 scanning {message} ====")

	        result_path = _result_path(args.output_path, model, dataset, "html")
	        if os.path.exists(result_path):
	            answer = input(f"{result_path} already exists, Overwrite[o] or Skip[s]? ")

	            while answer not in ("o", "s"):
	                answer = input("Invalid answer, please choose between 'o' and 's'")

	            if answer == "s":
	                continue
	            os.remove(result_path)

	        try:
	            # check=True turns a non-zero exit status into CalledProcessError;
	            # OSError covers the interpreter not being launchable at all.
	            subprocess.run(_build_command(model, dataset, result_path), check=True)
	        except (subprocess.CalledProcessError, OSError) as e:
	            df_to_be_skipped = _record_status(df_to_be_skipped, model, dataset, "error")
	            error_path = _result_path(args.output_path, model, dataset, "error")
	            with open(error_path, "w", encoding="utf-8") as error_log:
	                error_log.write(str(e))
	            print(
	                f"Something went wrong while {message}, error is logged at {error_path}. "
	                "continuing with the next model...")
	        else:
	            df_to_be_skipped = _record_status(df_to_be_skipped, model, dataset, "done")


	if __name__ == "__main__":
	    main()