|
from itertools import product |
|
|
|
import pandas as pd |
|
import numpy as np |
|
from scipy.spatial.distance import cosine |
|
|
|
from nltk.corpus import framenet as fn |
|
|
|
from sociofillmore.common.analyze_text import read_frames_of_interest |
|
|
|
|
|
# Cosine-distance thresholds below which a wrong prediction is still
# counted as a "close call" in the embedding-based evaluation.
COSINE_THRESH = [0.1, 0.2, 0.3, 0.4, 0.5]


# Per-dataset mapping of model name -> CSV file holding that model's frame
# predictions ("frame_pred") and gold labels ("frame_gold").
PREDICTION_FILES = {
    "evalita-dev": {
        "stupid-svm": "../stupid-svm-frameid/evalita_predictions.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_dev.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_dev.csv",
    },
    "evalita-test": {
        "stupid-svm": "../stupid-svm-frameid/evalita_predictions_test.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_test.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_test.csv",
    },
    "rai_femicides": {
        "stupid-svm": "../stupid-svm-frameid/rai_predictions.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_rai.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_rai.csv",
    },
}
|
|
|
|
|
def load_embeddings(embedding_file):
    """Load frame embeddings from a whitespace-separated text file.

    Each line is expected to look like::

        <frame_name> <word1+word2+...> <dim1> <dim2> ...

    Args:
        embedding_file: path to the UTF-8 embedding file.

    Returns:
        A tuple ``(vectors, frames_to_idxs)`` where ``vectors`` is a
        ``(num_frames, dim)`` float64 array and ``frames_to_idxs`` maps
        each frame name to its row index in ``vectors``.
    """
    frame_vocab = []
    vectors = []

    with open(embedding_file, encoding="utf-8") as f:
        for line in f:
            columns = line.split()
            # columns[1] holds the lexical units ("word1+word2"); it is not
            # needed for the frame-level embedding matrix, so it is skipped.
            frame_vocab.append(columns[0])
            vectors.append(np.array([float(i) for i in columns[2:]]))

    frames_to_idxs = {frame: i for i, frame in enumerate(frame_vocab)}

    return np.array(vectors, dtype=np.float64), frames_to_idxs
|
|
|
|
|
def femicide_frame_distances(embeddings, frame_to_idx):
    """Print the pairwise cosine distances between all femicide-related frames."""
    frames = list(read_frames_of_interest("femicides/rai"))
    print("Cosines: ")
    for frame_a in frames:
        vec_a = embeddings[frame_to_idx[frame_a]]
        for frame_b in frames:
            vec_b = embeddings[frame_to_idx[frame_b]]
            print(f"\t{frame_a}-{frame_b}: {cosine(vec_a, vec_b):.4f}")
|
|
|
|
|
def embedding_scores(predictions, embeddings, frame_to_idx):
    """Report accuracy, near-miss counts, and mean cosine distance.

    A "close call" is a wrong prediction whose embedding lies within a
    cosine-distance threshold (from ``COSINE_THRESH``) of the gold frame's
    embedding.
    """
    n_total = len(predictions)
    n_correct = 0
    near_misses = dict.fromkeys(COSINE_THRESH, 0)
    dist_sum = 0.0

    for _, row in predictions.iterrows():
        pred_frame = row["frame_pred"]
        gold_frame = row["frame_gold"]
        dist = cosine(
            embeddings[frame_to_idx[pred_frame]], embeddings[frame_to_idx[gold_frame]]
        )
        if pred_frame == gold_frame:
            n_correct += 1
        else:
            for thresh in COSINE_THRESH:
                if dist < thresh:
                    near_misses[thresh] += 1
        dist_sum += dist

    print("#correct: ", n_correct / n_total)
    print("#close calls: ")
    for thresh in COSINE_THRESH:
        print("\t", thresh, near_misses[thresh] / n_total)
    print("#correct or close: ")
    for thresh in COSINE_THRESH:
        print("\t", thresh, (n_correct + near_misses[thresh]) / n_total)
    print("avg cosine dist: ", dist_sum / n_total)
|
|
|
|
|
def generalization_exp(predictions, evalita_train_counts, fn_frames, femicide_frames):
    """Report prediction counts and accuracy on nested gold-frame subsets.

    Subsets (printed in this order):
        ALL: every prediction;
        IFN: gold frame appears in the EVALITA training set;
        BFN: gold frame exists in Berkeley FrameNet;
        RAI: gold frame is relevant to the RAI femicide corpus.

    Args:
        predictions: DataFrame with "frame_gold" and "frame_pred" columns.
        evalita_train_counts: DataFrame with a "label" column listing the
            frames seen in the EVALITA training data.
        fn_frames: collection of Berkeley FrameNet frame names.
        femicide_frames: collection of femicide-relevant frame names.
    """
    subsets = [
        predictions,
        predictions[predictions["frame_gold"].isin(evalita_train_counts["label"])],
        predictions[predictions["frame_gold"].isin(fn_frames)],
        predictions[predictions["frame_gold"].isin(femicide_frames)],
    ]

    print("LEN (ALL/IFN/BFN/RAI:)")
    print("\t".join(str(len(preds)) for preds in subsets))

    print("ACC (ALL/IFN/BFN/RAI:)")
    accuracies = []
    for preds in subsets:
        if len(preds):
            n_correct = len(preds[preds["frame_gold"] == preds["frame_pred"]])
            accuracies.append(str(n_correct / len(preds)))
        else:
            # Guard against empty subsets: the original code raised
            # ZeroDivisionError when no gold frame fell into a subset.
            accuracies.append("n/a")
    print("\t".join(accuracies))
|
|
|
|
|
def main():
    """Run the generalization experiment for every dataset/model pair.

    Loads the shared resources once (EVALITA training-set frame counts,
    Berkeley FrameNet frame names, femicide-relevant frames), then evaluates
    each model's prediction file listed in ``PREDICTION_FILES``.
    """
    # Frames observed in the EVALITA training data (one read; the original
    # code loaded this CSV twice).
    evalita_train_counts = pd.read_csv(
        "output/femicides/compare_lome_models/evalita_trainset_counts.csv"
    )

    # All frame names defined in Berkeley FrameNet.
    fn_frames = {fr.name for fr in fn.frames()}
    # Frames relevant to the femicide corpus (loaded once, outside the loop;
    # the original code re-read this on every model iteration).
    femicide_frames = read_frames_of_interest("femicides/rai")

    for dataset in PREDICTION_FILES:
        print(f"==={dataset}===")
        for model, predictions_file in PREDICTION_FILES[dataset].items():
            print(f"---{model}---")

            predictions = pd.read_csv(predictions_file, index_col=0)
            print("Total predictions:", len(predictions))

            # For the RAI femicide corpus, restrict evaluation to rows whose
            # gold frame is one of the frames of interest.
            if dataset == "rai_femicides":
                predictions = predictions[
                    predictions["frame_gold"].isin(femicide_frames)
                ]

            generalization_exp(
                predictions, evalita_train_counts, fn_frames, femicide_frames
            )

            print()
            print()
|
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    main()
|
|