# (Non-code page residue captured together with this file: "Spaces: Build error / Build error".)
from itertools import product

import numpy as np
import pandas as pd
from nltk.corpus import framenet as fn
from scipy.spatial.distance import cosine

from sociofillmore.common.analyze_text import read_frames_of_interest
# Cosine-distance cut-offs: embedding_scores counts a wrong prediction as a
# "close call" for every threshold its distance to the gold frame stays below.
COSINE_THRESH = [0.1, 0.2, 0.3, 0.4, 0.5]

# Prediction CSVs to evaluate, keyed as dataset name -> model name -> file path.
# The paths are relative, so they resolve against the current working directory.
PREDICTION_FILES = {
    "evalita-dev": {
        "stupid-svm": "../stupid-svm-frameid/evalita_predictions.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_dev.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_dev.csv",
    },
    "evalita-test": {
        "stupid-svm": "../stupid-svm-frameid/evalita_predictions_test.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_test.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_test.csv",
    },
    "rai_femicides": {
        "stupid-svm": "../stupid-svm-frameid/rai_predictions.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_rai.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_rai.csv",
    },
}
def load_embeddings(embedding_file):
    """Load frame embeddings from a whitespace-separated text file.

    Each non-blank line is read as: a frame name, a second column (a
    ``+``-joined word list, which is not used here), and then the vector
    components. Blank or whitespace-only lines are skipped.

    Args:
        embedding_file: Path of the UTF-8 text file to read.

    Returns:
        A ``(vectors, frames_to_idxs)`` tuple: a ``float64`` array with one
        row per line read, and a dict mapping each frame name to its row
        index. If a frame name occurs on several lines, the last one wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    frame_vocab = []
    vectors = []
    with open(embedding_file, encoding="utf-8") as f:
        for line in f:
            columns = line.split()
            if not columns:
                # Blank line (e.g. a stray newline at the end of the file).
                continue
            frame_vocab.append(columns[0])
            # columns[1] is the "+"-joined word list; only the numbers after it are needed.
            vectors.append([float(value) for value in columns[2:]])

    frames_to_idxs = {frame: i for i, frame in enumerate(frame_vocab)}
    return np.array(vectors, dtype=np.float64), frames_to_idxs
def femicide_frame_distances(embeddings, frame_to_idx):
    """Print the cosine distance between every ordered pair of femicide frames.

    The frames are obtained from ``read_frames_of_interest("femicides/rai")``.
    All ordered pairs are printed, so each frame is also paired with itself
    and every unordered pair appears twice.

    Args:
        embeddings: Indexable collection of vectors; ``embeddings[i]`` is the
            vector of the frame whose index is ``i``.
        frame_to_idx: Mapping from frame name to its index in ``embeddings``.

    Raises:
        KeyError: If ``frame_to_idx`` is a dict and lacks one of the frames.
    """
    femicide_frames = read_frames_of_interest("femicides/rai")
    print("Cosines: ")
    for fr1, fr2 in product(femicide_frames, femicide_frames):
        vec1 = embeddings[frame_to_idx[fr1]]
        vec2 = embeddings[frame_to_idx[fr2]]
        dist = cosine(vec1, vec2)
        print(f"\t{fr1}-{fr2}: {dist:.4f}")
def embedding_scores(predictions, embeddings, frame_to_idx):
    """Print accuracy and embedding-based "close call" rates for predictions.

    For every row the cosine distance between the embeddings of the predicted
    and the gold frame is computed. A wrong prediction counts as a "close
    call" for each threshold in ``COSINE_THRESH`` that its distance is below.
    All printed rates are fractions of the total number of predictions.

    Args:
        predictions: DataFrame with ``frame_pred`` and ``frame_gold`` columns.
        embeddings: Indexable collection of vectors, addressed via ``frame_to_idx``.
        frame_to_idx: Mapping from frame name to its index in ``embeddings``.

    Returns:
        None; all results are printed. If ``predictions`` is empty, a short
        notice is printed instead of the scores.
    """
    total = len(predictions)
    if total == 0:
        # Every figure below is divided by the number of predictions.
        print("No predictions to score.")
        return

    correct = 0
    close_calls = {threshold: 0 for threshold in COSINE_THRESH}
    total_dist = 0.0

    for predicted, gold in zip(predictions["frame_pred"], predictions["frame_gold"]):
        dist = cosine(
            embeddings[frame_to_idx[predicted]], embeddings[frame_to_idx[gold]]
        )
        if predicted == gold:
            correct += 1
        else:
            for threshold in COSINE_THRESH:
                if dist < threshold:
                    close_calls[threshold] += 1
        # The average distance is taken over all predictions, correct ones included.
        total_dist += dist

    print("#correct: ", correct / total)
    print("#close calls: ")
    for threshold in COSINE_THRESH:
        print("\t", threshold, (close_calls[threshold]) / total)
    print("#correct or close: ")
    for threshold in COSINE_THRESH:
        print("\t", threshold, (correct + close_calls[threshold]) / total)
    print("avg cosine dist: ", total_dist / total)
def _subset_accuracy(preds):
    """Return the fraction of rows whose gold and predicted frames match (NaN if empty)."""
    if len(preds) == 0:
        return float("nan")
    n_correct = int((preds["frame_gold"] == preds["frame_pred"]).sum())
    return n_correct / len(preds)


def generalization_exp(predictions, evalita_train_counts, fn_frames, femicide_frames):
    """Print size and accuracy of four gold-frame subsets of the predictions.

    The subsets, in output order (ALL/IFN/BFN/RAI), are: all predictions;
    rows whose gold frame occurs in ``evalita_train_counts["label"]``; rows
    whose gold frame is in ``fn_frames``; rows whose gold frame is in
    ``femicide_frames``. Sizes and accuracies are each printed as one
    tab-separated line below a header line. An empty subset is reported with
    accuracy ``nan`` rather than raising ``ZeroDivisionError``.

    Args:
        predictions: DataFrame with ``frame_gold`` and ``frame_pred`` columns.
        evalita_train_counts: DataFrame with a ``label`` column of frame names.
        fn_frames: Set or list-like of frame names.
        femicide_frames: Set or list-like of frame names.
    """
    gold = predictions["frame_gold"]
    subsets = [
        predictions,
        predictions[gold.isin(evalita_train_counts["label"])],
        predictions[gold.isin(fn_frames)],
        predictions[gold.isin(femicide_frames)],
    ]

    print("LEN (ALL/IFN/BFN/RAI:)")
    print("\t".join(str(len(preds)) for preds in subsets))
    print("ACC (ALL/IFN/BFN/RAI:)")
    print("\t".join(str(_subset_accuracy(preds)) for preds in subsets))
def main():
    """Run the generalization experiment for every dataset/model in PREDICTION_FILES.

    For each prediction CSV the number of predictions is printed, followed by
    the output of ``generalization_exp``. For the ``rai_femicides`` dataset
    the predictions are first restricted to rows whose gold frame is one of
    the femicide frames of interest.
    """
    # Shared inputs are loaded once, up front; they do not depend on dataset or model.
    evalita_train_counts = pd.read_csv(
        "output/femicides/compare_lome_models/evalita_trainset_counts.csv"
    )
    fn_frames = {fr.name for fr in fn.frames()}
    # NOTE(review): assumes read_frames_of_interest returns the same reusable
    # collection on every call, so reading it once here is enough - confirm.
    femicide_frames = read_frames_of_interest("femicides/rai")

    for dataset, model_files in PREDICTION_FILES.items():
        print(f"==={dataset}===")
        for model, predictions_file in model_files.items():
            print(f"---{model}---")
            predictions = pd.read_csv(predictions_file, index_col=0)
            print("Total predictions:", len(predictions))

            # Disabled embedding-based analysis, kept for reference:
            # predictions_with_fn_frames = predictions[
            #     predictions["frame_gold"].isin(fn_frames)
            #     & predictions["frame_pred"].isin(fn_frames)
            # ]
            # print("Predictions with FN frames: ", len(predictions_with_fn_frames))
            # errors = predictions[predictions["frame_gold"] != predictions["frame_pred"]]
            # print("Total errors: ", len(errors))
            # errors_with_fn_frames = errors[
            #     errors["frame_gold"].isin(fn_frames) & errors["frame_pred"].isin(fn_frames)
            # ]
            # print("Errors with FN frames: ", len(errors_with_fn_frames))
            # print("Loading embeddings...")
            # embeddings, frame_to_idx = load_embeddings(
            #     "../bert-for-framenet/data/embeddings/bag_of_lu_embeddings.txt"
            # )
            # # femicide_frame_distances(embeddings, frame_to_idx)
            # embedding_scores(predictions_with_fn_frames, embeddings, frame_to_idx)

            if dataset == "rai_femicides":
                predictions = predictions[predictions["frame_gold"].isin(femicide_frames)]

            generalization_exp(
                predictions, evalita_train_counts, fn_frames, femicide_frames
            )
            print()
        print()
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()