|
from itertools import product |
|
|
|
import pandas as pd |
|
import numpy as np |
|
from scipy.spatial.distance import cosine |
|
|
|
from nltk.corpus import framenet as fn |
|
|
|
from sociofillmore.common.analyze_text import read_frames_of_interest |
|
|
|
|
|
# Cosine-distance thresholds below which a wrong prediction is still
# counted as a "close call" in the embedding-based evaluation.
COSINE_THRESH = [0.1, 0.2, 0.3, 0.4, 0.5]


# Per-dataset mapping of model name -> CSV file holding that model's frame
# predictions ("frame_pred") and gold labels ("frame_gold").
PREDICTION_FILES = {
    "evalita-dev": {
        "stupid-svm": "../stupid-svm-frameid/evalita_predictions.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_dev.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_dev.csv",
    },
    "evalita-test": {
        "stupid-svm": "../stupid-svm-frameid/evalita_predictions_test.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_test.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_test.csv",
    },
    "rai_femicides": {
        "stupid-svm": "../stupid-svm-frameid/rai_predictions.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_rai.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_rai.csv",
    },
}
|
|
|
|
|
def load_embeddings(embedding_file):
    """Load frame embeddings from a whitespace-separated text file.

    Each line is expected to look like::

        <frame_name> <word1+word2+...> <dim1> <dim2> ...

    Args:
        embedding_file: path to the UTF-8 embedding file.

    Returns:
        A tuple ``(vectors, frames_to_idxs)`` where ``vectors`` is a
        ``(num_frames, dim)`` float64 array and ``frames_to_idxs`` maps
        each frame name to its row index in ``vectors``.
    """
    frame_vocab = []
    vectors = []

    with open(embedding_file, encoding="utf-8") as f:
        for line in f:
            columns = line.split()
            # columns[1] holds the lexical units ("word1+word2"); it is not
            # needed for the frame-level embedding matrix, so it is skipped.
            frame_vocab.append(columns[0])
            vectors.append(np.array([float(i) for i in columns[2:]]))

    frames_to_idxs = {frame: i for i, frame in enumerate(frame_vocab)}

    return np.array(vectors, dtype=np.float64), frames_to_idxs
|
|
|
|
|
def femicide_frame_distances(embeddings, frame_to_idx):
    """Print the pairwise cosine distances between all femicide-related frames."""
    frames = list(read_frames_of_interest("femicides/rai"))
    print("Cosines: ")
    for frame_a in frames:
        vec_a = embeddings[frame_to_idx[frame_a]]
        for frame_b in frames:
            vec_b = embeddings[frame_to_idx[frame_b]]
            print(f"\t{frame_a}-{frame_b}: {cosine(vec_a, vec_b):.4f}")
|
|
|
|
|
def embedding_scores(predictions, embeddings, frame_to_idx):
    """Report accuracy, near-miss counts, and mean cosine distance.

    A "close call" is a wrong prediction whose embedding lies within a
    cosine-distance threshold (from ``COSINE_THRESH``) of the gold frame's
    embedding.
    """
    n_total = len(predictions)
    n_correct = 0
    near_misses = dict.fromkeys(COSINE_THRESH, 0)
    dist_sum = 0.0

    for _, row in predictions.iterrows():
        pred_frame = row["frame_pred"]
        gold_frame = row["frame_gold"]
        dist = cosine(
            embeddings[frame_to_idx[pred_frame]], embeddings[frame_to_idx[gold_frame]]
        )
        if pred_frame == gold_frame:
            n_correct += 1
        else:
            for thresh in COSINE_THRESH:
                if dist < thresh:
                    near_misses[thresh] += 1
        dist_sum += dist

    print("#correct: ", n_correct / n_total)
    print("#close calls: ")
    for thresh in COSINE_THRESH:
        print("\t", thresh, near_misses[thresh] / n_total)
    print("#correct or close: ")
    for thresh in COSINE_THRESH:
        print("\t", thresh, (n_correct + near_misses[thresh]) / n_total)
    print("avg cosine dist: ", dist_sum / n_total)
|
|
|
|
|
def generalization_exp(predictions, evalita_train_counts, fn_frames, femicide_frames):
    """Report prediction counts and accuracy on nested gold-frame subsets.

    Subsets (printed in this order):
        ALL: every prediction;
        IFN: gold frame appears in the EVALITA training set;
        BFN: gold frame exists in Berkeley FrameNet;
        RAI: gold frame is relevant to the RAI femicide corpus.

    Args:
        predictions: DataFrame with "frame_gold" and "frame_pred" columns.
        evalita_train_counts: DataFrame with a "label" column listing the
            frames seen in the EVALITA training data.
        fn_frames: collection of Berkeley FrameNet frame names.
        femicide_frames: collection of femicide-relevant frame names.
    """
    subsets = [
        predictions,
        predictions[predictions["frame_gold"].isin(evalita_train_counts["label"])],
        predictions[predictions["frame_gold"].isin(fn_frames)],
        predictions[predictions["frame_gold"].isin(femicide_frames)],
    ]

    print("LEN (ALL/IFN/BFN/RAI:)")
    print("\t".join(str(len(preds)) for preds in subsets))

    print("ACC (ALL/IFN/BFN/RAI:)")
    accuracies = []
    for preds in subsets:
        if len(preds):
            n_correct = len(preds[preds["frame_gold"] == preds["frame_pred"]])
            accuracies.append(str(n_correct / len(preds)))
        else:
            # Guard against empty subsets: the original code raised
            # ZeroDivisionError when no gold frame fell into a subset.
            accuracies.append("n/a")
    print("\t".join(accuracies))
|
|
|
|
|
def main():
    """Run the generalization experiment for every dataset/model pair.

    Loads the shared resources once (EVALITA training-set frame counts,
    Berkeley FrameNet frame names, femicide-relevant frames), then evaluates
    each model's prediction file listed in ``PREDICTION_FILES``.
    """
    # Frames observed in the EVALITA training data (one read; the original
    # code loaded this CSV twice).
    evalita_train_counts = pd.read_csv(
        "output/femicides/compare_lome_models/evalita_trainset_counts.csv"
    )

    # All frame names defined in Berkeley FrameNet.
    fn_frames = {fr.name for fr in fn.frames()}
    # Frames relevant to the femicide corpus (loaded once, outside the loop;
    # the original code re-read this on every model iteration).
    femicide_frames = read_frames_of_interest("femicides/rai")

    for dataset in PREDICTION_FILES:
        print(f"==={dataset}===")
        for model, predictions_file in PREDICTION_FILES[dataset].items():
            print(f"---{model}---")

            predictions = pd.read_csv(predictions_file, index_col=0)
            print("Total predictions:", len(predictions))

            # For the RAI femicide corpus, restrict evaluation to rows whose
            # gold frame is one of the frames of interest.
            if dataset == "rai_femicides":
                predictions = predictions[
                    predictions["frame_gold"].isin(femicide_frames)
                ]

            generalization_exp(
                predictions, evalita_train_counts, fn_frames, femicide_frames
            )

            print()
            print()
|
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    main()
|
|