Spaces:

responsibility-framing
/

sociofillmore_public

Build error

File size: 5,978 Bytes

b11ac48

from itertools import product

import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

from nltk.corpus import framenet as fn

from sociofillmore.common.analyze_text import read_frames_of_interest


# Cosine-distance cut-offs used by embedding_scores(): a wrong prediction is
# counted as a "close call" at a given cut-off when its distance to the gold
# frame is strictly below that cut-off.
COSINE_THRESH = [0.1, 0.2, 0.3, 0.4, 0.5]


# Prediction CSVs evaluated by main(), keyed as
# dataset name -> model name -> file path.
# main() reads each file with pd.read_csv(..., index_col=0) and applies an
# extra gold-frame filter when the dataset key is "rai_femicides".
# NOTE(review): the paths are relative, presumably to the directory the
# script is launched from -- confirm before running from elsewhere.
PREDICTION_FILES = {
    "evalita-dev": {
        "stupid-svm": "../stupid-svm-frameid/evalita_predictions.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_dev.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_dev.csv",
    },
    "evalita-test": {
        "stupid-svm": "../stupid-svm-frameid/evalita_predictions_test.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_test.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_test.csv",
    },
    "rai_femicides": {
        "stupid-svm": "../stupid-svm-frameid/rai_predictions.csv",
        "lome-en": "misc/frame_prediction_output_lome-en_rai.csv",
        "lome-it": "misc/frame_prediction_output_lome-it-best_rai.csv",
    },
}


def load_embeddings(embedding_file):
    """Load frame embeddings from a whitespace-separated text file.

    Each non-blank line is expected to have the layout::

        <frame> <word1+word2+...> <v1> <v2> ... <vn>

    The second column (the "+"-joined words) is not needed by any caller
    and is skipped. Blank or whitespace-only lines are ignored.

    Args:
        embedding_file: Path of the UTF-8 encoded embedding file.

    Returns:
        A tuple ``(vectors, frames_to_idxs)``. ``vectors`` is a ``float64``
        array with one row per parsed line; ``frames_to_idxs`` maps each
        frame name to its row index. If a frame name occurs on several
        lines, the index of its last occurrence is kept.

    Raises:
        ValueError: If a non-blank line has fewer than three columns, or a
            vector component cannot be parsed as a float.
    """
    vectors = []
    frames_to_idxs = {}

    with open(embedding_file, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            columns = line.split()
            if not columns:
                # A trailing newline or spacer line carries no embedding.
                continue
            if len(columns) < 3:
                raise ValueError(
                    f"{embedding_file}:{line_no}: expected "
                    f"'<frame> <words> <vector...>', got {len(columns)} column(s)"
                )

            # The row index is the number of vectors collected so far, so the
            # mapping stays aligned with the array even when lines are skipped.
            frames_to_idxs[columns[0]] = len(vectors)
            vectors.append(np.array([float(value) for value in columns[2:]]))

    return np.array(vectors, dtype=np.float64), frames_to_idxs


def femicide_frame_distances(embeddings, frame_to_idx):
    """Print the cosine distance for every ordered pair of femicide frames.

    The frames are obtained from ``read_frames_of_interest("femicides/rai")``.
    Pairs are enumerated with ``itertools.product``, so each frame is also
    compared with itself and every unordered pair appears in both orders.

    Args:
        embeddings: Indexable collection of frame vectors.
        frame_to_idx: Mapping from frame name to its position in
            ``embeddings``.
    """
    frames_of_interest = read_frames_of_interest("femicides/rai")
    print("Cosines: ")
    for left, right in product(frames_of_interest, frames_of_interest):
        left_vec = embeddings[frame_to_idx[left]]
        right_vec = embeddings[frame_to_idx[right]]
        distance = cosine(left_vec, right_vec)
        print(f"\t{left}-{right}: {distance:.4f}")


def embedding_scores(predictions, embeddings, frame_to_idx):
    """Print accuracy and embedding-based "close call" rates for predictions.

    For every row, the cosine distance between the embeddings of the
    predicted and the gold frame is computed. A wrong prediction counts as a
    "close call" at a threshold from ``COSINE_THRESH`` when that distance is
    strictly below the threshold. All printed figures are fractions of the
    total number of predictions.

    Args:
        predictions: DataFrame with ``frame_pred`` and ``frame_gold`` columns.
        embeddings: Indexable collection of frame vectors.
        frame_to_idx: Mapping from frame name to its position in
            ``embeddings``; every predicted and gold frame must be present.

    Returns:
        None. Results are written to standard output. When ``predictions``
        is empty a short notice is printed instead of the statistics.
    """
    total = len(predictions)
    if total == 0:
        # Every statistic below divides by the number of predictions, so an
        # empty frame would otherwise end in ZeroDivisionError.
        print("No predictions to score")
        return

    correct = 0
    close_calls = dict.fromkeys(COSINE_THRESH, 0)
    total_dist = 0.0

    # Only these two columns are read, so iterate them directly instead of
    # materialising a Series per row with iterrows().
    for predicted, gold in zip(predictions["frame_pred"], predictions["frame_gold"]):
        dist = cosine(
            embeddings[frame_to_idx[predicted]], embeddings[frame_to_idx[gold]]
        )
        if predicted == gold:
            correct += 1
        else:
            for threshold in COSINE_THRESH:
                if dist < threshold:
                    close_calls[threshold] += 1
        # Correct predictions contribute their (near-zero) distance as well.
        total_dist += dist

    print("#correct: ", correct / total)
    print("#close calls: ")
    for threshold in COSINE_THRESH:
        print("\t", threshold, (close_calls[threshold]) / total)
    print("#correct or close: ")
    for threshold in COSINE_THRESH:
        print("\t", threshold, (correct + close_calls[threshold]) / total)
    print("avg cosine dist: ", total_dist / total)


def generalization_exp(predictions, evalita_train_counts, fn_frames, femicide_frames):
    """Print size and accuracy of the predictions for four gold-frame subsets.

    The subsets, in output order, are:

    * ALL: every row of ``predictions``;
    * IFN: rows whose gold frame occurs in ``evalita_train_counts["label"]``;
    * BFN: rows whose gold frame is in ``fn_frames``;
    * RAI: rows whose gold frame is in ``femicide_frames``.

    Two tab-separated lines are printed, each preceded by a header line:
    the subset sizes and the subset accuracies (share of rows where
    ``frame_gold`` equals ``frame_pred``). An empty subset has no defined
    accuracy and is reported as ``nan`` instead of raising
    ``ZeroDivisionError``.

    Args:
        predictions: DataFrame with ``frame_gold`` and ``frame_pred`` columns.
        evalita_train_counts: DataFrame with a ``label`` column of frame names.
        fn_frames: Collection of frame names accepted by ``Series.isin``.
        femicide_frames: Collection of frame names accepted by
            ``Series.isin``.
    """

    def accuracy(subset):
        # Guard the division: a subset can legitimately be empty, e.g. when
        # no gold frame of this dataset belongs to the frames of interest.
        if len(subset) == 0:
            return float("nan")
        hits = int((subset["frame_gold"] == subset["frame_pred"]).sum())
        return hits / len(subset)

    gold = predictions["frame_gold"]
    subsets = [
        predictions,
        predictions[gold.isin(evalita_train_counts["label"])],
        predictions[gold.isin(fn_frames)],
        predictions[gold.isin(femicide_frames)],
    ]

    print("LEN (ALL/IFN/BFN/RAI:)")
    print("\t".join(str(len(subset)) for subset in subsets))

    print("ACC (ALL/IFN/BFN/RAI:)")
    print("\t".join(str(accuracy(subset)) for subset in subsets))


def main():
    """Run the generalization experiment for every dataset/model pair.

    Loads the EVALITA training-set frame counts, the FrameNet frame names
    and the femicide frames of interest once, then walks through
    ``PREDICTION_FILES``. For each prediction CSV it prints the number of
    predictions and calls ``generalization_exp``. For the
    ``"rai_femicides"`` dataset the predictions are first restricted to rows
    whose gold frame is one of the femicide frames.
    """
    evalita_train_counts = pd.read_csv(
        "output/femicides/compare_lome_models/evalita_trainset_counts.csv"
    )
    fn_frames = {fr.name for fr in fn.frames()}
    # Loaded once: the list does not depend on the dataset or model.
    # NOTE(review): assumes read_frames_of_interest returns a reusable
    # collection (list/set), not a one-shot iterator -- confirm.
    femicide_frames = read_frames_of_interest("femicides/rai")

    for dataset, model_files in PREDICTION_FILES.items():
        print(f"==={dataset}===")
        for model, predictions_file in model_files.items():

            print(f"---{model}---")

            predictions = pd.read_csv(predictions_file, index_col=0)
            print("Total predictions:", len(predictions))

            # NOTE: disabled embedding-based evaluation, kept for reference.
            # predictions_with_fn_frames = predictions[
            #     predictions["frame_gold"].isin(fn_frames)
            #     & predictions["frame_pred"].isin(fn_frames)
            # ]
            # print("Predictions with FN frames: ", len(predictions_with_fn_frames))

            # errors = predictions[predictions["frame_gold"] != predictions["frame_pred"]]
            # print("Total errors: ", len(errors))

            # errors_with_fn_frames = errors[
            #     errors["frame_gold"].isin(fn_frames) & errors["frame_pred"].isin(fn_frames)
            # ]
            # print("Errors with FN frames: ", len(errors_with_fn_frames))

            # print("Loading embeddings...")
            # embeddings, frame_to_idx = load_embeddings(
            #     "../bert-for-framenet/data/embeddings/bag_of_lu_embeddings.txt"
            # )
            # # femicide_frame_distances(embeddings, frame_to_idx)
            # embedding_scores(predictions_with_fn_frames, embeddings, frame_to_idx)

            if dataset == "rai_femicides":
                # Only rows whose gold frame is a femicide frame are scored
                # for this dataset.
                predictions = predictions[predictions["frame_gold"].isin(femicide_frames)]

            generalization_exp(
                predictions, evalita_train_counts, fn_frames, femicide_frames
            )

            print()
        print()


# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()