Spaces:

spark-ds549
/

Epik

Sleeping

File size: 8,231 Bytes

import os
import pickle
import tempfile
import gradio as gr
from tqdm import tqdm
from app.utils import (
    create_input_instruction,
    format_prediction_ouptut,
    remove_temp_dir,
    decode_numeric_label,
    decode_speaker_role,
    display_sentiment_score_table,
    sentiment_flow_plot,
    EXAMPLE_CONVERSATIONS,
)
from fairseq.data.data_utils import collate_tokens

import sys

sys.path.insert(0, "../")  # necessary to load modules outside of app

from app import roberta, comet, COSMIC_MODEL, cosmic_args
from preprocessing import preprocess
from Model.COSMIC.erc_training.predict_epik import predict, get_valid_dataloader


def cosmic_preprocess(input, dir="."):
    """Convert raw conversation text into the pickle files used downstream.

    Args:
        input: Conversation text, handed to ``preprocess.process_user_input``.
        dir: Directory in which every intermediate file is written.

    Returns:
        A ``(pickle_path, sentences_pickle_path)`` tuple pointing at
        ``epik.pkl`` and ``epik_sentences.pkl`` inside ``dir``.

    Raises:
        gr.Error: If the parser reports ``success`` as falsy; the parser's
            ``message`` is used as the error text.
    """
    parsed = preprocess.process_user_input(input)
    if not parsed["success"]:
        raise gr.Error(parsed["message"])

    # Column names shared by the CSV -> pickle conversions below.
    id_col = "ConversationId"
    text_col = "Text"
    role_col = "ParticipantRole"
    role_encoded_col = "ParticipantRoleEncoded"
    label_col = "LabelNumeric"

    # Step 1: write the parsed conversation out as a CSV file.
    csv_path = os.path.join(dir, "epik.csv")
    grouped_df = preprocess.preapre_csv(parsed["data"], csv_path, with_label=False)

    # Step 2: CSV -> pickle holding speakers, labels and sentences.
    pickle_dest = os.path.join(dir, "epik.pkl")
    preprocess.convert_to_pickle(
        source=csv_path,
        dest=pickle_dest,
        index_col=id_col,
        list_type_columns=[text_col, role_encoded_col, label_col],
        order=[role_encoded_col, label_col, text_col],
        exclude=[role_col],
    )

    # Step 3: write the id split files. All ids are meant to end up in the
    # validation set (the 0, 0, 1 arguments are presumably the
    # train/test/validation proportions -- confirm against preprocess).
    conversation_ids = grouped_df[id_col].to_list()
    preprocess.split_and_save_ids(conversation_ids, 0, 0, 1, dir=dir)

    # Step 4: fold the id lists into the main pickle.
    preprocess.merge_pkl_with_ids(
        pickle_src=pickle_dest,
        ids_files=["train_set.txt", "test_set.txt", "validation_set.txt"],
        dir=dir,
    )

    # Step 5: a second pickle that keeps only the sentences.
    sentences_pkl_path = os.path.join(dir, "epik_sentences.pkl")
    preprocess.convert_to_pickle(
        source=csv_path,
        dest=sentences_pkl_path,
        index_col=id_col,
        list_type_columns=[text_col],
        exclude=[role_col, role_encoded_col, label_col],
    )

    return pickle_dest, sentences_pkl_path


def cosmic_roberta_extract(path, dest_dir="."):
    """Extract RoBERTa features for every conversation stored in ``path``.

    Args:
        path: Pickle file containing, in order, ``speakers``, ``labels``,
            ``sentences``, ``train_ids``, ``test_ids`` and ``valid_ids``.
        dest_dir: Directory in which the feature pickle is written.

    Returns:
        Path of the written ``epik_features_roberta.pkl`` file. It stores
        the original fields plus four feature dicts keyed by conversation id.
    """
    # NOTE: pickle must only be used on trusted files; ``path`` is expected
    # to be a file this application produced itself.
    with open(path, "rb") as src:
        speakers, labels, sentences, train_ids, test_ids, valid_ids = pickle.load(src)

    roberta1, roberta2, roberta3, roberta4 = {}, {}, {}, {}
    # Store k receives features taken from feat[-k] (k-th entry from the end).
    layer_stores = (roberta1, roberta2, roberta3, roberta4)

    for item in tqdm(train_ids + test_ids + valid_ids):
        # Drop every non-ASCII character before tokenising.
        sent = [
            s.encode("ascii", errors="ignore").decode("utf-8")
            for s in sentences[item]
        ]
        batch = collate_tokens([roberta.encode(s) for s in sent], pad_idx=1)
        feat = roberta.extract_features(batch, return_all_hiddens=True)
        for depth, store in enumerate(layer_stores, start=1):
            # Index 0 on the second axis is kept for each utterance
            # (presumably the leading <s> token embedding -- confirm).
            store[item] = list(feat[-depth][:, 0, :].detach().numpy())

    roberta_feature_path = os.path.join(dest_dir, "epik_features_roberta.pkl")
    payload = [
        speakers,
        labels,
        roberta1,
        roberta2,
        roberta3,
        roberta4,
        sentences,
        train_ids,
        test_ids,
        valid_ids,
    ]
    # The context manager guarantees the file is flushed and closed before
    # the caller reads it back.
    with open(roberta_feature_path, "wb") as dst:
        pickle.dump(payload, dst)

    return roberta_feature_path


def cosmic_comet_extract(path, dir="."):
    """Run the COMET extractor on pickled sentences and save the result.

    Args:
        path: Pickle file holding the sentences passed to ``comet.extract``.
        dir: Directory in which the feature pickle is written.

    Returns:
        Path of the written ``epik_features_comet.pkl`` file.
    """
    print("Extracting features in", path)
    # NOTE: pickle must only be used on trusted files; ``path`` is expected
    # to be a file this application produced itself.
    with open(path, "rb") as src:
        sentences = pickle.load(src)
    features = comet.extract(sentences)

    comet_feature_path = os.path.join(dir, "epik_features_comet.pkl")
    # Close (and therefore flush) the file before handing its path back.
    with open(comet_feature_path, "wb") as dst:
        pickle.dump(features, dst)

    return comet_feature_path


def cosmic_classifier(input):
    """Predict a sentiment label for each message of a conversation.

    The input is preprocessed into a fresh temporary directory, RoBERTa and
    COMET features are extracted from it, and the COSMIC model is run on the
    result. The temporary directory is removed afterwards, whether or not
    the pipeline succeeded.

    Args:
        input: Conversation text entered by the user.

    Returns:
        The value produced by ``format_prediction_ouptut`` for the first
        conversation's speaker roles, sentences and predicted labels.

    Raises:
        gr.Error: Propagated from ``cosmic_preprocess`` when the input
            cannot be parsed.
    """
    # create a temporary directory for the input data
    temp_dir = tempfile.mkdtemp(dir=os.getcwd(), prefix="temp")

    try:
        epik_path, epik_sentences_path = cosmic_preprocess(input, temp_dir)

        roberta_path = cosmic_roberta_extract(epik_path, temp_dir)
        comet_path = cosmic_comet_extract(epik_sentences_path, temp_dir)

        # use cosmic model to make predictions
        data_loader, ids = get_valid_dataloader(roberta_path, comet_path)
        predictions = predict(COSMIC_MODEL, data_loader, cosmic_args)

        with open(epik_path, "rb") as epik_file:
            speakers, _, sentences, _, _, _ = pickle.load(epik_file)

        # Assuming that there's only one conversation
        conv_id = ids[0]
        speaker_roles = [
            decode_speaker_role(numeric_role) for numeric_role in speakers[conv_id]
        ]
        labels = [decode_numeric_label(pred) for pred in predictions[0]]
        return format_prediction_ouptut(speaker_roles, sentences[conv_id], labels)
    finally:
        # Always clean up, otherwise every failed request (e.g. invalid
        # input) would leave a temp directory behind in the working dir.
        print()
        print("======= Removing Temporary Directory =======")
        remove_temp_dir(temp_dir)


def cosmic_ui():
    """Build the Gradio Blocks page for the COSMIC model.

    The page contains an introduction, an example selector, a conversation
    text area, a read-only prediction box and a sentiment flow plot.

    Returns:
        The ``gr.Blocks`` instance holding the whole page.
    """
    with gr.Blocks() as cosmic_model:
        # The citation text starts on the line after the opening fence:
        # anything on the fence line itself is parsed as the fence's info
        # string and is not rendered.
        gr.Markdown(
            """
            # COSMIC
            COSMIC is a popular model for predicting sentiment labels using the entire
            context of the conversation. In other words, it analyzes the previous
            messages to predict the sentiment label for the current message.<br/>
            The model was adopted from this
            [repo](https://github.com/declare-lab/conv-emotion.git), implemented based
            on this research [paper](https://arxiv.org/pdf/2010.02795.pdf).
            
            ```bash
            COSMIC: COmmonSense knowledge for eMotion Identification in
            Conversations. D. Ghosal, N. Majumder, A. Gelbukh, R. Mihalcea, & S. Poria. Findings of EMNLP 2020.
            ```
            """
        )

        create_input_instruction()
        with gr.Row():
            with gr.Column():
                example_dropdown = gr.Dropdown(
                    choices=["-- Not Selected --"] + list(EXAMPLE_CONVERSATIONS.keys()),
                    value="-- Not Selected --",
                    label="Select an example",
                )

                gr.Markdown('<p style="text-align: center;color: gray;">--- OR ---</p>')

                conversation_input = gr.TextArea(
                    value="",
                    label="Input your conversation",
                    placeholder="Please input your conversation here",
                    lines=15,
                    max_lines=15,
                )

                def on_example_change(input):
                    """Return the selected example's text, or "" if none."""
                    if input in EXAMPLE_CONVERSATIONS:
                        return EXAMPLE_CONVERSATIONS[input]

                    return ""

                # Picking an example fills the conversation text area.
                example_dropdown.input(
                    on_example_change,
                    inputs=example_dropdown,
                    outputs=conversation_input,
                )

            with gr.Column():
                output = gr.Textbox(
                    value="",
                    label="Predicted Sentiment Labels",
                    lines=22,
                    max_lines=22,
                    interactive=False,
                )
        submit_btn = gr.Button(value="Submit")
        submit_btn.click(cosmic_classifier, conversation_input, output)

        gr.Markdown("# Sentiment Flow Plot")
        with gr.Row():
            with gr.Column(scale=1):
                display_sentiment_score_table()
            with gr.Column(scale=2):
                plot_box = gr.Plot(label="Analysis Plot")

        plot_btn = gr.Button(value="Plot Sentiment Flow")
        plot_btn.click(sentiment_flow_plot, inputs=[output], outputs=[plot_box])

        # reset all outputs whenever a change in the input is detected.
        # A single handler clears both the prediction text and the plot, so
        # no separate text-only reset handler is needed.
        conversation_input.change(
            lambda x: ("", None),
            conversation_input,
            outputs=[output, plot_box],
        )
    return cosmic_model