Spaces:
Sleeping
Sleeping
import os | |
import pickle | |
import tempfile | |
import gradio as gr | |
from tqdm import tqdm | |
from app.utils import ( | |
create_input_instruction, | |
format_prediction_ouptut, | |
remove_temp_dir, | |
decode_numeric_label, | |
decode_speaker_role, | |
display_sentiment_score_table, | |
sentiment_flow_plot, | |
EXAMPLE_CONVERSATIONS, | |
) | |
from fairseq.data.data_utils import collate_tokens | |
import sys | |
sys.path.insert(0, "../") # neccesary to load modules outside of app | |
from app import roberta, comet, COSMIC_MODEL, cosmic_args | |
from preprocessing import preprocess | |
from Model.COSMIC.erc_training.predict_epik import predict, get_valid_dataloader | |
def cosmic_preprocess(input, dir="."):
    """Convert raw conversation text into the pickle files COSMIC consumes.

    The user input is parsed by ``preprocess.process_user_input``; the parsed
    data is written to ``epik.csv`` inside ``dir`` and then converted into two
    pickle files, ``epik.pkl`` and ``epik_sentences.pkl``.

    Args:
        input: Raw conversation text supplied by the user.
        dir: Directory in which every intermediate file is written.

    Returns:
        A ``(pickle_path, sentences_pickle_path)`` tuple.

    Raises:
        gr.Error: If ``process_user_input`` reports ``success`` as falsy; the
            error carries the parser's ``message``.
    """
    parsed = preprocess.process_user_input(input)
    if not parsed["success"]:
        raise gr.Error(parsed["message"])

    csv_path = os.path.join(dir, "epik.csv")
    pickle_dest = os.path.join(dir, "epik.pkl")
    sentences_pkl_path = os.path.join(dir, "epik_sentences.pkl")

    # Write the parsed conversation to a CSV file (no gold labels available).
    conversations = preprocess.preapre_csv(
        parsed["data"], csv_path, with_label=False
    )

    # CSV -> pickle holding speakers, labels and sentences (in that order).
    feature_columns = ["Text", "ParticipantRoleEncoded", "LabelNumeric"]
    feature_order = ["ParticipantRoleEncoded", "LabelNumeric", "Text"]
    preprocess.convert_to_pickle(
        source=csv_path,
        dest=pickle_dest,
        index_col="ConversationId",
        list_type_columns=feature_columns,
        order=feature_order,
        exclude=["ParticipantRole"],
    )

    # Only prediction is needed, so the ids are meant to land in the
    # validation set (presumably 0, 0, 1 are the train/test/validation
    # proportions -- confirm against preprocess.split_and_save_ids).
    conversation_ids = conversations["ConversationId"].to_list()
    preprocess.split_and_save_ids(conversation_ids, 0, 0, 1, dir=dir)

    # Attach the id lists to the feature pickle.
    id_files = ["train_set.txt", "test_set.txt", "validation_set.txt"]
    preprocess.merge_pkl_with_ids(
        pickle_src=pickle_dest,
        ids_files=id_files,
        dir=dir,
    )

    # Second pickle: the sentences only.
    non_text_columns = ["ParticipantRole", "ParticipantRoleEncoded", "LabelNumeric"]
    preprocess.convert_to_pickle(
        source=csv_path,
        dest=sentences_pkl_path,
        index_col="ConversationId",
        list_type_columns=["Text"],
        exclude=non_text_columns,
    )

    return pickle_dest, sentences_pkl_path
def cosmic_roberta_extract(path, dest_dir="."):
    """Extract RoBERTa features for every conversation stored in ``path``.

    For each conversation id, every sentence is encoded with the module-level
    ``roberta`` model and the vector at sequence position 0 of each of the
    last four hidden layers is kept (one list of per-sentence vectors per
    layer).

    Args:
        path: Pickle file containing, in order, ``speakers``, ``labels``,
            ``sentences``, ``train_ids``, ``test_ids`` and ``valid_ids``.
        dest_dir: Directory in which ``epik_features_roberta.pkl`` is written.

    Returns:
        Path of the written feature pickle.
    """
    # NOTE: pickle.load can execute arbitrary code; only feed it files that
    # were produced by this application (e.g. by cosmic_preprocess).
    with open(path, "rb") as source_file:
        speakers, labels, sentences, train_ids, test_ids, valid_ids = pickle.load(
            source_file
        )

    # layer_features[k] holds the features taken from hidden layer -(k + 1),
    # i.e. index 0 is the last layer, index 3 the fourth from last.
    layer_features = [{}, {}, {}, {}]

    for conv_id in tqdm(train_ids + test_ids + valid_ids):
        # Drop every non-ASCII character before tokenising.
        utterances = [
            s.encode("ascii", errors="ignore").decode("utf-8")
            for s in sentences[conv_id]
        ]
        batch = collate_tokens([roberta.encode(u) for u in utterances], pad_idx=1)
        hidden_states = roberta.extract_features(batch, return_all_hiddens=True)
        for depth, store in enumerate(layer_features, start=1):
            # One vector per sentence: position 0 of the sequence axis.
            store[conv_id] = list(hidden_states[-depth][:, 0, :].detach().numpy())

    roberta1, roberta2, roberta3, roberta4 = layer_features
    roberta_feature_path = os.path.join(dest_dir, "epik_features_roberta.pkl")
    with open(roberta_feature_path, "wb") as feature_file:
        pickle.dump(
            [
                speakers,
                labels,
                roberta1,
                roberta2,
                roberta3,
                roberta4,
                sentences,
                train_ids,
                test_ids,
                valid_ids,
            ],
            feature_file,
        )

    return roberta_feature_path
def cosmic_comet_extract(path, dir="."):
    """Run the COMET extractor over the pickled sentences at ``path``.

    Args:
        path: Pickle file holding the sentences passed to ``comet.extract``.
        dir: Directory in which ``epik_features_comet.pkl`` is written.

    Returns:
        Path of the written COMET feature pickle.
    """
    print("Extracting features in", path)

    # NOTE: pickle.load can execute arbitrary code; only feed it files that
    # were produced by this application.
    with open(path, "rb") as sentences_file:
        sentences = pickle.load(sentences_file)

    features = comet.extract(sentences)

    comet_feature_path = os.path.join(dir, "epik_features_comet.pkl")
    with open(comet_feature_path, "wb") as feature_file:
        pickle.dump(features, feature_file)

    return comet_feature_path
def cosmic_classifier(input):
    """Predict a sentiment label for every message of a conversation.

    The input is preprocessed into a fresh temporary directory (created in
    the current working directory), RoBERTa and COMET features are extracted,
    and ``COSMIC_MODEL`` is run on them. The temporary directory is removed
    afterwards, whether or not the pipeline succeeded.

    Args:
        input: Raw conversation text supplied by the user.

    Returns:
        The value produced by ``format_prediction_ouptut`` for the speaker
        roles, sentences and predicted labels of the conversation.

    Raises:
        gr.Error: Propagated from ``cosmic_preprocess`` when the input cannot
            be processed.
    """
    # create a temporary directory for the input data
    temp_dir = tempfile.mkdtemp(dir=os.getcwd(), prefix="temp")
    try:
        epik_path, epik_sentences_path = cosmic_preprocess(input, temp_dir)
        roberta_path = cosmic_roberta_extract(epik_path, temp_dir)
        comet_path = cosmic_comet_extract(epik_sentences_path, temp_dir)

        # use cosmic model to make predictions
        data_loader, ids = get_valid_dataloader(roberta_path, comet_path)
        predictions = predict(COSMIC_MODEL, data_loader, cosmic_args)

        with open(epik_path, "rb") as epik_file:
            speakers, _, sentences, _, _, _ = pickle.load(epik_file)

        # Assuming that there's only one conversation
        conv_id = ids[0]
        speaker_roles = [
            decode_speaker_role(numeric_role) for numeric_role in speakers[conv_id]
        ]
        labels = [decode_numeric_label(pred) for pred in predictions[0]]
        return format_prediction_ouptut(speaker_roles, sentences[conv_id], labels)
    finally:
        # Clean up even when a step above raised, so failed requests do not
        # leave temp directories behind in the working directory.
        print()
        print("======= Removing Temporary Directory =======")
        remove_temp_dir(temp_dir)
def cosmic_ui():
    """Build the Gradio interface for the COSMIC model.

    The page consists of an introduction, an input area (example dropdown or
    free text) next to the prediction output, a submit button that runs
    ``cosmic_classifier``, and a sentiment flow plot section driven by
    ``sentiment_flow_plot``.

    Returns:
        The ``gr.Blocks`` instance holding the whole interface.
    """
    with gr.Blocks() as cosmic_model:
        gr.Markdown(
            """
            # COSMIC
            COSMIC is a popular model for predicting sentiment labels using the entire
            context of the conversation. In other words, it analyzes the previous
            messages to predict the sentiment label for the current message.<br/>
            The model was adopted from this
            [repo](https://github.com/declare-lab/conv-emotion.git), implemented based
            on this research [paper](https://arxiv.org/pdf/2010.02795.pdf).
            ```bash COSMIC: COmmonSense knowledge for eMotion Identification in
            Conversations. D. Ghosal, N. Majumder, A. Gelbukh, R. Mihalcea, & S. Poria. Findings of EMNLP 2020.
            ```
            """
        )

        create_input_instruction()

        with gr.Row():
            with gr.Column():
                example_dropdown = gr.Dropdown(
                    choices=["-- Not Selected --"] + list(EXAMPLE_CONVERSATIONS.keys()),
                    value="-- Not Selected --",
                    label="Select an example",
                )
                gr.Markdown('<p style="text-align: center;color: gray;">--- OR ---</p>')
                conversation_input = gr.TextArea(
                    value="",
                    label="Input your conversation",
                    placeholder="Please input your conversation here",
                    lines=15,
                    max_lines=15,
                )

                def on_example_change(input):
                    # Unknown selections (e.g. "-- Not Selected --") clear the box.
                    return EXAMPLE_CONVERSATIONS.get(input, "")

                example_dropdown.input(
                    on_example_change,
                    inputs=example_dropdown,
                    outputs=conversation_input,
                )

            with gr.Column():
                output = gr.Textbox(
                    value="",
                    label="Predicted Sentiment Labels",
                    lines=22,
                    max_lines=22,
                    interactive=False,
                )

        submit_btn = gr.Button(value="Submit")
        submit_btn.click(cosmic_classifier, conversation_input, output)

        gr.Markdown("# Sentiment Flow Plot")
        with gr.Row():
            with gr.Column(scale=1):
                display_sentiment_score_table()
            with gr.Column(scale=2):
                plot_box = gr.Plot(label="Analysis Plot")
                plot_btn = gr.Button(value="Plot Sentiment Flow")
                plot_btn.click(sentiment_flow_plot, inputs=[output], outputs=[plot_box])

        # reset all outputs whenever a change in the input is detected; a
        # single listener covers both the prediction text and the plot
        conversation_input.change(
            lambda x: ("", None),
            conversation_input,
            outputs=[output, plot_box],
        )

    return cosmic_model