Commit: Upload dashboard

Files changed:
- Data_Loading.py +119 -0
- README.md +1 -1
- codebook_demo.json +108 -0
- data_demo.csv +35 -0
- helpers.py +630 -0
- pages/1_Codebook_Design.py +731 -0
- pages/2_Codebook_Advanced_Edit.py +251 -0
- pages/3_Apply_Codebook.py +222 -0
- requirements.txt +10 -0
Data_Loading.py
ADDED
@@ -0,0 +1,119 @@
import os
import sys

import streamlit as st
from st_aggrid import AgGrid, DataReturnMode

current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)
from helpers import apply_style, get_idx_column, read_csv_from_web, read_json_from_web

apply_style()

codebook = {}

st.markdown(
    """
# Codebook Creation/Editing Tool based on the PR-ENT Approach
### *Rethinking the Event Coding Pipeline with Prompt Entailment*
### Author: Anonymized for submission
##### Version: 1.0
"""
)
st.markdown("***********")

st.markdown(
    """
## Data Loading
"""
)


st.markdown(
    """
### Upload a CSV of event descriptions.
"""
)
uploaded_file = st.file_uploader("Upload a csv file containing event descriptions")
if uploaded_file is not None:
    st.session_state.data = read_csv_from_web(uploaded_file)


if "data" in st.session_state:
    # Filter will be reset if the page is left and then used again
    loading_df = st.text("Loading data display...")
    st.write(
        """
The table below can be used to filter the data. Hover over a column name, click the *3 bars* icon, and the filtering
tool will appear. Filters are kept in memory across the whole dashboard as long as the `Reset Filters` button is not clicked.

Current limitation: if a filter is set and the user changes page, the filter can no longer be modified and must be reset.
"""
    )
    if "filtered_df" not in st.session_state:
        st.session_state.filtered_df = st.session_state.data
    if st.button("Reset Filters"):
        st.session_state.filtered_df = st.session_state.data

    st.session_state.filtered_df = AgGrid(
        st.session_state.filtered_df,
        height=400,
        data_return_mode=DataReturnMode.FILTERED,
        update_mode="MANUAL",
    )["data"]

    if "text_column_design_perm" not in st.session_state:
        st.session_state[
            "text_column_design_perm"
        ] = st.session_state.filtered_df.columns[0]

    def callback_function(mod, key):
        st.session_state[mod] = st.session_state[key]

    st.write("Select the column which contains the event descriptions.")
    st.selectbox(
        "Select the event description column:",
        st.session_state.filtered_df.columns,
        key="text_column_design",
        on_change=callback_function,
        args=("text_column_design_perm", "text_column_design"),
        index=get_idx_column(
            st.session_state["text_column_design_perm"],
            list(st.session_state.filtered_df.columns),
        ),
    )
    loading_df.text("")

    # Remove NaN texts
    if st.button("Remove Empty Event Descriptions"):
        st.session_state.filtered_df = st.session_state.filtered_df.dropna(
            subset=[st.session_state["text_column_design_perm"]]
        )


st.write("********")
st.markdown("## Optional Upload")


st.markdown(
    """
### Upload a codebook if available. It needs to be in the format used in this dashboard.
"""
)
uploaded_codebook = st.file_uploader("Upload a codebook if available (OPTIONAL)")
if uploaded_codebook is not None:
    codebook = read_json_from_web(uploaded_codebook)
    st.session_state.codebook = codebook

st.markdown(
    """
### Upload a validated dataset (accept, reject, ignored) in the format of this dashboard.
"""
)

uploaded_validated_data = st.file_uploader(
    "Upload a json file containing validated data (OPTIONAL)"
)
if uploaded_validated_data is not None:
    st.session_state.validated_data = read_json_from_web(uploaded_validated_data)
README.md
CHANGED
@@ -5,7 +5,7 @@ colorFrom: gray
 colorTo: gray
 sdk: streamlit
 sdk_version: 1.10.0
-app_file:
+app_file: Data_Loading.py
 pinned: false
 ---
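
The app_file field tells the Streamlit SDK on Hugging Face Spaces which script to serve as the entry point. A rough local equivalent, as a sketch assuming the packages in requirements.txt are installed, is to launch the same script yourself:

import subprocess

# Run the dashboard locally with the same entry point the Space uses.
subprocess.run(["streamlit", "run", "Data_Loading.py"], check=True)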
codebook_demo.json
ADDED
@@ -0,0 +1,108 @@
{
    "events": {
        "Arrest": {
            "all": [],
            "any": [
                "People were arrested."
            ],
            "not": {},
            "not_any": [],
            "not_all": [],
            "all_any_rel": "OR",
            "not_all_any_rel": "OR"
        },
        "Destruction": {
            "all": [],
            "any": [
                "This event involves arson."
            ],
            "not": {},
            "not_any": [],
            "not_all": [],
            "all_any_rel": "OR",
            "not_all_any_rel": "OR"
        },
        "Killing": {
            "all": [],
            "any": [
                "This event involves killing."
            ],
            "not": {},
            "not_any": [],
            "not_all": [],
            "all_any_rel": "OR",
            "not_all_any_rel": "OR"
        },
        "Looting": {
            "all": [],
            "any": [],
            "not": {},
            "not_any": [],
            "not_all": [],
            "all_any_rel": "OR",
            "not_all_any_rel": "OR"
        },
        "Other": {
            "all": [],
            "any": [],
            "not": {},
            "not_any": [],
            "not_all": [],
            "all_any_rel": "OR",
            "not_all_any_rel": "OR"
        },
        "Explosions": {
            "all": [],
            "any": [
                "This event involves explosive."
            ],
            "not": {},
            "not_any": [],
            "not_all": [],
            "all_any_rel": "OR",
            "not_all_any_rel": "OR"
        },
        "Kidnapping": {
            "all": [],
            "any": [
                "People were kidnapped.",
                "This event involves kidnapping."
            ],
            "not": {},
            "not_any": [],
            "not_all": [],
            "all_any_rel": "OR",
            "not_all_any_rel": "OR"
        },
        "Sexual Violence": {
            "all": [],
            "any": [
                "This event involves rape.",
                "People were abused."
            ],
            "not": {},
            "not_any": [],
            "not_all": [],
            "all_any_rel": "OR",
            "not_all_any_rel": "OR"
        },
        "Protests": {
            "all": [],
            "any": [
                "This event involves protest.",
                "This event involves demonstration.",
                "This event involves protester."
            ],
            "not": {},
            "not_any": [],
            "not_all": [],
            "all_any_rel": "OR",
            "not_all_any_rel": "OR"
        }
    },
    "templates": [
        "This event involves [Z].",
        "People were [Z]."
    ],
    "add_words": []
}
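
As a usage illustration, this codebook format is what find_event_types in helpers.py (shown below) evaluates against the filled templates returned by PR-ENT. A minimal sketch, assuming codebook_demo.json sits next to helpers.py (note that importing helpers runs its module-level Streamlit/NLTK setup):

import json

from helpers import find_event_types

with open("codebook_demo.json") as f:
    codebook = json.load(f)

# Filled templates as PR-ENT might return them for an arrest report
filled_templates = ["People were arrested.", "This event involves protest."]

print(find_event_types(codebook, filled_templates))
# With the codebook above, this yields ["Arrest", "Protests"]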
data_demo.csv
ADDED
@@ -0,0 +1,35 @@
Event Descriptions
"An organization abducts a group of 50 women and children. 'They started loading our women and children into their vehicles, threatening to shoot whoever disobeyed them. Everybody was scared,' a witness stated. The date of the attack was not included in the report, but is likely to have occurred on 13/10 or 14/10."
"On 7 July 2019, unidentified gunmen abducted a man. No requests for ransom have been reported."
"On 2 March 2020, a driver was abducted by militiamen in the region. The person was later released in a village."
"On 23 August, a group attacked a village, abducting 6 people."
05 May 2021. An unidentified armed group abducted a girl in the town. [women targeted: girls]
Mass demonstrations honoring the bravery of Revolution martyrs.
"The national organization of Teachers, has organised a mass protest both against the substantive issue and against police brutality on December 11th, which is the international day of human rights."
"Answering the call of the religious organization following the Friday prayer, citizens held two protest sit-ins to express their support for discriminated people and denounce normalization of relations with the neighboring state. [size=no report]"
Families affected by the collapse of their building organized a protest gathering to demand from the local authorities to relocate them. [size=no report]
"On 15 September, gold miners held a protest sit-in to demand the closure of illegal gold refineries. [size=no report]"
"Around 14 August 2021 (between 13 - 18 August), police intelligence agents arrested the national party leader, a journalist and 8 other members of the party in the city. Reason for arrest not clear."
"5 arrested, houses ransacked, due to suspected links with terrorist organizations, then extradited "
"On 15 June 2020, Police Forces arrested 70 civilians, in an ongoing crackdown against religious minorities."
Arrests: Police detain a journalist who recently published a politically disagreeable article.
About 23 suspected political thugs were arrested by the men of the state police command during the just concluded national assembly poll.
"Around 13 June 2019 (as reported), local militiamen set a settlement ablaze. Event connected to an earlier attack on residents by pastoralists."
"Around 4 March, suspected separatists destroyed a number of houses and property in the region."
"On 30 January, an unidentified armed group has razed an Ebola handwashing station."
"On 12 November 2019, unknown individuals set ablaze a half hectare of soja harvest."
"On 11 February 2020, suspected fighters stole about 30 cattle during a raid."
"On 3 December, at least 12 illegal checkpoints were erected by unknown troops. The soldiers reportedly extort money from travelers."
"Around 12 October 2021 (between 12 - 13 October), unknown gunmen seized livestock belonging to a councilor in the village."
"Pro-government militia have set up more than 20 checkpoints along the road, demanding fees from vehicles, robbing passengers, stealing from lorries & seizing vehicles"
"2 armed men, believed to be part of a vigilante militia, robbed a NGO camp, which has led to a reduction in the NGO operations in the area. No casualties were reported."
Members of a peacekeeping unit have been accused of raping a young girl. [women targeted: girls]
"On 12 April 2020, presumed militants raped a woman."
Peacekeepers allegedly carried out sexual abuse on civilians.
2 people were killed during an attack by 20 militiamen in the chiefdom of. 1 person was killed and 2 raped. The militiamen looted several house and fired into the air as they raided the areas.
A five-year-old girl was raped and then killed by unidentified men. [women targeted: girls]
"Grenade explodes, killing 2 - affiliated to ruling party"
"On 25 February 2021, terrorist militants threw a hand grenade at a house. Two soldiers were injured in the explosion."
"On 8 August 2020, air force targeted a Military Faculty. No casualties reported."
Air forces drop 6 bombs in the region
"On 13 November, a cart was struck by an IED about 3km north, one civilian was killed and another severely wounded, the two donkeys that pulled the cart were also killed. The IED was most likely planted by terrorist militants."
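
For reference, the demo loader in pages/1_Codebook_Design.py reads this file with an explicit ";" delimiter (the file has a single column, so nothing is split), while user uploads go through read_csv_from_web in helpers.py, which sniffs the separator. A minimal sketch of the first path:

import pandas as pd

# Single-column CSV: the ";" delimiter never fires, so each row stays one description.
data = pd.read_csv("data_demo.csv", delimiter=";")
print(data["Event Descriptions"].head())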
helpers.py
ADDED
@@ -0,0 +1,630 @@
import json
import string
from time import time

import en_core_web_lg
import inflect
import nltk
import numpy as np
import pandas as pd
import streamlit as st
from nltk.tokenize import sent_tokenize
from transformers import pipeline

# Set constant values
INFLECT_ENGINE = inflect.engine()
TOP_K = 30
NLI_LIMIT = 0.9

st.set_page_config(layout="wide")


def get_top_k():
    return TOP_K


def get_nli_limit():
    return NLI_LIMIT


### Streamlit specific
@st.cache(allow_output_mutation=True)
def load_model_prompting():
    return pipeline("fill-mask", model="distilbert-base-uncased")


@st.cache(allow_output_mutation=True)
def load_model_nli():
    try:
        return pipeline(
            task="sentiment-analysis", model="roberta-large-mnli", device="mps"
        )
    except Exception:
        return pipeline(task="sentiment-analysis", model="roberta-large-mnli")


@st.cache(allow_output_mutation=True)
def load_spacy_pipeline():
    return en_core_web_lg.load()


@st.cache()
def download_punkt():
    nltk.download("punkt")


download_punkt()


@st.experimental_memo(max_entries=1)
def read_json_from_web(uploaded_json):
    return json.load(uploaded_json)


@st.experimental_memo(max_entries=1)
def read_csv_from_web(uploaded_file):
    """Read a CSV from the Streamlit interface

    :param uploaded_file: File to read
    :type uploaded_file: UploadedFile (BytesIO)
    :return: Dataframe
    :rtype: pandas DataFrame
    """
    try:
        # First try to read comma- and semicolon-separated files (separator is sniffed)
        data = pd.read_csv(uploaded_file, sep=None, engine="python")
        # If neither is correct, this errors and falls through to the except
    except pd.errors.ParserError:
        # This should be the case when there is no separator (1-column csv)
        # Reset the IO object due to the previous crash
        uploaded_file.seek(0)
        # Use standard reading of CSV (no separator)
        data = pd.read_csv(uploaded_file)
    return data


def apply_style():
    # Avoid having ellipsis in the multi select options
    styl = """
    <style>
    .stMultiSelect span{
        max-width: none;

    }
    </style>
    """
    st.markdown(styl, unsafe_allow_html=True)

    # Set color of multiselect to red
    st.markdown(
        """
    <style>
    span[data-baseweb="tag"] {
        background-color: red !important;
    }
    </style>
    """,
        unsafe_allow_html=True,
    )

    hide_st_style = """
    <style>
    #MainMenu {visibility: hidden;}
    footer {visibility: hidden;}
    header {visibility: hidden;}
    </style>
    """
    st.markdown(hide_st_style, unsafe_allow_html=True)


def choose_text_menu(text):
    if "text" not in st.session_state:
        st.session_state.text = "Several demonstrators were injured."
    text = st.text_area("Event description", st.session_state.text)

    return text


def initiate_widget_st_state(widget_key, perm_key, default_value):
    if perm_key not in st.session_state:
        st.session_state[perm_key] = default_value
    if widget_key not in st.session_state:
        st.session_state[widget_key] = st.session_state[perm_key]


def get_idx_column(col_name, col_list):
    if col_name in col_list:
        return col_list.index(col_name)
    else:
        return 0


def callback_add_to_multiselect(str_to_add, multiselect_key, text_input_key, *keys):
    if len(str_to_add) == 0:
        st.warning("Word is empty, did you press Enter on the text field?")
        return
    current_dict = st.session_state
    *dict_keys, item_keys = keys
    try:
        for key in dict_keys:
            current_dict = current_dict[key]
        current_dict[item_keys].append(str_to_add)
    except KeyError as e:
        raise KeyError(keys) from e

    if multiselect_key in st.session_state:
        st.session_state[multiselect_key].append(str_to_add)
    else:
        st.session_state[multiselect_key] = [str_to_add]

    st.session_state[text_input_key] = ""


# Split the text into sentences. Necessary for NLI models
def split_sentences(text):
    return sent_tokenize(text)


def get_num_sentences_in_list_text(list_texts):
    num_sentences = 0
    for text in list_texts:
        num_sentences += len(split_sentences(text))
    return num_sentences


###### Prompting
def query_model_prompting(model, text, prompt_with_mask, top_k, targets):
    """Query the prompting model

    :param model: Prompting model object
    :type model: Huggingface pipeline object
    :param text: Event description (context)
    :type text: str
    :param prompt_with_mask: Prompt with a mask
    :type prompt_with_mask: str
    :param top_k: Number of tokens to output
    :type top_k: integer
    :param targets: Restrict the answer to these possible tokens
    :type targets: list
    :return: Results of the prompting model
    :rtype: list of dict
    """
    sequence = text + prompt_with_mask
    output_tokens = model(sequence, top_k=top_k, targets=targets)

    return output_tokens


def do_sentence_entailment(sentence, hypothesis, model):
    """Concatenate context and hypothesis then perform entailment

    :param sentence: Event description (context), 1 sentence
    :type sentence: str
    :param hypothesis: Mask filled with a token
    :type hypothesis: str
    :param model: NLI Model
    :type model: Huggingface pipeline
    :return: DataFrame containing the result of the entailment
    :rtype: pandas DataFrame
    """
    text = sentence + "</s></s>" + hypothesis
    res = model(text, return_all_scores=True)
    df_res = pd.DataFrame(res[0])
    df_res["label"] = df_res["label"].apply(lambda x: x.lower())
    df_res.columns = ["Label", "Score"]
    return df_res


def softmax(x):
    """Compute softmax values for each set of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)


def get_singular_form(word):
    """Get the singular form of a word

    :param word: word
    :type word: string
    :return: singular form of the word
    :rtype: string
    """
    if INFLECT_ENGINE.singular_noun(word):
        return INFLECT_ENGINE.singular_noun(word)
    else:
        return word


######### NLI + PROMPTING
def do_text_entailment(text, hypothesis, model):
    """
    Do entailment for each sentence of the event description, as the
    model was trained on sentence pairs

    :param text: Event Description (context)
    :type text: str
    :param hypothesis: Mask filled with a token
    :type hypothesis: str
    :param model: NLI model
    :type model: Huggingface pipeline
    :return: List of entailment results for each sentence of the text
    :rtype: list
    """
    text_entailment_results = []
    for i, sentence in enumerate(split_sentences(text)):
        df_score = do_sentence_entailment(sentence, hypothesis, model)
        text_entailment_results.append((sentence, hypothesis, df_score))
    return text_entailment_results


def get_true_entailment(text_entailment_results, nli_limit):
    """
    From the result of each sentence entailment, extract the maximum entailment score and
    check if it's higher than the entailment threshold.
    """
    true_hypothesis_list = []
    max_score = 0
    for sentence_entailment in text_entailment_results:
        df_score = sentence_entailment[2]
        score = df_score[df_score["Label"] == "entailment"]["Score"].values.max()
        if score > max_score:
            max_score = score
    if max_score > nli_limit:
        true_hypothesis_list.append((sentence_entailment[1], np.round(max_score, 2)))
    return list(set(true_hypothesis_list))


def run_model_nli(data, batch_size, model_nli, use_tf=False):
    if not use_tf:
        return model_nli(data, top_k=3, batch_size=batch_size)
    else:
        raise NotImplementedError
        # return run_pipeline_on_gpu(data, batch_size, model_nli["tokenizer"], model_nli["model"])


def prompt_to_nli_batching(
    text,
    prompt,
    model_prompting,
    nli_model,
    nlp,
    top_k=10,
    nli_limit=0.5,
    targets=None,
    additional_words=None,
    remove_lemma=False,
    use_tf=False,
):
    # Check if the text ends with punctuation
    if text[-1] not in string.punctuation:
        text += "."
    prompt_masked = prompt.format(model_prompting.tokenizer.mask_token)
    output_prompting = query_model_prompting(
        model_prompting, text, prompt_masked, top_k, targets=targets
    )
    if remove_lemma:
        output_prompting = filter_prompt_output_by_lemma(prompt, output_prompting, nlp)
    full_batch_concat = []
    prompt_tokens = []
    for token in output_prompting:
        hypothesis = prompt.format(token["token_str"])
        for i, sentence in enumerate(split_sentences(text)):
            full_batch_concat.append(sentence + "</s></s>" + hypothesis)
            prompt_tokens.append((token["token_str"], token["score"]))

    # Add words that must be tried for entailment
    # Also increase batch_size
    if additional_words:
        for i, sentence in enumerate(split_sentences(text)):
            for token in additional_words:
                hypothesis = prompt.format(token)
                full_batch_concat.append(sentence + "</s></s>" + hypothesis)
                prompt_tokens.append((token, 1))
                top_k = top_k + 1
    results_nli = run_model_nli(full_batch_concat, top_k, nli_model, use_tf)
    # Get entailed tokens
    entailed_tokens = []
    for i, res in enumerate(results_nli):
        entailed_tokens.extend(
            [
                (get_singular_form(prompt_tokens[i][0]), x["score"])
                for x in res
                if ((x["label"] == "ENTAILMENT") & (x["score"] > nli_limit))
            ]
        )
    if entailed_tokens:
        entailed_tokens = list(
            pd.DataFrame(entailed_tokens).groupby(0).max()[1].items()
        )

    return entailed_tokens, list(set(prompt_tokens))


def remove_similar_lemma_from_list(prompt, list_words, nlp):
    ## Compute a dictionary with the lemma for all tokens
    ## If there is a duplicate lemma then the dictionary value will be a list of the corresponding tokens
    lemma_dict = {}
    for each in list_words:
        mask_filled = nlp(prompt.strip(".").format(each))
        lemma_dict.setdefault([x.lemma_ for x in mask_filled][-1], []).append(each)

    ## Get back the list of tokens
    ## If multiple tokens are available then take the shortest one
    new_token_list = []
    for key in lemma_dict.keys():
        if len(lemma_dict[key]) >= 1:
            new_token_list.append(min(lemma_dict[key], key=len))
        else:
            raise ValueError("Lemma dict has 0 corresponding words")
    return new_token_list


def filter_prompt_output_by_lemma(prompt, output_prompting, nlp):
    """
    Remove all similar lemmas from the prompt output (e.g. "protest", "protests")
    """
    list_words = [x["token_str"] for x in output_prompting]
    new_token_list = remove_similar_lemma_from_list(prompt, list_words, nlp)
    return [x for x in output_prompting if x["token_str"] in new_token_list]


# Streamlit specific run functions
@st.experimental_memo(max_entries=1024)
def do_prent(text, template, top_k, nli_limit, additional_words=None):
    """Execute the PR-ENT model

    :param text: Event text
    :type text: string
    :param template: Template with mask
    :type template: string
    :param top_k: Maximum tokens to output from prompting model
    :type top_k: int
    :param nli_limit: Threshold of entailment for NLI [0,1]
    :type nli_limit: float
    :param additional_words: List of words that bypass prompting and go directly to NLI, defaults to None
    :type additional_words: list, optional
    :return: (Results Entailment, Results Prompting)
    :rtype: tuple
    """
    results_nli, results_pr = prompt_to_nli_batching(
        text,
        template,
        load_model_prompting(),
        load_model_nli(),
        load_spacy_pipeline(),
        top_k=top_k,
        nli_limit=nli_limit,
        targets=None,
        additional_words=additional_words,
        remove_lemma=True,
    )
    return results_nli, results_pr


def get_additional_words():
    """Extract the additional words from the codebook

    :return: list of additional words
    :rtype: list
    """
    if "add_words" in st.session_state.codebook:
        additional_words = st.session_state.codebook["add_words"]
    else:
        additional_words = None
    return additional_words


def run_prent(
    text="", templates=[], additional_words=None, progress=True, display_text=True
):
    """Execute PR-ENT over a list of templates and display streamlit widgets

    :param text: Event description, defaults to ""
    :type text: str, optional
    :param templates: Templates with a mask, defaults to []
    :type templates: list, optional
    :param additional_words: List of words to bypass prompting, defaults to None
    :type additional_words: list, optional
    :param progress: Whether to display the progress bar, defaults to True
    :type progress: bool, optional
    :return: (results of prent, computation time)
    :rtype: tuple
    """
    # Check that a template and an event description are available
    if not templates:
        st.warning("Template list is empty. Please add one.")
        return None, None
    if not text:
        st.warning("Event description is empty.")
        return None, None

    # Display text only while computing
    if display_text:
        temp_text = st.empty()
        temp_text.markdown("**Event Descriptions:** {}".format(text))

    # Start progress bar
    if progress:
        progress_bar = st.progress(0)
    num_prent_call = len(templates)
    num_sentences = get_num_sentences_in_list_text([text])
    iter = 0
    t0 = time()

    # We set the radio choice of streamlit to Ignore at first
    if "accept_reject_text_perm" in st.session_state:
        st.session_state["accept_reject_text_perm"] = "Ignore"

    res = {}
    for template in templates:
        template = template.replace("[Z]", "{}")
        results_nli, results_pr = do_prent(
            text,
            template,
            top_k=TOP_K,
            nli_limit=NLI_LIMIT,
            additional_words=additional_words,
        )
        # results_nli contains the entailment scores; we only care about the token strings
        res[template] = [x[0] for x in results_nli]

        # Update progress bar
        iter += 1
        if progress:
            progress_bar.progress((1 / num_prent_call) * (iter))
    if display_text:
        temp_text.markdown("")
    time_comput = (time() - t0) / num_sentences
    # This check avoids replacing the computation time by the much shorter
    # time measured when cached values are used
    if not time_comput < st.session_state.time_comput / 5:
        st.session_state.time_comput = int(time_comput)

    # Store some results
    res["templates_used"] = templates
    res["additional_words_used"] = additional_words
    return res, time_comput


####### Find event types based on codebook and PRENT results
def check_any_conds(cond_any, list_res):
    """Evaluate the "OR" conditions of the codebook against the list of filled templates

    :param cond_any: List of groundtruth filled templates
    :type cond_any: list
    :param list_res: A list of the filled templates given by PRENT
    :type list_res: list
    :return: True if any groundtruth template is inside the list given by PRENT
    :rtype: bool
    """
    cond_any = list(cond_any)
    condition = False
    # Return False if there is no "any" condition
    if not cond_any:
        return False
    for cond in cond_any:
        # With the current codebook design, this should never be true.
        # Before, it was possible to recurse and check AND conditions inside an OR condition
        if isinstance(cond, dict):
            condition = check_all_conds(cond["all"], list_res)
        else:
            # Check lowercase version of templates
            if cond.lower() in [x.lower() for x in list_res]:
                condition = True
                # Exit the function as the other templates won't change the outcome
                return condition
    return condition


def check_all_conds(cond_all, list_res):
    """Evaluate the "AND" conditions of the codebook against the list of filled templates

    :param cond_all: List of groundtruth filled templates
    :type cond_all: list
    :param list_res: A list of the filled templates given by PRENT
    :type list_res: list
    :return: True if all groundtruth templates are inside the list given by PRENT
    :rtype: bool
    """
    cond_all = list(cond_all)
    # Return False if there is no "all" condition
    if not cond_all:
        return False
    # Start bool on True, and set it to False if any template is missing
    condition = True
    for cond in cond_all:
        # With the current codebook design, this should never be true.
        # Before, it was possible to recurse and check OR conditions inside an AND condition
        if isinstance(cond, dict):
            condition = check_any_conds(cond["any"], list_res)
        else:
            # Check lowercase version of templates
            if not (cond.lower() in [x.lower() for x in list_res]):
                condition = False
                # Exit the function as the other templates won't change the outcome
                return condition
    return condition


def find_event_types(codebook, list_res):
    """Evaluate the codebook and output the list of event types corresponding to the given PRENT results (list of filled templates).

    :param codebook: A codebook in the format given by the dashboard
    :type codebook: dict
    :param list_res: A list of the filled templates given by PRENT
    :type list_res: list
    :return: List of event types
    :rtype: list
    """
    list_event_type = []
    # Iterate over all defined event types
    for event_type in codebook["events"]:
        code_event = codebook["events"][event_type]

        is_not_all_event, is_not_any_event, is_not_event = False, False, False
        is_all_event, is_any_event, is_event = False, False, False

        # First check if NOT conditions are met
        # e.g. a filled template that is contrary to the event is present
        if "not_all" in code_event:
            cond_all = code_event["not_all"]
            if check_all_conds(cond_all, list_res):
                is_not_all_event = True
        if "not_any" in code_event:
            cond_any = code_event["not_any"]
            if check_any_conds(cond_any, list_res):
                is_not_any_event = True

        # Next we need to check if "not_all" and "not_any" are related
        # by an "OR" or an "AND".
        # The latter case needs special care because one of the two lists can
        # be empty, hence False
        if code_event["not_all_any_rel"] == "AND":
            if is_not_all_event and (not code_event["not_any"]):
                # If ALL is TRUE and ANY is empty (so false)
                is_not_event = True
            elif is_not_any_event and (not code_event["not_all"]):
                # If ANY is TRUE and ALL is empty (so false)
                is_not_event = True
            if is_not_all_event and is_not_any_event:
                is_not_event = True
        elif code_event["not_all_any_rel"] == "OR":
            if is_not_all_event or is_not_any_event:
                is_not_event = True

        # The other checks are not necessary if this is true, so we go
        # to the next iteration
        if is_not_event:
            continue

        # Similar to the previous checks, but this time we look for templates that should be present
        if "all" in code_event:
            cond_all = code_event["all"]
            ## Then check if ALL conditions are met, if not exit
            if check_all_conds(cond_all, list_res):
                is_all_event = True
        if "any" in code_event:
            ## Finally check if an ANY condition is met, if not exit
            cond_any = code_event["any"]
            if check_any_conds(cond_any, list_res):
                is_any_event = True

        # This case needs special care because one of the two lists can
        # be empty, hence False
        if code_event["all_any_rel"] == "AND":
            if is_all_event and (not code_event["any"]):
                # If ALL is TRUE and ANY is empty (so false)
                is_event = True
            elif is_any_event and (not code_event["all"]):
                # If ANY is TRUE and ALL is empty (so false)
                is_event = True
            elif is_all_event and is_any_event:
                is_event = True
        elif code_event["all_any_rel"] == "OR":
            if is_all_event or is_any_event:
                is_event = True

        # If all checks pass, then we can add the event type to the output list
        if is_event:
            list_event_type.append(event_type)

    return list_event_type
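
To see the two-stage PR-ENT idea outside of Streamlit, here is a minimal sketch that mirrors query_model_prompting and do_sentence_entailment above (assuming transformers is installed and the two models download on first use): a fill-mask model proposes candidate tokens for the template, and the NLI model keeps the filled templates that are entailed by the event description.

from transformers import pipeline

prompter = pipeline("fill-mask", model="distilbert-base-uncased")
nli = pipeline("sentiment-analysis", model="roberta-large-mnli")

text = "On 23 August, a group attacked a village, abducting 6 people."
template = "This event involves {}."

# Stage 1: prompting - the masked LM proposes candidate tokens for the template
candidates = prompter(text + template.format(prompter.tokenizer.mask_token), top_k=10)

# Stage 2: entailment - keep candidates whose filled template is entailed by the text
entailed = []
for cand in candidates:
    hypothesis = template.format(cand["token_str"])
    scores = nli(text + "</s></s>" + hypothesis, return_all_scores=True)[0]
    entail_score = max(s["score"] for s in scores if s["label"] == "ENTAILMENT")
    if entail_score > 0.9:  # NLI_LIMIT used above
        entailed.append((cand["token_str"], round(entail_score, 2)))

print(entailed)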
pages/1_Codebook_Design.py
ADDED
@@ -0,0 +1,731 @@
1 |
+
import datetime as datetime
|
2 |
+
import hashlib
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
import streamlit as st
|
9 |
+
|
10 |
+
current = os.path.dirname(os.path.realpath(__file__))
|
11 |
+
parent = os.path.dirname(current)
|
12 |
+
sys.path.append(parent)
|
13 |
+
from helpers import (
|
14 |
+
apply_style,
|
15 |
+
callback_add_to_multiselect,
|
16 |
+
choose_text_menu,
|
17 |
+
do_prent,
|
18 |
+
find_event_types,
|
19 |
+
get_additional_words,
|
20 |
+
get_idx_column,
|
21 |
+
get_nli_limit,
|
22 |
+
get_num_sentences_in_list_text,
|
23 |
+
get_top_k,
|
24 |
+
initiate_widget_st_state,
|
25 |
+
run_prent,
|
26 |
+
)
|
27 |
+
|
28 |
+
# Set constant values
|
29 |
+
TOP_K = get_top_k()
|
30 |
+
NLI_LIMIT = get_nli_limit()
|
31 |
+
|
32 |
+
### Styling
|
33 |
+
# Needs to be done first
|
34 |
+
apply_style()
|
35 |
+
|
36 |
+
# Avoid having ellipsis in the multi select options
|
37 |
+
styl = """
|
38 |
+
<style>
|
39 |
+
.stMultiSelect span{
|
40 |
+
max-width: none;
|
41 |
+
|
42 |
+
}
|
43 |
+
</style>
|
44 |
+
"""
|
45 |
+
st.markdown(styl, unsafe_allow_html=True)
|
46 |
+
|
47 |
+
# Set color of multiselect to red
|
48 |
+
st.markdown(
|
49 |
+
"""
|
50 |
+
<style>
|
51 |
+
span[data-baseweb="tag"] {
|
52 |
+
background-color: red !important;
|
53 |
+
}
|
54 |
+
</style>
|
55 |
+
""",
|
56 |
+
unsafe_allow_html=True,
|
57 |
+
)
|
58 |
+
|
59 |
+
|
60 |
+
def validated_metric_per_event_types(validated_dataset):
|
61 |
+
"""Compute the accuracy metrics of the validated dataset
|
62 |
+
for each event type. Compute True Positive, False Negative,
|
63 |
+
True Negative, False Positive.
|
64 |
+
|
65 |
+
:param validated_dataset: Dictionary containing results of PRENT validated by the user
|
66 |
+
:type validated_dataset: dict
|
67 |
+
:return: Dictionnary containing accuracy metric for all event types
|
68 |
+
:rtype: dict
|
69 |
+
"""
|
70 |
+
dict_acc = {}
|
71 |
+
|
72 |
+
for key, val in validated_dataset.items():
|
73 |
+
# Compute the event types based on the computed templates of PRENT
|
74 |
+
pred_event_types = find_event_types(
|
75 |
+
st.session_state.codebook, val["filled_templates"]
|
76 |
+
)
|
77 |
+
true_event_types = val["event_types"]
|
78 |
+
# Compute only accuracy for accepted samples
|
79 |
+
if val["decision"] == "Accept":
|
80 |
+
# Iterate over all possible event types
|
81 |
+
for event_type in st.session_state.codebook["events"].keys():
|
82 |
+
dict_acc.setdefault(event_type, {})
|
83 |
+
dict_acc[event_type].setdefault("TP", 0)
|
84 |
+
dict_acc[event_type].setdefault("FN", 0)
|
85 |
+
dict_acc[event_type].setdefault("FP", 0)
|
86 |
+
dict_acc[event_type].setdefault("TN", 0)
|
87 |
+
if (event_type in true_event_types) and (
|
88 |
+
event_type in pred_event_types
|
89 |
+
):
|
90 |
+
dict_acc[event_type]["TP"] += 1
|
91 |
+
elif (event_type in true_event_types) and not (
|
92 |
+
event_type in pred_event_types
|
93 |
+
):
|
94 |
+
dict_acc[event_type]["FN"] += 1
|
95 |
+
elif not (event_type in true_event_types) and (
|
96 |
+
event_type in pred_event_types
|
97 |
+
):
|
98 |
+
dict_acc[event_type]["FP"] += 1
|
99 |
+
else:
|
100 |
+
dict_acc[event_type]["TN"] += 1
|
101 |
+
|
102 |
+
# Normalize metrics
|
103 |
+
if dict_acc:
|
104 |
+
for event_type in st.session_state.codebook["events"].keys():
|
105 |
+
dict_acc[event_type]["Accuracy"] = (
|
106 |
+
dict_acc[event_type]["TP"] + dict_acc[event_type]["TN"]
|
107 |
+
) / (
|
108 |
+
dict_acc[event_type]["TP"]
|
109 |
+
+ dict_acc[event_type]["TN"]
|
110 |
+
+ dict_acc[event_type]["FP"]
|
111 |
+
+ dict_acc[event_type]["FN"]
|
112 |
+
)
|
113 |
+
|
114 |
+
return dict_acc
|
115 |
+
|
116 |
+
|
117 |
+
def store_validated_data(
|
118 |
+
text,
|
119 |
+
decision,
|
120 |
+
text_idx,
|
121 |
+
templates,
|
122 |
+
additional_words,
|
123 |
+
list_event_type,
|
124 |
+
prent_params=(TOP_K, NLI_LIMIT),
|
125 |
+
):
|
126 |
+
"""Function used to store the results of PRENT in a DataFrame and in the
|
127 |
+
session state of Streamlit.
|
128 |
+
|
129 |
+
:param text: Event description
|
130 |
+
:type text: string
|
131 |
+
:param decision: Decision of the user (Accept/Reject/Ignore)
|
132 |
+
:type decision: string
|
133 |
+
:param text_idx: Index of the event
|
134 |
+
:type text_idx: int
|
135 |
+
:param templates: List of template used
|
136 |
+
:type templates: list
|
137 |
+
:param additional_words: List of additional words used
|
138 |
+
:type additional_words: list
|
139 |
+
:param list_event_type: List of event type found by PRENT and Codebook
|
140 |
+
:type list_event_type: list
|
141 |
+
:param prent_params: Parameters of PRENT, defaults to (TOP_K, NLI_LIMIT)
|
142 |
+
:type prent_params: tuple, optional
|
143 |
+
"""
|
144 |
+
if "validated_data" not in st.session_state:
|
145 |
+
st.session_state["validated_data"] = {}
|
146 |
+
|
147 |
+
# Generate an index if the text is not coming from a csv
|
148 |
+
if not text_idx:
|
149 |
+
# Create a hash of 8 digits of the text to put as index
|
150 |
+
data_idx = str(
|
151 |
+
"manual_{}".format(
|
152 |
+
int(
|
153 |
+
hashlib.sha256(text.encode("utf-8")).hexdigest(),
|
154 |
+
16,
|
155 |
+
)
|
156 |
+
% 10**8
|
157 |
+
)
|
158 |
+
)
|
159 |
+
else:
|
160 |
+
data_idx = str(text_idx)
|
161 |
+
|
162 |
+
if data_idx not in st.session_state["validated_data"]:
|
163 |
+
st.session_state["validated_data"][data_idx] = {}
|
164 |
+
st.session_state["validated_data"][data_idx]["text"] = text
|
165 |
+
st.session_state["validated_data"][data_idx]["templates"] = [
|
166 |
+
template.replace("{}", "[Z]") for template in templates
|
167 |
+
]
|
168 |
+
st.session_state["validated_data"][data_idx]["additional_words"] = additional_words
|
169 |
+
st.session_state["validated_data"][data_idx]["event_types"] = list_event_type
|
170 |
+
st.session_state["validated_data"][data_idx][
|
171 |
+
"filled_templates"
|
172 |
+
] = list_filled_templates
|
173 |
+
st.session_state["validated_data"][data_idx]["decision"] = decision
|
174 |
+
st.session_state["validated_data"][data_idx]["prent_params"] = prent_params
|
175 |
+
|
176 |
+
|
177 |
+
### Initialize session state variables
|
178 |
+
if "codebook" not in st.session_state:
|
179 |
+
st.session_state.codebook = {}
|
180 |
+
st.session_state.codebook.setdefault("events", {})
|
181 |
+
st.session_state.codebook["templates"] = []
|
182 |
+
if "text" not in st.session_state:
|
183 |
+
st.session_state.text = ""
|
184 |
+
if "res" not in st.session_state:
|
185 |
+
st.session_state.res = None
|
186 |
+
if "accept_reject_text_perm" not in st.session_state:
|
187 |
+
st.session_state.accept_reject_text_perm = None
|
188 |
+
if "validated_data" not in st.session_state:
|
189 |
+
st.session_state["validated_data"] = {}
|
190 |
+
if "time_comput" not in st.session_state:
|
191 |
+
st.session_state.time_comput = 20
|
192 |
+
if "rerun" not in st.session_state:
|
193 |
+
st.session_state.rerun = False
|
194 |
+
if "recompute_all_templates" not in st.session_state:
|
195 |
+
st.session_state.recompute_all_templates = False
|
196 |
+
|
197 |
+
|
198 |
+
def reset_computation_results():
|
199 |
+
"""Reset cached values in session state related to computations"""
|
200 |
+
st.session_state.res = {}
|
201 |
+
st.session_state.recompute_all_templates = True
|
202 |
+
st.session_state["accept_reject_text_perm"] = "Ignore"
|
203 |
+
st.session_state.rerun = True
|
204 |
+
|
205 |
+
|
206 |
+
def get_all_filled_templates(results):
|
207 |
+
"""Create the filled templates from PRENT results. Merging template with mask
|
208 |
+
with the entailed tokens.
|
209 |
+
|
210 |
+
:param results: Dictionary containing PRENT results
|
211 |
+
:type results: dict
|
212 |
+
:return: List of all entailed templates
|
213 |
+
:rtype: list
|
214 |
+
"""
|
215 |
+
filled_templates = []
|
216 |
+
templates_used = [x.replace("[Z]", "{}") for x in results["templates_used"]]
|
217 |
+
for template in templates_used:
|
218 |
+
filled_template = [template.format(x) for x in results[template]]
|
219 |
+
filled_templates.extend(filled_template)
|
220 |
+
|
221 |
+
return filled_templates
|
222 |
+
|
223 |
+
|
224 |
+
# Split streamlit dashboard
|
225 |
+
col_intro_left, col_intro_righter = st.columns([8, 8])
|
226 |
+
with col_intro_left:
|
227 |
+
st.markdown(
|
228 |
+
""" # Codebook Design
|
229 |
+
"""
|
230 |
+
)
|
231 |
+
|
232 |
+
|
233 |
+
def load_demo(
|
234 |
+
codebook_path="codebook_demo.json",
|
235 |
+
validated_data_path="validated_data_demo.json",
|
236 |
+
csv_data_path="data_demo.csv",
|
237 |
+
):
|
238 |
+
"""Load demonstration files from disk
|
239 |
+
|
240 |
+
:param codebook_path: path to codebook, defaults to "codebook_demo.json"
|
241 |
+
:type codebook_path: str, optional
|
242 |
+
:param validated_data_path: path to validated dataset, defaults to "validated_data_demo.json"
|
243 |
+
:type validated_data_path: str, optional
|
244 |
+
:param csv_data_path: path to raw data, defaults to "data_demo.csv"
|
245 |
+
:type csv_data_path: str, optional
|
246 |
+
"""
|
247 |
+
st.session_state.codebook = json.load(open(codebook_path))
|
248 |
+
st.session_state.validated_data = json.load(open(validated_data_path))
|
249 |
+
st.session_state.data = pd.read_csv(csv_data_path, delimiter=";")
|
250 |
+
st.session_state.filtered_df = st.session_state.data
|
251 |
+
st.session_state.text_column_design_perm = "Event Descriptions"
|
252 |
+
st.session_state["multiselect_classes"] = list(
|
253 |
+
st.session_state.codebook["events"].keys()
|
254 |
+
)
|
255 |
+
st.session_state.text_idx = 0
|
256 |
+
st.session_state.text = (
|
257 |
+
"On 23 August, a group attacked a village, abducting 6 people."
|
258 |
+
)
|
259 |
+
st.session_state.text_display = (
|
260 |
+
"On 23 August, a group attacked a village, abducting 6 people."
|
261 |
+
)
|
262 |
+
st.session_state["text_options_valid_perm"] = "From CSV"
|
263 |
+
st.session_state["text_options_valid"] = "From CSV"
|
264 |
+
|
265 |
+
|
266 |
+
def clear_all():
|
267 |
+
"""Cleare session state"""
|
268 |
+
for each in st.session_state:
|
269 |
+
del st.session_state[each]
|
270 |
+
st.experimental_rerun()
|
271 |
+
|
272 |
+
|
273 |
+
# Add two buttons in the sidebar to load and clear the demo
|
274 |
+
with st.sidebar:
|
275 |
+
if st.button("Load Demo"):
|
276 |
+
load_demo()
|
277 |
+
|
278 |
+
if st.button("Clear Demo"):
|
279 |
+
clear_all()
|
280 |
+
|
281 |
+
st.write("********")
|
282 |
+
|
283 |
+
|
284 |
+
with st.sidebar:
|
285 |
+
# Next function used for callback when download
|
286 |
+
def update_codebook_save_time():
|
287 |
+
st.session_state.save_codebook_time = (
|
288 |
+
datetime.datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %z")
|
289 |
+
)
|
290 |
+
|
291 |
+
if st.download_button(
|
292 |
+
label="Download codebook as JSON",
|
293 |
+
data=json.dumps(st.session_state.codebook, indent=3).encode("ASCII"),
|
294 |
+
file_name="codebook.json",
|
295 |
+
mime="application/json",
|
296 |
+
):
|
297 |
+
update_codebook_save_time()
|
298 |
+
if "save_codebook_time" in st.session_state:
|
299 |
+
st.write("Saved on: " + st.session_state.save_codebook_time)
|
300 |
+
|
301 |
+
|
302 |
+
with st.sidebar:
|
303 |
+
# Next function used for callback when download
|
304 |
+
def update_validated_save_time():
|
305 |
+
st.session_state.save_validated_time = (
|
306 |
+
datetime.datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %z")
|
307 |
+
)
|
308 |
+
|
309 |
+
if st.download_button(
|
310 |
+
label="Download labeled data",
|
311 |
+
data=json.dumps(st.session_state["validated_data"], indent=3).encode("ASCII"),
|
312 |
+
file_name="validated_data.json",
|
313 |
+
mime="application/json",
|
314 |
+
):
|
315 |
+
update_validated_save_time()
|
316 |
+
if "save_validated_time" in st.session_state:
|
317 |
+
st.write("Saved on: " + st.session_state.save_validated_time)
|
318 |
+
|
319 |
+
# Add text to sidebar
|
320 |
+
with st.sidebar:
|
321 |
+
st.write("********")
|
322 |
+
st.markdown(
|
323 |
+
"""
|
324 |
+
#### Manual:
|
325 |
+
|
326 |
+
1. Set the list of possible event types
|
327 |
+
2. Select the input mode of the data (Manual or CSV)
|
328 |
+
3. If the codebook is empty, write a default template
|
329 |
+
- `This event involves [Z].` is a good starting point
|
330 |
+
4. Write/Select an event description
|
331 |
+
5. Run PR-ENT
|
332 |
+
6. Check the event type classification
|
333 |
+
- If it is correct then select Accept and return to step 4.
|
334 |
+
- If it is wrong then select Reject and populate the codebook with the appropriate filled templates. The classification is updated for each change, when it is correct, click Accept.
|
335 |
+
7. Return to step 4
|
336 |
+
|
337 |
+
#### Tips & Tricks:
|
338 |
+
|
339 |
+
- If you start a codebook from scratch, it may be easier to pass a manual text example for each event type to get a first codebook draft
|
340 |
+
- Current codebook accuracy based on labeled data can be found in the top right
|
341 |
+
- The approach does not aim for perfect accuracy and some failures can happen, e.g. some event descriptions can produce filled templates that are not satisfactory.
|
342 |
+
"""
|
343 |
+
)
|
344 |
+
|
345 |
+
# Add accuracy table
|
346 |
+
with col_intro_righter:
|
347 |
+
accuracy = st.empty()
|
348 |
+
# We fill the table with the last acc to avoid having it disappearing each time
|
349 |
+
if "acc_df" in st.session_state:
|
350 |
+
accuracy.table(
|
351 |
+
st.session_state.acc_df.loc["Accuracy":"Accuracy"].style.format("{:.2}")
|
352 |
+
)
|
353 |
+
performance_container = st.expander("Detailed Performances")
|
354 |
+
|
355 |
+
|
356 |
+
st.write("*********")
|
357 |
+
col_left, col_right = st.columns(2)
|
358 |
+
|
359 |
+
# Add widgets to add event type and choose text input
with col_intro_left:
    with st.expander("Event Types List"):
        st.markdown(
            """
## Select Event Types.
"""
        )

        if "class_list_perm" not in st.session_state:
            st.session_state["class_list_perm"] = []

        # Text field + button to add new event types to the multiselect
        new_class = st.text_input(
            "Add a new event type", "", key="new_class_text_input"
        )
        st.button(
            "Add Class",
            on_click=callback_add_to_multiselect,
            args=(
                new_class,
                "multiselect_classes",
                "new_class_text_input",
                "class_list_perm",
            ),
        )
        # Multiselect to choose event types
        if "multiselect_classes" not in st.session_state:
            st.session_state["multiselect_classes"] = list(
                st.session_state.codebook["events"].keys()
            )
        class_list = st.multiselect(
            "Event Type List",
            set(
                st.session_state["class_list_perm"]
                + list(st.session_state.codebook["events"].keys())
            ),
            st.session_state["multiselect_classes"],
            key="multiselect_classes",
        )
        st.session_state["class_list_perm"] = class_list

    with st.expander("Select Text Input Mode (Manual, CSV)"):
        st.write(
            """
Choose the text input of the event descriptions. Two choices:
- Manual: One event description can be manually input
- From CSV: If a CSV of event descriptions was provided
"""
        )

        def callback_radio_text_choice():
            st.session_state.text = ""
            st.session_state.text_display = ""

        initiate_widget_st_state(
            "text_options_valid", "text_options_valid_perm", "Manual"
        )
        st.session_state["text_options_valid_perm"] = st.radio(
            "Choose text input",
            ["Manual", "From CSV"],
            index=get_idx_column(
                st.session_state["text_options_valid"], ["Manual", "From CSV"]
            ),
            key="text_options_valid",
            on_change=callback_radio_text_choice,
            horizontal=True,
        )


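# Left column: choose or type an event description, run PR-ENT on it, and
# accept/reject the resulting event type classification.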
with col_left:
    if st.session_state["text_options_valid_perm"] == "Manual":
        text = choose_text_menu("")
        # Reset all computations if text has changed
        if text != st.session_state.text:
            reset_computation_results()
            st.session_state.text_idx = None
        st.session_state.text = text
        st.session_state.text_display = text
    elif st.session_state["text_options_valid_perm"] == "From CSV":
        if st.button("Select Random Text"):
            sample = st.session_state.filtered_df.sample(n=1).iloc[0]
            text = sample[st.session_state["text_column_design_perm"]]
            idx = sample.name
            if text != st.session_state.text:
                reset_computation_results()
            st.session_state.text = text
            st.session_state.text_idx = idx
        st.session_state.text_display = st.session_state.text

    expected_time = st.session_state.time_comput * get_num_sentences_in_list_text(
        [st.session_state.text]
    )
    if st.button("Run PR-ENT / Expected time: {}sec".format(expected_time)):
        if "templates" in st.session_state.codebook:
            templates = st.session_state.codebook["templates"]
        else:
            templates = []
            st.warning("No template in codebook. Please add one.")

        additional_words = get_additional_words()
        st.session_state.res = {}
        res, time_comput = run_prent(st.session_state.text, templates, additional_words)
        st.session_state.res = res

    st.write("**Event Descriptions:** {}".format(st.session_state.text_display))
    ev_desc = st.empty()
    radio_empty = st.empty()

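    # st.session_state.res maps each prompt to its entailed answer candidates and
    # also keeps the bookkeeping keys "templates_used" and "additional_words_used",
    # which are read when the decision is stored below.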
    if st.session_state.res:
        list_filled_templates = get_all_filled_templates(st.session_state.res)

        list_event_type = find_event_types(
            st.session_state.codebook, list_filled_templates
        )
        event_type_text = ev_desc.markdown(
            "**Current Event Types Classification**: {}".format(
                "; ".join(list_event_type)
            )
        )

        if "accept_reject_text_perm" not in st.session_state:
            st.session_state["accept_reject_text_perm"] = "Ignore"

        def callback_function(mod, key):
            st.session_state[mod] = st.session_state[key]

        radio_empty.radio(
            "Accept or Reject Coding",
            ["Ignore", "Accept", "Reject"],
            key="accept_reject_text",
            on_change=callback_function,
            args=(
                "accept_reject_text_perm",
                "accept_reject_text",
            ),
            index=get_idx_column(
                st.session_state["accept_reject_text_perm"],
                ["Ignore", "Accept", "Reject"],
            ),
            horizontal=True,
        )

        decision = st.session_state["accept_reject_text_perm"]
        text_idx = st.session_state.text_idx
        text = st.session_state.text
        store_validated_data(
            text,
            decision,
            text_idx,
            st.session_state.res["templates_used"],
            st.session_state.res["additional_words_used"],
            list_event_type,
            prent_params=(TOP_K, NLI_LIMIT),
        )


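# Right column: add templates and populate the codebook with filled templates
# for each event type.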
with col_right:

    if (
        st.session_state["accept_reject_text_perm"] == "Reject"
    ) or not st.session_state.codebook["templates"]:
        with st.expander("Add Templates + Explanation"):
            st.markdown(
                """
## Add Templates
"""
            )
            st.markdown(
                """
For each template added, PR-ENT will be run on the selected text.
"""
            )

            if "templates" not in st.session_state.codebook:
                st.session_state.codebook["templates"] = []

            template = st.text_input(
                "Template with a mask [Z].", "This event involves [Z]."
            )

            if st.button("Add template"):
                if template not in st.session_state.codebook["templates"]:
                    ## Add template to codebook
                    st.session_state.codebook["templates"].append(template)

                    additional_words = get_additional_words()
                    prompt = template.replace("[Z]", "{}")
                    results_nli, _ = do_prent(
                        st.session_state.text,
                        prompt,
                        TOP_K,
                        NLI_LIMIT,
                        additional_words,
                    )
                    tokens_nli = [x[0] for x in results_nli]

                    # Update result table with the new template
                    if not st.session_state["res"]:
                        st.session_state.res = {}
                        st.session_state.res["additional_words_used"] = additional_words
                        st.session_state.res["templates_used"] = []
                    st.session_state.res[prompt] = tokens_nli
                    st.session_state.res["templates_used"].append(template)
                    st.write("Template '{}' added.".format(template))
                else:
                    st.write("Template '{}' already added.".format(template))

    if st.session_state.codebook["templates"]:
        with st.expander("Populate Codebook Explanation"):
            st.markdown(
                """
## Assign filled templates to each class.
For each class you can select one or more filled templates. When the evaluation is
made, these filled templates are compared with the results of PR-ENT. There are 4 options:
- ALL: If **ALL** of these filled templates are present in the results of PR-ENT then this event type is correct
- ANY: If **ANY** of these filled templates is present in the results of PR-ENT then this event type is correct
- NOT ALL: If **ALL** of these filled templates are present in the results of PR-ENT, then this event type is **not** correct
    - e.g. You may want to remove all *explosions* events from a class *Killings*.
- NOT ANY: If **ANY** of these filled templates is present in the results of PR-ENT, then this event type is **not** correct

Moreover, the **ANY/ALL** and **NOT ANY/NOT ALL** conditions can be combined with an **AND / OR** relation.
"""
            )

        st.write("***************")
        st.write("### Populate Codebook")
        if not class_list:
            st.warning("No event type in codebook.")

        tokens_list = get_all_filled_templates(st.session_state.res)

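        # Sketch of how these selections are evaluated (the actual logic lives in
        # helpers.find_event_types). Assuming the entry layout built below and an "OR"
        # relation, an event type is assigned roughly when
        #     all(t in filled for t in entry["all"]) or any(t in filled for t in entry["any"])
        # holds and the corresponding NOT ALL / NOT ANY condition does not.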
        for event_type in class_list:
            st.session_state.codebook["events"].setdefault(event_type, {})
            event_type_chosen = event_type
            with st.expander(event_type):

                def declare_ms_event_templates(
                    widget_key, widget_display, codebook_key
                ):
                    if widget_key not in st.session_state:
                        st.session_state[widget_key] = st.session_state.codebook[
                            "events"
                        ][event_type_chosen].setdefault(codebook_key, [])

                    tokens_all = st.multiselect(
                        widget_display,
                        set(
                            list(
                                tokens_list
                                + st.session_state.codebook["events"][
                                    event_type_chosen
                                ].setdefault(codebook_key, [])
                            )
                        ),
                        st.session_state[widget_key],
                        key=widget_key,
                    )
                    st.session_state.codebook["events"][event_type_chosen][
                        codebook_key
                    ] = tokens_all

                declare_ms_event_templates(
                    "ms_all_{}".format(event_type_chosen), "ALL", "all"
                )

                st.session_state.codebook["events"][event_type_chosen][
                    "all_any_rel"
                ] = st.selectbox(
                    "Relation",
                    ["AND", "OR"],
                    index=get_idx_column(
                        st.session_state.codebook["events"][
                            event_type_chosen
                        ].setdefault("all_any_rel", "OR"),
                        ["AND", "OR"],
                    ),
                    key="select_relation_any_all_{}".format(event_type_chosen),
                )

                declare_ms_event_templates(
                    "ms_any_{}".format(event_type_chosen), "ANY", "any"
                )

                declare_ms_event_templates(
                    "ms_not_all_{}".format(event_type_chosen), "NOT ALL", "not_all"
                )

                st.session_state.codebook["events"][event_type_chosen][
                    "not_all_any_rel"
                ] = st.selectbox(
                    "Relation",
                    ["AND", "OR"],
                    index=get_idx_column(
                        st.session_state.codebook["events"][
                            event_type_chosen
                        ].setdefault("not_all_any_rel", "OR"),
                        ["AND", "OR"],
                    ),
                    key="select_relation_not_any_all_{}".format(event_type_chosen),
                )

                declare_ms_event_templates(
                    "ms_not_any_{}".format(event_type_chosen), "NOT ANY", "not_any"
                )

# Workaround to avoid the expanders closing after first modification
# I have no explanation for the bug
if st.session_state.rerun:
    st.session_state.rerun = False
    st.experimental_rerun()


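# Performance section: recompute PR-ENT for labeled samples that are missing newly
# added templates, then refresh the accuracy table.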
if "validated_data" in st.session_state:
|
673 |
+
recompute = False
|
674 |
+
performance_container.markdown(
|
675 |
+
"If a new template is added, the previous labeled samples needs to be recomputed with it. The next button allows that, however it can take some time depending on the number of samples."
|
676 |
+
)
|
677 |
+
if performance_container.button(
|
678 |
+
"Recompute Missing Templates", key="recompute_temp"
|
679 |
+
):
|
680 |
+
prog_bar = performance_container.progress(0)
|
681 |
+
for i, datapoint in enumerate(st.session_state["validated_data"].values()):
|
682 |
+
if not set(st.session_state.codebook["templates"]).issubset(
|
683 |
+
set(datapoint["templates"])
|
684 |
+
):
|
685 |
+
# Get templates that are missing from results but present in codebook
|
686 |
+
# These happens if templates are added a posteriori
|
687 |
+
missing_templates = list(
|
688 |
+
set(st.session_state.codebook["templates"])
|
689 |
+
- set(set(datapoint["templates"]))
|
690 |
+
)
|
691 |
+
recompute = True
|
692 |
+
# For now additional words are not recomputed
|
693 |
+
if not set(st.session_state.codebook["add_words"]).issubset(
|
694 |
+
set(datapoint["additional_words"])
|
695 |
+
):
|
696 |
+
missing_add_words = list(
|
697 |
+
set(st.session_state.codebook["add_words"])
|
698 |
+
- set(set(datapoint["additional_words"]))
|
699 |
+
)
|
700 |
+
recompute = True
|
701 |
+
else:
|
702 |
+
missing_add_words = None
|
703 |
+
|
704 |
+
if recompute:
|
705 |
+
res, _ = run_prent(
|
706 |
+
datapoint["text"],
|
707 |
+
missing_templates,
|
708 |
+
missing_add_words,
|
709 |
+
progress=False,
|
710 |
+
)
|
711 |
+
datapoint["filled_templates"].extend(get_all_filled_templates(res))
|
712 |
+
datapoint["templates"].extend(missing_templates)
|
713 |
+
prog_bar.progress(
|
714 |
+
(1 / len(st.session_state["validated_data"].values())) * (i + 1)
|
715 |
+
)
|
716 |
+
|
717 |
+
st.session_state.acc_df = pd.DataFrame(
|
718 |
+
validated_metric_per_event_types(st.session_state["validated_data"])
|
719 |
+
)
|
720 |
+
accuracy.table(
|
721 |
+
st.session_state.acc_df.loc["Accuracy":"Accuracy"].style.format("{:.2}")
|
722 |
+
)
|
723 |
+
performance_container.markdown("### Performances on labeled dataset")
|
724 |
+
performance_container.dataframe(st.session_state.acc_df.style.format("{:.3}"))
|
725 |
+
|
726 |
+
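# Re-render the current classification so the display reflects the latest codebook edits.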
if st.session_state.res:
    list_filled_templates = get_all_filled_templates(st.session_state.res)
    list_event_type = find_event_types(st.session_state.codebook, list_filled_templates)
    ev_desc.markdown(
        "**Current Event Types Classification**: {}".format("; ".join(list_event_type))
    )
pages/2_Codebook_Advanced_Edit.py
ADDED
@@ -0,0 +1,251 @@
import json

import streamlit as st

from helpers import apply_style, callback_add_to_multiselect, get_idx_column

apply_style()


# Avoid having ellipsis in the multi select options
styl = """
<style>
.stMultiSelect span{
    max-width: none;
}
</style>
"""
st.markdown(styl, unsafe_allow_html=True)

st.write("# Codebook Edit")

st.write(
    """In this tab you can:
- Add or remove templates
- Add or remove additional answer candidates
- Modify the filled templates by adding new ones manually"""
)

if "templates" not in st.session_state.codebook:
    st.warning("No codebook loaded")
    st.stop()

st.write("## Codebook: Template")


with st.expander("Templates"):
|
39 |
+
|
40 |
+
template = st.text_input(
|
41 |
+
"Template with a mask [Z].",
|
42 |
+
"This event involves [Z].",
|
43 |
+
key="add_template_text_input",
|
44 |
+
)
|
45 |
+
st.button(
|
46 |
+
"Add template",
|
47 |
+
on_click=callback_add_to_multiselect,
|
48 |
+
args=(
|
49 |
+
template,
|
50 |
+
"multiselect_templates",
|
51 |
+
"add_template_text_input",
|
52 |
+
"codebook",
|
53 |
+
"templates",
|
54 |
+
),
|
55 |
+
)
|
56 |
+
|
57 |
+
if "multiselect_templates" not in st.session_state:
|
58 |
+
st.session_state["multiselect_templates"] = st.session_state.codebook[
|
59 |
+
"templates"
|
60 |
+
]
|
61 |
+
|
62 |
+
st.write("Removed templates will be removed from the codebook.")
|
63 |
+
templates = st.multiselect(
|
64 |
+
"Templates",
|
65 |
+
set(st.session_state.codebook["templates"]),
|
66 |
+
st.session_state["multiselect_templates"],
|
67 |
+
key="multiselect_templates",
|
68 |
+
)
|
69 |
+
st.session_state.codebook["templates"] = templates
|
70 |
+
|
71 |
+
st.write("## Codebook: Additional Answer Candidates")
|
72 |
+
st.write(
|
73 |
+
"""
|
74 |
+
You can manually add answer candidates. Then they will be tested for entailment on every event
|
75 |
+
description and every template even if they are not present in the prompting results.
|
76 |
+
This is intended for case when the event that you try to describe is quite rare (e.g. shelling, missiles).
|
77 |
+
|
78 |
+
**Caution**: Each word added will increase the computation time (about +3%).
|
79 |
+
|
80 |
+
**Caution**: The PR-ENT model will always try to output the singular form of the word.
|
81 |
+
"""
|
82 |
+
)
|
83 |
+
with st.expander("Add answer candidates"):
|
84 |
+
|
85 |
+
new_word = st.text_input(
|
86 |
+
"Answer Candidate (1 word)", "", key="add_words_text_input"
|
87 |
+
)
|
88 |
+
st.button(
|
89 |
+
"Add Word",
|
90 |
+
on_click=callback_add_to_multiselect,
|
91 |
+
args=(
|
92 |
+
new_word,
|
93 |
+
"multiselect_addwords",
|
94 |
+
"add_words_text_input",
|
95 |
+
"codebook",
|
96 |
+
"add_words",
|
97 |
+
),
|
98 |
+
)
|
99 |
+
|
100 |
+
if "add_words" not in st.session_state.codebook:
|
101 |
+
st.session_state.codebook["add_words"] = []
|
102 |
+
|
103 |
+
if "multiselect_addwords" not in st.session_state:
|
104 |
+
st.session_state["multiselect_addwords"] = st.session_state.codebook[
|
105 |
+
"add_words"
|
106 |
+
]
|
107 |
+
|
108 |
+
templates = st.multiselect(
|
109 |
+
"Add Words",
|
110 |
+
set(st.session_state.codebook["add_words"]),
|
111 |
+
st.session_state["multiselect_addwords"],
|
112 |
+
key="multiselect_addwords",
|
113 |
+
)
|
114 |
+
st.session_state.codebook["add_words"] = templates
|
115 |
+
|
116 |
+
# TODO: Change by giving a list of templates and allow only filling a word.
|
117 |
+
st.write("## Codebook: Additional Filled Templates")
|
118 |
+
st.write(
|
119 |
+
"""
|
120 |
+
You can also manually add filled templates to the codebook. This is for the case when you know that a
|
121 |
+
filled template could appear but you don't find corresponding events. This does not increase much the
|
122 |
+
computation time. For example you could add `This event involves kidnapping.` if you have no kidnapping
|
123 |
+
event in your dataset but you know it could happen.
|
124 |
+
|
125 |
+
**Caution**: The PR-ENT model will always try to output the singular form of the word. (e.g. "Protests" -> "Protest")
|
126 |
+
"""
|
127 |
+
)
|
128 |
+
class_list = list(st.session_state.codebook["events"].keys())
|
129 |
+
|
130 |
+
|
131 |
+
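# "filled_templates" accumulates every filled template available for selection below,
# whether added manually in this tab or already stored in the codebook.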
if "filled_templates" not in st.session_state:
|
132 |
+
st.session_state["filled_templates"] = []
|
133 |
+
|
134 |
+
|
135 |
+
with st.expander("Add Filled Template"):
|
136 |
+
|
137 |
+
template_chosen = st.selectbox(
|
138 |
+
"Choose a template:",
|
139 |
+
st.session_state.codebook["templates"],
|
140 |
+
# index=get_idx_column(template, st.session_state.codebook["templates"]),
|
141 |
+
key="template_sct",
|
142 |
+
)
|
143 |
+
|
144 |
+
def add_template_with_word(template_chosen, new_word, key_text_input):
|
145 |
+
if len(new_word) == 0:
|
146 |
+
st.warning("Word is empty, did you press Enter on the field text?")
|
147 |
+
else:
|
148 |
+
st.session_state["filled_templates"].append(
|
149 |
+
template_chosen.replace("[Z]", new_word)
|
150 |
+
)
|
151 |
+
st.session_state[key_text_input] = ""
|
152 |
+
|
153 |
+
new_word = st.text_input("1 Word Mask", "", key="filled_template_text_input")
|
154 |
+
if st.button(
|
155 |
+
"Add Filled Template",
|
156 |
+
on_click=add_template_with_word,
|
157 |
+
args=(template_chosen, new_word, "filled_template_text_input"),
|
158 |
+
):
|
159 |
+
st.write("Filled template added.")
|
160 |
+
st.write("The template can then be selected for each class below.")
|
161 |
+
|
162 |
+
|
163 |
+
st.write("## Codebook: Event Types")
|
164 |
+
|
165 |
+
st.write(
|
166 |
+
"""
|
167 |
+
Here you have access to all filled templates independently of the template. You can add/remove some of them for
|
168 |
+
each event type.
|
169 |
+
"""
|
170 |
+
)
|
171 |
+
|
172 |
+
|
173 |
+
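# Expected codebook layout (as read and written in this file):
#   {"templates": [...], "add_words": [...],
#    "events": {event_type: {"all": [...], "any": [...], "not_all": [...], "not_any": [...],
#                            "all_any_rel": "AND"/"OR", "not_all_any_rel": "AND"/"OR"}}}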
for event_type in st.session_state.codebook["events"].keys():
    for any_not_all in st.session_state.codebook["events"][event_type].keys():
        if (any_not_all == "all_any_rel") or (any_not_all == "not_all_any_rel"):
            pass
        else:
            st.session_state["filled_templates"].extend(
                st.session_state.codebook["events"][event_type][any_not_all]
            )

for event_type in class_list:
    st.session_state.codebook["events"].setdefault(event_type, {})
    event_type_chosen = event_type
    with st.expander(event_type):

        def declare_ms_codebook_edit(widget_key, codebook_key, widget_display):
            if widget_key not in st.session_state:
                st.session_state[widget_key] = st.session_state.codebook["events"][
                    event_type_chosen
                ].setdefault(codebook_key, [])

            tokens_all = st.multiselect(
                widget_display,
                set(st.session_state["filled_templates"]),
                st.session_state[widget_key],
                key=widget_key,
            )
            st.session_state.codebook["events"][event_type_chosen][
                codebook_key
            ] = tokens_all

        declare_ms_codebook_edit("ms_all_{}".format(event_type_chosen), "all", "ALL")

        st.session_state.codebook["events"][event_type_chosen][
            "all_any_rel"
        ] = st.selectbox(
            "Relation",
            ["AND", "OR"],
            index=get_idx_column(
                st.session_state.codebook["events"][event_type_chosen].setdefault(
                    "all_any_rel", "OR"
                ),
                ["AND", "OR"],
            ),
            key="select_relation_any_all_{}".format(event_type_chosen),
        )

        declare_ms_codebook_edit("ms_any_{}".format(event_type_chosen), "any", "ANY")
        declare_ms_codebook_edit(
            "ms_not_all_{}".format(event_type_chosen), "not_all", "NOT ALL"
        )

        st.session_state.codebook["events"][event_type_chosen][
            "not_all_any_rel"
        ] = st.selectbox(
            "Relation",
            ["AND", "OR"],
            index=get_idx_column(
                st.session_state.codebook["events"][event_type_chosen].setdefault(
                    "not_all_any_rel", "OR"
                ),
                ["AND", "OR"],
            ),
            key="select_relation_not_any_all_{}".format(event_type_chosen),
        )
        declare_ms_codebook_edit(
            "ms_not_any_{}".format(event_type_chosen), "not_any", "NOT ANY"
        )

        if st.button("Remove Class", key="remove_class_{}".format(event_type_chosen)):
            del st.session_state.codebook["events"][event_type_chosen]

st.write("## Codebook: Download")


st.download_button(
    label="Download codebook as JSON",
    data=json.dumps(st.session_state.codebook, indent=3).encode("ASCII"),
    file_name="codebook.json",
    mime="application/json",
)
pages/3_Apply_Codebook.py
ADDED
@@ -0,0 +1,222 @@
import json
import os
import sys

import pandas as pd
import streamlit as st

current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)
from helpers import (
    apply_style,
    find_event_types,
    get_additional_words,
    get_nli_limit,
    get_num_sentences_in_list_text,
    get_top_k,
    run_prent,
)

### Styling
apply_style()


TOP_K = get_top_k()
NLI_LIMIT = get_nli_limit()


### Initialize session state variables
if "codebook" not in st.session_state:
    st.session_state.codebook = {}
    st.session_state.codebook.setdefault("events", {})

if "text" not in st.session_state:
    st.session_state.text = ""

if "res" not in st.session_state:
    st.session_state.res = None

if "accept_reject_text_perm" not in st.session_state:
    st.session_state.accept_reject_text_perm = None

if "validated_data" not in st.session_state:
    st.session_state["validated_data"] = {}

if "time_comput" not in st.session_state:
    st.session_state.time_comput = 20

if "rerun" not in st.session_state:
    st.session_state.rerun = False

if "label_res" not in st.session_state:
    st.session_state.label_res = {}

if "filtered_df" not in st.session_state:
    st.session_state["filtered_df"] = pd.DataFrame()

if len(st.session_state["filtered_df"]) == 0:
    st.warning("No data loaded.")


def reset_computation_results():
    st.session_state.res = {}
    st.session_state.recompute_all_templates = True
    st.session_state["accept_reject_text_perm"] = "Ignore"
    st.session_state.rerun = True


with st.sidebar:
    st.markdown(
        "Clicking any of these buttons during labeling will pause the process and download the latest version."
    )
    dl_labeled_button = st.empty()
    dl_labeled_button.download_button(
        label="Download Labeled Data",
        data=st.session_state["filtered_df"].to_csv(sep=";").encode("utf-8"),
        file_name="labeled_data.csv",
        mime="text/csv",
    )

    dl_prent_button = st.empty()
    dl_prent_button.download_button(
        label="Download PR-ENT results",
        data=json.dumps(st.session_state["label_res"], indent=3).encode("ASCII"),
        file_name="prent_results.json",
        mime="application/json",
    )


st.markdown(
    """# Apply codebook to the dataset
The currently loaded codebook will be used to find the event types of all event descriptions in the currently loaded dataset. This can take some time (minutes to hours) depending on the size of the dataset (number of events, length of text).
"""
)

markdown_num_events = st.empty()

label_button = st.empty()
st.markdown("#### Main progress bar")
main_progress_bar = st.empty()
main_progress_bar = main_progress_bar.progress(0)

st.markdown("#### Last labeled event")
temp_text = st.empty()
temp_class = st.empty()
temp_text.markdown("**Event Descriptions:** {}".format(""))
temp_class.markdown("**Event Types Classification**: {}".format(""))
st.markdown(
    """#### Pause/Stop the event coding
Pressing the button once will stop the process at the next iteration."""
)
stop_button = st.button("Stop")

for event_type in st.session_state.codebook["events"]:
    if event_type not in st.session_state.filtered_df.columns:
        st.session_state.filtered_df[event_type] = 0

expected_time = 0
num_sentences = 0
for idx in st.session_state.filtered_df.index:
    subsampled_data = st.session_state.filtered_df.loc[idx:idx]
    list_text = subsampled_data[st.session_state["text_column_design_perm"]].values[:1]
    list_index = subsampled_data.index[:1]
    if list_text[0] != st.session_state.text:
        reset_computation_results()
    st.session_state.text = list_text[0]
    num_sentences += get_num_sentences_in_list_text([st.session_state.text])
    expected_time += st.session_state.time_comput * get_num_sentences_in_list_text(
        [st.session_state.text]
    )

markdown_num_events.markdown(
    "Number of events: {} ¦ Number of sentences: {}".format(
        len(st.session_state.filtered_df.index), num_sentences
    )
)


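# Main labeling loop: run PR-ENT on every event description with the codebook
# templates and flag the matching event type columns in the dataframe.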
if label_button.button(
    "Label Data", disabled=len(st.session_state["filtered_df"]) == 0
):
    num_text = 0
    main_progress_bar.progress(num_text)
    temp_text.markdown("")
    temp_class.markdown("")
    tot_num_text = len(st.session_state.filtered_df.index)

    for idx in st.session_state.filtered_df.index:
        subsampled_data = st.session_state.filtered_df.loc[idx:idx]
        list_text = subsampled_data[st.session_state["text_column_design_perm"]].values[
            :1
        ]
        list_index = subsampled_data.index[:1]
        if list_text[0] != st.session_state.text:
            reset_computation_results()
        st.session_state.text = list_text[0]
        st.session_state.text_idx = list_index[0]
        st.session_state.template_list = []
        st.session_state.text_display = st.session_state.text

        st.session_state.res = {}
        res, time_comput = run_prent(
            st.session_state.text,
            st.session_state.codebook["templates"],
            get_additional_words(),
            progress=False,
            display_text=False,
        )
        st.session_state.res = res

        list_filled_templates = []
        for template in st.session_state.res:
            tmp = template.replace("[Z]", "{}")
            list_filled_templates.extend(
                [tmp.format(x) for x in st.session_state.res[template]]
            )
        list_event_type = find_event_types(
            st.session_state.codebook, list_filled_templates
        )
        for event_type in list_event_type:
            st.session_state.filtered_df.loc[idx, event_type] = 1
        temp_text.markdown(
            "**Event Descriptions:** {}".format(st.session_state.text_display)
        )
        temp_class.markdown(
            "**Event Types Classification**: {}".format("; ".join(list_event_type))
        )

        # Save results
        st.session_state.label_res[st.session_state.text_display] = {}
        st.session_state.label_res[st.session_state.text_display][
            "prent_results"
        ] = st.session_state.res
        st.session_state.label_res[st.session_state.text_display]["prent_params"] = (
            TOP_K,
            NLI_LIMIT,
        )
        st.session_state.label_res[st.session_state.text_display][
            "event_types"
        ] = list_event_type

        num_text += 1
        main_progress_bar.progress(num_text / tot_num_text)

    # Need to update the buttons, otherwise the downloaded file is not refreshed
    # and the user would need to click twice
    dl_labeled_button.download_button(
        label="Download Labeled Data",
        data=st.session_state["filtered_df"].to_csv(sep=";").encode("utf-8"),
        file_name="labeled_data.csv",
        mime="text/csv",
        key="tmp",
    )

    dl_prent_button.download_button(
        label="Download PR-ENT results",
        data=json.dumps(st.session_state["label_res"], indent=3).encode("ASCII"),
        file_name="prent_results.json",
        mime="application/json",
    )
requirements.txt
ADDED
@@ -0,0 +1,10 @@
# see environments.yml
numpy==1.23.2
pandas==1.4.2
spacy==3.2.3
https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl
transformers[torch]==4.22.1
nltk==3.7
streamlit==1.10.0
streamlit-aggrid==0.2.3.post2
inflect==6.0.0