import streamlit as st
import pandas as pd
import json
import nltk

import re 

# Tokenizer data required by nltk.sent_tokenize in analyze_document.
# NOTE(review): newer NLTK releases may look for the "punkt_tab" resource
# instead of "punkt" -- confirm against the installed NLTK version.
nltk.download("punkt")

# Pre-computed analyses, both looked up by sentence text elsewhere in this
# file: s2s holds perception scores, s2f holds frame-semantic structures.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default encoding.
with open("_perception_cache.json", encoding="utf-8") as f:
    s2s = json.load(f)
with open("_analysis_cache.json", encoding="utf-8") as f:
    s2f = json.load(f)

# One row per respondent; the columns read by this script are the demographic
# fields and the "Writing exercise:" text columns.
db = pd.read_excel("data.xlsx")

# Frame names that get_frame_analysis keeps; every other frame is dropped.
FRAMES_OF_INTEREST = [
    "Abusing",
    "Attack",
    "Hit_target",
    "Quarreling",
    "Use_firearm",
    "Death",
    "Dead_or_alive",
    "Experience_bodily_harm",
    "Cause_harm",
    "Killing",
    "Event",
    "Catastrophe",
    "Offenses",
]


def get_frame_analysis(s):
    """Collect the frames of interest that were detected in sentence ``s``.

    The sentence is looked up in the module-level ``s2f`` cache. For every
    frame structure whose frame name appears in ``FRAMES_OF_INTEREST`` one
    record is built holding the frame name, the trigger text and one entry
    per role (role name -> role text).

    Returns:
        A ``pandas.DataFrame`` with one row per matching frame, or ``None``
        when the sentence is not in the cache or no frame matches.
    """
    if s not in s2f:
        return None

    def span_text(span):
        # Token strings are stored as a list; show them as one string.
        return " ".join(span["tokens_str"])

    records = []
    for structure in s2f[s]["sociofillmore"][0]["fn_structures"]:
        if structure["frame"] not in FRAMES_OF_INTEREST:
            continue
        record = {
            "frame": structure["frame"],
            "trigger": span_text(structure["target"]),
        }
        # Each role is a pair: (role name, span description).
        for role in structure["roles"]:
            record[role[0]] = span_text(role[1])
        records.append(record)

    return pd.DataFrame(records) if records else None

def analyze_document(doc):
    """Split ``doc`` into sentences and gather the cached analyses for them.

    A missing document (anything ``pd.isna`` reports as missing) is treated
    as having no sentences. Only the first 20 sentences are analysed, but the
    complete sentence list is returned.

    Returns:
        A tuple ``(sentences, perception_tables, frame_tables)`` where the two
        table lists have one entry per analysed sentence: the ``s2s`` cache
        entry (``None`` if absent) and the result of ``get_frame_analysis``.
    """
    sentences = [] if pd.isna(doc) else nltk.sent_tokenize(doc, language="english")

    frame_tables, perception_tables = [], []
    for sentence in sentences[:20]:
        frame_tables.append(get_frame_analysis(sentence))
        perception_tables.append(s2s.get(sentence))

    return sentences, perception_tables, frame_tables


# ---------------------------------------------------------------------------
# Page header, raw data and the perception-score overview.
# Names defined here that later sections of the script read: text_columns,
# selected_column and dim_cols.
# ---------------------------------------------------------------------------
st.write("# LorentzFillmore: WATCH YOUR LANGUAGE")

st.dataframe(db)

st.write("## Writing Exercises & Perception Scores")

# Spreadsheet headers are not guaranteed to be strings, hence the isinstance check.
text_columns = [
    col for col in db.columns
    if isinstance(col, str) and col.startswith("Writing exercise:")
]
if not text_columns:
    # Without a text column the selectbox would return None and every
    # row[selected_column] lookup below would fail.
    st.error('No column starting with "Writing exercise:" was found in the data.')
    st.stop()

selected_column = st.selectbox(label="Writing exercise:", options=text_columns)
aggregate_sentences = st.checkbox(label="Aggregate over sentences?")

# Build one output row per writer (aggregated) or per analysed sentence.
perception_rows = []
for _, row in db.iterrows():
    sentences, perception_tables, frame_tables = analyze_document(row[selected_column])

    writer_info = {
        "writer": row["Email Address"],
        "gender": row["I identify as ..."],
        "language": row["What is your native language?"],
        "background": row["What is your background?"],
    }

    if aggregate_sentences:
        # Sentences missing from the perception cache come back as None;
        # leave them out so they cannot break or distort the mean.
        scored_tables = [pt for pt in perception_tables if pt is not None]
        mean_scores = pd.DataFrame(scored_tables).mean().to_dict()
        perception_rows.append({**writer_info, "text": sentences, **mean_scores})
    else:
        # zip stops at the shorter input, so only analysed sentences are listed.
        for s, pt in zip(sentences, perception_tables):
            perception_row = {**writer_info, "text": s}
            if pt is not None:
                perception_row.update(pd.Series(pt).to_dict())
            perception_rows.append(perception_row)

perception_df = pd.DataFrame(perception_rows)

dimension = st.selectbox(label="Which dimension of perception?", options=["blame", "cause", "focus"])
# Score columns are selected by name prefix, e.g. every column starting with "blame".
dim_cols = [
    col for col in perception_df.columns
    if isinstance(col, str) and col.startswith(dimension)
]

if perception_df.empty or not dim_cols:
    # Nothing to colour or plot: either there are no rows at all or no
    # sentence had a cached score for this dimension.
    st.info("No perception scores are available for this selection.")
else:
    dim_df = (
        perception_df[["writer", "text"] + dim_cols]
        .style.background_gradient(subset=dim_cols, axis=None, vmin=-2, vmax=2, cmap="YlGnBu")
    )
    st.dataframe(dim_df)

    st.write("### Analysis by demographic attribute")

    demo_attrib = st.selectbox("Select demographic attribute:", options=["writer", "gender", "language", "background"])
    perc_attrib = st.selectbox("Select perception attribute", options=dim_cols)
    st.plotly_chart(perception_df.groupby(demo_attrib).agg({perc_attrib: "mean"}).plot.bar(backend="plotly"))


# ---------------------------------------------------------------------------
# Compare the selected writing exercise with its previous version.
# Reads selected_column, text_columns and dim_cols from earlier in the script.
# ---------------------------------------------------------------------------
st.write("## Comparing versions")

# "\d+" (not "\d") so multi-digit version numbers are read in full; the
# column may also carry no version number at all, in which case there is no
# match and nothing to compare.
version_match = re.search(r"version (\d+)", selected_column or "")
v_number = int(version_match.group(1)) if version_match else None

if v_number is None:
    st.warning("The selected writing exercise has no version number, so there is nothing to compare.")
elif v_number < 2:
    st.warning("To compare versions, select a writing exercise with version number 2 or higher.")
else:
    prev_version = re.sub(r"version (\d+)", f"version {v_number - 1}", selected_column)
    if prev_version not in text_columns:
        # Report missing data to the user instead of asserting: assert is
        # stripped under -O and otherwise surfaces as a raw traceback.
        st.error(f"Cannot compare versions: the data has no column named '{prev_version}'.")
    else:
        st.info(f"Comparing _{selected_column.replace('Writing exercise: ', '')}_ ↔️ _{prev_version.replace('Writing exercise: ', '')}_")

        perception_diff_rows = []
        for _, row in db.iterrows():
            sentences, perception_tables, frame_tables = analyze_document(row[selected_column])
            prev_sentences, prev_perception_tables, prev_frame_tables = analyze_document(row[prev_version])

            perception_diff_row = {
                "writer": row["Email Address"],
                "gender": row["I identify as ..."],
                "language": row["What is your native language?"],
                "background": row["What is your background?"],
                f"text_v{v_number - 1}": prev_sentences,
                f"text_v{v_number}": sentences,
            }
            # Sentences missing from the perception cache come back as None;
            # leave them out so they cannot break or distort the means.
            perc_new = pd.DataFrame([pt for pt in perception_tables if pt is not None]).mean().to_dict()
            perc_old = pd.DataFrame([pt for pt in prev_perception_tables if pt is not None]).mean().to_dict()
            for k, v in perc_new.items():
                # NOTE(review): a score absent from the previous version is
                # reported as 0, i.e. "no change" -- confirm this is intended.
                perception_diff_row[k] = v - perc_old[k] if k in perc_old else 0
            perception_diff_rows.append(perception_diff_row)

        perception_diff_df = pd.DataFrame(perception_diff_rows)

        # dim_cols was derived from a different table; keep only the score
        # columns that actually exist in the diff table.
        diff_dim_cols = [col for col in dim_cols if col in perception_diff_df.columns]

        if not diff_dim_cols:
            st.info("No perception scores are available to compare for this selection.")
        else:
            dim_diff_df = (
                perception_diff_df[["writer", f"text_v{v_number - 1}", f"text_v{v_number}"] + diff_dim_cols]
                .style.background_gradient(subset=diff_dim_cols, axis=None, vmin=-2, vmax=2, cmap="YlGnBu")
            )
            st.dataframe(dim_diff_df)

            st.write("### Analysis by demographic attribute")

            demo_attrib_diff = st.selectbox("Select demographic attribute for diff:", options=["writer", "gender", "language", "background"])
            perc_attrib_diff = st.selectbox("Select perception attribute for diff", options=diff_dim_cols)
            st.plotly_chart(perception_diff_df.groupby(demo_attrib_diff).agg({perc_attrib_diff: "mean"}).plot.bar(backend="plotly"))


# ---------------------------------------------------------------------------
# Per-writer view: document-level mean perception plus, for each analysed
# sentence, its perception scores and relevant frames.
# Reads selected_column from earlier in the script.
# ---------------------------------------------------------------------------
st.write("## Frame analysis")
only_sentences_with_relevant_frames = st.checkbox("Only analyze sentences containing relevant frames?")

# The selectbox shows addresses as strings, so the row lookup must compare
# strings too; comparing against raw cell values finds no row for
# non-string or missing addresses.
writer_ids = [str(email) for email in db["Email Address"]]
selected_writer = st.selectbox(
    label="Select a writer:",
    options=sorted(set(writer_ids))
)

st.write("----")

if selected_writer is None:
    # The selectbox had no options, i.e. the data contains no rows.
    st.info("No writers found in the data.")
    st.stop()

# First row whose stringified address matches the selection.
writer_row = db.iloc[writer_ids.index(selected_writer)]

st.write("### Text information")
st.dataframe(writer_row)

sentences, perception_tables, frame_tables = analyze_document(writer_row[selected_column])
# analyze_document only analyses the first 20 sentences.
shown_sentences = sentences[:20]

st.write("### Analysis of the entire document")
# Sentences missing from the perception cache come back as None; leave them
# out so they cannot break or distort the mean.
scored_tables = [pt for pt in perception_tables if pt is not None]
mean_perception = pd.DataFrame(scored_tables).mean().to_frame(name="mean perception")
if mean_perception.empty:
    st.write("(Analysis not found)")
else:
    st.dataframe(mean_perception.style.highlight_max(axis=0), width=500)

st.write("---")
st.write("### Analysis by sentence")

for si, (s, perception_table, frame_analysis_df) in enumerate(
    zip(shown_sentences, perception_tables, frame_tables)
):
    if frame_analysis_df is None and only_sentences_with_relevant_frames:
        continue

    st.write(f"#### Sentence #{1+si:02}/{len(shown_sentences)}\n*{s}*")
    if s not in s2s or s not in s2f:
        st.write("(Analysis not found)")
        continue

    st.write("##### Perception")
    # Use a dedicated name here so the module-level perception_df table is
    # not rebound to a Styler.
    sentence_perception = (
        pd.Series(perception_table)
        .to_frame(name="predicted perception")
        .style.highlight_max(axis=0)
    )
    st.dataframe(sentence_perception, width=500)

    if frame_analysis_df is not None:
        st.write("##### Relevant frames")
        st.dataframe(frame_analysis_df, width=750)