"""Streamlit page for exploring perception scores and frame analyses of writing exercises.

The page reads a spreadsheet of writing exercises (``data.xlsx``) plus two
per-sentence JSON caches and shows:

1. perception scores per sentence or aggregated per document,
2. the change in mean perception between consecutive versions of an exercise,
3. a sentence-by-sentence frame analysis for one selected writer.
"""

import json
import re

import nltk
import pandas as pd
import streamlit as st

# Tokenizer data needed by nltk.sent_tokenize in analyze_document().
nltk.download("punkt")

# Both caches are looked up by the exact sentence string.
with open("_perception_cache.json", encoding="utf-8") as f:
    s2s = json.load(f)  # sentence -> perception table
with open("_analysis_cache.json", encoding="utf-8") as f:
    s2f = json.load(f)  # sentence -> frame analysis

db = pd.read_excel("data.xlsx")

FRAMES_OF_INTEREST = [
    "Abusing",
    "Attack",
    "Hit_target",
    "Quarreling",
    "Use_firearm",
    "Death",
    "Dead_or_alive",
    "Experience_bodily_harm",
    "Cause_harm",
    "Killing",
    "Event",
    "Catastrophe",
    "Offenses",
]

# Only the first MAX_SENTENCES sentences of a document are analyzed.
MAX_SENTENCES = 20

# Matches the version number embedded in a writing-exercise column name.
VERSION_PATTERN = re.compile(r"version (\d+)")

# Output label -> spreadsheet column holding that demographic attribute.
DEMOGRAPHIC_COLUMNS = {
    "writer": "Email Address",
    "gender": "I identify as ...",
    "language": "What is your native language?",
    "background": "What is your background?",
}


def get_frame_analysis(s):
    """Return a table of the frames of interest found in sentence ``s``.

    Args:
        s: Sentence string used as key into the analysis cache ``s2f``.

    Returns:
        A DataFrame with one row per frame structure whose frame is listed in
        ``FRAMES_OF_INTEREST`` (columns ``frame``, ``trigger`` and one column
        per role), or ``None`` when the sentence is not in the cache or
        contains no such frame.
    """
    if s not in s2f:
        return None

    frame_analysis = []
    for fns in s2f[s]["sociofillmore"][0]["fn_structures"]:
        if fns["frame"] not in FRAMES_OF_INTEREST:
            continue
        analysis = {
            "frame": fns["frame"],
            "trigger": " ".join(fns["target"]["tokens_str"]),
        }
        # Each role entry is indexed as (role name, role structure).
        analysis.update(
            {rol[0]: " ".join(rol[1]["tokens_str"]) for rol in fns["roles"]}
        )
        frame_analysis.append(analysis)

    if not frame_analysis:
        return None
    return pd.DataFrame(frame_analysis)


def analyze_document(doc):
    """Split ``doc`` into sentences and look up the cached analyses.

    Args:
        doc: Document text, or a missing value (``pd.isna``) for an empty cell.

    Returns:
        A tuple ``(sentences, perception_tables, frame_tables)``.
        ``sentences`` holds *all* sentences of the document, while the two
        table lists cover only the first ``MAX_SENTENCES`` sentences.
        An entry is ``None`` when the sentence is missing from the respective
        cache (or, for frames, has no frame of interest).
    """
    if pd.isna(doc):
        sentences = []
    else:
        sentences = nltk.sent_tokenize(doc, language="english")

    perception_tables = []
    frame_tables = []
    for s in sentences[:MAX_SENTENCES]:
        frame_tables.append(get_frame_analysis(s))
        perception_tables.append(s2s.get(s))
    return sentences, perception_tables, frame_tables


def _demographics(row):
    """Return the demographic attributes of a spreadsheet row under short labels."""
    return {label: row[column] for label, column in DEMOGRAPHIC_COLUMNS.items()}


def _mean_perception(perception_tables):
    """Return the column-wise mean of the perception tables as a Series.

    Sentences missing from the perception cache are represented by ``None``
    (see analyze_document); they are dropped first so they cannot break or
    distort the DataFrame construction.
    """
    known_tables = [table for table in perception_tables if table is not None]
    return pd.DataFrame(known_tables).mean()


def _plot_mean_by(frame, group_column, value_column):
    """Draw a bar chart of the mean of ``value_column`` per ``group_column``.

    Shows an informational message instead of raising when there is no
    perception column to plot (e.g. the selectbox had no options).
    """
    if value_column is None or value_column not in frame.columns:
        st.info("No perception scores available to plot.")
        return
    grouped_mean = frame.groupby(group_column).agg({value_column: "mean"})
    st.plotly_chart(grouped_mean.plot.bar(backend="plotly"))


st.write("# LorentzFillmore: WATCH YOUR LANGUAGE")
st.dataframe(db)

# --- Perception scores -------------------------------------------------------
st.write("## Writing Exercises & Perception Scores")

text_columns = [col for col in db.columns if col.startswith("Writing exercise:")]
if not text_columns:
    # Without a text column there is nothing to select or analyze below.
    st.error("No 'Writing exercise:' columns were found in the spreadsheet.")
    st.stop()

selected_column = st.selectbox(label="Writing exercise:", options=text_columns)
aggregate_sentences = st.checkbox(label="Aggregate over sentences?")

perception_rows = []
for _, row in db.iterrows():
    sentences, perception_tables, _ = analyze_document(row[selected_column])
    if aggregate_sentences:
        # One row per document: full sentence list plus mean scores.
        perception_row = {**_demographics(row), "text": sentences}
        perception_row.update(_mean_perception(perception_tables).to_dict())
        perception_rows.append(perception_row)
    else:
        # One row per analyzed sentence; zip stops at the shorter table list,
        # i.e. after MAX_SENTENCES sentences.
        for s, pt in zip(sentences, perception_tables):
            perception_row = {**_demographics(row), "text": s}
            if pt is not None:
                perception_row.update(pd.Series(pt).to_dict())
            perception_rows.append(perception_row)
perception_df = pd.DataFrame(perception_rows)

dimension = st.selectbox(
    label="Which dimension of perception?", options=["blame", "cause", "focus"]
)
dim_cols = [col for col in perception_df.columns if col.startswith(dimension)]
dim_df = perception_df[["writer", "text"] + dim_cols].style.background_gradient(
    subset=dim_cols, axis=None, vmin=-2, vmax=2, cmap="YlGnBu"
)
st.dataframe(dim_df)

st.write("### Analysis by demographic attribute")
demo_attrib = st.selectbox(
    "Select demographic attribute:",
    options=["writer", "gender", "language", "background"],
)
perc_attrib = st.selectbox("Select perception attribute", options=dim_cols)
_plot_mean_by(perception_df, demo_attrib, perc_attrib)

# --- Version comparison ------------------------------------------------------
st.write("## Comparing versions")

version_match = VERSION_PATTERN.search(selected_column)
v_number = int(version_match.group(1)) if version_match else None

if v_number is None:
    st.warning(
        "The selected writing exercise has no version number, "
        "so versions cannot be compared."
    )
elif v_number < 2:
    st.warning(
        "To compare versions, select a writing exercise with version number 2 or higher."
    )
else:
    prev_version = VERSION_PATTERN.sub(f"version {v_number - 1}", selected_column)
    if prev_version not in text_columns:
        st.warning(f"No column found for the previous version: _{prev_version}_")
    else:
        st.info(
            f"Comparing _{selected_column.replace('Writing exercise: ', '')}_ ↔️ _{prev_version.replace('Writing exercise: ', '')}_"
        )

        old_text_col = f"text_v{v_number - 1}"
        new_text_col = f"text_v{v_number}"

        perception_diff_rows = []
        for _, row in db.iterrows():
            sentences, perception_tables, _ = analyze_document(row[selected_column])
            prev_sentences, prev_perception_tables, _ = analyze_document(
                row[prev_version]
            )
            perception_diff_row = {
                **_demographics(row),
                old_text_col: prev_sentences,
                new_text_col: sentences,
            }
            perc_new = _mean_perception(perception_tables).to_dict()
            perc_old = _mean_perception(prev_perception_tables).to_dict()
            for k, v in perc_new.items():
                # A score with no counterpart in the previous version counts as "no change".
                perception_diff_row[k] = v - perc_old[k] if k in perc_old else 0
            perception_diff_rows.append(perception_diff_row)
        perception_diff_df = pd.DataFrame(perception_diff_rows)

        # Only show score columns that actually exist in the diff table.
        diff_dim_cols = [col for col in dim_cols if col in perception_diff_df.columns]
        dim_diff_df = perception_diff_df[
            ["writer", old_text_col, new_text_col] + diff_dim_cols
        ].style.background_gradient(
            subset=diff_dim_cols, axis=None, vmin=-2, vmax=2, cmap="YlGnBu"
        )
        st.dataframe(dim_diff_df)

        st.write("### Analysis by demographic attribute")
        demo_attrib_diff = st.selectbox(
            "Select demographic attribute for diff:",
            options=["writer", "gender", "language", "background"],
        )
        perc_attrib_diff = st.selectbox(
            "Select perception attribute for diff", options=diff_dim_cols
        )
        _plot_mean_by(perception_diff_df, demo_attrib_diff, perc_attrib_diff)

# --- Frame analysis for a single writer --------------------------------------
st.write("## Frame analysis")
only_sentences_with_relevant_frames = st.checkbox(
    "Only analyze sentences containing relevant frames?"
)

# Compare on the string form so that the selected option always matches a row,
# even when an address cell is not a plain string.
writer_ids = db["Email Address"].astype(str)
selected_writer = st.selectbox(label="Select a writer:", options=sorted(writer_ids))

st.write("----")
writer_row = db[writer_ids == selected_writer].iloc[0]

st.write("### Text information")
st.dataframe(writer_row)

sentences, perception_tables, frame_tables = analyze_document(
    writer_row[selected_column]
)

st.write("### Analysis of the entire document")
mean_perception = _mean_perception(perception_tables).to_frame(name="mean perception")
st.dataframe(mean_perception.style.highlight_max(axis=0), width=500)

st.write("---")
st.write("### Analysis by sentence")

analyzed_sentences = sentences[:MAX_SENTENCES]
for si, s in enumerate(analyzed_sentences):
    frame_analysis_df = frame_tables[si]
    if frame_analysis_df is None and only_sentences_with_relevant_frames:
        continue

    st.write(f"#### Sentence #{si + 1:02}/{len(analyzed_sentences)}\n*{s}*")

    if s not in s2s or s not in s2f:
        st.write("(Analysis not found)")
        continue

    st.write("##### Perception")
    sentence_perception = (
        pd.Series(perception_tables[si])
        .to_frame(name="predicted perception")
        .style.highlight_max(axis=0)
    )
    st.dataframe(sentence_perception, width=500)

    if frame_analysis_df is not None:
        st.write("##### Relevant frames")
        st.dataframe(frame_analysis_df, width=750)