bjorn-hommel
committed on
Commit
•
4832fb3
1
Parent(s):
b7cee2e
init commit
Browse files- .env +1 -0
- .gitignore +1 -0
- app.py +186 -0
- init.json +3 -0
- logo-130x130.svg +35 -0
- modeling.py +139 -0
- requirements.txt +10 -0
- sample_input.yaml +12 -0
- utils.py +27 -0
.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
model_path="/nlp/models/published/surveybot3000"
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__
|
app.py
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import logging
|
4 |
+
import json
|
5 |
+
import yaml
|
6 |
+
import pandas as pd
|
7 |
+
import numpy as np
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
|
10 |
+
import modeling
|
11 |
+
|
12 |
+
def show_launch(placeholder):
    """Render the disclaimer screen inside *placeholder*.

    Shows the disclaimer text and an "Accept Disclaimer" button. When the
    button is pressed, ``st.session_state.show_launch`` is set to False and
    both the container and the button are cleared so the demo can be shown.
    """
    with placeholder.container():
        st.divider()
        st.markdown("""
        ## Before Using the App
        ### Disclaimer
        This application is provided as-is, without any warranty or guarantee of any kind, expressed or implied. It is intended for educational, non-commercial use only.
        The developers of this app shall not be held liable for any damages or losses incurred from its use. By using this application, you agree to the terms and conditions
        outlined herein and acknowledge that any commercial use or reliance on its functionality is strictly prohibited.

        Furthermore, by using this application, you consent to the collection of anonymous usage data. This data will be used for research purposes and to improve the
        application's functionality. No personal information will be recorded or stored.
        """, unsafe_allow_html=True)

        button_placeholder = st.empty()

        # The click triggers a Streamlit re-run; clearing both placeholders
        # removes the disclaimer within the same run.
        if button_placeholder.button(label='Accept Disclaimer', type='primary', use_container_width=True):
            st.session_state.show_launch = False
            placeholder.empty()
            button_placeholder.empty()
|
32 |
+
|
33 |
+
def show_demo(placeholder):
    """Render the interactive demo inside *placeholder*.

    Lets the user submit a YAML-formatted scale structure, pipes it through
    the ``modeling`` module (model loading, input parsing, encoding) and
    displays synthetic item correlations, scale correlations and reliability
    estimates in three tabs. Each processing stage reports its own error and
    echoes the parsed input back for debugging.
    """
    with placeholder:
        with st.container():
            st.divider()
            st.markdown("""
            ## Try it yourself!
            Our recent research shows that sentence transformer ("AI" models)
            can predict respondent patterns in survey data! The model accurately
            infers item-correlation with *r* = **.71** 🧨, and shows even higher
            precision for scale correlations (*r* = **.89** 💥) and reliability
            coefficients (*r* = **.86** 💣)!

            Try it yourself by defining a scale structure using the input field
            below and let the **SurveyBot3000** predict the expected response
            pattern. Use the [YAML](https://yaml.org/) format or follow the structure
            outlined by the preset example.
            """)

            with st.form("my_form"):

                input_yaml = st.text_area(
                    label="Questionnaire Structure (YAML-Formatted)",
                    value=st.session_state['input_yaml'],
                    height=250
                )

                st.session_state.results_as_matrix = st.checkbox(
                    label="Result as matrix",
                    help="Results will be list-formated (long) by default. Enable to get (wide-format) matrices."
                )

                submitted = st.form_submit_button(
                    label="Get Synthetic Estimates",
                    type="primary",
                    use_container_width=True
                )

            if submitted:

                # Stage 1: parse the YAML. Abort with a hint on syntax errors.
                try:
                    yaml_dict = yaml.safe_load(input_yaml)
                except yaml.YAMLError:
                    st.error("Yikes, you better get your YAML straight! Check https://yaml.org/ for help!")
                    return None

                # Stage 2: make sure the sentence-transformer model is loaded.
                try:
                    modeling.load_model()
                except Exception as error:
                    st.error(f"Error while loading model: {error}")
                    st.json(yaml_dict)
                    return None

                # Stage 3: flatten {scale: [items]} into a long-format frame.
                try:
                    st.session_state.input_data = modeling.process_yaml_input(yaml_dict)
                except Exception as error:
                    st.error(error)
                    st.json(yaml_dict)
                    return None

                # Stage 4: embed every item text with the model.
                try:
                    st.session_state.input_data = modeling.encode_input_data()
                except Exception as error:
                    st.error(error)
                    st.json(yaml_dict)
                    return None

            if 'input_data' in st.session_state:

                tab1, tab2, tab3 = st.tabs(["Item Correlations", "Scale Correlations", "Scale Reliabilities"])

                with tab1:
                    st.markdown("Θ = Synthetic Item Correlation")
                    df = modeling.synthetic_item_correlations()
                    st.dataframe(df, use_container_width=True)

                with tab2:
                    st.markdown("Θ = Synthetic Scale Correlation")
                    df = modeling.synthetic_scale_correlations()
                    st.dataframe(df, use_container_width=True)

                with tab3:
                    st.markdown("alpha (Θ) = Synthetic Reliability Estimate")
                    # Cronbach's alpha needs at least 3 items per scale.
                    if np.min(modeling.get_items_per_scale()) < 3:
                        st.error("Please make sure that each scale consists of at least 3 items!")
                    else:
                        df = modeling.synthetic_reliabilities()
                        st.dataframe(df, use_container_width=True)

                # yaml_dict only exists on the run in which the form was
                # submitted; on later re-runs the echo section is skipped.
                if 'yaml_dict' in locals():
                    st.markdown("### Input Structure:")
                    st.json(yaml_dict)
|
124 |
+
|
125 |
+
def handle_checkbox_change():
    """Callback that flips the stored checkbox flag in the session state."""
    # session_state supports both attribute and mapping access; the mapping
    # form is used here. Additional checkbox-triggered actions can be added
    # below.
    st.session_state['checkbox_state'] = not st.session_state['checkbox_state']
|
129 |
+
def initialize():
    """One-time app setup: env vars, logging, and default session state.

    Loads variables from the local ``.env`` file (e.g. ``model_path``) and
    configures root logging. On the first run of a session, seeds
    ``st.session_state`` with the defaults stored in ``init.json`` and marks
    the state as loaded so this is not repeated on re-runs.
    """
    load_dotenv()
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)

    if 'state_loaded' not in st.session_state:

        st.session_state['state_loaded'] = True
        with open('init.json') as json_data:
            st.session_state.update(json.load(json_data))
|
138 |
+
|
139 |
+
def main():
    """Build the page: header, links, and either the disclaimer or the demo."""
    st.set_page_config(page_title='Synthetic Correlations')

    # Two-column header: logo on the left, title on the right.
    col1, col2 = st.columns([2, 5])
    with col1:
        st.image('logo-130x130.svg')

    with col2:
        st.markdown("# Synthetic Correlations")
        st.markdown("#### Estimate Item and Scale Correlations, as well as Reliability Coefficients based on nothing but Text!")

    st.markdown("""

    📖 **Preprint (Open Access)**: https://osf.io/preprints/psyarxiv/kjuce

    🖊️ **Cite**: *Hommel, B. E., & Arslan, R. C. (2024). Language models accurately infer correlations between psychological items and scales from text alone. https://doi.org/10.31234/osf.io/kjuce*

    🌐 **Project website**: https://synth-science.github.io/surveybot3000/

    💾 **Data**: https://osf.io/z47qs/

    #️⃣ **Social Media**:
    - [Björn Hommel on X/Twitter](https://twitter.com/BjoernHommel)
    - [Ruben Arslan on X/Twitter](https://twitter.com/rubenarslan/)

    The web application is maintained by [magnolia psychometrics](https://www.magnolia-psychometrics.com/).
    """, unsafe_allow_html=True)

    placeholder_launch = st.empty()
    placeholder_demo = st.empty()

    # Seed the YAML editor with the bundled example on the first run.
    if 'input_yaml' not in st.session_state:

        with open('sample_input.yaml', 'r') as file:
            try:
                st.session_state['input_yaml'] = file.read()
            except Exception as error:
                # NOTE(review): errors are only printed to stdout here, not
                # surfaced in the UI — presumably best-effort by design.
                print(error)

    # First run of a session shows the disclaimer; the acceptance click
    # re-runs the script, at which point the demo branch is taken.
    if 'disclaimer' not in st.session_state:
        show_launch(placeholder_launch)
        st.session_state['disclaimer'] = True
    else:
        show_demo(placeholder_demo)
|
183 |
+
|
184 |
+
# Script entry point: set up session state, then render the page.
if __name__ == '__main__':
    initialize()
    main()
|
init.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results_as_matrix" : false
|
3 |
+
}
|
logo-130x130.svg
ADDED
modeling.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
import streamlit as st
|
4 |
+
import pandas as pd
|
5 |
+
import pingouin as pg
|
6 |
+
from sentence_transformers import SentenceTransformer, util
|
7 |
+
|
8 |
+
def load_model():
    """Load the SurveyBot3000 sentence transformer into the session state.

    Idempotent per session: does nothing if ``st.session_state.model`` is
    already set. The model location is taken from the ``remote_model_path``
    environment variable if present, otherwise from ``model_path`` (supplied
    via ``.env``).
    """
    if st.session_state.get('model') is None:
        with st.spinner('Loading the model might take a couple of seconds...'):

            # Prefer a remote path (e.g. a hub model id) over the local one.
            if os.environ.get('remote_model_path'):
                model_path = os.environ.get('remote_model_path')
            else:
                model_path = os.getenv('model_path')

            st.session_state.model = SentenceTransformer(
                model_name_or_path=model_path#,
                #use_auth_token=  # NOTE(review): auth token hookup left for private models
            )

            logging.info('Loaded SurveyBot3000!')
|
24 |
+
|
25 |
+
def process_yaml_input(yaml_dict):
    """Flatten a ``{scale: [item, ...]}`` mapping into a long-format frame.

    Returns a DataFrame with two columns, ``scale`` and ``item``, one row per
    item. Scales of unequal length are handled by NaN-padding the wide frame
    and dropping the padding when stacking.
    """
    # Wide frame: one column per scale, NaN-padded to the longest scale.
    wide = pd.DataFrame({scale: pd.Series(items) for scale, items in yaml_dict.items()})

    # Stack to long format (drops the NaN padding), then tidy up the
    # auto-generated index columns.
    long = wide.stack().reset_index()
    long = long.drop('level_0', axis=1)
    long = long.rename(columns={'level_1': 'scale', 0: "item"})
    return long
|
36 |
+
|
37 |
+
def get_items_per_scale():
    """Return the number of items per scale in the current input data.

    Reads the long-format frame from ``st.session_state.input_data``; group
    order follows pandas' default groupby sorting (alphabetical by scale).
    """
    counts = st.session_state.input_data.groupby('scale').size()
    return counts.tolist()
|
41 |
+
|
42 |
+
def encode_input_data():
    """Embed every item text and attach the vectors to the input frame.

    Adds an ``embeddings`` column (one numpy vector per item) to the frame in
    ``st.session_state.input_data`` — note the frame is mutated in place as
    well as returned.
    """
    with st.spinner('Encoding items...'):
        input_data = st.session_state.input_data
        # One encode() call per item; fine for the small questionnaires this
        # app expects.
        input_data['embeddings'] = input_data.item.apply(lambda x: st.session_state.model.encode(
            sentences=x,
            convert_to_numpy=True
        ))

    return(input_data)
|
52 |
+
|
53 |
+
def synthetic_item_correlations():
    """Return pairwise synthetic item correlations (cosine similarities).

    Builds an item-by-item cosine-similarity matrix from the stored
    embeddings, rounded to 2 decimals. Unless
    ``st.session_state.results_as_matrix`` is set, the matrix is melted into a
    long frame with columns ``item_a``, ``item_b``, ``Θ``, keeping each
    unordered pair once (lexicographic filter ``item_a < item_b``).
    """
    df = pd.DataFrame(
        data = util.cos_sim(
            a=st.session_state.input_data.embeddings,
            b=st.session_state.input_data.embeddings
        ),
        columns=st.session_state.input_data.item,
        index=st.session_state.input_data.item
    ).round(2)

    if st.session_state.results_as_matrix is False:
        df = (
            df
            .reset_index()
            .melt(id_vars=['item'], var_name='item_b', value_name='Θ')
            .rename(columns={'item': 'item_a'})
            # Keep one row per pair and drop self-correlations.
            .query('item_a < item_b')
        )

    return(df)
|
74 |
+
|
75 |
+
|
76 |
+
def synthetic_scale_correlations():
    """Return pairwise synthetic scale correlations (cosine similarities).

    A scale is represented by the mean of its item embeddings; scale-level
    correlations are the cosine similarities between those mean vectors,
    rounded to 2 decimals. Unless ``st.session_state.results_as_matrix`` is
    set, the matrix is melted into a long frame with columns ``scale_a``,
    ``scale_b``, ``Θ``, keeping each unordered pair once.
    """
    scales = st.session_state.input_data.scale
    embeddings = st.session_state.input_data.embeddings.apply(pd.Series)

    # Mean embedding per scale (one row per scale, one column per dimension).
    data = (
        pd
        .concat([scales, embeddings], axis=1)
        .groupby('scale')
        .mean()
        .reset_index()
    )

    # Collect each row's embedding dimensions back into a single vector.
    mean_embeddings = data.apply(lambda row: [row[col] for col in data.columns if col != 'scale'], axis=1)
    matrix = util.cos_sim(a=mean_embeddings, b=mean_embeddings)
    df = pd.DataFrame(
        data=matrix,
        columns = data.scale.tolist(),
        index=data.scale.tolist()
    ).round(2)

    if st.session_state.results_as_matrix is False:
        df = (
            df
            .reset_index()
            .melt(id_vars='index', var_name='scale_b', value_name='Θ')
            .rename(columns={'index': 'scale_a'})
            # Keep one row per pair and drop self-correlations.
            .query('scale_a < scale_b')
        )

    return(df)
|
114 |
+
|
115 |
+
def synthetic_reliabilities():
    """Return synthetic reliability estimates (Cronbach's alpha) per scale.

    For every scale, treats the embedding dimensions as observations of the
    scale's items and computes Cronbach's alpha via pingouin. Returns a frame
    with columns ``scale``, ``alpha (Θ)``, ``ci_lower``, ``ci_upper`` rounded
    to 2 decimals. Requires at least 3 items per scale (enforced by the
    caller).
    """
    def reliability(group_data):
        # Transpose so items become columns, as pg.cronbach_alpha expects.
        group_data = group_data.drop('scale', axis=1).T
        alpha = pg.cronbach_alpha(data=group_data)
        # alpha is (estimate, (ci_lower, ci_upper)); flatten to a 3-list.
        x = [alpha[0], alpha[1][0], alpha[1][1]]

        return(x)

    scales = st.session_state.input_data.scale
    embeddings = st.session_state.input_data.embeddings.apply(pd.Series)

    # One [alpha, ci_lower, ci_upper] triple per scale, indexed by scale name.
    data = (
        pd
        .concat([scales, embeddings], axis=1)
        .groupby('scale')
        .apply(lambda group: reliability(group))
    )

    # Prepend the scale name to each triple to build the result rows.
    df = pd.DataFrame(
        data=[[v] + data.tolist()[k] for k, v in enumerate(data.index.tolist())],
        columns=['scale', 'alpha (Θ)', 'ci_lower', 'ci_upper']
    ).round(2)

    return(df)
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
pyyaml
|
3 |
+
# json is part of the Python standard library; no install required
|
4 |
+
pandas==2.2.0
|
5 |
+
numpy==1.26.3
|
6 |
+
sentence_transformers==2.2.2
|
7 |
+
sentencepiece==0.1.99
|
8 |
+
altair==4.2.2
|
9 |
+
pingouin==0.5.4
|
10 |
+
python-dotenv
|
sample_input.yaml
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Buttox-Fixation:
|
2 |
+
- I like big butts.
|
3 |
+
- But that butt you got makes me so horny.
|
4 |
+
- Baby got back.
|
5 |
+
- Shake that healthy butt.
|
6 |
+
- My anaconda don´t want want none, unless you got buns, hun.
|
7 |
+
- So ladies if tha butt is round
|
8 |
+
- But please don´ lose that butt
|
9 |
+
Delinquent behavior:
|
10 |
+
- I cannot lie.
|
11 |
+
- I am hooked an´ I cannot stop starin´
|
12 |
+
- I am actin´ like an animal.
|
utils.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# code by Martijn Pieters https://stackoverflow.com/a/23499088/1114975
|
2 |
+
from functools import singledispatch, wraps
|
3 |
+
|
4 |
+
@singledispatch
def depth(_, _level=1, _memo=None):
    """Fallback: a non-container object sits at the current nesting level."""
    return _level
|
7 |
+
|
8 |
+
def _protect(f):
|
9 |
+
"""Protect against circular references"""
|
10 |
+
@wraps(f)
|
11 |
+
def wrapper(o, _level=1, _memo=None, **kwargs):
|
12 |
+
_memo, id_ = _memo or set(), id(o)
|
13 |
+
if id_ in _memo: return _level
|
14 |
+
_memo.add(id_)
|
15 |
+
return f(o, _level=_level, _memo=_memo, **kwargs)
|
16 |
+
return wrapper
|
17 |
+
|
18 |
+
def _protected_register(cls, func=None, _orig=depth.register):
    """Include the _protect decorator when registering"""
    # _orig captures the original singledispatch register at definition time,
    # before it is monkey-patched below.
    if func is None and isinstance(cls, type):
        # Used as @depth.register(SomeType): return a decorator.
        return lambda f: _orig(cls, _protect(f))
    # Two-arg form depth.register(cls, func), or bare @depth.register on an
    # annotated function (cls is then the function itself).
    return _orig(cls, _protect(func)) if func is not None else _orig(_protect(cls))
# Monkey-patch so every future registration is circularity-protected.
depth.register = _protected_register
|
24 |
+
|
25 |
+
@depth.register
def _dict_depth(d: dict, _level=1, **kw):
    """Depth of a dict: one more than its deepest value.

    ``default=_level`` handles the empty dict, which previously raised
    ``ValueError`` from ``max()`` on an empty sequence.
    """
    return max((depth(v, _level=_level + 1, **kw) for v in d.values()), default=_level)
|