|
import os |
|
import time |
|
import random |
|
import logging |
|
import json |
|
import yaml |
|
import pandas as pd |
|
import numpy as np |
|
import streamlit as st |
|
import pandas as pd |
|
from datetime import datetime |
|
from dotenv import load_dotenv |
|
|
|
import db |
|
import modeling |
|
import utils |
|
|
|
def show_launch(placeholder):
    """Render the disclaimer page and wait for the visitor to accept it.

    Everything is drawn inside a container created from ``placeholder``.
    While the page is on screen ``connect_to_database()`` is called, and an
    "Accept Disclaimer" button is offered. Once the button reports a click,
    ``st.session_state.show_launch`` is set to ``False`` and both the page
    and the button slot are cleared.

    Args:
        placeholder: Streamlit element exposing ``container()`` and
            ``empty()`` (``main`` passes the result of ``st.empty()``).
    """
    with placeholder.container():
        st.divider()
        st.markdown(
            """
            ## Before Using the App
            ### Disclaimer
            This application is provided as-is, without any warranty or guarantee of any kind, expressed or implied. It is intended for educational, non-commercial use only.
            The developers of this app shall not be held liable for any damages or losses incurred from its use. By using this application, you agree to the terms and conditions
            outlined herein and acknowledge that any commercial use or reliance on its functionality is strictly prohibited.

            Furthermore, by using this application, you consent to the collection of anonymous usage data. This data will be used for research purposes and to improve the
            application's functionality. No personal information will be recorded or stored.
            """,
            unsafe_allow_html=True,
        )

        accept_slot = st.empty()

        # The connection is set up before the button is drawn, i.e. on every
        # run that shows the disclaimer.
        connect_to_database()

        accepted = accept_slot.button(
            label='Accept Disclaimer',
            type='primary',
            use_container_width=True,
        )
        if not accepted:
            return

        st.session_state.show_launch = False
        placeholder.empty()
        accept_slot.empty()
|
|
|
# Upper bound on the number of items accepted per submission when the
# `remote_model_path` environment variable is set.
_MAX_REMOTE_ITEMS = 50


def _process_submission(input_yaml):
    """Parse, process and encode one submitted questionnaire definition.

    Each step reports its own failure to the user through ``st.error`` (and,
    where the parsed structure exists, echoes it with ``st.json``).

    Args:
        input_yaml: Raw text taken from the form's text area.

    Returns:
        ``(True, yaml_dict)`` when every step succeeded, with ``yaml_dict``
        being the result of ``yaml.safe_load``; ``(False, None)`` otherwise.
        On success ``st.session_state.input_data`` holds the value returned
        by ``modeling.encode_input_data()``.
    """
    try:
        yaml_dict = yaml.safe_load(input_yaml)
    except yaml.YAMLError:
        st.error("Yikes, you better get your YAML straight! Check https://yaml.org/ for help!")
        return False, None

    try:
        modeling.load_model()
    except Exception as error:
        st.error(f"Error while loading model: {error}")
        st.json(yaml_dict)
        return False, None

    try:
        st.session_state.input_data = modeling.process_yaml_input(yaml_dict)
    except Exception as error:
        st.error(f"Error while processing YAML-input: {error}")
        st.json(yaml_dict)
        return False, None

    try:
        st.session_state.input_data = modeling.encode_input_data()
    except Exception as error:
        st.error(f"Error while encoding data: {error}")
        # Keep the traceback in the application log instead of stdout.
        logging.exception("Error while encoding data")
        st.json(yaml_dict)
        return False, None

    # The size limit is only enforced when a remote model path is configured.
    if os.environ.get('remote_model_path'):
        n_items = st.session_state.input_data.shape[0]
        if n_items > _MAX_REMOTE_ITEMS:
            st.error(f"You've entered too many items ({n_items} on a {_MAX_REMOTE_ITEMS} item limit)! Please contact bjoern.hommel@uni-leipzig.de if you require estimates of larger sets!")
            return False, None

    return True, yaml_dict


def _log_submission(yaml_dict):
    """Write the submitted structure to the database unless it is the preset.

    The record is only written when the hash of the submission differs from
    ``st.session_state['init_input_hash']`` and the ``remote_model_path``
    environment variable is set.
    """
    input_data_serialized = utils.serialize_data(yaml_dict)
    input_data_hashed = utils.hash(input_data_serialized)
    payload = {
        'user_id': st.session_state.user_id,
        # NOTE(review): key spelling ('timestap') is kept as-is because it is
        # part of the record handed to db.write_to_db - confirm against the
        # storage schema before renaming.
        'timestap': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'input_hash': input_data_hashed,
        'input_data': input_data_serialized
    }

    is_custom_input = st.session_state['init_input_hash'] != input_data_hashed
    if is_custom_input and os.environ.get('remote_model_path'):
        db.write_to_db(st.session_state.db, payload)


def _render_result_tabs():
    """Draw the three result tabs and return the tables shown in them.

    Returns:
        ``(item_correlations, scale_correlations, reliabilities)``;
        ``reliabilities`` is ``None`` when at least one scale has fewer than
        three items, in which case an error is displayed instead.
    """
    tab1, tab2, tab3 = st.tabs(["Item Correlations", "Scale Correlations", "Scale Reliabilities"])

    with tab1:
        st.markdown("Θ = Synthetic Item Correlation")
        item_correlations = modeling.get_synthetic_item_correlations()
        st.dataframe(item_correlations, use_container_width=True)

    with tab2:
        st.markdown("Θ = Synthetic Scale Correlation")
        scale_correlations = modeling.get_synthetic_scale_correlations()
        st.dataframe(scale_correlations, use_container_width=True)

    reliabilities = None
    with tab3:
        st.markdown("alpha (Θ) = Synthetic Reliability Estimate (Cronbach's Alpha)")
        if np.min(modeling.get_items_per_scale()) < 3:
            st.error("Please make sure that each scale consits of at least 3 items!")
        else:
            reliabilities = modeling.get_synthetic_reliabilities()
            st.dataframe(reliabilities, use_container_width=True)

    return item_correlations, scale_correlations, reliabilities


def _render_download(column, table, label, file_name):
    """Offer ``table`` as a CSV download inside ``column`` if it is available."""
    with column:
        if table is not None:
            st.download_button(
                label=label,
                data=utils.df_to_csv(table),
                file_name=file_name,
                mime='text/csv',
            )


def show_demo(placeholder):
    """Render the interactive demo: input form, result tabs and CSV downloads.

    The form collects a YAML questionnaire definition. On submission the
    input is parsed, processed and encoded; any failure is shown to the user
    and ends the run. Whenever ``st.session_state`` contains ``input_data``
    (from this or an earlier run) the result tabs are drawn. The submission
    log entry and the echoed input structure are only produced on the run in
    which the form was actually submitted.

    Args:
        placeholder: Streamlit element used as the parent context for all
            output (``main`` passes the result of ``st.empty()``).

    Returns:
        None.
    """
    with placeholder:
        with st.container():
            st.divider()
            st.markdown("""
                ## Try it yourself!
                Our recent research shows that sentence transformer ("AI" models)
                can predict respondent patterns in survey data! The model accurately
                infers item-correlation with *r* = **.71** 🧨, and shows even higher
                precision for scale correlations (*r* = **.89** 💥) and reliability
                coefficients (*r* = **.86** 💣)!

                Try it yourself by defining a scale structure using the input field
                below and let the **SurveyBot3000** predict the expected response
                pattern. Use the [YAML](https://yaml.org/) format or follow the structure
                outlined by the preset example:
                - Scale names must end with "**:**"
                - Nest items under a scale name by prepending "**-**" before each item
            """)

            # Explicit state replaces the former `'name' in locals()` probes.
            has_submission = False
            yaml_dict = None

            with st.form("submission_form"):

                input_yaml = st.text_area(
                    label="Questionnaire Structure (YAML-Formatted)",
                    value=st.session_state['input_yaml'],
                    height=250
                )

                st.session_state.results_as_matrix = st.checkbox(
                    label="Result as matrix",
                    help="Results will be list-formated (long) by default. Enable to get (wide-format) matrices."
                )

                submitted = st.form_submit_button(
                    label="Get Synthetic Estimates",
                    type="primary",
                    use_container_width=True
                )
                if submitted:
                    has_submission, yaml_dict = _process_submission(input_yaml)
                    if not has_submission:
                        # The error has already been displayed; stop here.
                        return None

            item_correlations = None
            scale_correlations = None
            reliabilities = None

            if 'input_data' in st.session_state:

                st.warning('**Note:** The SurveyBot3000 cannot determine the direction of a scale and may incorrectly invert the correlation direction by randomly flipping items at either end of the scale, even though it recognizes items that need inversion.', icon="⚠️")

                if has_submission:
                    _log_submission(yaml_dict)

                item_correlations, scale_correlations, reliabilities = _render_result_tabs()

                if has_submission:
                    st.markdown("### Input Structure:")
                    st.json(yaml_dict)

            # Download buttons live outside the form: Streamlit does not allow
            # st.download_button inside st.form.
            col1, col2, col3 = st.columns(3)

            _render_download(
                col1,
                item_correlations,
                "Download Synthetic Item Correlations as CSV",
                'synthetic_item_correlations.csv',
            )
            _render_download(
                col2,
                scale_correlations,
                "Download Synthetic Scale Correlations as CSV",
                'synthetic_scale_correlations.csv',
            )
            _render_download(
                col3,
                reliabilities,
                "Download Synthetic Scale Reliabilities as CSV",
                'synthetic_reliabilities.csv',
            )
|
|
|
def handle_checkbox_change():
    """Invert the boolean ``checkbox_state`` flag stored in the session state."""
    session = st.session_state
    session.checkbox_state = not session.checkbox_state
|
|
|
def initialize():
    """Prepare environment variables, logging and the per-session state.

    Loads variables via ``load_dotenv()``, configures the root logger at INFO
    level, seeds ``st.session_state`` with the contents of ``init.json`` the
    first time a session runs, and assigns a random ``user_id`` if the
    session does not have one yet.

    Raises:
        OSError: If ``init.json`` cannot be opened.
        json.JSONDecodeError: If ``init.json`` is not valid JSON.
    """
    load_dotenv()
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)

    if 'state_loaded' not in st.session_state:
        # Read the defaults before flagging the state as loaded: if the file
        # is missing or malformed the flag stays unset, so the next run tries
        # again instead of continuing with a half-initialised session.
        with open('init.json', encoding='utf-8') as json_data:
            initial_state = json.load(json_data)

        st.session_state['state_loaded'] = True
        st.session_state.update(initial_state)

    if 'user_id' not in st.session_state:
        # Anonymous, non-secret identifier used to tag usage records.
        st.session_state.user_id = random.randint(1, 999_999_999)
|
|
|
def connect_to_database(max_attempts=3, retry_delay=1):
    """Establish the database connection stored in ``st.session_state.db``.

    Does nothing when ``st.session_state.db`` is already set. Otherwise the
    credentials are loaded once and ``db.connect_to_db`` is called until it
    returns something other than ``None`` or the attempts are used up.

    Args:
        max_attempts: Maximum number of connection attempts (default 3).
        retry_delay: Seconds to wait between two attempts (default 1).

    Returns:
        None. On failure ``st.session_state.db`` remains ``None`` and a
        warning is logged.
    """
    if st.session_state.db is not None:
        return

    credentials_dict = db.load_credentials()

    for attempt in range(1, max_attempts + 1):
        st.session_state.db = db.connect_to_db(credentials_dict)
        if st.session_state.db is not None:
            return

        # Only wait (and announce a retry) when another attempt will follow;
        # sleeping after the last failure would just delay the page.
        if attempt < max_attempts:
            logging.info('Retrying to connect to db...')
            time.sleep(retry_delay)

    logging.warning('Could not connect to db after %d attempts.', max_attempts)
|
|
|
def main():
    """Draw the page frame and dispatch to the disclaimer or the demo view.

    Renders the header (logo, title, project links), loads the preset
    questionnaire from ``sample_input.yaml`` into the session on first use,
    and then shows the disclaimer on the first run of a session and the demo
    on every later run.

    Raises:
        OSError: If ``sample_input.yaml`` cannot be opened.
    """
    st.set_page_config(page_title='Synthetic Correlations')

    col1, col2 = st.columns([2, 5])
    with col1:
        st.image('logo-130x130.svg')

    with col2:
        st.markdown("# Synthetic Correlations")
        st.markdown("#### Estimate Item and Scale Correlations, as well as Reliability Coefficients based on nothing but Text!")

    st.markdown("""

        - 📖 **Preprint (Open Access)**: https://osf.io/preprints/psyarxiv/kjuce
        - 🖊️ **Cite**: *Hommel, B. E., & Arslan, R. C. (2024). Language models accurately infer correlations between psychological items and scales from text alone. https://doi.org/10.31234/osf.io/kjuce*
        - 🌐 **Project website**: https://synth-science.github.io/surveybot3000/
        - 💾 **Data**: https://osf.io/z47qs/
        - #️⃣ **Social Media**:
            - Björn Hommel: [X/Twitter](https://twitter.com/BjoernHommel) | [ResearchGate](https://www.researchgate.net/profile/Bjoern-Hommel) | [Bsky](https://bsky.app/profile/bjoernhommel.bsky.social)
            - Ruben Arslan: [X/Twitter](https://twitter.com/rubenarslan/) | [ResearchGate](https://www.researchgate.net/profile/Ruben_Arslan) | [Bsky](https://bsky.app/profile/ruben.the100.ci)

        The web application is maintained by [magnolia psychometrics](https://www.magnolia-psychometrics.com/).
    """, unsafe_allow_html=True)

    placeholder_launch = st.empty()
    placeholder_demo = st.empty()

    if 'input_yaml' not in st.session_state:

        with open('sample_input.yaml', 'r', encoding='utf-8') as file:
            try:
                st.session_state['input_yaml'] = file.read()
                init_input_dict = yaml.safe_load(st.session_state['input_yaml'])
                init_input_serialized = utils.serialize_data(init_input_dict)
                init_input_hashed = utils.hash(init_input_serialized)
                st.session_state['init_input_hash'] = init_input_hashed
            except Exception:
                # Report through the configured logger (with traceback)
                # rather than printing to stdout.
                logging.exception('Could not prepare the preset sample input')
                # show_demo() indexes st.session_state['init_input_hash'] on
                # every submission; make sure the key exists so a failure
                # here does not turn into a KeyError there. None never equals
                # a computed hash, so every input counts as user-provided.
                if 'init_input_hash' not in st.session_state:
                    st.session_state['init_input_hash'] = None

    if 'disclaimer' not in st.session_state:
        show_launch(placeholder_launch)
        # The flag is set on the very run that shows the disclaimer, so any
        # subsequent rerun of the script goes straight to the demo.
        st.session_state['disclaimer'] = True
    else:
        show_demo(placeholder_demo)
|
|
|
# Script entry point: set up the session state first, then build the page.
if __name__ == "__main__":
    initialize()
    main()