import os
import logging

import pandas as pd
import pingouin as pg
import streamlit as st
from sentence_transformers import SentenceTransformer, util


def load_model():
    """Load the sentence-transformer model once and cache it in session state."""
    if st.session_state.get('model') is None:
        with st.spinner('Loading the model might take a couple of seconds...'):
            # Prefer a remote model path if configured, otherwise fall back
            # to the local path from the environment.
            model_path = os.environ.get('remote_model_path') or os.environ.get('model_path')
            st.session_state.model = SentenceTransformer(model_name_or_path=model_path)
            logging.info('Loaded SurveyBot3000!')


def process_yaml_input(yaml_dict):
    """Turn a mapping of scale name -> list of items into a long-format frame
    with one row per item and columns ['scale', 'item']."""
    input_data = pd.DataFrame({k: pd.Series(v) for k, v in yaml_dict.items()})
    df = (
        input_data
        .stack()
        .reset_index()
        .drop('level_0', axis=1)
        .rename(columns={'level_1': 'scale', 0: 'item'})
    )
    return df


def get_items_per_scale():
    """Return the number of items per scale, in groupby order."""
    input_data = st.session_state.input_data
    return input_data.groupby('scale').size().tolist()


def encode_input_data():
    """Embed every item text and store the vectors in an 'embeddings' column."""
    with st.spinner('Encoding items...'):
        input_data = st.session_state.input_data
        input_data['embeddings'] = input_data.item.apply(
            lambda x: st.session_state.model.encode(sentences=x, convert_to_numpy=True)
        )
    return input_data


def synthetic_item_correlations():
    """Pairwise cosine similarities between item embeddings
    (synthetic inter-item correlations)."""
    embeddings = st.session_state.input_data.embeddings.tolist()
    items = st.session_state.input_data.item
    df = pd.DataFrame(
        data=util.cos_sim(a=embeddings, b=embeddings),
        columns=items,
        index=items,
    ).round(2)
    if st.session_state.results_as_matrix is False:
        # Long format: item_a < item_b keeps each unordered pair once and
        # drops the diagonal.
        df = (
            df
            .reset_index()
            .melt(id_vars=['item'], var_name='item_b', value_name='Θ')
            .rename(columns={'item': 'item_a'})
            .query('item_a < item_b')
        )
    return df


def synthetic_scale_correlations():
    """Pairwise cosine similarities between mean scale embeddings
    (synthetic scale intercorrelations)."""
    scales = st.session_state.input_data.scale
    # One row per item, one column per embedding dimension.
    embeddings = st.session_state.input_data.embeddings.apply(pd.Series)
    # Average the item embeddings within each scale.
    data = (
        pd.concat([scales, embeddings], axis=1)
        .groupby('scale')
        .mean()
        .reset_index()
    )
    mean_embeddings = data.drop(columns='scale').to_numpy()
    matrix = util.cos_sim(a=mean_embeddings, b=mean_embeddings)
    df = pd.DataFrame(
        data=matrix,
        columns=data.scale.tolist(),
        index=data.scale.tolist(),
    ).round(2)
    if st.session_state.results_as_matrix is False:
        # Long format: keep each unordered pair of scales once.
        df = (
            df
            .reset_index()
            .melt(id_vars='index', var_name='scale_b', value_name='Θ')
            .rename(columns={'index': 'scale_a'})
            .query('scale_a < scale_b')
        )
    return df


def synthetic_reliabilities():
    """Cronbach's alpha per scale, computed on the item embeddings
    (synthetic reliability estimates with confidence bounds)."""

    def reliability(group_data):
        # Transpose so rows are embedding dimensions and columns are items,
        # the layout pingouin's cronbach_alpha expects.
        group_data = group_data.drop('scale', axis=1).T
        alpha, ci = pg.cronbach_alpha(data=group_data)
        return [alpha, ci[0], ci[1]]

    scales = st.session_state.input_data.scale
    embeddings = st.session_state.input_data.embeddings.apply(pd.Series)
    data = (
        pd.concat([scales, embeddings], axis=1)
        .groupby('scale')
        .apply(reliability)
    )
    df = (
        pd.DataFrame(
            data=data.tolist(),
            index=data.index,
            columns=['alpha (Θ)', 'ci_lower', 'ci_upper'],
        )
        .round(2)
        .reset_index()  # brings 'scale' back as the first column
    )
    return df
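

# ---------------------------------------------------------------------------
# Usage sketch (illustrative). A minimal Streamlit page wiring the helpers
# above together. The function name `render_app`, the widget labels, and the
# 'Upload scales as YAML' flow are assumptions for illustration; only the
# YAML layout (scale name -> list of item strings) and the
# `results_as_matrix` flag are taken from the code above.
# ---------------------------------------------------------------------------
import yaml


def render_app():
    st.title('SurveyBot3000')
    load_model()
    # The uploaded YAML should map each scale name to a list of item texts.
    uploaded = st.file_uploader('Upload scales as YAML', type=['yaml', 'yml'])
    if uploaded is None:
        return
    yaml_dict = yaml.safe_load(uploaded)
    st.session_state.input_data = process_yaml_input(yaml_dict)
    # The result helpers read this flag to choose wide vs. long output.
    st.session_state.results_as_matrix = st.checkbox('Results as matrix', value=True)
    encode_input_data()
    st.dataframe(synthetic_item_correlations())
    st.dataframe(synthetic_scale_correlations())
    st.dataframe(synthetic_reliabilities())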