# LEURN / app.py
# CaglarAytekin
# auto-feature selection and realistic boundaries
# 74f6289
import streamlit as st
import pandas as pd
import numpy as np
from LEURN import LEURN
import torch
from DATA import split_and_processing
from TRAINER import Trainer
import numpy as np
import openml
# Seed the session flags on the first run of a session. The 'init' key is
# written here as well, so once it is present this block is skipped.
_SESSION_DEFAULTS = {
    'training_completed': False,
    'data_chosen': False,
    'init': True,
    'selected_row': False,
    'explanation_made': False,
    'result': False,
}
if 'init' not in st.session_state:
    for _flag, _initial in _SESSION_DEFAULTS.items():
        st.session_state[_flag] = _initial
# Streamlit application layout
st.title("LEURN")

# Usage: the numbered instructions are written one per line, in order.
st.subheader("Usage")
_USAGE_STEPS = (
    "1. Upload your training excel/csv file. The file should contain all the features and the target to be predicted",
    "2. Select target variable to predict",
    "3. Select categorical variables in your dataset",
    "4. Select neural network hyperparameters",
    "5. Press Train Neural Network",
    "6. If you want to make inference and explain decisions, go to 7, if you want to generate data, go to 10",
    "7. Upload your test excel/csv file. The file should only contain features, and not target",
    "8. Select the row you want to make inference and explain.",
    "9. Press explain button.",
    "10. Press generate button. This directly generates new samples with explanations and output.",
)
for _usage_step in _USAGE_STEPS:
    st.write(_usage_step)
# Upload csv or excel, train the network, then explain rows / generate samples.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been flattened -- confirm against the original layout.


def are_all_strings(series):
    """Return True when every item of *series* is a ``str`` instance."""
    return all(isinstance(item, str) for item in series)


def _read_table(uploaded):
    """Load an uploaded CSV or Excel file into a DataFrame.

    The file is parsed as CSV when either its reported MIME type is
    ``text/csv`` or its name ends in ``.csv``; browsers do not always report
    ``text/csv`` for CSV files, so the MIME type alone is not reliable.
    Anything else is handed to ``pd.read_excel``.
    """
    looks_like_csv = (
        uploaded.type == "text/csv" or uploaded.name.lower().endswith(".csv")
    )
    return pd.read_csv(uploaded) if looks_like_csv else pd.read_excel(uploaded)


def _show_prediction(result, result_original_format, output_type):
    """Write a prediction in network format and in its original format.

    Args:
        result: torch tensor returned by ``model.explain``.
        result_original_format: the same prediction in the original target
            format, as returned by ``model.explain``.
        output_type: 0, 1 or 2 as chosen in the "Select Output Type" radio;
            for 1 (binary) an extra note on the sign convention is written.
    """
    # .cpu() is a no-op on CPU tensors; Tensor.numpy() fails on CUDA tensors.
    result_np = result.detach().cpu().numpy()
    st.write("Predicted Output: (Network format)")
    st.write(result_np.astype('str'))
    if output_type == 1:
        if np.sign(result_np) > 0:
            st.write("Result here is positive; this means output class below is represented by positive sign. In the explanation dataframe, positive contributions increase class likelihood")
        else:
            st.write("Result here is negative; this means output class below is represented by negative sign. In the explanation dataframe, negative contributions increase class likelihood")
    st.write("Predicted Output: (original format)")
    st.write(result_original_format)


st.subheader("File Uploader")
uploaded_file = st.file_uploader("Upload your Excel/CSV file", type=["csv", "xlsx"])

if uploaded_file is not None:
    # Reading the uploaded file
    df = _read_table(uploaded_file)
    st.write("Data Preview:")
    st.write(df.head())

    st.subheader("Categorical Feature and Target Selection")

    # Selecting the target variable
    target = st.selectbox("Select the target variable", options=df.columns)

    # Define features and target
    X = df.drop(target, axis=1)
    y = df[target]
    attribute_names = X.columns

    # Select categorical variables; a column is pre-ticked when all of its
    # values are strings.
    st.write("Select categorical variables:")
    categoricals = [
        st.checkbox(f"{col} is categorical", key=col, value=are_all_strings(X[col]))
        for col in X.columns
    ]

    # User input for model parameters
    st.subheader("Model Training Parameters")
    depth = st.selectbox("Select Model Depth", options=[1, 2, 3, 4, 5], index=2)
    batch_size = st.selectbox("Select Batch Size", options=[64, 128, 256, 512, 1024, 2048, 4096], index=4)
    lr = st.selectbox("Select Learning Rate", options=[1e-4, 5e-4, 1e-3, 5e-3, 1e-2], index=3)
    epochs = st.number_input("Enter Number of Epochs", min_value=1, max_value=1000, value=300)
    droprate = st.slider("Select Dropout Rate", min_value=0.0, max_value=1.0, value=0.0, step=0.05)
    output_type = st.radio("Select Output Type (0: regression, 1: binary classification, 2: multi-class classification)", options=[0, 1, 2], index=0)

    if st.button("Train Neural Network"):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Split and process
        X_train, X_val, X_test, y_train, y_val, y_test, preprocessor = split_and_processing(
            X, y, categoricals, output_type, attribute_names
        )

        # Initialize model
        model = LEURN(preprocessor, depth=depth, droprate=droprate).to(device)

        # Train model
        model_trainer = Trainer(
            model, X_train, X_val, y_train, y_val,
            lr=lr, batch_size=batch_size, epochs=epochs,
            problem_type=output_type, verbose=False,
        )
        model_trainer.train()

        # Load best model
        model.load_state_dict(model_trainer.best_model)

        # Get performances
        st.session_state['perf_train'] = model_trainer.evaluate(X_train, y_train)
        st.session_state['perf_val'] = model_trainer.evaluate(X_val, y_val)
        st.session_state['perf_test'] = model_trainer.evaluate(X_test, y_test)

        # Save test dataset and model to explain/generate later
        X_test_inverse = preprocessor.inverse_transform_X(X_test)
        X_test_inverse.to_csv('test.csv', index=False)
        st.session_state['training_completed'] = True
        st.session_state['model'] = model  # Adjusted for compatibility
        # Remember what the stored model was trained for, so the headings and
        # sign note below do not follow later changes of the radio button.
        st.session_state['output_type'] = output_type

    if st.session_state['training_completed']:
        trained_output_type = st.session_state.get('output_type', output_type)

        # Print performances
        st.write("Here are performances, try different hyperparameters if not satisfied")
        metric_by_output_type = {0: "MSE", 1: "ROC-AUC"}
        metric_name = metric_by_output_type.get(trained_output_type, "ACC")
        st.subheader(f"Training Results ({metric_name})")
        for split_label, perf_key in (
            ("Training", 'perf_train'),
            ("Validation", 'perf_val'),
            ("Test", 'perf_test'),
        ):
            st.write(f"{split_label} Score: {st.session_state[perf_key]:.4f}")

        # File uploader for explanation
        st.subheader("Explain New Inputs")
        uploaded_file_to_explain = st.file_uploader("Upload your Excel/CSV file to explain. Uploaded file should not have the target variable.", type=["csv", "xlsx"])

        if uploaded_file_to_explain is not None:
            # Keep the frame itself in session state; a JSON round trip can
            # alter dtypes and pd.read_json on a literal string is deprecated.
            st.session_state['X_test_inverse_df'] = _read_table(uploaded_file_to_explain)
            st.session_state['data_chosen'] = True  # Flag to indicate data is chosen

        if st.session_state['data_chosen']:
            # Load DataFrame from session state
            X_test_inverse = st.session_state['X_test_inverse_df']

            # Always display the DataFrame to ensure it's visible for selection
            st.write("Test DataFrame:")
            st.write(X_test_inverse)

            # Let users select a row; the widget value is also kept in session
            # state under the "selected_index" key.
            selected_index = st.selectbox("Select a row:", options=X_test_inverse.index, key="selected_index")
            selected_row = X_test_inverse.loc[[selected_index]]
            st.write("Selected Data for Explanation:")
            st.write(selected_row)
            st.session_state['selected_row'] = selected_row

            # Explain selected row
            if st.button("Explain"):
                model = st.session_state['model']
                # NOTE(review): this tensor is created on the CPU while the
                # model may live on CUDA -- confirm model.explain handles
                # device placement.
                nn_input = torch.from_numpy(
                    model.preprocessor.transform_X(selected_row).values.astype('float32')
                )
                Exp_df_test_sample, result, result_original_format = model.explain(
                    nn_input, include_causal_analysis=True
                )
                st.session_state['explanation_made'] = True
                st.session_state['Exp_df_test_sample'] = Exp_df_test_sample
                st.session_state['result_original_format'] = result_original_format
                st.session_state['result'] = result

            # Print explanations (kept in session state so they survive reruns)
            if st.session_state['explanation_made']:
                st.write("Explanation DataFrame:")
                st.write(st.session_state['Exp_df_test_sample'])
                _show_prediction(
                    st.session_state['result'],
                    st.session_state['result_original_format'],
                    trained_output_type,
                )

        # Data generation part
        st.subheader("Generate Data From Scratch")
        if st.button("Generate"):
            model = st.session_state['model']
            generated_sample_nn_friendly, generated_sample_original_input_format, _ = model.generate()
            st.write("Generated Data:")
            st.write(generated_sample_original_input_format)
            Exp_df_generated_sample, result, result_original_format = model.explain(
                generated_sample_nn_friendly, include_causal_analysis=True
            )
            st.write("Explanation DataFrame:")
            st.write(Exp_df_generated_sample)
            _show_prediction(result, result_original_format, trained_output_type)