import streamlit as st
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import shutil
import zipfile
import io
import tempfile

st.set_page_config(page_title="AquaLearn", layout="wide")

# Start (or connect to) a local H2O cluster; all frames and models live in it.
h2o.init()
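
# Columns are renamed to single letters (A, B, C, ...) before training,
# presumably so arbitrary CSV headers never cause naming issues. The same
# positional convention is assumed again in make_prediction(), where
# user-entered feature names are mapped to letters in order.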
def rename_columns_alphabetically(df):
    # Assumes at most 26 columns (chr(65) == "A").
    # Not currently called by the app; prepare_features() renames columns inline.
    new_columns = [chr(65 + i) for i in range(len(df.columns))]
    return df.rename(columns=dict(zip(df.columns, new_columns)))
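
# Utility to turn an arbitrary column name into an H2O-safe identifier
# (alphanumerics and underscores, not starting with a digit). Like the helper
# above, it is defined for reuse but not called in the current app flow.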
def sanitize_column_name(name):
    sanitized = ''.join(c if c.isalnum() else '_' for c in name)
    if sanitized and not (sanitized[0].isalpha() or sanitized[0] == '_'):
        sanitized = 'f_' + sanitized
    return sanitized


# Directory where models from the leaderboard are persisted for download.
os.makedirs("saved_models", exist_ok=True)
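
# Each function below renders one step of the Streamlit workflow: upload data,
# choose the problem type and target, configure and run AutoML, then inspect,
# save, download, retrain, and use models for prediction.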
def load_data():
    st.title("Aqua Learn")
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is not None:
        train = pd.read_csv(uploaded_file)
        st.write(train.head())
        return h2o.H2OFrame(train)
    return None


def select_problem_type():
    return st.selectbox("Select Problem Type:", ['Classification', 'Regression'])


def select_target_column(train_h2o):
    return st.selectbox("Select Target Column:", train_h2o.columns)
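
# Feature preparation: the selected target is moved to the last position and
# all columns are renamed A, B, C, ... so the target always maps to the last
# letter. The returned x/y refer to the renamed columns.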
def prepare_features(train_h2o, y, problem_type):
    if problem_type == 'Classification':
        train_h2o[y] = train_h2o[y].asfactor()

    # Reorder so the selected target is the last column, then rename every
    # column to a single letter; the target therefore becomes the last letter.
    feature_cols = [c for c in train_h2o.columns if c != y]
    train_h2o = train_h2o[feature_cols + [y]]

    new_columns = [chr(65 + i) for i in range(len(train_h2o.columns))]
    train_h2o.columns = new_columns
    y = new_columns[-1]
    x = new_columns[:-1]

    return x, y, train_h2o


def select_algorithms():
    algorithm_options = ['DeepLearning', 'GLM', 'GBM', 'DRF', 'XGBoost']
    return st.multiselect("Select Algorithms:", algorithm_options)


def set_automl_parameters():
    max_models = st.number_input("Max Models:", value=20, min_value=1)
    max_runtime = st.number_input("Max Runtime (seconds):", value=600, min_value=1)
    return max_models, max_runtime
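
# AutoML is sorted by AUC for classification and RMSE for regression, and is
# restricted to the algorithms the user selected.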
def run_automl(x, y, train, problem_type, selected_algos, max_models, max_runtime):
    aml = H2OAutoML(max_models=max_models,
                    seed=1,
                    max_runtime_secs=max_runtime,
                    sort_metric="AUC" if problem_type == 'Classification' else "RMSE",
                    include_algos=selected_algos)
    aml.train(x=x, y=y, training_frame=train)
    return aml


def display_results(aml, test):
    st.subheader("AutoML Leaderboard")
    st.write(aml.leaderboard.as_data_frame())

    st.subheader("Best Model Performance")
    best_model = aml.leader
    perf = best_model.model_performance(test)
    st.write(perf)
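
# Saves every leaderboard model to the saved_models directory, records
# (model_id, path) pairs in session state for the download step, and reports a
# simple hold-out metric (accuracy or RMSE) per model.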
def save_and_evaluate_models(aml, test, y, problem_type):
    if st.button("Save Models and Calculate Performance"):
        model_performances = []
        for model_id in aml.leaderboard['model_id'].as_data_frame().values:
            model = h2o.get_model(model_id[0])

            # Persist the model so download_model() can offer it later.
            model_path = h2o.save_model(model=model, path="saved_models", force=True)
            st.session_state.saved_models.append((model_id[0], model_path))

            preds = model.predict(test)
            actual = test[y].as_data_frame().values.flatten()
            predicted = preds.as_data_frame()['predict'].values.flatten()

            if problem_type == 'Classification':
                performance = (actual == predicted).mean()
                metric_name = 'accuracy'
            else:
                performance = np.sqrt(mean_squared_error(actual, predicted))
                metric_name = 'rmse'

            model_performances.append({'model_id': model_id[0], metric_name: performance})

        performance_df = pd.DataFrame(model_performances)
        st.write(performance_df)

        st.subheader("Model Performance Visualization")
        fig, ax = plt.subplots(figsize=(10, 6))
        performance_df.sort_values(by=metric_name, ascending=False, inplace=True)
        ax.barh(performance_df['model_id'], performance_df[metric_name], color='skyblue')
        ax.set_xlabel(metric_name.capitalize())
        ax.set_ylabel('Model ID')
        ax.set_title(f'Model {metric_name.capitalize()} from H2O AutoML')
        ax.grid(axis='x')
        st.pyplot(fig)
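
# Lets the user pick one of the saved models and download it. Saved model
# directories are zipped in memory; single-file model binaries are also zipped
# so the upload path in make_prediction() can always expect a zip archive.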
def download_model():
    st.subheader("Download Model")
    if 'saved_models' in st.session_state and st.session_state.saved_models:
        model_to_download = st.selectbox("Select Model to Download:",
                                         [model[0] for model in st.session_state.saved_models])
        if st.button("Download Selected Model"):
            model_path = next(model[1] for model in st.session_state.saved_models if model[0] == model_to_download)

            zip_buffer = io.BytesIO()
            with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
                if os.path.isdir(model_path):
                    # Zip the whole saved-model directory, keeping relative paths.
                    for root, _, files in os.walk(model_path):
                        for file in files:
                            zip_file.write(os.path.join(root, file),
                                           os.path.relpath(os.path.join(root, file), model_path))
                else:
                    # h2o.save_model writes a single binary file; zip it as well so
                    # make_prediction() can always unpack the download.
                    zip_file.write(model_path, os.path.basename(model_path))

            zip_buffer.seek(0)
            st.download_button(
                label="Click to Download",
                data=zip_buffer,
                file_name=f"{model_to_download}.zip",
                mime="application/zip"
            )
    else:
        st.write("No models available for download. Please train and save models first.")
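
# Further training: the chosen leaderboard model is retrained from scratch on
# the training frame with a user-set time budget. Stacked ensembles are not
# retrained directly; a fresh AutoML run is launched instead and its leader is
# used. The retrained model is zipped in memory and offered for download.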
def further_training(aml, x, y, train, problem_type):
    st.subheader("Further Training")
    leaderboard_df = aml.leaderboard.as_data_frame()
    model_to_train = st.selectbox("Select Model for Training:", leaderboard_df['model_id'].tolist())
    training_time = st.number_input("Training Time (seconds):", value=60, min_value=1)

    if st.button("Train Model"):
        model = h2o.get_model(model_to_train)

        with st.spinner(f"Training model: {model_to_train} for {training_time} seconds..."):
            if isinstance(model, h2o.estimators.stackedensemble.H2OStackedEnsembleEstimator):
                aml = H2OAutoML(max_runtime_secs=training_time, seed=1,
                                sort_metric="AUC" if problem_type == 'Classification' else "RMSE")
                aml.train(x=x, y=y, training_frame=train)
                model = aml.leader
            else:
                model.train(x=x, y=y, training_frame=train, max_runtime_secs=training_time)

        perf = model.model_performance(train)
        st.write("Model performance after training:")
        st.write(perf)

        # Save the retrained model to a temporary location, zip it in memory,
        # and offer it for download.
        temp_dir = os.path.join("saved_models", "temp")
        os.makedirs(temp_dir, exist_ok=True)
        model_path = os.path.join(temp_dir, f"{model.model_id}")
        h2o.save_model(model=model, path=model_path, force=True)

        zip_buffer = io.BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for root, _, files in os.walk(model_path):
                for file in files:
                    zip_file.write(os.path.join(root, file),
                                   os.path.relpath(os.path.join(root, file), model_path))

        zip_buffer.seek(0)
        st.download_button(
            label="Download Retrained Model",
            data=zip_buffer,
            file_name=f"{model.model_id}.zip",
            mime="application/zip"
        )

        shutil.rmtree(temp_dir)

        st.success(f"Retrained model ready for download: {model.model_id}")
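
# Prediction flow: the user uploads a zipped model, re-enters the original
# feature names (in training order, target excluded) so they can be mapped to
# the letter columns the model was trained on, and then predicts either on an
# uploaded CSV or on a single manually entered row.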
def make_prediction():
    st.subheader("Make Prediction")

    uploaded_zip = st.file_uploader("Upload a zip file containing the model", type="zip")
    if uploaded_zip is not None:
        with tempfile.TemporaryDirectory() as tmpdirname:
            zip_path = os.path.join(tmpdirname, "model.zip")
            with open(zip_path, "wb") as f:
                f.write(uploaded_zip.getbuffer())

            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(tmpdirname)

            extracted_files = os.listdir(tmpdirname)
            if len(extracted_files) == 0:
                st.error("The uploaded zip file is empty.")
                return

            model_file = next((f for f in extracted_files if f != "model.zip"), None)
            if model_file is None:
                st.error("No model file found in the uploaded zip.")
                return

            model_path = os.path.join(tmpdirname, model_file)

            try:
                model_for_prediction = h2o.load_model(model_path)
            except Exception as e:
                st.error(f"Error loading the model: {str(e)}")
                st.error("Please ensure you're uploading a valid H2O model file.")
                return

        feature_names_input = st.text_input("Enter feature names, separated by commas:")
        original_feature_names = [name.strip() for name in feature_names_input.split(',') if name.strip()]

        if not original_feature_names:
            st.error("Please enter at least one feature name.")
            return

        # Map the user's feature names to the letter columns used at training time.
        feature_mapping = {name: chr(65 + i) for i, name in enumerate(original_feature_names)}
        reverse_mapping = {v: k for k, v in feature_mapping.items()}

        prediction_type = st.radio("Choose prediction type:", ["Upload CSV", "Single Entry"])

        if prediction_type == "Upload CSV":
            uploaded_csv = st.file_uploader("Upload a CSV file for prediction", type="csv")
            if uploaded_csv is not None:
                prediction_data = pd.read_csv(uploaded_csv)
                prediction_data = prediction_data.rename(columns=feature_mapping)

                prediction_h2o = h2o.H2OFrame(prediction_data)
                try:
                    predictions = model_for_prediction.predict(prediction_h2o)
                    predictions_df = predictions.as_data_frame()

                    result_df = pd.concat([prediction_data, predictions_df], axis=1)
                    result_df = result_df.rename(columns=reverse_mapping)

                    st.write("Predictions (showing first 10 rows):")
                    st.write(result_df.head(10))

                    csv = result_df.to_csv(index=False)
                    st.download_button(
                        label="Download full results as CSV",
                        data=csv,
                        file_name="predictions_results.csv",
                        mime="text/csv"
                    )
                except Exception as e:
                    st.error(f"Error making predictions: {str(e)}")
                    st.error("Please ensure your CSV file matches the model's expected input format.")

        else:
            sample_input = {}
            for original_name, coded_name in feature_mapping.items():
                value = st.text_input(f"Enter {original_name} ({coded_name}):")
                try:
                    sample_input[coded_name] = [float(value)]
                except ValueError:
                    sample_input[coded_name] = [value]

            if st.button("Predict"):
                sample_h2o = h2o.H2OFrame(sample_input)
                try:
                    predictions = model_for_prediction.predict(sample_h2o)
                    prediction_value = predictions['predict'][0, 0]
                    st.write(f"Predicted value: {prediction_value}")
                except Exception as e:
                    st.error(f"Error making prediction: {str(e)}")
                    st.error("Please ensure you've entered valid input values.")
    else:
        st.write("Please upload a zip file containing the model to make predictions.")
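
# Main Streamlit flow. Intermediate results are kept in st.session_state so
# they survive Streamlit's rerun-on-interaction execution model.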
def main():
    train_h2o = load_data()
    if train_h2o is not None:
        problem_type = select_problem_type()
        target_column = select_target_column(train_h2o)

        if st.button("Set Target and Continue"):
            x, target_column, train_h2o = prepare_features(train_h2o, target_column, problem_type)
            st.session_state.features_prepared = True
            st.session_state.x = x
            st.session_state.target_column = target_column
            st.session_state.train_h2o = train_h2o
            st.session_state.problem_type = problem_type

        if 'features_prepared' in st.session_state and st.session_state.features_prepared:
            st.write(f"Target Column: {st.session_state.target_column}")
            st.write(f"Feature Columns: {st.session_state.x}")

            # Fixed seed so the same split is produced on every Streamlit rerun.
            train, test = st.session_state.train_h2o.split_frame(ratios=[0.8], seed=1)

            selected_algos = select_algorithms()
            max_models, max_runtime = set_automl_parameters()

            if st.button("Start AutoML"):
                if not selected_algos:
                    st.error("Please select at least one algorithm.")
                else:
                    with st.spinner("Running AutoML..."):
                        aml = run_automl(st.session_state.x, st.session_state.target_column, train,
                                         st.session_state.problem_type, selected_algos, max_models, max_runtime)

                    st.success("AutoML training completed.")
                    st.session_state.aml = aml
                    st.session_state.test = test

            if 'aml' in st.session_state:
                display_results(st.session_state.aml, st.session_state.test)
                save_and_evaluate_models(st.session_state.aml, st.session_state.test, st.session_state.target_column, st.session_state.problem_type)
                download_model()
                further_training(st.session_state.aml, st.session_state.x, st.session_state.target_column, train, st.session_state.problem_type)

    make_prediction()
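
# Streamlit executes this script top-to-bottom as __main__ on every interaction,
# so session-state defaults are initialized here before main() runs.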
if __name__ == "__main__": |
|
if 'features_prepared' not in st.session_state: |
|
st.session_state.features_prepared = False |
|
if 'saved_models' not in st.session_state: |
|
st.session_state.saved_models = [] |
|
main() |
|
|
|
|
|
shutil.rmtree("saved_models", ignore_errors=True) |
|
|