upload app.py with model and load dataset
Browse files- app.py +49 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from streamlit_shap import st_shap
|
3 |
+
import shap
|
4 |
+
from datasets import load_dataset
|
5 |
+
from sklearn.model_selection import train_test_split
|
6 |
+
import lightgbm as lgb
|
7 |
+
import numpy as np
|
8 |
+
import pandas as pd
|
9 |
+
|
10 |
+
|
11 |
+
# st.experimental_memo is deprecated; st.cache_data is its documented replacement.
@st.cache_data
def load_data():
    """Load and clean the house-price dataset for LightGBM training.

    Reads the "train" split of the ``ttd22/house-price`` dataset, drops the
    ``Id`` column and every column that is missing in more than 90% of the
    rows, imputes remaining gaps (median for numeric columns, mode for all
    others) and converts non-numeric feature columns to pandas categoricals.

    Returns:
        A tuple ``(X, y, cat_idx)`` where ``X`` is the feature DataFrame
        (everything except ``SalePrice``), ``y`` is the ``SalePrice`` Series
        and ``cat_idx`` is the list of positional indices of the categorical
        columns in ``X``.
    """
    dataset = load_dataset("ttd22/house-price", streaming=True)
    df = pd.DataFrame.from_dict(dataset["train"])
    df = df.drop("Id", axis=1)

    # Drop columns that are almost entirely empty. The threshold is relative
    # to the actual number of rows rather than a hard-coded dataset size.
    missing_counts = df.isnull().sum()
    mostly_missing = missing_counts[missing_counts > 0.90 * len(df)].index.to_list()
    df = df.drop(columns=mostly_missing, errors="ignore")

    # Impute what is left: median for numeric columns, mode otherwise.
    numeric_cols = set(df.select_dtypes(include=np.number).columns)
    for col in df.columns[df.isnull().any()]:
        if col in numeric_cols:
            fill_value = df[col].median()
        else:
            fill_value = df[col].mode()[0]
        # Assign the result back: chained ``fillna(..., inplace=True)`` acts
        # on a temporary and does not reliably update ``df``.
        df[col] = df[col].fillna(fill_value)

    X, y = df.drop("SalePrice", axis=1), df["SalePrice"]

    # Extract categoricals and their positional indices.
    cat_features = X.select_dtypes(exclude=np.number).columns.to_list()
    cat_idx = [X.columns.get_loc(col) for col in cat_features]

    # Convert the categorical features to the pandas Categorical dtype.
    for col in cat_features:
        X[col] = pd.Categorical(X[col])

    return X, y, cat_idx
|
38 |
+
|
39 |
+
# st.experimental_memo is deprecated; st.cache_data is its documented replacement.
@st.cache_data
def load_model(X, y, cat_idx):
    """Fit a LightGBM regressor on an 80/20 train/test split of the data.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        cat_idx: Positional indices of the categorical columns in ``X``.

    Returns:
        The fitted ``lgb.LGBMRegressor``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    params = {
        'n_estimators': 569,
        'num_leaves': 62,
        'max_depth': 10,
        'learning_rate': 0.010786783375710743,
        'colsample_bytree': 0.5065493231651268,
        'subsample': 0.7900705177300663,
        'lambda_l1': 4.998785478697207,
        'lambda_l2': 2.1857959934319657,
        'min_child_weight': 11.187719709451862,
    }
    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        categorical_feature=cat_idx,
        # ``fit(verbose=...)`` is no longer accepted by LightGBM 4.x; a
        # log_evaluation callback with period=0 keeps evaluation logging off.
        callbacks=[lgb.log_evaluation(period=0)],
    )
    return model
|
46 |
+
|
47 |
+
# Prepare the features/target, then train the LightGBM model on them.
X, y, cat_idx = load_data()
model = load_model(X, y, cat_idx)
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
streamlit_shap
|
3 |
+
shap
|
4 |
+
datasets
|
5 |
+
scikit-learn
|
6 |
+
lightgbm
|
7 |
+
numpy
|
8 |
+
pandas
|