Spaces:

waleko
/

bmw-defect-detection

Sleeping

App Files Files Community

waleko committed 17 days ago

Commit

ff83063

•

1 Parent(s): 2b8c339

first version

Browse files

Files changed (8) hide show

app.py +229 -0
catboost_model.cbm +3 -0
customfeatureselector.pkl +3 -0
hourly_data.csv +3 -0
predictions.csv +3 -0
requirements.txt +8 -0
scaler.pkl +3 -0
shap_importance.csv +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,229 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+from sklearn.pipeline import make_pipeline
+from catboost import CatBoostClassifier
+from sklearn.preprocessing import StandardScaler
+import shap
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from sklearn.feature_selection import SelectKBest
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import make_pipeline
+from sklearn.linear_model import LogisticRegression
+from catboost import CatBoostClassifier
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.cluster import DBSCAN
+from sklearn.neighbors import NearestNeighbors
+import numpy as np
+import pandas as pd
+from tqdm.auto import tqdm
+from sklearn.preprocessing import OneHotEncoder
+import pickle
+class CustomFeatureTransformer(BaseEstimator, TransformerMixin):
+    def __init__(self, verbose=False):
+        self.verbose = verbose
+        self.column_means_ = None
+    def fit(self, X, y=None):
+        X_copy = X.copy()
+        self.numerical_columns = list(X_copy.select_dtypes(include=np.number).columns)
+        self.categorical_columns = list(X_copy.select_dtypes(exclude=np.number).columns)
+        # filter out with > 100 unique values
+        for col in self.categorical_columns:
+            if len(X_copy[col].unique()) > 100:
+                self.categorical_columns.remove(col)
+                if self.verbose:
+                    print(f'removed {col} with {len(X_copy[col].unique())} unique values')
+        # Store means for each column
+        self.column_means_ = X_copy[self.numerical_columns].mean().fillna(0)
+        self.onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
+        self.onehot_encoder.fit(X_copy[self.categorical_columns])
+        return self
+    def transform(self, X):
+        X_copy = X.copy()
+        X_copy.reset_index(drop=True, inplace=True)
+        result_dfs = []
+        # Process each column
+        for col in self.numerical_columns:
+            # Add is_null indicator
+            is_null = X_copy[col].isna()
+            result_dfs.append(pd.DataFrame({
+                f"{col}_is_null": is_null.astype(int)
+            }))
+            filled_values = X_copy[col].fillna(self.column_means_[col])
+            result_dfs.append(pd.DataFrame({
+                f"{col}_value": filled_values
+            }))
+        # Add non-numerical columns using one-hot encoding
+        result_dfs.append(pd.DataFrame(self.onehot_encoder.transform(X_copy[self.categorical_columns]), columns=self.onehot_encoder.get_feature_names_out()))
+        # Concatenate all transformed features
+        df = pd.concat(result_dfs, axis=1)
+        assert not df.isna().any().any()
+        return df
+class DayNumberTransformer:
+    def __init__(self):
+        pass
+    def fit(self, X, y=None):
+        return self
+    def transform(self, X, y=None):
+        X = X.copy()
+        X['message_timestamp'] = pd.to_datetime(X['message_timestamp'])
+        X['week_number'] = X['message_timestamp'].dt.strftime('%U %w')
+        return X
+class WeatherTransformer:
+    def __init__(self, weather):
+        self.weather = weather
+        self.weather['date'] = pd.to_datetime(self.weather['date']).dt.tz_convert('Europe/Berlin')
+    def fit(self, X, y=None):
+        return self
+    def transform(self, X, y=None):
+        X = X.copy()
+        # round ot hour
+        X['message_timestamp'] = pd.to_datetime(X['message_timestamp']).dt.tz_localize('Europe/Berlin')
+        X['message_timestamp'] = X['message_timestamp'].dt.round('h')
+        # join weather data by column message_timestamp and date
+        X = X.merge(self.weather, left_on='message_timestamp', right_on='date', how='left')
+        # print number of rows in X that have no weather data
+        if X['temperature_2m'].isna().sum() > 0:
+            print("Number of rows without weather data: ", X['temperature_2m'].isna().sum())
+        columns_X = X.columns
+        # delete all that contain 'sensor' in the name
+        columns_X = [col for col in columns_X if 'sensor' not in col]
+        # print("Columns in X: ", columns_X)
+        # 1 / 0
+        return X
+class TopFeaturesSelector:
+    def __init__(self, top_features):
+        self.top_features = top_features
+    def fit(self, X, y=None):
+        return self
+    def transform(self, X, y=None):
+        return X[self.top_features]
+import warnings
+warnings.filterwarnings("ignore")
+weather_file = 'hourly_data.csv'
+shap_importance_file = 'shap_importance.csv'
+weather = pd.read_csv(weather_file)
+shap_importance_df = pd.read_csv(shap_importance_file)
+top_features = shap_importance_df['Feature'].head(25).values
+catboost = CatBoostClassifier().load_model('catboost_model.cbm')
+scaler = pickle.load(open('scaler.pkl', 'rb'))
+custom_feature_transformer = pickle.load(open('customfeatureselector.pkl', 'rb'))
+# Define the sklearn pipeline
+pipe = make_pipeline(
+    WeatherTransformer(weather),
+    DayNumberTransformer(),
+    custom_feature_transformer,
+    TopFeaturesSelector(top_features),
+    scaler,
+    catboost
+)
+def egor_plots(X_test, k=1000):
+    # Preprocess X_test
+    X_prescaled = pipe[:-2].transform(X_test)[:k]
+    X_test_preprocessed = pipe[-2].transform(X_prescaled)
+    # SHAP Analysis
+    st.write("SHAP Analysis... This may take a couple of minutes depending on the number of samples.")
+    explainer = shap.TreeExplainer(pipe[-1])
+    shap_values = explainer(X_test_preprocessed)
+    shap_values.feature_names = X_prescaled.columns
+    # SHAP Summary Plot
+    st.write("### SHAP Summary Plot")
+    fig_summary = shap.summary_plot(shap_values, X_test_preprocessed, show=False)
+    st.pyplot(fig_summary)
+    # SHAP Scatter Plots
+    st.write("### SHAP Scatter Plots")
+    for i in range(25):
+        feature_name = top_features[i]
+        st.write(f"#### Scatter Plot for Feature: {feature_name}")
+        fig, ax = plt.subplots()
+        shap.plots.scatter(shap_values[:, i], X_test_preprocessed[:, i], show=False, ax=ax)
+        ax.axhline(y=0, color='r', linestyle='--')
+        ax.axvline(x=0, color='g', linestyle='--')
+        st.pyplot(fig)
+# Streamlit App
+st.title("BMW Hackathon Defect Detection")
+st.write("### Upload your tabular data")
+# File uploader
+uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
+# Add radio button for prediction type
+prediction_type = st.radio(
+    "Select prediction type",
+    ["predict", "predict_proba"],
+    index=0
+)
+k = st.slider("Number of samples for SHAP plots", min_value=10, max_value=1000, value=100)
+if uploaded_file:
+    # Load the uploaded file
+    data = pd.read_csv(uploaded_file)
+    st.write("Uploaded Data:")
+    st.write(data.head())
+    st.write("Predicting...")
+    if prediction_type == 'predict':
+        y_pred = pipe.predict(data)
+        # status 1 -> OK, 0 -> NOK
+        status = pd.Series(['OK' if pred == 1 else 'NOK' for pred in y_pred])
+    elif prediction_type == 'predict_proba':
+        status = pipe.predict_proba(data)[:, 1]
+    else:
+        raise ValueError(f"Invalid prediction type: {prediction_type}")
+    res = pd.DataFrame(
+        {"physical_part_id": data["physical_part_id"],
+         "status": status}
+    )
+    st.write("### Results")
+    st.write(res.head())
+    # Download the predictions as CSV
+    csv = res.to_csv(index=False)
+    st.download_button(
+        label="Download predictions as CSV",
+        data=csv,
+        file_name="predictions.csv",
+        mime="text/csv"
+    )
+    st.write("### SHAP plots")
+    egor_plots(data)

catboost_model.cbm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad04e1e3d1f47b2472afe968e4f9f5a6944766136c8a51e83d22e38d9d6fbbb5
+size 32984696

customfeatureselector.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3cdb80823b17987678aeca72c82f1dcc6d96734ee3d90c6c18a29461f0c18094
+size 18816

hourly_data.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:16dea43ddccacab82981233c1dddec016696c42d887ca55e6d8fddf52e10d524
+size 160429

predictions.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:59e60207af9ea2f7c7ca343325b2d380f4a3f2cbbb844c55a5caf2ff412c231e
+size 555823

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit
+scikit-learn
+pandas
+numpy
+catboost
+shap
+matplotlib
+tqdm

scaler.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2807a66307ef299ad1e1818faf34916950f5d5c22c5924a4aee8575903c31d67
+size 1916

shap_importance.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7cf3a7acd2c10ae46196f67d25e680950c92ebacde9ed6a3802c9d3d2502ddc
+size 43112