# Source: Hugging Face Space "itou-daiki/pycaret_datascience_streamlit_demo" (status: Sleeping)
# File size: 29,276 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
import joblib
import base64
import io
import shap
from pycaret.regression import *
from sklearn.tree import plot_tree
from sklearn.inspection import permutation_importance
import common

import matplotlib.font_manager as fm

# Path to the font file used for plot text
font_path = 'ipaexg.ttf'

# Register the font file with Matplotlib's font manager. Building a
# FontProperties object from a file path does not add the font to the
# manager's lookup table, so selecting it by family name below would only
# work if some other package had already registered a font with that name.
fm.fontManager.addfont(font_path)

# Font properties built from the file; used here to read the family name
font_prop = fm.FontProperties(fname=font_path)

# Make that family Matplotlib's default font
plt.rcParams['font.family'] = font_prop.get_name()

# Page configuration
st.set_page_config(page_title="easyAutoML（回帰）", page_icon="📊", layout="wide")

# Create the per-session configuration store the first time the script runs
if 'model_configs' not in st.session_state:
    # Every entry starts out as None except the three overridden below
    initial_configs = dict.fromkeys([
        'uploaded_data',
        'target_variable',
        'ignore_features',
        'setup_done',
        'model_trained',
        'current_model',
        'final_model',
        'feature_importance',
        'model_comparison',
        'pre_tuned_scores',
        'post_tuned_scores',
        'X_train',
        'features',
    ])
    initial_configs['ignore_features'] = []
    initial_configs['setup_done'] = False
    initial_configs['model_trained'] = False
    st.session_state['model_configs'] = initial_configs

# Main application: page title, then the header rendered by the project's
# `common` module
st.title("easyAutoML（回帰）")
common.display_header()

# Collapsible "about this application" panel (collapsed by default).
# It only renders static Markdown describing the app's four feature areas.
with st.expander("このアプリケーションについて", expanded=False):
    st.markdown("""
    ### 📊 AutoMLアプリケーションの概要
    このアプリケーションは、機械学習モデルの構築と評価を自動化し、詳細な分析結果を提供します。
    """)

    st.markdown("### 🔍 主な機能")
    
    # Feature area 1: data analysis
    st.markdown("#### 1. データ分析")
    st.markdown("""
    - データの基本統計量の確認
    - 欠損値の分析と自動処理
    - 外れ値の検出と処理オプション
    """)

    # Feature area 2: model building and optimisation
    st.markdown("#### 2. モデル構築と最適化")
    st.markdown("""
    - 複数の機械学習モデルの自動比較
    - 最適なモデルの選択支援
    - ハイパーパラメータの自動チューニング
    - 交差検証による性能評価
    """)

    # Feature area 3: model evaluation and interpretation
    st.markdown("#### 3. モデル評価と解釈")
    st.markdown("""
    - 特徴量重要度の分析
        - モデルベースの重要度
        - SHAP値による解釈
    - 予測性能の詳細な評価
        - 残差分析
        - 予測値と実測値の比較
    - クックの距離による影響度分析
    - 決定木モデルの構造可視化（対応モデルのみ）
    """)

    # Feature area 4: producing the final model
    st.markdown("#### 4. 最終モデルの生成")
    st.markdown("""
    - モデルのファイナライズ
    - 最終評価結果の確認
    - トレーニング済みモデルのダウンロード
    """)

    # Caveat shown to the user about data quality
    st.info("⚠️ 注意: データの前処理（欠損値の処理など）は自動的に行われますが、データの品質が結果に大きく影響します。")

# 1. Data upload section: divider, heading and short usage notes
st.markdown("---")
st.header("1. データのアップロード")
st.write("データのアップロードについて")
st.markdown("""
- CSVまたはExcelファイルをアップロードしてください
- 欠損値や異常値は自動的に処理されます
""")

# File picker restricted to the .csv and .xlsx extensions; the value is None
# until the user has uploaded a file (the rest of the script checks for that)
uploaded_file = st.file_uploader(
    "ファイルを選択してください（CSVまたはExcel）",
    type=['csv', 'xlsx']
)

if uploaded_file is not None:
    try:
        # Reset the session configuration when a file with a different name
        # from the previous one is uploaded
        if 'last_file_name' not in st.session_state or st.session_state.last_file_name != uploaded_file.name:
            st.session_state.model_configs = {
                'uploaded_data': None,
                'target_variable': None,
                'ignore_features': [],
                'setup_done': False,
                'model_trained': False,
                'current_model': None,
                'final_model': None,
                'feature_importance': None,
                'model_comparison': None,
                'pre_tuned_scores': None,
                'post_tuned_scores': None,
                'X_train': None,
                'features': None
            }
            st.session_state.last_file_name = uploaded_file.name

        # Load the data. The extension is compared case-insensitively so that
        # a file named e.g. "DATA.CSV" is not handed to the Excel reader.
        if uploaded_file.name.lower().endswith('.csv'):
            try:
                data = pd.read_csv(uploaded_file)
            except UnicodeDecodeError:
                # Not valid UTF-8: rewind the buffer and retry as cp932
                # (Shift_JIS), which spreadsheet tools commonly emit for
                # Japanese CSV files. A second failure propagates as before.
                uploaded_file.seek(0)
                data = pd.read_csv(uploaded_file, encoding='cp932')
        else:
            data = pd.read_excel(uploaded_file)

        st.session_state.model_configs['uploaded_data'] = data
        
        # Headline figures: row count, column count and the number of columns
        # that contain at least one missing value
        n_missing_columns = data.isnull().any().sum()
        headline_metrics = (
            ("データ件数", f"{len(data):,}件"),
            ("項目数", f"{len(data.columns)}個"),
            ("欠損値を含む列", f"{n_missing_columns}個"),
        )
        for metric_column, (metric_label, metric_value) in zip(st.columns(3), headline_metrics):
            with metric_column:
                st.metric(metric_label, metric_value)

        # Preview of the first rows
        st.subheader("データプレビュー")
        st.dataframe(data.head(), use_container_width=True)

        # Summary statistics, collapsed by default
        with st.expander("データの基本統計量を表示", expanded=False):
            st.dataframe(data.describe(), use_container_width=True)

        # Per-column missing-value counts and rates, shown only when some
        # column actually has missing values
        if n_missing_columns > 0:
            with st.expander("欠損値の詳細を表示", expanded=False):
                null_counts = data.isnull().sum()
                null_rates = (null_counts / len(data) * 100).round(2)
                missing_data = pd.DataFrame({
                    '欠損値数': null_counts,
                    '欠損率(%)': null_rates
                }).reset_index()
                missing_data.columns = ['列名', '欠損値数', '欠損率(%)']
                rows_with_missing = missing_data['欠損値数'] > 0
                st.dataframe(missing_data[rows_with_missing], use_container_width=True)

        st.markdown("---")
        # 2. Model settings
        st.header("2. モデル設定")

        col1, col2 = st.columns(2)
        with col1:
            # Column to predict
            target_variable = st.selectbox(
                '予測対象（目的変数）の選択',
                options=data.columns,
                help="予測したい項目を選択してください"
            )
            st.session_state.model_configs['target_variable'] = target_variable

        with col2:
            # Columns to exclude from training (the target is not offered)
            ignore_features = st.multiselect(
                '除外する項目の選択',
                options=[col for col in data.columns if col != target_variable],
                help="モデルの学習に使用しない項目を選択してください"
            )
            st.session_state.model_configs['ignore_features'] = ignore_features

        # Data-processing options
        col1, col2, col3 = st.columns(3)
        with col1:
            remove_outliers = st.checkbox('外れ値を除去する', value=False)
        with col2:
            pca_option = st.checkbox('PCAを適用する', value=False)
        with col3:
            pca_components = None
            if pca_option:
                # Neither the target nor the user-excluded columns are fed to
                # PCA, so they must not count towards the upper bound.
                candidate_features = len(data.columns) - 1 - len(ignore_features)
                max_components = max(candidate_features, 1)
                if max_components > 1:
                    pca_components = st.slider('主成分の数', min_value=1, max_value=max_components, value=min(10, max_components))
                else:
                    # Only one possible value: skip the slider instead of
                    # building one whose range is empty or a single point.
                    pca_components = 1

        # Run the PyCaret setup and compare models when the button is pressed
        if st.button('モデルの比較を開始', use_container_width=True):
            try:
                with st.spinner("モデルを比較中..."):
                    # Progress indicators
                    progress_bar = st.progress(0)
                    status_text = st.empty()

                    # setup() replaces the experiment that later PyCaret calls
                    # (get_config, plot_model, ...) operate on. Results kept
                    # from an earlier run would no longer match it, so drop
                    # them before starting instead of leaving a stale model
                    # flagged as trained.
                    st.session_state.model_configs.update({
                        'setup_done': False,
                        'model_trained': False,
                        'current_model': None,
                        'final_model': None,
                        'model_comparison': None,
                        'pre_tuned_scores': None,
                        'post_tuned_scores': None
                    })

                    # Preprocessing
                    status_text.text("データの前処理を実行中...")
                    progress_bar.progress(20)

                    # Arguments for setup(); pca_components is only passed
                    # when PCA is enabled
                    setup_params = {
                        'data': data,
                        'target': target_variable,
                        'ignore_features': ignore_features,
                        'remove_outliers': remove_outliers,
                        'session_id': 123,
                        'verbose': False,
                        'pca': pca_option
                    }
                    if pca_option:
                        setup_params['pca_components'] = pca_components
                    setup(**setup_params)
                    st.session_state.model_configs['setup_done'] = True

                    # Remember the training features
                    X_train = get_config('X_train')
                    st.session_state.model_configs['X_train'] = X_train
                    st.session_state.model_configs['features'] = X_train.columns.tolist()

                    # Model comparison
                    progress_bar.progress(40)
                    status_text.text("モデルを比較中...")

                    # Only the score grid (fetched with pull() below) is used;
                    # the returned estimators are not kept.
                    compare_models(
                        exclude=['catboost'],
                        fold=5,
                        sort='MAE',
                        n_select=15,
                        verbose=False
                    )

                    # Store the comparison table
                    comparison_df = pull()
                    st.session_state.model_configs['model_comparison'] = comparison_df.copy()

                    progress_bar.progress(100)
                    status_text.text("モデルの比較が完了しました！")
                    st.success("✅ モデルの比較が完了しました！")

            except Exception as e:
                st.error(f"エラーが発生しました: {str(e)}")
                st.stop()

        # モデルの選択とチューニング
        if 'model_comparison' in st.session_state.model_configs and st.session_state.model_configs['model_comparison'] is not None:
            comparison_table = st.session_state.model_configs['model_comparison']

            st.markdown("---")
            st.header("モデルの選択とチューニング")

            # Show the stored comparison table
            st.subheader("モデル比較結果")
            st.dataframe(comparison_table, use_container_width=True)
            with st.expander("モデルの説明", expanded=False):
                st.markdown("""
                **モデルの説明をここに記載**（省略）
                """)

            # Placeholder explanation of the evaluation metrics
            with st.expander("評価指標の説明", expanded=False):
                st.markdown("""
                **評価指標の説明をここに記載**（省略）
                """)

            # Model to tune, chosen from the comparison table's index
            selected_model = st.selectbox(
                'チューニングするモデルを選択',
                options=comparison_table.index,
                help="比較結果から最適なモデルを選択してください"
            )

            # Hyperparameter controls
            st.subheader("ハイパーパラメータの設定")

            # Number of cross-validation folds
            cv_folds = st.slider('交差検証のfold数', min_value=2, max_value=20, value=5)

            # Model-specific hyperparameters; the sliders are created in the
            # order the dictionary entries are written. Other models get none.
            if selected_model in ('rf', 'et'):
                params = {
                    'n_estimators': st.slider('決定木の数', min_value=50, max_value=500, value=100, step=50),
                    'max_depth': st.slider('決定木の最大深さ', min_value=3, max_value=20, value=3),
                    'min_samples_split': st.slider('分割のための最小サンプル数', min_value=2, max_value=20, value=2),
                    'min_samples_leaf': st.slider('葉となるための最小サンプル数', min_value=1, max_value=10, value=1),
                }
            elif selected_model in ('xgboost', 'lightgbm'):
                params = {
                    'learning_rate': st.slider('学習率', min_value=0.01, max_value=0.3, value=0.1, step=0.01),
                    'n_estimators': st.slider('決定木の数', min_value=50, max_value=500, value=100, step=50),
                    'max_depth': st.slider('決定木の最大深さ', min_value=3, max_value=20, value=3),
                }
            else:
                params = {}

            # Create the selected model, tune it, and record both score tables
            if st.button('選択したモデルでトレーニング開始', use_container_width=True):
                try:
                    with st.spinner("モデルをトレーニング中..."):
                        progress_bar = st.progress(0)
                        status_text = st.empty()
                        configs = st.session_state.model_configs

                        # Build the base model with the chosen settings
                        status_text.text("モデルを作成中...")
                        progress_bar.progress(30)

                        base_model = create_model(selected_model, fold=cv_folds, **params)
                        pre_tuned_scores = pull()
                        configs['pre_tuned_scores'] = pre_tuned_scores.copy()

                        # Hyperparameter tuning
                        status_text.text("ハイパーパラメータをチューニング中...")
                        progress_bar.progress(60)
                        tuning_options = {'n_iter': 10, 'fold': cv_folds, 'optimize': 'MAE'}
                        tuned_model = tune_model(base_model, **tuning_options)

                        # Snapshot the post-tuning scores before touching the
                        # session state, then record the result
                        post_tuned_scores = pull()
                        post_tuned_snapshot = post_tuned_scores.copy()
                        configs['current_model'] = tuned_model
                        configs['post_tuned_scores'] = post_tuned_snapshot
                        configs['model_trained'] = True

                        # Side-by-side score tables: before and after tuning
                        st.subheader("チューニング結果")
                        before_column, after_column = st.columns(2)
                        with before_column:
                            st.write("チューニング前")
                            st.dataframe(pre_tuned_scores, use_container_width=True)
                        with after_column:
                            st.write("チューニング後")
                            st.dataframe(post_tuned_scores, use_container_width=True)

                        progress_bar.progress(100)
                        status_text.text("モデルのトレーニングが完了しました！")
                        st.success("✅ モデルのトレーニングが完了しました！")

                except Exception as e:
                    st.error(f"モデルのトレーニング中にエラーが発生しました: {str(e)}")
                    st.stop()

        # モデル評価
        if st.session_state.model_configs['model_trained']:
            st.markdown("---")
            st.header("3. モデルの評価")

            # 1. Feature-importance analysis
            st.subheader("3-1. 特徴量重要度")

            # Bound outside the try block: later sections of the script use
            # `model` as well and must not depend on this analysis succeeding.
            model = st.session_state.model_configs['current_model']
            try:
                X_train_transformed = get_config('X_train_transformed')
                feature_names = X_train_transformed.columns

                col1, col2 = st.columns(2)

                with col1:
                    st.write("モデルベースの特徴量重要度")
                    with st.spinner('特徴量重要度を計算中...'):
                        try:
                            if hasattr(model, 'feature_importances_'):
                                importance_df = pd.DataFrame({
                                    '特徴量': feature_names,
                                    '重要度': model.feature_importances_
                                }).sort_values(by='重要度', ascending=False)
                                # Plot. The figure is closed once rendered:
                                # pyplot keeps every figure alive until it is
                                # closed, and an open figure would remain the
                                # "current" one for the SHAP plot that follows.
                                fig, ax = plt.subplots(figsize=(10, 6))
                                try:
                                    sns.barplot(data=importance_df, x='重要度', y='特徴量', ax=ax)
                                    st.pyplot(fig)
                                finally:
                                    plt.close(fig)
                                st.caption("**モデルが学習した結果、各特徴量が目的変数の予測にどれだけ寄与したかを示しています。重要度が高い特徴量ほど、モデルの予測に大きな影響を与えています。**")
                                with st.expander("特徴量重要度データ", expanded=False):
                                    st.dataframe(importance_df, use_container_width=True)
                            else:
                                st.warning("このモデルでは特徴量重要度を直接取得できません。")
                        except Exception as e:
                            st.warning(f"特徴量重要度のプロットに失敗しました: {str(e)}")

                with col2:
                    st.write("SHAP値による特徴量重要度")
                    with st.spinner('SHAP値を計算中...'):
                        try:
                            explainer = shap.Explainer(model, X_train_transformed)
                            shap_values = explainer(X_train_transformed)
                            shap.summary_plot(shap_values, X_train_transformed, plot_type="bar", show=False)
                            # Render the figure SHAP drew on, then close it
                            # (rather than only clearing it) so it is released.
                            shap_fig = plt.gcf()
                            st.pyplot(shap_fig)
                            st.caption("**SHAP値に基づく特徴量の重要度を示しています。各特徴量が予測結果に与える影響を定量的に評価できます。（青：正の影響　緑：負の影響）**")
                            plt.close(shap_fig)
                            with st.expander("SHAP値データ", expanded=False):
                                shap_df = pd.DataFrame({
                                    '特徴量': feature_names,
                                    'SHAP値の平均絶対値': np.abs(shap_values.values).mean(axis=0)
                                }).sort_values(by='SHAP値の平均絶対値', ascending=False)
                                st.dataframe(shap_df, use_container_width=True)
                        except Exception as e:
                            st.warning(f"SHAP値の計算中にエラーが発生しました: {str(e)}")

            except Exception as e:
                st.error(f"特徴量重要度の分析中にエラーが発生しました: {str(e)}")
            st.markdown("---")
        
            # 2. Predictive-performance analysis: four PyCaret diagnostic
            # plots laid out as two rows of two columns
            st.subheader("3-2. 予測性能の分析と外れ値の検出")
            diagnostic_rows = (
                (
                    ("残差プロット", 'residuals', "残差（予測値と実測値の差）と予測値の関係を示しています。パターンがなければ、モデルが適切にフィットしていることを示唆します。"),
                    ("予測誤差プロット", 'error', "予測値と実測値の差を示しています。誤差が小さいほど、モデルの予測精度が高いことを示します。"),
                ),
                (
                    ("学習曲線", 'learning', "トレーニングデータと検証データに対するモデルのパフォーマンスを示します。学習曲線が収束していれば、モデルが適切に学習していることを示します。"),
                    ("クックの距離", 'cooks', "クックの距離が大きいデータポイントは、**モデルに強い影響を与える可能性がある**ため、外れ値の検出に役立ちます。"),
                ),
            )
            try:
                with st.spinner('予測性能を分析中...'):
                    for diagnostic_row in diagnostic_rows:
                        # A fresh pair of columns per row, filled left to right
                        for plot_column, (plot_title, plot_kind, plot_caption) in zip(st.columns(2), diagnostic_row):
                            with plot_column:
                                st.write(plot_title)
                                plot_model(model, plot=plot_kind, display_format="streamlit")
                                st.caption(plot_caption)
            except Exception as e:
                st.error(f"予測性能の分析中にエラーが発生しました: {str(e)}")
        
            # 4. Decision-tree visualisation (tree-based models only).
            # The model family is taken from the trained estimator itself,
            # not from the model selectbox: the selectbox can be changed
            # after training and would then point at a different model than
            # the one stored in the session.
            tree_model = st.session_state.model_configs['current_model']
            tree_model_ids = {
                'DecisionTreeRegressor': 'dt',
                'RandomForestRegressor': 'rf',
                'ExtraTreesRegressor': 'et',
                'GradientBoostingRegressor': 'gbr',
                'XGBRegressor': 'xgboost',
                'LGBMRegressor': 'lightgbm',
            }
            trained_model_id = tree_model_ids.get(type(tree_model).__name__)

            if trained_model_id is not None:
                st.markdown("---")
                st.subheader("3-4. 決定木の構造")
                try:
                    with st.spinner('決定木を可視化中...'):
                        # Passed as a plain list of column names
                        tree_feature_names = list(get_config('X_train_transformed').columns)

                        if trained_model_id == 'dt':
                            fig, ax = plt.subplots(figsize=(40, 20))
                            try:
                                plot_tree(
                                    tree_model,
                                    feature_names=tree_feature_names,
                                    filled=True,
                                    rounded=True,
                                    fontsize=12,
                                    ax=ax
                                )
                                st.pyplot(fig)
                            finally:
                                # Release the figure once rendered
                                plt.close(fig)
                            st.caption("決定木の構造を表示しています。")
                        else:
                            from sklearn.metrics import mean_squared_error
                            X_test_transformed = get_config('X_test_transformed')
                            y_test = get_config('y_test')

                            # One prediction vector per candidate tree index.
                            if trained_model_id in ('rf', 'et'):
                                # Each forest member is a complete predictor.
                                candidate_predictions = (
                                    tree.predict(X_test_transformed)
                                    for tree in tree_model.estimators_
                                )
                            elif trained_model_id == 'gbr':
                                # Boosting stages are fitted to residuals, so a
                                # single stage's output is not comparable with
                                # y_test; score the ensemble after each stage.
                                candidate_predictions = tree_model.staged_predict(X_test_transformed)
                            elif trained_model_id == 'xgboost':
                                n_trees = len(tree_model.get_booster().get_dump())
                                # iteration_range=(0, k) uses the first k
                                # boosting rounds; it replaces the removed
                                # ntree_limit argument.
                                candidate_predictions = (
                                    tree_model.predict(X_test_transformed, iteration_range=(0, idx + 1))
                                    for idx in range(n_trees)
                                )
                            else:  # lightgbm
                                n_trees = len(tree_model.booster_.dump_model()['tree_info'])
                                candidate_predictions = (
                                    tree_model.predict(X_test_transformed, num_iteration=idx + 1)
                                    for idx in range(n_trees)
                                )

                            # Keep the index with the lowest test-set MSE
                            best_score = float('inf')
                            best_estimator_index = 0
                            for idx, y_pred in enumerate(candidate_predictions):
                                mse = mean_squared_error(y_test, y_pred)
                                if mse < best_score:
                                    best_score = mse
                                    best_estimator_index = idx

                            if trained_model_id in ('rf', 'et', 'gbr'):
                                best_tree = tree_model.estimators_[best_estimator_index]
                                if trained_model_id == 'gbr':
                                    # estimators_ holds one array of trees per
                                    # stage; take the stage's first tree
                                    best_tree = best_tree[0]
                                fig, ax = plt.subplots(figsize=(40, 20))
                                try:
                                    plot_tree(
                                        best_tree,
                                        feature_names=tree_feature_names,
                                        filled=True,
                                        rounded=True,
                                        fontsize=12,
                                        ax=ax
                                    )
                                    st.pyplot(fig)
                                finally:
                                    plt.close(fig)
                                st.caption(f"ベストな決定木（ツリー番号: {best_estimator_index}）の構造を表示しています。")
                            elif trained_model_id == 'xgboost':
                                import xgboost as xgb
                                booster = tree_model.get_booster()
                                fig, ax = plt.subplots(figsize=(40, 20))
                                try:
                                    xgb.plot_tree(booster, num_trees=best_estimator_index, ax=ax)
                                    st.pyplot(fig)
                                finally:
                                    plt.close(fig)
                                st.caption(f"ベストな決定木（ツリー番号: {best_estimator_index}）の構造を表示しています。")
                            else:  # lightgbm
                                import lightgbm as lgb
                                graph = lgb.create_tree_digraph(tree_model, tree_index=best_estimator_index)
                                st.graphviz_chart(graph)
                                st.caption(f"ベストな決定木（ツリー番号: {best_estimator_index}）の構造を表示しています。")
                except Exception as e:
                    st.error(f"決定木の可視化中にエラーが発生しました: {str(e)}")

            # 5. Finalise the model, show its scores and offer it for download
            st.markdown("---")
            st.header("4. モデルのファイナライズ")

            if st.button('モデルをファイナライズ', use_container_width=True):
                try:
                    with st.spinner("モデルをファイナライズ中..."):
                        import os
                        import tempfile

                        progress_bar = st.progress(0)
                        status_text = st.empty()

                        status_text.text("モデルをファイナライズ中...")
                        progress_bar.progress(30)
                        # Read the tuned model from the session state so this
                        # step does not rely on a name bound by an earlier
                        # section of the script.
                        final_model = finalize_model(st.session_state.model_configs['current_model'])
                        st.session_state.model_configs['final_model'] = final_model

                        status_text.text("最終評価を実行中...")
                        progress_bar.progress(60)
                        # Called for its score grid, which pull() fetches next
                        predict_model(final_model)
                        final_scores = pull()

                        st.subheader("ファイナルモデルの評価結果")
                        col1, col2 = st.columns(2)
                        with col1:
                            st.write("チューニング後の評価結果")
                            st.dataframe(st.session_state.model_configs['post_tuned_scores'], use_container_width=True)
                        with col2:
                            st.write("ファイナライズ後の評価結果")
                            st.dataframe(final_scores, use_container_width=True)

                        # Record the target column name on the saved object
                        final_model.target_column = target_variable

                        status_text.text("モデルを保存中...")
                        progress_bar.progress(90)
                        model_name = f"{target_variable}_finalized_model"

                        # Write the pickle into a private temporary directory
                        # under a fixed base name. The column name is used only
                        # for the download file name, so characters that are
                        # invalid in paths cannot break the save, sessions do
                        # not overwrite each other's files, and nothing is
                        # left behind in the working directory.
                        with tempfile.TemporaryDirectory() as tmp_dir:
                            saved_path = os.path.join(tmp_dir, "finalized_model")
                            # save_model appends the ".pkl" extension itself
                            save_model(final_model, saved_path)
                            with open(f"{saved_path}.pkl", 'rb') as f:
                                model_bytes = f.read()

                        st.download_button(
                            label="ファイナライズしたモデルをダウンロード",
                            data=model_bytes,
                            file_name=f"{model_name}.pkl",
                            mime="application/octet-stream",
                            use_container_width=True
                        )

                        progress_bar.progress(100)
                        status_text.text("ファイナライズが完了しました！")
                        st.success("✅ モデルのファイナライズが完了しました！")

                except Exception as e:
                    st.error(f"モデルのファイナライズ中にエラーが発生しました: {str(e)}")

    except Exception as e:
        st.error(f"予期せぬエラーが発生しました: {str(e)}")

# Copyright notice, rendered by the project's `common` module at the end of
# the page whether or not a file has been uploaded
common.display_copyright()