Spaces:

itou-daiki
/

pycaret_datascience_streamlit_demo

Sleeping

App Files Files Community

itou-daiki committed on Feb 8

Commit

9e5faab

verified ·

1 Parent(s): f4dc268

Upload 2 files

Browse files

Files changed (2) hide show

app.py +28 -49
common.py +48 -0

app.py CHANGED Viewed

@@ -11,10 +11,12 @@ import shap
 from pycaret.regression import *
 from sklearn.tree import plot_tree
 from sklearn.inspection import permutation_importance
 import matplotlib.font_manager as fm
 # フォントファイルのパスを指定
-font_path = 'ipaexg.ttf'  # フォントファイルのパスを正確に指定してください
 # フォントプロパティを作成
 font_prop = fm.FontProperties(fname=font_path)
@@ -49,7 +51,7 @@ if 'model_configs' not in st.session_state:
 # メインアプリケーション
 st.title("easyAutoML（回帰）")
-st.caption("Created by Dit-Lab.(Daiki Ito)")
 with st.expander("このアプリケーションについて", expanded=False):
     st.markdown("""
@@ -93,15 +95,6 @@ with st.expander("このアプリケーションについて", expanded=False):
     - トレーニング済みモデルのダウンロード
     """)
-    st.markdown("### 💡 使用方法")
-    st.markdown("""
-    1. CSVまたはExcelファイルをアップロード
-    2. 予測したい目的変数を選択
-    3. モデルの比較と選択
-    4. 選択したモデルのチューニングと評価
-    5. 最終モデルの保存
-    """)
     st.info("⚠️ 注意: データの前処理（欠損値の処理など）は自動的に行われますが、データの品質が結果に大きく影響します。")
 # 1. データアップロード部分
@@ -196,9 +189,17 @@ if uploaded_file is not None:
             st.session_state.model_configs['ignore_features'] = ignore_features
         # データ処理オプション
-        col1, col2 = st.columns(2)
         with col1:
             remove_outliers = st.checkbox('外れ値を除去する', value=False)
         # モデル比較の実行
         if st.button('モデルの比較を開始', use_container_width=True):
@@ -208,19 +209,23 @@ if uploaded_file is not None:
                     progress_bar = st.progress(0)
                     status_text = st.empty()
-                    # データのセットアップ
                     status_text.text("データの前処理を実行中...")
                     progress_bar.progress(20)
-                    # セットアップの実行
-                    setup_data = setup(
-                        data=data,
-                        target=target_variable,
-                        ignore_features=ignore_features,
-                        remove_outliers=remove_outliers,
-                        session_id=123,
-                        verbose=False
-                    )
                     # 特徴量の保存
                     X_train = get_config('X_train')
@@ -385,9 +390,7 @@ if uploaded_file is not None:
                                 fig, ax = plt.subplots(figsize=(10, 6))
                                 sns.barplot(data=importance_df, x='重要度', y='特徴量', ax=ax)
                                 st.pyplot(fig)
-                                # キャプションを追加
                                 st.caption("**モデルが学習した結果、各特徴量が目的変数の予測にどれだけ寄与したかを示しています。重要度が高い特徴量ほど、モデルの予測に大きな影響を与えています。**")
-                                # データフレームを表示
                                 with st.expander("特徴量重要度データ", expanded=False):
                                     st.dataframe(importance_df, use_container_width=True)
                             else:
@@ -399,16 +402,12 @@ if uploaded_file is not None:
                     st.write("SHAP値による特徴量重要度")
                     with st.spinner('SHAP値を計算中...'):
                         try:
-                            # SHAP値の計算
                             explainer = shap.Explainer(model, X_train_transformed)
                             shap_values = explainer(X_train_transformed)
-                            # SHAPサマリープロット
                             shap.summary_plot(shap_values, X_train_transformed, plot_type="bar", show=False)
                             st.pyplot(plt.gcf())
-                            # キャプションを追加
                             st.caption("**SHAP値に基づく特徴量の重要度を示しています。各特徴量が予測結果に与える影響を定量的に評価できます。（青：正の影響　緑：負の影響）**")
                             plt.clf()
-                            # SHAP値をデータフレームで表示
                             with st.expander("SHAP値データ", expanded=False):
                                 shap_df = pd.DataFrame({
                                     '特徴量': feature_names,
@@ -437,7 +436,6 @@ if uploaded_file is not None:
                         plot_model(model, plot='error', display_format="streamlit")
                         st.caption("予測値と実測値の差を示しています。誤差が小さいほど、モデルの予測精度が高いことを示します。")
-                    # 追加の評価図を表示
                     col3, col4 = st.columns(2)
                     with col3:
                         st.write("学習曲線")
@@ -458,8 +456,6 @@ if uploaded_file is not None:
                 try:
                     with st.spinner('決定木を可視化中...'):
                         if selected_model == 'dt':
-                            # 決定木モデルの場合、直接ツリーをプロット
-                            # 図を描画
                             fig, ax = plt.subplots(figsize=(40, 20))
                             plot_tree(
                                 model,
@@ -472,14 +468,10 @@ if uploaded_file is not None:
                             st.pyplot(fig)
                             st.caption("決定木の構造を表示しています。")
                         else:
-                            # ランダムフォレストや勾配ブースティングなどの場合
                             from sklearn.metrics import mean_squared_error
-                            # テストデータを取得
                             X_test_transformed = get_config('X_test_transformed')
                             y_test = get_config('y_test')
-                            # 各決定木の性能を評価
                             if selected_model in ['rf', 'et']:
                                 estimators = model.estimators_
                             elif selected_model == 'gbr':
@@ -511,10 +503,8 @@ if uploaded_file is not None:
                                     best_score = mse
                                     best_estimator_index = idx
-                            # ベストなツリーを取得
                             if selected_model in ['rf', 'et', 'gbr']:
                                 best_tree = estimators[best_estimator_index]
-                                # 図を描画
                                 fig, ax = plt.subplots(figsize=(40, 20))
                                 plot_tree(
                                     best_tree,
@@ -527,7 +517,6 @@ if uploaded_file is not None:
                                 st.pyplot(fig)
                                 st.caption(f"ベストな決定木（ツリー番号: {best_estimator_index}）の構造を表示しています。")
                             elif selected_model == 'xgboost':
-                                # xgboostの場合
                                 import xgboost as xgb
                                 booster = model.get_booster()
                                 fig, ax = plt.subplots(figsize=(40, 20))
@@ -535,7 +524,6 @@ if uploaded_file is not None:
                                 st.pyplot(fig)
                                 st.caption(f"ベストな決定木（ツリー番号: {best_estimator_index}）の構造を表示しています。")
                             elif selected_model == 'lightgbm':
-                                # lightgbmの場合
                                 import lightgbm as lgb
                                 graph = lgb.create_tree_digraph(model, tree_index=best_estimator_index)
                                 st.graphviz_chart(graph)
@@ -555,18 +543,15 @@ if uploaded_file is not None:
                         progress_bar = st.progress(0)
                         status_text = st.empty()
-                        # モデルのファイナライズ
                         status_text.text("モデルをファイナライズ中...")
                         progress_bar.progress(30)
                         final_model = finalize_model(model)
-                        # 最終評価
                         status_text.text("最終評価を実行中...")
                         progress_bar.progress(60)
                         predictions = predict_model(final_model)
                         final_scores = pull()
-                        # 評価結果の表示
                         st.subheader("ファイナルモデルの評価結果")
                         col1, col2 = st.columns(2)
                         with col1:
@@ -576,19 +561,14 @@ if uploaded_file is not None:
                             st.write("ファイナライズ後の評価結果")
                             st.dataframe(final_scores, use_container_width=True)
-                        # 目的変数の名前をモデルに保存
                         final_model.target_column = target_variable
-                        # モデルの保存
                         status_text.text("モデルを保存中...")
                         progress_bar.progress(90)
                         model_name = f"{target_variable}_finalized_model"
-                        # モデルの保存
                         save_model(final_model, model_name)
-                        # モデルの読み込み
                         with open(f"{model_name}.pkl", 'rb') as f:
                             model_bytes = f.read()
@@ -611,5 +591,4 @@ if uploaded_file is not None:
         st.error(f"予期せぬエラーが発生しました: {str(e)}")
 # コピーライト情報
-st.markdown("---")
-st.caption('© 2022-2024 Dit-Lab.(Daiki Ito). All Rights Reserved.')

 from pycaret.regression import *
 from sklearn.tree import plot_tree
 from sklearn.inspection import permutation_importance
+import common
 import matplotlib.font_manager as fm
 # フォントファイルのパスを指定
+font_path = 'ipaexg.ttf'
 # フォントプロパティを作成
 font_prop = fm.FontProperties(fname=font_path)
 # メインアプリケーション
 st.title("easyAutoML（回帰）")
+common.display_header()
 with st.expander("このアプリケーションについて", expanded=False):
     st.markdown("""
     - トレーニング済みモデルのダウンロード
     """)
     st.info("⚠️ 注意: データの前処理（欠損値の処理など）は自動的に行われますが、データの品質が結果に大きく影響します。")
 # 1. データアップロード部分
             st.session_state.model_configs['ignore_features'] = ignore_features
         # データ処理オプション
+        col1, col2, col3 = st.columns(3)
         with col1:
             remove_outliers = st.checkbox('外れ値を除去する', value=False)
+        with col2:
+            pca_option = st.checkbox('PCAを適用する', value=False)
+        with col3:
+            if pca_option:
+                max_components = len(data.columns) - 1  # 目的変数を除く
+                pca_components = st.slider('主成分の数', min_value=1, max_value=max_components, value=min(10, max_components))
+            else:
+                pca_components = None
         # モデル比較の実行
         if st.button('モデルの比較を開始', use_container_width=True):
                     progress_bar = st.progress(0)
                     status_text = st.empty()
+                    # データの前処理
                     status_text.text("データの前処理を実行中...")
                     progress_bar.progress(20)
+                    # setup() に PCA 関連のパラメータを追加
+                    setup_params = {
+                        'data': data,
+                        'target': target_variable,
+                        'ignore_features': ignore_features,
+                        'remove_outliers': remove_outliers,
+                        'session_id': 123,
+                        'verbose': False,
+                        'pca': pca_option
+                    }
+                    if pca_option:
+                        setup_params['pca_components'] = pca_components
+                    setup_data = setup(**setup_params)
                     # 特徴量の保存
                     X_train = get_config('X_train')
                                 fig, ax = plt.subplots(figsize=(10, 6))
                                 sns.barplot(data=importance_df, x='重要度', y='特徴量', ax=ax)
                                 st.pyplot(fig)
                                 st.caption("**モデルが学習した結果、各特徴量が目的変数の予測にどれだけ寄与したかを示しています。重要度が高い特徴量ほど、モデルの予測に大きな影響を与えています。**")
                                 with st.expander("特徴量重要度データ", expanded=False):
                                     st.dataframe(importance_df, use_container_width=True)
                             else:
                     st.write("SHAP値による特徴量重要度")
                     with st.spinner('SHAP値を計算中...'):
                         try:
                             explainer = shap.Explainer(model, X_train_transformed)
                             shap_values = explainer(X_train_transformed)
                             shap.summary_plot(shap_values, X_train_transformed, plot_type="bar", show=False)
                             st.pyplot(plt.gcf())
                             st.caption("**SHAP値に基づく特徴量の重要度を示しています。各特徴量が予測結果に与える影響を定量的に評価できます。（青：正の影響　緑：負の影響）**")
                             plt.clf()
                             with st.expander("SHAP値データ", expanded=False):
                                 shap_df = pd.DataFrame({
                                     '特徴量': feature_names,
                         plot_model(model, plot='error', display_format="streamlit")
                         st.caption("予測値と実測値の差を示しています。誤差が小さいほど、モデルの予測精度が高いことを示します。")
                     col3, col4 = st.columns(2)
                     with col3:
                         st.write("学習曲線")
                 try:
                     with st.spinner('決定木を可視化中...'):
                         if selected_model == 'dt':
                             fig, ax = plt.subplots(figsize=(40, 20))
                             plot_tree(
                                 model,
                             st.pyplot(fig)
                             st.caption("決定木の構造を表示しています。")
                         else:
                             from sklearn.metrics import mean_squared_error
                             X_test_transformed = get_config('X_test_transformed')
                             y_test = get_config('y_test')
                             if selected_model in ['rf', 'et']:
                                 estimators = model.estimators_
                             elif selected_model == 'gbr':
                                     best_score = mse
                                     best_estimator_index = idx
                             if selected_model in ['rf', 'et', 'gbr']:
                                 best_tree = estimators[best_estimator_index]
                                 fig, ax = plt.subplots(figsize=(40, 20))
                                 plot_tree(
                                     best_tree,
                                 st.pyplot(fig)
                                 st.caption(f"ベストな決定木（ツリー番号: {best_estimator_index}）の構造を表示しています。")
                             elif selected_model == 'xgboost':
                                 import xgboost as xgb
                                 booster = model.get_booster()
                                 fig, ax = plt.subplots(figsize=(40, 20))
                                 st.pyplot(fig)
                                 st.caption(f"ベストな決定木（ツリー番号: {best_estimator_index}）の構造を表示しています。")
                             elif selected_model == 'lightgbm':
                                 import lightgbm as lgb
                                 graph = lgb.create_tree_digraph(model, tree_index=best_estimator_index)
                                 st.graphviz_chart(graph)
                         progress_bar = st.progress(0)
                         status_text = st.empty()
                         status_text.text("モデルをファイナライズ中...")
                         progress_bar.progress(30)
                         final_model = finalize_model(model)
                         status_text.text("最終評価を実行中...")
                         progress_bar.progress(60)
                         predictions = predict_model(final_model)
                         final_scores = pull()
                         st.subheader("ファイナルモデルの評価結果")
                         col1, col2 = st.columns(2)
                         with col1:
                             st.write("ファイナライズ後の評価結果")
                             st.dataframe(final_scores, use_container_width=True)
                         final_model.target_column = target_variable
                         status_text.text("モデルを保存中...")
                         progress_bar.progress(90)
                         model_name = f"{target_variable}_finalized_model"
                         save_model(final_model, model_name)
                         with open(f"{model_name}.pkl", 'rb') as f:
                             model_bytes = f.read()
         st.error(f"予期せぬエラーが発生しました: {str(e)}")
 # コピーライト情報
+common.display_copyright()

common.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import streamlit as st
+def display_header():
+    st.caption("Created by Dit-Lab.(Daiki Ito)")
+def set_font():
+    import matplotlib.pyplot as plt
+    import japanize_matplotlib
+    font_path = "ipaexg.ttf"
+    plt.rcParams['font.family'] = 'IPAexGothic'
+def display_guide():
+    st.markdown("""
+    - [**情報探究ステップアップガイド**](https://dit-lab.notion.site/612d9665350544aa97a2a8514a03c77c?v=85ad37a3275b4717a0033516b9cfd9cc)
+    - [**中の人のページ（Dit-Lab.）**](https://dit-lab.notion.site/Dit-Lab-da906d09d3cf42a19a011cf4bf25a673?pvs=4)
+    """)
+def display_link():
+    st.header("リンク")
+    st.markdown("""
+    - [**中の人のページ（Dit-Lab.）**](https://dit-lab.notion.site/Dit-Lab-da906d09d3cf42a19a011cf4bf25a673?pvs=4)
+    - [**進数変換学習アプリ**](https://easy-base-converter.streamlit.app)
+    - [**easyRSA**](https://easy-rsa.streamlit.app/)
+    - [**easyAutoML（回帰）**](https://huggingface.co/spaces/itou-daiki/pycaret_datascience_streamlit)
+    - [**pkl_predict_reg**](https://huggingface.co/spaces/itou-daiki/pkl_predict_reg)
+    - [**音のデータサイエンス**](https://audiovisualizationanalysis-bpeekdjwymuf6nkqcb4cqy.streamlit.app)
+    - [**3D RGB Cube Visualizer**](https://boxplot-4-mysteams.streamlit.app)
+    - [**上マーク角度計算補助ツール**](https://sailing-mark-angle.streamlit.app)
+    - [**Factor Score Calculator**](https://factor-score-calculator.streamlit.app/)
+    - [**easy Excel Merge**](https://easy-xl-merge.streamlit.app)
+    - [**フィードバックはこちらまで**](https://forms.gle/G5sMYm7dNpz2FQtU9)
+    - [**ソースコードはこちら（GitHub）**](https://github.com/itou-daiki/easy_stat)
+    """)
+def display_copyright():
+    st.subheader("")
+    st.write('ご意見・ご要望は→', 'https://forms.gle/G5sMYm7dNpz2FQtU9', 'まで')
+    st.write("")
+    st.subheader('© 2022-2025 Dit-Lab.(Daiki Ito). All Rights Reserved.')
+    st.write("easyStat: Open Source for Ubiquitous Statistics")
+    st.write("Democratizing data, everywhere.")
+    st.write("")
+def display_special_thanks():
+    st.subheader("In collaboration with our esteemed contributors:")
+    st.write("・Toshiyuki")
+    st.write("With heartfelt appreciation for their dedication and support.")