Spaces:

itou-daiki
/

pycaret_datascience_streamlit_demo

Sleeping

App Files Files Community

itou-daiki committed on May 30, 2024

Commit

f2d31c5

verified ·

1 Parent(s): e259083

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -82

app.py CHANGED Viewed

@@ -32,15 +32,18 @@ if uploaded_file is not None and ('last_uploaded_file' not in st.session_state o
     st.session_state.last_uploaded_file = uploaded_file.name  # 最後にアップロードされたファイル名を保存
 if uploaded_file:
-    if uploaded_file.name.endswith('.csv'):
-        train_data = pd.read_csv(uploaded_file)
-    elif uploaded_file.name.endswith('.xlsx'):
-        train_data = pd.read_excel(uploaded_file)
-    else:
-        st.error("無効なファイルタイプです。CSVまたはExcelファイルをアップロードしてください.")
-    st.session_state.uploaded_data = train_data
-    st.dataframe(train_data.head())
 # ターゲット変数の選択
 if 'uploaded_data' in st.session_state:
@@ -62,7 +65,7 @@ if 'uploaded_data' in st.session_state:
     # Excelファイルダウンロード機能
     towrite = io.BytesIO()
-    downloaded_file = filtered_data.to_excel(towrite, encoding='utf-8', index=False, header=True)
     towrite.seek(0)
     b64 = base64.b64encode(towrite.read()).decode()
     original_filename = uploaded_file.name.split('.')[0]  # 元のファイル名を取得
@@ -83,41 +86,68 @@ if 'uploaded_data' in st.session_state:
             st.warning("ターゲット変数に欠損値が含まれているレコードを削除します。")
             st.session_state.uploaded_data = st.session_state.uploaded_data.dropna(subset=[target_variable])
         # 前処理（モデル作成の準備）
         if 'exp_clf101_setup_done' not in st.session_state:  # このセッションで既にセットアップが完了していない場合のみ実行
-            with st.spinner('データの前処理中...'):
-                exp_clf101 = setup(data=st.session_state.uploaded_data,
-                                   target=target_variable,
-                                   session_id=123,
-                                   remove_outliers=remove_outliers_option,
-                                   ignore_features=ignore_variable)
-                st.session_state.exp_clf101 = exp_clf101
-                st.session_state.exp_clf101_setup_done = True  # セットアップ完了フラグをセッションに保存
-        st.info("前処理が完了しました")
         setup_list_df = pull()
         st.write("前処理の結果")
         st.caption("以下は、前処理のステップとそれに伴うデータのパラメータを示す表です。")
         st.write(setup_list_df)
         # モデルの比較
-        with st.spinner('モデルを比較中...'):
-            models_comparison = compare_models(exclude=['dummy','catboost'])
-            st.session_state.models_comparison = models_comparison  # セッション状態にモデル比較を保存
-            models_comparison_df = pull()
-            st.session_state.models_comparison_df = models_comparison_df
 # モデルの選択とチューニング
     if 'models_comparison' in st.session_state:
-        st.success("モデルの比較が完了しました！")
         # モデル比較の表示
         models_comparison_df = pull()
         st.session_state.models_comparison_df = models_comparison_df
         st.write("モデル比較結果")
         st.caption("以下は、利用可能な各モデルの性能を示す表です。")
         st.dataframe(st.session_state.models_comparison_df)
         st.header("5. モデルの選択とチューニング")
         st.caption("最も性能の良いモデルを選択し、さらにそのモデルのパラメータをチューニングします。")
         selected_model_name = st.selectbox('使用するモデルを選択してください。', st.session_state.models_comparison_df.index)
@@ -130,6 +160,8 @@ if 'uploaded_data' in st.session_state:
         if selected_model_name in tree_models:
             max_depth = st.slider("決定木の最大の深さを選択", 1, 10, 3)  # 例として最小1、最大10、デフォルト3
         if st.button('チューニングの実行'):
             with st.spinner('チューニング中...'):
@@ -153,17 +185,23 @@ if 'uploaded_data' in st.session_state:
                     st.write(setup_tuned_model_df)
                 st.caption("上記表は、チューニング前後のモデルの交差検証結果を示す表です。")
                 # チューニング後のモデルを保存
                 if 'tuned_model' in st.session_state:
                     # モデルをバイナリ形式で保存
-                    with open("tuned_model.pkl", "wb") as f:
                         joblib.dump(st.session_state.tuned_model, f)
                     # ファイルをbase64エンコードしてダウンロードリンクを作成
-                    with open("tuned_model.pkl", "rb") as f:
                         model_file = f.read()
                         model_b64 = base64.b64encode(model_file).decode()
-                        href = f'<a href="data:application/octet-stream;base64,{model_b64}" download="tuned_model.pkl">チューニングされたモデルをダウンロード</a>'
                         st.markdown(href, unsafe_allow_html=True)
                 st.header("6. モデルの可視化及び評価")
@@ -178,67 +216,69 @@ if 'uploaded_data' in st.session_state:
                     ('vc', '＜検証曲線＞', 'パラメータの異なる値に対するモデルの性能を示しています'),
                     ('manifold', '＜マニホールド学習＞', '高次元データを2次元にマッピングしたものを示しています')
                 ]
                 for plot_type, plot_name, plot_description in plot_types:
-                    with st.spinner('プロット中...'):
-                        try:
-                            st.write(plot_name)
                             img = plot_model(tuned_model, plot=plot_type, display_format="streamlit", save=True)
                             st.image(img)
                             st.caption(plot_description)  # グラフの説明を追加
-                        except Exception as e:
-                            st.warning(f"{plot_name}の表示中にエラーが発生しました: {str(e)}")
                 # 決定木のプロット
                 if selected_model_name in tree_models:
-                    st.write("＜決定木のプロット＞")
                     st.caption("決定木は、モデルがどのように予測を行っているかを理解するのに役立ちます。")
-                    with st.spinner('プロット中...'):
-                        if selected_model_name in ['dt']:
-                            from sklearn.tree import plot_tree
-                            fig, ax = plt.subplots(figsize=(20,10))
-                            plot_tree(tuned_model, proportion=True, filled=True, rounded=True, ax=ax, max_depth=3)
-                            st.pyplot(fig)
-                        elif selected_model_name in ['rf', 'et']:
-                            from sklearn.tree import plot_tree
-                            fig, ax = plt.subplots(figsize=(20,10))
-                            plot_tree(tuned_model.estimators_[0], feature_names=train_data.columns, proportion=True, filled=True, rounded=True, ax=ax, max_depth=3)
-                            st.pyplot(fig)
-                        elif selected_model_name == 'ada':
-                            from sklearn.tree import plot_tree
-                            base_estimator = tuned_model.get_model().estimators_[0]
-                            fig, ax = plt.subplots(figsize=(20,10))
-                            plot_tree(base_estimator, filled=True, rounded=True, ax=ax, max_depth=3)
-                            st.pyplot(fig)
-                        elif selected_model_name == 'gbr':
-                            from sklearn.tree import plot_tree
-                            base_estimator = tuned_model.get_model().estimators_[0][0]
-                            fig, ax = plt.subplots(figsize=(20,10))
-                            plot_tree(base_estimator, filled=True, rounded=True, ax=ax, max_depth=3)
-                            st.pyplot(fig)
-                        elif selected_model_name == 'catboost':
-                            from catboost import CatBoostClassifier, plot_tree
-                            catboost_model = tuned_model.get_model()
-                            fig, ax = plt.subplots(figsize=(20,10))
-                            plot_tree(catboost_model, tree_idx=0, ax=ax, max_depth=3)
-                            st.pyplot(fig)
-                        elif selected_model_name == 'lightgbm':
-                            import lightgbm as lgb
-                            booster = tuned_model._Booster  # LightGBM Booster object
-                            fig, ax = plt.subplots(figsize=(20,10))
-                            lgb.plot_tree(booster, tree_index=0, ax=ax, max_depth=3)
-                            st.pyplot(fig)
-                        elif selected_model_name == 'xgboost':
-                            import xgboost as xgb
-                            booster = tuned_model.get_booster()  # XGBoost Booster object
-                            fig, ax = plt.subplots(figsize=(20,10))
-                            xgb.plot_tree(booster, num_trees=0, ax=ax, max_depth=3)
-                            st.pyplot(fig)

     st.session_state.last_uploaded_file = uploaded_file.name  # 最後にアップロードされたファイル名を保存
 if uploaded_file:
+    try:
+        if uploaded_file.name.endswith('.csv'):
+            train_data = pd.read_csv(uploaded_file)
+        elif uploaded_file.name.endswith('.xlsx'):
+            train_data = pd.read_excel(uploaded_file)
+        else:
+            raise ValueError("無効なファイルタイプです。CSVまたはExcelファイルをアップロードしてください。")
+        st.session_state.uploaded_data = train_data
+        st.dataframe(train_data.head())
+    except Exception as e:
+        st.error(str(e))
 # ターゲット変数の選択
 if 'uploaded_data' in st.session_state:
     # Excelファイルダウンロード機能
     towrite = io.BytesIO()
+    downloaded_file = filtered_data.to_excel(towrite, index=False, header=True)
     towrite.seek(0)
     b64 = base64.b64encode(towrite.read()).decode()
     original_filename = uploaded_file.name.split('.')[0]  # 元のファイル名を取得
             st.warning("ターゲット変数に欠損値が含まれているレコードを削除します。")
             st.session_state.uploaded_data = st.session_state.uploaded_data.dropna(subset=[target_variable])
+        # 前処理の進捗状況を表示
+        progress_bar = st.progress(0)
+        status_text = st.empty()
         # 前処理（モデル作成の準備）
         if 'exp_clf101_setup_done' not in st.session_state:  # このセッションで既にセットアップが完了していない場合のみ実行
+            try:
+                with st.spinner('データの前処理中...'):
+                    exp_clf101 = setup(data=st.session_state.uploaded_data,
+                                       target=target_variable,
+                                       session_id=123,
+                                       remove_outliers=remove_outliers_option,
+                                       ignore_features=ignore_variable)
+                    st.session_state.exp_clf101 = exp_clf101
+                    st.session_state.exp_clf101_setup_done = True  # セットアップ完了フラグをセッションに保存
+                    # 前処理の進捗状況を更新
+                    progress_bar.progress(50)
+                    status_text.text("前処理が完了しました。")
+            except Exception as e:
+                st.error(f"前処理中にエラーが発生しました: {str(e)}")
+                st.stop()
         setup_list_df = pull()
         st.write("前処理の結果")
         st.caption("以下は、前処理のステップとそれに伴うデータのパラメータを示す表です。")
         st.write(setup_list_df)
         # モデルの比較
+        try:
+            with st.spinner('モデルを比較中...'):
+                models_comparison = compare_models(exclude=['dummy','catboost'])
+                st.session_state.models_comparison = models_comparison  # セッション状態にモデル比較を保存
+                models_comparison_df = pull()
+                st.session_state.models_comparison_df = models_comparison_df
+                # モデル比較の進捗状況を更新
+                progress_bar.progress(100)
+                status_text.text("モデルの比較が完了しました！")
+        except Exception as e:
+            st.error(f"モデルの比較中にエラーが発生しました: {str(e)}")
+            st.stop()
 # モデルの選択とチューニング
     if 'models_comparison' in st.session_state:
         # モデル比較の表示
         models_comparison_df = pull()
         st.session_state.models_comparison_df = models_comparison_df
         st.write("モデル比較結果")
         st.caption("以下は、利用可能な各モデルの性能を示す表です。")
         st.dataframe(st.session_state.models_comparison_df)
+        # モデル比較結果の解釈の説明を追加
+        st.write("モデル比較結果の解釈:")
+        st.write("- Accuracy: モデルの予測精度を示します。値が高いほどモデルの性能が良いことを示します。")
+        st.write("- AUC: ROC曲線下の面積を示します。値が高いほどモデルの性能が良いことを示します。")
+        st.write("- Recall: 実際の正例のうち、正しく正例と予測された割合を示します。")
+        st.write("- Precision: 正例と予測されたもののうち、実際に正例である割合を示します。")
+        st.write("- F1: RecallとPrecisionの調和平均を示します。両者のバランスを考慮した指標です。")
+        st.write("- Kappa: モデルの予測結果と実際の結果の一致度を示します。値が高いほどモデルの性能が良いことを示します。")
+        st.write("- MCC: 不均衡データにおけるモデルの性能を示します。値が高いほどモデルの性能が良いことを示します。")
         st.header("5. モデルの選択とチューニング")
         st.caption("最も性能の良いモデルを選択し、さらにそのモデルのパラメータをチューニングします。")
         selected_model_name = st.selectbox('使用するモデルを選択してください。', st.session_state.models_comparison_df.index)
         if selected_model_name in tree_models:
             max_depth = st.slider("決定木の最大の深さを選択", 1, 10, 3)  # 例として最小1、最大10、デフォルト3
+        # モデル名の入力
+        model_name = st.text_input("保存するモデルの名前を入力してください", value="tuned_model")
         if st.button('チューニングの実行'):
             with st.spinner('チューニング中...'):
                     st.write(setup_tuned_model_df)
                 st.caption("上記表は、チューニング前後のモデルの交差検証結果を示す表です。")
+                # チューニング前後の比較結果の解釈の説明を追加
+                st.write("チューニング前後の比較結果の解釈:")
+                st.write("- チューニング後の方が、Accuracy、AUC、Recall、Precision、F1、Kappa、MCCの値が高い場合、モデルの性能が向上したことを示します。")
+                st.write("- チューニング後の方が、これらの指標の値が低い場合、モデルの性能が悪化したことを示します。")
+                st.write("- チューニングによる変化がない場合は、モデルの性能に大きな影響がなかったことを示します。")
                 # チューニング後のモデルを保存
                 if 'tuned_model' in st.session_state:
                     # モデルをバイナリ形式で保存
+                    with open(f"{model_name}.pkl", "wb") as f:
                         joblib.dump(st.session_state.tuned_model, f)
                     # ファイルをbase64エンコードしてダウンロードリンクを作成
+                    with open(f"{model_name}.pkl", "rb") as f:
                         model_file = f.read()
                         model_b64 = base64.b64encode(model_file).decode()
+                        href = f'<a href="data:application/octet-stream;base64,{model_b64}" download="{model_name}.pkl">チューニングされたモデルをダウンロード</a>'
                         st.markdown(href, unsafe_allow_html=True)
                 st.header("6. モデルの可視化及び評価")
                     ('vc', '＜検証曲線＞', 'パラメータの異なる値に対するモデルの性能を示しています'),
                     ('manifold', '＜マニホールド学習＞', '高次元データを2次元にマッピングしたものを示しています')
                 ]
                 for plot_type, plot_name, plot_description in plot_types:
+                    try:
+                        with st.spinner(f'{plot_name}のプロット中...'):
                             img = plot_model(tuned_model, plot=plot_type, display_format="streamlit", save=True)
+                            st.subheader(plot_name)  # グラフのタイトルを追加
                             st.image(img)
                             st.caption(plot_description)  # グラフの説明を追加
+                    except Exception as e:
+                        st.warning(f"{plot_name}の表示中にエラーが発生しました: {str(e)}")
                 # 決定木のプロット
                 if selected_model_name in tree_models:
+                    st.subheader("＜決定木のプロット＞")
                     st.caption("決定木は、モデルがどのように予測を行っているかを理解するのに役立ちます。")
+                    try:
+                        with st.spinner('決定木のプロット中...'):
+                            if selected_model_name in ['dt']:
+                                from sklearn.tree import plot_tree
+                                fig, ax = plt.subplots(figsize=(20,10))
+                                plot_tree(tuned_model, proportion=True, filled=True, rounded=True, ax=ax, max_depth=3, fontsize=14)  # フォントサイズを変更
+                                st.pyplot(fig)
+                            elif selected_model_name in ['rf', 'et']:
+                                from sklearn.tree import plot_tree
+                                fig, ax = plt.subplots(figsize=(20,10))
+                                plot_tree(tuned_model.estimators_[0], feature_names=train_data.columns, proportion=True, filled=True, rounded=True, ax=ax, max_depth=3, fontsize=14)  # フォントサイズを変更
+                                st.pyplot(fig)
+                            elif selected_model_name == 'ada':
+                                from sklearn.tree import plot_tree
+                                base_estimator = tuned_model.get_model().estimators_[0]
+                                fig, ax = plt.subplots(figsize=(20,10))
+                                plot_tree(base_estimator, filled=True, rounded=True, ax=ax, max_depth=3, fontsize=14)  # フォントサイズを変更
+                                st.pyplot(fig)
+                            elif selected_model_name == 'gbr':
+                                from sklearn.tree import plot_tree
+                                base_estimator = tuned_model.get_model().estimators_[0][0]
+                                fig, ax = plt.subplots(figsize=(20,10))
+                                plot_tree(base_estimator, filled=True, rounded=True, ax=ax, max_depth=3, fontsize=14)  # フォントサイズを変更
+                                st.pyplot(fig)
+                            elif selected_model_name == 'catboost':
+                                from catboost import CatBoostClassifier, plot_tree
+                                catboost_model = tuned_model.get_model()
+                                fig, ax = plt.subplots(figsize=(20,10))
+                                plot_tree(catboost_model, tree_idx=0, ax=ax, max_depth=3)
+                                st.pyplot(fig)
+                            elif selected_model_name == 'lightgbm':
+                                import lightgbm as lgb
+                                booster = tuned_model._Booster  # LightGBM Booster object
+                                fig, ax = plt.subplots(figsize=(20,10))
+                                lgb.plot_tree(booster, tree_index=0, ax=ax, max_depth=3)
+                                st.pyplot(fig)
+                            elif selected_model_name == 'xgboost':
+                                import xgboost as xgb
+                                booster = tuned_model.get_booster()  # XGBoost Booster object
+                                fig, ax = plt.subplots(figsize=(20,10))
+                                xgb.plot_tree(booster, num_trees=0, ax=ax, max_depth=3)
+                                st.pyplot(fig)
+                    except Exception as e:
+                        st.warning(f"決定木のプロット中にエラーが発生しました: {str(e)}")