Upload 3 files

Browse files

Files changed (3) hide show

Random_forest.py/Random_forest_ver1.ipynb +358 -0
Random_forest.py/Random_forest_ver2.ipynb +185 -0
Random_forest.py/Random_forest_ver3.ipynb +0 -0

Random_forest.py/Random_forest_ver1.ipynb ADDED Viewed

	@@ -0,0 +1,358 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "29834325",
+   "metadata": {
+    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
+    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
+    "execution": {
+     "iopub.execute_input": "2023-06-28T14:29:11.557719Z",
+     "iopub.status.busy": "2023-06-28T14:29:11.557247Z",
+     "iopub.status.idle": "2023-06-28T14:29:11.571599Z",
+     "shell.execute_reply": "2023-06-28T14:29:11.570549Z"
+    },
+    "papermill": {
+     "duration": 0.026028,
+     "end_time": "2023-06-28T14:29:11.574556",
+     "exception": false,
+     "start_time": "2023-06-28T14:29:11.548528",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# This Python 3 environment comes with many helpful analytics libraries installed\n",
+    "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
+    "# For example, here's several helpful packages to load\n",
+    "\n",
+    "import numpy as np # linear algebra\n",
+    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
+    "\n",
+    "# Input data files are available in the read-only \"../input/\" directory\n",
+    "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
+    "\n",
+    "import os\n",
+    "# Directory listing disabled: the input files used below are read by explicit path.\n",
+    "\n",
+    "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
+    "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "68b4799b",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-06-28T14:29:11.586208Z",
+     "iopub.status.busy": "2023-06-28T14:29:11.585762Z",
+     "iopub.status.idle": "2023-06-28T14:29:13.734524Z",
+     "shell.execute_reply": "2023-06-28T14:29:13.732965Z"
+    },
+    "papermill": {
+     "duration": 2.158201,
+     "end_time": "2023-06-28T14:29:13.737697",
+     "exception": false,
+     "start_time": "2023-06-28T14:29:11.579496",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.impute import SimpleImputer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "dd1aa6d5",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-06-28T14:29:13.749104Z",
+     "iopub.status.busy": "2023-06-28T14:29:13.748590Z",
+     "iopub.status.idle": "2023-06-28T14:29:13.805019Z",
+     "shell.execute_reply": "2023-06-28T14:29:13.803969Z"
+    },
+    "papermill": {
+     "duration": 0.06561,
+     "end_time": "2023-06-28T14:29:13.807921",
+     "exception": false,
+     "start_time": "2023-06-28T14:29:13.742311",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Read the competition's training and test tables into DataFrames\n",
+    "train_csv = \"/kaggle/input/icr-identify-age-related-conditions/train.csv\"\n",
+    "test_csv = \"/kaggle/input/icr-identify-age-related-conditions/test.csv\"\n",
+    "df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "563c47ff",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-06-28T14:29:13.819160Z",
+     "iopub.status.busy": "2023-06-28T14:29:13.818727Z",
+     "iopub.status.idle": "2023-06-28T14:29:13.839746Z",
+     "shell.execute_reply": "2023-06-28T14:29:13.838298Z"
+    },
+    "papermill": {
+     "duration": 0.030103,
+     "end_time": "2023-06-28T14:29:13.843061",
+     "exception": false,
+     "start_time": "2023-06-28T14:29:13.812958",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Encode the 'EJ' column numerically ('A' -> 0, 'B' -> 1) in both the train and test frames.\n",
+    "# NOTE: Series.map turns every value that is not a key of the dict into NaN, so running\n",
+    "# this cell a second time (when the column already holds 0/1) would blank the column.\n",
+    "ej_codes = {'A': 0, 'B': 1}\n",
+    "for frame in (df_train, df_test):\n",
+    "    frame['EJ'] = frame['EJ'].map(ej_codes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "af9245ad",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-06-28T14:29:13.853869Z",
+     "iopub.status.busy": "2023-06-28T14:29:13.853426Z",
+     "iopub.status.idle": "2023-06-28T14:29:13.867982Z",
+     "shell.execute_reply": "2023-06-28T14:29:13.866486Z"
+    },
+    "papermill": {
+     "duration": 0.022904,
+     "end_time": "2023-06-28T14:29:13.870386",
+     "exception": false,
+     "start_time": "2023-06-28T14:29:13.847482",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Target vector: the 'Class' column of the training frame\n",
+    "y_train = df_train[\"Class\"]\n",
+    "\n",
+    "# Training features: every column except the 'Class' target and the 'Id' column\n",
+    "X_train = df_train.drop(columns=[\"Class\", \"Id\"])\n",
+    "\n",
+    "# Test features: only the 'Id' column is dropped from the test frame\n",
+    "X_test = df_test.drop(columns=\"Id\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "48963e25",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-06-28T14:29:13.881371Z",
+     "iopub.status.busy": "2023-06-28T14:29:13.880917Z",
+     "iopub.status.idle": "2023-06-28T14:29:13.900968Z",
+     "shell.execute_reply": "2023-06-28T14:29:13.899934Z"
+    },
+    "papermill": {
+     "duration": 0.029018,
+     "end_time": "2023-06-28T14:29:13.903834",
+     "exception": false,
+     "start_time": "2023-06-28T14:29:13.874816",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Names of the training columns that contain at least one missing value\n",
+    "has_missing = X_train.isna().any()\n",
+    "columns_with_missing = X_train.columns[has_missing].tolist()\n",
+    "\n",
+    "# Mean imputation: the column means are learned on the training features only and\n",
+    "# then reused to fill the test features\n",
+    "imputer = SimpleImputer(strategy='mean')\n",
+    "X_train_imputed = imputer.fit(X_train).transform(X_train)\n",
+    "X_test_imputed = imputer.transform(X_test)\n",
+    "\n",
+    "# Standardization: the scaler is likewise fitted on the (imputed) training features\n",
+    "# and applied unchanged to the test features\n",
+    "scaler = StandardScaler()\n",
+    "X_train_scaled = scaler.fit(X_train_imputed).transform(X_train_imputed)\n",
+    "X_test_scaled = scaler.transform(X_test_imputed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "7c337184",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-06-28T14:29:13.915609Z",
+     "iopub.status.busy": "2023-06-28T14:29:13.914400Z",
+     "iopub.status.idle": "2023-06-28T14:29:14.392939Z",
+     "shell.execute_reply": "2023-06-28T14:29:14.391879Z"
+    },
+    "papermill": {
+     "duration": 0.487453,
+     "end_time": "2023-06-28T14:29:14.395785",
+     "exception": false,
+     "start_time": "2023-06-28T14:29:13.908332",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Fit a forest on all features to obtain a per-feature importance score.\n",
+    "# random_state is fixed so the ranking (and the top-N selection derived from it)\n",
+    "# is the same on every Restart & Run All.\n",
+    "rfc = RandomForestClassifier(random_state=42)\n",
+    "rfc.fit(X_train_scaled, y_train)\n",
+    "feature_importances = rfc.feature_importances_\n",
+    "\n",
+    "# One row per feature. The default integer index equals each feature's column position\n",
+    "# in X_train_scaled; sort_values keeps those labels, and the selection cell uses them\n",
+    "# to pick columns out of the scaled arrays.\n",
+    "importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})\n",
+    "\n",
+    "# Most important feature first\n",
+    "importance_df = importance_df.sort_values(by='Importance', ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ce5fddae",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-06-28T14:29:14.406819Z",
+     "iopub.status.busy": "2023-06-28T14:29:14.406345Z",
+     "iopub.status.idle": "2023-06-28T14:29:14.413437Z",
+     "shell.execute_reply": "2023-06-28T14:29:14.412266Z"
+    },
+    "papermill": {
+     "duration": 0.015929,
+     "end_time": "2023-06-28T14:29:14.416226",
+     "exception": false,
+     "start_time": "2023-06-28T14:29:14.400297",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Keep only the highest-ranked features\n",
+    "num_variables = 10  # how many top-ranked features to keep\n",
+    "top_rows = importance_df.iloc[:num_variables]\n",
+    "important_variables = top_rows['Feature'].tolist()\n",
+    "\n",
+    "# The index labels of importance_df are the features' original column positions,\n",
+    "# so they can index the columns of the scaled NumPy arrays directly.\n",
+    "top_positions = top_rows.index\n",
+    "X_train_important = X_train_scaled[:, top_positions]\n",
+    "X_test_important = X_test_scaled[:, top_positions]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "4e746beb",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-06-28T14:29:14.427650Z",
+     "iopub.status.busy": "2023-06-28T14:29:14.427116Z",
+     "iopub.status.idle": "2023-06-28T14:29:14.756684Z",
+     "shell.execute_reply": "2023-06-28T14:29:14.755491Z"
+    },
+    "papermill": {
+     "duration": 0.338831,
+     "end_time": "2023-06-28T14:29:14.759951",
+     "exception": false,
+     "start_time": "2023-06-28T14:29:14.421120",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Train a second forest on the selected features only.\n",
+    "# random_state is fixed so the fitted model, and therefore its predictions, are reproducible.\n",
+    "rfc_important = RandomForestClassifier(random_state=42)\n",
+    "rfc_important.fit(X_train_important, y_train)\n",
+    "\n",
+    "# Hard class predictions for the test set from the reduced-feature model\n",
+    "rfc_pred = rfc_important.predict(X_test_important)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "13cf4b5b",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-06-28T14:29:14.771894Z",
+     "iopub.status.busy": "2023-06-28T14:29:14.771075Z",
+     "iopub.status.idle": "2023-06-28T14:29:14.796398Z",
+     "shell.execute_reply": "2023-06-28T14:29:14.795487Z"
+    },
+    "papermill": {
+     "duration": 0.034975,
+     "end_time": "2023-06-28T14:29:14.799451",
+     "exception": false,
+     "start_time": "2023-06-28T14:29:14.764476",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Predict class probabilities with the model trained on the selected features.\n",
+    "# The full-feature model `rfc` only serves to rank the features; scoring the test set\n",
+    "# with it would silently discard the feature selection and the `rfc_important` model.\n",
+    "rfc_pred_proba = rfc_important.predict_proba(X_test_important)\n",
+    "\n",
+    "# Build the submission table. predict_proba columns follow rfc_important.classes_\n",
+    "# (assumed to be [0, 1] for this target - confirm against the training labels).\n",
+    "predictions_df = pd.DataFrame({'Id': df_test['Id'],\n",
+    "                               'class_0': rfc_pred_proba[:, 0],\n",
+    "                               'class_1': rfc_pred_proba[:, 1]})\n",
+    "\n",
+    "# Save the predictions to a CSV file\n",
+    "predictions_df.to_csv('submission.csv', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  },
+  "papermill": {
+   "default_parameters": {},
+   "duration": 18.313658,
+   "end_time": "2023-06-28T14:29:16.232503",
+   "environment_variables": {},
+   "exception": null,
+   "input_path": "__notebook__.ipynb",
+   "output_path": "__notebook__.ipynb",
+   "parameters": {},
+   "start_time": "2023-06-28T14:28:57.918845",
+   "version": "2.4.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

Random_forest.py/Random_forest_ver2.ipynb ADDED Viewed

	@@ -0,0 +1,185 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "75418eb6",
+   "metadata": {
+    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
+    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
+    "execution": {
+     "iopub.execute_input": "2023-07-06T13:42:23.141738Z",
+     "iopub.status.busy": "2023-07-06T13:42:23.141143Z",
+     "iopub.status.idle": "2023-07-06T13:42:23.155666Z",
+     "shell.execute_reply": "2023-07-06T13:42:23.154445Z"
+    },
+    "papermill": {
+     "duration": 0.021833,
+     "end_time": "2023-07-06T13:42:23.158621",
+     "exception": false,
+     "start_time": "2023-07-06T13:42:23.136788",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv\n",
+      "/kaggle/input/icr-identify-age-related-conditions/greeks.csv\n",
+      "/kaggle/input/icr-identify-age-related-conditions/train.csv\n",
+      "/kaggle/input/icr-identify-age-related-conditions/test.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# This Python 3 environment comes with many helpful analytics libraries installed\n",
+    "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
+    "# For example, here's several helpful packages to load\n",
+    "\n",
+    "import numpy as np # linear algebra\n",
+    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
+    "\n",
+    "# Input data files are available in the read-only \"../input/\" directory\n",
+    "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
+    "\n",
+    "import os\n",
+    "for root, _, names in os.walk('/kaggle/input'):\n",
+    "    for name in names:\n",
+    "        print(os.path.join(root, name))\n",
+    "\n",
+    "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
+    "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "21694925",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-06T13:42:23.164800Z",
+     "iopub.status.busy": "2023-07-06T13:42:23.164345Z",
+     "iopub.status.idle": "2023-07-06T13:43:47.729268Z",
+     "shell.execute_reply": "2023-07-06T13:43:47.728318Z"
+    },
+    "papermill": {
+     "duration": 84.570727,
+     "end_time": "2023-07-06T13:43:47.731786",
+     "exception": false,
+     "start_time": "2023-07-06T13:42:23.161059",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from imblearn.over_sampling import RandomOverSampler\n",
+    "from imblearn.pipeline import Pipeline\n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier\n",
+    "\n",
+    "# Single seed shared by the sampler and both classifiers\n",
+    "SEED = 42\n",
+    "\n",
+    "# Open file with pd.read_csv\n",
+    "df_train = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/train.csv\")\n",
+    "df_test = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/test.csv\")\n",
+    "\n",
+    "# Convert 'A' and 'B' values in 'EJ' column to 0 and 1 respectively\n",
+    "df_train['EJ'] = df_train['EJ'].map({'A': 0, 'B': 1})\n",
+    "df_test['EJ'] = df_test['EJ'].map({'A': 0, 'B': 1})\n",
+    "\n",
+    "# Split the training data into features (X) and target variable (y);\n",
+    "# the 'Class' target and the 'Id' column are not used as features\n",
+    "X_train = df_train.drop([\"Class\", \"Id\"], axis=1)\n",
+    "y_train = df_train[\"Class\"]\n",
+    "\n",
+    "# Split the test data into features (X_test)\n",
+    "X_test = df_test.drop(\"Id\", axis=1)\n",
+    "\n",
+    "# Impute missing values with the mean of each column (means learned on the training set)\n",
+    "imputer = SimpleImputer(strategy='mean')\n",
+    "X_train_imputed = imputer.fit_transform(X_train)\n",
+    "X_test_imputed = imputer.transform(X_test)\n",
+    "\n",
+    "# Scale the features using StandardScaler (fitted on the training set)\n",
+    "scaler = StandardScaler()\n",
+    "X_train_scaled = scaler.fit_transform(X_train_imputed)\n",
+    "X_test_scaled = scaler.transform(X_test_imputed)\n",
+    "\n",
+    "\n",
+    "def tune_with_oversampling(estimator, param_grid):\n",
+    "    \"\"\"Grid-search `estimator` by log loss, oversampling inside each CV training fold.\n",
+    "\n",
+    "    Oversampling the whole training set before cross-validation puts copies of the\n",
+    "    same minority rows into both the training and the validation folds, which inflates\n",
+    "    the scores used to choose the hyperparameters. Wrapping the sampler in an imblearn\n",
+    "    Pipeline resamples only the data each fold is fitted on; scoring and prediction\n",
+    "    run on untouched rows.\n",
+    "\n",
+    "    Args:\n",
+    "        estimator: unfitted classifier to tune.\n",
+    "        param_grid: dict mapping estimator parameter names to candidate values.\n",
+    "\n",
+    "    Returns:\n",
+    "        The best pipeline (oversampler + classifier), refitted on the full training set.\n",
+    "    \"\"\"\n",
+    "    pipeline = Pipeline([('oversample', RandomOverSampler(random_state=SEED)),\n",
+    "                         ('clf', estimator)])\n",
+    "    # Pipeline parameters are addressed as '<step>__<param>'\n",
+    "    grid = {f'clf__{name}': values for name, values in param_grid.items()}\n",
+    "    search = GridSearchCV(pipeline, grid, cv=5, scoring='neg_log_loss')\n",
+    "    search.fit(X_train_scaled, y_train)\n",
+    "    return search.best_estimator_\n",
+    "\n",
+    "\n",
+    "# Hyperparameter tuning for Random Forest Classifier\n",
+    "best_rfc = tune_with_oversampling(\n",
+    "    RandomForestClassifier(n_estimators=100, random_state=SEED),\n",
+    "    {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]},\n",
+    ")\n",
+    "\n",
+    "# Hyperparameter tuning for Gradient Boosting Classifier\n",
+    "best_gbc = tune_with_oversampling(\n",
+    "    GradientBoostingClassifier(n_estimators=100, random_state=SEED),\n",
+    "    {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 1.0]},\n",
+    ")\n",
+    "\n",
+    "# Ensemble of models. Each member is an oversampler+classifier pipeline, so the class\n",
+    "# balancing is re-applied when VotingClassifier refits its members on the training data.\n",
+    "ensemble_model = VotingClassifier(estimators=[('rfc', best_rfc), ('gbc', best_gbc)], voting='soft')\n",
+    "ensemble_model.fit(X_train_scaled, y_train)\n",
+    "\n",
+    "# Predict probabilities for each class in the test set\n",
+    "ensemble_pred_proba = ensemble_model.predict_proba(X_test_scaled)\n",
+    "\n",
+    "# Create a DataFrame to store the predictions\n",
+    "predictions_df = pd.DataFrame({'Id': df_test['Id'],\n",
+    "                               'class_0': ensemble_pred_proba[:, 0],\n",
+    "                               'class_1': ensemble_pred_proba[:, 1]})\n",
+    "\n",
+    "# Save the predictions to a CSV file\n",
+    "predictions_df.to_csv('submission.csv', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  },
+  "papermill": {
+   "default_parameters": {},
+   "duration": 97.427632,
+   "end_time": "2023-07-06T13:43:48.755891",
+   "environment_variables": {},
+   "exception": null,
+   "input_path": "__notebook__.ipynb",
+   "output_path": "__notebook__.ipynb",
+   "parameters": {},
+   "start_time": "2023-07-06T13:42:11.328259",
+   "version": "2.4.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

Random_forest.py/Random_forest_ver3.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff