PrabhakarVenkat committed on
Commit
aa79d7c
·
verified ·
1 Parent(s): 25ab4cf

Upload 3 files

Browse files
Random_forest.py/Random_forest_ver1.ipynb ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "29834325",
7
+ "metadata": {
8
+ "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
9
+ "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
10
+ "execution": {
11
+ "iopub.execute_input": "2023-06-28T14:29:11.557719Z",
12
+ "iopub.status.busy": "2023-06-28T14:29:11.557247Z",
13
+ "iopub.status.idle": "2023-06-28T14:29:11.571599Z",
14
+ "shell.execute_reply": "2023-06-28T14:29:11.570549Z"
15
+ },
16
+ "papermill": {
17
+ "duration": 0.026028,
18
+ "end_time": "2023-06-28T14:29:11.574556",
19
+ "exception": false,
20
+ "start_time": "2023-06-28T14:29:11.548528",
21
+ "status": "completed"
22
+ },
23
+ "tags": []
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "# This Python 3 environment comes with many helpful analytics libraries installed\n",
28
+ "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
29
+ "# For example, here's several helpful packages to load\n",
30
+ "\n",
31
+ "import numpy as np # linear algebra\n",
32
+ "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
33
+ "\n",
34
+ "# Input data files are available in the read-only \"../input/\" directory\n",
35
+ "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
36
+ "\n",
37
+ "import os\n",
38
+ "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
39
+ " for filename in filenames:\n",
40
+ " pass\n",
41
+ "# print(os.path.join(dirname, filename))\n",
42
+ "\n",
43
+ "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
44
+ "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 2,
50
+ "id": "68b4799b",
51
+ "metadata": {
52
+ "execution": {
53
+ "iopub.execute_input": "2023-06-28T14:29:11.586208Z",
54
+ "iopub.status.busy": "2023-06-28T14:29:11.585762Z",
55
+ "iopub.status.idle": "2023-06-28T14:29:13.734524Z",
56
+ "shell.execute_reply": "2023-06-28T14:29:13.732965Z"
57
+ },
58
+ "papermill": {
59
+ "duration": 2.158201,
60
+ "end_time": "2023-06-28T14:29:13.737697",
61
+ "exception": false,
62
+ "start_time": "2023-06-28T14:29:11.579496",
63
+ "status": "completed"
64
+ },
65
+ "tags": []
66
+ },
67
+ "outputs": [],
68
+ "source": [
69
+ "import pandas as pd\n",
70
+ "import numpy as np\n",
71
+ "from sklearn.model_selection import train_test_split\n",
72
+ "from sklearn.ensemble import RandomForestClassifier\n",
73
+ "from sklearn.preprocessing import StandardScaler\n",
74
+ "from sklearn.impute import SimpleImputer"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 3,
80
+ "id": "dd1aa6d5",
81
+ "metadata": {
82
+ "execution": {
83
+ "iopub.execute_input": "2023-06-28T14:29:13.749104Z",
84
+ "iopub.status.busy": "2023-06-28T14:29:13.748590Z",
85
+ "iopub.status.idle": "2023-06-28T14:29:13.805019Z",
86
+ "shell.execute_reply": "2023-06-28T14:29:13.803969Z"
87
+ },
88
+ "papermill": {
89
+ "duration": 0.06561,
90
+ "end_time": "2023-06-28T14:29:13.807921",
91
+ "exception": false,
92
+ "start_time": "2023-06-28T14:29:13.742311",
93
+ "status": "completed"
94
+ },
95
+ "tags": []
96
+ },
97
+ "outputs": [],
98
+ "source": [
99
+ "# Open file with pd.read_csv\n",
100
+ "df_train = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/train.csv\")\n",
101
+ "df_test = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/test.csv\")"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 4,
107
+ "id": "563c47ff",
108
+ "metadata": {
109
+ "execution": {
110
+ "iopub.execute_input": "2023-06-28T14:29:13.819160Z",
111
+ "iopub.status.busy": "2023-06-28T14:29:13.818727Z",
112
+ "iopub.status.idle": "2023-06-28T14:29:13.839746Z",
113
+ "shell.execute_reply": "2023-06-28T14:29:13.838298Z"
114
+ },
115
+ "papermill": {
116
+ "duration": 0.030103,
117
+ "end_time": "2023-06-28T14:29:13.843061",
118
+ "exception": false,
119
+ "start_time": "2023-06-28T14:29:13.812958",
120
+ "status": "completed"
121
+ },
122
+ "tags": []
123
+ },
124
+ "outputs": [],
125
+ "source": [
126
+ "# Convert 'A' and 'B' values in 'EJ' column to 0 and 1 respectively\n",
127
+ "df_train['EJ'] = df_train['EJ'].map({'A': 0, 'B': 1})\n",
128
+ "df_test['EJ'] = df_test['EJ'].map({'A': 0, 'B': 1})"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 5,
134
+ "id": "af9245ad",
135
+ "metadata": {
136
+ "execution": {
137
+ "iopub.execute_input": "2023-06-28T14:29:13.853869Z",
138
+ "iopub.status.busy": "2023-06-28T14:29:13.853426Z",
139
+ "iopub.status.idle": "2023-06-28T14:29:13.867982Z",
140
+ "shell.execute_reply": "2023-06-28T14:29:13.866486Z"
141
+ },
142
+ "papermill": {
143
+ "duration": 0.022904,
144
+ "end_time": "2023-06-28T14:29:13.870386",
145
+ "exception": false,
146
+ "start_time": "2023-06-28T14:29:13.847482",
147
+ "status": "completed"
148
+ },
149
+ "tags": []
150
+ },
151
+ "outputs": [],
152
+ "source": [
153
+ "# Split the training data into features (X) and target variable (y)\n",
154
+ "X_train = df_train.drop([\"Class\", \"Id\"], axis=1) # Exclude non-numeric columns\n",
155
+ "y_train = df_train[\"Class\"]\n",
156
+ "\n",
157
+ "# Split the test data into features (X_test)\n",
158
+ "X_test = df_test.drop(\"Id\", axis=1)"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 6,
164
+ "id": "48963e25",
165
+ "metadata": {
166
+ "execution": {
167
+ "iopub.execute_input": "2023-06-28T14:29:13.881371Z",
168
+ "iopub.status.busy": "2023-06-28T14:29:13.880917Z",
169
+ "iopub.status.idle": "2023-06-28T14:29:13.900968Z",
170
+ "shell.execute_reply": "2023-06-28T14:29:13.899934Z"
171
+ },
172
+ "papermill": {
173
+ "duration": 0.029018,
174
+ "end_time": "2023-06-28T14:29:13.903834",
175
+ "exception": false,
176
+ "start_time": "2023-06-28T14:29:13.874816",
177
+ "status": "completed"
178
+ },
179
+ "tags": []
180
+ },
181
+ "outputs": [],
182
+ "source": [
183
+ "# Identify columns with missing values\n",
184
+ "columns_with_missing = X_train.columns[X_train.isna().any()].tolist()\n",
185
+ "\n",
186
+ "# Impute missing values with the mean of each column\n",
187
+ "imputer = SimpleImputer(strategy='mean')\n",
188
+ "X_train_imputed = imputer.fit_transform(X_train)\n",
189
+ "X_test_imputed = imputer.transform(X_test)\n",
190
+ "\n",
191
+ "# Scale the features using StandardScaler\n",
192
+ "scaler = StandardScaler()\n",
193
+ "X_train_scaled = scaler.fit_transform(X_train_imputed)\n",
194
+ "X_test_scaled = scaler.transform(X_test_imputed)"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 7,
200
+ "id": "7c337184",
201
+ "metadata": {
202
+ "execution": {
203
+ "iopub.execute_input": "2023-06-28T14:29:13.915609Z",
204
+ "iopub.status.busy": "2023-06-28T14:29:13.914400Z",
205
+ "iopub.status.idle": "2023-06-28T14:29:14.392939Z",
206
+ "shell.execute_reply": "2023-06-28T14:29:14.391879Z"
207
+ },
208
+ "papermill": {
209
+ "duration": 0.487453,
210
+ "end_time": "2023-06-28T14:29:14.395785",
211
+ "exception": false,
212
+ "start_time": "2023-06-28T14:29:13.908332",
213
+ "status": "completed"
214
+ },
215
+ "tags": []
216
+ },
217
+ "outputs": [],
218
+ "source": [
219
+ "# Get feature importances\n",
220
+ "rfc = RandomForestClassifier()\n",
221
+ "rfc.fit(X_train_scaled, y_train)\n",
222
+ "feature_importances = rfc.feature_importances_\n",
223
+ "\n",
224
+ "# Create a DataFrame for feature importance\n",
225
+ "importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})\n",
226
+ "\n",
227
+ "# Sort the features by importance (descending order)\n",
228
+ "importance_df = importance_df.sort_values(by='Importance', ascending=False)"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": 8,
234
+ "id": "ce5fddae",
235
+ "metadata": {
236
+ "execution": {
237
+ "iopub.execute_input": "2023-06-28T14:29:14.406819Z",
238
+ "iopub.status.busy": "2023-06-28T14:29:14.406345Z",
239
+ "iopub.status.idle": "2023-06-28T14:29:14.413437Z",
240
+ "shell.execute_reply": "2023-06-28T14:29:14.412266Z"
241
+ },
242
+ "papermill": {
243
+ "duration": 0.015929,
244
+ "end_time": "2023-06-28T14:29:14.416226",
245
+ "exception": false,
246
+ "start_time": "2023-06-28T14:29:14.400297",
247
+ "status": "completed"
248
+ },
249
+ "tags": []
250
+ },
251
+ "outputs": [],
252
+ "source": [
253
+ "# Select the top important variables\n",
254
+ "num_variables = 10 # Specify the number of top important variables to use\n",
255
+ "important_variables = importance_df['Feature'].tolist()[:num_variables]\n",
256
+ "X_train_important = X_train_scaled[:, importance_df.index[:num_variables]]\n",
257
+ "X_test_important = X_test_scaled[:, importance_df.index[:num_variables]]"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": 9,
263
+ "id": "4e746beb",
264
+ "metadata": {
265
+ "execution": {
266
+ "iopub.execute_input": "2023-06-28T14:29:14.427650Z",
267
+ "iopub.status.busy": "2023-06-28T14:29:14.427116Z",
268
+ "iopub.status.idle": "2023-06-28T14:29:14.756684Z",
269
+ "shell.execute_reply": "2023-06-28T14:29:14.755491Z"
270
+ },
271
+ "papermill": {
272
+ "duration": 0.338831,
273
+ "end_time": "2023-06-28T14:29:14.759951",
274
+ "exception": false,
275
+ "start_time": "2023-06-28T14:29:14.421120",
276
+ "status": "completed"
277
+ },
278
+ "tags": []
279
+ },
280
+ "outputs": [],
281
+ "source": [
282
+ "# Train the random forest model using only the important variables\n",
283
+ "rfc_important = RandomForestClassifier()\n",
284
+ "rfc_important.fit(X_train_important, y_train)\n",
285
+ "\n",
286
+ "# Predict on the test set using only the important variables\n",
287
+ "rfc_pred = rfc_important.predict(X_test_important)\n"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": 10,
293
+ "id": "13cf4b5b",
294
+ "metadata": {
295
+ "execution": {
296
+ "iopub.execute_input": "2023-06-28T14:29:14.771894Z",
297
+ "iopub.status.busy": "2023-06-28T14:29:14.771075Z",
298
+ "iopub.status.idle": "2023-06-28T14:29:14.796398Z",
299
+ "shell.execute_reply": "2023-06-28T14:29:14.795487Z"
300
+ },
301
+ "papermill": {
302
+ "duration": 0.034975,
303
+ "end_time": "2023-06-28T14:29:14.799451",
304
+ "exception": false,
305
+ "start_time": "2023-06-28T14:29:14.764476",
306
+ "status": "completed"
307
+ },
308
+ "tags": []
309
+ },
310
+ "outputs": [],
311
+ "source": [
312
+ "# Predict probabilities for each class in the test set\n",
313
+ "rfc_pred_proba = rfc.predict_proba(X_test_scaled)\n",
314
+ "\n",
315
+ "# Create a DataFrame to store the predictions\n",
316
+ "predictions_df = pd.DataFrame({'Id': df_test['Id'],\n",
317
+ " 'class_0': rfc_pred_proba[:, 0],\n",
318
+ " 'class_1': rfc_pred_proba[:, 1]})\n",
319
+ "\n",
320
+ "# Save the predictions to a CSV file\n",
321
+ "predictions_df.to_csv('submission.csv', index=False)"
322
+ ]
323
+ }
324
+ ],
325
+ "metadata": {
326
+ "kernelspec": {
327
+ "display_name": "Python 3",
328
+ "language": "python",
329
+ "name": "python3"
330
+ },
331
+ "language_info": {
332
+ "codemirror_mode": {
333
+ "name": "ipython",
334
+ "version": 3
335
+ },
336
+ "file_extension": ".py",
337
+ "mimetype": "text/x-python",
338
+ "name": "python",
339
+ "nbconvert_exporter": "python",
340
+ "pygments_lexer": "ipython3",
341
+ "version": "3.10.10"
342
+ },
343
+ "papermill": {
344
+ "default_parameters": {},
345
+ "duration": 18.313658,
346
+ "end_time": "2023-06-28T14:29:16.232503",
347
+ "environment_variables": {},
348
+ "exception": null,
349
+ "input_path": "__notebook__.ipynb",
350
+ "output_path": "__notebook__.ipynb",
351
+ "parameters": {},
352
+ "start_time": "2023-06-28T14:28:57.918845",
353
+ "version": "2.4.0"
354
+ }
355
+ },
356
+ "nbformat": 4,
357
+ "nbformat_minor": 5
358
+ }
Random_forest.py/Random_forest_ver2.ipynb ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "75418eb6",
7
+ "metadata": {
8
+ "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
9
+ "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
10
+ "execution": {
11
+ "iopub.execute_input": "2023-07-06T13:42:23.141738Z",
12
+ "iopub.status.busy": "2023-07-06T13:42:23.141143Z",
13
+ "iopub.status.idle": "2023-07-06T13:42:23.155666Z",
14
+ "shell.execute_reply": "2023-07-06T13:42:23.154445Z"
15
+ },
16
+ "papermill": {
17
+ "duration": 0.021833,
18
+ "end_time": "2023-07-06T13:42:23.158621",
19
+ "exception": false,
20
+ "start_time": "2023-07-06T13:42:23.136788",
21
+ "status": "completed"
22
+ },
23
+ "tags": []
24
+ },
25
+ "outputs": [
26
+ {
27
+ "name": "stdout",
28
+ "output_type": "stream",
29
+ "text": [
30
+ "/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv\n",
31
+ "/kaggle/input/icr-identify-age-related-conditions/greeks.csv\n",
32
+ "/kaggle/input/icr-identify-age-related-conditions/train.csv\n",
33
+ "/kaggle/input/icr-identify-age-related-conditions/test.csv\n"
34
+ ]
35
+ }
36
+ ],
37
+ "source": [
38
+ "# This Python 3 environment comes with many helpful analytics libraries installed\n",
39
+ "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
40
+ "# For example, here's several helpful packages to load\n",
41
+ "\n",
42
+ "import numpy as np # linear algebra\n",
43
+ "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
44
+ "\n",
45
+ "# Input data files are available in the read-only \"../input/\" directory\n",
46
+ "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
47
+ "\n",
48
+ "import os\n",
49
+ "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
50
+ " for filename in filenames:\n",
51
+ " print(os.path.join(dirname, filename))\n",
52
+ "\n",
53
+ "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
54
+ "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 2,
60
+ "id": "21694925",
61
+ "metadata": {
62
+ "execution": {
63
+ "iopub.execute_input": "2023-07-06T13:42:23.164800Z",
64
+ "iopub.status.busy": "2023-07-06T13:42:23.164345Z",
65
+ "iopub.status.idle": "2023-07-06T13:43:47.729268Z",
66
+ "shell.execute_reply": "2023-07-06T13:43:47.728318Z"
67
+ },
68
+ "papermill": {
69
+ "duration": 84.570727,
70
+ "end_time": "2023-07-06T13:43:47.731786",
71
+ "exception": false,
72
+ "start_time": "2023-07-06T13:42:23.161059",
73
+ "status": "completed"
74
+ },
75
+ "tags": []
76
+ },
77
+ "outputs": [],
78
+ "source": [
79
+ "import pandas as pd\n",
80
+ "import numpy as np\n",
81
+ "from sklearn.model_selection import train_test_split\n",
82
+ "from sklearn.preprocessing import StandardScaler\n",
83
+ "from sklearn.impute import SimpleImputer\n",
84
+ "from imblearn.over_sampling import RandomOverSampler\n",
85
+ "from sklearn.model_selection import GridSearchCV\n",
86
+ "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier\n",
87
+ "\n",
88
+ "# Open file with pd.read_csv\n",
89
+ "df_train = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/train.csv\")\n",
90
+ "df_test = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/test.csv\")\n",
91
+ "\n",
92
+ "# Convert 'A' and 'B' values in 'EJ' column to 0 and 1 respectively\n",
93
+ "df_train['EJ'] = df_train['EJ'].map({'A': 0, 'B': 1})\n",
94
+ "df_test['EJ'] = df_test['EJ'].map({'A': 0, 'B': 1})\n",
95
+ "\n",
96
+ "# Split the training data into features (X) and target variable (y)\n",
97
+ "X_train = df_train.drop([\"Class\", \"Id\"], axis=1) # Exclude non-numeric columns\n",
98
+ "y_train = df_train[\"Class\"]\n",
99
+ "\n",
100
+ "# Split the test data into features (X_test)\n",
101
+ "X_test = df_test.drop(\"Id\", axis=1)\n",
102
+ "\n",
103
+ "# Identify columns with missing values\n",
104
+ "columns_with_missing = X_train.columns[X_train.isna().any()].tolist()\n",
105
+ "\n",
106
+ "# Impute missing values with the mean of each column\n",
107
+ "imputer = SimpleImputer(strategy='mean')\n",
108
+ "X_train_imputed = imputer.fit_transform(X_train)\n",
109
+ "X_test_imputed = imputer.transform(X_test)\n",
110
+ "\n",
111
+ "# Scale the features using StandardScaler\n",
112
+ "scaler = StandardScaler()\n",
113
+ "X_train_scaled = scaler.fit_transform(X_train_imputed)\n",
114
+ "X_test_scaled = scaler.transform(X_test_imputed)\n",
115
+ "\n",
116
+ "# Handling class imbalance using oversampling\n",
117
+ "oversampler = RandomOverSampler(random_state=42)\n",
118
+ "X_train_scaled, y_train = oversampler.fit_resample(X_train_scaled, y_train)\n",
119
+ "\n",
120
+ "# Hyperparameter tuning for Random Forest Classifier\n",
121
+ "rfc = RandomForestClassifier(n_estimators=100, random_state=42)\n",
122
+ "param_grid = {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]}\n",
123
+ "grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='neg_log_loss')\n",
124
+ "grid_search.fit(X_train_scaled, y_train)\n",
125
+ "best_rfc = grid_search.best_estimator_\n",
126
+ "\n",
127
+ "# Hyperparameter tuning for Gradient Boosting Classifier\n",
128
+ "gbc = GradientBoostingClassifier(n_estimators=100, random_state=42)\n",
129
+ "param_grid = {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 1.0]}\n",
130
+ "grid_search = GridSearchCV(gbc, param_grid, cv=5, scoring='neg_log_loss')\n",
131
+ "grid_search.fit(X_train_scaled, y_train)\n",
132
+ "best_gbc = grid_search.best_estimator_\n",
133
+ "\n",
134
+ "# Ensemble of models\n",
135
+ "ensemble_model = VotingClassifier(estimators=[('rfc', best_rfc), ('gbc', best_gbc)], voting='soft')\n",
136
+ "ensemble_model.fit(X_train_scaled, y_train)\n",
137
+ "\n",
138
+ "# Predict probabilities for each class in the test set\n",
139
+ "ensemble_pred_proba = ensemble_model.predict_proba(X_test_scaled)\n",
140
+ "\n",
141
+ "# Create a DataFrame to store the predictions\n",
142
+ "predictions_df = pd.DataFrame({'Id': df_test['Id'],\n",
143
+ " 'class_0': ensemble_pred_proba[:, 0],\n",
144
+ " 'class_1': ensemble_pred_proba[:, 1]})\n",
145
+ "\n",
146
+ "# Save the predictions to a CSV file\n",
147
+ "predictions_df.to_csv('submission.csv', index=False)\n",
148
+ " "
149
+ ]
150
+ }
151
+ ],
152
+ "metadata": {
153
+ "kernelspec": {
154
+ "display_name": "Python 3",
155
+ "language": "python",
156
+ "name": "python3"
157
+ },
158
+ "language_info": {
159
+ "codemirror_mode": {
160
+ "name": "ipython",
161
+ "version": 3
162
+ },
163
+ "file_extension": ".py",
164
+ "mimetype": "text/x-python",
165
+ "name": "python",
166
+ "nbconvert_exporter": "python",
167
+ "pygments_lexer": "ipython3",
168
+ "version": "3.10.10"
169
+ },
170
+ "papermill": {
171
+ "default_parameters": {},
172
+ "duration": 97.427632,
173
+ "end_time": "2023-07-06T13:43:48.755891",
174
+ "environment_variables": {},
175
+ "exception": null,
176
+ "input_path": "__notebook__.ipynb",
177
+ "output_path": "__notebook__.ipynb",
178
+ "parameters": {},
179
+ "start_time": "2023-07-06T13:42:11.328259",
180
+ "version": "2.4.0"
181
+ }
182
+ },
183
+ "nbformat": 4,
184
+ "nbformat_minor": 5
185
+ }
Random_forest.py/Random_forest_ver3.ipynb ADDED
The diff for this file is too large to render. See raw diff