Arafath10 committed on
Commit
1608dda
1 Parent(s): 78818fd

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +29 -2
main.py CHANGED
@@ -8,6 +8,7 @@ import os,datetime
8
  import pandas as pd
9
  from sklearn.model_selection import train_test_split, GridSearchCV
10
  from sklearn.preprocessing import LabelEncoder
 
11
  from xgboost import XGBClassifier
12
  from sklearn.metrics import accuracy_score, classification_report
13
  from joblib import dump, load
@@ -49,11 +50,12 @@ def train_the_model():
49
  y = data_filled['status_name']
50
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
51
 
 
52
  # Parameters to use for the model
53
  params = {
54
  'colsample_bytree': 0.3,
55
- 'learning_rate': 1,
56
- 'max_depth': 16,
57
  'n_estimators': 100,
58
  'subsample': 0.9,
59
  'use_label_encoder': False,
@@ -132,6 +134,31 @@ async def your_continuous_function(page: str,paginate: str):
132
  df.to_csv("trainer_data.csv")
133
  print("data created")
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
 
137
  accuracy,classification_rep,message = train_the_model()
 
8
  import pandas as pd
9
  from sklearn.model_selection import train_test_split, GridSearchCV
10
  from sklearn.preprocessing import LabelEncoder
11
+ from sklearn.utils import resample
12
  from xgboost import XGBClassifier
13
  from sklearn.metrics import accuracy_score, classification_report
14
  from joblib import dump, load
 
50
  y = data_filled['status_name']
51
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
52
 
53
+ # Parameters to use for the model
54
  # Parameters to use for the model
55
  params = {
56
  'colsample_bytree': 0.3,
57
+ 'learning_rate': 0.6,
58
+ 'max_depth': 6,
59
  'n_estimators': 100,
60
  'subsample': 0.9,
61
  'use_label_encoder': False,
 
134
  df.to_csv("trainer_data.csv")
135
  print("data created")
136
 
137
+ # Load the dataset
138
+ file_path = 'trainer_data.csv' # Update to the correct file path
139
+
140
+ # Analyze class distribution
141
+ class_distribution = data['status_name'].value_counts()
142
+ print("Class Distribution before balancing:\n", class_distribution)
143
+
144
+ # Get the size of the largest class to match other classes' sizes
145
+ max_class_size = class_distribution.max()
146
+
147
+ # Oversampling
148
+ oversampled_data = pd.DataFrame()
149
+ for class_name, group in data.groupby('status_name'):
150
+ oversampled_group = resample(group,
151
+ replace=True, # Sample with replacement
152
+ n_samples=max_class_size, # to match majority class
153
+ random_state=123) # for reproducibility
154
+ oversampled_data = pd.concat([oversampled_data, oversampled_group], axis=0)
155
+
156
+ # Verify new class distribution
157
+ print("Class Distribution after oversampling:\n", oversampled_data['status_name'].value_counts())
158
+
159
+ # Save the balanced dataset if needed
160
+ oversampled_data.to_csv('trainer_data.csv', index=False)
161
+
162
 
163
 
164
  accuracy,classification_rep,message = train_the_model()