transexpress_ml_api

Sleeping

App Files Files Community

Arafath10 committed on Apr 30

Commit

1608dda

•

1 Parent(s): 78818fd

Update main.py

Browse files

Files changed (1) hide show

main.py +29 -2

main.py CHANGED Viewed

@@ -8,6 +8,7 @@ import os,datetime
 import pandas as pd
 from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.preprocessing import LabelEncoder
 from xgboost import XGBClassifier
 from sklearn.metrics import accuracy_score, classification_report
 from joblib import dump, load
@@ -49,11 +50,12 @@ def train_the_model():
         y = data_filled['status_name']
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
         # Parameters to use for the model
         params = {
             'colsample_bytree': 0.3,
-            'learning_rate': 1,
-            'max_depth': 16,
             'n_estimators': 100,
             'subsample': 0.9,
             'use_label_encoder': False,
@@ -132,6 +134,31 @@ async def your_continuous_function(page: str,paginate: str):
         df.to_csv("trainer_data.csv")
         print("data created")
     accuracy,classification_rep,message = train_the_model()

 import pandas as pd
 from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.preprocessing import LabelEncoder
+from sklearn.utils import resample
 from xgboost import XGBClassifier
 from sklearn.metrics import accuracy_score, classification_report
 from joblib import dump, load
         y = data_filled['status_name']
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+        # Parameters to use for the model
         # Parameters to use for the model
         params = {
             'colsample_bytree': 0.3,
+            'learning_rate': 0.6,
+            'max_depth': 6,
             'n_estimators': 100,
             'subsample': 0.9,
             'use_label_encoder': False,
         df.to_csv("trainer_data.csv")
         print("data created")
+    # Load the dataset
+    file_path = 'trainer_data.csv'  # Update to the correct file path
+    # Analyze class distribution
+    class_distribution = data['status_name'].value_counts()
+    print("Class Distribution before balancing:\n", class_distribution)
+    # Get the size of the largest class to match other classes' sizes
+    max_class_size = class_distribution.max()
+    # Oversampling
+    oversampled_data = pd.DataFrame()
+    for class_name, group in data.groupby('status_name'):
+        oversampled_group = resample(group,
+                                     replace=True,  # Sample with replacement
+                                     n_samples=max_class_size,  # to match majority class
+                                     random_state=123)  # for reproducibility
+        oversampled_data = pd.concat([oversampled_data, oversampled_group], axis=0)
+    # Verify new class distribution
+    print("Class Distribution after oversampling:\n", oversampled_data['status_name'].value_counts())
+    # Save the balanced dataset if needed
+    oversampled_data.to_csv('trainer_data.csv', index=False)
     accuracy,classification_rep,message = train_the_model()