Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -28,36 +28,59 @@ app.add_middleware(
|
|
28 |
|
29 |
def train_the_model(data):
|
30 |
try:
|
|
|
31 |
new_data = data
|
32 |
encoders = load('transexpress_encoders.joblib')
|
33 |
xgb_model = load('transexpress_xgb_model.joblib')
|
|
|
|
|
34 |
selected_columns = ['customer_name', 'customer_address', 'customer_phone_no',
|
35 |
-
'weight','cod','pickup_address','client_number','destination_city',
|
36 |
'status_name']
|
37 |
-
|
38 |
new_data_filled = new_data[selected_columns].fillna('Missing')
|
|
|
|
|
39 |
for col, encoder in encoders.items():
|
40 |
if col in new_data_filled.columns:
|
41 |
unseen_categories = set(new_data_filled[col]) - set(encoder.classes_)
|
42 |
if unseen_categories:
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
new_data_filled[col] = encoder.transform(new_data_filled[col])
|
48 |
X_new = new_data_filled.drop('status_name', axis=1)
|
49 |
y_new = new_data_filled['status_name']
|
50 |
-
|
51 |
-
X_train, X_test, y_train, y_test = train_test_split(X_new,y_new, test_size=0.2, random_state=42)
|
52 |
|
53 |
-
|
54 |
-
|
55 |
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
accuracy = accuracy_score(y_test, y_pred)
|
59 |
classification_rep = classification_report(y_test, y_pred)
|
60 |
-
|
|
|
|
|
|
|
61 |
|
62 |
|
63 |
except:
|
@@ -94,7 +117,7 @@ def train_the_model(data):
|
|
94 |
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
|
95 |
|
96 |
# Setup GridSearchCV
|
97 |
-
grid_search = GridSearchCV(xgb, param_grid, cv=
|
98 |
|
99 |
# Fit the grid search to the data
|
100 |
grid_search.fit(X_train, y_train)
|
|
|
28 |
|
29 |
def train_the_model(data):
|
30 |
try:
|
31 |
+
|
32 |
new_data = data
|
33 |
encoders = load('transexpress_encoders.joblib')
|
34 |
xgb_model = load('transexpress_xgb_model.joblib')
|
35 |
+
|
36 |
+
# Selecting and filling missing data
|
37 |
selected_columns = ['customer_name', 'customer_address', 'customer_phone_no',
|
38 |
+
'weight', 'cod', 'pickup_address', 'client_number', 'destination_city',
|
39 |
'status_name']
|
|
|
40 |
new_data_filled = new_data[selected_columns].fillna('Missing')
|
41 |
+
|
42 |
+
# Encoding categorical data
|
43 |
for col, encoder in encoders.items():
|
44 |
if col in new_data_filled.columns:
|
45 |
unseen_categories = set(new_data_filled[col]) - set(encoder.classes_)
|
46 |
if unseen_categories:
|
47 |
+
encoder.classes_ = np.append(encoder.classes_, list(unseen_categories))
|
48 |
+
new_data_filled[col] = encoder.transform(new_data_filled[col])
|
49 |
+
|
50 |
+
# Splitting data into features and target
|
|
|
51 |
X_new = new_data_filled.drop('status_name', axis=1)
|
52 |
y_new = new_data_filled['status_name']
|
|
|
|
|
53 |
|
54 |
+
# Splitting data into training and testing sets
|
55 |
+
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.2, random_state=42)
|
56 |
|
57 |
+
# Setting up parameter grid for hyperparameter tuning
|
58 |
+
param_grid = {
|
59 |
+
'max_depth': [3, 4, 5],
|
60 |
+
'learning_rate': [0.01, 0.1, 0.4],
|
61 |
+
'n_estimators': [100, 200, 300],
|
62 |
+
'subsample': [0.8, 0.9, 1],
|
63 |
+
'colsample_bytree': [0.3, 0.7]
|
64 |
+
}
|
65 |
+
|
66 |
+
# Initializing GridSearchCV
|
67 |
+
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=50, n_jobs=-1, scoring='accuracy')
|
68 |
+
|
69 |
+
# Fitting GridSearchCV
|
70 |
+
grid_search.fit(X_train, y_train)
|
71 |
+
|
72 |
+
# Updating the model with the best estimator
|
73 |
+
best_model = grid_search.best_estimator_
|
74 |
+
dump(best_model, 'transexpress_xgb_model.joblib')
|
75 |
+
|
76 |
+
# Making predictions and evaluating the model
|
77 |
+
y_pred = best_model.predict(X_test)
|
78 |
accuracy = accuracy_score(y_test, y_pred)
|
79 |
classification_rep = classification_report(y_test, y_pred)
|
80 |
+
|
81 |
+
# Returning the results
|
82 |
+
return accuracy, classification_rep, "Model finetuned with new data."
|
83 |
+
|
84 |
|
85 |
|
86 |
except:
|
|
|
117 |
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
|
118 |
|
119 |
# Setup GridSearchCV
|
120 |
+
grid_search = GridSearchCV(xgb, param_grid, cv=50, n_jobs=-1, scoring='accuracy')
|
121 |
|
122 |
# Fit the grid search to the data
|
123 |
grid_search.fit(X_train, y_train)
|