| import time |
| import joblib |
| import pandas as pd |
| import numpy as np |
| import xgboost as xgb |
| import matplotlib.pyplot as plt |
|
|
| from tqdm.auto import tqdm |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.metrics import classification_report |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.metrics import confusion_matrix |
| from scipy.sparse import hstack, csr_matrix |
|
|
| |
| |
| |
|
|
# Absolute paths to the preprocessed CSV splits (each must contain a "text"
# column, a "label" column, and the numeric feature columns used below).
# NOTE(review): machine-specific paths — consider making these configurable.
TRAIN_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/train.csv"
VAL_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/val.csv"
TEST_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/data/processed/test.csv"

# Destination for the trained bundle (model + vectorizers + scaler).
MODEL_SAVE_PATH = "document_classifier_xgb.pkl"
|
|
| |
| |
| |
|
|
| print("π Loading data...") |
|
|
| train_df = pd.read_csv(TRAIN_PATH) |
| val_df = pd.read_csv(VAL_PATH) |
| test_df = pd.read_csv(TEST_PATH) |
|
|
| X_train_text = train_df["text"].fillna("") |
| X_val_text = val_df["text"].fillna("") |
| X_test_text = test_df["text"].fillna("") |
|
|
| y_train = train_df["label"] |
| y_val = val_df["label"] |
| y_test = test_df["label"] |
|
|
| print("β
Data loaded successfully") |
|
|
| |
| |
| |
|
|
| print("π§ Creating TF-IDF features...") |
|
|
| word_vectorizer = TfidfVectorizer( |
| max_features=40000, |
| ngram_range=(1, 2), |
| stop_words="english" |
| ) |
|
|
| char_vectorizer = TfidfVectorizer( |
| analyzer="char", |
| ngram_range=(3, 5), |
| max_features=20000 |
| ) |
|
|
| X_train_word = word_vectorizer.fit_transform(X_train_text) |
| X_val_word = word_vectorizer.transform(X_val_text) |
| X_test_word = word_vectorizer.transform(X_test_text) |
|
|
| X_train_char = char_vectorizer.fit_transform(X_train_text) |
| X_val_char = char_vectorizer.transform(X_val_text) |
| X_test_char = char_vectorizer.transform(X_test_text) |
|
|
| X_train_text_features = hstack([X_train_word, X_train_char]) |
| X_val_text_features = hstack([X_val_word, X_val_char]) |
| X_test_text_features = hstack([X_test_word, X_test_char]) |
|
|
| print("β
Text features ready") |
|
|
| |
| |
| |
|
|
| print("π’ Adding numeric features...") |
|
|
| numeric_cols = [ |
| "char_count", |
| "digit_count", |
| "uppercase_count", |
| "currency_count", |
| "line_count" |
| ] |
|
|
| scaler = StandardScaler() |
|
|
| X_train_num = scaler.fit_transform(train_df[numeric_cols]) |
| X_val_num = scaler.transform(val_df[numeric_cols]) |
| X_test_num = scaler.transform(test_df[numeric_cols]) |
|
|
| X_train_num = csr_matrix(X_train_num) |
| X_val_num = csr_matrix(X_val_num) |
| X_test_num = csr_matrix(X_test_num) |
|
|
| |
| X_train = hstack([X_train_text_features, X_train_num]) |
| X_val = hstack([X_val_text_features, X_val_num]) |
| X_test = hstack([X_test_text_features, X_test_num]) |
|
|
| print("β
Feature matrix ready") |
| |
| |
| |
|
|
| print("π Starting training...") |
|
|
| N_ESTIMATORS = 400 |
|
|
class TqdmCallback(xgb.callback.TrainingCallback):
    """XGBoost callback that advances a tqdm bar once per boosting round."""

    def __init__(self, total):
        # Bar length = requested number of boosting rounds.
        self.pbar = tqdm(total=total, desc="Training Progress", unit="trees")

    def after_iteration(self, model, epoch, evals_log):
        # Tick the bar; returning False tells XGBoost not to stop here.
        self.pbar.update(1)
        return False

    def after_training(self, model):
        # XGBoost requires this hook to hand the model back.
        self.pbar.close()
        return model
|
|
# hist tree method + multi-class logloss; early stopping monitors the LAST
# eval_set entry (the validation split) and stops after 30 stale rounds.
model = xgb.XGBClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=6,
    learning_rate=0.1,
    tree_method="hist",
    eval_metric="mlogloss",
    early_stopping_rounds=30,
    callbacks=[TqdmCallback(N_ESTIMATORS)],
)

start_time = time.time()

# Track loss on both train and val so a learning curve can be plotted later.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False,
)

# BUGFIX: replaced a mojibake character in the timing message.
print(f"\nTraining completed in {round(time.time() - start_time, 2)} seconds")
|
|
| |
| |
| |
|
|
| print("\nπ Validation Performance:") |
| val_preds = model.predict(X_val) |
| print(classification_report(y_val, val_preds)) |
|
|
| print("\nπ Test Performance:") |
| test_preds = model.predict(X_test) |
| print(classification_report(y_test, test_preds)) |
|
|
| |
| |
| |
|
|
# ---------------------------------------------------------------------------
# Learning curve: mlogloss per boosting round for train vs. validation.
# ---------------------------------------------------------------------------
results = model.evals_result()

# validation_0 = training split, validation_1 = held-out val split
# (order matches the eval_set passed to fit()).
train_loss = results["validation_0"]["mlogloss"]
val_loss = results["validation_1"]["mlogloss"]

plt.figure(figsize=(8, 5))
plt.plot(train_loss, label="Train Loss")
plt.plot(val_loss, label="Validation Loss")
plt.xlabel("Boosting Rounds")
plt.ylabel("Log Loss")
plt.title("Training Curve")
plt.legend()
plt.savefig("training_curve.png", dpi=150, bbox_inches="tight")
plt.close()
# BUGFIX: replaced mojibake emoji in the status message.
print("Training curve saved to training_curve.png")
|
|
| |
| |
| |
|
|
# ---------------------------------------------------------------------------
# Feature importance plot.
# BUGFIX: xgb.plot_importance() creates its own figure when no Axes is
# supplied, so the original plt.figure(figsize=(10, 8)) was ignored and
# leaked an empty figure. Pass an explicit Axes instead.
# ---------------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 8))
xgb.plot_importance(model, max_num_features=20, ax=ax)
ax.set_title("Top 20 Important Features")
fig.savefig("feature_importance.png", dpi=150, bbox_inches="tight")
plt.close(fig)
print("Feature importance saved to feature_importance.png")
|
|
| |
| |
| |
|
|
| |
| |
# ---------------------------------------------------------------------------
# Persist the full inference pipeline: the classifier plus every fitted
# transformer needed to featurize new documents.
# ---------------------------------------------------------------------------

# Strip the tqdm callback first: it holds a live progress-bar handle that
# does not pickle reliably and is useless at inference time.
model.set_params(callbacks=[])

joblib.dump({
    "model": model,
    "word_vectorizer": word_vectorizer,
    "char_vectorizer": char_vectorizer,
    "scaler": scaler,
}, MODEL_SAVE_PATH)

# BUGFIX: replaced mojibake emoji in the status messages.
print(f"\nModel saved to {MODEL_SAVE_PATH}")
print("All done!")