519Project2JHP
/

XGBoost_TF_IDF

Model card Files Files and versions Community

Jingyuan-Zhu committed 25 days ago

Commit

b8ebb97

•

1 Parent(s): 2e115c9

Added updated model weights, vectorizer, and evaluation notebook

Browse files

Files changed (3) hide show

XGBoost_TF_IDF/evaluation_TFIDF_XGBoost.ipynb +1 -0
XGBoost_TF_IDF/tfidf_vectorizer.pkl +3 -0
XGBoost_TF_IDF/xgboost_tfidf_updated.json +0 -0

XGBoost_TF_IDF/evaluation_TFIDF_XGBoost.ipynb ADDED Viewed

	@@ -0,0 +1 @@

+ {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyPDK8wSQcaWQMOOXfo1XOQv"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"CLMep2SSTGSP","executionInfo":{"status":"ok","timestamp":1733863886877,"user_tz":300,"elapsed":9137,"user":{"displayName":"Jingyuan Zhu","userId":"12540342436467806828"}}},"outputs":[],"source":["import pickle\n","from pathlib import Path\n","\n","import pandas as pd\n","import xgboost as xgb\n","from sklearn.metrics import accuracy_score, classification_report, confusion_matrix"]},{"cell_type":"markdown","source":["# Evaluate the TF-IDF + XGBoost model\n","\n","Please change `csv_file_path` to the actual test dataset path (and `vectorizer_path` / `model_path` if the saved vectorizer and model live elsewhere)."],"metadata":{"id":"4mltevn4W3vl"}},{"cell_type":"code","source":["# Paths to the saved artifacts and the evaluation data; edit these for your environment.\n","vectorizer_path = \"/content/drive/Shared drives/5190_NLP_Project/TF_IDF_XGBoost_Model/tfidf_vectorizer.pkl\"\n","model_path = \"/content/drive/Shared drives/5190_NLP_Project/TF_IDF_XGBoost_Model/xgboost_tfidf_updated.json\"\n","csv_file_path = '/content/drive/Shared drives/5190_NLP_Project/test_data_random_subset.csv'\n","\n","# Predicted scores at or above this cutoff are labelled 1, everything else 0.\n","DECISION_THRESHOLD = 0.5\n","\n","# Fail fast with one clear message rather than a library-specific error further down.\n","missing_paths = [p for p in (vectorizer_path, model_path, csv_file_path) if not Path(p).is_file()]\n","if missing_paths:\n","    raise FileNotFoundError(\n","        f\"File(s) not found: {missing_paths}. Update the paths above \"\n","        \"(on Colab, make sure Google Drive is mounted first).\"\n","    )\n","\n","data = pd.read_csv(csv_file_path, index_col=0)\n","\n","missing_columns = {'title', 'labels'} - set(data.columns)\n","if missing_columns:\n","    raise KeyError(f\"Test CSV is missing required column(s): {sorted(missing_columns)}\")\n","\n","# Rows without a title or a label cannot be scored, so leave them out of the evaluation.\n","eval_data = data.dropna(subset=['title', 'labels'])\n","if eval_data.empty:\n","    raise ValueError(\"Test CSV has no rows with both a title and a label.\")\n","if len(eval_data) < len(data):\n","    print(f\"Skipping {len(data) - len(eval_data)} row(s) with a missing title or label.\")\n","\n","# Titles are passed on as text; labels as ints so they compare cleanly with the int predictions.\n","titles = eval_data['title'].astype(str).tolist()\n","labels = eval_data['labels'].astype(int).tolist()\n","\n","print(\"Loading TF-IDF vectorizer...\")\n","# SECURITY: pickle.load can execute arbitrary code; only load a vectorizer file you trust.\n","with open(vectorizer_path, \"rb\") as vec_file:\n","    tfidf_vectorizer = pickle.load(vec_file)\n","\n","print(\"Transforming test data...\")\n","X_test_tfidf = tfidf_vectorizer.transform(titles)\n","\n","print(\"Loading XGBoost model...\")\n","bst = xgb.Booster()\n","bst.load_model(model_path)\n","\n","dtest = xgb.DMatrix(data=X_test_tfidf)\n","\n","print(\"Making predictions...\")\n","# NOTE(review): thresholding assumes the model outputs probabilities (e.g. binary:logistic) - confirm.\n","y_pred_prob = bst.predict(dtest)\n","y_pred = (y_pred_prob >= DECISION_THRESHOLD).astype(int)\n","\n","print(\"\\nModel Performance on Test Set:\")\n","accuracy = accuracy_score(labels, y_pred)\n","print(f\"Accuracy: {accuracy:.4f}\")\n","\n","print(\"\\nClassification Report:\\n\", classification_report(labels, y_pred))\n","\n","print(\"\\nConfusion Matrix:\\n\", confusion_matrix(labels, y_pred))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ckVFHn4BXvTr","executionInfo":{"status":"ok","timestamp":1733864390897,"user_tz":300,"elapsed":1425,"user":{"displayName":"Jingyuan Zhu","userId":"12540342436467806828"}},"outputId":"5e5bf6ff-efd8-411c-9e67-36ea3632a6fe"},"execution_count":6,"outputs":[{"output_type":"stream","name":"stdout","text":["Loading TF-IDF vectorizer...\n","Transforming test data...\n","Loading XGBoost model...\n","Making predictions...\n","\n","Model Performance on Test Set:\n","Accuracy: 0.7500\n","\n","Classification Report:\n"," precision recall f1-score support\n","\n"," 0 0.69 0.90 0.78 10\n"," 1 0.86 0.60 0.71 10\n","\n"," accuracy 0.75 20\n"," macro avg 0.77 0.75 0.74 20\n","weighted avg 0.77 0.75 0.74 20\n","\n","\n","Confusion Matrix:\n"," [[9 1]\n"," [4 6]]\n"]}]}]}

XGBoost_TF_IDF/tfidf_vectorizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3499aa9857e781bd575825805e66aedb219f08bdbea78b65a0fbf3b4a8d8b8e5
+size 284408

XGBoost_TF_IDF/xgboost_tfidf_updated.json ADDED Viewed

The diff for this file is too large to render. See raw diff