Spaces:
Build error
Build error
cleaned scapped_data2; reformatted all code
Browse files- IS424_Data_Mining/code/Classification/Classification models on incident category.ipynb +122 -78
- IS424_Data_Mining/code/Classification/Classification models on maritime incident.ipynb +36 -22
- IS424_Data_Mining/code/EDA.ipynb +93 -66
- IS424_Data_Mining/code/GPT/Pipeline.ipynb +108 -87
- IS424_Data_Mining/code/LDA/basic_text_preprocessing.ipynb +98 -60
- IS424_Data_Mining/code/LDA/basic_text_preprocessing_on_scraped_data.ipynb +106 -62
- IS424_Data_Mining/code/LDA/topic_modelling_benchmark_using_headline.ipynb +28 -23
- IS424_Data_Mining/code/LDA/topic_modelling_minor.ipynb +127 -85
- IS424_Data_Mining/code/LDA/topic_modelling_moderate.ipynb +128 -86
- IS424_Data_Mining/code/LDA/topic_modelling_severe.ipynb +128 -87
- IS424_Data_Mining/code/LLM Evaluation/evaluation.ipynb +20 -12
- IS424_Data_Mining/code/NER/Named_Entity_Recognition.ipynb +14 -13
- IS424_Data_Mining/code/NewsScraper/newsScraper.ipynb +28 -20
- app.py +2 -1
- data/scrapped_data2_cleaned.csv +274 -0
- notebooks/00_EDA.ipynb +88 -59
- notebooks/05a_newsScraper_run_1.ipynb +22 -14
- notebooks/05b_newsScraper_run_2.ipynb +6 -2
- notebooks/05c_newsScraper_clearning.ipynb +0 -0
- notebooks/06_basic_text_preprocessing_on_scraped_data.ipynb +102 -60
- notebooks/07_topic_modelling_minor.ipynb +123 -81
- notebooks/08_topic_modelling_moderate.ipynb +126 -84
- notebooks/09_topic_modelling_severe.ipynb +128 -87
- notebooks/10_LLM_evaluation.ipynb +19 -11
- requirements.txt +4 -1
IS424_Data_Mining/code/Classification/Classification models on incident category.ipynb
CHANGED
@@ -195,9 +195,11 @@
|
|
195 |
"from nltk.tokenize import word_tokenize\n",
|
196 |
"from nltk.stem import WordNetLemmatizer\n",
|
197 |
"import string\n",
|
198 |
-
"
|
199 |
-
"#nltk.download('
|
200 |
-
"#nltk.download('
|
|
|
|
|
201 |
"\n",
|
202 |
"def clean_text(text):\n",
|
203 |
" # Lowercase\n",
|
@@ -207,13 +209,13 @@
|
|
207 |
" # Removing punctuation\n",
|
208 |
" tokens = [word for word in tokens if word not in string.punctuation]\n",
|
209 |
" # Removing stop words\n",
|
210 |
-
" stop_words = set(stopwords.words(
|
211 |
" tokens = [word for word in tokens if word not in stop_words]\n",
|
212 |
" # Lemmatization\n",
|
213 |
" lemmatizer = WordNetLemmatizer()\n",
|
214 |
" tokens = [lemmatizer.lemmatize(word) for word in tokens]\n",
|
215 |
-
"
|
216 |
-
" return
|
217 |
]
|
218 |
},
|
219 |
{
|
@@ -244,7 +246,8 @@
|
|
244 |
],
|
245 |
"source": [
|
246 |
"import nltk\n",
|
247 |
-
"
|
|
|
248 |
]
|
249 |
},
|
250 |
{
|
@@ -309,16 +312,20 @@
|
|
309 |
}
|
310 |
],
|
311 |
"source": [
|
312 |
-
"text_df = df[[\"Details\"
|
313 |
"text_df.info()\n",
|
314 |
-
"text_df[
|
315 |
-
"
|
316 |
-
"\n",
|
317 |
-
"
|
318 |
-
"
|
319 |
-
"
|
320 |
-
"
|
321 |
-
"#
|
|
|
|
|
|
|
|
|
322 |
"text_df.info()"
|
323 |
]
|
324 |
},
|
@@ -567,28 +574,30 @@
|
|
567 |
"# Create a function that will split the labels into individual\n",
|
568 |
"import re\n",
|
569 |
"\n",
|
|
|
570 |
"def split_string(text):\n",
|
571 |
" # Split the string using either \"/\" or \",\" as separator\n",
|
572 |
-
" words = re.split(r
|
573 |
" # Remove any leading or trailing whitespace from each word\n",
|
574 |
" words = [word.strip() for word in words if word.strip()]\n",
|
575 |
" return words\n",
|
576 |
"\n",
|
|
|
577 |
"# Example usage:\n",
|
578 |
"# input_str = \"Roadway Closure / Disruption, Flooding, Severe Winds, Weather Advisory\"\n",
|
579 |
"# result = split_string(input_str)\n",
|
580 |
"# print(result)\n",
|
581 |
"\n",
|
582 |
-
"#create a list to find the number of unique individual labels\n",
|
583 |
"label_list = []\n",
|
584 |
"\n",
|
585 |
"for i in processed_data[\"Category_cleaned\"]:\n",
|
586 |
" for j in split_string(i):\n",
|
587 |
" if j not in label_list:\n",
|
588 |
" label_list.append(j)\n",
|
589 |
-
"
|
590 |
"# print(label)\n",
|
591 |
-
"print(len(label_list))
|
592 |
]
|
593 |
},
|
594 |
{
|
@@ -651,7 +660,7 @@
|
|
651 |
" # Split the string using either \"/\" or \",\" as separator\n",
|
652 |
" if text == None:\n",
|
653 |
" return None\n",
|
654 |
-
" words = re.split(r
|
655 |
" # Remove any leading or trailing whitespace from each word\n",
|
656 |
" words = [word.strip() for word in words if word.strip()]\n",
|
657 |
" # Return the first word after split\n",
|
@@ -659,6 +668,8 @@
|
|
659 |
" return words[0]\n",
|
660 |
" else:\n",
|
661 |
" return None\n",
|
|
|
|
|
662 |
"def remove_none_rows(df, column_name):\n",
|
663 |
" # Iterate through the DataFrame\n",
|
664 |
" for index, value in enumerate(df[column_name]):\n",
|
@@ -667,11 +678,15 @@
|
|
667 |
" # Remove the row where the data belongs to\n",
|
668 |
" df = df.drop(index, axis=0)\n",
|
669 |
" return df\n",
|
|
|
|
|
670 |
"# Example usage:\n",
|
671 |
"# input_str = \"Roadway Closure / Disruption, Flooding, Severe Winds, Weather Advisory\"\n",
|
672 |
"# result = split_and_get_first(input_str)\n",
|
673 |
"# print(result)\n",
|
674 |
-
"text_df[
|
|
|
|
|
675 |
"result_df = remove_none_rows(text_df, \"Category_cleaned\")\n",
|
676 |
"result_df.info()"
|
677 |
]
|
@@ -1012,7 +1027,7 @@
|
|
1012 |
"outputs": [],
|
1013 |
"source": [
|
1014 |
"### first export the unique labels into excel for better visualization\n",
|
1015 |
-
"unique_labels_df = pd.DataFrame({
|
1016 |
"file_path = \"label_list.xlsx\"\n",
|
1017 |
"\n",
|
1018 |
"# Save DataFrame to Excel\n",
|
@@ -1224,7 +1239,7 @@
|
|
1224 |
}
|
1225 |
],
|
1226 |
"source": [
|
1227 |
-
"new_labels_dict = new_labels_df.to_dict(orient
|
1228 |
"\n",
|
1229 |
"\n",
|
1230 |
"for key, value in new_labels_dict.items():\n",
|
@@ -1232,9 +1247,8 @@
|
|
1232 |
"\n",
|
1233 |
"for category in new_labels_dict:\n",
|
1234 |
" print(\"\\n\")\n",
|
1235 |
-
" print(category
|
1236 |
-
" print(new_labels_dict[category])
|
1237 |
-
" "
|
1238 |
]
|
1239 |
},
|
1240 |
{
|
@@ -1460,13 +1474,13 @@
|
|
1460 |
}
|
1461 |
],
|
1462 |
"source": [
|
1463 |
-
"result_df[
|
1464 |
"\n",
|
1465 |
"for index, row in result_df.iterrows():\n",
|
1466 |
-
" value = row[
|
1467 |
" for key, values in new_labels_dict.items():\n",
|
1468 |
" if value in values:\n",
|
1469 |
-
" result_df.at[index,
|
1470 |
" break # No need to check other keys if match found\n",
|
1471 |
"result_df"
|
1472 |
]
|
@@ -1488,6 +1502,7 @@
|
|
1488 |
"source": [
|
1489 |
"from sklearn.model_selection import train_test_split\n",
|
1490 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|
|
1491 |
"# from sklearn.feature_extraction.text import CountVectorizer\n",
|
1492 |
"from sklearn.naive_bayes import MultinomialNB\n",
|
1493 |
"from sklearn.metrics import accuracy_score, classification_report"
|
@@ -1511,7 +1526,9 @@
|
|
1511 |
"metadata": {},
|
1512 |
"outputs": [],
|
1513 |
"source": [
|
1514 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
1515 |
]
|
1516 |
},
|
1517 |
{
|
@@ -1525,7 +1542,7 @@
|
|
1525 |
"# X_train_vec = vectorizer.fit_transform(X_train)\n",
|
1526 |
"# X_test_vec = vectorizer.transform(X_test)\n",
|
1527 |
"\n",
|
1528 |
-
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)
|
1529 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
1530 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
1531 |
]
|
@@ -1651,16 +1668,17 @@
|
|
1651 |
],
|
1652 |
"source": [
|
1653 |
"from sklearn.model_selection import GridSearchCV\n",
|
1654 |
-
"
|
|
|
1655 |
"\n",
|
1656 |
"# Initialize the grid search\n",
|
1657 |
-
"grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring
|
1658 |
"\n",
|
1659 |
"# Perform the grid search\n",
|
1660 |
"grid_search.fit(X_train_tfidf, y_train)\n",
|
1661 |
"\n",
|
1662 |
"# Get the best hyperparameters\n",
|
1663 |
-
"best_alpha = grid_search.best_params_[
|
1664 |
"print(\"Best Alpha:\", best_alpha)\n",
|
1665 |
"\n",
|
1666 |
"# Train the model with the best alpha\n",
|
@@ -1720,14 +1738,16 @@
|
|
1720 |
"X = result_df[\"Details_cleaned\"]\n",
|
1721 |
"y = result_df[\"Summarized_label\"]\n",
|
1722 |
"\n",
|
1723 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
1724 |
"\n",
|
1725 |
"start_time = time.time()\n",
|
1726 |
-
"tfidf_vectorizer = TfidfVectorizer(max_features=4000)
|
1727 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
1728 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
|
1729 |
"\n",
|
1730 |
-
"naive_bayes = MultinomialNB(alpha
|
1731 |
"naive_bayes.fit(X_train_tfidf, y_train)\n",
|
1732 |
"\n",
|
1733 |
"predictions = naive_bayes.predict(X_test_tfidf)\n",
|
@@ -1770,7 +1790,9 @@
|
|
1770 |
"metadata": {},
|
1771 |
"outputs": [],
|
1772 |
"source": [
|
1773 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
1774 |
]
|
1775 |
},
|
1776 |
{
|
@@ -1780,7 +1802,7 @@
|
|
1780 |
"metadata": {},
|
1781 |
"outputs": [],
|
1782 |
"source": [
|
1783 |
-
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)
|
1784 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
1785 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
1786 |
]
|
@@ -2120,19 +2142,18 @@
|
|
2120 |
"source": [
|
2121 |
"from sklearn.pipeline import Pipeline\n",
|
2122 |
"\n",
|
2123 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
2124 |
"\n",
|
2125 |
"param_grid = {\n",
|
2126 |
-
"
|
2127 |
-
"
|
2128 |
"}\n",
|
2129 |
"\n",
|
2130 |
-
"pipeline = Pipeline([\n",
|
2131 |
-
" ('tfidf', TfidfVectorizer()),\n",
|
2132 |
-
" ('model', LogisticRegression())\n",
|
2133 |
-
"])\n",
|
2134 |
"\n",
|
2135 |
-
"grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring
|
2136 |
"\n",
|
2137 |
"grid_search.fit(X_train, y_train)\n",
|
2138 |
"\n",
|
@@ -2201,10 +2222,12 @@
|
|
2201 |
}
|
2202 |
],
|
2203 |
"source": [
|
2204 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
2205 |
"\n",
|
2206 |
"start_time = time.time()\n",
|
2207 |
-
"tfidf_vectorizer = TfidfVectorizer(max_features=2000)
|
2208 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
2209 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
|
2210 |
"\n",
|
@@ -2251,7 +2274,9 @@
|
|
2251 |
"metadata": {},
|
2252 |
"outputs": [],
|
2253 |
"source": [
|
2254 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
2255 |
]
|
2256 |
},
|
2257 |
{
|
@@ -2287,7 +2312,7 @@
|
|
2287 |
}
|
2288 |
],
|
2289 |
"source": [
|
2290 |
-
"svm_model = SVC(kernel
|
2291 |
"svm_model.fit(X_train_tfidf, y_train)"
|
2292 |
]
|
2293 |
},
|
@@ -2354,11 +2379,12 @@
|
|
2354 |
],
|
2355 |
"source": [
|
2356 |
"from sklearn.model_selection import GridSearchCV\n",
|
2357 |
-
"
|
|
|
2358 |
"svm = SVC()\n",
|
2359 |
-
"grid_search = GridSearchCV(svm, param_grid, cv=5, scoring
|
2360 |
"grid_search.fit(X_train_tfidf, y_train)\n",
|
2361 |
-
"best_c = grid_search.best_params_[
|
2362 |
"print(\"Best C:\", best_c)"
|
2363 |
]
|
2364 |
},
|
@@ -2392,13 +2418,15 @@
|
|
2392 |
}
|
2393 |
],
|
2394 |
"source": [
|
2395 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
2396 |
"\n",
|
2397 |
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
|
2398 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
2399 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
|
2400 |
"\n",
|
2401 |
-
"svm_model = SVC(kernel
|
2402 |
"svm_model.fit(X_train_tfidf, y_train)\n",
|
2403 |
"\n",
|
2404 |
"y_pred = svm_model.predict(X_test_tfidf)\n",
|
@@ -2446,14 +2474,16 @@
|
|
2446 |
}
|
2447 |
],
|
2448 |
"source": [
|
2449 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
2450 |
"\n",
|
2451 |
"start_time = time.time()\n",
|
2452 |
"tfidf_vectorizer = TfidfVectorizer(max_features=2000)\n",
|
2453 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
2454 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
|
2455 |
"\n",
|
2456 |
-
"svm_model = SVC(kernel
|
2457 |
"svm_model.fit(X_train_tfidf, y_train)\n",
|
2458 |
"\n",
|
2459 |
"y_pred = svm_model.predict(X_test_tfidf)\n",
|
@@ -2496,7 +2526,9 @@
|
|
2496 |
"metadata": {},
|
2497 |
"outputs": [],
|
2498 |
"source": [
|
2499 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
2500 |
]
|
2501 |
},
|
2502 |
{
|
@@ -2506,7 +2538,7 @@
|
|
2506 |
"metadata": {},
|
2507 |
"outputs": [],
|
2508 |
"source": [
|
2509 |
-
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)
|
2510 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
2511 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
2512 |
]
|
@@ -2620,14 +2652,18 @@
|
|
2620 |
}
|
2621 |
],
|
2622 |
"source": [
|
2623 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
2624 |
"\n",
|
2625 |
"start_time = time.time()\n",
|
2626 |
-
"tfidf_vectorizer = TfidfVectorizer(max_features=2000)
|
2627 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
2628 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
|
2629 |
"\n",
|
2630 |
-
"rf_model = RandomForestClassifier(
|
|
|
|
|
2631 |
"rf_model.fit(X_train_tfidf, y_train)\n",
|
2632 |
"\n",
|
2633 |
"y_pred = rf_model.predict(X_test_tfidf)\n",
|
@@ -2680,7 +2716,9 @@
|
|
2680 |
"metadata": {},
|
2681 |
"outputs": [],
|
2682 |
"source": [
|
2683 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
2684 |
]
|
2685 |
},
|
2686 |
{
|
@@ -2741,9 +2779,11 @@
|
|
2741 |
"from sklearn.model_selection import train_test_split\n",
|
2742 |
"from sklearn.neighbors import KNeighborsClassifier\n",
|
2743 |
"\n",
|
2744 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
2745 |
"\n",
|
2746 |
-
"k_values = range(1, 21)
|
2747 |
"\n",
|
2748 |
"train_scores = []\n",
|
2749 |
"test_scores = []\n",
|
@@ -2753,21 +2793,21 @@
|
|
2753 |
" # Train KNN classifier\n",
|
2754 |
" knn = KNeighborsClassifier(n_neighbors=k)\n",
|
2755 |
" knn.fit(X_train, y_train)\n",
|
2756 |
-
"
|
2757 |
" # Calculate training and testing accuracy\n",
|
2758 |
" train_score = knn.score(X_train, y_train)\n",
|
2759 |
" test_score = knn.score(X_test, y_test)\n",
|
2760 |
-
"
|
2761 |
" train_scores.append(train_score)\n",
|
2762 |
" test_scores.append(test_score)\n",
|
2763 |
"\n",
|
2764 |
"# Plot the performance scores\n",
|
2765 |
"plt.figure(figsize=(10, 6))\n",
|
2766 |
-
"plt.plot(k_values, train_scores, label
|
2767 |
-
"plt.plot(k_values, test_scores, label
|
2768 |
-
"plt.xlabel(
|
2769 |
-
"plt.ylabel(
|
2770 |
-
"plt.title(
|
2771 |
"plt.xticks(np.arange(1, 21, step=1))\n",
|
2772 |
"plt.legend()\n",
|
2773 |
"plt.grid(True)\n",
|
@@ -2808,9 +2848,11 @@
|
|
2808 |
"source": [
|
2809 |
"knn = KNeighborsClassifier()\n",
|
2810 |
"\n",
|
2811 |
-
"param_grid = {
|
2812 |
"\n",
|
2813 |
-
"grid_search = GridSearchCV(
|
|
|
|
|
2814 |
"\n",
|
2815 |
"grid_search.fit(X_train, y_train)\n",
|
2816 |
"\n",
|
@@ -2857,12 +2899,14 @@
|
|
2857 |
}
|
2858 |
],
|
2859 |
"source": [
|
2860 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
2861 |
"\n",
|
2862 |
"start_time = time.time()\n",
|
2863 |
"\n",
|
2864 |
"k = 5\n",
|
2865 |
-
"knn_model = KNeighborsClassifier(n_neighbors=k, weights
|
2866 |
"knn_model.fit(X_train, y_train)\n",
|
2867 |
"\n",
|
2868 |
"y_pred = knn_model.predict(X_test)\n",
|
|
|
195 |
"from nltk.tokenize import word_tokenize\n",
|
196 |
"from nltk.stem import WordNetLemmatizer\n",
|
197 |
"import string\n",
|
198 |
+
"\n",
|
199 |
+
"# nltk.download('punkt')\n",
|
200 |
+
"# nltk.download('stopwords')\n",
|
201 |
+
"# nltk.download('wordnet')\n",
|
202 |
+
"\n",
|
203 |
"\n",
|
204 |
"def clean_text(text):\n",
|
205 |
" # Lowercase\n",
|
|
|
209 |
" # Removing punctuation\n",
|
210 |
" tokens = [word for word in tokens if word not in string.punctuation]\n",
|
211 |
" # Removing stop words\n",
|
212 |
+
" stop_words = set(stopwords.words(\"english\"))\n",
|
213 |
" tokens = [word for word in tokens if word not in stop_words]\n",
|
214 |
" # Lemmatization\n",
|
215 |
" lemmatizer = WordNetLemmatizer()\n",
|
216 |
" tokens = [lemmatizer.lemmatize(word) for word in tokens]\n",
|
217 |
+
"\n",
|
218 |
+
" return \" \".join(tokens)"
|
219 |
]
|
220 |
},
|
221 |
{
|
|
|
246 |
],
|
247 |
"source": [
|
248 |
"import nltk\n",
|
249 |
+
"\n",
|
250 |
+
"nltk.download(\"omw-1.4\")"
|
251 |
]
|
252 |
},
|
253 |
{
|
|
|
312 |
}
|
313 |
],
|
314 |
"source": [
|
315 |
+
"text_df = df[[\"Details\", \"Category\"]]\n",
|
316 |
"text_df.info()\n",
|
317 |
+
"text_df[\"Details_cleaned\"] = text_df[\"Details\"].apply(\n",
|
318 |
+
" lambda x: clean_text(x) if not isinstance(x, float) else None\n",
|
319 |
+
")\n",
|
320 |
+
"text_df[\"Category_cleaned\"] = text_df[\"Category\"].apply(\n",
|
321 |
+
" lambda x: None if isinstance(x, float) else x\n",
|
322 |
+
")\n",
|
323 |
+
"\n",
|
324 |
+
"# no_nan_df[no_nan_df[\"Details\"].apply(lambda x: print(type(x)))]\n",
|
325 |
+
"# cleaned_df = text_df[text_df[\"Details\"].apply(lambda x: clean_text(x))]\n",
|
326 |
+
"# cleaned_df = df['Details'][1:2]\n",
|
327 |
+
"# type(no_nan_df[\"Details\"][0])\n",
|
328 |
+
"# print(clean_text(no_nan_df[\"Details\"][0]))\n",
|
329 |
"text_df.info()"
|
330 |
]
|
331 |
},
|
|
|
574 |
"# Create a function that will split the labels into individual\n",
|
575 |
"import re\n",
|
576 |
"\n",
|
577 |
+
"\n",
|
578 |
"def split_string(text):\n",
|
579 |
" # Split the string using either \"/\" or \",\" as separator\n",
|
580 |
+
" words = re.split(r\"[\\/,]\", text)\n",
|
581 |
" # Remove any leading or trailing whitespace from each word\n",
|
582 |
" words = [word.strip() for word in words if word.strip()]\n",
|
583 |
" return words\n",
|
584 |
"\n",
|
585 |
+
"\n",
|
586 |
"# Example usage:\n",
|
587 |
"# input_str = \"Roadway Closure / Disruption, Flooding, Severe Winds, Weather Advisory\"\n",
|
588 |
"# result = split_string(input_str)\n",
|
589 |
"# print(result)\n",
|
590 |
"\n",
|
591 |
+
"# create a list to find the number of unique individual labels\n",
|
592 |
"label_list = []\n",
|
593 |
"\n",
|
594 |
"for i in processed_data[\"Category_cleaned\"]:\n",
|
595 |
" for j in split_string(i):\n",
|
596 |
" if j not in label_list:\n",
|
597 |
" label_list.append(j)\n",
|
598 |
+
"\n",
|
599 |
"# print(label)\n",
|
600 |
+
"print(len(label_list))"
|
601 |
]
|
602 |
},
|
603 |
{
|
|
|
660 |
" # Split the string using either \"/\" or \",\" as separator\n",
|
661 |
" if text == None:\n",
|
662 |
" return None\n",
|
663 |
+
" words = re.split(r\"[\\/,]\", text)\n",
|
664 |
" # Remove any leading or trailing whitespace from each word\n",
|
665 |
" words = [word.strip() for word in words if word.strip()]\n",
|
666 |
" # Return the first word after split\n",
|
|
|
668 |
" return words[0]\n",
|
669 |
" else:\n",
|
670 |
" return None\n",
|
671 |
+
"\n",
|
672 |
+
"\n",
|
673 |
"def remove_none_rows(df, column_name):\n",
|
674 |
" # Iterate through the DataFrame\n",
|
675 |
" for index, value in enumerate(df[column_name]):\n",
|
|
|
678 |
" # Remove the row where the data belongs to\n",
|
679 |
" df = df.drop(index, axis=0)\n",
|
680 |
" return df\n",
|
681 |
+
"\n",
|
682 |
+
"\n",
|
683 |
"# Example usage:\n",
|
684 |
"# input_str = \"Roadway Closure / Disruption, Flooding, Severe Winds, Weather Advisory\"\n",
|
685 |
"# result = split_and_get_first(input_str)\n",
|
686 |
"# print(result)\n",
|
687 |
+
"text_df[\"Category_single\"] = text_df[\"Category_cleaned\"].apply(\n",
|
688 |
+
" lambda x: split_and_get_first(x)\n",
|
689 |
+
")\n",
|
690 |
"result_df = remove_none_rows(text_df, \"Category_cleaned\")\n",
|
691 |
"result_df.info()"
|
692 |
]
|
|
|
1027 |
"outputs": [],
|
1028 |
"source": [
|
1029 |
"### first export the unique labels into excel for better visualization\n",
|
1030 |
+
"unique_labels_df = pd.DataFrame({\"String\": label_list})\n",
|
1031 |
"file_path = \"label_list.xlsx\"\n",
|
1032 |
"\n",
|
1033 |
"# Save DataFrame to Excel\n",
|
|
|
1239 |
}
|
1240 |
],
|
1241 |
"source": [
|
1242 |
+
"new_labels_dict = new_labels_df.to_dict(orient=\"list\")\n",
|
1243 |
"\n",
|
1244 |
"\n",
|
1245 |
"for key, value in new_labels_dict.items():\n",
|
|
|
1247 |
"\n",
|
1248 |
"for category in new_labels_dict:\n",
|
1249 |
" print(\"\\n\")\n",
|
1250 |
+
" print(category + \"\\n\")\n",
|
1251 |
+
" print(new_labels_dict[category])"
|
|
|
1252 |
]
|
1253 |
},
|
1254 |
{
|
|
|
1474 |
}
|
1475 |
],
|
1476 |
"source": [
|
1477 |
+
"result_df[\"Summarized_label\"] = None\n",
|
1478 |
"\n",
|
1479 |
"for index, row in result_df.iterrows():\n",
|
1480 |
+
" value = row[\"Category_single\"]\n",
|
1481 |
" for key, values in new_labels_dict.items():\n",
|
1482 |
" if value in values:\n",
|
1483 |
+
" result_df.at[index, \"Summarized_label\"] = key\n",
|
1484 |
" break # No need to check other keys if match found\n",
|
1485 |
"result_df"
|
1486 |
]
|
|
|
1502 |
"source": [
|
1503 |
"from sklearn.model_selection import train_test_split\n",
|
1504 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
1505 |
+
"\n",
|
1506 |
"# from sklearn.feature_extraction.text import CountVectorizer\n",
|
1507 |
"from sklearn.naive_bayes import MultinomialNB\n",
|
1508 |
"from sklearn.metrics import accuracy_score, classification_report"
|
|
|
1526 |
"metadata": {},
|
1527 |
"outputs": [],
|
1528 |
"source": [
|
1529 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
1530 |
+
" X, y, test_size=0.2, random_state=42\n",
|
1531 |
+
")"
|
1532 |
]
|
1533 |
},
|
1534 |
{
|
|
|
1542 |
"# X_train_vec = vectorizer.fit_transform(X_train)\n",
|
1543 |
"# X_test_vec = vectorizer.transform(X_test)\n",
|
1544 |
"\n",
|
1545 |
+
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
|
1546 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
1547 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
1548 |
]
|
|
|
1668 |
],
|
1669 |
"source": [
|
1670 |
"from sklearn.model_selection import GridSearchCV\n",
|
1671 |
+
"\n",
|
1672 |
+
"param_grid = {\"alpha\": [0.1, 0.5, 1.0, 2.0]}\n",
|
1673 |
"\n",
|
1674 |
"# Initialize the grid search\n",
|
1675 |
+
"grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring=\"accuracy\")\n",
|
1676 |
"\n",
|
1677 |
"# Perform the grid search\n",
|
1678 |
"grid_search.fit(X_train_tfidf, y_train)\n",
|
1679 |
"\n",
|
1680 |
"# Get the best hyperparameters\n",
|
1681 |
+
"best_alpha = grid_search.best_params_[\"alpha\"]\n",
|
1682 |
"print(\"Best Alpha:\", best_alpha)\n",
|
1683 |
"\n",
|
1684 |
"# Train the model with the best alpha\n",
|
|
|
1738 |
"X = result_df[\"Details_cleaned\"]\n",
|
1739 |
"y = result_df[\"Summarized_label\"]\n",
|
1740 |
"\n",
|
1741 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
1742 |
+
" X, y, test_size=0.2, random_state=42\n",
|
1743 |
+
")\n",
|
1744 |
"\n",
|
1745 |
"start_time = time.time()\n",
|
1746 |
+
"tfidf_vectorizer = TfidfVectorizer(max_features=4000)\n",
|
1747 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
1748 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
|
1749 |
"\n",
|
1750 |
+
"naive_bayes = MultinomialNB(alpha=0.1)\n",
|
1751 |
"naive_bayes.fit(X_train_tfidf, y_train)\n",
|
1752 |
"\n",
|
1753 |
"predictions = naive_bayes.predict(X_test_tfidf)\n",
|
|
|
1790 |
"metadata": {},
|
1791 |
"outputs": [],
|
1792 |
"source": [
|
1793 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
1794 |
+
" X, y, test_size=0.2, random_state=42\n",
|
1795 |
+
")"
|
1796 |
]
|
1797 |
},
|
1798 |
{
|
|
|
1802 |
"metadata": {},
|
1803 |
"outputs": [],
|
1804 |
"source": [
|
1805 |
+
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
|
1806 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
1807 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
1808 |
]
|
|
|
2142 |
"source": [
|
2143 |
"from sklearn.pipeline import Pipeline\n",
|
2144 |
"\n",
|
2145 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
2146 |
+
" X, y, test_size=0.2, random_state=42\n",
|
2147 |
+
")\n",
|
2148 |
"\n",
|
2149 |
"param_grid = {\n",
|
2150 |
+
" \"tfidf__max_features\": [500, 1000, 2000, 3000, 4000],\n",
|
2151 |
+
" \"model__C\": [0.1, 1.0, 10.0],\n",
|
2152 |
"}\n",
|
2153 |
"\n",
|
2154 |
+
"pipeline = Pipeline([(\"tfidf\", TfidfVectorizer()), (\"model\", LogisticRegression())])\n",
|
|
|
|
|
|
|
2155 |
"\n",
|
2156 |
+
"grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=\"accuracy\")\n",
|
2157 |
"\n",
|
2158 |
"grid_search.fit(X_train, y_train)\n",
|
2159 |
"\n",
|
|
|
2222 |
}
|
2223 |
],
|
2224 |
"source": [
|
2225 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
2226 |
+
" X, y, test_size=0.2, random_state=42\n",
|
2227 |
+
")\n",
|
2228 |
"\n",
|
2229 |
"start_time = time.time()\n",
|
2230 |
+
"tfidf_vectorizer = TfidfVectorizer(max_features=2000)\n",
|
2231 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
2232 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
|
2233 |
"\n",
|
|
|
2274 |
"metadata": {},
|
2275 |
"outputs": [],
|
2276 |
"source": [
|
2277 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
2278 |
+
" X, y, test_size=0.2, random_state=42\n",
|
2279 |
+
")"
|
2280 |
]
|
2281 |
},
|
2282 |
{
|
|
|
2312 |
}
|
2313 |
],
|
2314 |
"source": [
|
2315 |
+
"svm_model = SVC(kernel=\"linear\")\n",
|
2316 |
"svm_model.fit(X_train_tfidf, y_train)"
|
2317 |
]
|
2318 |
},
|
|
|
2379 |
],
|
2380 |
"source": [
|
2381 |
"from sklearn.model_selection import GridSearchCV\n",
|
2382 |
+
"\n",
|
2383 |
+
"param_grid = {\"C\": [0.1, 1, 10]}\n",
|
2384 |
"svm = SVC()\n",
|
2385 |
+
"grid_search = GridSearchCV(svm, param_grid, cv=5, scoring=\"accuracy\")\n",
|
2386 |
"grid_search.fit(X_train_tfidf, y_train)\n",
|
2387 |
+
"best_c = grid_search.best_params_[\"C\"]\n",
|
2388 |
"print(\"Best C:\", best_c)"
|
2389 |
]
|
2390 |
},
|
|
|
2418 |
}
|
2419 |
],
|
2420 |
"source": [
|
2421 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
2422 |
+
" X, y, test_size=0.2, random_state=42\n",
|
2423 |
+
")\n",
|
2424 |
"\n",
|
2425 |
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
|
2426 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
2427 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
|
2428 |
"\n",
|
2429 |
+
"svm_model = SVC(kernel=\"linear\", C=10)\n",
|
2430 |
"svm_model.fit(X_train_tfidf, y_train)\n",
|
2431 |
"\n",
|
2432 |
"y_pred = svm_model.predict(X_test_tfidf)\n",
|
|
|
2474 |
}
|
2475 |
],
|
2476 |
"source": [
|
2477 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
2478 |
+
" X, y, test_size=0.2, random_state=42\n",
|
2479 |
+
")\n",
|
2480 |
"\n",
|
2481 |
"start_time = time.time()\n",
|
2482 |
"tfidf_vectorizer = TfidfVectorizer(max_features=2000)\n",
|
2483 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
2484 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
|
2485 |
"\n",
|
2486 |
+
"svm_model = SVC(kernel=\"linear\")\n",
|
2487 |
"svm_model.fit(X_train_tfidf, y_train)\n",
|
2488 |
"\n",
|
2489 |
"y_pred = svm_model.predict(X_test_tfidf)\n",
|
|
|
2526 |
"metadata": {},
|
2527 |
"outputs": [],
|
2528 |
"source": [
|
2529 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
2530 |
+
" X, y, test_size=0.2, random_state=42\n",
|
2531 |
+
")"
|
2532 |
]
|
2533 |
},
|
2534 |
{
|
|
|
2538 |
"metadata": {},
|
2539 |
"outputs": [],
|
2540 |
"source": [
|
2541 |
+
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
|
2542 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
2543 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
2544 |
]
|
|
|
2652 |
}
|
2653 |
],
|
2654 |
"source": [
|
2655 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
2656 |
+
" X, y, test_size=0.2, random_state=42\n",
|
2657 |
+
")\n",
|
2658 |
"\n",
|
2659 |
"start_time = time.time()\n",
|
2660 |
+
"tfidf_vectorizer = TfidfVectorizer(max_features=2000)\n",
|
2661 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
2662 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
|
2663 |
"\n",
|
2664 |
+
"rf_model = RandomForestClassifier(\n",
|
2665 |
+
" n_estimators=300, min_samples_split=5, random_state=42\n",
|
2666 |
+
")\n",
|
2667 |
"rf_model.fit(X_train_tfidf, y_train)\n",
|
2668 |
"\n",
|
2669 |
"y_pred = rf_model.predict(X_test_tfidf)\n",
|
|
|
2716 |
"metadata": {},
|
2717 |
"outputs": [],
|
2718 |
"source": [
|
2719 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
2720 |
+
" X, y, test_size=0.2, random_state=42\n",
|
2721 |
+
")"
|
2722 |
]
|
2723 |
},
|
2724 |
{
|
|
|
2779 |
"from sklearn.model_selection import train_test_split\n",
|
2780 |
"from sklearn.neighbors import KNeighborsClassifier\n",
|
2781 |
"\n",
|
2782 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
2783 |
+
" X, y, test_size=0.2, random_state=42\n",
|
2784 |
+
")\n",
|
2785 |
"\n",
|
2786 |
+
"k_values = range(1, 21)\n",
|
2787 |
"\n",
|
2788 |
"train_scores = []\n",
|
2789 |
"test_scores = []\n",
|
|
|
2793 |
" # Train KNN classifier\n",
|
2794 |
" knn = KNeighborsClassifier(n_neighbors=k)\n",
|
2795 |
" knn.fit(X_train, y_train)\n",
|
2796 |
+
"\n",
|
2797 |
" # Calculate training and testing accuracy\n",
|
2798 |
" train_score = knn.score(X_train, y_train)\n",
|
2799 |
" test_score = knn.score(X_test, y_test)\n",
|
2800 |
+
"\n",
|
2801 |
" train_scores.append(train_score)\n",
|
2802 |
" test_scores.append(test_score)\n",
|
2803 |
"\n",
|
2804 |
"# Plot the performance scores\n",
|
2805 |
"plt.figure(figsize=(10, 6))\n",
|
2806 |
+
"plt.plot(k_values, train_scores, label=\"Train Accuracy\", marker=\"o\")\n",
|
2807 |
+
"plt.plot(k_values, test_scores, label=\"Test Accuracy\", marker=\"o\")\n",
|
2808 |
+
"plt.xlabel(\"Number of Neighbors (k)\")\n",
|
2809 |
+
"plt.ylabel(\"Accuracy\")\n",
|
2810 |
+
"plt.title(\"KNN Classifier Performance\")\n",
|
2811 |
"plt.xticks(np.arange(1, 21, step=1))\n",
|
2812 |
"plt.legend()\n",
|
2813 |
"plt.grid(True)\n",
|
|
|
2848 |
"source": [
|
2849 |
"knn = KNeighborsClassifier()\n",
|
2850 |
"\n",
|
2851 |
+
"param_grid = {\"weights\": [\"uniform\", \"distance\"], \"p\": [1, 2]}\n",
|
2852 |
"\n",
|
2853 |
+
"grid_search = GridSearchCV(\n",
|
2854 |
+
" estimator=knn, param_grid=param_grid, cv=5, scoring=\"accuracy\"\n",
|
2855 |
+
")\n",
|
2856 |
"\n",
|
2857 |
"grid_search.fit(X_train, y_train)\n",
|
2858 |
"\n",
|
|
|
2899 |
}
|
2900 |
],
|
2901 |
"source": [
|
2902 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
2903 |
+
" X, y, test_size=0.2, random_state=42\n",
|
2904 |
+
")\n",
|
2905 |
"\n",
|
2906 |
"start_time = time.time()\n",
|
2907 |
"\n",
|
2908 |
"k = 5\n",
|
2909 |
+
"knn_model = KNeighborsClassifier(n_neighbors=k, weights=\"distance\")\n",
|
2910 |
"knn_model.fit(X_train, y_train)\n",
|
2911 |
"\n",
|
2912 |
"y_pred = knn_model.predict(X_test)\n",
|
IS424_Data_Mining/code/Classification/Classification models on maritime incident.ipynb
CHANGED
@@ -195,9 +195,11 @@
|
|
195 |
"from nltk.tokenize import word_tokenize\n",
|
196 |
"from nltk.stem import WordNetLemmatizer\n",
|
197 |
"import string\n",
|
198 |
-
"
|
199 |
-
"#nltk.download('
|
200 |
-
"#nltk.download('
|
|
|
|
|
201 |
"\n",
|
202 |
"def clean_text(text):\n",
|
203 |
" # Lowercase\n",
|
@@ -207,13 +209,13 @@
|
|
207 |
" # Removing punctuation\n",
|
208 |
" tokens = [word for word in tokens if word not in string.punctuation]\n",
|
209 |
" # Removing stop words\n",
|
210 |
-
" stop_words = set(stopwords.words(
|
211 |
" tokens = [word for word in tokens if word not in stop_words]\n",
|
212 |
" # Lemmatization\n",
|
213 |
" lemmatizer = WordNetLemmatizer()\n",
|
214 |
" tokens = [lemmatizer.lemmatize(word) for word in tokens]\n",
|
215 |
-
"
|
216 |
-
" return
|
217 |
]
|
218 |
},
|
219 |
{
|
@@ -244,7 +246,8 @@
|
|
244 |
],
|
245 |
"source": [
|
246 |
"import nltk\n",
|
247 |
-
"
|
|
|
248 |
]
|
249 |
},
|
250 |
{
|
@@ -302,14 +305,16 @@
|
|
302 |
}
|
303 |
],
|
304 |
"source": [
|
305 |
-
"text_df = df[[\"Details\"
|
306 |
"text_df.info()\n",
|
307 |
-
"text_df[
|
308 |
-
"
|
309 |
-
"
|
310 |
-
"#
|
311 |
-
"#
|
312 |
-
"#
|
|
|
|
|
313 |
"text_df.info()"
|
314 |
]
|
315 |
},
|
@@ -423,6 +428,7 @@
|
|
423 |
"source": [
|
424 |
"from sklearn.model_selection import train_test_split\n",
|
425 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|
|
426 |
"# from sklearn.feature_extraction.text import CountVectorizer\n",
|
427 |
"from sklearn.naive_bayes import MultinomialNB\n",
|
428 |
"from sklearn.metrics import accuracy_score, classification_report"
|
@@ -446,7 +452,9 @@
|
|
446 |
"metadata": {},
|
447 |
"outputs": [],
|
448 |
"source": [
|
449 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
450 |
]
|
451 |
},
|
452 |
{
|
@@ -460,7 +468,7 @@
|
|
460 |
"# X_train_vec = vectorizer.fit_transform(X_train)\n",
|
461 |
"# X_test_vec = vectorizer.transform(X_test)\n",
|
462 |
"\n",
|
463 |
-
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)
|
464 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
465 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
466 |
]
|
@@ -554,7 +562,9 @@
|
|
554 |
"metadata": {},
|
555 |
"outputs": [],
|
556 |
"source": [
|
557 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
558 |
]
|
559 |
},
|
560 |
{
|
@@ -564,7 +574,7 @@
|
|
564 |
"metadata": {},
|
565 |
"outputs": [],
|
566 |
"source": [
|
567 |
-
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)
|
568 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
569 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
570 |
]
|
@@ -650,7 +660,9 @@
|
|
650 |
"metadata": {},
|
651 |
"outputs": [],
|
652 |
"source": [
|
653 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
654 |
]
|
655 |
},
|
656 |
{
|
@@ -683,7 +695,7 @@
|
|
683 |
}
|
684 |
],
|
685 |
"source": [
|
686 |
-
"svm_model = SVC(kernel
|
687 |
"svm_model.fit(X_train_tfidf, y_train)"
|
688 |
]
|
689 |
},
|
@@ -754,7 +766,9 @@
|
|
754 |
"metadata": {},
|
755 |
"outputs": [],
|
756 |
"source": [
|
757 |
-
"X_train, X_test, y_train, y_test = train_test_split(
|
|
|
|
|
758 |
]
|
759 |
},
|
760 |
{
|
@@ -764,7 +778,7 @@
|
|
764 |
"metadata": {},
|
765 |
"outputs": [],
|
766 |
"source": [
|
767 |
-
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)
|
768 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
769 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
770 |
]
|
|
|
195 |
"from nltk.tokenize import word_tokenize\n",
|
196 |
"from nltk.stem import WordNetLemmatizer\n",
|
197 |
"import string\n",
|
198 |
+
"\n",
|
199 |
+
"# nltk.download('punkt')\n",
|
200 |
+
"# nltk.download('stopwords')\n",
|
201 |
+
"# nltk.download('wordnet')\n",
|
202 |
+
"\n",
|
203 |
"\n",
|
204 |
"def clean_text(text):\n",
|
205 |
" # Lowercase\n",
|
|
|
209 |
" # Removing punctuation\n",
|
210 |
" tokens = [word for word in tokens if word not in string.punctuation]\n",
|
211 |
" # Removing stop words\n",
|
212 |
+
" stop_words = set(stopwords.words(\"english\"))\n",
|
213 |
" tokens = [word for word in tokens if word not in stop_words]\n",
|
214 |
" # Lemmatization\n",
|
215 |
" lemmatizer = WordNetLemmatizer()\n",
|
216 |
" tokens = [lemmatizer.lemmatize(word) for word in tokens]\n",
|
217 |
+
"\n",
|
218 |
+
" return \" \".join(tokens)"
|
219 |
]
|
220 |
},
|
221 |
{
|
|
|
246 |
],
|
247 |
"source": [
|
248 |
"import nltk\n",
|
249 |
+
"\n",
|
250 |
+
"nltk.download(\"omw-1.4\")"
|
251 |
]
|
252 |
},
|
253 |
{
|
|
|
305 |
}
|
306 |
],
|
307 |
"source": [
|
308 |
+
"text_df = df[[\"Details\", \"maritime_label\"]]\n",
|
309 |
"text_df.info()\n",
|
310 |
+
"text_df[\"Details_cleaned\"] = text_df[\"Details\"].apply(\n",
|
311 |
+
" lambda x: clean_text(x) if not isinstance(x, float) else None\n",
|
312 |
+
")\n",
|
313 |
+
"# no_nan_df[no_nan_df[\"Details\"].apply(lambda x: print(type(x)))]\n",
|
314 |
+
"# cleaned_df = text_df[text_df[\"Details\"].apply(lambda x: clean_text(x))]\n",
|
315 |
+
"# cleaned_df = df['Details'][1:2]\n",
|
316 |
+
"# type(no_nan_df[\"Details\"][0])\n",
|
317 |
+
"# print(clean_text(no_nan_df[\"Details\"][0]))\n",
|
318 |
"text_df.info()"
|
319 |
]
|
320 |
},
|
|
|
428 |
"source": [
|
429 |
"from sklearn.model_selection import train_test_split\n",
|
430 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
431 |
+
"\n",
|
432 |
"# from sklearn.feature_extraction.text import CountVectorizer\n",
|
433 |
"from sklearn.naive_bayes import MultinomialNB\n",
|
434 |
"from sklearn.metrics import accuracy_score, classification_report"
|
|
|
452 |
"metadata": {},
|
453 |
"outputs": [],
|
454 |
"source": [
|
455 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
456 |
+
" X, y, test_size=0.2, random_state=42\n",
|
457 |
+
")"
|
458 |
]
|
459 |
},
|
460 |
{
|
|
|
468 |
"# X_train_vec = vectorizer.fit_transform(X_train)\n",
|
469 |
"# X_test_vec = vectorizer.transform(X_test)\n",
|
470 |
"\n",
|
471 |
+
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
|
472 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
473 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
474 |
]
|
|
|
562 |
"metadata": {},
|
563 |
"outputs": [],
|
564 |
"source": [
|
565 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
566 |
+
" X, y, test_size=0.2, random_state=42\n",
|
567 |
+
")"
|
568 |
]
|
569 |
},
|
570 |
{
|
|
|
574 |
"metadata": {},
|
575 |
"outputs": [],
|
576 |
"source": [
|
577 |
+
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
|
578 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
579 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
580 |
]
|
|
|
660 |
"metadata": {},
|
661 |
"outputs": [],
|
662 |
"source": [
|
663 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
664 |
+
" X, y, test_size=0.2, random_state=42\n",
|
665 |
+
")"
|
666 |
]
|
667 |
},
|
668 |
{
|
|
|
695 |
}
|
696 |
],
|
697 |
"source": [
|
698 |
+
"svm_model = SVC(kernel=\"linear\")\n",
|
699 |
"svm_model.fit(X_train_tfidf, y_train)"
|
700 |
]
|
701 |
},
|
|
|
766 |
"metadata": {},
|
767 |
"outputs": [],
|
768 |
"source": [
|
769 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
770 |
+
" X, y, test_size=0.2, random_state=42\n",
|
771 |
+
")"
|
772 |
]
|
773 |
},
|
774 |
{
|
|
|
778 |
"metadata": {},
|
779 |
"outputs": [],
|
780 |
"source": [
|
781 |
+
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
|
782 |
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
|
783 |
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
|
784 |
]
|
IS424_Data_Mining/code/EDA.ipynb
CHANGED
@@ -72,17 +72,18 @@
|
|
72 |
}
|
73 |
],
|
74 |
"source": [
|
75 |
-
"# First, load the uploaded CSV file
|
76 |
"import pandas as pd\n",
|
77 |
-
"
|
|
|
78 |
"data = pd.read_csv(data_path)\n",
|
79 |
"\n",
|
80 |
"# Display the first few rows of the dataframe and its summary statistics to get an initial understanding\n",
|
81 |
"data_head = data.head()\n",
|
82 |
"data_info = data.info()\n",
|
83 |
-
"data_description = data.describe(include
|
84 |
"\n",
|
85 |
-
"data_info
|
86 |
]
|
87 |
},
|
88 |
{
|
@@ -857,12 +858,14 @@
|
|
857 |
"missing_values_percentage = (missing_values_count / len(data)) * 100\n",
|
858 |
"\n",
|
859 |
"# Combine count and percentage into a dataframe for easier reading\n",
|
860 |
-
"missing_values_df = pd.DataFrame(
|
861 |
-
"
|
862 |
-
"
|
863 |
-
"
|
|
|
|
|
864 |
"\n",
|
865 |
-
"missing_values_df.sort_values(by
|
866 |
]
|
867 |
},
|
868 |
{
|
@@ -887,8 +890,11 @@
|
|
887 |
}
|
888 |
],
|
889 |
"source": [
|
890 |
-
"columns_to_keep = [
|
891 |
-
"columns_to_drop = missing_values_percentage[
|
|
|
|
|
|
|
892 |
"\n",
|
893 |
"# Now drop the columns except for the ones we want to keep\n",
|
894 |
"data_cleaned = data.drop(columns=columns_to_drop)\n",
|
@@ -904,17 +910,17 @@
|
|
904 |
"outputs": [],
|
905 |
"source": [
|
906 |
"# Drop the specified columns\n",
|
907 |
-
"data_cleaned = data_cleaned.drop(columns=[
|
908 |
"\n",
|
909 |
"# Create a new 'id' column starting from 1\n",
|
910 |
-
"data_cleaned[
|
911 |
"\n",
|
912 |
"# Optionally, if you want 'id' to be the first column, you can rearrange the columns like this:\n",
|
913 |
-
"cols = [
|
914 |
"data_cleaned = data_cleaned[cols]\n",
|
915 |
"\n",
|
916 |
-
"# Now we can save the modified DataFrame to a CSV as previously described :))))) yayayyyy
|
917 |
-
"data_cleaned.to_csv(
|
918 |
]
|
919 |
},
|
920 |
{
|
@@ -923,7 +929,9 @@
|
|
923 |
"metadata": {},
|
924 |
"outputs": [],
|
925 |
"source": [
|
926 |
-
"data_cleaned[
|
|
|
|
|
927 |
"\n",
|
928 |
"# Now, the DataFrame `data_cleaned` has a new column 'Headline_Details' combining the texts"
|
929 |
]
|
@@ -934,7 +942,7 @@
|
|
934 |
"metadata": {},
|
935 |
"outputs": [],
|
936 |
"source": [
|
937 |
-
"data_cleaned.to_csv(
|
938 |
]
|
939 |
},
|
940 |
{
|
@@ -989,7 +997,7 @@
|
|
989 |
}
|
990 |
],
|
991 |
"source": [
|
992 |
-
"data[
|
993 |
]
|
994 |
},
|
995 |
{
|
@@ -1033,7 +1041,7 @@
|
|
1033 |
}
|
1034 |
],
|
1035 |
"source": [
|
1036 |
-
"data[
|
1037 |
]
|
1038 |
},
|
1039 |
{
|
@@ -1850,7 +1858,7 @@
|
|
1850 |
}
|
1851 |
],
|
1852 |
"source": [
|
1853 |
-
"data[
|
1854 |
]
|
1855 |
},
|
1856 |
{
|
@@ -1877,7 +1885,7 @@
|
|
1877 |
"metadata": {},
|
1878 |
"outputs": [],
|
1879 |
"source": [
|
1880 |
-
"severity_counts = data[
|
1881 |
]
|
1882 |
},
|
1883 |
{
|
@@ -1898,10 +1906,15 @@
|
|
1898 |
],
|
1899 |
"source": [
|
1900 |
"plt.figure(figsize=(12, 6)) # Adjust size as needed\n",
|
1901 |
-
"plt.pie(
|
1902 |
-
"
|
1903 |
-
"
|
1904 |
-
"
|
|
|
|
|
|
|
|
|
|
|
1905 |
]
|
1906 |
},
|
1907 |
{
|
@@ -1917,7 +1930,7 @@
|
|
1917 |
"metadata": {},
|
1918 |
"outputs": [],
|
1919 |
"source": [
|
1920 |
-
"minor_cases = data[data[
|
1921 |
]
|
1922 |
},
|
1923 |
{
|
@@ -1926,7 +1939,7 @@
|
|
1926 |
"metadata": {},
|
1927 |
"outputs": [],
|
1928 |
"source": [
|
1929 |
-
"country_counts = minor_cases[
|
1930 |
]
|
1931 |
},
|
1932 |
{
|
@@ -1935,8 +1948,6 @@
|
|
1935 |
"metadata": {},
|
1936 |
"outputs": [],
|
1937 |
"source": [
|
1938 |
-
"\n",
|
1939 |
-
"\n",
|
1940 |
"# Keep the top 3 countries\n",
|
1941 |
"top_countries = country_counts.nlargest(3)\n",
|
1942 |
"\n",
|
@@ -1947,8 +1958,7 @@
|
|
1947 |
"top_countries_series = top_countries\n",
|
1948 |
"\n",
|
1949 |
"# Add the 'Rest' category by assigning it directly to the Series\n",
|
1950 |
-
"top_countries_series[
|
1951 |
-
"\n"
|
1952 |
]
|
1953 |
},
|
1954 |
{
|
@@ -1981,11 +1991,16 @@
|
|
1981 |
"\n",
|
1982 |
"# Create the pie chart with matplotlib, using the custom seaborn color palette\n",
|
1983 |
"plt.figure(figsize=(10, 6))\n",
|
1984 |
-
"plt.pie(
|
1985 |
-
"
|
|
|
|
|
|
|
|
|
|
|
1986 |
"\n",
|
1987 |
"plt.title(\"Distribution of 'Moderate' Cases Among Top 5 Countries and Rest\")\n",
|
1988 |
-
"plt.show()
|
1989 |
]
|
1990 |
},
|
1991 |
{
|
@@ -2006,17 +2021,21 @@
|
|
2006 |
],
|
2007 |
"source": [
|
2008 |
"# Count the occurrences of each category and select the top 10\n",
|
2009 |
-
"top_categories = data[
|
2010 |
"\n",
|
2011 |
"# Filter the DataFrame to include only the top 10 categories\n",
|
2012 |
-
"data_top_categories = data[data[
|
2013 |
"\n",
|
2014 |
"# Plot\n",
|
2015 |
"plt.figure(figsize=(12, 8)) # Adjust size as needed\n",
|
2016 |
-
"sns.countplot(
|
2017 |
-
"
|
2018 |
-
"
|
2019 |
-
"
|
|
|
|
|
|
|
|
|
2020 |
"plt.show()"
|
2021 |
]
|
2022 |
},
|
@@ -2034,15 +2053,14 @@
|
|
2034 |
"outputs": [],
|
2035 |
"source": [
|
2036 |
"# Filter data for China and United States\n",
|
2037 |
-
"china_cases = data[data[
|
2038 |
-
"us_cases = data[data[
|
2039 |
"\n",
|
2040 |
"# Get top 5 event categories for China\n",
|
2041 |
-
"china_top_5 = china_cases[
|
2042 |
"\n",
|
2043 |
"# Get top 5 event categories for United States\n",
|
2044 |
-
"us_top_5 = us_cases[
|
2045 |
-
"\n"
|
2046 |
]
|
2047 |
},
|
2048 |
{
|
@@ -2052,8 +2070,12 @@
|
|
2052 |
"outputs": [],
|
2053 |
"source": [
|
2054 |
"# Convert Series to DataFrame\n",
|
2055 |
-
"china_plot_data = china_top_5.reset_index().rename(
|
2056 |
-
"
|
|
|
|
|
|
|
|
|
2057 |
]
|
2058 |
},
|
2059 |
{
|
@@ -2191,38 +2213,43 @@
|
|
2191 |
"\n",
|
2192 |
"# Plot for China\n",
|
2193 |
"plt.figure(figsize=(10, 6))\n",
|
2194 |
-
"ax_china = sns.barplot(
|
2195 |
-
"
|
2196 |
-
"
|
2197 |
-
"plt.
|
|
|
|
|
2198 |
"\n",
|
2199 |
"# Loop through the bars and add text annotation\n",
|
2200 |
"for p in ax_china.patches:\n",
|
2201 |
" width = p.get_width()\n",
|
2202 |
-
" plt.text(
|
2203 |
-
"
|
2204 |
-
"
|
2205 |
-
"
|
|
|
|
|
2206 |
"\n",
|
2207 |
"plt.show()\n",
|
2208 |
"\n",
|
2209 |
"# Plot for United States\n",
|
2210 |
"plt.figure(figsize=(10, 6))\n",
|
2211 |
-
"ax_us = sns.barplot(x
|
2212 |
-
"plt.title(
|
2213 |
-
"plt.xlabel(
|
2214 |
-
"plt.ylabel(
|
2215 |
"\n",
|
2216 |
"# Loop through the bars and add text annotation for the US plot\n",
|
2217 |
"for p in ax_us.patches:\n",
|
2218 |
" width = p.get_width()\n",
|
2219 |
-
" plt.text(
|
2220 |
-
"
|
2221 |
-
"
|
2222 |
-
"
|
|
|
|
|
2223 |
"\n",
|
2224 |
-
"plt.show()
|
2225 |
-
"\n"
|
2226 |
]
|
2227 |
}
|
2228 |
],
|
|
|
72 |
}
|
73 |
],
|
74 |
"source": [
|
75 |
+
"# First, load the uploaded CSV file\n",
|
76 |
"import pandas as pd\n",
|
77 |
+
"\n",
|
78 |
+
"data_path = \"all_port_labelled.csv\"\n",
|
79 |
"data = pd.read_csv(data_path)\n",
|
80 |
"\n",
|
81 |
"# Display the first few rows of the dataframe and its summary statistics to get an initial understanding\n",
|
82 |
"data_head = data.head()\n",
|
83 |
"data_info = data.info()\n",
|
84 |
+
"data_description = data.describe(include=\"all\")\n",
|
85 |
"\n",
|
86 |
+
"data_info"
|
87 |
]
|
88 |
},
|
89 |
{
|
|
|
858 |
"missing_values_percentage = (missing_values_count / len(data)) * 100\n",
|
859 |
"\n",
|
860 |
"# Combine count and percentage into a dataframe for easier reading\n",
|
861 |
+
"missing_values_df = pd.DataFrame(\n",
|
862 |
+
" {\n",
|
863 |
+
" \"Missing Values\": missing_values_count,\n",
|
864 |
+
" \"Percentage (%)\": missing_values_percentage,\n",
|
865 |
+
" }\n",
|
866 |
+
")\n",
|
867 |
"\n",
|
868 |
+
"missing_values_df.sort_values(by=\"Missing Values\", ascending=False)"
|
869 |
]
|
870 |
},
|
871 |
{
|
|
|
890 |
}
|
891 |
],
|
892 |
"source": [
|
893 |
+
"columns_to_keep = [\"lat\", \"lon\"]\n",
|
894 |
+
"columns_to_drop = missing_values_percentage[\n",
|
895 |
+
" (missing_values_percentage > 30)\n",
|
896 |
+
" & (~missing_values_percentage.index.isin(columns_to_keep))\n",
|
897 |
+
"].index\n",
|
898 |
"\n",
|
899 |
"# Now drop the columns except for the ones we want to keep\n",
|
900 |
"data_cleaned = data.drop(columns=columns_to_drop)\n",
|
|
|
910 |
"outputs": [],
|
911 |
"source": [
|
912 |
"# Drop the specified columns\n",
|
913 |
+
"data_cleaned = data_cleaned.drop(columns=[\"Unnamed: 0\", \"Index\", \"Unnamed: 0.1\"])\n",
|
914 |
"\n",
|
915 |
"# Create a new 'id' column starting from 1\n",
|
916 |
+
"data_cleaned[\"id\"] = range(1, len(data_cleaned) + 1)\n",
|
917 |
"\n",
|
918 |
"# Optionally, if you want 'id' to be the first column, you can rearrange the columns like this:\n",
|
919 |
+
"cols = [\"id\"] + [col for col in data_cleaned.columns if col != \"id\"]\n",
|
920 |
"data_cleaned = data_cleaned[cols]\n",
|
921 |
"\n",
|
922 |
+
"# Now we can save the modified DataFrame to a CSV as previously described :))))) yayayyyy\n",
|
923 |
+
"data_cleaned.to_csv(\"cleaned_data.csv\", index=False)"
|
924 |
]
|
925 |
},
|
926 |
{
|
|
|
929 |
"metadata": {},
|
930 |
"outputs": [],
|
931 |
"source": [
|
932 |
+
"data_cleaned[\"Headline_Details\"] = (\n",
|
933 |
+
" data_cleaned[\"Headline\"] + \" \" + data_cleaned[\"Details\"]\n",
|
934 |
+
")\n",
|
935 |
"\n",
|
936 |
"# Now, the DataFrame `data_cleaned` has a new column 'Headline_Details' combining the texts"
|
937 |
]
|
|
|
942 |
"metadata": {},
|
943 |
"outputs": [],
|
944 |
"source": [
|
945 |
+
"data_cleaned.to_csv(\"cleaned_data.csv\", index=False)"
|
946 |
]
|
947 |
},
|
948 |
{
|
|
|
997 |
}
|
998 |
],
|
999 |
"source": [
|
1000 |
+
"data[\"Region\"].value_counts()"
|
1001 |
]
|
1002 |
},
|
1003 |
{
|
|
|
1041 |
}
|
1042 |
],
|
1043 |
"source": [
|
1044 |
+
"data[\"Region\"].unique()"
|
1045 |
]
|
1046 |
},
|
1047 |
{
|
|
|
1858 |
}
|
1859 |
],
|
1860 |
"source": [
|
1861 |
+
"data[\"Category\"].unique()"
|
1862 |
]
|
1863 |
},
|
1864 |
{
|
|
|
1885 |
"metadata": {},
|
1886 |
"outputs": [],
|
1887 |
"source": [
|
1888 |
+
"severity_counts = data[\"Severity\"].value_counts()"
|
1889 |
]
|
1890 |
},
|
1891 |
{
|
|
|
1906 |
],
|
1907 |
"source": [
|
1908 |
"plt.figure(figsize=(12, 6)) # Adjust size as needed\n",
|
1909 |
+
"plt.pie(\n",
|
1910 |
+
" severity_counts,\n",
|
1911 |
+
" labels=severity_counts.index,\n",
|
1912 |
+
" autopct=lambda p: f\"{int(p/100.*severity_counts.sum())} ({p:.1f}%)\",\n",
|
1913 |
+
" startangle=140,\n",
|
1914 |
+
" counterclock=False,\n",
|
1915 |
+
")\n",
|
1916 |
+
"plt.title(\"Event Severity Distribution\")\n",
|
1917 |
+
"plt.show()"
|
1918 |
]
|
1919 |
},
|
1920 |
{
|
|
|
1930 |
"metadata": {},
|
1931 |
"outputs": [],
|
1932 |
"source": [
|
1933 |
+
"minor_cases = data[data[\"Severity\"] == \"Moderate\"].copy()"
|
1934 |
]
|
1935 |
},
|
1936 |
{
|
|
|
1939 |
"metadata": {},
|
1940 |
"outputs": [],
|
1941 |
"source": [
|
1942 |
+
"country_counts = minor_cases[\"Region\"].value_counts()"
|
1943 |
]
|
1944 |
},
|
1945 |
{
|
|
|
1948 |
"metadata": {},
|
1949 |
"outputs": [],
|
1950 |
"source": [
|
|
|
|
|
1951 |
"# Keep the top 3 countries\n",
|
1952 |
"top_countries = country_counts.nlargest(3)\n",
|
1953 |
"\n",
|
|
|
1958 |
"top_countries_series = top_countries\n",
|
1959 |
"\n",
|
1960 |
"# Add the 'Rest' category by assigning it directly to the Series\n",
|
1961 |
+
"top_countries_series[\"Rest\"] = rest_count"
|
|
|
1962 |
]
|
1963 |
},
|
1964 |
{
|
|
|
1991 |
"\n",
|
1992 |
"# Create the pie chart with matplotlib, using the custom seaborn color palette\n",
|
1993 |
"plt.figure(figsize=(10, 6))\n",
|
1994 |
+
"plt.pie(\n",
|
1995 |
+
" top_countries_series,\n",
|
1996 |
+
" labels=top_countries_series.index,\n",
|
1997 |
+
" autopct=\"%1.1f%%\",\n",
|
1998 |
+
" startangle=90,\n",
|
1999 |
+
" colors=palette,\n",
|
2000 |
+
")\n",
|
2001 |
"\n",
|
2002 |
"plt.title(\"Distribution of 'Moderate' Cases Among Top 5 Countries and Rest\")\n",
|
2003 |
+
"plt.show()"
|
2004 |
]
|
2005 |
},
|
2006 |
{
|
|
|
2021 |
],
|
2022 |
"source": [
|
2023 |
"# Count the occurrences of each category and select the top 10\n",
|
2024 |
+
"top_categories = data[\"Category\"].value_counts().nlargest(10).index\n",
|
2025 |
"\n",
|
2026 |
"# Filter the DataFrame to include only the top 10 categories\n",
|
2027 |
+
"data_top_categories = data[data[\"Category\"].isin(top_categories)]\n",
|
2028 |
"\n",
|
2029 |
"# Plot\n",
|
2030 |
"plt.figure(figsize=(12, 8)) # Adjust size as needed\n",
|
2031 |
+
"sns.countplot(\n",
|
2032 |
+
" y=\"Category\",\n",
|
2033 |
+
" data=data_top_categories,\n",
|
2034 |
+
" order=data_top_categories[\"Category\"].value_counts().index,\n",
|
2035 |
+
")\n",
|
2036 |
+
"plt.title(\"Top 10 Event Categories Distribution\")\n",
|
2037 |
+
"plt.xlabel(\"Count\")\n",
|
2038 |
+
"plt.ylabel(\"Category\")\n",
|
2039 |
"plt.show()"
|
2040 |
]
|
2041 |
},
|
|
|
2053 |
"outputs": [],
|
2054 |
"source": [
|
2055 |
"# Filter data for China and United States\n",
|
2056 |
+
"china_cases = data[data[\"Region\"] == \"China\"]\n",
|
2057 |
+
"us_cases = data[data[\"Region\"] == \"United States\"]\n",
|
2058 |
"\n",
|
2059 |
"# Get top 5 event categories for China\n",
|
2060 |
+
"china_top_5 = china_cases[\"Category\"].value_counts().nlargest(5)\n",
|
2061 |
"\n",
|
2062 |
"# Get top 5 event categories for United States\n",
|
2063 |
+
"us_top_5 = us_cases[\"Category\"].value_counts().nlargest(5)"
|
|
|
2064 |
]
|
2065 |
},
|
2066 |
{
|
|
|
2070 |
"outputs": [],
|
2071 |
"source": [
|
2072 |
"# Convert Series to DataFrame\n",
|
2073 |
+
"china_plot_data = china_top_5.reset_index().rename(\n",
|
2074 |
+
" columns={\"index\": \"Category\", \"Category\": \"Category\"}\n",
|
2075 |
+
")\n",
|
2076 |
+
"us_plot_data = us_top_5.reset_index().rename(\n",
|
2077 |
+
" columns={\"index\": \"Category\", \"Category\": \"Category\"}\n",
|
2078 |
+
")"
|
2079 |
]
|
2080 |
},
|
2081 |
{
|
|
|
2213 |
"\n",
|
2214 |
"# Plot for China\n",
|
2215 |
"plt.figure(figsize=(10, 6))\n",
|
2216 |
+
"ax_china = sns.barplot(\n",
|
2217 |
+
" x=\"count\", y=\"Category\", data=china_plot_data, palette=\"Oranges_r\"\n",
|
2218 |
+
")\n",
|
2219 |
+
"plt.title(\"Top 5 Event Categories in China\")\n",
|
2220 |
+
"plt.xlabel(\"Number of Events\")\n",
|
2221 |
+
"plt.ylabel(\"Event Category\")\n",
|
2222 |
"\n",
|
2223 |
"# Loop through the bars and add text annotation\n",
|
2224 |
"for p in ax_china.patches:\n",
|
2225 |
" width = p.get_width()\n",
|
2226 |
+
" plt.text(\n",
|
2227 |
+
" width + 1, # x position, shifted +1 to the right for spacing\n",
|
2228 |
+
" p.get_y() + p.get_height() / 2, # y position, at the center of the bar\n",
|
2229 |
+
" f\"{int(width)}\", # text label, the count of events\n",
|
2230 |
+
" va=\"center\",\n",
|
2231 |
+
" ) # center alignment\n",
|
2232 |
"\n",
|
2233 |
"plt.show()\n",
|
2234 |
"\n",
|
2235 |
"# Plot for United States\n",
|
2236 |
"plt.figure(figsize=(10, 6))\n",
|
2237 |
+
"ax_us = sns.barplot(x=\"count\", y=\"Category\", data=us_plot_data, palette=\"Blues_r\")\n",
|
2238 |
+
"plt.title(\"Top 5 Event Categories in the United States\")\n",
|
2239 |
+
"plt.xlabel(\"Number of Events\")\n",
|
2240 |
+
"plt.ylabel(\"Event Category\")\n",
|
2241 |
"\n",
|
2242 |
"# Loop through the bars and add text annotation for the US plot\n",
|
2243 |
"for p in ax_us.patches:\n",
|
2244 |
" width = p.get_width()\n",
|
2245 |
+
" plt.text(\n",
|
2246 |
+
" width + 1, # x position, shifted +1 to the right for spacing\n",
|
2247 |
+
" p.get_y() + p.get_height() / 2, # y position, at the center of the bar\n",
|
2248 |
+
" f\"{int(width)}\", # text label, the count of events\n",
|
2249 |
+
" va=\"center\",\n",
|
2250 |
+
" ) # center alignment\n",
|
2251 |
"\n",
|
2252 |
+
"plt.show()"
|
|
|
2253 |
]
|
2254 |
}
|
2255 |
],
|
IS424_Data_Mining/code/GPT/Pipeline.ipynb
CHANGED
@@ -34,43 +34,52 @@
|
|
34 |
"from bs4 import BeautifulSoup\n",
|
35 |
"from datetime import datetime\n",
|
36 |
"\n",
|
|
|
37 |
"def get_article_details(article_url):\n",
|
38 |
" response = requests.get(article_url)\n",
|
39 |
-
" soup = BeautifulSoup(response.content,
|
40 |
-
"
|
41 |
" # Extract headline\n",
|
42 |
-
" headline_tag = soup.find(
|
43 |
-
" headline =
|
44 |
-
"
|
|
|
|
|
45 |
" # Attempt to extract publication date with error handling\n",
|
46 |
-
" date_container = soup.find(
|
47 |
" if date_container:\n",
|
48 |
" # Extract the text and handle cleaning it up\n",
|
49 |
" date_text = date_container.get_text(strip=True)\n",
|
50 |
" # Extract the first date assuming it's the publication date (before \"Updated:\")\n",
|
51 |
" publication_date_text = date_text.split(\"(Updated:\")[0].strip()\n",
|
52 |
" try:\n",
|
53 |
-
" publication_date = datetime.strptime(
|
|
|
|
|
54 |
" except ValueError:\n",
|
55 |
-
" publication_date =
|
56 |
" else:\n",
|
57 |
-
" publication_date =
|
58 |
-
"
|
59 |
" # Extract main content of the article\n",
|
60 |
-
" article_body = soup.find(
|
61 |
" if not article_body:\n",
|
62 |
" article_body = soup\n",
|
63 |
-
" article_text =
|
64 |
-
"
|
|
|
|
|
|
|
|
|
65 |
" return headline, publication_date, article_text\n",
|
66 |
"\n",
|
|
|
67 |
"# Example usage\n",
|
68 |
"article_url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
|
69 |
"headline, publication_date, article_content = get_article_details(article_url)\n",
|
70 |
"print(\"Headline:\", headline)\n",
|
71 |
"print(\"Publication Date:\", publication_date)\n",
|
72 |
-
"print(\"Content:\", article_content[:500]) # Print the first 500 characters to check
|
73 |
-
"\n"
|
74 |
]
|
75 |
},
|
76 |
{
|
@@ -97,7 +106,8 @@
|
|
97 |
"import openai\n",
|
98 |
"\n",
|
99 |
"\n",
|
100 |
-
"openai.api_key =
|
|
|
101 |
"\n",
|
102 |
"def summarize_article(article_content):\n",
|
103 |
" try:\n",
|
@@ -110,15 +120,15 @@
|
|
110 |
"\n",
|
111 |
" # Call to OpenAI's Completion API\n",
|
112 |
" response = openai.Completion.create(\n",
|
113 |
-
" engine=\"gpt-3.5-turbo-instruct\"
|
114 |
" prompt=prompt_text,\n",
|
115 |
" temperature=0.5,\n",
|
116 |
" max_tokens=60, # Adjust as needed to fit the summary length\n",
|
117 |
" top_p=1,\n",
|
118 |
" frequency_penalty=0,\n",
|
119 |
-
" presence_penalty=0
|
120 |
" )\n",
|
121 |
-
"
|
122 |
" # Extracting the text from the response\n",
|
123 |
" summary = response.choices[0].text.strip()\n",
|
124 |
" return summary\n",
|
@@ -126,8 +136,9 @@
|
|
126 |
" print(f\"An error occurred: {e}\")\n",
|
127 |
" return \"\"\n",
|
128 |
"\n",
|
|
|
129 |
"summary = summarize_article(article_content)\n",
|
130 |
-
"print(\"Summary:\", summary)
|
131 |
]
|
132 |
},
|
133 |
{
|
@@ -463,7 +474,7 @@
|
|
463 |
}
|
464 |
],
|
465 |
"source": [
|
466 |
-
"unique_categories = df[
|
467 |
"print(unique_categories)"
|
468 |
]
|
469 |
},
|
@@ -475,6 +486,7 @@
|
|
475 |
"source": [
|
476 |
"import openai\n",
|
477 |
"\n",
|
|
|
478 |
"def classify_article(article_content):\n",
|
479 |
" prompt = f\"\"\"Read the following article and classify its content into one of these categories: 'Aviation Advisory',\n",
|
480 |
"'Bombing',\n",
|
@@ -521,18 +533,17 @@
|
|
521 |
"Category:\"\"\"\n",
|
522 |
"\n",
|
523 |
" response = openai.Completion.create(\n",
|
524 |
-
"
|
525 |
-
"
|
526 |
-
"
|
527 |
-
"
|
528 |
-
"
|
529 |
-
"
|
530 |
-
"
|
531 |
-
"
|
532 |
" )\n",
|
533 |
" category = response.choices[0].text.strip()\n",
|
534 |
-
" return category
|
535 |
-
"\n"
|
536 |
]
|
537 |
},
|
538 |
{
|
@@ -544,11 +555,12 @@
|
|
544 |
"import requests\n",
|
545 |
"from bs4 import BeautifulSoup\n",
|
546 |
"\n",
|
|
|
547 |
"def fetch_article_content(url):\n",
|
548 |
" response = requests.get(url)\n",
|
549 |
-
" soup = BeautifulSoup(response.content,
|
550 |
-
" article_text =
|
551 |
-
" return article_text
|
552 |
]
|
553 |
},
|
554 |
{
|
@@ -571,7 +583,7 @@
|
|
571 |
"def classify_article(url):\n",
|
572 |
" # Fetch article content\n",
|
573 |
" article_content = fetch_article_content(url)\n",
|
574 |
-
"
|
575 |
" # Construct the classification prompt\n",
|
576 |
" prompt = f\"\"\"Read the following article and classify its content into one of these categories:\n",
|
577 |
" 'Aviation Advisory', 'Bombing',\n",
|
@@ -618,25 +630,26 @@
|
|
618 |
" {article_content}\n",
|
619 |
" \n",
|
620 |
" Category:\"\"\"\n",
|
621 |
-
"
|
622 |
" # Classify using OpenAI GPT-3\n",
|
623 |
" response = openai.Completion.create(\n",
|
624 |
-
"
|
625 |
-
"
|
626 |
-
"
|
627 |
-
"
|
628 |
-
"
|
629 |
-
"
|
630 |
-
"
|
631 |
-
"
|
632 |
" )\n",
|
633 |
" category = response.choices[0].text.strip()\n",
|
634 |
" return category\n",
|
635 |
"\n",
|
|
|
636 |
"# Example usage\n",
|
637 |
"url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
|
638 |
"category = classify_article(url)\n",
|
639 |
-
"print(\"Category:\", category)
|
640 |
]
|
641 |
},
|
642 |
{
|
@@ -687,6 +700,7 @@
|
|
687 |
"import pandas as pd\n",
|
688 |
"from datetime import datetime\n",
|
689 |
"\n",
|
|
|
690 |
"def update_database(file_path, url):\n",
|
691 |
" # Fetch details from the article\n",
|
692 |
" headline, publication_date, article_content = get_article_details(article_url)\n",
|
@@ -694,47 +708,50 @@
|
|
694 |
" category = classify_article(url)\n",
|
695 |
"\n",
|
696 |
" new_data = {\n",
|
697 |
-
"
|
698 |
-
"
|
699 |
-
"
|
700 |
-
"
|
701 |
-
"
|
702 |
" }\n",
|
703 |
-
"
|
704 |
-
" \n",
|
705 |
" # Load the existing data from the CSV file\n",
|
706 |
" try:\n",
|
707 |
" df = pd.read_csv(file_path)\n",
|
708 |
" except FileNotFoundError:\n",
|
709 |
" # If the file does not exist, create a new DataFrame\n",
|
710 |
-
" df = pd.DataFrame(
|
|
|
|
|
711 |
" new_id = 1 # Start with ID 1 if no file exists\n",
|
712 |
" else:\n",
|
713 |
" # If IDs exist, increment from the last used ID\n",
|
714 |
-
" new_id = df[
|
715 |
-
"
|
716 |
" # Prepare the new data entry\n",
|
717 |
-
" new_entry = pd.DataFrame(
|
718 |
-
"
|
719 |
-
"
|
720 |
-
"
|
721 |
-
"
|
722 |
-
"
|
723 |
-
"
|
724 |
-
"
|
725 |
-
"
|
|
|
|
|
726 |
" # Append the new data entry to the DataFrame using concat\n",
|
727 |
" df = pd.concat([df, new_entry], ignore_index=True)\n",
|
728 |
-
"
|
729 |
" # Save the updated DataFrame back to CSV\n",
|
730 |
" df.to_csv(file_path, index=False)\n",
|
731 |
" print(f\"Database updated successfully with ID {new_id}.\")\n",
|
732 |
"\n",
|
|
|
733 |
"# Example usage\n",
|
734 |
"url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
|
735 |
-
"file_path =
|
736 |
-
"update_database(file_path, url)
|
737 |
-
"\n"
|
738 |
]
|
739 |
},
|
740 |
{
|
@@ -807,6 +824,7 @@
|
|
807 |
"source": [
|
808 |
"import pandas as pd\n",
|
809 |
"\n",
|
|
|
810 |
"def rank_related_articles(file_path, category):\n",
|
811 |
" # Load the existing data from the CSV file\n",
|
812 |
" try:\n",
|
@@ -814,24 +832,25 @@
|
|
814 |
" except FileNotFoundError:\n",
|
815 |
" print(\"Database file not found.\")\n",
|
816 |
" return\n",
|
817 |
-
"
|
818 |
" # Filter articles by the specified category\n",
|
819 |
-
" filtered_df = df[df[
|
820 |
-
"
|
821 |
" # Convert 'Datetime' from string to datetime objects for accurate sorting\n",
|
822 |
-
" filtered_df[
|
823 |
-
"
|
824 |
" # Sort articles by 'Datetime' in descending order to get the most recent articles first\n",
|
825 |
-
" sorted_df = filtered_df.sort_values(by
|
826 |
-
"
|
827 |
" # Display the sorted DataFrame\n",
|
828 |
-
" print(sorted_df[[
|
829 |
" return sorted_df\n",
|
830 |
"\n",
|
|
|
831 |
"# Example usage\n",
|
832 |
-
"file_path =
|
833 |
-
"category =
|
834 |
-
"ranked_articles = rank_related_articles(file_path, category)
|
835 |
]
|
836 |
},
|
837 |
{
|
@@ -878,22 +897,24 @@
|
|
878 |
"import pandas as pd\n",
|
879 |
"from tabulate import tabulate\n",
|
880 |
"\n",
|
|
|
881 |
"def print_ranked_articles_tabulate(file_path, category):\n",
|
882 |
" try:\n",
|
883 |
" df = pd.read_csv(file_path)\n",
|
884 |
-
" df[
|
885 |
-
" filtered_df = df[df[
|
886 |
-
" sorted_df = filtered_df.sort_values(by
|
887 |
-
"
|
888 |
" # Print DataFrame using tabulate\n",
|
889 |
-
" print(tabulate(sorted_df, headers
|
890 |
" except FileNotFoundError:\n",
|
891 |
" print(\"Database file not found.\")\n",
|
892 |
"\n",
|
|
|
893 |
"# Example usage\n",
|
894 |
-
"file_path =
|
895 |
-
"category =
|
896 |
-
"print_ranked_articles_tabulate(file_path, category)
|
897 |
]
|
898 |
}
|
899 |
],
|
|
|
34 |
"from bs4 import BeautifulSoup\n",
|
35 |
"from datetime import datetime\n",
|
36 |
"\n",
|
37 |
+
"\n",
|
38 |
"def get_article_details(article_url):\n",
|
39 |
" response = requests.get(article_url)\n",
|
40 |
+
" soup = BeautifulSoup(response.content, \"html.parser\")\n",
|
41 |
+
"\n",
|
42 |
" # Extract headline\n",
|
43 |
+
" headline_tag = soup.find(\"h1\")\n",
|
44 |
+
" headline = (\n",
|
45 |
+
" headline_tag.get_text(strip=True) if headline_tag else \"No headline found\"\n",
|
46 |
+
" )\n",
|
47 |
+
"\n",
|
48 |
" # Attempt to extract publication date with error handling\n",
|
49 |
+
" date_container = soup.find(\"div\", class_=\"article-publish\")\n",
|
50 |
" if date_container:\n",
|
51 |
" # Extract the text and handle cleaning it up\n",
|
52 |
" date_text = date_container.get_text(strip=True)\n",
|
53 |
" # Extract the first date assuming it's the publication date (before \"Updated:\")\n",
|
54 |
" publication_date_text = date_text.split(\"(Updated:\")[0].strip()\n",
|
55 |
" try:\n",
|
56 |
+
" publication_date = datetime.strptime(\n",
|
57 |
+
" publication_date_text, \"%d %b %Y %I:%M%p\"\n",
|
58 |
+
" ).strftime(\"%Y-%m-%d %H:%M:%S\")\n",
|
59 |
" except ValueError:\n",
|
60 |
+
" publication_date = \"No publication date found\"\n",
|
61 |
" else:\n",
|
62 |
+
" publication_date = \"No publication date found\"\n",
|
63 |
+
"\n",
|
64 |
" # Extract main content of the article\n",
|
65 |
+
" article_body = soup.find(\"article\")\n",
|
66 |
" if not article_body:\n",
|
67 |
" article_body = soup\n",
|
68 |
+
" article_text = (\n",
|
69 |
+
" article_body.get_text(separator=\" \", strip=True)\n",
|
70 |
+
" if article_body\n",
|
71 |
+
" else \"No article content found\"\n",
|
72 |
+
" )\n",
|
73 |
+
"\n",
|
74 |
" return headline, publication_date, article_text\n",
|
75 |
"\n",
|
76 |
+
"\n",
|
77 |
"# Example usage\n",
|
78 |
"article_url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
|
79 |
"headline, publication_date, article_content = get_article_details(article_url)\n",
|
80 |
"print(\"Headline:\", headline)\n",
|
81 |
"print(\"Publication Date:\", publication_date)\n",
|
82 |
+
"print(\"Content:\", article_content[:500]) # Print the first 500 characters to check"
|
|
|
83 |
]
|
84 |
},
|
85 |
{
|
|
|
106 |
"import openai\n",
|
107 |
"\n",
|
108 |
"\n",
|
109 |
+
"openai.api_key = \"\"\n",
|
110 |
+
"\n",
|
111 |
"\n",
|
112 |
"def summarize_article(article_content):\n",
|
113 |
" try:\n",
|
|
|
120 |
"\n",
|
121 |
" # Call to OpenAI's Completion API\n",
|
122 |
" response = openai.Completion.create(\n",
|
123 |
+
" engine=\"gpt-3.5-turbo-instruct\",\n",
|
124 |
" prompt=prompt_text,\n",
|
125 |
" temperature=0.5,\n",
|
126 |
" max_tokens=60, # Adjust as needed to fit the summary length\n",
|
127 |
" top_p=1,\n",
|
128 |
" frequency_penalty=0,\n",
|
129 |
+
" presence_penalty=0,\n",
|
130 |
" )\n",
|
131 |
+
"\n",
|
132 |
" # Extracting the text from the response\n",
|
133 |
" summary = response.choices[0].text.strip()\n",
|
134 |
" return summary\n",
|
|
|
136 |
" print(f\"An error occurred: {e}\")\n",
|
137 |
" return \"\"\n",
|
138 |
"\n",
|
139 |
+
"\n",
|
140 |
"summary = summarize_article(article_content)\n",
|
141 |
+
"print(\"Summary:\", summary)"
|
142 |
]
|
143 |
},
|
144 |
{
|
|
|
474 |
}
|
475 |
],
|
476 |
"source": [
|
477 |
+
"unique_categories = df[\"Category\"].unique()\n",
|
478 |
"print(unique_categories)"
|
479 |
]
|
480 |
},
|
|
|
486 |
"source": [
|
487 |
"import openai\n",
|
488 |
"\n",
|
489 |
+
"\n",
|
490 |
"def classify_article(article_content):\n",
|
491 |
" prompt = f\"\"\"Read the following article and classify its content into one of these categories: 'Aviation Advisory',\n",
|
492 |
"'Bombing',\n",
|
|
|
533 |
"Category:\"\"\"\n",
|
534 |
"\n",
|
535 |
" response = openai.Completion.create(\n",
|
536 |
+
" engine=\"gpt-3.5-turbo-instruct\", # Adjust according to the latest available and appropriate model\n",
|
537 |
+
" prompt=prompt,\n",
|
538 |
+
" temperature=0.7,\n",
|
539 |
+
" max_tokens=60, # Adjust based on your needs\n",
|
540 |
+
" top_p=1.0,\n",
|
541 |
+
" frequency_penalty=0,\n",
|
542 |
+
" presence_penalty=0,\n",
|
543 |
+
" stop=[\"\\n\"], # Stop generating further when a newline character is encountered\n",
|
544 |
" )\n",
|
545 |
" category = response.choices[0].text.strip()\n",
|
546 |
+
" return category"
|
|
|
547 |
]
|
548 |
},
|
549 |
{
|
|
|
555 |
"import requests\n",
|
556 |
"from bs4 import BeautifulSoup\n",
|
557 |
"\n",
|
558 |
+
"\n",
|
559 |
"def fetch_article_content(url):\n",
|
560 |
" response = requests.get(url)\n",
|
561 |
+
" soup = BeautifulSoup(response.content, \"html.parser\")\n",
|
562 |
+
" article_text = \" \".join([p.text for p in soup.find_all(\"p\")])\n",
|
563 |
+
" return article_text"
|
564 |
]
|
565 |
},
|
566 |
{
|
|
|
583 |
"def classify_article(url):\n",
|
584 |
" # Fetch article content\n",
|
585 |
" article_content = fetch_article_content(url)\n",
|
586 |
+
"\n",
|
587 |
" # Construct the classification prompt\n",
|
588 |
" prompt = f\"\"\"Read the following article and classify its content into one of these categories:\n",
|
589 |
" 'Aviation Advisory', 'Bombing',\n",
|
|
|
630 |
" {article_content}\n",
|
631 |
" \n",
|
632 |
" Category:\"\"\"\n",
|
633 |
+
"\n",
|
634 |
" # Classify using OpenAI GPT-3\n",
|
635 |
" response = openai.Completion.create(\n",
|
636 |
+
" engine=\"gpt-3.5-turbo-instruct\", # Ensure using a correct and non-deprecated model\n",
|
637 |
+
" prompt=prompt,\n",
|
638 |
+
" temperature=0.7,\n",
|
639 |
+
" max_tokens=60,\n",
|
640 |
+
" top_p=1.0,\n",
|
641 |
+
" frequency_penalty=0,\n",
|
642 |
+
" presence_penalty=0,\n",
|
643 |
+
" stop=[\"\\n\"], # Stop generating further when a newline character is encountered\n",
|
644 |
" )\n",
|
645 |
" category = response.choices[0].text.strip()\n",
|
646 |
" return category\n",
|
647 |
"\n",
|
648 |
+
"\n",
|
649 |
"# Example usage\n",
|
650 |
"url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
|
651 |
"category = classify_article(url)\n",
|
652 |
+
"print(\"Category:\", category)"
|
653 |
]
|
654 |
},
|
655 |
{
|
|
|
700 |
"import pandas as pd\n",
|
701 |
"from datetime import datetime\n",
|
702 |
"\n",
|
703 |
+
"\n",
|
704 |
"def update_database(file_path, url):\n",
|
705 |
" # Fetch details from the article\n",
|
706 |
" headline, publication_date, article_content = get_article_details(article_url)\n",
|
|
|
708 |
" category = classify_article(url)\n",
|
709 |
"\n",
|
710 |
" new_data = {\n",
|
711 |
+
" \"Headline\": headline,\n",
|
712 |
+
" \"Summary\": summary,\n",
|
713 |
+
" \"Category\": category,\n",
|
714 |
+
" \"Datetime\": publication_date,\n",
|
715 |
+
" \"URL\": article_url,\n",
|
716 |
" }\n",
|
717 |
+
"\n",
|
|
|
718 |
" # Load the existing data from the CSV file\n",
|
719 |
" try:\n",
|
720 |
" df = pd.read_csv(file_path)\n",
|
721 |
" except FileNotFoundError:\n",
|
722 |
" # If the file does not exist, create a new DataFrame\n",
|
723 |
+
" df = pd.DataFrame(\n",
|
724 |
+
" columns=[\"id\", \"Headline\", \"Summary\", \"Category\", \"Datetime\", \"URL\"]\n",
|
725 |
+
" )\n",
|
726 |
" new_id = 1 # Start with ID 1 if no file exists\n",
|
727 |
" else:\n",
|
728 |
" # If IDs exist, increment from the last used ID\n",
|
729 |
+
" new_id = df[\"id\"].max() + 1 if not df.empty else 1\n",
|
730 |
+
"\n",
|
731 |
" # Prepare the new data entry\n",
|
732 |
+
" new_entry = pd.DataFrame(\n",
|
733 |
+
" {\n",
|
734 |
+
" \"id\": [new_id],\n",
|
735 |
+
" \"Headline\": [new_data[\"Headline\"]],\n",
|
736 |
+
" \"Summary\": [new_data[\"Summary\"]],\n",
|
737 |
+
" \"Category\": [new_data[\"Category\"]],\n",
|
738 |
+
" \"Datetime\": [new_data[\"Datetime\"]],\n",
|
739 |
+
" \"URL\": [new_data[\"URL\"]],\n",
|
740 |
+
" }\n",
|
741 |
+
" )\n",
|
742 |
+
"\n",
|
743 |
" # Append the new data entry to the DataFrame using concat\n",
|
744 |
" df = pd.concat([df, new_entry], ignore_index=True)\n",
|
745 |
+
"\n",
|
746 |
" # Save the updated DataFrame back to CSV\n",
|
747 |
" df.to_csv(file_path, index=False)\n",
|
748 |
" print(f\"Database updated successfully with ID {new_id}.\")\n",
|
749 |
"\n",
|
750 |
+
"\n",
|
751 |
"# Example usage\n",
|
752 |
"url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
|
753 |
+
"file_path = \"cleaned_data1.csv\"\n",
|
754 |
+
"update_database(file_path, url)"
|
|
|
755 |
]
|
756 |
},
|
757 |
{
|
|
|
824 |
"source": [
|
825 |
"import pandas as pd\n",
|
826 |
"\n",
|
827 |
+
"\n",
|
828 |
"def rank_related_articles(file_path, category):\n",
|
829 |
" # Load the existing data from the CSV file\n",
|
830 |
" try:\n",
|
|
|
832 |
" except FileNotFoundError:\n",
|
833 |
" print(\"Database file not found.\")\n",
|
834 |
" return\n",
|
835 |
+
"\n",
|
836 |
" # Filter articles by the specified category\n",
|
837 |
+
" filtered_df = df[df[\"Category\"] == category]\n",
|
838 |
+
"\n",
|
839 |
" # Convert 'Datetime' from string to datetime objects for accurate sorting\n",
|
840 |
+
" filtered_df[\"Datetime\"] = pd.to_datetime(filtered_df[\"Datetime\"])\n",
|
841 |
+
"\n",
|
842 |
" # Sort articles by 'Datetime' in descending order to get the most recent articles first\n",
|
843 |
+
" sorted_df = filtered_df.sort_values(by=\"Datetime\", ascending=False)\n",
|
844 |
+
"\n",
|
845 |
" # Display the sorted DataFrame\n",
|
846 |
+
" print(sorted_df[[\"id\", \"Headline\", \"Summary\", \"Category\", \"Datetime\", \"URL\"]])\n",
|
847 |
" return sorted_df\n",
|
848 |
"\n",
|
849 |
+
"\n",
|
850 |
"# Example usage\n",
|
851 |
+
"file_path = \"cleaned_data1.csv\"\n",
|
852 |
+
"category = \"Aviation Advisory\"\n",
|
853 |
+
"ranked_articles = rank_related_articles(file_path, category)"
|
854 |
]
|
855 |
},
|
856 |
{
|
|
|
897 |
"import pandas as pd\n",
|
898 |
"from tabulate import tabulate\n",
|
899 |
"\n",
|
900 |
+
"\n",
|
901 |
"def print_ranked_articles_tabulate(file_path, category):\n",
|
902 |
" try:\n",
|
903 |
" df = pd.read_csv(file_path)\n",
|
904 |
+
" df[\"Datetime\"] = pd.to_datetime(df[\"Datetime\"])\n",
|
905 |
+
" filtered_df = df[df[\"Category\"] == category]\n",
|
906 |
+
" sorted_df = filtered_df.sort_values(by=\"Datetime\", ascending=False)\n",
|
907 |
+
"\n",
|
908 |
" # Print DataFrame using tabulate\n",
|
909 |
+
" print(tabulate(sorted_df, headers=\"keys\", tablefmt=\"pretty\", showindex=False))\n",
|
910 |
" except FileNotFoundError:\n",
|
911 |
" print(\"Database file not found.\")\n",
|
912 |
"\n",
|
913 |
+
"\n",
|
914 |
"# Example usage\n",
|
915 |
+
"file_path = \"cleaned_data1.csv\"\n",
|
916 |
+
"category = \"Aviation Advisory\"\n",
|
917 |
+
"print_ranked_articles_tabulate(file_path, category)"
|
918 |
]
|
919 |
}
|
920 |
],
|
IS424_Data_Mining/code/LDA/basic_text_preprocessing.ipynb
CHANGED
@@ -64,7 +64,7 @@
|
|
64 |
"metadata": {},
|
65 |
"outputs": [],
|
66 |
"source": [
|
67 |
-
"df = pd.read_csv(
|
68 |
]
|
69 |
},
|
70 |
{
|
@@ -132,7 +132,7 @@
|
|
132 |
"metadata": {},
|
133 |
"outputs": [],
|
134 |
"source": [
|
135 |
-
"df_copy.dropna(subset=[
|
136 |
]
|
137 |
},
|
138 |
{
|
@@ -143,8 +143,8 @@
|
|
143 |
"outputs": [],
|
144 |
"source": [
|
145 |
"print(\"Published Date Statistics:\")\n",
|
146 |
-
"print(\"Min Date:\", df_copy[
|
147 |
-
"print(\"Max Date:\", df_copy[
|
148 |
]
|
149 |
},
|
150 |
{
|
@@ -155,7 +155,7 @@
|
|
155 |
"outputs": [],
|
156 |
"source": [
|
157 |
"# Check if there are any duplicated titles since a news can be published for multiple times by different publisher at different time\n",
|
158 |
-
"df_copy[[
|
159 |
]
|
160 |
},
|
161 |
{
|
@@ -166,7 +166,9 @@
|
|
166 |
"outputs": [],
|
167 |
"source": [
|
168 |
"# drop the duplicated news\n",
|
169 |
-
"duplicates = df_copy.duplicated(
|
|
|
|
|
170 |
"df_uni = df_copy[~duplicates]"
|
171 |
]
|
172 |
},
|
@@ -196,7 +198,7 @@
|
|
196 |
"metadata": {},
|
197 |
"outputs": [],
|
198 |
"source": [
|
199 |
-
"df_uni[
|
200 |
]
|
201 |
},
|
202 |
{
|
@@ -208,10 +210,14 @@
|
|
208 |
"source": [
|
209 |
"## remove contractions, lowercase, remove numbers and punctuations, remove stopwords\n",
|
210 |
"# run time roughly 2 mins\n",
|
211 |
-
"df_uni[
|
|
|
|
|
212 |
"\n",
|
213 |
"## convert back into string so that tokenization can be done\n",
|
214 |
-
"df_uni[
|
|
|
|
|
215 |
]
|
216 |
},
|
217 |
{
|
@@ -221,7 +227,7 @@
|
|
221 |
"metadata": {},
|
222 |
"outputs": [],
|
223 |
"source": [
|
224 |
-
"df_uni[
|
225 |
]
|
226 |
},
|
227 |
{
|
@@ -277,18 +283,23 @@
|
|
277 |
"\n",
|
278 |
"wnl = WordNetLemmatizer()\n",
|
279 |
"\n",
|
|
|
280 |
"def lemmatize_words(text):\n",
|
281 |
" # Tokenize the text into sentences and then words\n",
|
282 |
" sentences = sent_tokenize(text)\n",
|
283 |
" words = [word_tokenize(sentence) for sentence in sentences]\n",
|
284 |
"\n",
|
285 |
" # Remove punctuation and tokenize into lowercase words\n",
|
286 |
-
" punc = [[w.lower() for w in word if re.search(
|
287 |
"\n",
|
288 |
" # Perform lemmatization on words with valid POS tags\n",
|
289 |
-
" doc_lemmed = [
|
290 |
-
"
|
291 |
-
"
|
|
|
|
|
|
|
|
|
292 |
" return doc_lemmed"
|
293 |
]
|
294 |
},
|
@@ -309,7 +320,9 @@
|
|
309 |
"metadata": {},
|
310 |
"outputs": [],
|
311 |
"source": [
|
312 |
-
"df_uni[
|
|
|
|
|
313 |
]
|
314 |
},
|
315 |
{
|
@@ -337,8 +350,9 @@
|
|
337 |
"metadata": {},
|
338 |
"outputs": [],
|
339 |
"source": [
|
340 |
-
"stop_list = nltk.corpus.stopwords.words(
|
341 |
-
"stop_list += [
|
|
|
342 |
"\n",
|
343 |
"def corpus2docs2(corpus):\n",
|
344 |
" # corpus is a object returned by load_corpus that represents a corpus.\n",
|
@@ -349,27 +363,39 @@
|
|
349 |
" phrases = []\n",
|
350 |
" i = 0\n",
|
351 |
" while i < len(doc_pos):\n",
|
352 |
-
" if doc_pos[i][1] ==
|
353 |
-
" if
|
354 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
355 |
" i += 3\n",
|
356 |
-
" elif i+1 < len(doc_pos) and doc_pos[i+1][1] ==
|
357 |
-
" phrases.append((doc_pos[i][0], doc_pos[i+1][0]))\n",
|
358 |
" i += 2\n",
|
359 |
" else:\n",
|
360 |
" i += 1\n",
|
361 |
-
" elif doc_pos[i][1] ==
|
362 |
-
" if
|
363 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
" i += 3\n",
|
365 |
-
" elif i+1 < len(doc_pos) and doc_pos[i+1][1] ==
|
366 |
-
" phrases.append((doc_pos[i][0], doc_pos[i+1][0]))\n",
|
367 |
" i += 2\n",
|
368 |
" else:\n",
|
369 |
" i += 1\n",
|
370 |
" else:\n",
|
371 |
" i += 1\n",
|
372 |
-
" phrase_set = [
|
373 |
" docs.append(phrase_set)\n",
|
374 |
" return docs"
|
375 |
]
|
@@ -391,7 +417,7 @@
|
|
391 |
"metadata": {},
|
392 |
"outputs": [],
|
393 |
"source": [
|
394 |
-
"df_uni[
|
395 |
]
|
396 |
},
|
397 |
{
|
@@ -401,7 +427,7 @@
|
|
401 |
"metadata": {},
|
402 |
"outputs": [],
|
403 |
"source": [
|
404 |
-
"df_uni[
|
405 |
]
|
406 |
},
|
407 |
{
|
@@ -411,17 +437,17 @@
|
|
411 |
"metadata": {},
|
412 |
"outputs": [],
|
413 |
"source": [
|
414 |
-
"fdist_doc = nltk.FreqDist(df_uni[
|
415 |
"\n",
|
416 |
"x, y = zip(*fdist_doc)\n",
|
417 |
-
"plt.figure(figsize=(50,30))\n",
|
418 |
"plt.margins(0.02)\n",
|
419 |
"plt.bar(x, y)\n",
|
420 |
-
"plt.xlabel(
|
421 |
-
"plt.ylabel(
|
422 |
"plt.yticks(fontsize=40)\n",
|
423 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
424 |
-
"plt.title(
|
425 |
"plt.show()"
|
426 |
]
|
427 |
},
|
@@ -432,7 +458,7 @@
|
|
432 |
"metadata": {},
|
433 |
"outputs": [],
|
434 |
"source": [
|
435 |
-
"all_words = [word for sublist in df_uni[
|
436 |
"all_words[:2]\n",
|
437 |
"# Calculate word frequencies\n",
|
438 |
"fdist = FreqDist(all_words)"
|
@@ -447,7 +473,7 @@
|
|
447 |
"source": [
|
448 |
"# Plot the word frequency distribution as a bar graph\n",
|
449 |
"plt.figure(figsize=(12, 6))\n",
|
450 |
-
"plt.title(
|
451 |
"fdist.plot(30, cumulative=False)"
|
452 |
]
|
453 |
},
|
@@ -466,7 +492,7 @@
|
|
466 |
"metadata": {},
|
467 |
"outputs": [],
|
468 |
"source": [
|
469 |
-
"com = df_uni[
|
470 |
"com[:10]"
|
471 |
]
|
472 |
},
|
@@ -482,11 +508,11 @@
|
|
482 |
"\n",
|
483 |
"# Plotting with Seaborn for each company\n",
|
484 |
"for company in com[:10]:\n",
|
485 |
-
" haha = df_uni[
|
486 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
487 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
488 |
-
" plt.imshow(wordcloud, interpolation
|
489 |
-
" plt.title(f
|
490 |
" plt.axis(\"off\")\n",
|
491 |
" plt.margins(x=0, y=0)\n",
|
492 |
" plt.show()"
|
@@ -509,10 +535,12 @@
|
|
509 |
"metadata": {},
|
510 |
"outputs": [],
|
511 |
"source": [
|
512 |
-
"df_uni[
|
|
|
|
|
513 |
"\n",
|
514 |
"# Tokenize the text and create a dictionary\n",
|
515 |
-
"documents = df_uni[
|
516 |
"dictionary = corpora.Dictionary(documents)\n",
|
517 |
"\n",
|
518 |
"tfidf = models.TfidfModel(dictionary=dictionary, normalize=True)\n",
|
@@ -529,7 +557,9 @@
|
|
529 |
},
|
530 |
"outputs": [],
|
531 |
"source": [
|
532 |
-
"sorted_term_frequencies = dict(
|
|
|
|
|
533 |
"sorted_term_frequencies"
|
534 |
]
|
535 |
},
|
@@ -551,11 +581,13 @@
|
|
551 |
"# customisable, lower threshold, more words retained.\n",
|
552 |
"threshold = 0.4\n",
|
553 |
"\n",
|
|
|
554 |
"def filter_and_join(tfidf_doc):\n",
|
555 |
" filtered_terms = [dictionary[id] for id, score in tfidf_doc if score >= threshold]\n",
|
556 |
" return filtered_terms\n",
|
557 |
"\n",
|
558 |
-
"
|
|
|
559 |
]
|
560 |
},
|
561 |
{
|
@@ -565,17 +597,17 @@
|
|
565 |
"metadata": {},
|
566 |
"outputs": [],
|
567 |
"source": [
|
568 |
-
"fdist_doc = nltk.FreqDist(df_uni[
|
569 |
"\n",
|
570 |
"x, y = zip(*fdist_doc)\n",
|
571 |
-
"plt.figure(figsize=(50,30))\n",
|
572 |
"plt.margins(0.02)\n",
|
573 |
"plt.bar(x, y)\n",
|
574 |
-
"plt.xlabel(
|
575 |
-
"plt.ylabel(
|
576 |
"plt.yticks(fontsize=40)\n",
|
577 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
578 |
-
"plt.title(
|
579 |
"plt.show()"
|
580 |
]
|
581 |
},
|
@@ -586,7 +618,9 @@
|
|
586 |
"metadata": {},
|
587 |
"outputs": [],
|
588 |
"source": [
|
589 |
-
"all_words_filtered = [
|
|
|
|
|
590 |
"all_words_filtered[:2]\n",
|
591 |
"# Calculate word frequencies\n",
|
592 |
"fdist_filtered = FreqDist(all_words_filtered)"
|
@@ -602,7 +636,7 @@
|
|
602 |
"# Plot the word frequency distribution as a bar graph\n",
|
603 |
"# apparently, the dataset is much cleaner now.\n",
|
604 |
"plt.figure(figsize=(12, 6))\n",
|
605 |
-
"plt.title(
|
606 |
"fdist_filtered.plot(30, cumulative=False)"
|
607 |
]
|
608 |
},
|
@@ -618,11 +652,11 @@
|
|
618 |
"\n",
|
619 |
"# Plotting with Seaborn for each company\n",
|
620 |
"for region in com[:10]:\n",
|
621 |
-
" haha = df_uni[
|
622 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
623 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
624 |
-
" plt.imshow(wordcloud, interpolation
|
625 |
-
" plt.title(f
|
626 |
" plt.axis(\"off\")\n",
|
627 |
" plt.margins(x=0, y=0)\n",
|
628 |
" plt.show()"
|
@@ -635,7 +669,7 @@
|
|
635 |
"metadata": {},
|
636 |
"outputs": [],
|
637 |
"source": [
|
638 |
-
"df_uni[
|
639 |
]
|
640 |
},
|
641 |
{
|
@@ -645,7 +679,7 @@
|
|
645 |
"metadata": {},
|
646 |
"outputs": [],
|
647 |
"source": [
|
648 |
-
"df_uni[[
|
649 |
]
|
650 |
},
|
651 |
{
|
@@ -656,7 +690,9 @@
|
|
656 |
"outputs": [],
|
657 |
"source": [
|
658 |
"# count of news by sector\n",
|
659 |
-
"df_uni[[
|
|
|
|
|
660 |
]
|
661 |
},
|
662 |
{
|
@@ -666,7 +702,9 @@
|
|
666 |
"metadata": {},
|
667 |
"outputs": [],
|
668 |
"source": [
|
669 |
-
"df_uni[[
|
|
|
|
|
670 |
]
|
671 |
},
|
672 |
{
|
@@ -705,7 +743,7 @@
|
|
705 |
"outputs": [],
|
706 |
"source": [
|
707 |
"# export as parquet data file instead of csv for easier list extraction\n",
|
708 |
-
"df_uni.to_parquet(
|
709 |
]
|
710 |
},
|
711 |
{
|
|
|
64 |
"metadata": {},
|
65 |
"outputs": [],
|
66 |
"source": [
|
67 |
+
"df = pd.read_csv(\"cleaned_data.csv\")"
|
68 |
]
|
69 |
},
|
70 |
{
|
|
|
132 |
"metadata": {},
|
133 |
"outputs": [],
|
134 |
"source": [
|
135 |
+
"df_copy.dropna(subset=[\"Headline_Details\"], inplace=True)"
|
136 |
]
|
137 |
},
|
138 |
{
|
|
|
143 |
"outputs": [],
|
144 |
"source": [
|
145 |
"print(\"Published Date Statistics:\")\n",
|
146 |
+
"print(\"Min Date:\", df_copy[\"Datetime\"].min())\n",
|
147 |
+
"print(\"Max Date:\", df_copy[\"Datetime\"].max())"
|
148 |
]
|
149 |
},
|
150 |
{
|
|
|
155 |
"outputs": [],
|
156 |
"source": [
|
157 |
"# Check if there are any duplicated titles since a news can be published for multiple times by different publisher at different time\n",
|
158 |
+
"df_copy[[\"Year\", \"Headline_Details\", \"Region\"]].duplicated().any()"
|
159 |
]
|
160 |
},
|
161 |
{
|
|
|
166 |
"outputs": [],
|
167 |
"source": [
|
168 |
"# drop the duplicated news\n",
|
169 |
+
"duplicates = df_copy.duplicated(\n",
|
170 |
+
" subset=[\"Year\", \"Headline_Details\", \"Region\"], keep=\"first\"\n",
|
171 |
+
")\n",
|
172 |
"df_uni = df_copy[~duplicates]"
|
173 |
]
|
174 |
},
|
|
|
198 |
"metadata": {},
|
199 |
"outputs": [],
|
200 |
"source": [
|
201 |
+
"df_uni[\"Headline_Details\"][5]"
|
202 |
]
|
203 |
},
|
204 |
{
|
|
|
210 |
"source": [
|
211 |
"## remove contractions, lowercase, remove numbers and punctuations, remove stopwords\n",
|
212 |
"# run time roughly 2 mins\n",
|
213 |
+
"df_uni[\"cleaned_Headline_Details\"] = df_uni[\"Headline_Details\"].apply(\n",
|
214 |
+
" lambda x: [contractions.fix(word) for word in x.split()]\n",
|
215 |
+
")\n",
|
216 |
"\n",
|
217 |
"## convert back into string so that tokenization can be done\n",
|
218 |
+
"df_uni[\"cleaned_Headline_Details\"] = [\n",
|
219 |
+
" \" \".join(map(str, l)) for l in df_uni[\"cleaned_Headline_Details\"]\n",
|
220 |
+
"]"
|
221 |
]
|
222 |
},
|
223 |
{
|
|
|
227 |
"metadata": {},
|
228 |
"outputs": [],
|
229 |
"source": [
|
230 |
+
"df_uni[\"cleaned_Headline_Details\"][5]"
|
231 |
]
|
232 |
},
|
233 |
{
|
|
|
283 |
"\n",
|
284 |
"wnl = WordNetLemmatizer()\n",
|
285 |
"\n",
|
286 |
+
"\n",
|
287 |
"def lemmatize_words(text):\n",
|
288 |
" # Tokenize the text into sentences and then words\n",
|
289 |
" sentences = sent_tokenize(text)\n",
|
290 |
" words = [word_tokenize(sentence) for sentence in sentences]\n",
|
291 |
"\n",
|
292 |
" # Remove punctuation and tokenize into lowercase words\n",
|
293 |
+
" punc = [[w.lower() for w in word if re.search(\"^[a-zA-Z]+$\", w)] for word in words]\n",
|
294 |
"\n",
|
295 |
" # Perform lemmatization on words with valid POS tags\n",
|
296 |
+
" doc_lemmed = [\n",
|
297 |
+
" wnl.lemmatize(word, pos[0].lower())\n",
|
298 |
+
" for sentence in punc\n",
|
299 |
+
" for word, pos in pos_tag(sentence, tagset=\"universal\")\n",
|
300 |
+
" if pos[0].lower() in [\"a\", \"s\", \"r\", \"n\", \"v\"]\n",
|
301 |
+
" ]\n",
|
302 |
+
"\n",
|
303 |
" return doc_lemmed"
|
304 |
]
|
305 |
},
|
|
|
320 |
"metadata": {},
|
321 |
"outputs": [],
|
322 |
"source": [
|
323 |
+
"df_uni[\"cleaned_Headline_Details\"] = df_uni[\"cleaned_Headline_Details\"].apply(\n",
|
324 |
+
" lemmatize_words\n",
|
325 |
+
")"
|
326 |
]
|
327 |
},
|
328 |
{
|
|
|
350 |
"metadata": {},
|
351 |
"outputs": [],
|
352 |
"source": [
|
353 |
+
"stop_list = nltk.corpus.stopwords.words(\"english\")\n",
|
354 |
+
"stop_list += [\"local\", \"time\", \"wednesday\", \"source\", \"certain\", \"report\", \"update\"]\n",
|
355 |
+
"\n",
|
356 |
"\n",
|
357 |
"def corpus2docs2(corpus):\n",
|
358 |
" # corpus is a object returned by load_corpus that represents a corpus.\n",
|
|
|
363 |
" phrases = []\n",
|
364 |
" i = 0\n",
|
365 |
" while i < len(doc_pos):\n",
|
366 |
+
" if doc_pos[i][1] == \"JJ\":\n",
|
367 |
+
" if (\n",
|
368 |
+
" i + 2 < len(doc_pos)\n",
|
369 |
+
" and doc_pos[i + 1][1] == \"NN\"\n",
|
370 |
+
" and doc_pos[i + 2][1] == \"NN\"\n",
|
371 |
+
" ):\n",
|
372 |
+
" phrases.append(\n",
|
373 |
+
" (doc_pos[i][0], doc_pos[i + 1][0], doc_pos[i + 2][0])\n",
|
374 |
+
" )\n",
|
375 |
" i += 3\n",
|
376 |
+
" elif i + 1 < len(doc_pos) and doc_pos[i + 1][1] == \"NN\":\n",
|
377 |
+
" phrases.append((doc_pos[i][0], doc_pos[i + 1][0]))\n",
|
378 |
" i += 2\n",
|
379 |
" else:\n",
|
380 |
" i += 1\n",
|
381 |
+
" elif doc_pos[i][1] == \"NN\":\n",
|
382 |
+
" if (\n",
|
383 |
+
" i + 2 < len(doc_pos)\n",
|
384 |
+
" and doc_pos[i + 1][1] == \"NN\"\n",
|
385 |
+
" and doc_pos[i + 2][1] == \"NN\"\n",
|
386 |
+
" ):\n",
|
387 |
+
" phrases.append(\n",
|
388 |
+
" (doc_pos[i][0], doc_pos[i + 1][0], doc_pos[i + 2][0])\n",
|
389 |
+
" )\n",
|
390 |
" i += 3\n",
|
391 |
+
" elif i + 1 < len(doc_pos) and doc_pos[i + 1][1] == \"NN\":\n",
|
392 |
+
" phrases.append((doc_pos[i][0], doc_pos[i + 1][0]))\n",
|
393 |
" i += 2\n",
|
394 |
" else:\n",
|
395 |
" i += 1\n",
|
396 |
" else:\n",
|
397 |
" i += 1\n",
|
398 |
+
" phrase_set = [\"_\".join(word_set) for word_set in phrases]\n",
|
399 |
" docs.append(phrase_set)\n",
|
400 |
" return docs"
|
401 |
]
|
|
|
417 |
"metadata": {},
|
418 |
"outputs": [],
|
419 |
"source": [
|
420 |
+
"df_uni[\"binary_Headline_Details\"] = corpus2docs2(df_uni[\"cleaned_Headline_Details\"])"
|
421 |
]
|
422 |
},
|
423 |
{
|
|
|
427 |
"metadata": {},
|
428 |
"outputs": [],
|
429 |
"source": [
|
430 |
+
"df_uni[\"binary_Headline_Details\"][5]"
|
431 |
]
|
432 |
},
|
433 |
{
|
|
|
437 |
"metadata": {},
|
438 |
"outputs": [],
|
439 |
"source": [
|
440 |
+
"fdist_doc = nltk.FreqDist(df_uni[\"binary_Headline_Details\"][5]).most_common(25)\n",
|
441 |
"\n",
|
442 |
"x, y = zip(*fdist_doc)\n",
|
443 |
+
"plt.figure(figsize=(50, 30))\n",
|
444 |
"plt.margins(0.02)\n",
|
445 |
"plt.bar(x, y)\n",
|
446 |
+
"plt.xlabel(\"Words\", fontsize=50)\n",
|
447 |
+
"plt.ylabel(\"Frequency of Words\", fontsize=50)\n",
|
448 |
"plt.yticks(fontsize=40)\n",
|
449 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
450 |
+
"plt.title(\"Frequency of 25 Most Common Words for One Random News\", fontsize=60)\n",
|
451 |
"plt.show()"
|
452 |
]
|
453 |
},
|
|
|
458 |
"metadata": {},
|
459 |
"outputs": [],
|
460 |
"source": [
|
461 |
+
"all_words = [word for sublist in df_uni[\"binary_Headline_Details\"] for word in sublist]\n",
|
462 |
"all_words[:2]\n",
|
463 |
"# Calculate word frequencies\n",
|
464 |
"fdist = FreqDist(all_words)"
|
|
|
473 |
"source": [
|
474 |
"# Plot the word frequency distribution as a bar graph\n",
|
475 |
"plt.figure(figsize=(12, 6))\n",
|
476 |
+
"plt.title(\"Frequency of 25 Most Common Words of the Dataset\", fontsize=12)\n",
|
477 |
"fdist.plot(30, cumulative=False)"
|
478 |
]
|
479 |
},
|
|
|
492 |
"metadata": {},
|
493 |
"outputs": [],
|
494 |
"source": [
|
495 |
+
"com = df_uni[\"Severity\"].unique()\n",
|
496 |
"com[:10]"
|
497 |
]
|
498 |
},
|
|
|
508 |
"\n",
|
509 |
"# Plotting with Seaborn for each company\n",
|
510 |
"for company in com[:10]:\n",
|
511 |
+
" haha = df_uni[\"binary_Headline_Details\"].loc[df_uni.Severity == company]\n",
|
512 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
513 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
514 |
+
" plt.imshow(wordcloud, interpolation=\"bilinear\")\n",
|
515 |
+
" plt.title(f\"Wordcloud for {company}\")\n",
|
516 |
" plt.axis(\"off\")\n",
|
517 |
" plt.margins(x=0, y=0)\n",
|
518 |
" plt.show()"
|
|
|
535 |
"metadata": {},
|
536 |
"outputs": [],
|
537 |
"source": [
|
538 |
+
"df_uni[\"binary_Headline_Details\"] = df_uni[\"binary_Headline_Details\"].apply(\n",
|
539 |
+
" lambda x: \" \".join(x)\n",
|
540 |
+
")\n",
|
541 |
"\n",
|
542 |
"# Tokenize the text and create a dictionary\n",
|
543 |
+
"documents = df_uni[\"binary_Headline_Details\"].str.split()\n",
|
544 |
"dictionary = corpora.Dictionary(documents)\n",
|
545 |
"\n",
|
546 |
"tfidf = models.TfidfModel(dictionary=dictionary, normalize=True)\n",
|
|
|
557 |
},
|
558 |
"outputs": [],
|
559 |
"source": [
|
560 |
+
"sorted_term_frequencies = dict(\n",
|
561 |
+
" sorted(term_frequencies.items(), key=lambda item: item[1], reverse=True)\n",
|
562 |
+
")\n",
|
563 |
"sorted_term_frequencies"
|
564 |
]
|
565 |
},
|
|
|
581 |
"# customisable, lower threshold, more words retained.\n",
|
582 |
"threshold = 0.4\n",
|
583 |
"\n",
|
584 |
+
"\n",
|
585 |
"def filter_and_join(tfidf_doc):\n",
|
586 |
" filtered_terms = [dictionary[id] for id, score in tfidf_doc if score >= threshold]\n",
|
587 |
" return filtered_terms\n",
|
588 |
"\n",
|
589 |
+
"\n",
|
590 |
+
"df_uni[\"binary_Headline_Details\"] = [filter_and_join(doc) for doc in tfidf_corpus]"
|
591 |
]
|
592 |
},
|
593 |
{
|
|
|
597 |
"metadata": {},
|
598 |
"outputs": [],
|
599 |
"source": [
|
600 |
+
"fdist_doc = nltk.FreqDist(df_uni[\"binary_Headline_Details\"][0]).most_common(25)\n",
|
601 |
"\n",
|
602 |
"x, y = zip(*fdist_doc)\n",
|
603 |
+
"plt.figure(figsize=(50, 30))\n",
|
604 |
"plt.margins(0.02)\n",
|
605 |
"plt.bar(x, y)\n",
|
606 |
+
"plt.xlabel(\"Words\", fontsize=50)\n",
|
607 |
+
"plt.ylabel(\"Frequency of Words\", fontsize=50)\n",
|
608 |
"plt.yticks(fontsize=40)\n",
|
609 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
610 |
+
"plt.title(\"Frequency of 25 Most Common Words for One Random News\", fontsize=60)\n",
|
611 |
"plt.show()"
|
612 |
]
|
613 |
},
|
|
|
618 |
"metadata": {},
|
619 |
"outputs": [],
|
620 |
"source": [
|
621 |
+
"all_words_filtered = [\n",
|
622 |
+
" word for sublist in df_uni[\"binary_Headline_Details\"] for word in sublist\n",
|
623 |
+
"]\n",
|
624 |
"all_words_filtered[:2]\n",
|
625 |
"# Calculate word frequencies\n",
|
626 |
"fdist_filtered = FreqDist(all_words_filtered)"
|
|
|
636 |
"# Plot the word frequency distribution as a bar graph\n",
|
637 |
"# apparently, the dataset is much cleaner now.\n",
|
638 |
"plt.figure(figsize=(12, 6))\n",
|
639 |
+
"plt.title(\"Frequency of 25 Most Common Words of the Dataset\", fontsize=12)\n",
|
640 |
"fdist_filtered.plot(30, cumulative=False)"
|
641 |
]
|
642 |
},
|
|
|
652 |
"\n",
|
653 |
"# Plotting with Seaborn for each company\n",
|
654 |
"for region in com[:10]:\n",
|
655 |
+
" haha = df_uni[\"binary_Headline_Details\"].loc[df_uni.Severity == region]\n",
|
656 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
657 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
658 |
+
" plt.imshow(wordcloud, interpolation=\"bilinear\")\n",
|
659 |
+
" plt.title(f\"Wordcloud for {company}\")\n",
|
660 |
" plt.axis(\"off\")\n",
|
661 |
" plt.margins(x=0, y=0)\n",
|
662 |
" plt.show()"
|
|
|
669 |
"metadata": {},
|
670 |
"outputs": [],
|
671 |
"source": [
|
672 |
+
"df_uni[\"word_count\"] = df_uni[\"binary_Headline_Details\"].apply(len)"
|
673 |
]
|
674 |
},
|
675 |
{
|
|
|
679 |
"metadata": {},
|
680 |
"outputs": [],
|
681 |
"source": [
|
682 |
+
"df_uni[[\"word_count\"]].describe().round()"
|
683 |
]
|
684 |
},
|
685 |
{
|
|
|
690 |
"outputs": [],
|
691 |
"source": [
|
692 |
"# count of news by sector\n",
|
693 |
+
"df_uni[[\"binary_Headline_Details\", \"Region\"]].groupby(\"Region\").count().sort_values(\n",
|
694 |
+
" by=\"binary_Headline_Details\", ascending=False\n",
|
695 |
+
")"
|
696 |
]
|
697 |
},
|
698 |
{
|
|
|
702 |
"metadata": {},
|
703 |
"outputs": [],
|
704 |
"source": [
|
705 |
+
"df_uni[[\"binary_Headline_Details\", \"Severity\"]].groupby(\"Severity\").count().sort_values(\n",
|
706 |
+
" by=\"binary_Headline_Details\", ascending=False\n",
|
707 |
+
")"
|
708 |
]
|
709 |
},
|
710 |
{
|
|
|
743 |
"outputs": [],
|
744 |
"source": [
|
745 |
"# export as parquet data file instead of csv for easier list extraction\n",
|
746 |
+
"df_uni.to_parquet(\"processed_data1.parquet\", index=False)"
|
747 |
]
|
748 |
},
|
749 |
{
|
IS424_Data_Mining/code/LDA/basic_text_preprocessing_on_scraped_data.ipynb
CHANGED
@@ -64,7 +64,7 @@
|
|
64 |
"metadata": {},
|
65 |
"outputs": [],
|
66 |
"source": [
|
67 |
-
"df = pd.read_parquet(
|
68 |
]
|
69 |
},
|
70 |
{
|
@@ -442,7 +442,7 @@
|
|
442 |
"outputs": [],
|
443 |
"source": [
|
444 |
"# drop empty lines\n",
|
445 |
-
"df_copy.dropna(subset=[
|
446 |
]
|
447 |
},
|
448 |
{
|
@@ -463,8 +463,8 @@
|
|
463 |
],
|
464 |
"source": [
|
465 |
"print(\"Published Date Statistics:\")\n",
|
466 |
-
"print(\"Min Date:\", df_copy[
|
467 |
-
"print(\"Max Date:\", df_copy[
|
468 |
]
|
469 |
},
|
470 |
{
|
@@ -486,7 +486,7 @@
|
|
486 |
],
|
487 |
"source": [
|
488 |
"# Check if there are any duplicated titles since a news can be published for multiple times by different publisher at different time\n",
|
489 |
-
"df_copy[[
|
490 |
]
|
491 |
},
|
492 |
{
|
@@ -497,7 +497,7 @@
|
|
497 |
"outputs": [],
|
498 |
"source": [
|
499 |
"# drop the duplicated news\n",
|
500 |
-
"duplicates = df_copy.duplicated(subset=[
|
501 |
"df_uni = df_copy[~duplicates]"
|
502 |
]
|
503 |
},
|
@@ -508,7 +508,9 @@
|
|
508 |
"metadata": {},
|
509 |
"outputs": [],
|
510 |
"source": [
|
511 |
-
"df_uni = df_uni[
|
|
|
|
|
512 |
]
|
513 |
},
|
514 |
{
|
@@ -567,7 +569,7 @@
|
|
567 |
}
|
568 |
],
|
569 |
"source": [
|
570 |
-
"df_uni[
|
571 |
]
|
572 |
},
|
573 |
{
|
@@ -578,10 +580,12 @@
|
|
578 |
"outputs": [],
|
579 |
"source": [
|
580 |
"## remove contractions, lowercase, remove numbers and punctuations, remove stopwords\n",
|
581 |
-
"df_uni[
|
|
|
|
|
582 |
"\n",
|
583 |
"## convert back into string so that tokenization can be done\n",
|
584 |
-
"df_uni[
|
585 |
]
|
586 |
},
|
587 |
{
|
@@ -602,7 +606,7 @@
|
|
602 |
}
|
603 |
],
|
604 |
"source": [
|
605 |
-
"df_uni[
|
606 |
]
|
607 |
},
|
608 |
{
|
@@ -657,18 +661,23 @@
|
|
657 |
"\n",
|
658 |
"wnl = WordNetLemmatizer()\n",
|
659 |
"\n",
|
|
|
660 |
"def lemmatize_words(text):\n",
|
661 |
" # Tokenize the text into sentences and then words\n",
|
662 |
" sentences = sent_tokenize(text)\n",
|
663 |
" words = [word_tokenize(sentence) for sentence in sentences]\n",
|
664 |
"\n",
|
665 |
" # Remove punctuation and tokenize into lowercase words\n",
|
666 |
-
" punc = [[w.lower() for w in word if re.search(
|
667 |
"\n",
|
668 |
" # Perform lemmatization on words with valid POS tags\n",
|
669 |
-
" doc_lemmed = [
|
670 |
-
"
|
671 |
-
"
|
|
|
|
|
|
|
|
|
672 |
" return doc_lemmed"
|
673 |
]
|
674 |
},
|
@@ -697,7 +706,7 @@
|
|
697 |
"metadata": {},
|
698 |
"outputs": [],
|
699 |
"source": [
|
700 |
-
"df_uni[
|
701 |
]
|
702 |
},
|
703 |
{
|
@@ -733,8 +742,23 @@
|
|
733 |
"metadata": {},
|
734 |
"outputs": [],
|
735 |
"source": [
|
736 |
-
"stop_list = nltk.corpus.stopwords.words(
|
737 |
-
"stop_list += [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
738 |
"\n",
|
739 |
"def corpus2docs2(corpus):\n",
|
740 |
" # corpus is a object returned by load_corpus that represents a corpus.\n",
|
@@ -745,27 +769,39 @@
|
|
745 |
" phrases = []\n",
|
746 |
" i = 0\n",
|
747 |
" while i < len(doc_pos):\n",
|
748 |
-
" if doc_pos[i][1] ==
|
749 |
-
" if
|
750 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
751 |
" i += 3\n",
|
752 |
-
" elif i+1 < len(doc_pos) and doc_pos[i+1][1] ==
|
753 |
-
" phrases.append((doc_pos[i][0], doc_pos[i+1][0]))\n",
|
754 |
" i += 2\n",
|
755 |
" else:\n",
|
756 |
" i += 1\n",
|
757 |
-
" elif doc_pos[i][1] ==
|
758 |
-
" if
|
759 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
760 |
" i += 3\n",
|
761 |
-
" elif i+1 < len(doc_pos) and doc_pos[i+1][1] ==
|
762 |
-
" phrases.append((doc_pos[i][0], doc_pos[i+1][0]))\n",
|
763 |
" i += 2\n",
|
764 |
" else:\n",
|
765 |
" i += 1\n",
|
766 |
" else:\n",
|
767 |
" i += 1\n",
|
768 |
-
" phrase_set = [
|
769 |
" docs.append(phrase_set)\n",
|
770 |
" return docs"
|
771 |
]
|
@@ -795,7 +831,7 @@
|
|
795 |
"metadata": {},
|
796 |
"outputs": [],
|
797 |
"source": [
|
798 |
-
"df_uni[
|
799 |
]
|
800 |
},
|
801 |
{
|
@@ -843,7 +879,7 @@
|
|
843 |
}
|
844 |
],
|
845 |
"source": [
|
846 |
-
"df_uni[
|
847 |
]
|
848 |
},
|
849 |
{
|
@@ -866,17 +902,17 @@
|
|
866 |
}
|
867 |
],
|
868 |
"source": [
|
869 |
-
"fdist_doc = nltk.FreqDist(df_uni[
|
870 |
"\n",
|
871 |
"x, y = zip(*fdist_doc)\n",
|
872 |
-
"plt.figure(figsize=(50,30))\n",
|
873 |
"plt.margins(0.02)\n",
|
874 |
"plt.bar(x, y)\n",
|
875 |
-
"plt.xlabel(
|
876 |
-
"plt.ylabel(
|
877 |
"plt.yticks(fontsize=40)\n",
|
878 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
879 |
-
"plt.title(
|
880 |
"plt.show()"
|
881 |
]
|
882 |
},
|
@@ -887,7 +923,7 @@
|
|
887 |
"metadata": {},
|
888 |
"outputs": [],
|
889 |
"source": [
|
890 |
-
"all_words = [word for sublist in df_uni[
|
891 |
"all_words[:2]\n",
|
892 |
"# Calculate word frequencies\n",
|
893 |
"fdist = FreqDist(all_words)"
|
@@ -925,7 +961,7 @@
|
|
925 |
"source": [
|
926 |
"# Plot the word frequency distribution as a bar graph\n",
|
927 |
"plt.figure(figsize=(12, 6))\n",
|
928 |
-
"plt.title(
|
929 |
"fdist.plot(30, cumulative=False)"
|
930 |
]
|
931 |
},
|
@@ -955,7 +991,7 @@
|
|
955 |
}
|
956 |
],
|
957 |
"source": [
|
958 |
-
"com = df_uni[
|
959 |
"com[:10]"
|
960 |
]
|
961 |
},
|
@@ -1020,11 +1056,11 @@
|
|
1020 |
"\n",
|
1021 |
"# Plotting with Seaborn for each company\n",
|
1022 |
"for region in com[:10]:\n",
|
1023 |
-
" haha = df_uni[
|
1024 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
1025 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
1026 |
-
" plt.imshow(wordcloud, interpolation
|
1027 |
-
" plt.title(f
|
1028 |
" plt.axis(\"off\")\n",
|
1029 |
" plt.margins(x=0, y=0)\n",
|
1030 |
" plt.show()"
|
@@ -1047,10 +1083,10 @@
|
|
1047 |
"metadata": {},
|
1048 |
"outputs": [],
|
1049 |
"source": [
|
1050 |
-
"df_uni[
|
1051 |
"\n",
|
1052 |
"# Tokenize the text and create a dictionary\n",
|
1053 |
-
"documents = df_uni[
|
1054 |
"dictionary = corpora.Dictionary(documents)\n",
|
1055 |
"\n",
|
1056 |
"tfidf = models.TfidfModel(dictionary=dictionary, normalize=True)\n",
|
@@ -2078,7 +2114,9 @@
|
|
2078 |
}
|
2079 |
],
|
2080 |
"source": [
|
2081 |
-
"sorted_term_frequencies = dict(
|
|
|
|
|
2082 |
"sorted_term_frequencies"
|
2083 |
]
|
2084 |
},
|
@@ -2100,11 +2138,13 @@
|
|
2100 |
"# customisable, lower threshold, more words retained.\n",
|
2101 |
"threshold = 0.03\n",
|
2102 |
"\n",
|
|
|
2103 |
"def filter_and_join(tfidf_doc):\n",
|
2104 |
" filtered_terms = [dictionary[id] for id, score in tfidf_doc if score >= threshold]\n",
|
2105 |
" return filtered_terms\n",
|
2106 |
"\n",
|
2107 |
-
"
|
|
|
2108 |
]
|
2109 |
},
|
2110 |
{
|
@@ -2136,7 +2176,7 @@
|
|
2136 |
}
|
2137 |
],
|
2138 |
"source": [
|
2139 |
-
"df_uni[
|
2140 |
]
|
2141 |
},
|
2142 |
{
|
@@ -2159,17 +2199,17 @@
|
|
2159 |
}
|
2160 |
],
|
2161 |
"source": [
|
2162 |
-
"fdist_doc = nltk.FreqDist(df_uni[
|
2163 |
"\n",
|
2164 |
"x, y = zip(*fdist_doc)\n",
|
2165 |
-
"plt.figure(figsize=(50,30))\n",
|
2166 |
"plt.margins(0.02)\n",
|
2167 |
"plt.bar(x, y)\n",
|
2168 |
-
"plt.xlabel(
|
2169 |
-
"plt.ylabel(
|
2170 |
"plt.yticks(fontsize=40)\n",
|
2171 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
2172 |
-
"plt.title(
|
2173 |
"plt.show()"
|
2174 |
]
|
2175 |
},
|
@@ -2180,7 +2220,7 @@
|
|
2180 |
"metadata": {},
|
2181 |
"outputs": [],
|
2182 |
"source": [
|
2183 |
-
"all_words_filtered = [word for sublist in df_uni[
|
2184 |
"all_words_filtered[:2]\n",
|
2185 |
"# Calculate word frequencies\n",
|
2186 |
"fdist_filtered = FreqDist(all_words_filtered)"
|
@@ -2219,7 +2259,7 @@
|
|
2219 |
"# Plot the word frequency distribution as a bar graph\n",
|
2220 |
"# apparently, the dataset is much cleaner now.\n",
|
2221 |
"plt.figure(figsize=(12, 6))\n",
|
2222 |
-
"plt.title(
|
2223 |
"fdist_filtered.plot(30, cumulative=False)"
|
2224 |
]
|
2225 |
},
|
@@ -2284,11 +2324,11 @@
|
|
2284 |
"\n",
|
2285 |
"# Plotting with Seaborn for each company\n",
|
2286 |
"for region in com[:10]:\n",
|
2287 |
-
" haha = df_uni[
|
2288 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
2289 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
2290 |
-
" plt.imshow(wordcloud, interpolation
|
2291 |
-
" plt.title(f
|
2292 |
" plt.axis(\"off\")\n",
|
2293 |
" plt.margins(x=0, y=0)\n",
|
2294 |
" plt.show()"
|
@@ -2301,7 +2341,7 @@
|
|
2301 |
"metadata": {},
|
2302 |
"outputs": [],
|
2303 |
"source": [
|
2304 |
-
"df_uni[
|
2305 |
]
|
2306 |
},
|
2307 |
{
|
@@ -2389,7 +2429,7 @@
|
|
2389 |
}
|
2390 |
],
|
2391 |
"source": [
|
2392 |
-
"df_uni[[
|
2393 |
]
|
2394 |
},
|
2395 |
{
|
@@ -2501,7 +2541,9 @@
|
|
2501 |
],
|
2502 |
"source": [
|
2503 |
"# count of news by region\n",
|
2504 |
-
"df_uni[[
|
|
|
|
|
2505 |
]
|
2506 |
},
|
2507 |
{
|
@@ -2574,7 +2616,9 @@
|
|
2574 |
}
|
2575 |
],
|
2576 |
"source": [
|
2577 |
-
"df_uni[[
|
|
|
|
|
2578 |
]
|
2579 |
},
|
2580 |
{
|
@@ -2887,7 +2931,7 @@
|
|
2887 |
"outputs": [],
|
2888 |
"source": [
|
2889 |
"# export as parquet data file instead of csv for easier list extraction\n",
|
2890 |
-
"df_uni.to_parquet(
|
2891 |
]
|
2892 |
},
|
2893 |
{
|
|
|
64 |
"metadata": {},
|
65 |
"outputs": [],
|
66 |
"source": [
|
67 |
+
"df = pd.read_parquet(\"../NewsScraper/scraped_data1.parquet\")"
|
68 |
]
|
69 |
},
|
70 |
{
|
|
|
442 |
"outputs": [],
|
443 |
"source": [
|
444 |
"# drop empty lines\n",
|
445 |
+
"df_copy.dropna(subset=[\"Headline\"], inplace=True)"
|
446 |
]
|
447 |
},
|
448 |
{
|
|
|
463 |
],
|
464 |
"source": [
|
465 |
"print(\"Published Date Statistics:\")\n",
|
466 |
+
"print(\"Min Date:\", df_copy[\"Datetime\"].min())\n",
|
467 |
+
"print(\"Max Date:\", df_copy[\"Datetime\"].max())"
|
468 |
]
|
469 |
},
|
470 |
{
|
|
|
486 |
],
|
487 |
"source": [
|
488 |
"# Check if there are any duplicated titles since a news can be published for multiple times by different publisher at different time\n",
|
489 |
+
"df_copy[[\"Year\", \"Headline\", \"Region\"]].duplicated().any()"
|
490 |
]
|
491 |
},
|
492 |
{
|
|
|
497 |
"outputs": [],
|
498 |
"source": [
|
499 |
"# drop the duplicated news\n",
|
500 |
+
"duplicates = df_copy.duplicated(subset=[\"Year\", \"Headline\", \"Region\"], keep=\"first\")\n",
|
501 |
"df_uni = df_copy[~duplicates]"
|
502 |
]
|
503 |
},
|
|
|
508 |
"metadata": {},
|
509 |
"outputs": [],
|
510 |
"source": [
|
511 |
+
"df_uni = df_uni[\n",
|
512 |
+
" ~df_uni[\"content\"].str.contains(\"no content found|cannot scrape content\")\n",
|
513 |
+
"]"
|
514 |
]
|
515 |
},
|
516 |
{
|
|
|
569 |
}
|
570 |
],
|
571 |
"source": [
|
572 |
+
"df_uni[\"content\"][5]"
|
573 |
]
|
574 |
},
|
575 |
{
|
|
|
580 |
"outputs": [],
|
581 |
"source": [
|
582 |
"## remove contractions, lowercase, remove numbers and punctuations, remove stopwords\n",
|
583 |
+
"df_uni[\"cleaned_content\"] = df_uni[\"content\"].apply(\n",
|
584 |
+
" lambda x: [contractions.fix(word) for word in x.split()]\n",
|
585 |
+
")\n",
|
586 |
"\n",
|
587 |
"## convert back into string so that tokenization can be done\n",
|
588 |
+
"df_uni[\"cleaned_content\"] = [\" \".join(map(str, l)) for l in df_uni[\"cleaned_content\"]]"
|
589 |
]
|
590 |
},
|
591 |
{
|
|
|
606 |
}
|
607 |
],
|
608 |
"source": [
|
609 |
+
"df_uni[\"cleaned_content\"][5]"
|
610 |
]
|
611 |
},
|
612 |
{
|
|
|
661 |
"\n",
|
662 |
"wnl = WordNetLemmatizer()\n",
|
663 |
"\n",
|
664 |
+
"\n",
|
665 |
"def lemmatize_words(text):\n",
|
666 |
" # Tokenize the text into sentences and then words\n",
|
667 |
" sentences = sent_tokenize(text)\n",
|
668 |
" words = [word_tokenize(sentence) for sentence in sentences]\n",
|
669 |
"\n",
|
670 |
" # Remove punctuation and tokenize into lowercase words\n",
|
671 |
+
" punc = [[w.lower() for w in word if re.search(\"^[a-zA-Z]+$\", w)] for word in words]\n",
|
672 |
"\n",
|
673 |
" # Perform lemmatization on words with valid POS tags\n",
|
674 |
+
" doc_lemmed = [\n",
|
675 |
+
" wnl.lemmatize(word, pos[0].lower())\n",
|
676 |
+
" for sentence in punc\n",
|
677 |
+
" for word, pos in pos_tag(sentence, tagset=\"universal\")\n",
|
678 |
+
" if pos[0].lower() in [\"a\", \"s\", \"r\", \"n\", \"v\"]\n",
|
679 |
+
" ]\n",
|
680 |
+
"\n",
|
681 |
" return doc_lemmed"
|
682 |
]
|
683 |
},
|
|
|
706 |
"metadata": {},
|
707 |
"outputs": [],
|
708 |
"source": [
|
709 |
+
"df_uni[\"cleaned_content\"] = df_uni[\"cleaned_content\"].apply(lemmatize_words)"
|
710 |
]
|
711 |
},
|
712 |
{
|
|
|
742 |
"metadata": {},
|
743 |
"outputs": [],
|
744 |
"source": [
|
745 |
+
"stop_list = nltk.corpus.stopwords.words(\"english\")\n",
|
746 |
+
"stop_list += [\n",
|
747 |
+
" \"local\",\n",
|
748 |
+
" \"time\",\n",
|
749 |
+
" \"wednesday\",\n",
|
750 |
+
" \"source\",\n",
|
751 |
+
" \"certain\",\n",
|
752 |
+
" \"report\",\n",
|
753 |
+
" \"update\",\n",
|
754 |
+
" \"last\",\n",
|
755 |
+
" \"year\",\n",
|
756 |
+
" \"week\",\n",
|
757 |
+
" \"month\",\n",
|
758 |
+
" \"scrape\",\n",
|
759 |
+
" \"content\",\n",
|
760 |
+
"]\n",
|
761 |
+
"\n",
|
762 |
"\n",
|
763 |
"def corpus2docs2(corpus):\n",
|
764 |
" # corpus is a object returned by load_corpus that represents a corpus.\n",
|
|
|
769 |
" phrases = []\n",
|
770 |
" i = 0\n",
|
771 |
" while i < len(doc_pos):\n",
|
772 |
+
" if doc_pos[i][1] == \"JJ\":\n",
|
773 |
+
" if (\n",
|
774 |
+
" i + 2 < len(doc_pos)\n",
|
775 |
+
" and doc_pos[i + 1][1] == \"NN\"\n",
|
776 |
+
" and doc_pos[i + 2][1] == \"NN\"\n",
|
777 |
+
" ):\n",
|
778 |
+
" phrases.append(\n",
|
779 |
+
" (doc_pos[i][0], doc_pos[i + 1][0], doc_pos[i + 2][0])\n",
|
780 |
+
" )\n",
|
781 |
" i += 3\n",
|
782 |
+
" elif i + 1 < len(doc_pos) and doc_pos[i + 1][1] == \"NN\":\n",
|
783 |
+
" phrases.append((doc_pos[i][0], doc_pos[i + 1][0]))\n",
|
784 |
" i += 2\n",
|
785 |
" else:\n",
|
786 |
" i += 1\n",
|
787 |
+
" elif doc_pos[i][1] == \"NN\":\n",
|
788 |
+
" if (\n",
|
789 |
+
" i + 2 < len(doc_pos)\n",
|
790 |
+
" and doc_pos[i + 1][1] == \"NN\"\n",
|
791 |
+
" and doc_pos[i + 2][1] == \"NN\"\n",
|
792 |
+
" ):\n",
|
793 |
+
" phrases.append(\n",
|
794 |
+
" (doc_pos[i][0], doc_pos[i + 1][0], doc_pos[i + 2][0])\n",
|
795 |
+
" )\n",
|
796 |
" i += 3\n",
|
797 |
+
" elif i + 1 < len(doc_pos) and doc_pos[i + 1][1] == \"NN\":\n",
|
798 |
+
" phrases.append((doc_pos[i][0], doc_pos[i + 1][0]))\n",
|
799 |
" i += 2\n",
|
800 |
" else:\n",
|
801 |
" i += 1\n",
|
802 |
" else:\n",
|
803 |
" i += 1\n",
|
804 |
+
" phrase_set = [\"_\".join(word_set) for word_set in phrases]\n",
|
805 |
" docs.append(phrase_set)\n",
|
806 |
" return docs"
|
807 |
]
|
|
|
831 |
"metadata": {},
|
832 |
"outputs": [],
|
833 |
"source": [
|
834 |
+
"df_uni[\"binary_content\"] = corpus2docs2(df_uni[\"cleaned_content\"])"
|
835 |
]
|
836 |
},
|
837 |
{
|
|
|
879 |
}
|
880 |
],
|
881 |
"source": [
|
882 |
+
"df_uni[\"binary_content\"][5]"
|
883 |
]
|
884 |
},
|
885 |
{
|
|
|
902 |
}
|
903 |
],
|
904 |
"source": [
|
905 |
+
"fdist_doc = nltk.FreqDist(df_uni[\"binary_content\"][5]).most_common(25)\n",
|
906 |
"\n",
|
907 |
"x, y = zip(*fdist_doc)\n",
|
908 |
+
"plt.figure(figsize=(50, 30))\n",
|
909 |
"plt.margins(0.02)\n",
|
910 |
"plt.bar(x, y)\n",
|
911 |
+
"plt.xlabel(\"Words\", fontsize=50)\n",
|
912 |
+
"plt.ylabel(\"Frequency of Words\", fontsize=50)\n",
|
913 |
"plt.yticks(fontsize=40)\n",
|
914 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
915 |
+
"plt.title(\"Frequency of 25 Most Common Words for One Random News\", fontsize=60)\n",
|
916 |
"plt.show()"
|
917 |
]
|
918 |
},
|
|
|
923 |
"metadata": {},
|
924 |
"outputs": [],
|
925 |
"source": [
|
926 |
+
"all_words = [word for sublist in df_uni[\"binary_content\"] for word in sublist]\n",
|
927 |
"all_words[:2]\n",
|
928 |
"# Calculate word frequencies\n",
|
929 |
"fdist = FreqDist(all_words)"
|
|
|
961 |
"source": [
|
962 |
"# Plot the word frequency distribution as a bar graph\n",
|
963 |
"plt.figure(figsize=(12, 6))\n",
|
964 |
+
"plt.title(\"Frequency of 25 Most Common Words of the Dataset\", fontsize=12)\n",
|
965 |
"fdist.plot(30, cumulative=False)"
|
966 |
]
|
967 |
},
|
|
|
991 |
}
|
992 |
],
|
993 |
"source": [
|
994 |
+
"com = df_uni[\"Severity\"].unique()\n",
|
995 |
"com[:10]"
|
996 |
]
|
997 |
},
|
|
|
1056 |
"\n",
|
1057 |
"# Plotting with Seaborn for each company\n",
|
1058 |
"for region in com[:10]:\n",
|
1059 |
+
" haha = df_uni[\"binary_content\"].loc[df_uni.Severity == region]\n",
|
1060 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
1061 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
1062 |
+
" plt.imshow(wordcloud, interpolation=\"bilinear\")\n",
|
1063 |
+
" plt.title(f\"Wordcloud for {region}\")\n",
|
1064 |
" plt.axis(\"off\")\n",
|
1065 |
" plt.margins(x=0, y=0)\n",
|
1066 |
" plt.show()"
|
|
|
1083 |
"metadata": {},
|
1084 |
"outputs": [],
|
1085 |
"source": [
|
1086 |
+
"df_uni[\"binary_content\"] = df_uni[\"binary_content\"].apply(lambda x: \" \".join(x))\n",
|
1087 |
"\n",
|
1088 |
"# Tokenize the text and create a dictionary\n",
|
1089 |
+
"documents = df_uni[\"binary_content\"].str.split()\n",
|
1090 |
"dictionary = corpora.Dictionary(documents)\n",
|
1091 |
"\n",
|
1092 |
"tfidf = models.TfidfModel(dictionary=dictionary, normalize=True)\n",
|
|
|
2114 |
}
|
2115 |
],
|
2116 |
"source": [
|
2117 |
+
"sorted_term_frequencies = dict(\n",
|
2118 |
+
" sorted(term_frequencies.items(), key=lambda item: item[1], reverse=True)\n",
|
2119 |
+
")\n",
|
2120 |
"sorted_term_frequencies"
|
2121 |
]
|
2122 |
},
|
|
|
2138 |
"# customisable, lower threshold, more words retained.\n",
|
2139 |
"threshold = 0.03\n",
|
2140 |
"\n",
|
2141 |
+
"\n",
|
2142 |
"def filter_and_join(tfidf_doc):\n",
|
2143 |
" filtered_terms = [dictionary[id] for id, score in tfidf_doc if score >= threshold]\n",
|
2144 |
" return filtered_terms\n",
|
2145 |
"\n",
|
2146 |
+
"\n",
|
2147 |
+
"df_uni[\"binary_content\"] = [filter_and_join(doc) for doc in tfidf_corpus]"
|
2148 |
]
|
2149 |
},
|
2150 |
{
|
|
|
2176 |
}
|
2177 |
],
|
2178 |
"source": [
|
2179 |
+
"df_uni[\"binary_content\"]"
|
2180 |
]
|
2181 |
},
|
2182 |
{
|
|
|
2199 |
}
|
2200 |
],
|
2201 |
"source": [
|
2202 |
+
"fdist_doc = nltk.FreqDist(df_uni[\"binary_content\"][5]).most_common(25)\n",
|
2203 |
"\n",
|
2204 |
"x, y = zip(*fdist_doc)\n",
|
2205 |
+
"plt.figure(figsize=(50, 30))\n",
|
2206 |
"plt.margins(0.02)\n",
|
2207 |
"plt.bar(x, y)\n",
|
2208 |
+
"plt.xlabel(\"Words\", fontsize=50)\n",
|
2209 |
+
"plt.ylabel(\"Frequency of Words\", fontsize=50)\n",
|
2210 |
"plt.yticks(fontsize=40)\n",
|
2211 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
2212 |
+
"plt.title(\"Frequency of 25 Most Common Words for One Random News\", fontsize=60)\n",
|
2213 |
"plt.show()"
|
2214 |
]
|
2215 |
},
|
|
|
2220 |
"metadata": {},
|
2221 |
"outputs": [],
|
2222 |
"source": [
|
2223 |
+
"all_words_filtered = [word for sublist in df_uni[\"binary_content\"] for word in sublist]\n",
|
2224 |
"all_words_filtered[:2]\n",
|
2225 |
"# Calculate word frequencies\n",
|
2226 |
"fdist_filtered = FreqDist(all_words_filtered)"
|
|
|
2259 |
"# Plot the word frequency distribution as a bar graph\n",
|
2260 |
"# apparently, the dataset is much cleaner now.\n",
|
2261 |
"plt.figure(figsize=(12, 6))\n",
|
2262 |
+
"plt.title(\"Frequency of 25 Most Common Words of the Dataset\", fontsize=12)\n",
|
2263 |
"fdist_filtered.plot(30, cumulative=False)"
|
2264 |
]
|
2265 |
},
|
|
|
2324 |
"\n",
|
2325 |
"# Plotting with Seaborn for each company\n",
|
2326 |
"for region in com[:10]:\n",
|
2327 |
+
" haha = df_uni[\"binary_content\"].loc[df_uni.Severity == region]\n",
|
2328 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
2329 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
2330 |
+
" plt.imshow(wordcloud, interpolation=\"bilinear\")\n",
|
2331 |
+
" plt.title(f\"Wordcloud for {region}\")\n",
|
2332 |
" plt.axis(\"off\")\n",
|
2333 |
" plt.margins(x=0, y=0)\n",
|
2334 |
" plt.show()"
|
|
|
2341 |
"metadata": {},
|
2342 |
"outputs": [],
|
2343 |
"source": [
|
2344 |
+
"df_uni[\"word_count\"] = df_uni[\"binary_content\"].apply(len)"
|
2345 |
]
|
2346 |
},
|
2347 |
{
|
|
|
2429 |
}
|
2430 |
],
|
2431 |
"source": [
|
2432 |
+
"df_uni[[\"word_count\"]].describe().round()"
|
2433 |
]
|
2434 |
},
|
2435 |
{
|
|
|
2541 |
],
|
2542 |
"source": [
|
2543 |
"# count of news by region\n",
|
2544 |
+
"df_uni[[\"binary_content\", \"Region\"]].groupby(\"Region\").count().sort_values(\n",
|
2545 |
+
" by=\"binary_content\", ascending=False\n",
|
2546 |
+
")"
|
2547 |
]
|
2548 |
},
|
2549 |
{
|
|
|
2616 |
}
|
2617 |
],
|
2618 |
"source": [
|
2619 |
+
"df_uni[[\"binary_content\", \"Severity\"]].groupby(\"Severity\").count().sort_values(\n",
|
2620 |
+
" by=\"binary_content\", ascending=False\n",
|
2621 |
+
")"
|
2622 |
]
|
2623 |
},
|
2624 |
{
|
|
|
2931 |
"outputs": [],
|
2932 |
"source": [
|
2933 |
"# export as parquet data file instead of csv for easier list extraction\n",
|
2934 |
+
"df_uni.to_parquet(\"processed_data1.parquet\", index=False)"
|
2935 |
]
|
2936 |
},
|
2937 |
{
|
IS424_Data_Mining/code/LDA/topic_modelling_benchmark_using_headline.ipynb
CHANGED
@@ -38,7 +38,8 @@
|
|
38 |
"import datetime\n",
|
39 |
"\n",
|
40 |
"import warnings\n",
|
41 |
-
"
|
|
|
42 |
"\n",
|
43 |
"from pprint import pprint\n",
|
44 |
"import pyLDAvis\n",
|
@@ -95,7 +96,7 @@
|
|
95 |
}
|
96 |
],
|
97 |
"source": [
|
98 |
-
"df = pd.read_parquet(
|
99 |
]
|
100 |
},
|
101 |
{
|
@@ -512,8 +513,8 @@
|
|
512 |
}
|
513 |
],
|
514 |
"source": [
|
515 |
-
"print(
|
516 |
-
"print(
|
517 |
]
|
518 |
},
|
519 |
{
|
@@ -683,13 +684,15 @@
|
|
683 |
],
|
684 |
"source": [
|
685 |
"# Build LDA benchmark model\n",
|
686 |
-
"lda_model = gensim.models.LdaMulticore(
|
687 |
-
"
|
688 |
-
"
|
689 |
-
"
|
690 |
-
"
|
691 |
-
"
|
692 |
-
"
|
|
|
|
|
693 |
]
|
694 |
},
|
695 |
{
|
@@ -771,9 +774,11 @@
|
|
771 |
],
|
772 |
"source": [
|
773 |
"# Compute Benchmark Coherence Score\n",
|
774 |
-
"coherence_model_lda = CoherenceModel(
|
|
|
|
|
775 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
776 |
-
"print(
|
777 |
]
|
778 |
},
|
779 |
{
|
@@ -803,10 +808,10 @@
|
|
803 |
],
|
804 |
"source": [
|
805 |
"# Compute Benchmark Perplexity\n",
|
806 |
-
"perplex= lda_model.log_perplexity(docs_vecs, total_docs=None)
|
807 |
-
"
|
808 |
"\n",
|
809 |
-
"print(
|
810 |
]
|
811 |
},
|
812 |
{
|
@@ -831,9 +836,9 @@
|
|
831 |
"\n",
|
832 |
"# feed the LDA model into the pyLDAvis instance\n",
|
833 |
"pyLDAvis.enable_notebook()\n",
|
834 |
-
"visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
835 |
"\n",
|
836 |
-
"#Save the output to the html file\n",
|
837 |
"pyLDAvis.save_html(visual, \"topic_viz_benchmark.html\")"
|
838 |
]
|
839 |
},
|
@@ -921,16 +926,16 @@
|
|
921 |
}
|
922 |
],
|
923 |
"source": [
|
924 |
-
"pd.set_option(
|
925 |
"# Get the topics and their top keywords into a dataframe\n",
|
926 |
-
"topics = lda_model.show_topics(num_words=6)
|
927 |
"\n",
|
928 |
"topic_keywords = pd.DataFrame()\n",
|
929 |
"for topic_id, topic in topics:\n",
|
930 |
-
" topic_keywords.at[topic_id,
|
931 |
"\n",
|
932 |
-
"topic_keywords[
|
933 |
-
"# topic_keywords['Topic Name'] = topic_mapping
|
934 |
"topic_keywords"
|
935 |
]
|
936 |
},
|
|
|
38 |
"import datetime\n",
|
39 |
"\n",
|
40 |
"import warnings\n",
|
41 |
+
"\n",
|
42 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
43 |
"\n",
|
44 |
"from pprint import pprint\n",
|
45 |
"import pyLDAvis\n",
|
|
|
96 |
}
|
97 |
],
|
98 |
"source": [
|
99 |
+
"df = pd.read_parquet(\"processed_data.parquet\")"
|
100 |
]
|
101 |
},
|
102 |
{
|
|
|
513 |
}
|
514 |
],
|
515 |
"source": [
|
516 |
+
"print(\"Number of unique tokens: %d\" % len(doc_dict))\n",
|
517 |
+
"print(\"Number of articles: %d\" % len(docs_vecs))"
|
518 |
]
|
519 |
},
|
520 |
{
|
|
|
684 |
],
|
685 |
"source": [
|
686 |
"# Build LDA benchmark model\n",
|
687 |
+
"lda_model = gensim.models.LdaMulticore(\n",
|
688 |
+
" corpus=docs_vecs,\n",
|
689 |
+
" id2word=doc_dict,\n",
|
690 |
+
" num_topics=4,\n",
|
691 |
+
" random_state=42,\n",
|
692 |
+
" chunksize=100,\n",
|
693 |
+
" passes=10,\n",
|
694 |
+
" per_word_topics=True,\n",
|
695 |
+
")"
|
696 |
]
|
697 |
},
|
698 |
{
|
|
|
774 |
],
|
775 |
"source": [
|
776 |
"# Compute Benchmark Coherence Score\n",
|
777 |
+
"coherence_model_lda = CoherenceModel(\n",
|
778 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
779 |
+
")\n",
|
780 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
781 |
+
"print(\"\\nCoherence Score LDAModel: \", coherence_lda)"
|
782 |
]
|
783 |
},
|
784 |
{
|
|
|
808 |
],
|
809 |
"source": [
|
810 |
"# Compute Benchmark Perplexity\n",
|
811 |
+
"perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) # For LDAModel\n",
|
812 |
+
"# a measure of how good the model is. lower the better.\n",
|
813 |
"\n",
|
814 |
+
"print(\"\\nPerplexity for LDAModel: \", perplex)"
|
815 |
]
|
816 |
},
|
817 |
{
|
|
|
836 |
"\n",
|
837 |
"# feed the LDA model into the pyLDAvis instance\n",
|
838 |
"pyLDAvis.enable_notebook()\n",
|
839 |
+
"visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
840 |
"\n",
|
841 |
+
"# Save the output to the html file\n",
|
842 |
"pyLDAvis.save_html(visual, \"topic_viz_benchmark.html\")"
|
843 |
]
|
844 |
},
|
|
|
926 |
}
|
927 |
],
|
928 |
"source": [
|
929 |
+
"pd.set_option(\"max_colwidth\", 200)\n",
|
930 |
"# Get the topics and their top keywords into a dataframe\n",
|
931 |
+
"topics = lda_model.show_topics(num_words=6)\n",
|
932 |
"\n",
|
933 |
"topic_keywords = pd.DataFrame()\n",
|
934 |
"for topic_id, topic in topics:\n",
|
935 |
+
" topic_keywords.at[topic_id, \"Topic Keywords\"] = topic\n",
|
936 |
"\n",
|
937 |
+
"topic_keywords[\"Topic ID\"] = topic_keywords.index\n",
|
938 |
+
"# topic_keywords['Topic Name'] = topic_mapping\n",
|
939 |
"topic_keywords"
|
940 |
]
|
941 |
},
|
IS424_Data_Mining/code/LDA/topic_modelling_minor.ipynb
CHANGED
@@ -38,7 +38,8 @@
|
|
38 |
"import datetime\n",
|
39 |
"\n",
|
40 |
"import warnings\n",
|
41 |
-
"
|
|
|
42 |
"\n",
|
43 |
"from pprint import pprint\n",
|
44 |
"import pyLDAvis\n",
|
@@ -95,7 +96,7 @@
|
|
95 |
}
|
96 |
],
|
97 |
"source": [
|
98 |
-
"df = pd.read_parquet(
|
99 |
]
|
100 |
},
|
101 |
{
|
@@ -448,7 +449,7 @@
|
|
448 |
],
|
449 |
"source": [
|
450 |
"# choose only the extreme and severe cases for modelling\n",
|
451 |
-
"cleaned = df_copy[df_copy[
|
452 |
"cleaned.reset_index(drop=True, inplace=True)"
|
453 |
]
|
454 |
},
|
@@ -597,8 +598,8 @@
|
|
597 |
}
|
598 |
],
|
599 |
"source": [
|
600 |
-
"print(
|
601 |
-
"print(
|
602 |
]
|
603 |
},
|
604 |
{
|
@@ -768,13 +769,15 @@
|
|
768 |
],
|
769 |
"source": [
|
770 |
"# Build LDA benchmark model\n",
|
771 |
-
"lda_model = gensim.models.LdaMulticore(
|
772 |
-
"
|
773 |
-
"
|
774 |
-
"
|
775 |
-
"
|
776 |
-
"
|
777 |
-
"
|
|
|
|
|
778 |
]
|
779 |
},
|
780 |
{
|
@@ -803,9 +806,11 @@
|
|
803 |
"outputs": [],
|
804 |
"source": [
|
805 |
"# Compute Benchmark Coherence Score\n",
|
806 |
-
"coherence_model_lda = CoherenceModel(
|
|
|
|
|
807 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
808 |
-
"print(
|
809 |
]
|
810 |
},
|
811 |
{
|
@@ -818,10 +823,10 @@
|
|
818 |
"outputs": [],
|
819 |
"source": [
|
820 |
"# Compute Benchmark Perplexity\n",
|
821 |
-
"perplex= lda_model.log_perplexity(docs_vecs, total_docs=None)
|
822 |
-
"
|
823 |
"\n",
|
824 |
-
"print(
|
825 |
]
|
826 |
},
|
827 |
{
|
@@ -837,9 +842,9 @@
|
|
837 |
"\n",
|
838 |
"# feed the LDA model into the pyLDAvis instance\n",
|
839 |
"pyLDAvis.enable_notebook()\n",
|
840 |
-
"visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
841 |
"\n",
|
842 |
-
"#Save the output to the html file\n",
|
843 |
"pyLDAvis.save_html(visual, \"topic_viz_benchmark_minor.html\")"
|
844 |
]
|
845 |
},
|
@@ -850,7 +855,7 @@
|
|
850 |
"metadata": {},
|
851 |
"outputs": [],
|
852 |
"source": [
|
853 |
-
"# break
|
854 |
]
|
855 |
},
|
856 |
{
|
@@ -878,20 +883,24 @@
|
|
878 |
"source": [
|
879 |
"# hyper-perameter tuning (alpha and beta)\n",
|
880 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
881 |
-
"
|
882 |
-
" lda_model = gensim.models.LdaMulticore(
|
883 |
-
"
|
884 |
-
"
|
885 |
-
"
|
886 |
-
"
|
887 |
-
"
|
888 |
-
"
|
889 |
-
"
|
890 |
-
"
|
891 |
-
"
|
|
|
|
|
|
|
|
|
892 |
" coherence = coherence_model_lda.get_coherence()\n",
|
893 |
-
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)
|
894 |
-
"
|
895 |
" return coherence, perplex"
|
896 |
]
|
897 |
},
|
@@ -919,12 +928,12 @@
|
|
919 |
"\n",
|
920 |
"# Alpha parameter\n",
|
921 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
922 |
-
"alpha.append(
|
923 |
-
"alpha.append(
|
924 |
"\n",
|
925 |
"# Beta parameter\n",
|
926 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
927 |
-
"beta.append(
|
928 |
]
|
929 |
},
|
930 |
{
|
@@ -944,8 +953,8 @@
|
|
944 |
},
|
945 |
"outputs": [],
|
946 |
"source": [
|
947 |
-
"print(\"Topic range: \",num_topics)\n",
|
948 |
-
"print(\"Alpha: \",alpha)\n",
|
949 |
"print(\"Beta: \", beta)"
|
950 |
]
|
951 |
},
|
@@ -965,15 +974,28 @@
|
|
965 |
"for a in alpha:\n",
|
966 |
" for b in beta:\n",
|
967 |
" for num in num_topics:\n",
|
968 |
-
" cv, pv = compute_coherence_values(
|
|
|
|
|
969 |
"\n",
|
970 |
-
" model_topics.append(num)
|
971 |
-
" coherence_values.append(cv)
|
972 |
" perplexity_values.append(pv)\n",
|
973 |
" alpha_result.append(a)\n",
|
974 |
" beta_result.append(b)\n",
|
975 |
-
" print(\
|
976 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
977 |
"print(datetime.datetime.now())"
|
978 |
]
|
979 |
},
|
@@ -994,13 +1016,17 @@
|
|
994 |
"source": [
|
995 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
996 |
"result = pd.DataFrame(\n",
|
997 |
-
" {
|
998 |
-
"
|
999 |
-
"
|
1000 |
-
"
|
1001 |
-
"
|
1002 |
-
"
|
1003 |
-
"
|
|
|
|
|
|
|
|
|
1004 |
]
|
1005 |
},
|
1006 |
{
|
@@ -1010,7 +1036,7 @@
|
|
1010 |
"metadata": {},
|
1011 |
"outputs": [],
|
1012 |
"source": [
|
1013 |
-
"result.to_csv(
|
1014 |
]
|
1015 |
},
|
1016 |
{
|
@@ -1023,7 +1049,7 @@
|
|
1023 |
"outputs": [],
|
1024 |
"source": [
|
1025 |
"# Show graph Topics vs Coherence Score\n",
|
1026 |
-
"result.groupby(
|
1027 |
]
|
1028 |
},
|
1029 |
{
|
@@ -1038,7 +1064,7 @@
|
|
1038 |
"plt.plot(model_topics, coherence_values)\n",
|
1039 |
"plt.xlabel(\"Num Topics\")\n",
|
1040 |
"plt.ylabel(\"Coherence Score\")\n",
|
1041 |
-
"plt.legend((\"Coherence Score\"), loc
|
1042 |
"plt.show()"
|
1043 |
]
|
1044 |
},
|
@@ -1054,7 +1080,7 @@
|
|
1054 |
"plt.plot(model_topics, perplexity_values)\n",
|
1055 |
"plt.xlabel(\"Num Topics\")\n",
|
1056 |
"plt.ylabel(\"Perplexity score\")\n",
|
1057 |
-
"plt.legend((\"perplexity_values\"), loc
|
1058 |
"plt.show()"
|
1059 |
]
|
1060 |
},
|
@@ -1086,17 +1112,19 @@
|
|
1086 |
"# a = 'asymmetric'\n",
|
1087 |
"a = 0.31\n",
|
1088 |
"# b = 0.31\n",
|
1089 |
-
"b =
|
1090 |
"\n",
|
1091 |
"\n",
|
1092 |
-
"final_model = gensim.models.LdaMulticore(
|
1093 |
-
"
|
1094 |
-
"
|
1095 |
-
"
|
1096 |
-
"
|
1097 |
-
"
|
1098 |
-
"
|
1099 |
-
"
|
|
|
|
|
1100 |
]
|
1101 |
},
|
1102 |
{
|
@@ -1108,7 +1136,7 @@
|
|
1108 |
},
|
1109 |
"outputs": [],
|
1110 |
"source": [
|
1111 |
-
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=k, a=a, b=b)
|
1112 |
]
|
1113 |
},
|
1114 |
{
|
@@ -1120,12 +1148,12 @@
|
|
1120 |
},
|
1121 |
"outputs": [],
|
1122 |
"source": [
|
1123 |
-
"#Set up the environment to display the graphical outputs\n",
|
1124 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1125 |
"pyLDAvis.enable_notebook()\n",
|
1126 |
-
"visual= gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1127 |
"\n",
|
1128 |
-
"#Save the output to the html file\n",
|
1129 |
"pyLDAvis.save_html(visual, \"topic_viz2_minor_training.html\")"
|
1130 |
]
|
1131 |
},
|
@@ -1170,14 +1198,14 @@
|
|
1170 |
"outputs": [],
|
1171 |
"source": [
|
1172 |
"# Get the topics and their top keywords into a dataframe\n",
|
1173 |
-
"topics = final_model.show_topics(num_words=30)
|
1174 |
"\n",
|
1175 |
"topic_keywords = pd.DataFrame()\n",
|
1176 |
"for topic_id, topic in topics:\n",
|
1177 |
-
" topic_keywords.at[topic_id,
|
1178 |
"\n",
|
1179 |
-
"topic_keywords[
|
1180 |
-
"topic_keywords[
|
1181 |
"topic_keywords"
|
1182 |
]
|
1183 |
},
|
@@ -1196,7 +1224,7 @@
|
|
1196 |
"metadata": {},
|
1197 |
"outputs": [],
|
1198 |
"source": [
|
1199 |
-
"#Save a model to disk, or reload a pre-trained model\n",
|
1200 |
"# naming convention: final_model_topic_alpha_eta\n",
|
1201 |
"final_model.save(\"final_model_5_asym_91\")"
|
1202 |
]
|
@@ -1233,13 +1261,19 @@
|
|
1233 |
"outputs": [],
|
1234 |
"source": [
|
1235 |
"import warnings\n",
|
1236 |
-
"
|
|
|
|
|
1237 |
"\n",
|
1238 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
1239 |
" # Preallocate memory for the DataFrame\n",
|
1240 |
" num_docs = len(corpus)\n",
|
1241 |
-
" sent_topics = {
|
1242 |
-
"
|
|
|
|
|
|
|
|
|
1243 |
" # Get main topic in each document\n",
|
1244 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
1245 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
@@ -1247,13 +1281,13 @@
|
|
1247 |
" # Get the dominant topic, its contribution and the full topic distribution for each document\n",
|
1248 |
" dominant_topic, perc_contribution = row[0]\n",
|
1249 |
" topic_distribution = row\n",
|
1250 |
-
" sent_topics[
|
1251 |
-
" sent_topics[
|
1252 |
-
" sent_topics[
|
1253 |
"\n",
|
1254 |
" # Create the DataFrame\n",
|
1255 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
1256 |
-
" sent_topics_df[
|
1257 |
"\n",
|
1258 |
" return sent_topics_df"
|
1259 |
]
|
@@ -1265,7 +1299,9 @@
|
|
1265 |
"metadata": {},
|
1266 |
"outputs": [],
|
1267 |
"source": [
|
1268 |
-
"df_topic_sents_keywords = format_topics_sentences(
|
|
|
|
|
1269 |
]
|
1270 |
},
|
1271 |
{
|
@@ -1277,7 +1313,13 @@
|
|
1277 |
"source": [
|
1278 |
"# Format\n",
|
1279 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
1280 |
-
"df_dominant_topic.columns = [
|
|
|
|
|
|
|
|
|
|
|
|
|
1281 |
"\n",
|
1282 |
"# Show\n",
|
1283 |
"df_dominant_topic.head(10)"
|
@@ -1329,7 +1371,7 @@
|
|
1329 |
"# Show the plot\n",
|
1330 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
1331 |
"plt.tight_layout()\n",
|
1332 |
-
"plt.show()
|
1333 |
]
|
1334 |
},
|
1335 |
{
|
@@ -1339,7 +1381,7 @@
|
|
1339 |
"metadata": {},
|
1340 |
"outputs": [],
|
1341 |
"source": [
|
1342 |
-
"df_dominant_topic.sort_values(by
|
1343 |
]
|
1344 |
},
|
1345 |
{
|
@@ -1350,9 +1392,9 @@
|
|
1350 |
"outputs": [],
|
1351 |
"source": [
|
1352 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
1353 |
-
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)
|
1354 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
1355 |
-
"sampled_df.to_csv(
|
1356 |
]
|
1357 |
}
|
1358 |
],
|
|
|
38 |
"import datetime\n",
|
39 |
"\n",
|
40 |
"import warnings\n",
|
41 |
+
"\n",
|
42 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
43 |
"\n",
|
44 |
"from pprint import pprint\n",
|
45 |
"import pyLDAvis\n",
|
|
|
96 |
}
|
97 |
],
|
98 |
"source": [
|
99 |
+
"df = pd.read_parquet(\"processed_data1.parquet\")"
|
100 |
]
|
101 |
},
|
102 |
{
|
|
|
449 |
],
|
450 |
"source": [
|
451 |
"# choose only the minor-severity cases for modelling\n",
|
452 |
+
"cleaned = df_copy[df_copy[\"Severity\"].isin([\"Minor\"])]\n",
|
453 |
"cleaned.reset_index(drop=True, inplace=True)"
|
454 |
]
|
455 |
},
|
|
|
598 |
}
|
599 |
],
|
600 |
"source": [
|
601 |
+
"print(\"Number of unique tokens: %d\" % len(doc_dict))\n",
|
602 |
+
"print(\"Number of articles: %d\" % len(docs_vecs))"
|
603 |
]
|
604 |
},
|
605 |
{
|
|
|
769 |
],
|
770 |
"source": [
|
771 |
"# Build LDA benchmark model\n",
|
772 |
+
"lda_model = gensim.models.LdaMulticore(\n",
|
773 |
+
" corpus=docs_vecs,\n",
|
774 |
+
" id2word=doc_dict,\n",
|
775 |
+
" num_topics=4,\n",
|
776 |
+
" random_state=42,\n",
|
777 |
+
" chunksize=100,\n",
|
778 |
+
" passes=10,\n",
|
779 |
+
" per_word_topics=True,\n",
|
780 |
+
")"
|
781 |
]
|
782 |
},
|
783 |
{
|
|
|
806 |
"outputs": [],
|
807 |
"source": [
|
808 |
"# Compute Benchmark Coherence Score\n",
|
809 |
+
"coherence_model_lda = CoherenceModel(\n",
|
810 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
811 |
+
")\n",
|
812 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
813 |
+
"print(\"\\nCoherence Score LDAModel: \", coherence_lda)"
|
814 |
]
|
815 |
},
|
816 |
{
|
|
|
823 |
"outputs": [],
|
824 |
"source": [
|
825 |
"# Compute Benchmark Perplexity\n",
|
826 |
+
"perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) # For LDAModel\n",
|
827 |
+
"# log_perplexity returns the per-word likelihood bound (perplexity = 2 ** -bound): a higher bound means lower perplexity, i.e. a better fit.\n",
|
828 |
"\n",
|
829 |
+
"print(\"\\nPerplexity for LDAModel: \", perplex)"
|
830 |
]
|
831 |
},
|
832 |
{
|
|
|
842 |
"\n",
|
843 |
"# feed the LDA model into the pyLDAvis instance\n",
|
844 |
"pyLDAvis.enable_notebook()\n",
|
845 |
+
"visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
846 |
"\n",
|
847 |
+
"# Save the output to the html file\n",
|
848 |
"pyLDAvis.save_html(visual, \"topic_viz_benchmark_minor.html\")"
|
849 |
]
|
850 |
},
|
|
|
855 |
"metadata": {},
|
856 |
"outputs": [],
|
857 |
"source": [
|
858 |
+
"# break"
|
859 |
]
|
860 |
},
|
861 |
{
|
|
|
883 |
"source": [
|
884 |
"# hyper-parameter tuning (alpha and beta)\n",
|
885 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
886 |
+
"\n",
|
887 |
+
" lda_model = gensim.models.LdaMulticore(\n",
|
888 |
+
" corpus=corpus,\n",
|
889 |
+
" id2word=dictionary,\n",
|
890 |
+
" num_topics=k,\n",
|
891 |
+
" random_state=42,\n",
|
892 |
+
" chunksize=100,\n",
|
893 |
+
" passes=10,\n",
|
894 |
+
" alpha=a,\n",
|
895 |
+
" eta=b,\n",
|
896 |
+
" )\n",
|
897 |
+
"\n",
|
898 |
+
" coherence_model_lda = CoherenceModel(\n",
|
899 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
900 |
+
" )\n",
|
901 |
" coherence = coherence_model_lda.get_coherence()\n",
|
902 |
+
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)\n",
|
903 |
+
"\n",
|
904 |
" return coherence, perplex"
|
905 |
]
|
906 |
},
|
|
|
928 |
"\n",
|
929 |
"# Alpha parameter\n",
|
930 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
931 |
+
"alpha.append(\"symmetric\")\n",
|
932 |
+
"alpha.append(\"asymmetric\")\n",
|
933 |
"\n",
|
934 |
"# Beta parameter\n",
|
935 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
936 |
+
"beta.append(\"symmetric\")"
|
937 |
]
|
938 |
},
|
939 |
{
|
|
|
953 |
},
|
954 |
"outputs": [],
|
955 |
"source": [
|
956 |
+
"print(\"Topic range: \", num_topics)\n",
|
957 |
+
"print(\"Alpha: \", alpha)\n",
|
958 |
"print(\"Beta: \", beta)"
|
959 |
]
|
960 |
},
|
|
|
974 |
"for a in alpha:\n",
|
975 |
" for b in beta:\n",
|
976 |
" for num in num_topics:\n",
|
977 |
+
" cv, pv = compute_coherence_values(\n",
|
978 |
+
" corpus=docs_vecs, dictionary=doc_dict, k=num, a=a, b=b\n",
|
979 |
+
" )\n",
|
980 |
"\n",
|
981 |
+
" model_topics.append(num)\n",
|
982 |
+
" coherence_values.append(cv)\n",
|
983 |
" perplexity_values.append(pv)\n",
|
984 |
" alpha_result.append(a)\n",
|
985 |
" beta_result.append(b)\n",
|
986 |
+
" print(\n",
|
987 |
+
" \"#Topics: \"\n",
|
988 |
+
" + str(num)\n",
|
989 |
+
" + \", CV Score: \"\n",
|
990 |
+
" + str(coherence_values[-1])\n",
|
991 |
+
" + \", PV Score: \"\n",
|
992 |
+
" + str(perplexity_values[-1])\n",
|
993 |
+
" + \", Alpha: \"\n",
|
994 |
+
" + str(alpha_result[-1])\n",
|
995 |
+
" + \", Beta: \"\n",
|
996 |
+
" + str(beta_result[-1])\n",
|
997 |
+
" )\n",
|
998 |
+
"\n",
|
999 |
"print(datetime.datetime.now())"
|
1000 |
]
|
1001 |
},
|
|
|
1016 |
"source": [
|
1017 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1018 |
"result = pd.DataFrame(\n",
|
1019 |
+
" {\n",
|
1020 |
+
" \"Topics\": model_topics,\n",
|
1021 |
+
" \"Coherence Score\": coherence_values,\n",
|
1022 |
+
" \"Perplexity Score\": perplexity_values,\n",
|
1023 |
+
" \"Alpha\": alpha_result,\n",
|
1024 |
+
" \"Beta\": beta_result,\n",
|
1025 |
+
" }\n",
|
1026 |
+
")\n",
|
1027 |
+
"result.sort_values(\n",
|
1028 |
+
" by=[\"Coherence Score\", \"Perplexity Score\"], ascending=[False, True]\n",
|
1029 |
+
").head(20)"
|
1030 |
]
|
1031 |
},
|
1032 |
{
|
|
|
1036 |
"metadata": {},
|
1037 |
"outputs": [],
|
1038 |
"source": [
|
1039 |
+
"result.to_csv(\"lda_fine_tuning_result.csv\")"
|
1040 |
]
|
1041 |
},
|
1042 |
{
|
|
|
1049 |
"outputs": [],
|
1050 |
"source": [
|
1051 |
"# Show graph Topics vs Coherence Score\n",
|
1052 |
+
"result.groupby(\"Alpha\").plot(x=\"Topics\", y=\"Coherence Score\", legend=True)"
|
1053 |
]
|
1054 |
},
|
1055 |
{
|
|
|
1064 |
"plt.plot(model_topics, coherence_values)\n",
|
1065 |
"plt.xlabel(\"Num Topics\")\n",
|
1066 |
"plt.ylabel(\"Coherence Score\")\n",
|
1067 |
+
"plt.legend((\"Coherence Score\"), loc=\"best\")\n",
|
1068 |
"plt.show()"
|
1069 |
]
|
1070 |
},
|
|
|
1080 |
"plt.plot(model_topics, perplexity_values)\n",
|
1081 |
"plt.xlabel(\"Num Topics\")\n",
|
1082 |
"plt.ylabel(\"Perplexity score\")\n",
|
1083 |
+
"plt.legend((\"perplexity_values\"), loc=\"best\")\n",
|
1084 |
"plt.show()"
|
1085 |
]
|
1086 |
},
|
|
|
1112 |
"# a = 'asymmetric'\n",
|
1113 |
"a = 0.31\n",
|
1114 |
"# b = 0.31\n",
|
1115 |
+
"b = \"symmetric\"\n",
|
1116 |
"\n",
|
1117 |
"\n",
|
1118 |
+
"final_model = gensim.models.LdaMulticore(\n",
|
1119 |
+
" corpus=docs_vecs,\n",
|
1120 |
+
" id2word=doc_dict,\n",
|
1121 |
+
" num_topics=k,\n",
|
1122 |
+
" random_state=42,\n",
|
1123 |
+
" chunksize=100,\n",
|
1124 |
+
" passes=10,\n",
|
1125 |
+
" alpha=a,\n",
|
1126 |
+
" eta=b,\n",
|
1127 |
+
")"
|
1128 |
]
|
1129 |
},
|
1130 |
{
|
|
|
1136 |
},
|
1137 |
"outputs": [],
|
1138 |
"source": [
|
1139 |
+
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict, k=k, a=a, b=b)"
|
1140 |
]
|
1141 |
},
|
1142 |
{
|
|
|
1148 |
},
|
1149 |
"outputs": [],
|
1150 |
"source": [
|
1151 |
+
"# Set up the environment to display the graphical outputs\n",
|
1152 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1153 |
"pyLDAvis.enable_notebook()\n",
|
1154 |
+
"visual = gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1155 |
"\n",
|
1156 |
+
"# Save the output to the html file\n",
|
1157 |
"pyLDAvis.save_html(visual, \"topic_viz2_minor_training.html\")"
|
1158 |
]
|
1159 |
},
|
|
|
1198 |
"outputs": [],
|
1199 |
"source": [
|
1200 |
"# Get the topics and their top keywords into a dataframe\n",
|
1201 |
+
"topics = final_model.show_topics(num_words=30)\n",
|
1202 |
"\n",
|
1203 |
"topic_keywords = pd.DataFrame()\n",
|
1204 |
"for topic_id, topic in topics:\n",
|
1205 |
+
" topic_keywords.at[topic_id, \"Topic Keywords\"] = topic\n",
|
1206 |
"\n",
|
1207 |
+
"topic_keywords[\"Topic ID\"] = topic_keywords.index\n",
|
1208 |
+
"topic_keywords[\"Topic Name\"] = topic_mapping\n",
|
1209 |
"topic_keywords"
|
1210 |
]
|
1211 |
},
|
|
|
1224 |
"metadata": {},
|
1225 |
"outputs": [],
|
1226 |
"source": [
|
1227 |
+
"# Save a model to disk, or reload a pre-trained model\n",
|
1228 |
"# naming convention: final_model_topic_alpha_eta\n",
|
1229 |
"final_model.save(\"final_model_5_asym_91\")"
|
1230 |
]
|
|
|
1261 |
"outputs": [],
|
1262 |
"source": [
|
1263 |
"import warnings\n",
|
1264 |
+
"\n",
|
1265 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
1266 |
+
"\n",
|
1267 |
"\n",
|
1268 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
1269 |
" # Preallocate memory for the DataFrame\n",
|
1270 |
" num_docs = len(corpus)\n",
|
1271 |
+
" sent_topics = {\n",
|
1272 |
+
" \"Dominant_Topic\": [0] * num_docs,\n",
|
1273 |
+
" \"Perc_Contribution\": [0.0] * num_docs,\n",
|
1274 |
+
" \"Topic_Distribution\": [()] * num_docs,\n",
|
1275 |
+
" }\n",
|
1276 |
+
"\n",
|
1277 |
" # Get main topic in each document\n",
|
1278 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
1279 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
|
|
1281 |
" # Get the dominant topic, its contribution and the full topic distribution for each document\n",
|
1282 |
" dominant_topic, perc_contribution = row[0]\n",
|
1283 |
" topic_distribution = row\n",
|
1284 |
+
" sent_topics[\"Dominant_Topic\"][i] = int(dominant_topic)\n",
|
1285 |
+
" sent_topics[\"Perc_Contribution\"][i] = round(perc_contribution, 4)\n",
|
1286 |
+
" sent_topics[\"Topic_Distribution\"][i] = topic_distribution\n",
|
1287 |
"\n",
|
1288 |
" # Create the DataFrame\n",
|
1289 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
1290 |
+
" sent_topics_df[\"Text\"] = data\n",
|
1291 |
"\n",
|
1292 |
" return sent_topics_df"
|
1293 |
]
|
|
|
1299 |
"metadata": {},
|
1300 |
"outputs": [],
|
1301 |
"source": [
|
1302 |
+
"df_topic_sents_keywords = format_topics_sentences(\n",
|
1303 |
+
" ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details\n",
|
1304 |
+
")"
|
1305 |
]
|
1306 |
},
|
1307 |
{
|
|
|
1313 |
"source": [
|
1314 |
"# Format\n",
|
1315 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
1316 |
+
"df_dominant_topic.columns = [\n",
|
1317 |
+
" \"Document_No\",\n",
|
1318 |
+
" \"Dominant_Topic\",\n",
|
1319 |
+
" \"Topic_Perc_Contrib\",\n",
|
1320 |
+
" \"Topic_Distribution\",\n",
|
1321 |
+
" \"Text\",\n",
|
1322 |
+
"]\n",
|
1323 |
"\n",
|
1324 |
"# Show\n",
|
1325 |
"df_dominant_topic.head(10)"
|
|
|
1371 |
"# Show the plot\n",
|
1372 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
1373 |
"plt.tight_layout()\n",
|
1374 |
+
"plt.show()"
|
1375 |
]
|
1376 |
},
|
1377 |
{
|
|
|
1381 |
"metadata": {},
|
1382 |
"outputs": [],
|
1383 |
"source": [
|
1384 |
+
"df_dominant_topic.sort_values(by=\"Topic_Perc_Contrib\", ascending=True).head(20)"
|
1385 |
]
|
1386 |
},
|
1387 |
{
|
|
|
1392 |
"outputs": [],
|
1393 |
"source": [
|
1394 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
1395 |
+
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)\n",
|
1396 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
1397 |
+
"sampled_df.to_csv(\"sample_minor.csv\")"
|
1398 |
]
|
1399 |
}
|
1400 |
],
|
IS424_Data_Mining/code/LDA/topic_modelling_moderate.ipynb
CHANGED
@@ -38,7 +38,8 @@
|
|
38 |
"import datetime\n",
|
39 |
"\n",
|
40 |
"import warnings\n",
|
41 |
-
"
|
|
|
42 |
"\n",
|
43 |
"from pprint import pprint\n",
|
44 |
"import pyLDAvis\n",
|
@@ -95,7 +96,7 @@
|
|
95 |
}
|
96 |
],
|
97 |
"source": [
|
98 |
-
"df = pd.read_parquet(
|
99 |
]
|
100 |
},
|
101 |
{
|
@@ -448,7 +449,7 @@
|
|
448 |
],
|
449 |
"source": [
|
450 |
"# choose only the extreme and severe cases for modelling\n",
|
451 |
-
"cleaned = df_copy[df_copy[
|
452 |
"cleaned.reset_index(drop=True, inplace=True)"
|
453 |
]
|
454 |
},
|
@@ -578,8 +579,8 @@
|
|
578 |
}
|
579 |
],
|
580 |
"source": [
|
581 |
-
"print(
|
582 |
-
"print(
|
583 |
]
|
584 |
},
|
585 |
{
|
@@ -749,13 +750,15 @@
|
|
749 |
],
|
750 |
"source": [
|
751 |
"# Build LDA benchmark model\n",
|
752 |
-
"lda_model = gensim.models.LdaMulticore(
|
753 |
-
"
|
754 |
-
"
|
755 |
-
"
|
756 |
-
"
|
757 |
-
"
|
758 |
-
"
|
|
|
|
|
759 |
]
|
760 |
},
|
761 |
{
|
@@ -838,9 +841,11 @@
|
|
838 |
],
|
839 |
"source": [
|
840 |
"# Compute Benchmark Coherence Score\n",
|
841 |
-
"coherence_model_lda = CoherenceModel(
|
|
|
|
|
842 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
843 |
-
"print(
|
844 |
]
|
845 |
},
|
846 |
{
|
@@ -870,10 +875,10 @@
|
|
870 |
],
|
871 |
"source": [
|
872 |
"# Compute Benchmark Perplexity\n",
|
873 |
-
"perplex= lda_model.log_perplexity(docs_vecs, total_docs=None)
|
874 |
-
"
|
875 |
"\n",
|
876 |
-
"print(
|
877 |
]
|
878 |
},
|
879 |
{
|
@@ -898,9 +903,9 @@
|
|
898 |
"\n",
|
899 |
"# feed the LDA model into the pyLDAvis instance\n",
|
900 |
"pyLDAvis.enable_notebook()\n",
|
901 |
-
"visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
902 |
"\n",
|
903 |
-
"#Save the output to the html file\n",
|
904 |
"pyLDAvis.save_html(visual, \"topic_viz_benchmark_moderate.html\")"
|
905 |
]
|
906 |
},
|
@@ -988,16 +993,16 @@
|
|
988 |
}
|
989 |
],
|
990 |
"source": [
|
991 |
-
"pd.set_option(
|
992 |
"# Get the topics and their top keywords into a dataframe\n",
|
993 |
-
"topics = lda_model.show_topics(num_words=6)
|
994 |
"\n",
|
995 |
"topic_keywords = pd.DataFrame()\n",
|
996 |
"for topic_id, topic in topics:\n",
|
997 |
-
" topic_keywords.at[topic_id,
|
998 |
"\n",
|
999 |
-
"topic_keywords[
|
1000 |
-
"# topic_keywords['Topic Name'] = topic_mapping
|
1001 |
"topic_keywords"
|
1002 |
]
|
1003 |
},
|
@@ -1017,7 +1022,7 @@
|
|
1017 |
}
|
1018 |
],
|
1019 |
"source": [
|
1020 |
-
"# break
|
1021 |
]
|
1022 |
},
|
1023 |
{
|
@@ -1054,20 +1059,24 @@
|
|
1054 |
"source": [
|
1055 |
"# hyper-parameter tuning (alpha and beta)\n",
|
1056 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
1057 |
-
"
|
1058 |
-
" lda_model = gensim.models.LdaMulticore(
|
1059 |
-
"
|
1060 |
-
"
|
1061 |
-
"
|
1062 |
-
"
|
1063 |
-
"
|
1064 |
-
"
|
1065 |
-
"
|
1066 |
-
"
|
1067 |
-
"
|
|
|
|
|
|
|
|
|
1068 |
" coherence = coherence_model_lda.get_coherence()\n",
|
1069 |
-
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)
|
1070 |
-
"
|
1071 |
" return coherence, perplex"
|
1072 |
]
|
1073 |
},
|
@@ -1104,12 +1113,12 @@
|
|
1104 |
"\n",
|
1105 |
"# Alpha parameter\n",
|
1106 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
1107 |
-
"alpha.append(
|
1108 |
-
"alpha.append(
|
1109 |
"\n",
|
1110 |
"# Beta parameter\n",
|
1111 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
1112 |
-
"beta.append(
|
1113 |
]
|
1114 |
},
|
1115 |
{
|
@@ -1147,8 +1156,8 @@
|
|
1147 |
}
|
1148 |
],
|
1149 |
"source": [
|
1150 |
-
"print(\"Topic range: \",num_topics)\n",
|
1151 |
-
"print(\"Alpha: \",alpha)\n",
|
1152 |
"print(\"Beta: \", beta)"
|
1153 |
]
|
1154 |
},
|
@@ -1371,15 +1380,28 @@
|
|
1371 |
"for a in alpha:\n",
|
1372 |
" for b in beta:\n",
|
1373 |
" for num in num_topics:\n",
|
1374 |
-
" cv, pv = compute_coherence_values(
|
|
|
|
|
1375 |
"\n",
|
1376 |
-
" model_topics.append(num)
|
1377 |
-
" coherence_values.append(cv)
|
1378 |
" perplexity_values.append(pv)\n",
|
1379 |
" alpha_result.append(a)\n",
|
1380 |
" beta_result.append(b)\n",
|
1381 |
-
" print(\
|
1382 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1383 |
"print(datetime.datetime.now())"
|
1384 |
]
|
1385 |
},
|
@@ -1630,13 +1652,17 @@
|
|
1630 |
"source": [
|
1631 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1632 |
"result = pd.DataFrame(\n",
|
1633 |
-
" {
|
1634 |
-
"
|
1635 |
-
"
|
1636 |
-
"
|
1637 |
-
"
|
1638 |
-
"
|
1639 |
-
"
|
|
|
|
|
|
|
|
|
1640 |
]
|
1641 |
},
|
1642 |
{
|
@@ -1655,7 +1681,7 @@
|
|
1655 |
}
|
1656 |
],
|
1657 |
"source": [
|
1658 |
-
"result.to_csv(
|
1659 |
]
|
1660 |
},
|
1661 |
{
|
@@ -1753,7 +1779,7 @@
|
|
1753 |
],
|
1754 |
"source": [
|
1755 |
"# Show graph Topics vs Coherence Score\n",
|
1756 |
-
"result.groupby(
|
1757 |
]
|
1758 |
},
|
1759 |
{
|
@@ -1789,7 +1815,7 @@
|
|
1789 |
"plt.plot(model_topics, coherence_values)\n",
|
1790 |
"plt.xlabel(\"Num Topics\")\n",
|
1791 |
"plt.ylabel(\"Coherence Score\")\n",
|
1792 |
-
"plt.legend((\"Coherence Score\"), loc
|
1793 |
"plt.show()"
|
1794 |
]
|
1795 |
},
|
@@ -1826,7 +1852,7 @@
|
|
1826 |
"plt.plot(model_topics, perplexity_values)\n",
|
1827 |
"plt.xlabel(\"Num Topics\")\n",
|
1828 |
"plt.ylabel(\"Perplexity score\")\n",
|
1829 |
-
"plt.legend((\"perplexity_values\"), loc
|
1830 |
"plt.show()"
|
1831 |
]
|
1832 |
},
|
@@ -1848,20 +1874,22 @@
|
|
1848 |
"# realised that there may be some overlaps for more than 5 topics, but below 5 topics results in low differentiation and high ambiguity among the topics.\n",
|
1849 |
"# LDA is not suitable for this dataset\n",
|
1850 |
"k = 9\n",
|
1851 |
-
"a =
|
1852 |
"# a = 0.31\n",
|
1853 |
"# b = 0.31\n",
|
1854 |
-
"b =
|
1855 |
"\n",
|
1856 |
"\n",
|
1857 |
-
"final_model = gensim.models.LdaMulticore(
|
1858 |
-
"
|
1859 |
-
"
|
1860 |
-
"
|
1861 |
-
"
|
1862 |
-
"
|
1863 |
-
"
|
1864 |
-
"
|
|
|
|
|
1865 |
]
|
1866 |
},
|
1867 |
{
|
@@ -1884,7 +1912,7 @@
|
|
1884 |
}
|
1885 |
],
|
1886 |
"source": [
|
1887 |
-
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=k, a=a, b=b)
|
1888 |
]
|
1889 |
},
|
1890 |
{
|
@@ -1896,12 +1924,12 @@
|
|
1896 |
},
|
1897 |
"outputs": [],
|
1898 |
"source": [
|
1899 |
-
"#Set up the environment to display the graphical outputs\n",
|
1900 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1901 |
"pyLDAvis.enable_notebook()\n",
|
1902 |
-
"visual= gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1903 |
"\n",
|
1904 |
-
"#Save the output to the html file\n",
|
1905 |
"pyLDAvis.save_html(visual, \"topic_viz12_mod_training.html\")"
|
1906 |
]
|
1907 |
},
|
@@ -1960,7 +1988,7 @@
|
|
1960 |
"metadata": {},
|
1961 |
"outputs": [],
|
1962 |
"source": [
|
1963 |
-
"#Save a model to disk, or reload a pre-trained model\n",
|
1964 |
"# naming convention: final_model_topic_alpha_eta\n",
|
1965 |
"final_model.save(\"final_model_9_sym_sym\")"
|
1966 |
]
|
@@ -1997,13 +2025,19 @@
|
|
1997 |
"outputs": [],
|
1998 |
"source": [
|
1999 |
"import warnings\n",
|
2000 |
-
"
|
|
|
|
|
2001 |
"\n",
|
2002 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
2003 |
" # Preallocate memory for the DataFrame\n",
|
2004 |
" num_docs = len(corpus)\n",
|
2005 |
-
" sent_topics = {
|
2006 |
-
"
|
|
|
|
|
|
|
|
|
2007 |
" # Get main topic in each document\n",
|
2008 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
2009 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
@@ -2011,13 +2045,13 @@
|
|
2011 |
" # Get the dominant topic, its contribution and the full topic distribution for each document\n",
|
2012 |
" dominant_topic, perc_contribution = row[0]\n",
|
2013 |
" topic_distribution = row\n",
|
2014 |
-
" sent_topics[
|
2015 |
-
" sent_topics[
|
2016 |
-
" sent_topics[
|
2017 |
"\n",
|
2018 |
" # Create the DataFrame\n",
|
2019 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
2020 |
-
" sent_topics_df[
|
2021 |
"\n",
|
2022 |
" return sent_topics_df"
|
2023 |
]
|
@@ -2029,7 +2063,9 @@
|
|
2029 |
"metadata": {},
|
2030 |
"outputs": [],
|
2031 |
"source": [
|
2032 |
-
"df_topic_sents_keywords = format_topics_sentences(
|
|
|
|
|
2033 |
]
|
2034 |
},
|
2035 |
{
|
@@ -2185,7 +2221,13 @@
|
|
2185 |
"source": [
|
2186 |
"# Format\n",
|
2187 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
2188 |
-
"df_dominant_topic.columns = [
|
|
|
|
|
|
|
|
|
|
|
|
|
2189 |
"\n",
|
2190 |
"# Show\n",
|
2191 |
"df_dominant_topic.head(10)"
|
@@ -2270,7 +2312,7 @@
|
|
2270 |
"# Show the plot\n",
|
2271 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
2272 |
"plt.tight_layout()\n",
|
2273 |
-
"plt.show()
|
2274 |
]
|
2275 |
},
|
2276 |
{
|
@@ -2302,9 +2344,9 @@
|
|
2302 |
"outputs": [],
|
2303 |
"source": [
|
2304 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
2305 |
-
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)
|
2306 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
2307 |
-
"sampled_df.to_csv(
|
2308 |
]
|
2309 |
}
|
2310 |
],
|
|
|
38 |
"import datetime\n",
|
39 |
"\n",
|
40 |
"import warnings\n",
|
41 |
+
"\n",
|
42 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
43 |
"\n",
|
44 |
"from pprint import pprint\n",
|
45 |
"import pyLDAvis\n",
|
|
|
96 |
}
|
97 |
],
|
98 |
"source": [
|
99 |
+
"df = pd.read_parquet(\"processed_data1.parquet\")"
|
100 |
]
|
101 |
},
|
102 |
{
|
|
|
449 |
],
|
450 |
"source": [
|
451 |
"# choose only the moderate-severity cases for modelling\n",
|
452 |
+
"cleaned = df_copy[df_copy[\"Severity\"].isin([\"Moderate\"])]\n",
|
453 |
"cleaned.reset_index(drop=True, inplace=True)"
|
454 |
]
|
455 |
},
|
|
|
579 |
}
|
580 |
],
|
581 |
"source": [
|
582 |
+
"print(\"Number of unique tokens: %d\" % len(doc_dict))\n",
|
583 |
+
"print(\"Number of articles: %d\" % len(docs_vecs))"
|
584 |
]
|
585 |
},
|
586 |
{
|
|
|
750 |
],
|
751 |
"source": [
|
752 |
"# Build LDA benchmark model\n",
|
753 |
+
"lda_model = gensim.models.LdaMulticore(\n",
|
754 |
+
" corpus=docs_vecs,\n",
|
755 |
+
" id2word=doc_dict,\n",
|
756 |
+
" num_topics=4,\n",
|
757 |
+
" random_state=42,\n",
|
758 |
+
" chunksize=100,\n",
|
759 |
+
" passes=10,\n",
|
760 |
+
" per_word_topics=True,\n",
|
761 |
+
")"
|
762 |
]
|
763 |
},
|
764 |
{
|
|
|
841 |
],
|
842 |
"source": [
|
843 |
"# Compute Benchmark Coherence Score\n",
|
844 |
+
"coherence_model_lda = CoherenceModel(\n",
|
845 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
846 |
+
")\n",
|
847 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
848 |
+
"print(\"\\nCoherence Score LDAModel: \", coherence_lda)"
|
849 |
]
|
850 |
},
|
851 |
{
|
|
|
875 |
],
|
876 |
"source": [
|
877 |
"# Compute Benchmark Perplexity\n",
|
878 |
+
"perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) # For LDAModel\n",
|
879 |
+
"# log_perplexity returns the per-word likelihood bound (perplexity = 2 ** -bound): a higher bound means lower perplexity, i.e. a better fit.\n",
|
880 |
"\n",
|
881 |
+
"print(\"\\nPerplexity for LDAModel: \", perplex)"
|
882 |
]
|
883 |
},
|
884 |
{
|
|
|
903 |
"\n",
|
904 |
"# feed the LDA model into the pyLDAvis instance\n",
|
905 |
"pyLDAvis.enable_notebook()\n",
|
906 |
+
"visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
907 |
"\n",
|
908 |
+
"# Save the output to the html file\n",
|
909 |
"pyLDAvis.save_html(visual, \"topic_viz_benchmark_moderate.html\")"
|
910 |
]
|
911 |
},
|
|
|
993 |
}
|
994 |
],
|
995 |
"source": [
|
996 |
+
"pd.set_option(\"max_colwidth\", 200)\n",
|
997 |
"# Get the topics and their top keywords into a dataframe\n",
|
998 |
+
"topics = lda_model.show_topics(num_words=6)\n",
|
999 |
"\n",
|
1000 |
"topic_keywords = pd.DataFrame()\n",
|
1001 |
"for topic_id, topic in topics:\n",
|
1002 |
+
" topic_keywords.at[topic_id, \"Topic Keywords\"] = topic\n",
|
1003 |
"\n",
|
1004 |
+
"topic_keywords[\"Topic ID\"] = topic_keywords.index\n",
|
1005 |
+
"# topic_keywords['Topic Name'] = topic_mapping\n",
|
1006 |
"topic_keywords"
|
1007 |
]
|
1008 |
},
|
|
|
1022 |
}
|
1023 |
],
|
1024 |
"source": [
|
1025 |
+
"# break"
|
1026 |
]
|
1027 |
},
|
1028 |
{
|
|
|
1059 |
"source": [
|
1060 |
"# hyper-parameter tuning (alpha and beta)\n",
|
1061 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
1062 |
+
"\n",
|
1063 |
+
" lda_model = gensim.models.LdaMulticore(\n",
|
1064 |
+
" corpus=corpus,\n",
|
1065 |
+
" id2word=dictionary,\n",
|
1066 |
+
" num_topics=k,\n",
|
1067 |
+
" random_state=42,\n",
|
1068 |
+
" chunksize=100,\n",
|
1069 |
+
" passes=10,\n",
|
1070 |
+
" alpha=a,\n",
|
1071 |
+
" eta=b,\n",
|
1072 |
+
" )\n",
|
1073 |
+
"\n",
|
1074 |
+
" coherence_model_lda = CoherenceModel(\n",
|
1075 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
1076 |
+
" )\n",
|
1077 |
" coherence = coherence_model_lda.get_coherence()\n",
|
1078 |
+
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)\n",
|
1079 |
+
"\n",
|
1080 |
" return coherence, perplex"
|
1081 |
]
|
1082 |
},
|
|
|
1113 |
"\n",
|
1114 |
"# Alpha parameter\n",
|
1115 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
1116 |
+
"alpha.append(\"symmetric\")\n",
|
1117 |
+
"alpha.append(\"asymmetric\")\n",
|
1118 |
"\n",
|
1119 |
"# Beta parameter\n",
|
1120 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
1121 |
+
"beta.append(\"symmetric\")"
|
1122 |
]
|
1123 |
},
|
1124 |
{
|
|
|
1156 |
}
|
1157 |
],
|
1158 |
"source": [
|
1159 |
+
"print(\"Topic range: \", num_topics)\n",
|
1160 |
+
"print(\"Alpha: \", alpha)\n",
|
1161 |
"print(\"Beta: \", beta)"
|
1162 |
]
|
1163 |
},
|
|
|
1380 |
"for a in alpha:\n",
|
1381 |
" for b in beta:\n",
|
1382 |
" for num in num_topics:\n",
|
1383 |
+
" cv, pv = compute_coherence_values(\n",
|
1384 |
+
" corpus=docs_vecs, dictionary=doc_dict, k=num, a=a, b=b\n",
|
1385 |
+
" )\n",
|
1386 |
"\n",
|
1387 |
+
" model_topics.append(num)\n",
|
1388 |
+
" coherence_values.append(cv)\n",
|
1389 |
" perplexity_values.append(pv)\n",
|
1390 |
" alpha_result.append(a)\n",
|
1391 |
" beta_result.append(b)\n",
|
1392 |
+
" print(\n",
|
1393 |
+
" \"#Topics: \"\n",
|
1394 |
+
" + str(num)\n",
|
1395 |
+
" + \", CV Score: \"\n",
|
1396 |
+
" + str(coherence_values[-1])\n",
|
1397 |
+
" + \", PV Score: \"\n",
|
1398 |
+
" + str(perplexity_values[-1])\n",
|
1399 |
+
" + \", Alpha: \"\n",
|
1400 |
+
" + str(alpha_result[-1])\n",
|
1401 |
+
" + \", Beta: \"\n",
|
1402 |
+
" + str(beta_result[-1])\n",
|
1403 |
+
" )\n",
|
1404 |
+
"\n",
|
1405 |
"print(datetime.datetime.now())"
|
1406 |
]
|
1407 |
},
|
|
|
1652 |
"source": [
|
1653 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1654 |
"result = pd.DataFrame(\n",
|
1655 |
+
" {\n",
|
1656 |
+
" \"Topics\": model_topics,\n",
|
1657 |
+
" \"Coherence Score\": coherence_values,\n",
|
1658 |
+
" \"Perplexity Score\": perplexity_values,\n",
|
1659 |
+
" \"Alpha\": alpha_result,\n",
|
1660 |
+
" \"Beta\": beta_result,\n",
|
1661 |
+
" }\n",
|
1662 |
+
")\n",
|
1663 |
+
"result.sort_values(\n",
|
1664 |
+
" by=[\"Coherence Score\", \"Perplexity Score\"], ascending=[False, True]\n",
|
1665 |
+
").head(20)"
|
1666 |
]
|
1667 |
},
|
1668 |
{
|
|
|
1681 |
}
|
1682 |
],
|
1683 |
"source": [
|
1684 |
+
"result.to_csv(\"lda_fine_tuning_result.csv\")"
|
1685 |
]
|
1686 |
},
|
1687 |
{
|
|
|
1779 |
],
|
1780 |
"source": [
|
1781 |
"# Show graph Topics vs Coherence Score\n",
|
1782 |
+
"result.groupby(\"Alpha\").plot(x=\"Topics\", y=\"Coherence Score\", legend=True)"
|
1783 |
]
|
1784 |
},
|
1785 |
{
|
|
|
1815 |
"plt.plot(model_topics, coherence_values)\n",
|
1816 |
"plt.xlabel(\"Num Topics\")\n",
|
1817 |
"plt.ylabel(\"Coherence Score\")\n",
|
1818 |
+
"plt.legend((\"Coherence Score\"), loc=\"best\")\n",
|
1819 |
"plt.show()"
|
1820 |
]
|
1821 |
},
|
|
|
1852 |
"plt.plot(model_topics, perplexity_values)\n",
|
1853 |
"plt.xlabel(\"Num Topics\")\n",
|
1854 |
"plt.ylabel(\"Perplexity score\")\n",
|
1855 |
+
"plt.legend((\"perplexity_values\"), loc=\"best\")\n",
|
1856 |
"plt.show()"
|
1857 |
]
|
1858 |
},
|
|
|
1874 |
"# realised that there may be some overlaps for more than 5 topics, but below 5 topics results in low differentiation and high ambiguity among the topics.\n",
|
1875 |
"# LDA is not suitable for this dataset\n",
|
1876 |
"k = 9\n",
|
1877 |
+
"a = \"symmetric\"\n",
|
1878 |
"# a = 0.31\n",
|
1879 |
"# b = 0.31\n",
|
1880 |
+
"b = \"symmetric\"\n",
|
1881 |
"\n",
|
1882 |
"\n",
|
1883 |
+
"final_model = gensim.models.LdaMulticore(\n",
|
1884 |
+
" corpus=docs_vecs,\n",
|
1885 |
+
" id2word=doc_dict,\n",
|
1886 |
+
" num_topics=k,\n",
|
1887 |
+
" random_state=42,\n",
|
1888 |
+
" chunksize=100,\n",
|
1889 |
+
" passes=10,\n",
|
1890 |
+
" alpha=a,\n",
|
1891 |
+
" eta=b,\n",
|
1892 |
+
")"
|
1893 |
]
|
1894 |
},
|
1895 |
{
|
|
|
1912 |
}
|
1913 |
],
|
1914 |
"source": [
|
1915 |
+
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict, k=k, a=a, b=b)"
|
1916 |
]
|
1917 |
},
|
1918 |
{
|
|
|
1924 |
},
|
1925 |
"outputs": [],
|
1926 |
"source": [
|
1927 |
+
"# Set up the environment to display the graphical outputs\n",
|
1928 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1929 |
"pyLDAvis.enable_notebook()\n",
|
1930 |
+
"visual = gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1931 |
"\n",
|
1932 |
+
"# Save the output to the html file\n",
|
1933 |
"pyLDAvis.save_html(visual, \"topic_viz12_mod_training.html\")"
|
1934 |
]
|
1935 |
},
|
|
|
1988 |
"metadata": {},
|
1989 |
"outputs": [],
|
1990 |
"source": [
|
1991 |
+
"# Save a model to disk, or reload a pre-trained model\n",
|
1992 |
"# naming convention: final_model_topic_alpha_eta\n",
|
1993 |
"final_model.save(\"final_model_9_sym_sym\")"
|
1994 |
]
|
|
|
2025 |
"outputs": [],
|
2026 |
"source": [
|
2027 |
"import warnings\n",
|
2028 |
+
"\n",
|
2029 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
2030 |
+
"\n",
|
2031 |
"\n",
|
2032 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
2033 |
" # Preallocate memory for the DataFrame\n",
|
2034 |
" num_docs = len(corpus)\n",
|
2035 |
+
" sent_topics = {\n",
|
2036 |
+
" \"Dominant_Topic\": [0] * num_docs,\n",
|
2037 |
+
" \"Perc_Contribution\": [0.0] * num_docs,\n",
|
2038 |
+
" \"Topic_Distribution\": [()] * num_docs,\n",
|
2039 |
+
" }\n",
|
2040 |
+
"\n",
|
2041 |
" # Get main topic in each document\n",
|
2042 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
2043 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
|
|
2045 |
" # Get the dominant topic, its contribution and the full topic distribution for each document\n",
|
2046 |
" dominant_topic, perc_contribution = row[0]\n",
|
2047 |
" topic_distribution = row\n",
|
2048 |
+
" sent_topics[\"Dominant_Topic\"][i] = int(dominant_topic)\n",
|
2049 |
+
" sent_topics[\"Perc_Contribution\"][i] = round(perc_contribution, 4)\n",
|
2050 |
+
" sent_topics[\"Topic_Distribution\"][i] = topic_distribution\n",
|
2051 |
"\n",
|
2052 |
" # Create the DataFrame\n",
|
2053 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
2054 |
+
" sent_topics_df[\"Text\"] = data\n",
|
2055 |
"\n",
|
2056 |
" return sent_topics_df"
|
2057 |
]
|
|
|
2063 |
"metadata": {},
|
2064 |
"outputs": [],
|
2065 |
"source": [
|
2066 |
+
"df_topic_sents_keywords = format_topics_sentences(\n",
|
2067 |
+
" ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details\n",
|
2068 |
+
")"
|
2069 |
]
|
2070 |
},
|
2071 |
{
|
|
|
2221 |
"source": [
|
2222 |
"# Format\n",
|
2223 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
2224 |
+
"df_dominant_topic.columns = [\n",
|
2225 |
+
" \"Document_No\",\n",
|
2226 |
+
" \"Dominant_Topic\",\n",
|
2227 |
+
" \"Topic_Perc_Contrib\",\n",
|
2228 |
+
" \"Topic_Distribution\",\n",
|
2229 |
+
" \"Text\",\n",
|
2230 |
+
"]\n",
|
2231 |
"\n",
|
2232 |
"# Show\n",
|
2233 |
"df_dominant_topic.head(10)"
|
|
|
2312 |
"# Show the plot\n",
|
2313 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
2314 |
"plt.tight_layout()\n",
|
2315 |
+
"plt.show()"
|
2316 |
]
|
2317 |
},
|
2318 |
{
|
|
|
2344 |
"outputs": [],
|
2345 |
"source": [
|
2346 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
2347 |
+
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)\n",
|
2348 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
2349 |
+
"sampled_df.to_csv(\"sample_moderate.csv\")"
|
2350 |
]
|
2351 |
}
|
2352 |
],
|
IS424_Data_Mining/code/LDA/topic_modelling_severe.ipynb
CHANGED
@@ -38,7 +38,8 @@
|
|
38 |
"import datetime\n",
|
39 |
"\n",
|
40 |
"import warnings\n",
|
41 |
-
"
|
|
|
42 |
"\n",
|
43 |
"from pprint import pprint\n",
|
44 |
"import pyLDAvis\n",
|
@@ -86,7 +87,7 @@
|
|
86 |
"metadata": {},
|
87 |
"outputs": [],
|
88 |
"source": [
|
89 |
-
"df = pd.read_parquet(
|
90 |
]
|
91 |
},
|
92 |
{
|
@@ -439,7 +440,7 @@
|
|
439 |
],
|
440 |
"source": [
|
441 |
"# choose only the extreme and severe cases for modelling\n",
|
442 |
-
"cleaned = df_copy[df_copy[
|
443 |
"cleaned.reset_index(drop=True, inplace=True)"
|
444 |
]
|
445 |
},
|
@@ -558,8 +559,8 @@
|
|
558 |
}
|
559 |
],
|
560 |
"source": [
|
561 |
-
"print(
|
562 |
-
"print(
|
563 |
]
|
564 |
},
|
565 |
{
|
@@ -737,13 +738,15 @@
|
|
737 |
],
|
738 |
"source": [
|
739 |
"# Build LDA benchmark model\n",
|
740 |
-
"lda_model = gensim.models.LdaMulticore(
|
741 |
-
"
|
742 |
-
"
|
743 |
-
"
|
744 |
-
"
|
745 |
-
"
|
746 |
-
"
|
|
|
|
|
747 |
]
|
748 |
},
|
749 |
{
|
@@ -826,9 +829,11 @@
|
|
826 |
],
|
827 |
"source": [
|
828 |
"# Compute Benchmark Coherence Score\n",
|
829 |
-
"coherence_model_lda = CoherenceModel(
|
|
|
|
|
830 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
831 |
-
"print(
|
832 |
]
|
833 |
},
|
834 |
{
|
@@ -858,10 +863,10 @@
|
|
858 |
],
|
859 |
"source": [
|
860 |
"# Compute Benchmark Perplexity\n",
|
861 |
-
"perplex= lda_model.log_perplexity(docs_vecs, total_docs=None)
|
862 |
-
"
|
863 |
"\n",
|
864 |
-
"print(
|
865 |
]
|
866 |
},
|
867 |
{
|
@@ -886,9 +891,9 @@
|
|
886 |
"\n",
|
887 |
"# feed the LDA model into the pyLDAvis instance\n",
|
888 |
"pyLDAvis.enable_notebook()\n",
|
889 |
-
"visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
890 |
"\n",
|
891 |
-
"#Save the output to the html file\n",
|
892 |
"pyLDAvis.save_html(visual, \"topic_viz_benchmark_severe.html\")"
|
893 |
]
|
894 |
},
|
@@ -976,15 +981,15 @@
|
|
976 |
}
|
977 |
],
|
978 |
"source": [
|
979 |
-
"pd.set_option(
|
980 |
"# Get the topics and their top keywords into a dataframe\n",
|
981 |
-
"topics = lda_model.show_topics(num_words=6)
|
982 |
"\n",
|
983 |
"topic_keywords = pd.DataFrame()\n",
|
984 |
"for topic_id, topic in topics:\n",
|
985 |
-
" topic_keywords.at[topic_id,
|
986 |
"\n",
|
987 |
-
"topic_keywords[
|
988 |
"topic_keywords"
|
989 |
]
|
990 |
},
|
@@ -1004,7 +1009,7 @@
|
|
1004 |
}
|
1005 |
],
|
1006 |
"source": [
|
1007 |
-
"# break
|
1008 |
]
|
1009 |
},
|
1010 |
{
|
@@ -1041,20 +1046,24 @@
|
|
1041 |
"source": [
|
1042 |
"# hyper-perameter tuning (alpha and beta)\n",
|
1043 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
1044 |
-
"
|
1045 |
-
" lda_model = gensim.models.LdaMulticore(
|
1046 |
-
"
|
1047 |
-
"
|
1048 |
-
"
|
1049 |
-
"
|
1050 |
-
"
|
1051 |
-
"
|
1052 |
-
"
|
1053 |
-
"
|
1054 |
-
"
|
|
|
|
|
|
|
|
|
1055 |
" coherence = coherence_model_lda.get_coherence()\n",
|
1056 |
-
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)
|
1057 |
-
"
|
1058 |
" return coherence, perplex"
|
1059 |
]
|
1060 |
},
|
@@ -1091,12 +1100,12 @@
|
|
1091 |
"\n",
|
1092 |
"# Alpha parameter\n",
|
1093 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
1094 |
-
"alpha.append(
|
1095 |
-
"alpha.append(
|
1096 |
"\n",
|
1097 |
"# Beta parameter\n",
|
1098 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
1099 |
-
"beta.append(
|
1100 |
]
|
1101 |
},
|
1102 |
{
|
@@ -1134,8 +1143,8 @@
|
|
1134 |
}
|
1135 |
],
|
1136 |
"source": [
|
1137 |
-
"print(\"Topic range: \",num_topics)\n",
|
1138 |
-
"print(\"Alpha: \",alpha)\n",
|
1139 |
"print(\"Beta: \", beta)"
|
1140 |
]
|
1141 |
},
|
@@ -1358,15 +1367,28 @@
|
|
1358 |
"for a in alpha:\n",
|
1359 |
" for b in beta:\n",
|
1360 |
" for num in num_topics:\n",
|
1361 |
-
" cv, pv = compute_coherence_values(
|
|
|
|
|
1362 |
"\n",
|
1363 |
-
" model_topics.append(num)
|
1364 |
-
" coherence_values.append(cv)
|
1365 |
" perplexity_values.append(pv)\n",
|
1366 |
" alpha_result.append(a)\n",
|
1367 |
" beta_result.append(b)\n",
|
1368 |
-
" print(\
|
1369 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1370 |
"print(datetime.datetime.now())"
|
1371 |
]
|
1372 |
},
|
@@ -1617,13 +1639,17 @@
|
|
1617 |
"source": [
|
1618 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1619 |
"result = pd.DataFrame(\n",
|
1620 |
-
" {
|
1621 |
-
"
|
1622 |
-
"
|
1623 |
-
"
|
1624 |
-
"
|
1625 |
-
"
|
1626 |
-
"
|
|
|
|
|
|
|
|
|
1627 |
]
|
1628 |
},
|
1629 |
{
|
@@ -1642,7 +1668,7 @@
|
|
1642 |
}
|
1643 |
],
|
1644 |
"source": [
|
1645 |
-
"result.to_csv(
|
1646 |
]
|
1647 |
},
|
1648 |
{
|
@@ -1740,7 +1766,7 @@
|
|
1740 |
],
|
1741 |
"source": [
|
1742 |
"# Show graph Topics vs Coherence Score\n",
|
1743 |
-
"result.groupby(
|
1744 |
]
|
1745 |
},
|
1746 |
{
|
@@ -1776,7 +1802,7 @@
|
|
1776 |
"plt.plot(model_topics, coherence_values)\n",
|
1777 |
"plt.xlabel(\"Num Topics\")\n",
|
1778 |
"plt.ylabel(\"Coherence Score\")\n",
|
1779 |
-
"plt.legend((\"Coherence Score\"), loc
|
1780 |
"plt.show()"
|
1781 |
]
|
1782 |
},
|
@@ -1813,7 +1839,7 @@
|
|
1813 |
"plt.plot(model_topics, perplexity_values)\n",
|
1814 |
"plt.xlabel(\"Num Topics\")\n",
|
1815 |
"plt.ylabel(\"Perplexity score\")\n",
|
1816 |
-
"plt.legend((\"perplexity_values\"), loc
|
1817 |
"plt.show()"
|
1818 |
]
|
1819 |
},
|
@@ -1861,21 +1887,22 @@
|
|
1861 |
"source": [
|
1862 |
"# realised that there may be some overlaps for 8 topics, thus 4-6 topics are optimal\n",
|
1863 |
"k = 8\n",
|
1864 |
-
"a =
|
1865 |
"# a = 0.91\n",
|
1866 |
"# b = 0.61\n",
|
1867 |
-
"b =
|
1868 |
"\n",
|
1869 |
"\n",
|
1870 |
-
"\n",
|
1871 |
-
"
|
1872 |
-
"
|
1873 |
-
"
|
1874 |
-
"
|
1875 |
-
"
|
1876 |
-
"
|
1877 |
-
"
|
1878 |
-
"
|
|
|
1879 |
]
|
1880 |
},
|
1881 |
{
|
@@ -1906,7 +1933,7 @@
|
|
1906 |
}
|
1907 |
],
|
1908 |
"source": [
|
1909 |
-
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=k, a=a, b=b)
|
1910 |
]
|
1911 |
},
|
1912 |
{
|
@@ -1927,12 +1954,12 @@
|
|
1927 |
}
|
1928 |
],
|
1929 |
"source": [
|
1930 |
-
"#Set up the environment to display the graphical outputs\n",
|
1931 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1932 |
"pyLDAvis.enable_notebook()\n",
|
1933 |
-
"visual= gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1934 |
"\n",
|
1935 |
-
"#Save the output to the html file\n",
|
1936 |
"pyLDAvis.save_html(visual, \"topic_viz8_severe_training.html\")"
|
1937 |
]
|
1938 |
},
|
@@ -1997,7 +2024,7 @@
|
|
1997 |
"metadata": {},
|
1998 |
"outputs": [],
|
1999 |
"source": [
|
2000 |
-
"#Save a model to disk, or reload a pre-trained model\n",
|
2001 |
"# naming convention: final_model_topic_alpha_eta\n",
|
2002 |
"final_model.save(\"final_model_8_asym_sym\")"
|
2003 |
]
|
@@ -2028,13 +2055,19 @@
|
|
2028 |
],
|
2029 |
"source": [
|
2030 |
"import warnings\n",
|
2031 |
-
"
|
|
|
|
|
2032 |
"\n",
|
2033 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
2034 |
" # Preallocate memory for the DataFrame\n",
|
2035 |
" num_docs = len(corpus)\n",
|
2036 |
-
" sent_topics = {
|
2037 |
-
"
|
|
|
|
|
|
|
|
|
2038 |
" # Get main topic in each document\n",
|
2039 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
2040 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
@@ -2042,13 +2075,13 @@
|
|
2042 |
" # Get the Dominant topic, Perc Contribution and Keywords for each document\n",
|
2043 |
" dominant_topic, perc_contribution = row[0]\n",
|
2044 |
" topic_distribution = row\n",
|
2045 |
-
" sent_topics[
|
2046 |
-
" sent_topics[
|
2047 |
-
" sent_topics[
|
2048 |
"\n",
|
2049 |
" # Create the DataFrame\n",
|
2050 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
2051 |
-
" sent_topics_df[
|
2052 |
"\n",
|
2053 |
" return sent_topics_df"
|
2054 |
]
|
@@ -2060,7 +2093,9 @@
|
|
2060 |
"metadata": {},
|
2061 |
"outputs": [],
|
2062 |
"source": [
|
2063 |
-
"df_topic_sents_keywords = format_topics_sentences(
|
|
|
|
|
2064 |
]
|
2065 |
},
|
2066 |
{
|
@@ -2228,7 +2263,13 @@
|
|
2228 |
"source": [
|
2229 |
"# Format\n",
|
2230 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
2231 |
-
"df_dominant_topic.columns = [
|
|
|
|
|
|
|
|
|
|
|
|
|
2232 |
"\n",
|
2233 |
"# Show\n",
|
2234 |
"df_dominant_topic.head(10)"
|
@@ -2312,7 +2353,7 @@
|
|
2312 |
"# Show the plot\n",
|
2313 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
2314 |
"plt.tight_layout()\n",
|
2315 |
-
"plt.show()
|
2316 |
]
|
2317 |
},
|
2318 |
{
|
@@ -2566,7 +2607,7 @@
|
|
2566 |
}
|
2567 |
],
|
2568 |
"source": [
|
2569 |
-
"df_dominant_topic.sort_values(by
|
2570 |
]
|
2571 |
},
|
2572 |
{
|
@@ -2577,9 +2618,9 @@
|
|
2577 |
"outputs": [],
|
2578 |
"source": [
|
2579 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
2580 |
-
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)
|
2581 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
2582 |
-
"sampled_df.to_csv(
|
2583 |
]
|
2584 |
},
|
2585 |
{
|
|
|
38 |
"import datetime\n",
|
39 |
"\n",
|
40 |
"import warnings\n",
|
41 |
+
"\n",
|
42 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
43 |
"\n",
|
44 |
"from pprint import pprint\n",
|
45 |
"import pyLDAvis\n",
|
|
|
87 |
"metadata": {},
|
88 |
"outputs": [],
|
89 |
"source": [
|
90 |
+
"df = pd.read_parquet(\"processed_data1.parquet\")"
|
91 |
]
|
92 |
},
|
93 |
{
|
|
|
440 |
],
|
441 |
"source": [
|
442 |
"# choose only the extreme and severe cases for modelling\n",
|
443 |
+
"cleaned = df_copy[df_copy[\"Severity\"].isin([\"Extreme\", \"Severe\"])]\n",
|
444 |
"cleaned.reset_index(drop=True, inplace=True)"
|
445 |
]
|
446 |
},
|
|
|
559 |
}
|
560 |
],
|
561 |
"source": [
|
562 |
+
"print(\"Number of unique tokens: %d\" % len(doc_dict))\n",
|
563 |
+
"print(\"Number of articles: %d\" % len(docs_vecs))"
|
564 |
]
|
565 |
},
|
566 |
{
|
|
|
738 |
],
|
739 |
"source": [
|
740 |
"# Build LDA benchmark model\n",
|
741 |
+
"lda_model = gensim.models.LdaMulticore(\n",
|
742 |
+
" corpus=docs_vecs,\n",
|
743 |
+
" id2word=doc_dict,\n",
|
744 |
+
" num_topics=4,\n",
|
745 |
+
" random_state=42,\n",
|
746 |
+
" chunksize=100,\n",
|
747 |
+
" passes=10,\n",
|
748 |
+
" per_word_topics=True,\n",
|
749 |
+
")"
|
750 |
]
|
751 |
},
|
752 |
{
|
|
|
829 |
],
|
830 |
"source": [
|
831 |
"# Compute Benchmark Coherence Score\n",
|
832 |
+
"coherence_model_lda = CoherenceModel(\n",
|
833 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
834 |
+
")\n",
|
835 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
836 |
+
"print(\"\\nCoherence Score LDAModel: \", coherence_lda)"
|
837 |
]
|
838 |
},
|
839 |
{
|
|
|
863 |
],
|
864 |
"source": [
|
865 |
"# Compute Benchmark Perplexity\n",
|
866 |
+
"perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) # For LDAModel\n",
|
867 |
+
"# a measure of how good the model is. lower the better.\n",
|
868 |
"\n",
|
869 |
+
"print(\"\\nPerplexity for LDAModel: \", perplex)"
|
870 |
]
|
871 |
},
|
872 |
{
|
|
|
891 |
"\n",
|
892 |
"# feed the LDA model into the pyLDAvis instance\n",
|
893 |
"pyLDAvis.enable_notebook()\n",
|
894 |
+
"visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
895 |
"\n",
|
896 |
+
"# Save the output to the html file\n",
|
897 |
"pyLDAvis.save_html(visual, \"topic_viz_benchmark_severe.html\")"
|
898 |
]
|
899 |
},
|
|
|
981 |
}
|
982 |
],
|
983 |
"source": [
|
984 |
+
"pd.set_option(\"max_colwidth\", 200)\n",
|
985 |
"# Get the topics and their top keywords into a dataframe\n",
|
986 |
+
"topics = lda_model.show_topics(num_words=6)\n",
|
987 |
"\n",
|
988 |
"topic_keywords = pd.DataFrame()\n",
|
989 |
"for topic_id, topic in topics:\n",
|
990 |
+
" topic_keywords.at[topic_id, \"Topic Keywords\"] = topic\n",
|
991 |
"\n",
|
992 |
+
"topic_keywords[\"Topic ID\"] = topic_keywords.index\n",
|
993 |
"topic_keywords"
|
994 |
]
|
995 |
},
|
|
|
1009 |
}
|
1010 |
],
|
1011 |
"source": [
|
1012 |
+
"# break"
|
1013 |
]
|
1014 |
},
|
1015 |
{
|
|
|
1046 |
"source": [
|
1047 |
"# hyper-perameter tuning (alpha and beta)\n",
|
1048 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
1049 |
+
"\n",
|
1050 |
+
" lda_model = gensim.models.LdaMulticore(\n",
|
1051 |
+
" corpus=corpus,\n",
|
1052 |
+
" id2word=dictionary,\n",
|
1053 |
+
" num_topics=k,\n",
|
1054 |
+
" random_state=42,\n",
|
1055 |
+
" chunksize=100,\n",
|
1056 |
+
" passes=10,\n",
|
1057 |
+
" alpha=a,\n",
|
1058 |
+
" eta=b,\n",
|
1059 |
+
" )\n",
|
1060 |
+
"\n",
|
1061 |
+
" coherence_model_lda = CoherenceModel(\n",
|
1062 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
1063 |
+
" )\n",
|
1064 |
" coherence = coherence_model_lda.get_coherence()\n",
|
1065 |
+
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)\n",
|
1066 |
+
"\n",
|
1067 |
" return coherence, perplex"
|
1068 |
]
|
1069 |
},
|
|
|
1100 |
"\n",
|
1101 |
"# Alpha parameter\n",
|
1102 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
1103 |
+
"alpha.append(\"symmetric\")\n",
|
1104 |
+
"alpha.append(\"asymmetric\")\n",
|
1105 |
"\n",
|
1106 |
"# Beta parameter\n",
|
1107 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
1108 |
+
"beta.append(\"symmetric\")"
|
1109 |
]
|
1110 |
},
|
1111 |
{
|
|
|
1143 |
}
|
1144 |
],
|
1145 |
"source": [
|
1146 |
+
"print(\"Topic range: \", num_topics)\n",
|
1147 |
+
"print(\"Alpha: \", alpha)\n",
|
1148 |
"print(\"Beta: \", beta)"
|
1149 |
]
|
1150 |
},
|
|
|
1367 |
"for a in alpha:\n",
|
1368 |
" for b in beta:\n",
|
1369 |
" for num in num_topics:\n",
|
1370 |
+
" cv, pv = compute_coherence_values(\n",
|
1371 |
+
" corpus=docs_vecs, dictionary=doc_dict, k=num, a=a, b=b\n",
|
1372 |
+
" )\n",
|
1373 |
"\n",
|
1374 |
+
" model_topics.append(num)\n",
|
1375 |
+
" coherence_values.append(cv)\n",
|
1376 |
" perplexity_values.append(pv)\n",
|
1377 |
" alpha_result.append(a)\n",
|
1378 |
" beta_result.append(b)\n",
|
1379 |
+
" print(\n",
|
1380 |
+
" \"#Topics: \"\n",
|
1381 |
+
" + str(num)\n",
|
1382 |
+
" + \", CV Score: \"\n",
|
1383 |
+
" + str(coherence_values[-1])\n",
|
1384 |
+
" + \", PV Score: \"\n",
|
1385 |
+
" + str(perplexity_values[-1])\n",
|
1386 |
+
" + \", Alpha: \"\n",
|
1387 |
+
" + str(alpha_result[-1])\n",
|
1388 |
+
" + \", Beta: \"\n",
|
1389 |
+
" + str(beta_result[-1])\n",
|
1390 |
+
" )\n",
|
1391 |
+
"\n",
|
1392 |
"print(datetime.datetime.now())"
|
1393 |
]
|
1394 |
},
|
|
|
1639 |
"source": [
|
1640 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1641 |
"result = pd.DataFrame(\n",
|
1642 |
+
" {\n",
|
1643 |
+
" \"Topics\": model_topics,\n",
|
1644 |
+
" \"Coherence Score\": coherence_values,\n",
|
1645 |
+
" \"Perplexity Score\": perplexity_values,\n",
|
1646 |
+
" \"Alpha\": alpha_result,\n",
|
1647 |
+
" \"Beta\": beta_result,\n",
|
1648 |
+
" }\n",
|
1649 |
+
")\n",
|
1650 |
+
"result.sort_values(\n",
|
1651 |
+
" by=[\"Coherence Score\", \"Perplexity Score\"], ascending=[False, True]\n",
|
1652 |
+
").head(20)"
|
1653 |
]
|
1654 |
},
|
1655 |
{
|
|
|
1668 |
}
|
1669 |
],
|
1670 |
"source": [
|
1671 |
+
"result.to_csv(\"lda_fine_tuning_result.csv\")"
|
1672 |
]
|
1673 |
},
|
1674 |
{
|
|
|
1766 |
],
|
1767 |
"source": [
|
1768 |
"# Show graph Topics vs Coherence Score\n",
|
1769 |
+
"result.groupby(\"Alpha\").plot(x=\"Topics\", y=\"Coherence Score\", legend=True)"
|
1770 |
]
|
1771 |
},
|
1772 |
{
|
|
|
1802 |
"plt.plot(model_topics, coherence_values)\n",
|
1803 |
"plt.xlabel(\"Num Topics\")\n",
|
1804 |
"plt.ylabel(\"Coherence Score\")\n",
|
1805 |
+
"plt.legend((\"Coherence Score\"), loc=\"best\")\n",
|
1806 |
"plt.show()"
|
1807 |
]
|
1808 |
},
|
|
|
1839 |
"plt.plot(model_topics, perplexity_values)\n",
|
1840 |
"plt.xlabel(\"Num Topics\")\n",
|
1841 |
"plt.ylabel(\"Perplexity score\")\n",
|
1842 |
+
"plt.legend((\"perplexity_values\"), loc=\"best\")\n",
|
1843 |
"plt.show()"
|
1844 |
]
|
1845 |
},
|
|
|
1887 |
"source": [
|
1888 |
"# realised that there may be some overlaps for 8 topics, thus 4-6 topics are optimal\n",
|
1889 |
"k = 8\n",
|
1890 |
+
"a = \"asymmetric\"\n",
|
1891 |
"# a = 0.91\n",
|
1892 |
"# b = 0.61\n",
|
1893 |
+
"b = \"symmetric\"\n",
|
1894 |
"\n",
|
1895 |
"\n",
|
1896 |
+
"final_model = gensim.models.LdaMulticore(\n",
|
1897 |
+
" corpus=docs_vecs,\n",
|
1898 |
+
" id2word=doc_dict,\n",
|
1899 |
+
" num_topics=k,\n",
|
1900 |
+
" random_state=42,\n",
|
1901 |
+
" chunksize=100,\n",
|
1902 |
+
" passes=10,\n",
|
1903 |
+
" alpha=a,\n",
|
1904 |
+
" eta=b,\n",
|
1905 |
+
")"
|
1906 |
]
|
1907 |
},
|
1908 |
{
|
|
|
1933 |
}
|
1934 |
],
|
1935 |
"source": [
|
1936 |
+
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict, k=k, a=a, b=b)"
|
1937 |
]
|
1938 |
},
|
1939 |
{
|
|
|
1954 |
}
|
1955 |
],
|
1956 |
"source": [
|
1957 |
+
"# Set up the environment to display the graphical outputs\n",
|
1958 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1959 |
"pyLDAvis.enable_notebook()\n",
|
1960 |
+
"visual = gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1961 |
"\n",
|
1962 |
+
"# Save the output to the html file\n",
|
1963 |
"pyLDAvis.save_html(visual, \"topic_viz8_severe_training.html\")"
|
1964 |
]
|
1965 |
},
|
|
|
2024 |
"metadata": {},
|
2025 |
"outputs": [],
|
2026 |
"source": [
|
2027 |
+
"# Save a model to disk, or reload a pre-trained model\n",
|
2028 |
"# naming convention: final_model_topic_alpha_eta\n",
|
2029 |
"final_model.save(\"final_model_8_asym_sym\")"
|
2030 |
]
|
|
|
2055 |
],
|
2056 |
"source": [
|
2057 |
"import warnings\n",
|
2058 |
+
"\n",
|
2059 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
2060 |
+
"\n",
|
2061 |
"\n",
|
2062 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
2063 |
" # Preallocate memory for the DataFrame\n",
|
2064 |
" num_docs = len(corpus)\n",
|
2065 |
+
" sent_topics = {\n",
|
2066 |
+
" \"Dominant_Topic\": [0] * num_docs,\n",
|
2067 |
+
" \"Perc_Contribution\": [0.0] * num_docs,\n",
|
2068 |
+
" \"Topic_Distribution\": [()] * num_docs,\n",
|
2069 |
+
" }\n",
|
2070 |
+
"\n",
|
2071 |
" # Get main topic in each document\n",
|
2072 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
2073 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
|
|
2075 |
" # Get the Dominant topic, Perc Contribution and Keywords for each document\n",
|
2076 |
" dominant_topic, perc_contribution = row[0]\n",
|
2077 |
" topic_distribution = row\n",
|
2078 |
+
" sent_topics[\"Dominant_Topic\"][i] = int(dominant_topic)\n",
|
2079 |
+
" sent_topics[\"Perc_Contribution\"][i] = round(perc_contribution, 4)\n",
|
2080 |
+
" sent_topics[\"Topic_Distribution\"][i] = topic_distribution\n",
|
2081 |
"\n",
|
2082 |
" # Create the DataFrame\n",
|
2083 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
2084 |
+
" sent_topics_df[\"Text\"] = data\n",
|
2085 |
"\n",
|
2086 |
" return sent_topics_df"
|
2087 |
]
|
|
|
2093 |
"metadata": {},
|
2094 |
"outputs": [],
|
2095 |
"source": [
|
2096 |
+
"df_topic_sents_keywords = format_topics_sentences(\n",
|
2097 |
+
" ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details\n",
|
2098 |
+
")"
|
2099 |
]
|
2100 |
},
|
2101 |
{
|
|
|
2263 |
"source": [
|
2264 |
"# Format\n",
|
2265 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
2266 |
+
"df_dominant_topic.columns = [\n",
|
2267 |
+
" \"Document_No\",\n",
|
2268 |
+
" \"Dominant_Topic\",\n",
|
2269 |
+
" \"Topic_Perc_Contrib\",\n",
|
2270 |
+
" \"Topic_Distribution\",\n",
|
2271 |
+
" \"Text\",\n",
|
2272 |
+
"]\n",
|
2273 |
"\n",
|
2274 |
"# Show\n",
|
2275 |
"df_dominant_topic.head(10)"
|
|
|
2353 |
"# Show the plot\n",
|
2354 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
2355 |
"plt.tight_layout()\n",
|
2356 |
+
"plt.show()"
|
2357 |
]
|
2358 |
},
|
2359 |
{
|
|
|
2607 |
}
|
2608 |
],
|
2609 |
"source": [
|
2610 |
+
"df_dominant_topic.sort_values(by=\"Topic_Perc_Contrib\", ascending=False).head(20)"
|
2611 |
]
|
2612 |
},
|
2613 |
{
|
|
|
2618 |
"outputs": [],
|
2619 |
"source": [
|
2620 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
2621 |
+
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)\n",
|
2622 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
2623 |
+
"sampled_df.to_csv(\"sample_severe.csv\")"
|
2624 |
]
|
2625 |
},
|
2626 |
{
|
IS424_Data_Mining/code/LLM Evaluation/evaluation.ipynb
CHANGED
@@ -821,7 +821,7 @@
|
|
821 |
}
|
822 |
],
|
823 |
"source": [
|
824 |
-
"df_sorted = df.groupby(\"Category\").count().sort_values(by
|
825 |
"df_sorted"
|
826 |
]
|
827 |
},
|
@@ -834,8 +834,8 @@
|
|
834 |
"source": [
|
835 |
"# Function to determine the value for the new column\n",
|
836 |
"def categorize(value):\n",
|
837 |
-
" if
|
838 |
-
" return
|
839 |
" else:\n",
|
840 |
" return value"
|
841 |
]
|
@@ -847,7 +847,7 @@
|
|
847 |
"metadata": {},
|
848 |
"outputs": [],
|
849 |
"source": [
|
850 |
-
"df[
|
851 |
]
|
852 |
},
|
853 |
{
|
@@ -1131,7 +1131,9 @@
|
|
1131 |
}
|
1132 |
],
|
1133 |
"source": [
|
1134 |
-
"df_sorted1 =
|
|
|
|
|
1135 |
"df_sorted1"
|
1136 |
]
|
1137 |
},
|
@@ -1142,7 +1144,7 @@
|
|
1142 |
"metadata": {},
|
1143 |
"outputs": [],
|
1144 |
"source": [
|
1145 |
-
"df.to_csv(
|
1146 |
]
|
1147 |
},
|
1148 |
{
|
@@ -1488,7 +1490,7 @@
|
|
1488 |
}
|
1489 |
],
|
1490 |
"source": [
|
1491 |
-
"eva = pd.read_csv(
|
1492 |
"eva"
|
1493 |
]
|
1494 |
},
|
@@ -1524,8 +1526,10 @@
|
|
1524 |
}
|
1525 |
],
|
1526 |
"source": [
|
1527 |
-
"eva[
|
1528 |
-
"
|
|
|
|
|
1529 |
"\n",
|
1530 |
"result_gpt = result.sort_values(ascending=False)\n",
|
1531 |
"result_gpt"
|
@@ -1735,7 +1739,9 @@
|
|
1735 |
}
|
1736 |
],
|
1737 |
"source": [
|
1738 |
-
"test =
|
|
|
|
|
1739 |
"test"
|
1740 |
]
|
1741 |
},
|
@@ -1763,8 +1769,10 @@
|
|
1763 |
}
|
1764 |
],
|
1765 |
"source": [
|
1766 |
-
"eva[
|
1767 |
-
"
|
|
|
|
|
1768 |
"\n",
|
1769 |
"# If you want to sort the result by the count in descending order:\n",
|
1770 |
"result_golden = result.sort_values(ascending=False)\n",
|
|
|
821 |
}
|
822 |
],
|
823 |
"source": [
|
824 |
+
"df_sorted = df.groupby(\"Category\").count().sort_values(by=\"id\", ascending=False)\n",
|
825 |
"df_sorted"
|
826 |
]
|
827 |
},
|
|
|
834 |
"source": [
|
835 |
"# Function to determine the value for the new column\n",
|
836 |
"def categorize(value):\n",
|
837 |
+
" if \"/\" in str(value) or \",\" in str(value):\n",
|
838 |
+
" return \"Miscellaneous Events\"\n",
|
839 |
" else:\n",
|
840 |
" return value"
|
841 |
]
|
|
|
847 |
"metadata": {},
|
848 |
"outputs": [],
|
849 |
"source": [
|
850 |
+
"df[\"GPT Generated Result\"] = df[\"Category\"].apply(categorize)"
|
851 |
]
|
852 |
},
|
853 |
{
|
|
|
1131 |
}
|
1132 |
],
|
1133 |
"source": [
|
1134 |
+
"df_sorted1 = (\n",
|
1135 |
+
" df.groupby(\"GPT Generated Result\").count().sort_values(by=\"id\", ascending=False)\n",
|
1136 |
+
")\n",
|
1137 |
"df_sorted1"
|
1138 |
]
|
1139 |
},
|
|
|
1144 |
"metadata": {},
|
1145 |
"outputs": [],
|
1146 |
"source": [
|
1147 |
+
"df.to_csv(\"result.csv\")"
|
1148 |
]
|
1149 |
},
|
1150 |
{
|
|
|
1490 |
}
|
1491 |
],
|
1492 |
"source": [
|
1493 |
+
"eva = pd.read_csv(\"evaluation_result.csv\")\n",
|
1494 |
"eva"
|
1495 |
]
|
1496 |
},
|
|
|
1526 |
}
|
1527 |
],
|
1528 |
"source": [
|
1529 |
+
"eva[\"Result_GPT_True_Count\"] = eva[\"Result_GPT\"].astype(\n",
|
1530 |
+
" int\n",
|
1531 |
+
") # Convert boolean values to integers\n",
|
1532 |
+
"result = eva.groupby(\"GPT Generated Result\")[\"Result_GPT_True_Count\"].sum()\n",
|
1533 |
"\n",
|
1534 |
"result_gpt = result.sort_values(ascending=False)\n",
|
1535 |
"result_gpt"
|
|
|
1739 |
}
|
1740 |
],
|
1741 |
"source": [
|
1742 |
+
"test = (\n",
|
1743 |
+
" eva.groupby(\"Category_GoldenResult\").count().sort_values(by=\"id\", ascending=False)\n",
|
1744 |
+
")\n",
|
1745 |
"test"
|
1746 |
]
|
1747 |
},
|
|
|
1769 |
}
|
1770 |
],
|
1771 |
"source": [
|
1772 |
+
"eva[\"Result_Golden_True_Count\"] = eva[\"Result_Golden\"].astype(\n",
|
1773 |
+
" int\n",
|
1774 |
+
") # Convert boolean values to integers\n",
|
1775 |
+
"result = eva.groupby(\"Category_GoldenResult\")[\"Result_Golden_True_Count\"].sum()\n",
|
1776 |
"\n",
|
1777 |
"# If you want to sort the result by the count in descending order:\n",
|
1778 |
"result_golden = result.sort_values(ascending=False)\n",
|
IS424_Data_Mining/code/NER/Named_Entity_Recognition.ipynb
CHANGED
@@ -270,10 +270,11 @@
|
|
270 |
"import pandas as pd\n",
|
271 |
"import numpy as np\n",
|
272 |
"import re\n",
|
|
|
273 |
"nlp = spacy.load(\"en_core_web_sm\")\n",
|
274 |
"\n",
|
275 |
-
"df = pd.read_csv(
|
276 |
-
"df = df[df[
|
277 |
"df.head()"
|
278 |
]
|
279 |
},
|
@@ -350,7 +351,7 @@
|
|
350 |
}
|
351 |
],
|
352 |
"source": [
|
353 |
-
"df[
|
354 |
]
|
355 |
},
|
356 |
{
|
@@ -1001,7 +1002,7 @@
|
|
1001 |
}
|
1002 |
],
|
1003 |
"source": [
|
1004 |
-
"df[
|
1005 |
"\n",
|
1006 |
"# Print the updated DataFramedf)\n",
|
1007 |
"df.head(20)"
|
@@ -1037,7 +1038,7 @@
|
|
1037 |
}
|
1038 |
],
|
1039 |
"source": [
|
1040 |
-
"entities= df[\"Entities\"]\n",
|
1041 |
"entities[0]"
|
1042 |
]
|
1043 |
},
|
@@ -1064,14 +1065,13 @@
|
|
1064 |
}
|
1065 |
],
|
1066 |
"source": [
|
1067 |
-
"#put all the entity into a list of str\n",
|
1068 |
"documents = []\n",
|
1069 |
"for tup in entities:\n",
|
1070 |
" for en in tup:\n",
|
1071 |
" documents.append(en[0])\n",
|
1072 |
-
"
|
1073 |
-
"print(documents)
|
1074 |
-
" "
|
1075 |
]
|
1076 |
},
|
1077 |
{
|
@@ -1094,16 +1094,17 @@
|
|
1094 |
"source": [
|
1095 |
"from wordcloud import WordCloud\n",
|
1096 |
"import matplotlib.pyplot as plt\n",
|
|
|
1097 |
"# Combine all the documents into a single string\n",
|
1098 |
-
"text =
|
1099 |
"\n",
|
1100 |
"# Generate a word cloud\n",
|
1101 |
-
"wordcloud = WordCloud(width=800, height=400, background_color
|
1102 |
"\n",
|
1103 |
"# Display the generated word cloud using matplotlib\n",
|
1104 |
"plt.figure(figsize=(10, 5))\n",
|
1105 |
-
"plt.imshow(wordcloud, interpolation
|
1106 |
-
"plt.axis(
|
1107 |
"plt.show()"
|
1108 |
]
|
1109 |
},
|
|
|
270 |
"import pandas as pd\n",
|
271 |
"import numpy as np\n",
|
272 |
"import re\n",
|
273 |
+
"\n",
|
274 |
"nlp = spacy.load(\"en_core_web_sm\")\n",
|
275 |
"\n",
|
276 |
+
"df = pd.read_csv(\"cleaned_data.csv\")\n",
|
277 |
+
"df = df[df[\"Headline\"].apply(lambda x: not isinstance(x, float))]\n",
|
278 |
"df.head()"
|
279 |
]
|
280 |
},
|
|
|
351 |
}
|
352 |
],
|
353 |
"source": [
|
354 |
+
"df[\"Headline\"][5078:5092]"
|
355 |
]
|
356 |
},
|
357 |
{
|
|
|
1002 |
}
|
1003 |
],
|
1004 |
"source": [
|
1005 |
+
"df[\"Entities\"] = df[\"Details\"].apply(extract_entities)\n",
|
1006 |
"\n",
|
1007 |
"# Print the updated DataFramedf)\n",
|
1008 |
"df.head(20)"
|
|
|
1038 |
}
|
1039 |
],
|
1040 |
"source": [
|
1041 |
+
"entities = df[\"Entities\"]\n",
|
1042 |
"entities[0]"
|
1043 |
]
|
1044 |
},
|
|
|
1065 |
}
|
1066 |
],
|
1067 |
"source": [
|
1068 |
+
"# put all the entity into a list of str\n",
|
1069 |
"documents = []\n",
|
1070 |
"for tup in entities:\n",
|
1071 |
" for en in tup:\n",
|
1072 |
" documents.append(en[0])\n",
|
1073 |
+
"\n",
|
1074 |
+
"print(documents)"
|
|
|
1075 |
]
|
1076 |
},
|
1077 |
{
|
|
|
1094 |
"source": [
|
1095 |
"from wordcloud import WordCloud\n",
|
1096 |
"import matplotlib.pyplot as plt\n",
|
1097 |
+
"\n",
|
1098 |
"# Combine all the documents into a single string\n",
|
1099 |
+
"text = \" \".join(documents)\n",
|
1100 |
"\n",
|
1101 |
"# Generate a word cloud\n",
|
1102 |
+
"wordcloud = WordCloud(width=800, height=400, background_color=\"white\").generate(text)\n",
|
1103 |
"\n",
|
1104 |
"# Display the generated word cloud using matplotlib\n",
|
1105 |
"plt.figure(figsize=(10, 5))\n",
|
1106 |
+
"plt.imshow(wordcloud, interpolation=\"bilinear\")\n",
|
1107 |
+
"plt.axis(\"off\")\n",
|
1108 |
"plt.show()"
|
1109 |
]
|
1110 |
},
|
IS424_Data_Mining/code/NewsScraper/newsScraper.ipynb
CHANGED
@@ -71,7 +71,7 @@
|
|
71 |
" if articles:\n",
|
72 |
" for link in range(len(articles)):\n",
|
73 |
" try:\n",
|
74 |
-
" article = gnews.get_full_article(articles[link][
|
75 |
" if article.text:\n",
|
76 |
" results.append([article.url, article.title, article.text])\n",
|
77 |
" break\n",
|
@@ -82,13 +82,19 @@
|
|
82 |
"\n",
|
83 |
" if not results:\n",
|
84 |
" # if blocked by the website\n",
|
85 |
-
" results.append(
|
86 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
" # No articles found for the given title.\n",
|
88 |
" else:\n",
|
89 |
-
" results.append([
|
90 |
"\n",
|
91 |
-
" return results
|
92 |
]
|
93 |
},
|
94 |
{
|
@@ -101,7 +107,7 @@
|
|
101 |
"outputs": [],
|
102 |
"source": [
|
103 |
"# Read the CSV file with news titles\n",
|
104 |
-
"csv_file_path =
|
105 |
"df = pd.read_csv(csv_file_path)"
|
106 |
]
|
107 |
},
|
@@ -134,7 +140,7 @@
|
|
134 |
"outputs": [],
|
135 |
"source": [
|
136 |
"# drop empty news\n",
|
137 |
-
"df.dropna(subset=[
|
138 |
]
|
139 |
},
|
140 |
{
|
@@ -155,7 +161,7 @@
|
|
155 |
}
|
156 |
],
|
157 |
"source": [
|
158 |
-
"df[[
|
159 |
]
|
160 |
},
|
161 |
{
|
@@ -166,7 +172,7 @@
|
|
166 |
"outputs": [],
|
167 |
"source": [
|
168 |
"# drop the duplicated news\n",
|
169 |
-
"duplicates = df.duplicated(subset=[
|
170 |
"df_uni = df[~duplicates]"
|
171 |
]
|
172 |
},
|
@@ -3224,14 +3230,14 @@
|
|
3224 |
"# Iterate through each row and get the full news article\n",
|
3225 |
"tes = df_uni\n",
|
3226 |
"for index, row in tes.iterrows():\n",
|
3227 |
-
" headline = row[
|
3228 |
-
"
|
3229 |
" results = get_news_article(headline)\n",
|
3230 |
-
"
|
3231 |
" # Update the DataFrame with the fetched data\n",
|
3232 |
-
" tes.at[index,
|
3233 |
-
" tes.at[index,
|
3234 |
-
" tes.at[index,
|
3235 |
]
|
3236 |
},
|
3237 |
{
|
@@ -3700,7 +3706,9 @@
|
|
3700 |
}
|
3701 |
],
|
3702 |
"source": [
|
3703 |
-
"count_rows = tes[
|
|
|
|
|
3704 |
"count_rows"
|
3705 |
]
|
3706 |
},
|
@@ -3722,7 +3730,7 @@
|
|
3722 |
}
|
3723 |
],
|
3724 |
"source": [
|
3725 |
-
"count_rows = tes[tes[
|
3726 |
"count_rows"
|
3727 |
]
|
3728 |
},
|
@@ -3744,7 +3752,7 @@
|
|
3744 |
}
|
3745 |
],
|
3746 |
"source": [
|
3747 |
-
"count_rows = tes[tes[
|
3748 |
"count_rows"
|
3749 |
]
|
3750 |
},
|
@@ -3755,8 +3763,8 @@
|
|
3755 |
"metadata": {},
|
3756 |
"outputs": [],
|
3757 |
"source": [
|
3758 |
-
"tes.to_parquet(
|
3759 |
-
"tes.to_csv(
|
3760 |
]
|
3761 |
},
|
3762 |
{
|
|
|
71 |
" if articles:\n",
|
72 |
" for link in range(len(articles)):\n",
|
73 |
" try:\n",
|
74 |
+
" article = gnews.get_full_article(articles[link][\"url\"])\n",
|
75 |
" if article.text:\n",
|
76 |
" results.append([article.url, article.title, article.text])\n",
|
77 |
" break\n",
|
|
|
82 |
"\n",
|
83 |
" if not results:\n",
|
84 |
" # if blocked by the website\n",
|
85 |
+
" results.append(\n",
|
86 |
+
" [\n",
|
87 |
+
" \"cannot scrape the url\",\n",
|
88 |
+
" \"cannot scrape the title\",\n",
|
89 |
+
" \"cannot scrape the content\",\n",
|
90 |
+
" ]\n",
|
91 |
+
" )\n",
|
92 |
+
"\n",
|
93 |
" # No articles found for the given title.\n",
|
94 |
" else:\n",
|
95 |
+
" results.append([\"no url found\", \"no title found\", \"no content found\"])\n",
|
96 |
"\n",
|
97 |
+
" return results"
|
98 |
]
|
99 |
},
|
100 |
{
|
|
|
107 |
"outputs": [],
|
108 |
"source": [
|
109 |
"# Read the CSV file with news titles\n",
|
110 |
+
"csv_file_path = \"LDA/cleaned_data.csv\"\n",
|
111 |
"df = pd.read_csv(csv_file_path)"
|
112 |
]
|
113 |
},
|
|
|
140 |
"outputs": [],
|
141 |
"source": [
|
142 |
"# drop empty news\n",
|
143 |
+
"df.dropna(subset=[\"Headline\"], inplace=True)"
|
144 |
]
|
145 |
},
|
146 |
{
|
|
|
161 |
}
|
162 |
],
|
163 |
"source": [
|
164 |
+
"df[[\"Year\", \"Headline\", \"Region\"]].duplicated().any()"
|
165 |
]
|
166 |
},
|
167 |
{
|
|
|
172 |
"outputs": [],
|
173 |
"source": [
|
174 |
"# drop the duplicated news\n",
|
175 |
+
"duplicates = df.duplicated(subset=[\"Year\", \"Headline\", \"Region\"], keep=\"first\")\n",
|
176 |
"df_uni = df[~duplicates]"
|
177 |
]
|
178 |
},
|
|
|
3230 |
"# Iterate through each row and get the full news article\n",
|
3231 |
"tes = df_uni\n",
|
3232 |
"for index, row in tes.iterrows():\n",
|
3233 |
+
" headline = row[\"Headline\"]\n",
|
3234 |
+
"\n",
|
3235 |
" results = get_news_article(headline)\n",
|
3236 |
+
"\n",
|
3237 |
" # Update the DataFrame with the fetched data\n",
|
3238 |
+
" tes.at[index, \"url\"] = results[0][0]\n",
|
3239 |
+
" tes.at[index, \"title\"] = results[0][1]\n",
|
3240 |
+
" tes.at[index, \"content\"] = results[0][2]"
|
3241 |
]
|
3242 |
},
|
3243 |
{
|
|
|
3706 |
}
|
3707 |
],
|
3708 |
"source": [
|
3709 |
+
"count_rows = tes[\n",
|
3710 |
+
" ~tes[\"content\"].isin([\"cannot scrape the content\", \"no content found\"])\n",
|
3711 |
+
"].shape[0]\n",
|
3712 |
"count_rows"
|
3713 |
]
|
3714 |
},
|
|
|
3730 |
}
|
3731 |
],
|
3732 |
"source": [
|
3733 |
+
"count_rows = tes[tes[\"content\"].isin([\"no content found\"])].shape[0]\n",
|
3734 |
"count_rows"
|
3735 |
]
|
3736 |
},
|
|
|
3752 |
}
|
3753 |
],
|
3754 |
"source": [
|
3755 |
+
"count_rows = tes[tes[\"content\"].isin([\"cannot scrape the content\"])].shape[0]\n",
|
3756 |
"count_rows"
|
3757 |
]
|
3758 |
},
|
|
|
3763 |
"metadata": {},
|
3764 |
"outputs": [],
|
3765 |
"source": [
|
3766 |
+
"tes.to_parquet(\"scraped_data1.parquet\", index=False)\n",
|
3767 |
+
"tes.to_csv(\"scraped_data1.csv\", index=False)"
|
3768 |
]
|
3769 |
},
|
3770 |
{
|
app.py
CHANGED
@@ -39,6 +39,7 @@ def respond(
|
|
39 |
response += token
|
40 |
yield response
|
41 |
|
|
|
42 |
"""
|
43 |
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
|
44 |
"""
|
@@ -60,4 +61,4 @@ demo = gr.ChatInterface(
|
|
60 |
|
61 |
|
62 |
if __name__ == "__main__":
|
63 |
-
demo.launch()
|
|
|
39 |
response += token
|
40 |
yield response
|
41 |
|
42 |
+
|
43 |
"""
|
44 |
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
|
45 |
"""
|
|
|
61 |
|
62 |
|
63 |
if __name__ == "__main__":
|
64 |
+
demo.launch()
|
data/scrapped_data2_cleaned.csv
ADDED
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
id,Headline,Details,Severity,Category,Region,Datetime,Year,lat,lon,maritime_label,found_ports,contains_port_info,if_labeled,Month,Week,Headline_Details,url,title,content,bert_score
|
2 |
+
19,270 kilograms of heroin discovered in container at Port of Genoa,"Local media sources indicated on November 8 that 270 kilograms of heroin arrived from Iran were discovered in containers at the Port of Genoa by Italian Customs. Two suspects were arrested, as the illicit cargo was believed to be bound for Switzerland, France, Belgium, and the Netherlands. Distributors should be mindful of illicit activity risks to their maritime shipments and should plan accordingly.",Moderate,Public Safety / Security,Italy,14/11/18 22:24,2018.0,44.4042,8.89,True,['genoa'],1.0,True,11.0,46.0,"270 kilograms of heroin discovered in container at Port of Genoa Local media sources indicated on November 8 that 270 kilograms of heroin arrived from Iran were discovered in containers at the Port of Genoa by Italian Customs. Two suspects were arrested, as the illicit cargo was believed to be bound for Switzerland, France, Belgium, and the Netherlands. Distributors should be mindful of illicit activity risks to their maritime shipments and should plan accordingly.",https://safety4sea.com/270-kg-of-heroin-found-in-container-at-port-of-genoa/,270 kg of heroin found in container at Port of Genoa,"The greatest heroin seizure of the last twenty years was carried out at the Port of Genoa. 270 kilos of heroin was found in a container coming from Iran after a long investigation by the Italian police, assisted by the staff of the Customs Agency. The container had ultimate destination the Netherlands.
|
3 |
+
|
4 |
+
According to local reports, once the drugs was seized, the Italian police organised and executed the first controlled delivery of heroin outside Italy. Under the coordination of law enforcement agencies and judicial authorities of Switzerland, France, Belgium and the Netherlands, an operation took place that ended with the arrest of two people.
|
5 |
+
|
6 |
+
On 17 October, drug containers were disembarked from the cargo ship, which had departed from Iranian Gulf port of Bandar Abbas, and stopped in Hamburg, Germany and Valencia, Spain before reaching Genoa, where 31 containers. From the examination of their bills of lading, 3 containers loaded with bentonite, a clayey mineral powder, appeared to be of particular interest.
|
7 |
+
|
8 |
+
In agreement with the District Anti-Mafia and Antiterrorism Department of Genoa, the search took place and then the 270 kilos of heroin, worth about 10 million euros, were found in one of the containers.
|
9 |
+
|
10 |
+
Then, a small amount was left in place and the investigators were able to follow the load on its journey to Europe through Switzerland, France, Belgium and the Netherlands.
|
11 |
+
|
12 |
+
After that it was loaded onto a tug that was discretely followed by the police, and after a three-day trip, the operation concluded on 2 November. The cargo arrived in a warehouse in the Netherlands, with a different address from the one the documents were indicating.
|
13 |
+
|
14 |
+
On 2 November, the Dutch police, together with the Italian forces, raided inside the warehouse, arresting two people from Turkey and the whole operation was completed.",0.6463567316532135
|
15 |
+
1049,5 freight cars derail near Mirpur Mathelo,"Local media sources indicate on October 14 that 5 cars of a Sahiwal to Karachi freight train derailed near Mirpur Mathelo on October 14, resulting in a suspension of Karachi-bound traffic. Due to the damage to cars and track, the line won't be restored until the evening of October 14. Those shipping by rail to Karachi should anticipate delays and should plan accordingly.",Minor,Train Accident / Derailment,Pakistan,14/10/19 22:28,2019.0,28.01947,69.55382,False,['karachi'],1.0,True,10.0,42.0,"5 freight cars derail near Mirpur Mathelo Local media sources indicate on October 14 that 5 cars of a Sahiwal to Karachi freight train derailed near Mirpur Mathelo on October 14, resulting in a suspension of Karachi-bound traffic. Due to the damage to cars and track, the line won't be restored until the evening of October 14. Those shipping by rail to Karachi should anticipate delays and should plan accordingly.",https://dailytimes.com.pk/483777/five-bogies-of-freight-train-derail-near-mirpur-mathelo/,Five bogies of freight train derail near Mirpur Mathelo,"Five bogies of a freight train derailed on Monday morning near Mirpur Mathelo railway station in Sindh. The Karachi-bound train’s derailment caused suspension of railway traffic on the down track, officials said. The train was running to Karachi from Sahiwal to carry back coal cargo from the port city. Various trains running to Karachi were stopped at Daharki and Sadiqabad railway stations, sources said. The derailment incident has caused damage to the bogies and railway track, railway department sources said. The work of restoration of the track and the railway traffic will be started after the heavy machinery from Rohri will reach at the place of the incident to remove five bogies derailed. The work to remove derailed bogies from the place of the accident and restoration of the track will be completed by Tuesday evening, railway authorities said. 
The trains stopped due to the mishap being sent from another track to their destination, officials said.",0.5540391951799393
|
16 |
+
1481,"Dutch customs seize 90,000 bottles of vodka to be exported to North Korea","Local media sources reported on February 25, 2019 that Dutch customs officials at the port of Rotterdam have seized 90,000 bottles of vodka believed to be destined for the North Korean leader, Kim Jong-un, and his army chiefs. The discovery was made after Dutch authorities flagged up the suspicious route and records of a Chinese-owned container ship, Nebula. The Russian vodka, contained in 3,000 boxes, had been recorded as being due for unloading in China, via the ports of Hamburg and Rotterdam. When officers sought to retrieve the container from the ship’s hull, it was found to be concealed and hemmed in by the fuselage of an aircraft also due to be exported to China. Despite concerns about damaging the aircraft, the Dutch ministry of affairs ordered the container’s removal. Initial investigations heightened suspicions that the haul was to be taken to Pyongyang,",Minor,Public Safety / Security,Netherlands,26/2/19 19:27,2019.0,51.95118,4.14481,True,"['hamburg', 'rotterdam']",1.0,False,2.0,9.0,"Dutch customs seize 90,000 bottles of vodka to be exported to North Korea Local media sources reported on February 25, 2019 that Dutch customs officials at the port of Rotterdam have seized 90,000 bottles of vodka believed to be destined for the North Korean leader, Kim Jong-un, and his army chiefs. The discovery was made after Dutch authorities flagged up the suspicious route and records of a Chinese-owned container ship, Nebula. The Russian vodka, contained in 3,000 boxes, had been recorded as being due for unloading in China, via the ports of Hamburg and Rotterdam. When officers sought to retrieve the container from the ship’s hull, it was found to be concealed and hemmed in by the fuselage of an aircraft also due to be exported to China. Despite concerns about damaging the aircraft, the Dutch ministry of affairs ordered the container’s removal. 
Initial investigations heightened suspicions that the haul was to be taken to Pyongyang,",https://www.theguardian.com/world/2019/feb/26/dutch-customs-seize-90000-bottles-of-vodka-believed-to-be-for-kim-jong-un,"Dutch customs seize 90,000 bottles of vodka believed to be for Kim Jong-un","Dutch customs officials at the port of Rotterdam have seized 90,000 bottles of vodka believed to be destined for the North Korean leader, Kim Jong-un, and his army chiefs.
|
17 |
+
|
18 |
+
The discovery, on the eve of Kim’s two-day summit with Donald Trump in Hanoi, was made after Dutch authorities flagged up the suspicious route and records of a Chinese-owned container ship, Nebula.
|
19 |
+
|
20 |
+
The Russian vodka, contained in 3,000 boxes, had been recorded as being due for unloading in China, via the ports of Hamburg and Rotterdam.
|
21 |
+
|
22 |
+
When officers sought to retrieve the container from the ship’s hull, it was found to be concealed and hemmed in by the fuselage of an aircraft also due to be exported to China.
|
23 |
+
|
24 |
+
Despite concerns about damaging the aircraft, the Dutch ministry of affairs ordered the container’s removal.
|
25 |
+
|
26 |
+
Initial investigations heightened suspicions that the haul was to be taken to Pyongyang, Dutch authorities said.
|
27 |
+
|
28 |
+
View image in fullscreen A customs official displays a container with 90,000 bottles of vodka. Photograph: Robin Utrecht/AFP/Getty Images
|
29 |
+
|
30 |
+
Arno Kooij, the customs officer in charge of the seizure, declined to comment on what had alerted his systems to the container ship.
|
31 |
+
|
32 |
+
“We do not want to make anyone wiser than necessary,” he told the Dutch newspaper, Algemeen Dagblad. “What I can say, based on the information we had, we suspected that this container would fall under the sanctions regime for North Korea. We suspected that this vodka would not go to China, but to North Korea.”
|
33 |
+
|
34 |
+
Sigrid Kaag, the Dutch minister for trade, who ordered the seizure, said: “The security council of the United Nations has imposed clear sanctions on North Korea, so it is important to enforce those sanctions. The sanctions also govern the import of luxury goods and so customs was completely justified in unloading that container.”
|
35 |
+
|
36 |
+
Kim, who was educated in Switzerland, is known to have a taste for the high life. Last year, it was claimed he had has spent more than $4bn (£3bn) on importing luxury products from China since taking power in 2011.
|
37 |
+
|
38 |
+
His family is said to have imported a seaplane, musical instruments, watches and furs, with luxury items accounting for 17.8% of North Korea’s total imports from China in 2017, according to a South Korean analysis.
|
39 |
+
|
40 |
+
Kim arrived in Hanoi in his armoured train on Tuesday, ahead of his second meeting with Trump. He had travelled for two and a half days, covering 2,800 miles, although his precise route has not been disclosed.
|
41 |
+
|
42 |
+
The meeting comes eight months after a summit in Singapore, the first between a sitting US president and a North Korean leader, which failed to produce concrete results on a path to denuclearisation.",0.7640108466148376
|
43 |
+
1708,Iranian oil tanker damaged by explosions near Port of Jeddah,"On October 11, media sources indicate that an oil tanker belonging to the National Iranian Oil Company (NIOC) was damaged by explosions while passing Port of Jeddah, Saudi Arabia in the Red Sea, on Friday, October 11. Explosions, likely caused by missile strikes, resulted in the crude oil to leak into the sea. According to IRNA, the tanker was about 60 miles (96 km) from the Saudi port of Jeddah when the incident occurred. All members of the tanker's crew are reportedly safe. Oil prices increased by 2% after reports of the explosions.",Severe,"Explosion, Chemical Spill",Saudi Arabia,11/10/19 8:03,2019.0,21.47637,39.15467,False,['jeddah'],1.0,False,11.0,45.0,"Iranian oil tanker damaged by explosions near Port of Jeddah On October 11, media sources indicate that an oil tanker belonging to the National Iranian Oil Company (NIOC) was damaged by explosions while passing Port of Jeddah, Saudi Arabia in the Red Sea, on Friday, October 11. Explosions, likely caused by missile strikes, resulted in the crude oil to leak into the sea. According to IRNA, the tanker was about 60 miles (96 km) from the Saudi port of Jeddah when the incident occurred. All members of the tanker's crew are reportedly safe. Oil prices increased by 2% after reports of the explosions.",https://www.theguardian.com/world/2019/oct/11/iranian-oil-tanker-on-fire-after-blast-near-saudi-port-city-report,Iranian oil tanker damaged by explosions near Saudi port city,"Two missiles hit an Iranian state-owned oil tanker as it headed to Syria on the Red Sea, the Iranian government has claimed.
|
44 |
+
|
45 |
+
Tehran did not attribute responsibility immediately but said two explosions 20 minutes apart on the Sabiti tanker, which caused oil to spill from two tanks, were not the result of an accident.
|
46 |
+
|
47 |
+
The boat was about 60 miles from the Saudi Arabian port city of Jeddah when it was hit. The scale of the damage did not appear extensive in photos published by Tehran news agencies.
|
48 |
+
|
49 |
+
The blasts prompted speculation that they were a reprisal by Gulf states for attacks attributed to Iran on Saudi oil assets in September.
|
50 |
+
|
51 |
+
Meanwhile, the Pentagon announced it was sending at least 1,000 additional troops to Saudi Arabia. It said the deployment meant that in the past month 3,000 troops had been sent to the kingdom or had their deployment extended.
|
52 |
+
|
53 |
+
US Patriot missile defences failed to deter a cruise missile and drone attack that severely damaged two Aramco facilities and shocked the Saudi government last month. US officials have since discussed with their Middle Eastern ally how to strengthen Saudi defences, and the Pentagon said Friday’s deployment would “assure and enhance the defence of Saudi Arabia”.
|
54 |
+
|
55 |
+
This week Donald Trump announced he was pulling troops from the Syrian border with Turkey and said he wanted to end America’s endless wars in the Middle East.
|
56 |
+
|
57 |
+
It appears some of the newly announced troops are being despatched to replace other American forces expected to depart the region in the coming weeks.
|
58 |
+
|
59 |
+
The US and the EU attributed the September attack to Iran, rejecting claims by Houthi rebels in Yemen that they were responsible. A full UN report on the incident has not yet been completed
|
60 |
+
|
61 |
+
An Iranian foreign ministry spokesman said: “Those behind the attack are responsible for the consequences of this dangerous adventure, including the dangerous environmental pollution caused. The details and factors behind this act will be investigated and will be announced after the results are reached.”
|
62 |
+
|
63 |
+
Suggestions that the oil company that owned the tanker was blaming Saudi Arabia at this stage were denied.
|
64 |
+
|
65 |
+
Friday’s incident, if confirmed as an attack, would be the first such incident targeting Iranian-owned shipping in the Gulf, though a state-owned tanker, Grace 1, was seized by British authorities off Gibraltar on the basis that it was breaching an EU oil embargo.
|
66 |
+
|
67 |
+
Iranian news agencies stressed that the Sabiti was stable, no crew had been injured and the leak was being brought under control.
|
68 |
+
|
69 |
+
Iranian ships routinely turn off their transponders to prevent tracking, but the Sabiti turned on its tracking devices late on Friday morning in the Red Sea, according to data from MarineTraffic.com.
|
70 |
+
|
71 |
+
The vessel last turned on its tracking devices in August, showing it near the Iranian port of Bandar Abbas.
|
72 |
+
|
73 |
+
Dryad Global, a firm specialising in oil shipping intelligence, said the vessel’s proximity to the port of Jeddah made it plausible that Saudi Arabia could have been involved, or at the very least that the incident was intended to create the perception of Saudi involvement.
|
74 |
+
|
75 |
+
But it added: “In terms of Saudi interests within the region, it remains unclear why Saudi would seek to target Iran in this manner. An attack of relatively low sophistication with limited and almost negligible strategic gain would be highly irregular and not serve any Saudi strategic narrative. Further still, it is highly unlikely that the Saudis would risk an ecological disaster in an area of strategic significance such as the Red Sea.”
|
76 |
+
|
77 |
+
Tension in the strait of Hormuz has been heightened for months as the US and Iran spar over Washington’s decision in 2018 to withdraw from the Iranian nuclear deal and impose worldwide sanctions on Iran including its oil exports.
|
78 |
+
|
79 |
+
Q&A Why is the Gulf of Oman so important for shipping oil? Show The strait of Hormuz, which provides passage from the Gulf of Oman to the open sea, is the most important gateway for oil exports in the world. With Iran on its northern shore, and the UAE and Oman on its southern shore, the US Energy Information Administration (EIA) calls it the world’s worst 'chokepoint' In 2016, 18.5m barrels of crude oil were transported each day through the strait of Hormuz, compared with 16m through the strait of Malacca, which runs between the Indonesian island of Sumatra, Malaysia and Thailand, connecting the Indian Ocean with the South China Sea. 5m barrels of crude oil are transported annually through the next largest chokepoint, the Suez canal. Phillip Inman Was this helpful? Thank you for your feedback.
|
80 |
+
|
81 |
+
An attempt by the French president, Emmanuel Macron, at the UN general assembly in New York to engineer a meeting between the Iranian president, Hassan Rouhani, and Trump failed as the two sides could not reach agreement on the sequencing of the compromises the two sides would have to take. Since then, the Pakistani prime minister, Imran Khan, has stepped forward as a possible mediator between Iran and Saudi Arabia.
|
82 |
+
|
83 |
+
The perception was that neither Saudi Arabia nor the UAE, Iran’s two main Gulf rivals, were looking to escalate the crisis by undertaking a military response to the Aramco incident. It would be surprising if either Gulf state resorted to the kind of “plausible deniability” tactics allegedly deployed by Iran.
|
84 |
+
|
85 |
+
Israel is also deeply hostile to Iran but has confined most of its attacks to Iranian military sites in Syria.
|
86 |
+
|
87 |
+
The current round of attacks on oil shipping started on 12 May when four ships, including two Saudi oil tankers, were attacked in the Gulf just outside the strait of Hormuz, which is a major oil shipping route.
|
88 |
+
|
89 |
+
US and British officials blamed Iran, a charge Tehran denies. A further two tankers were hit on 13 June, and a week later Iran said it had shot down a US surveillance drone, an attack that nearly led to a major reprisal by the Trump administration.
|
90 |
+
|
91 |
+
Oil prices jumped 2% after reports of the tanker blasts on Friday, with crude futures rising by more than $1 (79p) a barrel.",0.5100782811641693
|
92 |
+
1816,Mexican Navy seizes 80 kg of cocaine on CMA CGM container ship in the Port of Lazaro Cardenas,"On September 24, sources report that Mexican Navy and Coast Guard personnel seized over 80 kilograms of cocaine from a container aboard the CMA CGM Mississippi at the Pacific Ocean Port of Lazaro Cardenas. A total of 70 packages containing approximately 82 kilograms of narcotics were reportedly seized in an operation on September 18. An investigation into the matter has been launched. It was not immediately reported whether the vessel was detained for a longer period of time in the port. The containership is operated by French shipping company CMA CGM and was on its way from Guayaquil, Ecuador to Yokohama, Japan.",Moderate,"Organized Crime,Cargo Disruption",Mexico,24/9/19 7:52,2019.0,17.95506,-102.17547,True,"['guayaquil', 'yokohama']",1.0,True,9.0,39.0,"Mexican Navy seizes 80 kg of cocaine on CMA CGM container ship in the Port of Lazaro Cardenas On September 24, sources report that Mexican Navy and Coast Guard personnel seized over 80 kilograms of cocaine from a container aboard the CMA CGM Mississippi at the Pacific Ocean Port of Lazaro Cardenas. A total of 70 packages containing approximately 82 kilograms of narcotics were reportedly seized in an operation on September 18. An investigation into the matter has been launched. It was not immediately reported whether the vessel was detained for a longer period of time in the port. The containership is operated by French shipping company CMA CGM and was on its way from Guayaquil, Ecuador to Yokohama, Japan.",https://shipsandports.com.ng/mexico-intercepts-80kg-of-cocaine-on-cma-cgm-ship-2/,Mexico intercepts 80kg of cocaine on CMA CGM ship,"Mexican Navy and Coast Guard personnel have seized over 80 kilograms of cocaine from a container aboard the CMA CGM Mississippi at the Pacific Ocean Port of Lázaro Cárdenas.
|
93 |
+
|
94 |
+
The navy said that a total of 70 packages containing approximately 82 kilograms of narcotics were seized in an operation on September 18.
|
95 |
+
|
96 |
+
According to the cargo manifest, the container on the Post-Panamax ship was supposed to be filled with electrical cable rolls. However, after opening the container, the authorities found bags with packages of cocaine hydrochloride.
|
97 |
+
|
98 |
+
An investigation into the matter has been launched.
|
99 |
+
|
100 |
+
Flying the flag of Liberia, the containership is operated by French shipping company CMA CGM and owned by the US-based JP Morgan Global Maritime.
|
101 |
+
|
102 |
+
The 2015-built CMA CGM Mississippi arrived in Lázaro Cárdenas from Guayaquil, Ecuador, and is currently en route to Yokohama, Japan, according to the ship’s AIS data.",0.5610589683055878
|
103 |
+
2225,Singapore arrests eleven men for illegal marine gas oil transaction,Media sources indicated on November 4 that the Maritime and Port Authority of Singapore has arrested eleven men for their suspected involvement in an illegal transaction of marine gas oil at sea off Northern Tuas. The eleven men include six crew members of a craft of a marine service provider and another five of a foreign-registered tugboat. Preliminary reports cited that the crewmembers were thought to have misappropriated the marine gas oil and sold it to crew members of the tugboats.,Moderate,Maritime Advisory,Singapore,6/11/19 1:37,2019.0,,,False,['singapore'],1.0,True,6.0,24.0,Singapore arrests eleven men for illegal marine gas oil transaction Media sources indicated on November 4 that the Maritime and Port Authority of Singapore has arrested eleven men for their suspected involvement in an illegal transaction of marine gas oil at sea off Northern Tuas. The eleven men include six crew members of a craft of a marine service provider and another five of a foreign-registered tugboat. Preliminary reports cited that the crewmembers were thought to have misappropriated the marine gas oil and sold it to crew members of the tugboats.,https://safety4sea.com/15-people-arrested-for-illegal-mgo-transaction-in-singapore/,15 people arrested for illegal MGO transaction in Singapore,"The Police have arrested 15 men, aged between 26 and 56 years old, for their suspected involvement in an illegal transaction of Marine Gas Oil (MGO).
|
104 |
+
|
105 |
+
According to data provided by Singapore’s Police, on 1st March 2024, officers from the Police Coast Guard arrested eight crew members of a Singapore-registered tugboat and another seven crew members of a foreign-registered tugboat for their suspected involvement in illegal transaction of MGO at the sea off Tuas, Singapore.
|
106 |
+
|
107 |
+
Preliminary investigations revealed that the crew members of the Singapore- registered tugboat were believed to have misappropriated the MGO without their company’s knowledge by selling it to the crew members of a foreign-registered tugboat. The tugboat and cash amounting to $8,200 were seized as case exhibits.
|
108 |
+
|
109 |
+
The eight crew members of the Singapore-registered tugboat will be charged in court with criminal breach of trust by employees and the seven crew members of the foreign-registered tugboat will be charged in court with dishonestly receiving stolen property on 2nd March 2024. If convicted for criminal breach of trust by employees under Section 408 of the Penal Code 1871, they shall be punished with an imprisonment term, which may extend to 15 years and shall be liable to a fine. If convicted for dishonestly receiving stolen property under Section 411 (1) of the Penal Code 1871, they shall be punished with an imprisonment term that may extend to five years, or with fine, or with both.
|
110 |
+
|
111 |
+
The Police takes a serious view of illegal transaction of MGO in Singapore waters. The authorities will continue to conduct enforcement and security checks to prevent, deter and detect such illicit activities in Singapore waters.",0.5223027020692825
|
112 |
+
2395,"Tanker truck carrying 2,000 gallons of unknown liquid stolen in LA","According to local media sources, at around 6:30 a.m. on March 29, 2019 an unknown male suspect stole a blue tanker truck from a location near the intersection of W. 6th Street and S. Curson Avenue in the Mid-Wilshire area of Los Angeles. Shortly after, police began to pursue the allegedly stolen vehicle. Police said that the stolen vehicle was carrying more than 2,000 gallons of an unknown liquid. Officials said that the liquid was not believed to be flammable.",Moderate,Cargo/Warehouse Theft,United States,12/4/19 20:27,2019.0,34.06484,-118.3547,False,['los angeles'],1.0,True,12.0,49.0,"Tanker truck carrying 2,000 gallons of unknown liquid stolen in LA According to local media sources, at around 6:30 a.m. on March 29, 2019 an unknown male suspect stole a blue tanker truck from a location near the intersection of W. 6th Street and S. Curson Avenue in the Mid-Wilshire area of Los Angeles. Shortly after, police began to pursue the allegedly stolen vehicle. Police said that the stolen vehicle was carrying more than 2,000 gallons of an unknown liquid. Officials said that the liquid was not believed to be flammable.",https://www.fox5atlanta.com/news/police-chase-stolen-tanker-truck-carrying-2000-gallons-of-unknown-liquid-in-mid-wilshire-area-of-la,"Police chase stolen tanker truck carrying 2,000 gallons of unknown liquid in Mid-Wilshire area of LA","Image 1 of 2 ▼
|
113 |
+
|
114 |
+
Los Angeles police officers were in pursuit of a reported stolen tanker truck on Thursday morning.
|
115 |
+
|
116 |
+
The Los Angeles Police Department said that around 6:30 a.m. an unknown male suspect stole a blue tanker truck from a location near the intersection of W. 6th Street and S. Curson Avenue in the Mid-Wilshire area of Los Angeles. Shortly after, police began to pursue the allegedly stolen vehicle.
|
117 |
+
|
118 |
+
LAPD said that the stolen vehicle was carrying more than 2,000 gallons of an unknown liquid. Officials said that the liquid was not believed to be flammable.
|
119 |
+
|
120 |
+
At 7:15 a.m. the stolen vehicle was driving eastbound on Wilshire Boulevard. The vehicle failed to stop when officers attempted to perform a traffic stop. Footage from SkyFOX showed the driver make several erratic turns and cut across multiple intersections during the pursuit.
|
121 |
+
|
122 |
+
When the suspect driver turned down S. Curson Avenue at 7:20 a.m., city workers had to move construction cones out of the way for the truck to pass by. At that time, the pursuing police cruisers were substantially down the street.
|
123 |
+
|
124 |
+
The alleged suspect was taken into custody on W. 6th Street just off of S. Curson Avenue shortly before 7:30 a.m.
|
125 |
+
|
126 |
+
No additional information has been released at this time.",0.6466525495052338
|
127 |
+
2615,UPDATE - Spain: Uber and Cabify announce pulling their services out of Barcelona,"Ride-hailing services Uber and Cabify have announced that they will be pulling their services out of Barcelona. The companies are doing so in response to the approval by the Catalan regional government of regulations that will force customers to request their services 15 minutes ahead of their planned journey time. The new law from the Generalitat, as the regional government is known, was passed in response to demands from taxi drivers, who have been on strike in Barcelona and Madrid in recent days over what they consider to be unfair competition from Uber and Cabify drivers, who operate in Spain using what are known as VTC licenses. The direct consequence of the two companies pulling out of Barcelona will fall on the more than 3,500 drivers with VTC licenses in the city.",Minor,Ground Transportation Advisory,Spain,2/2/19 3:55,2019.0,,,False,['barcelona'],1.0,False,2.0,5.0,"UPDATE - Spain: Uber and Cabify announce pulling their services out of Barcelona Ride-hailing services Uber and Cabify have announced that they will be pulling their services out of Barcelona. The companies are doing so in response to the approval by the Catalan regional government of regulations that will force customers to request their services 15 minutes ahead of their planned journey time. The new law from the Generalitat, as the regional government is known, was passed in response to demands from taxi drivers, who have been on strike in Barcelona and Madrid in recent days over what they consider to be unfair competition from Uber and Cabify drivers, who operate in Spain using what are known as VTC licenses. 
The direct consequence of the two companies pulling out of Barcelona will fall on the more than 3,500 drivers with VTC licenses in the city.",https://english.elpais.com/elpais/2019/01/31/inenglish/1548940738_151302.html,"Uber, Cabify announce they are pulling their services out of Barcelona","VTC vehicles block a street in Barcelona on January 19. Joan Sánchez
|
128 |
+
|
129 |
+
Ride-hailing services Uber and Cabify announced on Thursday that they would be pulling their services out of Barcelona from tomorrow onward. The companies are doing so in response to the approval by the Catalan regional government of regulations that will force customers to request their services 15 minutes ahead of their planned journey time.
|
130 |
+
|
131 |
+
The new law from the Generalitat, as the regional government is known, was passed in response to demands from taxi drivers, who have been on strike in Barcelona and Madrid in recent days over what they consider to be unfair competition from Uber and Cabify drivers, who operate in Spain using what are known as VTC (private hire) licenses.
|
132 |
+
|
133 |
+
Uber has sent a message to its users, entitled: “So long Barcelona”
|
134 |
+
|
135 |
+
Stoppages in Barcelona were suspended last week but continue in Madrid, with drivers having blocked a number of areas of the city over the last seven days, as well as staging noisy demonstrations outside the headquarters of the conservative Popular Party (PP), which is in power in the Madrid regional government.
|
136 |
+
|
137 |
+
Uber neither owns cars nor licenses in Spain, and it limits its activities to simply offering the technology platform that drivers can use to find customers. But it will be affected by the move that it and its competitor Cabify have made. The American firm has sent a message to its users, entitled: “So long Barcelona,” in which it blames the regional government for the closure of the UberX service it was offering in the Catalan capital.
|
138 |
+
|
139 |
+
For its part, Cabify – which does own VTC licenses at the same time as working with other companies, to whom it offers its technology platform – will contact its users on Friday to convey what it considers to be its “expulsion” from Catalonia.
|
140 |
+
|
141 |
+
An Uber user requesting a vehicle. Albert Garcia
|
142 |
+
|
143 |
+
“Nearly a year ago we returned to Barcelona with one commitment: to do things properly,” reads the statement released by Uber. “Since then, more than half a million people have chosen us to get them around the city. And thousands of drivers have found a way to make a living with Uber.”
|
144 |
+
|
145 |
+
Several years ago now, the San Francisco-based firm tried to launch in Spain the model that brought it so much success in its home country: i.e. a platform that allowed anyone with a car to essentially be a taxi driver. But it was quickly forced to close its service due to protests from the taxi sector and legal challenges, and in 2018 it returned with a service that observed Spanish law and regulations, using VTC licenses. This activity will now be curtailed by the regional government decree, which is aimed at differentiating the activity of taxis and that of VTC vehicles.
|
146 |
+
|
147 |
+
VTC drivers have also been out on the streets to demonstrate, in opposition to the government decree. Uber and Cabify have announced that the measure will cost more than 3,500 people their jobs. “The obligation to wait 15 minutes to travel in a VTC vehicle does not exist in any other place in Europe, and it is completely incompatible with the immediacy of on-demand services such as UberX,” the company statement continues.
|
148 |
+
|
149 |
+
Stoppages in Barcelona were suspended last week but continue in Madrid
|
150 |
+
|
151 |
+
For its part, Cabify has released a statement in which it condemns the Generalitat for “having given in to the pressure and demands of the taxi sector, seriously damaging the interest of citizens.” It adds that it believes the 15-minute wait time will make its business inviable, given that 98.5% of its journeys are offered below that time frame.
|
152 |
+
|
153 |
+
Meanwhile, the spokesperson for Élite Taxi in Barcelona, Alberto Tito Álvarez, released a statement in which he celebrated the decision by Uber and Cabify to end their services in Barcelona. “This is a victory for working people against the enslaving multinationals, and not just of the taxis,” he said. He criticized the fact that these companies had created what he called “a fraudulent business model,” and assured that the new regulation would put an end to the model.
|
154 |
+
|
155 |
+
The direct consequence of the two companies pulling out of Barcelona will fall on the more than 3,500 drivers with VTC licenses in the city. A company called Vector Ronda, for example, which has a workforce of nearly a thousand people and operates mostly with Cabify, has already announced to its workers that it will be executing a mass layoff plan, known in Spanish as an ERE. However, it is yet to officially notify the regional labor department of this plan. In a statement, Spanish union CCOO rejected the ERE, saying that “these announcements are designed to exert pressure on a political decision, making use of workers.”
|
156 |
+
|
157 |
+
English version by Simon Hunter.",0.7118738293647766
|
158 |
+
2953,US border authorities seize 1 million pounds of pork from China,"According to media sources, U.S. federal border agents at the Port of Newark in New Jersey seized on March 16 around 1 million pounds of pork (454 metric tons) from China, due to fears that the meat could contain traces of African swine fever which has hit Chinese pork output.
|
159 |
+
|
160 |
+
The US Customs and Border Protection (CBP) revealed that the pork arrived in more than 50 shipping containers over the past few weeks and was hidden in containers of ramen noodles and laundry detergent packaging. The CBP believes that these pork products were smuggled with the original Chinese food ingredients covered by English packaging labels that deliberately did not mention that the products contained pork",Moderate,Customs Regulation,United States,18/3/19 10:24,2019.0,41.40441,-101.17074,True,['new jersey'],1.0,False,3.0,12.0,"US border authorities seize 1 million pounds of pork from China According to media sources, U.S. federal border agents at the Port of Newark in New Jersey seized on March 16 around 1 million pounds of pork (454 metric tons) from China, due to fears that the meat could contain traces of African swine fever which has hit Chinese pork output.
|
161 |
+
|
162 |
+
The US Customs and Border Protection (CBP) revealed that the pork arrived in more than 50 shipping containers over the past few weeks and was hidden in containers of ramen noodles and laundry detergent packaging. The CBP believes that these pork products were smuggled with the original Chinese food ingredients covered by English packaging labels that deliberately did not mention that the products contained pork",https://www.northjersey.com/story/news/new-jersey/2019/03/16/one-million-pounds-smuggled-pork-china-seized-nj-port-african-swine-fever/3185268002/,1 million pounds of smuggled pork from China seized at NJ port,"Federal officials at the Newark port of entry seized 1 million pounds of pork products, allegedly smuggled from China, in the biggest agricultural bust in American history.
|
163 |
+
|
164 |
+
U.S. Customs & Border Protection announced the pork seizure at a news conference on Friday at a warehouse in Elizabeth, telling NJ.com and other media outlets, the raid of more than 50 shipping containers was an effort to stop the spread of African swine fever, a virus that has decimated China's pigs.
|
165 |
+
|
166 |
+
Anthony Bucci, an agency spokesman, said the meat was ""primarily cured,"" and the cargo containers were not refrigerated.
|
167 |
+
|
168 |
+
The deadly disease, which does not affect humans, has never been detected among livestock in this country, NJ.com reported. But, if a domestic outbreak were to occur, a customs official said, it could cause $10 billion worth of damage to the U.S. pork industry in one year.
|
169 |
+
|
170 |
+
Shooting:Parsippany man charged with obstructing investigation in connection with wife's death
|
171 |
+
|
172 |
+
Crime:Alleged Gambino crime boss's slaying is throwback to bygone era, experts say
|
173 |
+
|
174 |
+
Child porn:Bergen County juvenile charged with having child pornography files
|
175 |
+
|
176 |
+
Authorities, including representatives of the U.S. Department of Agriculture, still were investigating the smuggling, and as of Friday, no charges were filed.
|
177 |
+
|
178 |
+
The Port Newark-Elizabeth Marine Terminal, run by The Port Authority of New York & New Jersey, is among the largest ports of entry in the U.S., and the busiest on the East Coast.
|
179 |
+
|
180 |
+
Email: devencentis@northjersey.com",0.5224124491214752
|
181 |
+
3265,Boxship Seaspan Lahore detained following fuel spill at Port of Algeciras,"Sources indicated on May 25 that the boxship Seaspan Lahore has been detained by the captain of the port at the Port of Algeciras following a fuel spill on the afternoon of May 24 at the Isla Verde Exterior dock. According to sources, the spill occurred due to a crack on the vessel’s hull, which was located on the side of the vessel facing the dock, thus easing the containment and spill cleanup efforts. The vessel is detained and awaiting repairs, with the captain of the port requiring a USD 92,000 bond in connection with the spill.",Moderate,Maritime Accident,Spain,26/5/20 8:12,2020.0,36.12999,-5.42581,False,['algeciras'],1.0,False,5.0,22.0,"Boxship Seaspan Lahore detained following fuel spill at Port of Algeciras Sources indicated on May 25 that the boxship Seaspan Lahore has been detained by the captain of the port at the Port of Algeciras following a fuel spill on the afternoon of May 24 at the Isla Verde Exterior dock. According to sources, the spill occurred due to a crack on the vessel’s hull, which was located on the side of the vessel facing the dock, thus easing the containment and spill cleanup efforts. The vessel is detained and awaiting repairs, with the captain of the port requiring a USD 92,000 bond in connection with the spill.",https://cyprusshippingnews.com/2020/05/27/container-vessel-seaspan-lahore-spills-fuel-at-port-of-algeciras/,Container vessel Seaspan Lahore spills fuel at port of Algeciras,"The captain of the port at the Port of Algeciras has detained the boxship Seaspan Lahore after a fuel spill Sunday afternoon at the Isla Verde Exterior dock.
|
182 |
+
|
183 |
+
According to local outlet EuropaSur, the spill occurred due to a small crack in the Seaspan Lahore’s hull. It was located on the side of the vessel facing the dock, enabling containment and spill cleanup measures. The abatement effort was declared successful and is now complete.
|
184 |
+
|
185 |
+
The vessel has been detained awaiting repairs, according EuropaSur, and the captain of the port has required a bond of $92,000 in connection with the spill.
|
186 |
+
|
187 |
+
In response to the incident, the local environmental advocacy group Verdemar Ecologists in Action issued a protest to the captain of the port, asserting that fuel spills are occurring at Algeciras with increasing frequency. “Verdemar will request . . . a report of what happened, the extent of pollution to Algeciras Bay and data from the physical parameters of the spill,” the group wrote in a social media post.",0.6996552348136902
|
188 |
+
3302,Cargo trucks discouraged from using Roxas Boulevard from May 20 due to closure to traffic,Media sources indicated on May 20 that motorists have been advised to take alternate routes as the southbound lane of Roxas Boulevard from Katibak Drive to Quirino Avenue has been closed to traffic from May 20 due to the rehabilitation of the box culvert on Remedios Street. Manila municipal authorities on May 19 discouraged heavy cargo trucks from using Roxas Boulevard upon discovery of several road trucks along the roadway.,Moderate,"Roadway Closure / Disruption, Ground Transportation Advisory",Philippines,21/5/20 3:49,2020.0,14.57775,120.97843,False,['manila'],1.0,False,5.0,21.0,Cargo trucks discouraged from using Roxas Boulevard from May 20 due to closure to traffic Media sources indicated on May 20 that motorists have been advised to take alternate routes as the southbound lane of Roxas Boulevard from Katibak Drive to Quirino Avenue has been closed to traffic from May 20 due to the rehabilitation of the box culvert on Remedios Street. Manila municipal authorities on May 19 discouraged heavy cargo trucks from using Roxas Boulevard upon discovery of several road trucks along the roadway.,https://www.portcalls.com/parts-of-roxas-boulevard-closed-to-traffic/,Parts of Roxas Boulevard closed to traffic,"Motorists should use alternate routes as the southbound lane of Roxas Boulevard from Katigbak Drive to Quirino Avenue is closed to traffic starting May 20 due to rehabilitation of the box culvert on Remedios Street.
|
189 |
+
|
190 |
+
The Manila city government on May 19 discouraged heavy trucks from using Roxas Boulevard upon discovery of several road cracks along the major thoroughfare.
|
191 |
+
|
192 |
+
READ: Cargo trucks discouraged from using Roxas Boulevard
|
193 |
+
|
194 |
+
In a traffic advisory by the Manila Police District-Manila District Traffic Enforcement Unit, all vehicles coming from these three bridges—Jones, McArthur, Quezon—intending to use the southbound lane of Roxas Boulevard should go straight to Taft Avenue to point of destination as an alternate route.
|
195 |
+
|
196 |
+
Other alternate routes include:
|
197 |
+
|
198 |
+
All vehicles coming from Bonifacio Drive intending to use the southbound lane of Roxas Boulevard should turn left to P. Burgos Avenue to point of destination.
|
199 |
+
|
200 |
+
All vehicles travelling the westbound lane of P. Burgos Avenue should turn right to Bonifacio Drive or make a u-turn to the eastbound lane of P. Burgos Avenue to point of destination.
|
201 |
+
|
202 |
+
All Vehiles traveling westbound to TM Kalaw Street going to Roxas Boulevard should turn left to MH Del Pilar Street to point of destination.
|
203 |
+
|
204 |
+
All vehicles travelling the westbound lane of U.N. Avenue going to Roxas Boulevard should turn left to MH Del Pilar Street or utilize Roxas Boulevard service road going to point of destination.
|
205 |
+
|
206 |
+
Heavy vehicles, meanwhile, should utilize the truck route: eastbound of P. Burgos Avenue right to Finance Road straight to Ayala Boulevard, right to San Marcelino Street to point of destination.
|
207 |
+
|
208 |
+
Manila City mayor Francisco Domagoso on May 19 assured movement of trucks will remain unhampered in the nation’s capital during the period of modified enhanced community quarantine (MECQ).
|
209 |
+
|
210 |
+
READ: No trucks will be flagged down, impounded in Manila during MECQ
|
211 |
+
|
212 |
+
Domagoso assured trucking organizations in a virtual meeting that no truck will be flagged down and impounded during the MECQ period until May 31. Trucks will also be allowed to traverse Roxas Boulevard and other truck routes even those without the Terminal Appointment Booking System (TABS), he added.
|
213 |
+
|
214 |
+
This is in accordance to the directive of the Inter-Agency Task Force for the Management of Emerging Infectious Diseases that movement of all types cargoes by land, air, or sea should remain unhampered during the quarantine period.
|
215 |
+
|
216 |
+
While elbow room was given for truckers operating in the city, Domagoso maintained that laws against road violations such as reckless driving will be strictly enforced.
|
217 |
+
|
218 |
+
Trucking organizations that attended the virtual conference included the Alliance of Concerned Truck Owners and Organizations, Confederation of Truckers Association of the Philippines, Haulers and Truckers Associations in the Watersouth, Inland Haulers & Truckers Association, and other transport groups.",0.5179985910654068
|
219 |
+
3332,CN Railway adjusts operations amid strike at Port of Montreal,"Media sources indicate on August 12 that Canadian National Railway (CN) is relying on port terminals in Halifax and elsewhere on the East Coast as longshore workers continue to strike at the Port of Montreal. The railway company is by using port terminals in Halifax and elsewhere on the East Coast to continue importing and exporting goods. According to the port website, the Port of Montreal operates its own railway network, with 60 miles of track and direct access to various berths, and it has a rail interchange zone. The port’s on-dock rail system connects to both CN and Canadian Pacific.",Moderate,Train Delays / Disruption,Canada,12/8/20 22:35,2020.0,45.49934,-73.56675,True,['montreal'],1.0,False,12.0,50.0,"CN Railway adjusts operations amid strike at Port of Montreal Media sources indicate on August 12 that Canadian National Railway (CN) is relying on port terminals in Halifax and elsewhere on the East Coast as longshore workers continue to strike at the Port of Montreal. The railway company is by using port terminals in Halifax and elsewhere on the East Coast to continue importing and exporting goods. According to the port website, the Port of Montreal operates its own railway network, with 60 miles of track and direct access to various berths, and it has a rail interchange zone. The port’s on-dock rail system connects to both CN and Canadian Pacific.",https://www.freightwaves.com/news/cn-adjusts-operations-amid-strike-at-port-of-montreal,CN adjusts operations amid strike at Port of Montreal,"Canadian railway CN (NYSE: CNI) is relying on port terminals in Halifax and elsewhere on the East Coast as longshore workers continue to strike at the Port of Montreal.
|
220 |
+
|
221 |
+
“We are monitoring the situation at the Port of Montreal closely and we are in regular contact with the responsible authorities. While we wish the parties [would] reach an agreement quickly, we are adjusting our operations in order to continue to provide service to our customers,” CN told FreightWaves.
|
222 |
+
|
223 |
+
Those adjustments entail “leveraging [the] network by using port terminals in Halifax and elsewhere on the East Coast to continue importing and exporting goods that are vital for the economy,” CN said.
|
224 |
+
|
225 |
+
FreightWaves reported earlier this week that vessels bound for Montreal are being diverted and containers already there are sitting at the port. Although the port has said the strike would not affect liquid bulk handling, the Oceanex service at Bickerdike Terminal and the Viterra grain terminal, some vessels have been diverted to other ports, such as the Ports of St. John and Halifax.",0.5962026864290237
|
226 |
+
3334,Cocaine worth EUR 151 million found in banana shipment in Port of Rotterdam,"Media sources on June 9 that customs officers at the Port of Rotterdam found 2,020 kilograms of cocaine hidden inside a sea container full of bananas. The shipment had arrived from Guayaquil, Ecuador, and was to continue to a company in Hungary. It was reported that the street value of this batch of cocaine is more than 151 million EUR. The discovery was made on June 6 during an inspection at the port. Authorities would not identify the firm that was set to receive the shipment and stated that the recipient probably has nothing to do with the smuggling.",Moderate,Cargo Disruption,Netherlands,9/6/20 9:56,2020.0,,,True,"['guayaquil', 'rotterdam']",1.0,False,9.0,36.0,"Cocaine worth EUR 151 million found in banana shipment in Port of Rotterdam Media sources on June 9 that customs officers at the Port of Rotterdam found 2,020 kilograms of cocaine hidden inside a sea container full of bananas. The shipment had arrived from Guayaquil, Ecuador, and was to continue to a company in Hungary. It was reported that the street value of this batch of cocaine is more than 151 million EUR. The discovery was made on June 6 during an inspection at the port. Authorities would not identify the firm that was set to receive the shipment and stated that the recipient probably has nothing to do with the smuggling.",https://dailynewshungary.com/cocaine-worth-e151-million-found-in-banana-shipment-destined-for-hungary/,Cocaine worth €151 million found in banana shipment destined for Hungary,"Customs officers at the Port of Rotterdam found a massive 2,020 kilograms of cocaine hidden inside a sea container full of bananas. The shipment had arrived from Guayaquil, Ecuador, and was to continue to a company in Hungary, the Public Prosecution Service (OM) said.
|
227 |
+
|
228 |
+
Read also Police catch dealers during cocaine party – PHOTOS, VIDEOS
|
229 |
+
|
230 |
+
NLTimes reported that the street value of this batch of cocaine is more than 151 million EUR. The discovery was made on Saturday during an inspection at the port. Authorities would not identify the firm that was set to receive the shipment and stated that the recipient probably has nothing to do with the smuggling.
|
231 |
+
|
232 |
+
The Public Prosecution Service of Rotterdam reported that last weekend another 374 kilograms of cocaine was hidden in a different banana shipment.
|
233 |
+
|
234 |
+
Two Rotterdam men who had raised suspicions by their presence at a port terminal were later arrested when they were caught coming out of the container. That shipment was valued at 28 million EUR. The OM on Friday also acknowledged for the first time a separate discovery of cocaine at the Port of Rotterdam dating back to May 20.
|
235 |
+
|
236 |
+
In that case, a smaller batch of 37 kilograms of cocaine was found in a shipment of pomegranates.
|
237 |
+
|
238 |
+
The fruit had arrived from Lima, Peru and was destined for a company in Germany. Again, the OM said that the company was not believed to be involved with drug trafficking. Bricks of cocaine were packaged and kept inside of duffel bags hidden in the sea container. The Hit-and-Run Cargo Team ran all three drug busts at the Port of Rotterdam.
|
239 |
+
|
240 |
+
Featured image: Illustration
|
241 |
+
|
242 |
+
Brazilian woman tried to smuggle 3 kgs of cocaine in condoms into Hungary
|
243 |
+
|
244 |
+
The 30-year-old woman arrived in Hungary with more than three kilogrammes of cocaine. The airport police are now investigating the case. She arrived at Budapest Liszt Ferenc International Airport on 10 February in the late hours of the night from Brazil.
|
245 |
+
|
246 |
+
READ MORE HERE
|
247 |
+
|
248 |
+
Source: www.nltimes.nl",0.6601043045520782
|
249 |
+
3570,Guardia Civil seizes 250 kg of cocaine at the Port of Valencia,"Spanish media sources on May 6 report that a Guardia Civil operation has resulted in the seizure of 250kg of cocaine that arrived at the Port of Valencia hidden among the pieces of a helicopter. The aircraft was dismantled in a South American country after being used in a campaign to prevent forest fires, and it arrived at the port of Valencia in containers. Port workers took the helicopter to the APM Terminals Valencia facilities, specifically to a restricted access area controlled by security guards. There were no arrests in the operation. The cocaine stash is valued EUR 8.5 million.",Moderate,Cargo Disruption,Spain,7/5/20 12:00,2020.0,39.43057,-0.32286,True,['valencia'],1.0,True,7.0,27.0,"Guardia Civil seizes 250 kg of cocaine at the Port of Valencia Spanish media sources on May 6 report that a Guardia Civil operation has resulted in the seizure of 250kg of cocaine that arrived at the Port of Valencia hidden among the pieces of a helicopter. The aircraft was dismantled in a South American country after being used in a campaign to prevent forest fires, and it arrived at the port of Valencia in containers. Port workers took the helicopter to the APM Terminals Valencia facilities, specifically to a restricted access area controlled by security guards. There were no arrests in the operation. The cocaine stash is valued EUR 8.5 million.",https://www.theolivepress.es/spain-news/2020/05/06/250kg-of-cocaine-found-hidden-in-south-american-helicopter-parts-in-spains-valencia/,250kg of cocaine found hidden in South American helicopter parts in Spain’s Valencia,"A GUARDIA Civil operation has resulted in the seizure of 250kg of cocaine that arrived at the Port of Valencia hidden among the pieces of a helicopter.
|
250 |
+
|
251 |
+
The aircraft was dismantled in a South American country after being used in a campaign to prevent forest fires, and on Saturday it arrived at the port of Valencia in containers.
|
252 |
+
|
253 |
+
Port workers took the helicopter to the APM Terminals Valencia facilities, specifically to a restricted access area controlled by security guards.
|
254 |
+
|
255 |
+
According to investigations, a member of the gang must have got spooked by police surveillance or for another unknown reason and not come for the delivery.
|
256 |
+
|
257 |
+
The Risk Analysis Unit (UAR), a group made up of civil guards from the Valencia Office of Fiscal Analysis and Investigation and agents from the Tax Agency, later identified the illicit drug and seized the 250kg.
|
258 |
+
|
259 |
+
There were no arrests in the operation.
|
260 |
+
|
261 |
+
This criminal tactic, known as a blind hook or ‘rip-off’, consists of hiding the cocaine in a container with legal merchandise in the country of origin, without the knowledge of the exporter or importer, and then withdrawing the drug at the port destination before the cargo reaches the end of the route.
|
262 |
+
|
263 |
+
In this way, the drug traffickers often bypass customs controls in Valencia.
|
264 |
+
|
265 |
+
The cocaine stash is valued at 8.5 million euros, according to a first estimate by the Central Narcotics Office.",0.6175416558980942
|
266 |
+
3682,Japanese mega boxship slams into cranes at Busan New Port,"Media sources indicated on April 7 that Japanese mega boxship Milano Bridge smashed into the quayside at Busan New Port on April 6, causing a crane onto the back end of the boxship in the process. Milano Bridge is part of the Ocean Network Express fleet.",Moderate,Maritime Accident,Republic of Korea,8/4/20 2:27,2020.0,35.08092,128.83488,False,['busan'],1.0,False,8.0,32.0,"Japanese mega boxship slams into cranes at Busan New Port Media sources indicated on April 7 that Japanese mega boxship Milano Bridge smashed into the quayside at Busan New Port on April 6, causing a crane onto the back end of the boxship in the process. Milano Bridge is part of the Ocean Network Express fleet.",https://splash247.com/japanese-ultra-large-containership-slams-into-busan-gantry-cranes/,Japanese mega boxship slams into cranes at Busan New Port,"Yet another gantry crane has been pulverised by a poorly berthed mega boxship. The Japanese controlled 13,900 teu Milano Bridge , part of the Ocean Network Express (ONE) fleet, smashed into the quayside at Busan New Port on Monday, bringing down a crane onto the back end of the boxship in the process.
|
267 |
+
|
268 |
+
Dramatic footage sent to Splash shows the incident, which also sees the vessel come into contact with 10,000 teu boxship Seaspan Ganges.
|
269 |
+
|
270 |
+
As well as bringing down crane 85, at least two other cranes sustained damage however no injuries were reported.
|
271 |
+
|
272 |
+
Over the past few years, terminals at Jebel Ali, Karachi, Santos, Semarang, Antwerp and Haiphong have suffered from vessels pranging into cranes.
|
273 |
+
|
274 |
+
According to VesselsValue, Milano Bridge is owned by Doun Kisen and is on charter to ONE.",0.6692844182252884
|
notebooks/00_EDA.ipynb
CHANGED
@@ -96,17 +96,18 @@
|
|
96 |
}
|
97 |
],
|
98 |
"source": [
|
99 |
-
"# First, load the uploaded CSV file
|
100 |
"import pandas as pd\n",
|
101 |
-
"
|
|
|
102 |
"data = pd.read_csv(data_path)\n",
|
103 |
"\n",
|
104 |
"# Display the first few rows of the dataframe and its summary statistics to get an initial understanding\n",
|
105 |
"data_head = data.head()\n",
|
106 |
"data_info = data.info()\n",
|
107 |
-
"data_description = data.describe(include
|
108 |
"\n",
|
109 |
-
"data_info
|
110 |
]
|
111 |
},
|
112 |
{
|
@@ -845,12 +846,14 @@
|
|
845 |
"missing_values_percentage = (missing_values_count / len(data)) * 100\n",
|
846 |
"\n",
|
847 |
"# Combine count and percentage into a dataframe for easier reading\n",
|
848 |
-
"missing_values_df = pd.DataFrame(
|
849 |
-
"
|
850 |
-
"
|
851 |
-
"
|
|
|
|
|
852 |
"\n",
|
853 |
-
"missing_values_df.sort_values(by
|
854 |
]
|
855 |
},
|
856 |
{
|
@@ -873,8 +876,11 @@
|
|
873 |
}
|
874 |
],
|
875 |
"source": [
|
876 |
-
"columns_to_keep = [
|
877 |
-
"columns_to_drop = missing_values_percentage[
|
|
|
|
|
|
|
878 |
"\n",
|
879 |
"# Now drop the columns except for the ones we want to keep\n",
|
880 |
"data_cleaned = data.drop(columns=columns_to_drop)\n",
|
@@ -907,10 +913,10 @@
|
|
907 |
")\n",
|
908 |
"\n",
|
909 |
"# Create a new 'id' column starting from 1\n",
|
910 |
-
"data_cleaned[
|
911 |
"\n",
|
912 |
"# Optionally, if you want 'id' to be the first column, you can rearrange the columns like this:\n",
|
913 |
-
"cols = [
|
914 |
"data_cleaned = data_cleaned[cols]"
|
915 |
]
|
916 |
},
|
@@ -931,7 +937,9 @@
|
|
931 |
"metadata": {},
|
932 |
"outputs": [],
|
933 |
"source": [
|
934 |
-
"data_cleaned[
|
|
|
|
|
935 |
"\n",
|
936 |
"# Now, the DataFrame `data_cleaned` has a new column 'Headline_Details' combining the texts"
|
937 |
]
|
@@ -1314,7 +1322,7 @@
|
|
1314 |
}
|
1315 |
],
|
1316 |
"source": [
|
1317 |
-
"data[
|
1318 |
]
|
1319 |
},
|
1320 |
{
|
@@ -1358,7 +1366,7 @@
|
|
1358 |
}
|
1359 |
],
|
1360 |
"source": [
|
1361 |
-
"data[
|
1362 |
]
|
1363 |
},
|
1364 |
{
|
@@ -2175,7 +2183,7 @@
|
|
2175 |
}
|
2176 |
],
|
2177 |
"source": [
|
2178 |
-
"data[
|
2179 |
]
|
2180 |
},
|
2181 |
{
|
@@ -2202,7 +2210,7 @@
|
|
2202 |
"metadata": {},
|
2203 |
"outputs": [],
|
2204 |
"source": [
|
2205 |
-
"severity_counts = data[
|
2206 |
]
|
2207 |
},
|
2208 |
{
|
@@ -2223,10 +2231,15 @@
|
|
2223 |
],
|
2224 |
"source": [
|
2225 |
"plt.figure(figsize=(12, 6)) # Adjust size as needed\n",
|
2226 |
-
"plt.pie(
|
2227 |
-
"
|
2228 |
-
"
|
2229 |
-
"
|
|
|
|
|
|
|
|
|
|
|
2230 |
]
|
2231 |
},
|
2232 |
{
|
@@ -2262,7 +2275,7 @@
|
|
2262 |
"metadata": {},
|
2263 |
"outputs": [],
|
2264 |
"source": [
|
2265 |
-
"minor_cases = data[data[
|
2266 |
]
|
2267 |
},
|
2268 |
{
|
@@ -2271,7 +2284,7 @@
|
|
2271 |
"metadata": {},
|
2272 |
"outputs": [],
|
2273 |
"source": [
|
2274 |
-
"country_counts = minor_cases[
|
2275 |
]
|
2276 |
},
|
2277 |
{
|
@@ -2280,8 +2293,6 @@
|
|
2280 |
"metadata": {},
|
2281 |
"outputs": [],
|
2282 |
"source": [
|
2283 |
-
"\n",
|
2284 |
-
"\n",
|
2285 |
"# Keep the top 3 countries\n",
|
2286 |
"top_countries = country_counts.nlargest(3)\n",
|
2287 |
"\n",
|
@@ -2292,8 +2303,7 @@
|
|
2292 |
"top_countries_series = top_countries\n",
|
2293 |
"\n",
|
2294 |
"# Add the 'Rest' category by assigning it directly to the Series\n",
|
2295 |
-
"top_countries_series[
|
2296 |
-
"\n"
|
2297 |
]
|
2298 |
},
|
2299 |
{
|
@@ -2326,11 +2336,16 @@
|
|
2326 |
"\n",
|
2327 |
"# Create the pie chart with matplotlib, using the custom seaborn color palette\n",
|
2328 |
"plt.figure(figsize=(10, 6))\n",
|
2329 |
-
"plt.pie(
|
2330 |
-
"
|
|
|
|
|
|
|
|
|
|
|
2331 |
"\n",
|
2332 |
"plt.title(\"Distribution of 'Moderate' Cases Among Top 5 Countries and Rest\")\n",
|
2333 |
-
"plt.show()
|
2334 |
]
|
2335 |
},
|
2336 |
{
|
@@ -2351,7 +2366,7 @@
|
|
2351 |
],
|
2352 |
"source": [
|
2353 |
"# top 10 regions with the most number of cases\n",
|
2354 |
-
"top_regions = data[
|
2355 |
"\n",
|
2356 |
"# Filter the DataFrame to include only the top 10 categories\n",
|
2357 |
"data_top_regions = data[data[\"Region\"].isin(top_regions)]\n",
|
@@ -2412,17 +2427,21 @@
|
|
2412 |
],
|
2413 |
"source": [
|
2414 |
"# Count the occurrences of each category and select the top 10\n",
|
2415 |
-
"top_categories = data[
|
2416 |
"\n",
|
2417 |
"# Filter the DataFrame to include only the top 10 categories\n",
|
2418 |
-
"data_top_categories = data[data[
|
2419 |
"\n",
|
2420 |
"# Plot\n",
|
2421 |
"plt.figure(figsize=(12, 8)) # Adjust size as needed\n",
|
2422 |
-
"sns.countplot(
|
2423 |
-
"
|
|
|
|
|
|
|
|
|
2424 |
"plt.xlabel(\"Number of Incidents\")\n",
|
2425 |
-
"plt.ylabel(
|
2426 |
"plt.show()"
|
2427 |
]
|
2428 |
},
|
@@ -2440,14 +2459,14 @@
|
|
2440 |
"outputs": [],
|
2441 |
"source": [
|
2442 |
"# Filter data for China and United States\n",
|
2443 |
-
"china_cases = data[data[
|
2444 |
-
"us_cases = data[data[
|
2445 |
"\n",
|
2446 |
"# Get top 5 event categories for China\n",
|
2447 |
-
"china_top_5 = china_cases[
|
2448 |
"\n",
|
2449 |
"# Get top 5 event categories for United States\n",
|
2450 |
-
"us_top_5 = us_cases[
|
2451 |
]
|
2452 |
},
|
2453 |
{
|
@@ -2457,8 +2476,12 @@
|
|
2457 |
"outputs": [],
|
2458 |
"source": [
|
2459 |
"# Convert Series to DataFrame\n",
|
2460 |
-
"china_plot_data = china_top_5.reset_index().rename(
|
2461 |
-
"
|
|
|
|
|
|
|
|
|
2462 |
]
|
2463 |
},
|
2464 |
{
|
@@ -2596,35 +2619,41 @@
|
|
2596 |
"\n",
|
2597 |
"# Plot for China\n",
|
2598 |
"plt.figure(figsize=(10, 6))\n",
|
2599 |
-
"ax_china = sns.barplot(
|
2600 |
-
"
|
2601 |
-
"
|
2602 |
-
"plt.
|
|
|
|
|
2603 |
"\n",
|
2604 |
"# Loop through the bars and add text annotation\n",
|
2605 |
"for p in ax_china.patches:\n",
|
2606 |
" width = p.get_width()\n",
|
2607 |
-
" plt.text(
|
2608 |
-
"
|
2609 |
-
"
|
2610 |
-
"
|
|
|
|
|
2611 |
"\n",
|
2612 |
"plt.show()\n",
|
2613 |
"\n",
|
2614 |
"# Plot for United States\n",
|
2615 |
"plt.figure(figsize=(10, 6))\n",
|
2616 |
-
"ax_us = sns.barplot(x
|
2617 |
-
"plt.title(
|
2618 |
-
"plt.xlabel(
|
2619 |
-
"plt.ylabel(
|
2620 |
"\n",
|
2621 |
"# Loop through the bars and add text annotation for the US plot\n",
|
2622 |
"for p in ax_us.patches:\n",
|
2623 |
" width = p.get_width()\n",
|
2624 |
-
" plt.text(
|
2625 |
-
"
|
2626 |
-
"
|
2627 |
-
"
|
|
|
|
|
2628 |
"\n",
|
2629 |
"plt.show()"
|
2630 |
]
|
|
|
96 |
}
|
97 |
],
|
98 |
"source": [
|
99 |
+
"# First, load the uploaded CSV file\n",
|
100 |
"import pandas as pd\n",
|
101 |
+
"\n",
|
102 |
+
"data_path = \"data/all_port_labelled.csv\"\n",
|
103 |
"data = pd.read_csv(data_path)\n",
|
104 |
"\n",
|
105 |
"# Display the first few rows of the dataframe and its summary statistics to get an initial understanding\n",
|
106 |
"data_head = data.head()\n",
|
107 |
"data_info = data.info()\n",
|
108 |
+
"data_description = data.describe(include=\"all\")\n",
|
109 |
"\n",
|
110 |
+
"data_info"
|
111 |
]
|
112 |
},
|
113 |
{
|
|
|
846 |
"missing_values_percentage = (missing_values_count / len(data)) * 100\n",
|
847 |
"\n",
|
848 |
"# Combine count and percentage into a dataframe for easier reading\n",
|
849 |
+
"missing_values_df = pd.DataFrame(\n",
|
850 |
+
" {\n",
|
851 |
+
" \"Missing Values\": missing_values_count,\n",
|
852 |
+
" \"Percentage (%)\": missing_values_percentage,\n",
|
853 |
+
" }\n",
|
854 |
+
")\n",
|
855 |
"\n",
|
856 |
+
"missing_values_df.sort_values(by=\"Missing Values\", ascending=False)"
|
857 |
]
|
858 |
},
|
859 |
{
|
|
|
876 |
}
|
877 |
],
|
878 |
"source": [
|
879 |
+
"columns_to_keep = [\"lat\", \"lon\"]\n",
|
880 |
+
"columns_to_drop = missing_values_percentage[\n",
|
881 |
+
" (missing_values_percentage > 30)\n",
|
882 |
+
" & (~missing_values_percentage.index.isin(columns_to_keep))\n",
|
883 |
+
"].index\n",
|
884 |
"\n",
|
885 |
"# Now drop the columns except for the ones we want to keep\n",
|
886 |
"data_cleaned = data.drop(columns=columns_to_drop)\n",
|
|
|
913 |
")\n",
|
914 |
"\n",
|
915 |
"# Create a new 'id' column starting from 1\n",
|
916 |
+
"data_cleaned[\"id\"] = range(1, len(data_cleaned) + 1)\n",
|
917 |
"\n",
|
918 |
"# Optionally, if you want 'id' to be the first column, you can rearrange the columns like this:\n",
|
919 |
+
"cols = [\"id\"] + [col for col in data_cleaned.columns if col != \"id\"]\n",
|
920 |
"data_cleaned = data_cleaned[cols]"
|
921 |
]
|
922 |
},
|
|
|
937 |
"metadata": {},
|
938 |
"outputs": [],
|
939 |
"source": [
|
940 |
+
"data_cleaned[\"Headline_Details\"] = (\n",
|
941 |
+
" data_cleaned[\"Headline\"] + \" \" + data_cleaned[\"Details\"]\n",
|
942 |
+
")\n",
|
943 |
"\n",
|
944 |
"# Now, the DataFrame `data_cleaned` has a new column 'Headline_Details' combining the texts"
|
945 |
]
|
|
|
1322 |
}
|
1323 |
],
|
1324 |
"source": [
|
1325 |
+
"data[\"Region\"].value_counts()"
|
1326 |
]
|
1327 |
},
|
1328 |
{
|
|
|
1366 |
}
|
1367 |
],
|
1368 |
"source": [
|
1369 |
+
"data[\"Region\"].unique()"
|
1370 |
]
|
1371 |
},
|
1372 |
{
|
|
|
2183 |
}
|
2184 |
],
|
2185 |
"source": [
|
2186 |
+
"data[\"Category\"].unique()"
|
2187 |
]
|
2188 |
},
|
2189 |
{
|
|
|
2210 |
"metadata": {},
|
2211 |
"outputs": [],
|
2212 |
"source": [
|
2213 |
+
"severity_counts = data[\"Severity\"].value_counts()"
|
2214 |
]
|
2215 |
},
|
2216 |
{
|
|
|
2231 |
],
|
2232 |
"source": [
|
2233 |
"plt.figure(figsize=(12, 6)) # Adjust size as needed\n",
|
2234 |
+
"plt.pie(\n",
|
2235 |
+
" severity_counts,\n",
|
2236 |
+
" labels=severity_counts.index,\n",
|
2237 |
+
" autopct=lambda p: f\"{int(p/100.*severity_counts.sum())} ({p:.1f}%)\",\n",
|
2238 |
+
" startangle=140,\n",
|
2239 |
+
" counterclock=False,\n",
|
2240 |
+
")\n",
|
2241 |
+
"plt.title(\"Event Severity Distribution\")\n",
|
2242 |
+
"plt.show()"
|
2243 |
]
|
2244 |
},
|
2245 |
{
|
|
|
2275 |
"metadata": {},
|
2276 |
"outputs": [],
|
2277 |
"source": [
|
2278 |
+
"minor_cases = data[data[\"Severity\"] == \"Moderate\"].copy()"
|
2279 |
]
|
2280 |
},
|
2281 |
{
|
|
|
2284 |
"metadata": {},
|
2285 |
"outputs": [],
|
2286 |
"source": [
|
2287 |
+
"country_counts = minor_cases[\"Region\"].value_counts()"
|
2288 |
]
|
2289 |
},
|
2290 |
{
|
|
|
2293 |
"metadata": {},
|
2294 |
"outputs": [],
|
2295 |
"source": [
|
|
|
|
|
2296 |
"# Keep the top 3 countries\n",
|
2297 |
"top_countries = country_counts.nlargest(3)\n",
|
2298 |
"\n",
|
|
|
2303 |
"top_countries_series = top_countries\n",
|
2304 |
"\n",
|
2305 |
"# Add the 'Rest' category by assigning it directly to the Series\n",
|
2306 |
+
"top_countries_series[\"Rest\"] = rest_count"
|
|
|
2307 |
]
|
2308 |
},
|
2309 |
{
|
|
|
2336 |
"\n",
|
2337 |
"# Create the pie chart with matplotlib, using the custom seaborn color palette\n",
|
2338 |
"plt.figure(figsize=(10, 6))\n",
|
2339 |
+
"plt.pie(\n",
|
2340 |
+
" top_countries_series,\n",
|
2341 |
+
" labels=top_countries_series.index,\n",
|
2342 |
+
" autopct=\"%1.1f%%\",\n",
|
2343 |
+
" startangle=90,\n",
|
2344 |
+
" colors=palette,\n",
|
2345 |
+
")\n",
|
2346 |
"\n",
|
2347 |
"plt.title(\"Distribution of 'Moderate' Cases Among Top 5 Countries and Rest\")\n",
|
2348 |
+
"plt.show()"
|
2349 |
]
|
2350 |
},
|
2351 |
{
|
|
|
2366 |
],
|
2367 |
"source": [
|
2368 |
"# top 10 regions with the most number of cases\n",
|
2369 |
+
"top_regions = data[\"Region\"].value_counts().nlargest(10).index\n",
|
2370 |
"\n",
|
2371 |
"# Filter the DataFrame to include only the top 10 categories\n",
|
2372 |
"data_top_regions = data[data[\"Region\"].isin(top_regions)]\n",
|
|
|
2427 |
],
|
2428 |
"source": [
|
2429 |
"# Count the occurrences of each category and select the top 10\n",
|
2430 |
+
"top_categories = data[\"Category\"].value_counts().nlargest(10).index\n",
|
2431 |
"\n",
|
2432 |
"# Filter the DataFrame to include only the top 10 categories\n",
|
2433 |
+
"data_top_categories = data[data[\"Category\"].isin(top_categories)]\n",
|
2434 |
"\n",
|
2435 |
"# Plot\n",
|
2436 |
"plt.figure(figsize=(12, 8)) # Adjust size as needed\n",
|
2437 |
+
"sns.countplot(\n",
|
2438 |
+
" y=\"Category\",\n",
|
2439 |
+
" data=data_top_categories,\n",
|
2440 |
+
" order=data_top_categories[\"Category\"].value_counts().index,\n",
|
2441 |
+
")\n",
|
2442 |
+
"plt.title(\"Top 10 Event Categories Distribution\")\n",
|
2443 |
"plt.xlabel(\"Number of Incidents\")\n",
|
2444 |
+
"plt.ylabel(\"Category\")\n",
|
2445 |
"plt.show()"
|
2446 |
]
|
2447 |
},
|
|
|
2459 |
"outputs": [],
|
2460 |
"source": [
|
2461 |
"# Filter data for China and United States\n",
|
2462 |
+
"china_cases = data[data[\"Region\"] == \"China\"]\n",
|
2463 |
+
"us_cases = data[data[\"Region\"] == \"United States\"]\n",
|
2464 |
"\n",
|
2465 |
"# Get top 5 event categories for China\n",
|
2466 |
+
"china_top_5 = china_cases[\"Category\"].value_counts().nlargest(5)\n",
|
2467 |
"\n",
|
2468 |
"# Get top 5 event categories for United States\n",
|
2469 |
+
"us_top_5 = us_cases[\"Category\"].value_counts().nlargest(5)"
|
2470 |
]
|
2471 |
},
|
2472 |
{
|
|
|
2476 |
"outputs": [],
|
2477 |
"source": [
|
2478 |
"# Convert Series to DataFrame\n",
|
2479 |
+
"china_plot_data = china_top_5.reset_index().rename(\n",
|
2480 |
+
" columns={\"index\": \"Category\", \"Category\": \"Category\"}\n",
|
2481 |
+
")\n",
|
2482 |
+
"us_plot_data = us_top_5.reset_index().rename(\n",
|
2483 |
+
" columns={\"index\": \"Category\", \"Category\": \"Category\"}\n",
|
2484 |
+
")"
|
2485 |
]
|
2486 |
},
|
2487 |
{
|
|
|
2619 |
"\n",
|
2620 |
"# Plot for China\n",
|
2621 |
"plt.figure(figsize=(10, 6))\n",
|
2622 |
+
"ax_china = sns.barplot(\n",
|
2623 |
+
" x=\"count\", y=\"Category\", data=china_plot_data, palette=\"Oranges_r\"\n",
|
2624 |
+
")\n",
|
2625 |
+
"plt.title(\"Top 5 Event Categories in China\")\n",
|
2626 |
+
"plt.xlabel(\"Number of Events\")\n",
|
2627 |
+
"plt.ylabel(\"Event Category\")\n",
|
2628 |
"\n",
|
2629 |
"# Loop through the bars and add text annotation\n",
|
2630 |
"for p in ax_china.patches:\n",
|
2631 |
" width = p.get_width()\n",
|
2632 |
+
" plt.text(\n",
|
2633 |
+
" width + 1, # x position, shifted +1 to the right for spacing\n",
|
2634 |
+
" p.get_y() + p.get_height() / 2, # y position, at the center of the bar\n",
|
2635 |
+
" f\"{int(width)}\", # text label, the count of events\n",
|
2636 |
+
" va=\"center\",\n",
|
2637 |
+
" ) # center alignment\n",
|
2638 |
"\n",
|
2639 |
"plt.show()\n",
|
2640 |
"\n",
|
2641 |
"# Plot for United States\n",
|
2642 |
"plt.figure(figsize=(10, 6))\n",
|
2643 |
+
"ax_us = sns.barplot(x=\"count\", y=\"Category\", data=us_plot_data, palette=\"Blues_r\")\n",
|
2644 |
+
"plt.title(\"Top 5 Event Categories in the United States\")\n",
|
2645 |
+
"plt.xlabel(\"Number of Events\")\n",
|
2646 |
+
"plt.ylabel(\"Event Category\")\n",
|
2647 |
"\n",
|
2648 |
"# Loop through the bars and add text annotation for the US plot\n",
|
2649 |
"for p in ax_us.patches:\n",
|
2650 |
" width = p.get_width()\n",
|
2651 |
+
" plt.text(\n",
|
2652 |
+
" width + 1, # x position, shifted +1 to the right for spacing\n",
|
2653 |
+
" p.get_y() + p.get_height() / 2, # y position, at the center of the bar\n",
|
2654 |
+
" f\"{int(width)}\", # text label, the count of events\n",
|
2655 |
+
" va=\"center\",\n",
|
2656 |
+
" ) # center alignment\n",
|
2657 |
"\n",
|
2658 |
"plt.show()"
|
2659 |
]
|
notebooks/05a_newsScraper_run_1.ipynb
CHANGED
@@ -120,11 +120,17 @@
|
|
120 |
"\n",
|
121 |
" if not results:\n",
|
122 |
" # if blocked by the website\n",
|
123 |
-
" results.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
"\n",
|
125 |
" # No articles found for the given title.\n",
|
126 |
" else:\n",
|
127 |
-
" results.append([
|
128 |
"\n",
|
129 |
" return results"
|
130 |
]
|
@@ -172,7 +178,7 @@
|
|
172 |
"outputs": [],
|
173 |
"source": [
|
174 |
"# drop empty news\n",
|
175 |
-
"df.dropna(subset=[
|
176 |
]
|
177 |
},
|
178 |
{
|
@@ -193,7 +199,7 @@
|
|
193 |
}
|
194 |
],
|
195 |
"source": [
|
196 |
-
"df[[
|
197 |
]
|
198 |
},
|
199 |
{
|
@@ -204,7 +210,7 @@
|
|
204 |
"outputs": [],
|
205 |
"source": [
|
206 |
"# drop the duplicated news\n",
|
207 |
-
"duplicates = df.duplicated(subset=[
|
208 |
"df_uni = df[~duplicates]"
|
209 |
]
|
210 |
},
|
@@ -3262,14 +3268,14 @@
|
|
3262 |
"# Iterate through each row and get the full news article\n",
|
3263 |
"tes = df_uni\n",
|
3264 |
"for index, row in tes.iterrows():\n",
|
3265 |
-
" headline = row[
|
3266 |
-
"
|
3267 |
" results = get_news_article(headline)\n",
|
3268 |
-
"
|
3269 |
" # Update the DataFrame with the fetched data\n",
|
3270 |
-
" tes.at[index,
|
3271 |
-
" tes.at[index,
|
3272 |
-
" tes.at[index,
|
3273 |
]
|
3274 |
},
|
3275 |
{
|
@@ -3738,7 +3744,9 @@
|
|
3738 |
}
|
3739 |
],
|
3740 |
"source": [
|
3741 |
-
"count_rows = tes[
|
|
|
|
|
3742 |
"count_rows"
|
3743 |
]
|
3744 |
},
|
@@ -3760,7 +3768,7 @@
|
|
3760 |
}
|
3761 |
],
|
3762 |
"source": [
|
3763 |
-
"count_rows = tes[tes[
|
3764 |
"count_rows"
|
3765 |
]
|
3766 |
},
|
@@ -3782,7 +3790,7 @@
|
|
3782 |
}
|
3783 |
],
|
3784 |
"source": [
|
3785 |
-
"count_rows = tes[tes[
|
3786 |
"count_rows"
|
3787 |
]
|
3788 |
},
|
|
|
120 |
"\n",
|
121 |
" if not results:\n",
|
122 |
" # if blocked by the website\n",
|
123 |
+
" results.append(\n",
|
124 |
+
" [\n",
|
125 |
+
" \"cannot scrape the url\",\n",
|
126 |
+
" \"cannot scrape the title\",\n",
|
127 |
+
" \"cannot scrape the content\",\n",
|
128 |
+
" ]\n",
|
129 |
+
" )\n",
|
130 |
"\n",
|
131 |
" # No articles found for the given title.\n",
|
132 |
" else:\n",
|
133 |
+
" results.append([\"no url found\", \"no title found\", \"no content found\"])\n",
|
134 |
"\n",
|
135 |
" return results"
|
136 |
]
|
|
|
178 |
"outputs": [],
|
179 |
"source": [
|
180 |
"# drop empty news\n",
|
181 |
+
"df.dropna(subset=[\"Headline\"], inplace=True)"
|
182 |
]
|
183 |
},
|
184 |
{
|
|
|
199 |
}
|
200 |
],
|
201 |
"source": [
|
202 |
+
"df[[\"Year\", \"Headline\", \"Region\"]].duplicated().any()"
|
203 |
]
|
204 |
},
|
205 |
{
|
|
|
210 |
"outputs": [],
|
211 |
"source": [
|
212 |
"# drop the duplicated news\n",
|
213 |
+
"duplicates = df.duplicated(subset=[\"Year\", \"Headline\", \"Region\"], keep=\"first\")\n",
|
214 |
"df_uni = df[~duplicates]"
|
215 |
]
|
216 |
},
|
|
|
3268 |
"# Iterate through each row and get the full news article\n",
|
3269 |
"tes = df_uni\n",
|
3270 |
"for index, row in tes.iterrows():\n",
|
3271 |
+
" headline = row[\"Headline\"]\n",
|
3272 |
+
"\n",
|
3273 |
" results = get_news_article(headline)\n",
|
3274 |
+
"\n",
|
3275 |
" # Update the DataFrame with the fetched data\n",
|
3276 |
+
" tes.at[index, \"url\"] = results[0][0]\n",
|
3277 |
+
" tes.at[index, \"title\"] = results[0][1]\n",
|
3278 |
+
" tes.at[index, \"content\"] = results[0][2]"
|
3279 |
]
|
3280 |
},
|
3281 |
{
|
|
|
3744 |
}
|
3745 |
],
|
3746 |
"source": [
|
3747 |
+
"count_rows = tes[\n",
|
3748 |
+
" ~tes[\"content\"].isin([\"cannot scrape the content\", \"no content found\"])\n",
|
3749 |
+
"].shape[0]\n",
|
3750 |
"count_rows"
|
3751 |
]
|
3752 |
},
|
|
|
3768 |
}
|
3769 |
],
|
3770 |
"source": [
|
3771 |
+
"count_rows = tes[tes[\"content\"].isin([\"no content found\"])].shape[0]\n",
|
3772 |
"count_rows"
|
3773 |
]
|
3774 |
},
|
|
|
3790 |
}
|
3791 |
],
|
3792 |
"source": [
|
3793 |
+
"count_rows = tes[tes[\"content\"].isin([\"cannot scrape the content\"])].shape[0]\n",
|
3794 |
"count_rows"
|
3795 |
]
|
3796 |
},
|
notebooks/05b_newsScraper_run_2.ipynb
CHANGED
@@ -913,8 +913,12 @@
|
|
913 |
"for index in tqdm(range(len(tes))):\n",
|
914 |
" row1 = scrapped_df1.iloc[index]\n",
|
915 |
" row2 = tes.iloc[index]\n",
|
916 |
-
"
|
917 |
-
" if
|
|
|
|
|
|
|
|
|
918 |
" row = row2\n",
|
919 |
" else:\n",
|
920 |
" row = row1\n",
|
|
|
913 |
"for index in tqdm(range(len(tes))):\n",
|
914 |
" row1 = scrapped_df1.iloc[index]\n",
|
915 |
" row2 = tes.iloc[index]\n",
|
916 |
+
"\n",
|
917 |
+
" if (\n",
|
918 |
+
" row1[\"content\"] in not_found\n",
|
919 |
+
" and not row2[\"content\"] in not_found\n",
|
920 |
+
" and row2[\"title\"] is not None\n",
|
921 |
+
" ):\n",
|
922 |
" row = row2\n",
|
923 |
" else:\n",
|
924 |
" row = row1\n",
|
notebooks/05c_newsScraper_clearning.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/06_basic_text_preprocessing_on_scraped_data.ipynb
CHANGED
@@ -479,7 +479,7 @@
|
|
479 |
"outputs": [],
|
480 |
"source": [
|
481 |
"# drop empty lines\n",
|
482 |
-
"df_copy.dropna(subset=[
|
483 |
]
|
484 |
},
|
485 |
{
|
@@ -500,8 +500,8 @@
|
|
500 |
],
|
501 |
"source": [
|
502 |
"print(\"Published Date Statistics:\")\n",
|
503 |
-
"print(\"Min Date:\", df_copy[
|
504 |
-
"print(\"Max Date:\", df_copy[
|
505 |
]
|
506 |
},
|
507 |
{
|
@@ -523,7 +523,7 @@
|
|
523 |
],
|
524 |
"source": [
|
525 |
"# Check if there are any duplicated titles since a news can be published for multiple times by different publisher at different time\n",
|
526 |
-
"df_copy[[
|
527 |
]
|
528 |
},
|
529 |
{
|
@@ -534,7 +534,7 @@
|
|
534 |
"outputs": [],
|
535 |
"source": [
|
536 |
"# drop the duplicated news\n",
|
537 |
-
"duplicates = df_copy.duplicated(subset=[
|
538 |
"df_uni = df_copy[~duplicates]"
|
539 |
]
|
540 |
},
|
@@ -1295,7 +1295,7 @@
|
|
1295 |
}
|
1296 |
],
|
1297 |
"source": [
|
1298 |
-
"df_uni[
|
1299 |
]
|
1300 |
},
|
1301 |
{
|
@@ -1306,10 +1306,12 @@
|
|
1306 |
"outputs": [],
|
1307 |
"source": [
|
1308 |
"## remove contractions, lowercase, remove numbers and punctuations, remove stopwords\n",
|
1309 |
-
"df_uni[
|
|
|
|
|
1310 |
"\n",
|
1311 |
"## convert back into string so that tokenization can be done\n",
|
1312 |
-
"df_uni[
|
1313 |
]
|
1314 |
},
|
1315 |
{
|
@@ -1330,7 +1332,7 @@
|
|
1330 |
}
|
1331 |
],
|
1332 |
"source": [
|
1333 |
-
"df_uni[
|
1334 |
]
|
1335 |
},
|
1336 |
{
|
@@ -1385,18 +1387,23 @@
|
|
1385 |
"\n",
|
1386 |
"wnl = WordNetLemmatizer()\n",
|
1387 |
"\n",
|
|
|
1388 |
"def lemmatize_words(text):\n",
|
1389 |
" # Tokenize the text into sentences and then words\n",
|
1390 |
" sentences = sent_tokenize(text)\n",
|
1391 |
" words = [word_tokenize(sentence) for sentence in sentences]\n",
|
1392 |
"\n",
|
1393 |
" # Remove punctuation and tokenize into lowercase words\n",
|
1394 |
-
" punc = [[w.lower() for w in word if re.search(
|
1395 |
"\n",
|
1396 |
" # Perform lemmatization on words with valid POS tags\n",
|
1397 |
-
" doc_lemmed = [
|
1398 |
-
"
|
1399 |
-
"
|
|
|
|
|
|
|
|
|
1400 |
" return doc_lemmed"
|
1401 |
]
|
1402 |
},
|
@@ -1418,7 +1425,7 @@
|
|
1418 |
"source": [
|
1419 |
"%%time\n",
|
1420 |
"\n",
|
1421 |
-
"df_uni[
|
1422 |
]
|
1423 |
},
|
1424 |
{
|
@@ -1436,8 +1443,23 @@
|
|
1436 |
"metadata": {},
|
1437 |
"outputs": [],
|
1438 |
"source": [
|
1439 |
-
"stop_list = nltk.corpus.stopwords.words(
|
1440 |
-
"stop_list += [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1441 |
"\n",
|
1442 |
"def corpus2docs2(corpus):\n",
|
1443 |
" # corpus is a object returned by load_corpus that represents a corpus.\n",
|
@@ -1448,27 +1470,39 @@
|
|
1448 |
" phrases = []\n",
|
1449 |
" i = 0\n",
|
1450 |
" while i < len(doc_pos):\n",
|
1451 |
-
" if doc_pos[i][1] ==
|
1452 |
-
" if
|
1453 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
1454 |
" i += 3\n",
|
1455 |
-
" elif i+1 < len(doc_pos) and doc_pos[i+1][1] ==
|
1456 |
-
" phrases.append((doc_pos[i][0], doc_pos[i+1][0]))\n",
|
1457 |
" i += 2\n",
|
1458 |
" else:\n",
|
1459 |
" i += 1\n",
|
1460 |
-
" elif doc_pos[i][1] ==
|
1461 |
-
" if
|
1462 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
1463 |
" i += 3\n",
|
1464 |
-
" elif i+1 < len(doc_pos) and doc_pos[i+1][1] ==
|
1465 |
-
" phrases.append((doc_pos[i][0], doc_pos[i+1][0]))\n",
|
1466 |
" i += 2\n",
|
1467 |
" else:\n",
|
1468 |
" i += 1\n",
|
1469 |
" else:\n",
|
1470 |
" i += 1\n",
|
1471 |
-
" phrase_set = [
|
1472 |
" docs.append(phrase_set)\n",
|
1473 |
" return docs"
|
1474 |
]
|
@@ -1498,7 +1532,7 @@
|
|
1498 |
"metadata": {},
|
1499 |
"outputs": [],
|
1500 |
"source": [
|
1501 |
-
"df_uni[
|
1502 |
]
|
1503 |
},
|
1504 |
{
|
@@ -1546,7 +1580,7 @@
|
|
1546 |
}
|
1547 |
],
|
1548 |
"source": [
|
1549 |
-
"df_uni[
|
1550 |
]
|
1551 |
},
|
1552 |
{
|
@@ -1567,17 +1601,17 @@
|
|
1567 |
}
|
1568 |
],
|
1569 |
"source": [
|
1570 |
-
"fdist_doc = nltk.FreqDist(df_uni[
|
1571 |
"\n",
|
1572 |
"x, y = zip(*fdist_doc)\n",
|
1573 |
-
"plt.figure(figsize=(50,30))\n",
|
1574 |
"plt.margins(0.02)\n",
|
1575 |
"plt.bar(x, y)\n",
|
1576 |
-
"plt.xlabel(
|
1577 |
-
"plt.ylabel(
|
1578 |
"plt.yticks(fontsize=40)\n",
|
1579 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
1580 |
-
"plt.title(
|
1581 |
"plt.show()"
|
1582 |
]
|
1583 |
},
|
@@ -1588,7 +1622,7 @@
|
|
1588 |
"metadata": {},
|
1589 |
"outputs": [],
|
1590 |
"source": [
|
1591 |
-
"all_words = [word for sublist in df_uni[
|
1592 |
"all_words[:2]\n",
|
1593 |
"# Calculate word frequencies\n",
|
1594 |
"fdist = FreqDist(all_words)"
|
@@ -1624,7 +1658,7 @@
|
|
1624 |
"source": [
|
1625 |
"# Plot the word frequency distribution as a bar graph\n",
|
1626 |
"plt.figure(figsize=(12, 6))\n",
|
1627 |
-
"plt.title(
|
1628 |
"fdist.plot(30, cumulative=False)"
|
1629 |
]
|
1630 |
},
|
@@ -1654,7 +1688,7 @@
|
|
1654 |
}
|
1655 |
],
|
1656 |
"source": [
|
1657 |
-
"com = df_uni[
|
1658 |
"com[:10]"
|
1659 |
]
|
1660 |
},
|
@@ -1711,11 +1745,11 @@
|
|
1711 |
"\n",
|
1712 |
"# Plotting with Seaborn for each company\n",
|
1713 |
"for region in com[:10]:\n",
|
1714 |
-
" haha = df_uni[
|
1715 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
1716 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
1717 |
-
" plt.imshow(wordcloud, interpolation
|
1718 |
-
" plt.title(f
|
1719 |
" plt.axis(\"off\")\n",
|
1720 |
" plt.margins(x=0, y=0)\n",
|
1721 |
" plt.show()"
|
@@ -1738,10 +1772,10 @@
|
|
1738 |
"metadata": {},
|
1739 |
"outputs": [],
|
1740 |
"source": [
|
1741 |
-
"df_uni[
|
1742 |
"\n",
|
1743 |
"# Tokenize the text and create a dictionary\n",
|
1744 |
-
"documents = df_uni[
|
1745 |
"dictionary = corpora.Dictionary(documents)\n",
|
1746 |
"\n",
|
1747 |
"tfidf = models.TfidfModel(dictionary=dictionary, normalize=True)\n",
|
@@ -2769,7 +2803,9 @@
|
|
2769 |
}
|
2770 |
],
|
2771 |
"source": [
|
2772 |
-
"sorted_term_frequencies = dict(
|
|
|
|
|
2773 |
"sorted_term_frequencies"
|
2774 |
]
|
2775 |
},
|
@@ -2791,11 +2827,13 @@
|
|
2791 |
"# customisable, lower threshold, more words retained.\n",
|
2792 |
"threshold = 0.03\n",
|
2793 |
"\n",
|
|
|
2794 |
"def filter_and_join(tfidf_doc):\n",
|
2795 |
" filtered_terms = [dictionary[id] for id, score in tfidf_doc if score >= threshold]\n",
|
2796 |
" return filtered_terms\n",
|
2797 |
"\n",
|
2798 |
-
"
|
|
|
2799 |
]
|
2800 |
},
|
2801 |
{
|
@@ -2827,7 +2865,7 @@
|
|
2827 |
}
|
2828 |
],
|
2829 |
"source": [
|
2830 |
-
"df_uni[
|
2831 |
]
|
2832 |
},
|
2833 |
{
|
@@ -2848,17 +2886,17 @@
|
|
2848 |
}
|
2849 |
],
|
2850 |
"source": [
|
2851 |
-
"fdist_doc = nltk.FreqDist(df_uni[
|
2852 |
"\n",
|
2853 |
"x, y = zip(*fdist_doc)\n",
|
2854 |
-
"plt.figure(figsize=(50,30))\n",
|
2855 |
"plt.margins(0.02)\n",
|
2856 |
"plt.bar(x, y)\n",
|
2857 |
-
"plt.xlabel(
|
2858 |
-
"plt.ylabel(
|
2859 |
"plt.yticks(fontsize=40)\n",
|
2860 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
2861 |
-
"plt.title(
|
2862 |
"plt.show()"
|
2863 |
]
|
2864 |
},
|
@@ -2869,7 +2907,7 @@
|
|
2869 |
"metadata": {},
|
2870 |
"outputs": [],
|
2871 |
"source": [
|
2872 |
-
"all_words_filtered = [word for sublist in df_uni[
|
2873 |
"all_words_filtered[:2]\n",
|
2874 |
"# Calculate word frequencies\n",
|
2875 |
"fdist_filtered = FreqDist(all_words_filtered)"
|
@@ -2906,7 +2944,7 @@
|
|
2906 |
"# Plot the word frequency distribution as a bar graph\n",
|
2907 |
"# apparently, the dataset is much cleaner now.\n",
|
2908 |
"plt.figure(figsize=(12, 6))\n",
|
2909 |
-
"plt.title(
|
2910 |
"fdist_filtered.plot(30, cumulative=False)"
|
2911 |
]
|
2912 |
},
|
@@ -2963,11 +3001,11 @@
|
|
2963 |
"\n",
|
2964 |
"# Plotting with Seaborn for each company\n",
|
2965 |
"for region in com[:10]:\n",
|
2966 |
-
" haha = df_uni[
|
2967 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
2968 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
2969 |
-
" plt.imshow(wordcloud, interpolation
|
2970 |
-
" plt.title(f
|
2971 |
" plt.axis(\"off\")\n",
|
2972 |
" plt.margins(x=0, y=0)\n",
|
2973 |
" plt.show()"
|
@@ -2980,7 +3018,7 @@
|
|
2980 |
"metadata": {},
|
2981 |
"outputs": [],
|
2982 |
"source": [
|
2983 |
-
"df_uni[
|
2984 |
]
|
2985 |
},
|
2986 |
{
|
@@ -3068,7 +3106,7 @@
|
|
3068 |
}
|
3069 |
],
|
3070 |
"source": [
|
3071 |
-
"df_uni[[
|
3072 |
]
|
3073 |
},
|
3074 |
{
|
@@ -3180,7 +3218,9 @@
|
|
3180 |
],
|
3181 |
"source": [
|
3182 |
"# count of news by region\n",
|
3183 |
-
"df_uni[[
|
|
|
|
|
3184 |
]
|
3185 |
},
|
3186 |
{
|
@@ -3253,7 +3293,9 @@
|
|
3253 |
}
|
3254 |
],
|
3255 |
"source": [
|
3256 |
-
"df_uni[[
|
|
|
|
|
3257 |
]
|
3258 |
},
|
3259 |
{
|
@@ -3566,7 +3608,7 @@
|
|
3566 |
"outputs": [],
|
3567 |
"source": [
|
3568 |
"# export as parquet data file instead of csv for easier list extraction\n",
|
3569 |
-
"df_uni.to_parquet(
|
3570 |
]
|
3571 |
}
|
3572 |
],
|
|
|
479 |
"outputs": [],
|
480 |
"source": [
|
481 |
"# drop empty lines\n",
|
482 |
+
"df_copy.dropna(subset=[\"Headline\"], inplace=True)"
|
483 |
]
|
484 |
},
|
485 |
{
|
|
|
500 |
],
|
501 |
"source": [
|
502 |
"print(\"Published Date Statistics:\")\n",
|
503 |
+
"print(\"Min Date:\", df_copy[\"Datetime\"].min())\n",
|
504 |
+
"print(\"Max Date:\", df_copy[\"Datetime\"].max())"
|
505 |
]
|
506 |
},
|
507 |
{
|
|
|
523 |
],
|
524 |
"source": [
|
525 |
"# Check if there are any duplicated titles since a news can be published for multiple times by different publisher at different time\n",
|
526 |
+
"df_copy[[\"Year\", \"Headline\", \"Region\"]].duplicated().any()"
|
527 |
]
|
528 |
},
|
529 |
{
|
|
|
534 |
"outputs": [],
|
535 |
"source": [
|
536 |
"# drop the duplicated news\n",
|
537 |
+
"duplicates = df_copy.duplicated(subset=[\"Year\", \"Headline\", \"Region\"], keep=\"first\")\n",
|
538 |
"df_uni = df_copy[~duplicates]"
|
539 |
]
|
540 |
},
|
|
|
1295 |
}
|
1296 |
],
|
1297 |
"source": [
|
1298 |
+
"df_uni[\"content\"][5]"
|
1299 |
]
|
1300 |
},
|
1301 |
{
|
|
|
1306 |
"outputs": [],
|
1307 |
"source": [
|
1308 |
"## remove contractions, lowercase, remove numbers and punctuations, remove stopwords\n",
|
1309 |
+
"df_uni[\"cleaned_content\"] = df_uni[\"content\"].apply(\n",
|
1310 |
+
" lambda x: [contractions.fix(word) for word in x.split()]\n",
|
1311 |
+
")\n",
|
1312 |
"\n",
|
1313 |
"## convert back into string so that tokenization can be done\n",
|
1314 |
+
"df_uni[\"cleaned_content\"] = [\" \".join(map(str, l)) for l in df_uni[\"cleaned_content\"]]"
|
1315 |
]
|
1316 |
},
|
1317 |
{
|
|
|
1332 |
}
|
1333 |
],
|
1334 |
"source": [
|
1335 |
+
"df_uni[\"cleaned_content\"][5]"
|
1336 |
]
|
1337 |
},
|
1338 |
{
|
|
|
1387 |
"\n",
|
1388 |
"wnl = WordNetLemmatizer()\n",
|
1389 |
"\n",
|
1390 |
+
"\n",
|
1391 |
"def lemmatize_words(text):\n",
|
1392 |
" # Tokenize the text into sentences and then words\n",
|
1393 |
" sentences = sent_tokenize(text)\n",
|
1394 |
" words = [word_tokenize(sentence) for sentence in sentences]\n",
|
1395 |
"\n",
|
1396 |
" # Remove punctuation and tokenize into lowercase words\n",
|
1397 |
+
" punc = [[w.lower() for w in word if re.search(\"^[a-zA-Z]+$\", w)] for word in words]\n",
|
1398 |
"\n",
|
1399 |
" # Perform lemmatization on words with valid POS tags\n",
|
1400 |
+
" doc_lemmed = [\n",
|
1401 |
+
" wnl.lemmatize(word, pos[0].lower())\n",
|
1402 |
+
" for sentence in punc\n",
|
1403 |
+
" for word, pos in pos_tag(sentence, tagset=\"universal\")\n",
|
1404 |
+
" if pos[0].lower() in [\"a\", \"s\", \"r\", \"n\", \"v\"]\n",
|
1405 |
+
" ]\n",
|
1406 |
+
"\n",
|
1407 |
" return doc_lemmed"
|
1408 |
]
|
1409 |
},
|
|
|
1425 |
"source": [
|
1426 |
"%%time\n",
|
1427 |
"\n",
|
1428 |
+
"df_uni[\"cleaned_content\"] = df_uni[\"cleaned_content\"].apply(lemmatize_words)"
|
1429 |
]
|
1430 |
},
|
1431 |
{
|
|
|
1443 |
"metadata": {},
|
1444 |
"outputs": [],
|
1445 |
"source": [
|
1446 |
+
"stop_list = nltk.corpus.stopwords.words(\"english\")\n",
|
1447 |
+
"stop_list += [\n",
|
1448 |
+
" \"local\",\n",
|
1449 |
+
" \"time\",\n",
|
1450 |
+
" \"wednesday\",\n",
|
1451 |
+
" \"source\",\n",
|
1452 |
+
" \"certain\",\n",
|
1453 |
+
" \"report\",\n",
|
1454 |
+
" \"update\",\n",
|
1455 |
+
" \"last\",\n",
|
1456 |
+
" \"year\",\n",
|
1457 |
+
" \"week\",\n",
|
1458 |
+
" \"month\",\n",
|
1459 |
+
" \"scrape\",\n",
|
1460 |
+
" \"content\",\n",
|
1461 |
+
"]\n",
|
1462 |
+
"\n",
|
1463 |
"\n",
|
1464 |
"def corpus2docs2(corpus):\n",
|
1465 |
" # corpus is a object returned by load_corpus that represents a corpus.\n",
|
|
|
1470 |
" phrases = []\n",
|
1471 |
" i = 0\n",
|
1472 |
" while i < len(doc_pos):\n",
|
1473 |
+
" if doc_pos[i][1] == \"JJ\":\n",
|
1474 |
+
" if (\n",
|
1475 |
+
" i + 2 < len(doc_pos)\n",
|
1476 |
+
" and doc_pos[i + 1][1] == \"NN\"\n",
|
1477 |
+
" and doc_pos[i + 2][1] == \"NN\"\n",
|
1478 |
+
" ):\n",
|
1479 |
+
" phrases.append(\n",
|
1480 |
+
" (doc_pos[i][0], doc_pos[i + 1][0], doc_pos[i + 2][0])\n",
|
1481 |
+
" )\n",
|
1482 |
" i += 3\n",
|
1483 |
+
" elif i + 1 < len(doc_pos) and doc_pos[i + 1][1] == \"NN\":\n",
|
1484 |
+
" phrases.append((doc_pos[i][0], doc_pos[i + 1][0]))\n",
|
1485 |
" i += 2\n",
|
1486 |
" else:\n",
|
1487 |
" i += 1\n",
|
1488 |
+
" elif doc_pos[i][1] == \"NN\":\n",
|
1489 |
+
" if (\n",
|
1490 |
+
" i + 2 < len(doc_pos)\n",
|
1491 |
+
" and doc_pos[i + 1][1] == \"NN\"\n",
|
1492 |
+
" and doc_pos[i + 2][1] == \"NN\"\n",
|
1493 |
+
" ):\n",
|
1494 |
+
" phrases.append(\n",
|
1495 |
+
" (doc_pos[i][0], doc_pos[i + 1][0], doc_pos[i + 2][0])\n",
|
1496 |
+
" )\n",
|
1497 |
" i += 3\n",
|
1498 |
+
" elif i + 1 < len(doc_pos) and doc_pos[i + 1][1] == \"NN\":\n",
|
1499 |
+
" phrases.append((doc_pos[i][0], doc_pos[i + 1][0]))\n",
|
1500 |
" i += 2\n",
|
1501 |
" else:\n",
|
1502 |
" i += 1\n",
|
1503 |
" else:\n",
|
1504 |
" i += 1\n",
|
1505 |
+
" phrase_set = [\"_\".join(word_set) for word_set in phrases]\n",
|
1506 |
" docs.append(phrase_set)\n",
|
1507 |
" return docs"
|
1508 |
]
|
|
|
1532 |
"metadata": {},
|
1533 |
"outputs": [],
|
1534 |
"source": [
|
1535 |
+
"df_uni[\"binary_content\"] = corpus2docs2(df_uni[\"cleaned_content\"])"
|
1536 |
]
|
1537 |
},
|
1538 |
{
|
|
|
1580 |
}
|
1581 |
],
|
1582 |
"source": [
|
1583 |
+
"df_uni[\"binary_content\"][5]"
|
1584 |
]
|
1585 |
},
|
1586 |
{
|
|
|
1601 |
}
|
1602 |
],
|
1603 |
"source": [
|
1604 |
+
"fdist_doc = nltk.FreqDist(df_uni[\"binary_content\"][5]).most_common(25)\n",
|
1605 |
"\n",
|
1606 |
"x, y = zip(*fdist_doc)\n",
|
1607 |
+
"plt.figure(figsize=(50, 30))\n",
|
1608 |
"plt.margins(0.02)\n",
|
1609 |
"plt.bar(x, y)\n",
|
1610 |
+
"plt.xlabel(\"Words\", fontsize=50)\n",
|
1611 |
+
"plt.ylabel(\"Frequency of Words\", fontsize=50)\n",
|
1612 |
"plt.yticks(fontsize=40)\n",
|
1613 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
1614 |
+
"plt.title(\"Frequency of 25 Most Common Words for One Random News\", fontsize=60)\n",
|
1615 |
"plt.show()"
|
1616 |
]
|
1617 |
},
|
|
|
1622 |
"metadata": {},
|
1623 |
"outputs": [],
|
1624 |
"source": [
|
1625 |
+
"all_words = [word for sublist in df_uni[\"binary_content\"] for word in sublist]\n",
|
1626 |
"all_words[:2]\n",
|
1627 |
"# Calculate word frequencies\n",
|
1628 |
"fdist = FreqDist(all_words)"
|
|
|
1658 |
"source": [
|
1659 |
"# Plot the word frequency distribution as a bar graph\n",
|
1660 |
"plt.figure(figsize=(12, 6))\n",
|
1661 |
+
"plt.title(\"Frequency of 25 Most Common Words of the Dataset\", fontsize=12)\n",
|
1662 |
"fdist.plot(30, cumulative=False)"
|
1663 |
]
|
1664 |
},
|
|
|
1688 |
}
|
1689 |
],
|
1690 |
"source": [
|
1691 |
+
"com = df_uni[\"Severity\"].unique()\n",
|
1692 |
"com[:10]"
|
1693 |
]
|
1694 |
},
|
|
|
1745 |
"\n",
|
1746 |
"# Plotting with Seaborn for each company\n",
|
1747 |
"for region in com[:10]:\n",
|
1748 |
+
" haha = df_uni[\"binary_content\"].loc[df_uni.Severity == region]\n",
|
1749 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
1750 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
1751 |
+
" plt.imshow(wordcloud, interpolation=\"bilinear\")\n",
|
1752 |
+
" plt.title(f\"Wordcloud for {region}\")\n",
|
1753 |
" plt.axis(\"off\")\n",
|
1754 |
" plt.margins(x=0, y=0)\n",
|
1755 |
" plt.show()"
|
|
|
1772 |
"metadata": {},
|
1773 |
"outputs": [],
|
1774 |
"source": [
|
1775 |
+
"df_uni[\"binary_content\"] = df_uni[\"binary_content\"].apply(lambda x: \" \".join(x))\n",
|
1776 |
"\n",
|
1777 |
"# Tokenize the text and create a dictionary\n",
|
1778 |
+
"documents = df_uni[\"binary_content\"].str.split()\n",
|
1779 |
"dictionary = corpora.Dictionary(documents)\n",
|
1780 |
"\n",
|
1781 |
"tfidf = models.TfidfModel(dictionary=dictionary, normalize=True)\n",
|
|
|
2803 |
}
|
2804 |
],
|
2805 |
"source": [
|
2806 |
+
"sorted_term_frequencies = dict(\n",
|
2807 |
+
" sorted(term_frequencies.items(), key=lambda item: item[1], reverse=True)\n",
|
2808 |
+
")\n",
|
2809 |
"sorted_term_frequencies"
|
2810 |
]
|
2811 |
},
|
|
|
2827 |
"# customisable, lower threshold, more words retained.\n",
|
2828 |
"threshold = 0.03\n",
|
2829 |
"\n",
|
2830 |
+
"\n",
|
2831 |
"def filter_and_join(tfidf_doc):\n",
|
2832 |
" filtered_terms = [dictionary[id] for id, score in tfidf_doc if score >= threshold]\n",
|
2833 |
" return filtered_terms\n",
|
2834 |
"\n",
|
2835 |
+
"\n",
|
2836 |
+
"df_uni[\"binary_content\"] = [filter_and_join(doc) for doc in tfidf_corpus]"
|
2837 |
]
|
2838 |
},
|
2839 |
{
|
|
|
2865 |
}
|
2866 |
],
|
2867 |
"source": [
|
2868 |
+
"df_uni[\"binary_content\"]"
|
2869 |
]
|
2870 |
},
|
2871 |
{
|
|
|
2886 |
}
|
2887 |
],
|
2888 |
"source": [
|
2889 |
+
"fdist_doc = nltk.FreqDist(df_uni[\"binary_content\"][5]).most_common(25)\n",
|
2890 |
"\n",
|
2891 |
"x, y = zip(*fdist_doc)\n",
|
2892 |
+
"plt.figure(figsize=(50, 30))\n",
|
2893 |
"plt.margins(0.02)\n",
|
2894 |
"plt.bar(x, y)\n",
|
2895 |
+
"plt.xlabel(\"Words\", fontsize=50)\n",
|
2896 |
+
"plt.ylabel(\"Frequency of Words\", fontsize=50)\n",
|
2897 |
"plt.yticks(fontsize=40)\n",
|
2898 |
"plt.xticks(rotation=60, fontsize=40)\n",
|
2899 |
+
"plt.title(\"Frequency of 25 Most Common Words for One Random News\", fontsize=60)\n",
|
2900 |
"plt.show()"
|
2901 |
]
|
2902 |
},
|
|
|
2907 |
"metadata": {},
|
2908 |
"outputs": [],
|
2909 |
"source": [
|
2910 |
+
"all_words_filtered = [word for sublist in df_uni[\"binary_content\"] for word in sublist]\n",
|
2911 |
"all_words_filtered[:2]\n",
|
2912 |
"# Calculate word frequencies\n",
|
2913 |
"fdist_filtered = FreqDist(all_words_filtered)"
|
|
|
2944 |
"# Plot the word frequency distribution as a bar graph\n",
|
2945 |
"# apparently, the dataset is much cleaner now.\n",
|
2946 |
"plt.figure(figsize=(12, 6))\n",
|
2947 |
+
"plt.title(\"Frequency of 25 Most Common Words of the Dataset\", fontsize=12)\n",
|
2948 |
"fdist_filtered.plot(30, cumulative=False)"
|
2949 |
]
|
2950 |
},
|
|
|
3001 |
"\n",
|
3002 |
"# Plotting with Seaborn for each company\n",
|
3003 |
"for region in com[:10]:\n",
|
3004 |
+
" haha = df_uni[\"binary_content\"].loc[df_uni.Severity == region]\n",
|
3005 |
" text = \" \".join(\" \".join(item) for item in haha)\n",
|
3006 |
" wordcloud = WordCloud(background_color=\"white\").generate(text)\n",
|
3007 |
+
" plt.imshow(wordcloud, interpolation=\"bilinear\")\n",
|
3008 |
+
" plt.title(f\"Wordcloud for {region}\")\n",
|
3009 |
" plt.axis(\"off\")\n",
|
3010 |
" plt.margins(x=0, y=0)\n",
|
3011 |
" plt.show()"
|
|
|
3018 |
"metadata": {},
|
3019 |
"outputs": [],
|
3020 |
"source": [
|
3021 |
+
"df_uni[\"word_count\"] = df_uni[\"binary_content\"].apply(len)"
|
3022 |
]
|
3023 |
},
|
3024 |
{
|
|
|
3106 |
}
|
3107 |
],
|
3108 |
"source": [
|
3109 |
+
"df_uni[[\"word_count\"]].describe().round()"
|
3110 |
]
|
3111 |
},
|
3112 |
{
|
|
|
3218 |
],
|
3219 |
"source": [
|
3220 |
"# count of news by region\n",
|
3221 |
+
"df_uni[[\"binary_content\", \"Region\"]].groupby(\"Region\").count().sort_values(\n",
|
3222 |
+
" by=\"binary_content\", ascending=False\n",
|
3223 |
+
")"
|
3224 |
]
|
3225 |
},
|
3226 |
{
|
|
|
3293 |
}
|
3294 |
],
|
3295 |
"source": [
|
3296 |
+
"df_uni[[\"binary_content\", \"Severity\"]].groupby(\"Severity\").count().sort_values(\n",
|
3297 |
+
" by=\"binary_content\", ascending=False\n",
|
3298 |
+
")"
|
3299 |
]
|
3300 |
},
|
3301 |
{
|
|
|
3608 |
"outputs": [],
|
3609 |
"source": [
|
3610 |
"# export as parquet data file instead of csv for easier list extraction\n",
|
3611 |
+
"df_uni.to_parquet(\"data/processed_data2.parquet\", index=False)"
|
3612 |
]
|
3613 |
}
|
3614 |
],
|
notebooks/07_topic_modelling_minor.ipynb
CHANGED
@@ -76,7 +76,8 @@
|
|
76 |
"import datetime\n",
|
77 |
"\n",
|
78 |
"import warnings\n",
|
79 |
-
"
|
|
|
80 |
"\n",
|
81 |
"from pprint import pprint\n",
|
82 |
"import pyLDAvis\n",
|
@@ -98,7 +99,7 @@
|
|
98 |
"metadata": {},
|
99 |
"outputs": [],
|
100 |
"source": [
|
101 |
-
"df = pd.read_parquet(
|
102 |
]
|
103 |
},
|
104 |
{
|
@@ -417,7 +418,7 @@
|
|
417 |
"outputs": [],
|
418 |
"source": [
|
419 |
"# choose only the extreme and severe cases for modelling\n",
|
420 |
-
"cleaned = df_copy[df_copy[
|
421 |
"cleaned.reset_index(drop=True, inplace=True)"
|
422 |
]
|
423 |
},
|
@@ -524,8 +525,8 @@
|
|
524 |
}
|
525 |
],
|
526 |
"source": [
|
527 |
-
"print(
|
528 |
-
"print(
|
529 |
]
|
530 |
},
|
531 |
{
|
@@ -678,13 +679,15 @@
|
|
678 |
"outputs": [],
|
679 |
"source": [
|
680 |
"# Build LDA benchmark model\n",
|
681 |
-
"lda_model = gensim.models.LdaMulticore(
|
682 |
-
"
|
683 |
-
"
|
684 |
-
"
|
685 |
-
"
|
686 |
-
"
|
687 |
-
"
|
|
|
|
|
688 |
]
|
689 |
},
|
690 |
{
|
@@ -741,9 +744,11 @@
|
|
741 |
],
|
742 |
"source": [
|
743 |
"# Compute Benchmark Coherence Score\n",
|
744 |
-
"coherence_model_lda = CoherenceModel(
|
|
|
|
|
745 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
746 |
-
"print(
|
747 |
]
|
748 |
},
|
749 |
{
|
@@ -765,10 +770,10 @@
|
|
765 |
],
|
766 |
"source": [
|
767 |
"# Compute Benchmark Perplexity\n",
|
768 |
-
"perplex= lda_model.log_perplexity(docs_vecs, total_docs=None)
|
769 |
-
"
|
770 |
"\n",
|
771 |
-
"print(
|
772 |
]
|
773 |
},
|
774 |
{
|
@@ -821,7 +826,7 @@
|
|
821 |
"\n",
|
822 |
"# feed the LDA model into the pyLDAvis instance\n",
|
823 |
"pyLDAvis.enable_notebook()\n",
|
824 |
-
"visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
825 |
"\n",
|
826 |
"# Save the output to the html file\n",
|
827 |
"pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_minor.html\")"
|
@@ -852,20 +857,24 @@
|
|
852 |
"source": [
|
853 |
"# hyper-perameter tuning (alpha and beta)\n",
|
854 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
855 |
-
"
|
856 |
-
" lda_model = gensim.models.LdaMulticore(
|
857 |
-
"
|
858 |
-
"
|
859 |
-
"
|
860 |
-
"
|
861 |
-
"
|
862 |
-
"
|
863 |
-
"
|
864 |
-
"
|
865 |
-
"
|
|
|
|
|
|
|
|
|
866 |
" coherence = coherence_model_lda.get_coherence()\n",
|
867 |
-
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)
|
868 |
-
"
|
869 |
" return coherence, perplex"
|
870 |
]
|
871 |
},
|
@@ -893,12 +902,12 @@
|
|
893 |
"\n",
|
894 |
"# Alpha parameter\n",
|
895 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
896 |
-
"alpha.append(
|
897 |
-
"alpha.append(
|
898 |
"\n",
|
899 |
"# Beta parameter\n",
|
900 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
901 |
-
"beta.append(
|
902 |
]
|
903 |
},
|
904 |
{
|
@@ -928,8 +937,8 @@
|
|
928 |
}
|
929 |
],
|
930 |
"source": [
|
931 |
-
"print(\"Topic range: \",num_topics)\n",
|
932 |
-
"print(\"Alpha: \",alpha)\n",
|
933 |
"print(\"Beta: \", beta)"
|
934 |
]
|
935 |
},
|
@@ -1149,15 +1158,28 @@
|
|
1149 |
"for a in alpha:\n",
|
1150 |
" for b in beta:\n",
|
1151 |
" for num in num_topics:\n",
|
1152 |
-
" cv, pv = compute_coherence_values(
|
|
|
|
|
1153 |
"\n",
|
1154 |
-
" model_topics.append(num)
|
1155 |
-
" coherence_values.append(cv)
|
1156 |
" perplexity_values.append(pv)\n",
|
1157 |
" alpha_result.append(a)\n",
|
1158 |
" beta_result.append(b)\n",
|
1159 |
-
" print(\
|
1160 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1161 |
"print(datetime.datetime.now())"
|
1162 |
]
|
1163 |
},
|
@@ -1178,13 +1200,17 @@
|
|
1178 |
"source": [
|
1179 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1180 |
"result = pd.DataFrame(\n",
|
1181 |
-
" {
|
1182 |
-
"
|
1183 |
-
"
|
1184 |
-
"
|
1185 |
-
"
|
1186 |
-
"
|
1187 |
-
"
|
|
|
|
|
|
|
|
|
1188 |
]
|
1189 |
},
|
1190 |
{
|
@@ -1194,7 +1220,7 @@
|
|
1194 |
"metadata": {},
|
1195 |
"outputs": [],
|
1196 |
"source": [
|
1197 |
-
"result.to_csv(
|
1198 |
]
|
1199 |
},
|
1200 |
{
|
@@ -1207,7 +1233,7 @@
|
|
1207 |
"outputs": [],
|
1208 |
"source": [
|
1209 |
"# Show graph Topics vs Coherence Score\n",
|
1210 |
-
"result.groupby(
|
1211 |
]
|
1212 |
},
|
1213 |
{
|
@@ -1222,7 +1248,7 @@
|
|
1222 |
"plt.plot(model_topics, coherence_values)\n",
|
1223 |
"plt.xlabel(\"Num Topics\")\n",
|
1224 |
"plt.ylabel(\"Coherence Score\")\n",
|
1225 |
-
"plt.legend((\"Coherence Score\"), loc
|
1226 |
"plt.show()"
|
1227 |
]
|
1228 |
},
|
@@ -1238,7 +1264,7 @@
|
|
1238 |
"plt.plot(model_topics, perplexity_values)\n",
|
1239 |
"plt.xlabel(\"Num Topics\")\n",
|
1240 |
"plt.ylabel(\"Perplexity score\")\n",
|
1241 |
-
"plt.legend((\"perplexity_values\"), loc
|
1242 |
"plt.show()"
|
1243 |
]
|
1244 |
},
|
@@ -1270,17 +1296,19 @@
|
|
1270 |
"# a = 'asymmetric'\n",
|
1271 |
"a = 0.31\n",
|
1272 |
"# b = 0.31\n",
|
1273 |
-
"b =
|
1274 |
"\n",
|
1275 |
"\n",
|
1276 |
-
"final_model = gensim.models.LdaMulticore(
|
1277 |
-
"
|
1278 |
-
"
|
1279 |
-
"
|
1280 |
-
"
|
1281 |
-
"
|
1282 |
-
"
|
1283 |
-
"
|
|
|
|
|
1284 |
]
|
1285 |
},
|
1286 |
{
|
@@ -1292,7 +1320,7 @@
|
|
1292 |
},
|
1293 |
"outputs": [],
|
1294 |
"source": [
|
1295 |
-
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=k, a=a, b=b)
|
1296 |
]
|
1297 |
},
|
1298 |
{
|
@@ -1307,7 +1335,7 @@
|
|
1307 |
"# Set up the environment to display the graphical outputs\n",
|
1308 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1309 |
"pyLDAvis.enable_notebook()\n",
|
1310 |
-
"visual= gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1311 |
"\n",
|
1312 |
"# Save the output to the html file\n",
|
1313 |
"pyLDAvis.save_html(visual, \"data/topic_viz2_minor_training.html\")"
|
@@ -1354,14 +1382,14 @@
|
|
1354 |
"outputs": [],
|
1355 |
"source": [
|
1356 |
"# Get the topics and their top keywords into a dataframe\n",
|
1357 |
-
"topics = final_model.show_topics(num_words=30)
|
1358 |
"\n",
|
1359 |
"topic_keywords = pd.DataFrame()\n",
|
1360 |
"for topic_id, topic in topics:\n",
|
1361 |
-
" topic_keywords.at[topic_id,
|
1362 |
"\n",
|
1363 |
-
"topic_keywords[
|
1364 |
-
"topic_keywords[
|
1365 |
"topic_keywords"
|
1366 |
]
|
1367 |
},
|
@@ -1380,7 +1408,7 @@
|
|
1380 |
"metadata": {},
|
1381 |
"outputs": [],
|
1382 |
"source": [
|
1383 |
-
"#Save a model to disk, or reload a pre-trained model\n",
|
1384 |
"# naming convention: final_model_topic_alpha_eta\n",
|
1385 |
"final_model.save(\"models/final_model_5_asym_91\")"
|
1386 |
]
|
@@ -1417,13 +1445,19 @@
|
|
1417 |
"outputs": [],
|
1418 |
"source": [
|
1419 |
"import warnings\n",
|
1420 |
-
"
|
|
|
|
|
1421 |
"\n",
|
1422 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
1423 |
" # Preallocate memory for the DataFrame\n",
|
1424 |
" num_docs = len(corpus)\n",
|
1425 |
-
" sent_topics = {
|
1426 |
-
"
|
|
|
|
|
|
|
|
|
1427 |
" # Get main topic in each document\n",
|
1428 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
1429 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
@@ -1431,13 +1465,13 @@
|
|
1431 |
" # Get the Dominant topic, Perc Contribution and Keywords for each document\n",
|
1432 |
" dominant_topic, perc_contribution = row[0]\n",
|
1433 |
" topic_distribution = row\n",
|
1434 |
-
" sent_topics[
|
1435 |
-
" sent_topics[
|
1436 |
-
" sent_topics[
|
1437 |
"\n",
|
1438 |
" # Create the DataFrame\n",
|
1439 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
1440 |
-
" sent_topics_df[
|
1441 |
"\n",
|
1442 |
" return sent_topics_df"
|
1443 |
]
|
@@ -1449,7 +1483,9 @@
|
|
1449 |
"metadata": {},
|
1450 |
"outputs": [],
|
1451 |
"source": [
|
1452 |
-
"df_topic_sents_keywords = format_topics_sentences(
|
|
|
|
|
1453 |
]
|
1454 |
},
|
1455 |
{
|
@@ -1461,7 +1497,13 @@
|
|
1461 |
"source": [
|
1462 |
"# Format\n",
|
1463 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
1464 |
-
"df_dominant_topic.columns = [
|
|
|
|
|
|
|
|
|
|
|
|
|
1465 |
"\n",
|
1466 |
"# Show\n",
|
1467 |
"df_dominant_topic.head(10)"
|
@@ -1513,7 +1555,7 @@
|
|
1513 |
"# Show the plot\n",
|
1514 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
1515 |
"plt.tight_layout()\n",
|
1516 |
-
"plt.show()
|
1517 |
]
|
1518 |
},
|
1519 |
{
|
@@ -1523,7 +1565,7 @@
|
|
1523 |
"metadata": {},
|
1524 |
"outputs": [],
|
1525 |
"source": [
|
1526 |
-
"df_dominant_topic.sort_values(by
|
1527 |
]
|
1528 |
},
|
1529 |
{
|
@@ -1534,9 +1576,9 @@
|
|
1534 |
"outputs": [],
|
1535 |
"source": [
|
1536 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
1537 |
-
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)
|
1538 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
1539 |
-
"sampled_df.to_csv(
|
1540 |
]
|
1541 |
}
|
1542 |
],
|
|
|
76 |
"import datetime\n",
|
77 |
"\n",
|
78 |
"import warnings\n",
|
79 |
+
"\n",
|
80 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
81 |
"\n",
|
82 |
"from pprint import pprint\n",
|
83 |
"import pyLDAvis\n",
|
|
|
99 |
"metadata": {},
|
100 |
"outputs": [],
|
101 |
"source": [
|
102 |
+
"df = pd.read_parquet(\"data/processed_data2.parquet\")"
|
103 |
]
|
104 |
},
|
105 |
{
|
|
|
418 |
"outputs": [],
|
419 |
"source": [
|
420 |
"# choose only the extreme and severe cases for modelling\n",
|
421 |
+
"cleaned = df_copy[df_copy[\"Severity\"].isin([\"Minor\"])]\n",
|
422 |
"cleaned.reset_index(drop=True, inplace=True)"
|
423 |
]
|
424 |
},
|
|
|
525 |
}
|
526 |
],
|
527 |
"source": [
|
528 |
+
"print(\"Number of unique tokens: %d\" % len(doc_dict))\n",
|
529 |
+
"print(\"Number of articles: %d\" % len(docs_vecs))"
|
530 |
]
|
531 |
},
|
532 |
{
|
|
|
679 |
"outputs": [],
|
680 |
"source": [
|
681 |
"# Build LDA benchmark model\n",
|
682 |
+
"lda_model = gensim.models.LdaMulticore(\n",
|
683 |
+
" corpus=docs_vecs,\n",
|
684 |
+
" id2word=doc_dict,\n",
|
685 |
+
" num_topics=4,\n",
|
686 |
+
" random_state=42,\n",
|
687 |
+
" chunksize=100,\n",
|
688 |
+
" passes=10,\n",
|
689 |
+
" per_word_topics=True,\n",
|
690 |
+
")"
|
691 |
]
|
692 |
},
|
693 |
{
|
|
|
744 |
],
|
745 |
"source": [
|
746 |
"# Compute Benchmark Coherence Score\n",
|
747 |
+
"coherence_model_lda = CoherenceModel(\n",
|
748 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
749 |
+
")\n",
|
750 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
751 |
+
"print(\"\\nCoherence Score LDAModel: \", coherence_lda)"
|
752 |
]
|
753 |
},
|
754 |
{
|
|
|
770 |
],
|
771 |
"source": [
|
772 |
"# Compute Benchmark Perplexity\n",
|
773 |
+
"perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) # For LDAModel\n",
|
774 |
+
"# a measure of how good the model is. lower the better.\n",
|
775 |
"\n",
|
776 |
+
"print(\"\\nPerplexity for LDAModel: \", perplex)"
|
777 |
]
|
778 |
},
|
779 |
{
|
|
|
826 |
"\n",
|
827 |
"# feed the LDA model into the pyLDAvis instance\n",
|
828 |
"pyLDAvis.enable_notebook()\n",
|
829 |
+
"visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
830 |
"\n",
|
831 |
"# Save the output to the html file\n",
|
832 |
"pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_minor.html\")"
|
|
|
857 |
"source": [
|
858 |
"# hyper-perameter tuning (alpha and beta)\n",
|
859 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
860 |
+
"\n",
|
861 |
+
" lda_model = gensim.models.LdaMulticore(\n",
|
862 |
+
" corpus=corpus,\n",
|
863 |
+
" id2word=dictionary,\n",
|
864 |
+
" num_topics=k,\n",
|
865 |
+
" random_state=42,\n",
|
866 |
+
" chunksize=100,\n",
|
867 |
+
" passes=10,\n",
|
868 |
+
" alpha=a,\n",
|
869 |
+
" eta=b,\n",
|
870 |
+
" )\n",
|
871 |
+
"\n",
|
872 |
+
" coherence_model_lda = CoherenceModel(\n",
|
873 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
874 |
+
" )\n",
|
875 |
" coherence = coherence_model_lda.get_coherence()\n",
|
876 |
+
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)\n",
|
877 |
+
"\n",
|
878 |
" return coherence, perplex"
|
879 |
]
|
880 |
},
|
|
|
902 |
"\n",
|
903 |
"# Alpha parameter\n",
|
904 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
905 |
+
"alpha.append(\"symmetric\")\n",
|
906 |
+
"alpha.append(\"asymmetric\")\n",
|
907 |
"\n",
|
908 |
"# Beta parameter\n",
|
909 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
910 |
+
"beta.append(\"symmetric\")"
|
911 |
]
|
912 |
},
|
913 |
{
|
|
|
937 |
}
|
938 |
],
|
939 |
"source": [
|
940 |
+
"print(\"Topic range: \", num_topics)\n",
|
941 |
+
"print(\"Alpha: \", alpha)\n",
|
942 |
"print(\"Beta: \", beta)"
|
943 |
]
|
944 |
},
|
|
|
1158 |
"for a in alpha:\n",
|
1159 |
" for b in beta:\n",
|
1160 |
" for num in num_topics:\n",
|
1161 |
+
" cv, pv = compute_coherence_values(\n",
|
1162 |
+
" corpus=docs_vecs, dictionary=doc_dict, k=num, a=a, b=b\n",
|
1163 |
+
" )\n",
|
1164 |
"\n",
|
1165 |
+
" model_topics.append(num)\n",
|
1166 |
+
" coherence_values.append(cv)\n",
|
1167 |
" perplexity_values.append(pv)\n",
|
1168 |
" alpha_result.append(a)\n",
|
1169 |
" beta_result.append(b)\n",
|
1170 |
+
" print(\n",
|
1171 |
+
" \"#Topics: \"\n",
|
1172 |
+
" + str(num)\n",
|
1173 |
+
" + \", CV Score: \"\n",
|
1174 |
+
" + str(coherence_values[-1])\n",
|
1175 |
+
" + \", PV Score: \"\n",
|
1176 |
+
" + str(perplexity_values[-1])\n",
|
1177 |
+
" + \", Alpha: \"\n",
|
1178 |
+
" + str(alpha_result[-1])\n",
|
1179 |
+
" + \", Beta: \"\n",
|
1180 |
+
" + str(beta_result[-1])\n",
|
1181 |
+
" )\n",
|
1182 |
+
"\n",
|
1183 |
"print(datetime.datetime.now())"
|
1184 |
]
|
1185 |
},
|
|
|
1200 |
"source": [
|
1201 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1202 |
"result = pd.DataFrame(\n",
|
1203 |
+
" {\n",
|
1204 |
+
" \"Topics\": model_topics,\n",
|
1205 |
+
" \"Coherence Score\": coherence_values,\n",
|
1206 |
+
" \"Perplexity Score\": perplexity_values,\n",
|
1207 |
+
" \"Alpha\": alpha_result,\n",
|
1208 |
+
" \"Beta\": beta_result,\n",
|
1209 |
+
" }\n",
|
1210 |
+
")\n",
|
1211 |
+
"result.sort_values(\n",
|
1212 |
+
" by=[\"Coherence Score\", \"Perplexity Score\"], ascending=[False, True]\n",
|
1213 |
+
").head(20)"
|
1214 |
]
|
1215 |
},
|
1216 |
{
|
|
|
1220 |
"metadata": {},
|
1221 |
"outputs": [],
|
1222 |
"source": [
|
1223 |
+
"result.to_csv(\"data/lda_fine_tuning_result_minor.csv\")"
|
1224 |
]
|
1225 |
},
|
1226 |
{
|
|
|
1233 |
"outputs": [],
|
1234 |
"source": [
|
1235 |
"# Show graph Topics vs Coherence Score\n",
|
1236 |
+
"result.groupby(\"Alpha\").plot(x=\"Topics\", y=\"Coherence Score\", legend=True)"
|
1237 |
]
|
1238 |
},
|
1239 |
{
|
|
|
1248 |
"plt.plot(model_topics, coherence_values)\n",
|
1249 |
"plt.xlabel(\"Num Topics\")\n",
|
1250 |
"plt.ylabel(\"Coherence Score\")\n",
|
1251 |
+
"plt.legend((\"Coherence Score\"), loc=\"best\")\n",
|
1252 |
"plt.show()"
|
1253 |
]
|
1254 |
},
|
|
|
1264 |
"plt.plot(model_topics, perplexity_values)\n",
|
1265 |
"plt.xlabel(\"Num Topics\")\n",
|
1266 |
"plt.ylabel(\"Perplexity score\")\n",
|
1267 |
+
"plt.legend((\"perplexity_values\"), loc=\"best\")\n",
|
1268 |
"plt.show()"
|
1269 |
]
|
1270 |
},
|
|
|
1296 |
"# a = 'asymmetric'\n",
|
1297 |
"a = 0.31\n",
|
1298 |
"# b = 0.31\n",
|
1299 |
+
"b = \"symmetric\"\n",
|
1300 |
"\n",
|
1301 |
"\n",
|
1302 |
+
"final_model = gensim.models.LdaMulticore(\n",
|
1303 |
+
" corpus=docs_vecs,\n",
|
1304 |
+
" id2word=doc_dict,\n",
|
1305 |
+
" num_topics=k,\n",
|
1306 |
+
" random_state=42,\n",
|
1307 |
+
" chunksize=100,\n",
|
1308 |
+
" passes=10,\n",
|
1309 |
+
" alpha=a,\n",
|
1310 |
+
" eta=b,\n",
|
1311 |
+
")"
|
1312 |
]
|
1313 |
},
|
1314 |
{
|
|
|
1320 |
},
|
1321 |
"outputs": [],
|
1322 |
"source": [
|
1323 |
+
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict, k=k, a=a, b=b)"
|
1324 |
]
|
1325 |
},
|
1326 |
{
|
|
|
1335 |
"# Set up the environment to display the graphical outputs\n",
|
1336 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1337 |
"pyLDAvis.enable_notebook()\n",
|
1338 |
+
"visual = gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1339 |
"\n",
|
1340 |
"# Save the output to the html file\n",
|
1341 |
"pyLDAvis.save_html(visual, \"data/topic_viz2_minor_training.html\")"
|
|
|
1382 |
"outputs": [],
|
1383 |
"source": [
|
1384 |
"# Get the topics and their top keywords into a dataframe\n",
|
1385 |
+
"topics = final_model.show_topics(num_words=30)\n",
|
1386 |
"\n",
|
1387 |
"topic_keywords = pd.DataFrame()\n",
|
1388 |
"for topic_id, topic in topics:\n",
|
1389 |
+
" topic_keywords.at[topic_id, \"Topic Keywords\"] = topic\n",
|
1390 |
"\n",
|
1391 |
+
"topic_keywords[\"Topic ID\"] = topic_keywords.index\n",
|
1392 |
+
"topic_keywords[\"Topic Name\"] = topic_mapping\n",
|
1393 |
"topic_keywords"
|
1394 |
]
|
1395 |
},
|
|
|
1408 |
"metadata": {},
|
1409 |
"outputs": [],
|
1410 |
"source": [
|
1411 |
+
"# Save a model to disk, or reload a pre-trained model\n",
|
1412 |
"# naming convention: final_model_topic_alpha_eta\n",
|
1413 |
"final_model.save(\"models/final_model_5_asym_91\")"
|
1414 |
]
|
|
|
1445 |
"outputs": [],
|
1446 |
"source": [
|
1447 |
"import warnings\n",
|
1448 |
+
"\n",
|
1449 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
1450 |
+
"\n",
|
1451 |
"\n",
|
1452 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
1453 |
" # Preallocate memory for the DataFrame\n",
|
1454 |
" num_docs = len(corpus)\n",
|
1455 |
+
" sent_topics = {\n",
|
1456 |
+
" \"Dominant_Topic\": [0] * num_docs,\n",
|
1457 |
+
" \"Perc_Contribution\": [0.0] * num_docs,\n",
|
1458 |
+
" \"Topic_Distribution\": [()] * num_docs,\n",
|
1459 |
+
" }\n",
|
1460 |
+
"\n",
|
1461 |
" # Get main topic in each document\n",
|
1462 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
1463 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
|
|
1465 |
" # Get the Dominant topic, Perc Contribution and Keywords for each document\n",
|
1466 |
" dominant_topic, perc_contribution = row[0]\n",
|
1467 |
" topic_distribution = row\n",
|
1468 |
+
" sent_topics[\"Dominant_Topic\"][i] = int(dominant_topic)\n",
|
1469 |
+
" sent_topics[\"Perc_Contribution\"][i] = round(perc_contribution, 4)\n",
|
1470 |
+
" sent_topics[\"Topic_Distribution\"][i] = topic_distribution\n",
|
1471 |
"\n",
|
1472 |
" # Create the DataFrame\n",
|
1473 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
1474 |
+
" sent_topics_df[\"Text\"] = data\n",
|
1475 |
"\n",
|
1476 |
" return sent_topics_df"
|
1477 |
]
|
|
|
1483 |
"metadata": {},
|
1484 |
"outputs": [],
|
1485 |
"source": [
|
1486 |
+
"df_topic_sents_keywords = format_topics_sentences(\n",
|
1487 |
+
" ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details\n",
|
1488 |
+
")"
|
1489 |
]
|
1490 |
},
|
1491 |
{
|
|
|
1497 |
"source": [
|
1498 |
"# Format\n",
|
1499 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
1500 |
+
"df_dominant_topic.columns = [\n",
|
1501 |
+
" \"Document_No\",\n",
|
1502 |
+
" \"Dominant_Topic\",\n",
|
1503 |
+
" \"Topic_Perc_Contrib\",\n",
|
1504 |
+
" \"Topic_Distribution\",\n",
|
1505 |
+
" \"Text\",\n",
|
1506 |
+
"]\n",
|
1507 |
"\n",
|
1508 |
"# Show\n",
|
1509 |
"df_dominant_topic.head(10)"
|
|
|
1555 |
"# Show the plot\n",
|
1556 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
1557 |
"plt.tight_layout()\n",
|
1558 |
+
"plt.show()"
|
1559 |
]
|
1560 |
},
|
1561 |
{
|
|
|
1565 |
"metadata": {},
|
1566 |
"outputs": [],
|
1567 |
"source": [
|
1568 |
+
"df_dominant_topic.sort_values(by=\"Topic_Perc_Contrib\", ascending=True).head(20)"
|
1569 |
]
|
1570 |
},
|
1571 |
{
|
|
|
1576 |
"outputs": [],
|
1577 |
"source": [
|
1578 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
1579 |
+
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)\n",
|
1580 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
1581 |
+
"sampled_df.to_csv(\"data/sample_minor.csv\")"
|
1582 |
]
|
1583 |
}
|
1584 |
],
|
notebooks/08_topic_modelling_moderate.ipynb
CHANGED
@@ -76,7 +76,8 @@
|
|
76 |
"import datetime\n",
|
77 |
"\n",
|
78 |
"import warnings\n",
|
79 |
-
"
|
|
|
80 |
"\n",
|
81 |
"from pprint import pprint\n",
|
82 |
"import pyLDAvis\n",
|
@@ -98,7 +99,7 @@
|
|
98 |
"metadata": {},
|
99 |
"outputs": [],
|
100 |
"source": [
|
101 |
-
"df = pd.read_parquet(
|
102 |
]
|
103 |
},
|
104 |
{
|
@@ -417,7 +418,7 @@
|
|
417 |
"outputs": [],
|
418 |
"source": [
|
419 |
"# choose only the extreme and severe cases for modelling\n",
|
420 |
-
"cleaned = df_copy[df_copy[
|
421 |
"cleaned.reset_index(drop=True, inplace=True)"
|
422 |
]
|
423 |
},
|
@@ -510,8 +511,8 @@
|
|
510 |
}
|
511 |
],
|
512 |
"source": [
|
513 |
-
"print(
|
514 |
-
"print(
|
515 |
]
|
516 |
},
|
517 |
{
|
@@ -675,13 +676,15 @@
|
|
675 |
"%%time\n",
|
676 |
"\n",
|
677 |
"# Build LDA benchmark model\n",
|
678 |
-
"lda_model = gensim.models.LdaMulticore(
|
679 |
-
"
|
680 |
-
"
|
681 |
-
"
|
682 |
-
"
|
683 |
-
"
|
684 |
-
"
|
|
|
|
|
685 |
]
|
686 |
},
|
687 |
{
|
@@ -750,9 +753,11 @@
|
|
750 |
],
|
751 |
"source": [
|
752 |
"# Compute Benchmark Coherence Score\n",
|
753 |
-
"coherence_model_lda = CoherenceModel(
|
|
|
|
|
754 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
755 |
-
"print(
|
756 |
]
|
757 |
},
|
758 |
{
|
@@ -774,10 +779,10 @@
|
|
774 |
],
|
775 |
"source": [
|
776 |
"# Compute Benchmark Perplexity\n",
|
777 |
-
"perplex= lda_model.log_perplexity(docs_vecs, total_docs=None)
|
778 |
-
"
|
779 |
"\n",
|
780 |
-
"print(
|
781 |
]
|
782 |
},
|
783 |
{
|
@@ -793,7 +798,7 @@
|
|
793 |
"\n",
|
794 |
"# feed the LDA model into the pyLDAvis instance\n",
|
795 |
"pyLDAvis.enable_notebook()\n",
|
796 |
-
"visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
797 |
"\n",
|
798 |
"# Save the output to the html file\n",
|
799 |
"pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_moderate.html\")"
|
@@ -875,16 +880,16 @@
|
|
875 |
}
|
876 |
],
|
877 |
"source": [
|
878 |
-
"pd.set_option(
|
879 |
"# Get the topics and their top keywords into a dataframe\n",
|
880 |
-
"topics = lda_model.show_topics(num_words=6)
|
881 |
"\n",
|
882 |
"topic_keywords = pd.DataFrame()\n",
|
883 |
"for topic_id, topic in topics:\n",
|
884 |
-
" topic_keywords.at[topic_id,
|
885 |
"\n",
|
886 |
-
"topic_keywords[
|
887 |
-
"# topic_keywords['Topic Name'] = topic_mapping
|
888 |
"topic_keywords"
|
889 |
]
|
890 |
},
|
@@ -895,7 +900,7 @@
|
|
895 |
"metadata": {},
|
896 |
"outputs": [],
|
897 |
"source": [
|
898 |
-
"# break
|
899 |
]
|
900 |
},
|
901 |
{
|
@@ -923,20 +928,24 @@
|
|
923 |
"source": [
|
924 |
"# hyper-perameter tuning (alpha and beta)\n",
|
925 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
926 |
-
"
|
927 |
-
" lda_model = gensim.models.LdaMulticore(
|
928 |
-
"
|
929 |
-
"
|
930 |
-
"
|
931 |
-
"
|
932 |
-
"
|
933 |
-
"
|
934 |
-
"
|
935 |
-
"
|
936 |
-
"
|
|
|
|
|
|
|
|
|
937 |
" coherence = coherence_model_lda.get_coherence()\n",
|
938 |
-
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)
|
939 |
-
"
|
940 |
" return coherence, perplex"
|
941 |
]
|
942 |
},
|
@@ -964,12 +973,12 @@
|
|
964 |
"\n",
|
965 |
"# Alpha parameter\n",
|
966 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
967 |
-
"alpha.append(
|
968 |
-
"alpha.append(
|
969 |
"\n",
|
970 |
"# Beta parameter\n",
|
971 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
972 |
-
"beta.append(
|
973 |
]
|
974 |
},
|
975 |
{
|
@@ -999,8 +1008,8 @@
|
|
999 |
}
|
1000 |
],
|
1001 |
"source": [
|
1002 |
-
"print(\"Topic range: \",num_topics)\n",
|
1003 |
-
"print(\"Alpha: \",alpha)\n",
|
1004 |
"print(\"Beta: \", beta)"
|
1005 |
]
|
1006 |
},
|
@@ -1205,15 +1214,28 @@
|
|
1205 |
"for a in alpha:\n",
|
1206 |
" for b in beta:\n",
|
1207 |
" for num in num_topics:\n",
|
1208 |
-
" cv, pv = compute_coherence_values(
|
|
|
|
|
1209 |
"\n",
|
1210 |
-
" model_topics.append(num)
|
1211 |
-
" coherence_values.append(cv)
|
1212 |
" perplexity_values.append(pv)\n",
|
1213 |
" alpha_result.append(a)\n",
|
1214 |
" beta_result.append(b)\n",
|
1215 |
-
" print(\
|
1216 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1217 |
"print(datetime.datetime.now())"
|
1218 |
]
|
1219 |
},
|
@@ -1245,13 +1267,17 @@
|
|
1245 |
"source": [
|
1246 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1247 |
"result = pd.DataFrame(\n",
|
1248 |
-
" {
|
1249 |
-
"
|
1250 |
-
"
|
1251 |
-
"
|
1252 |
-
"
|
1253 |
-
"
|
1254 |
-
"
|
|
|
|
|
|
|
|
|
1255 |
]
|
1256 |
},
|
1257 |
{
|
@@ -1272,7 +1298,7 @@
|
|
1272 |
}
|
1273 |
],
|
1274 |
"source": [
|
1275 |
-
"result.to_csv(
|
1276 |
]
|
1277 |
},
|
1278 |
{
|
@@ -1296,7 +1322,7 @@
|
|
1296 |
],
|
1297 |
"source": [
|
1298 |
"# Show graph Topics vs Coherence Score\n",
|
1299 |
-
"result.groupby(
|
1300 |
]
|
1301 |
},
|
1302 |
{
|
@@ -1322,7 +1348,7 @@
|
|
1322 |
"plt.plot(model_topics, coherence_values)\n",
|
1323 |
"plt.xlabel(\"Num Topics\")\n",
|
1324 |
"plt.ylabel(\"Coherence Score\")\n",
|
1325 |
-
"plt.legend((\"Coherence Score\"), loc
|
1326 |
"plt.show()"
|
1327 |
]
|
1328 |
},
|
@@ -1349,7 +1375,7 @@
|
|
1349 |
"plt.plot(model_topics, perplexity_values)\n",
|
1350 |
"plt.xlabel(\"Num Topics\")\n",
|
1351 |
"plt.ylabel(\"Perplexity score\")\n",
|
1352 |
-
"plt.legend((\"perplexity_values\"), loc
|
1353 |
"plt.show()"
|
1354 |
]
|
1355 |
},
|
@@ -1382,20 +1408,22 @@
|
|
1382 |
"# realised that there may be some overlaps for more than 5 topics, but below 5 topics results in low differentiation and high ambiguity among the topics.\n",
|
1383 |
"# LDA is not suitable for this dataset\n",
|
1384 |
"k = 9\n",
|
1385 |
-
"a =
|
1386 |
"# a = 0.31\n",
|
1387 |
"# b = 0.31\n",
|
1388 |
-
"b =
|
1389 |
"\n",
|
1390 |
"\n",
|
1391 |
-
"final_model = gensim.models.LdaMulticore(
|
1392 |
-
"
|
1393 |
-
"
|
1394 |
-
"
|
1395 |
-
"
|
1396 |
-
"
|
1397 |
-
"
|
1398 |
-
"
|
|
|
|
|
1399 |
]
|
1400 |
},
|
1401 |
{
|
@@ -1418,7 +1446,7 @@
|
|
1418 |
}
|
1419 |
],
|
1420 |
"source": [
|
1421 |
-
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=k, a=a, b=b)
|
1422 |
]
|
1423 |
},
|
1424 |
{
|
@@ -1441,12 +1469,12 @@
|
|
1441 |
}
|
1442 |
],
|
1443 |
"source": [
|
1444 |
-
"#Set up the environment to display the graphical outputs\n",
|
1445 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1446 |
"pyLDAvis.enable_notebook()\n",
|
1447 |
-
"visual= gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1448 |
"\n",
|
1449 |
-
"#Save the output to the html file\n",
|
1450 |
"pyLDAvis.save_html(visual, \"data/topic_viz12_mod_training.html\")"
|
1451 |
]
|
1452 |
},
|
@@ -1547,13 +1575,19 @@
|
|
1547 |
],
|
1548 |
"source": [
|
1549 |
"import warnings\n",
|
1550 |
-
"
|
|
|
|
|
1551 |
"\n",
|
1552 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
1553 |
" # Preallocate memory for the DataFrame\n",
|
1554 |
" num_docs = len(corpus)\n",
|
1555 |
-
" sent_topics = {
|
1556 |
-
"
|
|
|
|
|
|
|
|
|
1557 |
" # Get main topic in each document\n",
|
1558 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
1559 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
@@ -1561,13 +1595,13 @@
|
|
1561 |
" # Get the Dominant topic, Perc Contribution and Keywords for each document\n",
|
1562 |
" dominant_topic, perc_contribution = row[0]\n",
|
1563 |
" topic_distribution = row\n",
|
1564 |
-
" sent_topics[
|
1565 |
-
" sent_topics[
|
1566 |
-
" sent_topics[
|
1567 |
"\n",
|
1568 |
" # Create the DataFrame\n",
|
1569 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
1570 |
-
" sent_topics_df[
|
1571 |
"\n",
|
1572 |
" return sent_topics_df"
|
1573 |
]
|
@@ -1590,7 +1624,9 @@
|
|
1590 |
}
|
1591 |
],
|
1592 |
"source": [
|
1593 |
-
"df_topic_sents_keywords = format_topics_sentences(
|
|
|
|
|
1594 |
]
|
1595 |
},
|
1596 |
{
|
@@ -1613,7 +1649,13 @@
|
|
1613 |
"source": [
|
1614 |
"# Format\n",
|
1615 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
1616 |
-
"df_dominant_topic.columns = [
|
|
|
|
|
|
|
|
|
|
|
|
|
1617 |
"\n",
|
1618 |
"# Show\n",
|
1619 |
"df_dominant_topic.head(10)"
|
@@ -1687,7 +1729,7 @@
|
|
1687 |
"# Show the plot\n",
|
1688 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
1689 |
"plt.tight_layout()\n",
|
1690 |
-
"plt.show()
|
1691 |
]
|
1692 |
},
|
1693 |
{
|
@@ -1730,9 +1772,9 @@
|
|
1730 |
],
|
1731 |
"source": [
|
1732 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
1733 |
-
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)
|
1734 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
1735 |
-
"sampled_df.to_csv(
|
1736 |
]
|
1737 |
}
|
1738 |
],
|
|
|
76 |
"import datetime\n",
|
77 |
"\n",
|
78 |
"import warnings\n",
|
79 |
+
"\n",
|
80 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
81 |
"\n",
|
82 |
"from pprint import pprint\n",
|
83 |
"import pyLDAvis\n",
|
|
|
99 |
"metadata": {},
|
100 |
"outputs": [],
|
101 |
"source": [
|
102 |
+
"df = pd.read_parquet(\"data/processed_data2.parquet\")"
|
103 |
]
|
104 |
},
|
105 |
{
|
|
|
418 |
"outputs": [],
|
419 |
"source": [
|
420 |
"# choose only the extreme and severe cases for modelling\n",
|
421 |
+
"cleaned = df_copy[df_copy[\"Severity\"].isin([\"Moderate\"])]\n",
|
422 |
"cleaned.reset_index(drop=True, inplace=True)"
|
423 |
]
|
424 |
},
|
|
|
511 |
}
|
512 |
],
|
513 |
"source": [
|
514 |
+
"print(\"Number of unique tokens: %d\" % len(doc_dict))\n",
|
515 |
+
"print(\"Number of articles: %d\" % len(docs_vecs))"
|
516 |
]
|
517 |
},
|
518 |
{
|
|
|
676 |
"%%time\n",
|
677 |
"\n",
|
678 |
"# Build LDA benchmark model\n",
|
679 |
+
"lda_model = gensim.models.LdaMulticore(\n",
|
680 |
+
" corpus=docs_vecs,\n",
|
681 |
+
" id2word=doc_dict,\n",
|
682 |
+
" num_topics=4,\n",
|
683 |
+
" random_state=42,\n",
|
684 |
+
" chunksize=100,\n",
|
685 |
+
" passes=10,\n",
|
686 |
+
" per_word_topics=True,\n",
|
687 |
+
")"
|
688 |
]
|
689 |
},
|
690 |
{
|
|
|
753 |
],
|
754 |
"source": [
|
755 |
"# Compute Benchmark Coherence Score\n",
|
756 |
+
"coherence_model_lda = CoherenceModel(\n",
|
757 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
758 |
+
")\n",
|
759 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
760 |
+
"print(\"\\nCoherence Score LDAModel: \", coherence_lda)"
|
761 |
]
|
762 |
},
|
763 |
{
|
|
|
779 |
],
|
780 |
"source": [
|
781 |
"# Compute Benchmark Perplexity\n",
|
782 |
+
"perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) # For LDAModel\n",
|
783 |
+
"# a measure of how good the model is. lower the better.\n",
|
784 |
"\n",
|
785 |
+
"print(\"\\nPerplexity for LDAModel: \", perplex)"
|
786 |
]
|
787 |
},
|
788 |
{
|
|
|
798 |
"\n",
|
799 |
"# feed the LDA model into the pyLDAvis instance\n",
|
800 |
"pyLDAvis.enable_notebook()\n",
|
801 |
+
"visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
802 |
"\n",
|
803 |
"# Save the output to the html file\n",
|
804 |
"pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_moderate.html\")"
|
|
|
880 |
}
|
881 |
],
|
882 |
"source": [
|
883 |
+
"pd.set_option(\"max_colwidth\", 200)\n",
|
884 |
"# Get the topics and their top keywords into a dataframe\n",
|
885 |
+
"topics = lda_model.show_topics(num_words=6)\n",
|
886 |
"\n",
|
887 |
"topic_keywords = pd.DataFrame()\n",
|
888 |
"for topic_id, topic in topics:\n",
|
889 |
+
" topic_keywords.at[topic_id, \"Topic Keywords\"] = topic\n",
|
890 |
"\n",
|
891 |
+
"topic_keywords[\"Topic ID\"] = topic_keywords.index\n",
|
892 |
+
"# topic_keywords['Topic Name'] = topic_mapping\n",
|
893 |
"topic_keywords"
|
894 |
]
|
895 |
},
|
|
|
900 |
"metadata": {},
|
901 |
"outputs": [],
|
902 |
"source": [
|
903 |
+
"# break"
|
904 |
]
|
905 |
},
|
906 |
{
|
|
|
928 |
"source": [
|
929 |
"# hyper-perameter tuning (alpha and beta)\n",
|
930 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
931 |
+
"\n",
|
932 |
+
" lda_model = gensim.models.LdaMulticore(\n",
|
933 |
+
" corpus=corpus,\n",
|
934 |
+
" id2word=dictionary,\n",
|
935 |
+
" num_topics=k,\n",
|
936 |
+
" random_state=42,\n",
|
937 |
+
" chunksize=100,\n",
|
938 |
+
" passes=10,\n",
|
939 |
+
" alpha=a,\n",
|
940 |
+
" eta=b,\n",
|
941 |
+
" )\n",
|
942 |
+
"\n",
|
943 |
+
" coherence_model_lda = CoherenceModel(\n",
|
944 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
945 |
+
" )\n",
|
946 |
" coherence = coherence_model_lda.get_coherence()\n",
|
947 |
+
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)\n",
|
948 |
+
"\n",
|
949 |
" return coherence, perplex"
|
950 |
]
|
951 |
},
|
|
|
973 |
"\n",
|
974 |
"# Alpha parameter\n",
|
975 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
976 |
+
"alpha.append(\"symmetric\")\n",
|
977 |
+
"alpha.append(\"asymmetric\")\n",
|
978 |
"\n",
|
979 |
"# Beta parameter\n",
|
980 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
981 |
+
"beta.append(\"symmetric\")"
|
982 |
]
|
983 |
},
|
984 |
{
|
|
|
1008 |
}
|
1009 |
],
|
1010 |
"source": [
|
1011 |
+
"print(\"Topic range: \", num_topics)\n",
|
1012 |
+
"print(\"Alpha: \", alpha)\n",
|
1013 |
"print(\"Beta: \", beta)"
|
1014 |
]
|
1015 |
},
|
|
|
1214 |
"for a in alpha:\n",
|
1215 |
" for b in beta:\n",
|
1216 |
" for num in num_topics:\n",
|
1217 |
+
" cv, pv = compute_coherence_values(\n",
|
1218 |
+
" corpus=docs_vecs, dictionary=doc_dict, k=num, a=a, b=b\n",
|
1219 |
+
" )\n",
|
1220 |
"\n",
|
1221 |
+
" model_topics.append(num)\n",
|
1222 |
+
" coherence_values.append(cv)\n",
|
1223 |
" perplexity_values.append(pv)\n",
|
1224 |
" alpha_result.append(a)\n",
|
1225 |
" beta_result.append(b)\n",
|
1226 |
+
" print(\n",
|
1227 |
+
" \"#Topics: \"\n",
|
1228 |
+
" + str(num)\n",
|
1229 |
+
" + \", CV Score: \"\n",
|
1230 |
+
" + str(coherence_values[-1])\n",
|
1231 |
+
" + \", PV Score: \"\n",
|
1232 |
+
" + str(perplexity_values[-1])\n",
|
1233 |
+
" + \", Alpha: \"\n",
|
1234 |
+
" + str(alpha_result[-1])\n",
|
1235 |
+
" + \", Beta: \"\n",
|
1236 |
+
" + str(beta_result[-1])\n",
|
1237 |
+
" )\n",
|
1238 |
+
"\n",
|
1239 |
"print(datetime.datetime.now())"
|
1240 |
]
|
1241 |
},
|
|
|
1267 |
"source": [
|
1268 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1269 |
"result = pd.DataFrame(\n",
|
1270 |
+
" {\n",
|
1271 |
+
" \"Topics\": model_topics,\n",
|
1272 |
+
" \"Coherence Score\": coherence_values,\n",
|
1273 |
+
" \"Perplexity Score\": perplexity_values,\n",
|
1274 |
+
" \"Alpha\": alpha_result,\n",
|
1275 |
+
" \"Beta\": beta_result,\n",
|
1276 |
+
" }\n",
|
1277 |
+
")\n",
|
1278 |
+
"result.sort_values(\n",
|
1279 |
+
" by=[\"Coherence Score\", \"Perplexity Score\"], ascending=[False, True]\n",
|
1280 |
+
").head(20)"
|
1281 |
]
|
1282 |
},
|
1283 |
{
|
|
|
1298 |
}
|
1299 |
],
|
1300 |
"source": [
|
1301 |
+
"result.to_csv(\"data/lda_fine_tuning_result_moderate.csv\")"
|
1302 |
]
|
1303 |
},
|
1304 |
{
|
|
|
1322 |
],
|
1323 |
"source": [
|
1324 |
"# Show graph Topics vs Coherence Score\n",
|
1325 |
+
"result.groupby(\"Alpha\").plot(x=\"Topics\", y=\"Coherence Score\", legend=True)"
|
1326 |
]
|
1327 |
},
|
1328 |
{
|
|
|
1348 |
"plt.plot(model_topics, coherence_values)\n",
|
1349 |
"plt.xlabel(\"Num Topics\")\n",
|
1350 |
"plt.ylabel(\"Coherence Score\")\n",
|
1351 |
+
"plt.legend((\"Coherence Score\"), loc=\"best\")\n",
|
1352 |
"plt.show()"
|
1353 |
]
|
1354 |
},
|
|
|
1375 |
"plt.plot(model_topics, perplexity_values)\n",
|
1376 |
"plt.xlabel(\"Num Topics\")\n",
|
1377 |
"plt.ylabel(\"Perplexity score\")\n",
|
1378 |
+
"plt.legend((\"perplexity_values\"), loc=\"best\")\n",
|
1379 |
"plt.show()"
|
1380 |
]
|
1381 |
},
|
|
|
1408 |
"# realised that there may be some overlaps for more than 5 topics, but below 5 topics results in low differentiation and high ambiguity among the topics.\n",
|
1409 |
"# LDA is not suitable for this dataset\n",
|
1410 |
"k = 9\n",
|
1411 |
+
"a = \"symmetric\"\n",
|
1412 |
"# a = 0.31\n",
|
1413 |
"# b = 0.31\n",
|
1414 |
+
"b = \"symmetric\"\n",
|
1415 |
"\n",
|
1416 |
"\n",
|
1417 |
+
"final_model = gensim.models.LdaMulticore(\n",
|
1418 |
+
" corpus=docs_vecs,\n",
|
1419 |
+
" id2word=doc_dict,\n",
|
1420 |
+
" num_topics=k,\n",
|
1421 |
+
" random_state=42,\n",
|
1422 |
+
" chunksize=100,\n",
|
1423 |
+
" passes=10,\n",
|
1424 |
+
" alpha=a,\n",
|
1425 |
+
" eta=b,\n",
|
1426 |
+
")"
|
1427 |
]
|
1428 |
},
|
1429 |
{
|
|
|
1446 |
}
|
1447 |
],
|
1448 |
"source": [
|
1449 |
+
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict, k=k, a=a, b=b)"
|
1450 |
]
|
1451 |
},
|
1452 |
{
|
|
|
1469 |
}
|
1470 |
],
|
1471 |
"source": [
|
1472 |
+
"# Set up the environment to display the graphical outputs\n",
|
1473 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1474 |
"pyLDAvis.enable_notebook()\n",
|
1475 |
+
"visual = gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1476 |
"\n",
|
1477 |
+
"# Save the output to the html file\n",
|
1478 |
"pyLDAvis.save_html(visual, \"data/topic_viz12_mod_training.html\")"
|
1479 |
]
|
1480 |
},
|
|
|
1575 |
],
|
1576 |
"source": [
|
1577 |
"import warnings\n",
|
1578 |
+
"\n",
|
1579 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
1580 |
+
"\n",
|
1581 |
"\n",
|
1582 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
1583 |
" # Preallocate memory for the DataFrame\n",
|
1584 |
" num_docs = len(corpus)\n",
|
1585 |
+
" sent_topics = {\n",
|
1586 |
+
" \"Dominant_Topic\": [0] * num_docs,\n",
|
1587 |
+
" \"Perc_Contribution\": [0.0] * num_docs,\n",
|
1588 |
+
" \"Topic_Distribution\": [()] * num_docs,\n",
|
1589 |
+
" }\n",
|
1590 |
+
"\n",
|
1591 |
" # Get main topic in each document\n",
|
1592 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
1593 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
|
|
1595 |
" # Get the Dominant topic, Perc Contribution and Keywords for each document\n",
|
1596 |
" dominant_topic, perc_contribution = row[0]\n",
|
1597 |
" topic_distribution = row\n",
|
1598 |
+
" sent_topics[\"Dominant_Topic\"][i] = int(dominant_topic)\n",
|
1599 |
+
" sent_topics[\"Perc_Contribution\"][i] = round(perc_contribution, 4)\n",
|
1600 |
+
" sent_topics[\"Topic_Distribution\"][i] = topic_distribution\n",
|
1601 |
"\n",
|
1602 |
" # Create the DataFrame\n",
|
1603 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
1604 |
+
" sent_topics_df[\"Text\"] = data\n",
|
1605 |
"\n",
|
1606 |
" return sent_topics_df"
|
1607 |
]
|
|
|
1624 |
}
|
1625 |
],
|
1626 |
"source": [
|
1627 |
+
"df_topic_sents_keywords = format_topics_sentences(\n",
|
1628 |
+
" ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details\n",
|
1629 |
+
")"
|
1630 |
]
|
1631 |
},
|
1632 |
{
|
|
|
1649 |
"source": [
|
1650 |
"# Format\n",
|
1651 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
1652 |
+
"df_dominant_topic.columns = [\n",
|
1653 |
+
" \"Document_No\",\n",
|
1654 |
+
" \"Dominant_Topic\",\n",
|
1655 |
+
" \"Topic_Perc_Contrib\",\n",
|
1656 |
+
" \"Topic_Distribution\",\n",
|
1657 |
+
" \"Text\",\n",
|
1658 |
+
"]\n",
|
1659 |
"\n",
|
1660 |
"# Show\n",
|
1661 |
"df_dominant_topic.head(10)"
|
|
|
1729 |
"# Show the plot\n",
|
1730 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
1731 |
"plt.tight_layout()\n",
|
1732 |
+
"plt.show()"
|
1733 |
]
|
1734 |
},
|
1735 |
{
|
|
|
1772 |
],
|
1773 |
"source": [
|
1774 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
1775 |
+
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)\n",
|
1776 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
1777 |
+
"sampled_df.to_csv(\"data/sample_moderate.csv\")"
|
1778 |
]
|
1779 |
}
|
1780 |
],
|
notebooks/09_topic_modelling_severe.ipynb
CHANGED
@@ -76,7 +76,8 @@
|
|
76 |
"import datetime\n",
|
77 |
"\n",
|
78 |
"import warnings\n",
|
79 |
-
"
|
|
|
80 |
"\n",
|
81 |
"from pprint import pprint\n",
|
82 |
"import pyLDAvis\n",
|
@@ -116,7 +117,7 @@
|
|
116 |
"metadata": {},
|
117 |
"outputs": [],
|
118 |
"source": [
|
119 |
-
"df = pd.read_parquet(
|
120 |
]
|
121 |
},
|
122 |
{
|
@@ -435,7 +436,7 @@
|
|
435 |
"outputs": [],
|
436 |
"source": [
|
437 |
"# choose only the extreme and severe cases for modelling\n",
|
438 |
-
"cleaned = df_copy[df_copy[
|
439 |
"cleaned.reset_index(drop=True, inplace=True)"
|
440 |
]
|
441 |
},
|
@@ -512,8 +513,8 @@
|
|
512 |
}
|
513 |
],
|
514 |
"source": [
|
515 |
-
"print(
|
516 |
-
"print(
|
517 |
]
|
518 |
},
|
519 |
{
|
@@ -674,13 +675,15 @@
|
|
674 |
"outputs": [],
|
675 |
"source": [
|
676 |
"# Build LDA benchmark model\n",
|
677 |
-
"lda_model = gensim.models.LdaMulticore(
|
678 |
-
"
|
679 |
-
"
|
680 |
-
"
|
681 |
-
"
|
682 |
-
"
|
683 |
-
"
|
|
|
|
|
684 |
]
|
685 |
},
|
686 |
{
|
@@ -751,9 +754,11 @@
|
|
751 |
"%%time\n",
|
752 |
"\n",
|
753 |
"# Compute Benchmark Coherence Score\n",
|
754 |
-
"coherence_model_lda = CoherenceModel(
|
|
|
|
|
755 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
756 |
-
"print(
|
757 |
]
|
758 |
},
|
759 |
{
|
@@ -775,10 +780,10 @@
|
|
775 |
],
|
776 |
"source": [
|
777 |
"# Compute Benchmark Perplexity\n",
|
778 |
-
"perplex= lda_model.log_perplexity(docs_vecs, total_docs=None)
|
779 |
-
"
|
780 |
"\n",
|
781 |
-
"print(
|
782 |
]
|
783 |
},
|
784 |
{
|
@@ -831,9 +836,9 @@
|
|
831 |
"\n",
|
832 |
"# feed the LDA model into the pyLDAvis instance\n",
|
833 |
"pyLDAvis.enable_notebook()\n",
|
834 |
-
"visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
835 |
"\n",
|
836 |
-
"#Save the output to the html file\n",
|
837 |
"pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_severe.html\")"
|
838 |
]
|
839 |
},
|
@@ -913,15 +918,15 @@
|
|
913 |
}
|
914 |
],
|
915 |
"source": [
|
916 |
-
"pd.set_option(
|
917 |
"# Get the topics and their top keywords into a dataframe\n",
|
918 |
-
"topics = lda_model.show_topics(num_words=6)
|
919 |
"\n",
|
920 |
"topic_keywords = pd.DataFrame()\n",
|
921 |
"for topic_id, topic in topics:\n",
|
922 |
-
" topic_keywords.at[topic_id,
|
923 |
"\n",
|
924 |
-
"topic_keywords[
|
925 |
"topic_keywords"
|
926 |
]
|
927 |
},
|
@@ -932,7 +937,7 @@
|
|
932 |
"metadata": {},
|
933 |
"outputs": [],
|
934 |
"source": [
|
935 |
-
"# break
|
936 |
]
|
937 |
},
|
938 |
{
|
@@ -960,20 +965,24 @@
|
|
960 |
"source": [
|
961 |
"# hyper-perameter tuning (alpha and beta)\n",
|
962 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
963 |
-
"
|
964 |
-
" lda_model = gensim.models.LdaMulticore(
|
965 |
-
"
|
966 |
-
"
|
967 |
-
"
|
968 |
-
"
|
969 |
-
"
|
970 |
-
"
|
971 |
-
"
|
972 |
-
"
|
973 |
-
"
|
|
|
|
|
|
|
|
|
974 |
" coherence = coherence_model_lda.get_coherence()\n",
|
975 |
-
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)
|
976 |
-
"
|
977 |
" return coherence, perplex"
|
978 |
]
|
979 |
},
|
@@ -1001,12 +1010,12 @@
|
|
1001 |
"\n",
|
1002 |
"# Alpha parameter\n",
|
1003 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
1004 |
-
"alpha.append(
|
1005 |
-
"alpha.append(
|
1006 |
"\n",
|
1007 |
"# Beta parameter\n",
|
1008 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
1009 |
-
"beta.append(
|
1010 |
]
|
1011 |
},
|
1012 |
{
|
@@ -1036,8 +1045,8 @@
|
|
1036 |
}
|
1037 |
],
|
1038 |
"source": [
|
1039 |
-
"print(\"Topic range: \",num_topics)\n",
|
1040 |
-
"print(\"Alpha: \",alpha)\n",
|
1041 |
"print(\"Beta: \", beta)"
|
1042 |
]
|
1043 |
},
|
@@ -1282,15 +1291,28 @@
|
|
1282 |
"for a in alpha:\n",
|
1283 |
" for b in beta:\n",
|
1284 |
" for num in num_topics:\n",
|
1285 |
-
" cv, pv = compute_coherence_values(
|
|
|
|
|
1286 |
"\n",
|
1287 |
-
" model_topics.append(num)
|
1288 |
-
" coherence_values.append(cv)
|
1289 |
" perplexity_values.append(pv)\n",
|
1290 |
" alpha_result.append(a)\n",
|
1291 |
" beta_result.append(b)\n",
|
1292 |
-
" print(\
|
1293 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1294 |
"print(datetime.datetime.now())"
|
1295 |
]
|
1296 |
},
|
@@ -1322,13 +1344,17 @@
|
|
1322 |
"source": [
|
1323 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1324 |
"result = pd.DataFrame(\n",
|
1325 |
-
" {
|
1326 |
-
"
|
1327 |
-
"
|
1328 |
-
"
|
1329 |
-
"
|
1330 |
-
"
|
1331 |
-
"
|
|
|
|
|
|
|
|
|
1332 |
]
|
1333 |
},
|
1334 |
{
|
@@ -1349,7 +1375,7 @@
|
|
1349 |
}
|
1350 |
],
|
1351 |
"source": [
|
1352 |
-
"result.to_csv(
|
1353 |
]
|
1354 |
},
|
1355 |
{
|
@@ -1373,7 +1399,7 @@
|
|
1373 |
],
|
1374 |
"source": [
|
1375 |
"# Show graph Topics vs Coherence Score\n",
|
1376 |
-
"result.groupby(
|
1377 |
]
|
1378 |
},
|
1379 |
{
|
@@ -1399,7 +1425,7 @@
|
|
1399 |
"plt.plot(model_topics, coherence_values)\n",
|
1400 |
"plt.xlabel(\"Num Topics\")\n",
|
1401 |
"plt.ylabel(\"Coherence Score\")\n",
|
1402 |
-
"plt.legend((\"Coherence Score\"), loc
|
1403 |
"plt.show()"
|
1404 |
]
|
1405 |
},
|
@@ -1426,7 +1452,7 @@
|
|
1426 |
"plt.plot(model_topics, perplexity_values)\n",
|
1427 |
"plt.xlabel(\"Num Topics\")\n",
|
1428 |
"plt.ylabel(\"Perplexity score\")\n",
|
1429 |
-
"plt.legend((\"perplexity_values\"), loc
|
1430 |
"plt.show()"
|
1431 |
]
|
1432 |
},
|
@@ -1476,21 +1502,22 @@
|
|
1476 |
"source": [
|
1477 |
"# realised that there may be some overlaps for 8 topics, thus 4-6 topics are optimal\n",
|
1478 |
"k = 8\n",
|
1479 |
-
"a =
|
1480 |
"# a = 0.91\n",
|
1481 |
"# b = 0.61\n",
|
1482 |
-
"b =
|
1483 |
"\n",
|
1484 |
"\n",
|
1485 |
-
"\n",
|
1486 |
-
"
|
1487 |
-
"
|
1488 |
-
"
|
1489 |
-
"
|
1490 |
-
"
|
1491 |
-
"
|
1492 |
-
"
|
1493 |
-
"
|
|
|
1494 |
]
|
1495 |
},
|
1496 |
{
|
@@ -1513,7 +1540,7 @@
|
|
1513 |
}
|
1514 |
],
|
1515 |
"source": [
|
1516 |
-
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=k, a=a, b=b)
|
1517 |
]
|
1518 |
},
|
1519 |
{
|
@@ -1536,12 +1563,12 @@
|
|
1536 |
}
|
1537 |
],
|
1538 |
"source": [
|
1539 |
-
"#Set up the environment to display the graphical outputs\n",
|
1540 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1541 |
"pyLDAvis.enable_notebook()\n",
|
1542 |
-
"visual= gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1543 |
"\n",
|
1544 |
-
"#Save the output to the html file\n",
|
1545 |
"pyLDAvis.save_html(visual, \"data/topic_viz8_severe_training.html\")"
|
1546 |
]
|
1547 |
},
|
@@ -1594,7 +1621,7 @@
|
|
1594 |
}
|
1595 |
],
|
1596 |
"source": [
|
1597 |
-
"#Save a model to disk, or reload a pre-trained model\n",
|
1598 |
"# naming convention: final_model_topic_alpha_eta\n",
|
1599 |
"final_model.save(\"final_model_8_asym_sym\")"
|
1600 |
]
|
@@ -1627,13 +1654,19 @@
|
|
1627 |
],
|
1628 |
"source": [
|
1629 |
"import warnings\n",
|
1630 |
-
"
|
|
|
|
|
1631 |
"\n",
|
1632 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
1633 |
" # Preallocate memory for the DataFrame\n",
|
1634 |
" num_docs = len(corpus)\n",
|
1635 |
-
" sent_topics = {
|
1636 |
-
"
|
|
|
|
|
|
|
|
|
1637 |
" # Get main topic in each document\n",
|
1638 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
1639 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
@@ -1641,13 +1674,13 @@
|
|
1641 |
" # Get the Dominant topic, Perc Contribution and Keywords for each document\n",
|
1642 |
" dominant_topic, perc_contribution = row[0]\n",
|
1643 |
" topic_distribution = row\n",
|
1644 |
-
" sent_topics[
|
1645 |
-
" sent_topics[
|
1646 |
-
" sent_topics[
|
1647 |
"\n",
|
1648 |
" # Create the DataFrame\n",
|
1649 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
1650 |
-
" sent_topics_df[
|
1651 |
"\n",
|
1652 |
" return sent_topics_df"
|
1653 |
]
|
@@ -1670,7 +1703,9 @@
|
|
1670 |
}
|
1671 |
],
|
1672 |
"source": [
|
1673 |
-
"df_topic_sents_keywords = format_topics_sentences(
|
|
|
|
|
1674 |
]
|
1675 |
},
|
1676 |
{
|
@@ -1693,7 +1728,13 @@
|
|
1693 |
"source": [
|
1694 |
"# Format\n",
|
1695 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
1696 |
-
"df_dominant_topic.columns = [
|
|
|
|
|
|
|
|
|
|
|
|
|
1697 |
"\n",
|
1698 |
"# Show\n",
|
1699 |
"df_dominant_topic.head(10)"
|
@@ -1767,7 +1808,7 @@
|
|
1767 |
"# Show the plot\n",
|
1768 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
1769 |
"plt.tight_layout()\n",
|
1770 |
-
"plt.show()
|
1771 |
]
|
1772 |
},
|
1773 |
{
|
@@ -1788,7 +1829,7 @@
|
|
1788 |
}
|
1789 |
],
|
1790 |
"source": [
|
1791 |
-
"df_dominant_topic.sort_values(by
|
1792 |
]
|
1793 |
},
|
1794 |
{
|
@@ -1810,9 +1851,9 @@
|
|
1810 |
],
|
1811 |
"source": [
|
1812 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
1813 |
-
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)
|
1814 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
1815 |
-
"sampled_df.to_csv(
|
1816 |
]
|
1817 |
},
|
1818 |
{
|
|
|
76 |
"import datetime\n",
|
77 |
"\n",
|
78 |
"import warnings\n",
|
79 |
+
"\n",
|
80 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
81 |
"\n",
|
82 |
"from pprint import pprint\n",
|
83 |
"import pyLDAvis\n",
|
|
|
117 |
"metadata": {},
|
118 |
"outputs": [],
|
119 |
"source": [
|
120 |
+
"df = pd.read_parquet(\"data/processed_data2.parquet\")"
|
121 |
]
|
122 |
},
|
123 |
{
|
|
|
436 |
"outputs": [],
|
437 |
"source": [
|
438 |
"# choose only the extreme and severe cases for modelling\n",
|
439 |
+
"cleaned = df_copy[df_copy[\"Severity\"].isin([\"Extreme\", \"Severe\"])]\n",
|
440 |
"cleaned.reset_index(drop=True, inplace=True)"
|
441 |
]
|
442 |
},
|
|
|
513 |
}
|
514 |
],
|
515 |
"source": [
|
516 |
+
"print(\"Number of unique tokens: %d\" % len(doc_dict))\n",
|
517 |
+
"print(\"Number of articles: %d\" % len(docs_vecs))"
|
518 |
]
|
519 |
},
|
520 |
{
|
|
|
675 |
"outputs": [],
|
676 |
"source": [
|
677 |
"# Build LDA benchmark model\n",
|
678 |
+
"lda_model = gensim.models.LdaMulticore(\n",
|
679 |
+
" corpus=docs_vecs,\n",
|
680 |
+
" id2word=doc_dict,\n",
|
681 |
+
" num_topics=4,\n",
|
682 |
+
" random_state=42,\n",
|
683 |
+
" chunksize=100,\n",
|
684 |
+
" passes=10,\n",
|
685 |
+
" per_word_topics=True,\n",
|
686 |
+
")"
|
687 |
]
|
688 |
},
|
689 |
{
|
|
|
754 |
"%%time\n",
|
755 |
"\n",
|
756 |
"# Compute Benchmark Coherence Score\n",
|
757 |
+
"coherence_model_lda = CoherenceModel(\n",
|
758 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
759 |
+
")\n",
|
760 |
"coherence_lda = coherence_model_lda.get_coherence()\n",
|
761 |
+
"print(\"\\nCoherence Score LDAModel: \", coherence_lda)"
|
762 |
]
|
763 |
},
|
764 |
{
|
|
|
780 |
],
|
781 |
"source": [
|
782 |
"# Compute Benchmark Perplexity\n",
|
783 |
+
"perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) # For LDAModel\n",
|
784 |
+
"# a measure of how good the model is. lower the better.\n",
|
785 |
"\n",
|
786 |
+
"print(\"\\nPerplexity for LDAModel: \", perplex)"
|
787 |
]
|
788 |
},
|
789 |
{
|
|
|
836 |
"\n",
|
837 |
"# feed the LDA model into the pyLDAvis instance\n",
|
838 |
"pyLDAvis.enable_notebook()\n",
|
839 |
+
"visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
|
840 |
"\n",
|
841 |
+
"# Save the output to the html file\n",
|
842 |
"pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_severe.html\")"
|
843 |
]
|
844 |
},
|
|
|
918 |
}
|
919 |
],
|
920 |
"source": [
|
921 |
+
"pd.set_option(\"max_colwidth\", 200)\n",
|
922 |
"# Get the topics and their top keywords into a dataframe\n",
|
923 |
+
"topics = lda_model.show_topics(num_words=6)\n",
|
924 |
"\n",
|
925 |
"topic_keywords = pd.DataFrame()\n",
|
926 |
"for topic_id, topic in topics:\n",
|
927 |
+
" topic_keywords.at[topic_id, \"Topic Keywords\"] = topic\n",
|
928 |
"\n",
|
929 |
+
"topic_keywords[\"Topic ID\"] = topic_keywords.index\n",
|
930 |
"topic_keywords"
|
931 |
]
|
932 |
},
|
|
|
937 |
"metadata": {},
|
938 |
"outputs": [],
|
939 |
"source": [
|
940 |
+
"# break"
|
941 |
]
|
942 |
},
|
943 |
{
|
|
|
965 |
"source": [
|
966 |
"# hyper-perameter tuning (alpha and beta)\n",
|
967 |
"def compute_coherence_values(corpus, dictionary, k, a, b):\n",
|
968 |
+
"\n",
|
969 |
+
" lda_model = gensim.models.LdaMulticore(\n",
|
970 |
+
" corpus=corpus,\n",
|
971 |
+
" id2word=dictionary,\n",
|
972 |
+
" num_topics=k,\n",
|
973 |
+
" random_state=42,\n",
|
974 |
+
" chunksize=100,\n",
|
975 |
+
" passes=10,\n",
|
976 |
+
" alpha=a,\n",
|
977 |
+
" eta=b,\n",
|
978 |
+
" )\n",
|
979 |
+
"\n",
|
980 |
+
" coherence_model_lda = CoherenceModel(\n",
|
981 |
+
" model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
|
982 |
+
" )\n",
|
983 |
" coherence = coherence_model_lda.get_coherence()\n",
|
984 |
+
" perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)\n",
|
985 |
+
"\n",
|
986 |
" return coherence, perplex"
|
987 |
]
|
988 |
},
|
|
|
1010 |
"\n",
|
1011 |
"# Alpha parameter\n",
|
1012 |
"alpha = list(np.arange(0.31, 1, 0.3))\n",
|
1013 |
+
"alpha.append(\"symmetric\")\n",
|
1014 |
+
"alpha.append(\"asymmetric\")\n",
|
1015 |
"\n",
|
1016 |
"# Beta parameter\n",
|
1017 |
"beta = list(np.arange(0.31, 1, 0.3))\n",
|
1018 |
+
"beta.append(\"symmetric\")"
|
1019 |
]
|
1020 |
},
|
1021 |
{
|
|
|
1045 |
}
|
1046 |
],
|
1047 |
"source": [
|
1048 |
+
"print(\"Topic range: \", num_topics)\n",
|
1049 |
+
"print(\"Alpha: \", alpha)\n",
|
1050 |
"print(\"Beta: \", beta)"
|
1051 |
]
|
1052 |
},
|
|
|
1291 |
"for a in alpha:\n",
|
1292 |
" for b in beta:\n",
|
1293 |
" for num in num_topics:\n",
|
1294 |
+
" cv, pv = compute_coherence_values(\n",
|
1295 |
+
" corpus=docs_vecs, dictionary=doc_dict, k=num, a=a, b=b\n",
|
1296 |
+
" )\n",
|
1297 |
"\n",
|
1298 |
+
" model_topics.append(num)\n",
|
1299 |
+
" coherence_values.append(cv)\n",
|
1300 |
" perplexity_values.append(pv)\n",
|
1301 |
" alpha_result.append(a)\n",
|
1302 |
" beta_result.append(b)\n",
|
1303 |
+
" print(\n",
|
1304 |
+
" \"#Topics: \"\n",
|
1305 |
+
" + str(num)\n",
|
1306 |
+
" + \", CV Score: \"\n",
|
1307 |
+
" + str(coherence_values[-1])\n",
|
1308 |
+
" + \", PV Score: \"\n",
|
1309 |
+
" + str(perplexity_values[-1])\n",
|
1310 |
+
" + \", Alpha: \"\n",
|
1311 |
+
" + str(alpha_result[-1])\n",
|
1312 |
+
" + \", Beta: \"\n",
|
1313 |
+
" + str(beta_result[-1])\n",
|
1314 |
+
" )\n",
|
1315 |
+
"\n",
|
1316 |
"print(datetime.datetime.now())"
|
1317 |
]
|
1318 |
},
|
|
|
1344 |
"source": [
|
1345 |
"# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
|
1346 |
"result = pd.DataFrame(\n",
|
1347 |
+
" {\n",
|
1348 |
+
" \"Topics\": model_topics,\n",
|
1349 |
+
" \"Coherence Score\": coherence_values,\n",
|
1350 |
+
" \"Perplexity Score\": perplexity_values,\n",
|
1351 |
+
" \"Alpha\": alpha_result,\n",
|
1352 |
+
" \"Beta\": beta_result,\n",
|
1353 |
+
" }\n",
|
1354 |
+
")\n",
|
1355 |
+
"result.sort_values(\n",
|
1356 |
+
" by=[\"Coherence Score\", \"Perplexity Score\"], ascending=[False, True]\n",
|
1357 |
+
").head(20)"
|
1358 |
]
|
1359 |
},
|
1360 |
{
|
|
|
1375 |
}
|
1376 |
],
|
1377 |
"source": [
|
1378 |
+
"result.to_csv(\"data/lda_fine_tuning_result_severe.csv\")"
|
1379 |
]
|
1380 |
},
|
1381 |
{
|
|
|
1399 |
],
|
1400 |
"source": [
|
1401 |
"# Show graph Topics vs Coherence Score\n",
|
1402 |
+
"result.groupby(\"Alpha\").plot(x=\"Topics\", y=\"Coherence Score\", legend=True)"
|
1403 |
]
|
1404 |
},
|
1405 |
{
|
|
|
1425 |
"plt.plot(model_topics, coherence_values)\n",
|
1426 |
"plt.xlabel(\"Num Topics\")\n",
|
1427 |
"plt.ylabel(\"Coherence Score\")\n",
|
1428 |
+
"plt.legend((\"Coherence Score\"), loc=\"best\")\n",
|
1429 |
"plt.show()"
|
1430 |
]
|
1431 |
},
|
|
|
1452 |
"plt.plot(model_topics, perplexity_values)\n",
|
1453 |
"plt.xlabel(\"Num Topics\")\n",
|
1454 |
"plt.ylabel(\"Perplexity score\")\n",
|
1455 |
+
"plt.legend((\"perplexity_values\"), loc=\"best\")\n",
|
1456 |
"plt.show()"
|
1457 |
]
|
1458 |
},
|
|
|
1502 |
"source": [
|
1503 |
"# realised that there may be some overlaps for 8 topics, thus 4-6 topics are optimal\n",
|
1504 |
"k = 8\n",
|
1505 |
+
"a = \"asymmetric\"\n",
|
1506 |
"# a = 0.91\n",
|
1507 |
"# b = 0.61\n",
|
1508 |
+
"b = \"symmetric\"\n",
|
1509 |
"\n",
|
1510 |
"\n",
|
1511 |
+
"final_model = gensim.models.LdaMulticore(\n",
|
1512 |
+
" corpus=docs_vecs,\n",
|
1513 |
+
" id2word=doc_dict,\n",
|
1514 |
+
" num_topics=k,\n",
|
1515 |
+
" random_state=42,\n",
|
1516 |
+
" chunksize=100,\n",
|
1517 |
+
" passes=10,\n",
|
1518 |
+
" alpha=a,\n",
|
1519 |
+
" eta=b,\n",
|
1520 |
+
")"
|
1521 |
]
|
1522 |
},
|
1523 |
{
|
|
|
1540 |
}
|
1541 |
],
|
1542 |
"source": [
|
1543 |
+
"compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict, k=k, a=a, b=b)"
|
1544 |
]
|
1545 |
},
|
1546 |
{
|
|
|
1563 |
}
|
1564 |
],
|
1565 |
"source": [
|
1566 |
+
"# Set up the environment to display the graphical outputs\n",
|
1567 |
"# feed the LDA model into the pyLDAvis instance\n",
|
1568 |
"pyLDAvis.enable_notebook()\n",
|
1569 |
+
"visual = gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
|
1570 |
"\n",
|
1571 |
+
"# Save the output to the html file\n",
|
1572 |
"pyLDAvis.save_html(visual, \"data/topic_viz8_severe_training.html\")"
|
1573 |
]
|
1574 |
},
|
|
|
1621 |
}
|
1622 |
],
|
1623 |
"source": [
|
1624 |
+
"# Save a model to disk, or reload a pre-trained model\n",
|
1625 |
"# naming convention: final_model_topic_alpha_eta\n",
|
1626 |
"final_model.save(\"final_model_8_asym_sym\")"
|
1627 |
]
|
|
|
1654 |
],
|
1655 |
"source": [
|
1656 |
"import warnings\n",
|
1657 |
+
"\n",
|
1658 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
1659 |
+
"\n",
|
1660 |
"\n",
|
1661 |
"def format_topics_sentences(ldamodel, corpus, data):\n",
|
1662 |
" # Preallocate memory for the DataFrame\n",
|
1663 |
" num_docs = len(corpus)\n",
|
1664 |
+
" sent_topics = {\n",
|
1665 |
+
" \"Dominant_Topic\": [0] * num_docs,\n",
|
1666 |
+
" \"Perc_Contribution\": [0.0] * num_docs,\n",
|
1667 |
+
" \"Topic_Distribution\": [()] * num_docs,\n",
|
1668 |
+
" }\n",
|
1669 |
+
"\n",
|
1670 |
" # Get main topic in each document\n",
|
1671 |
" for i, row in enumerate(ldamodel[corpus]):\n",
|
1672 |
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
|
|
|
1674 |
" # Get the Dominant topic, Perc Contribution and Keywords for each document\n",
|
1675 |
" dominant_topic, perc_contribution = row[0]\n",
|
1676 |
" topic_distribution = row\n",
|
1677 |
+
" sent_topics[\"Dominant_Topic\"][i] = int(dominant_topic)\n",
|
1678 |
+
" sent_topics[\"Perc_Contribution\"][i] = round(perc_contribution, 4)\n",
|
1679 |
+
" sent_topics[\"Topic_Distribution\"][i] = topic_distribution\n",
|
1680 |
"\n",
|
1681 |
" # Create the DataFrame\n",
|
1682 |
" sent_topics_df = pd.DataFrame(sent_topics)\n",
|
1683 |
+
" sent_topics_df[\"Text\"] = data\n",
|
1684 |
"\n",
|
1685 |
" return sent_topics_df"
|
1686 |
]
|
|
|
1703 |
}
|
1704 |
],
|
1705 |
"source": [
|
1706 |
+
"df_topic_sents_keywords = format_topics_sentences(\n",
|
1707 |
+
" ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details\n",
|
1708 |
+
")"
|
1709 |
]
|
1710 |
},
|
1711 |
{
|
|
|
1728 |
"source": [
|
1729 |
"# Format\n",
|
1730 |
"df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
|
1731 |
+
"df_dominant_topic.columns = [\n",
|
1732 |
+
" \"Document_No\",\n",
|
1733 |
+
" \"Dominant_Topic\",\n",
|
1734 |
+
" \"Topic_Perc_Contrib\",\n",
|
1735 |
+
" \"Topic_Distribution\",\n",
|
1736 |
+
" \"Text\",\n",
|
1737 |
+
"]\n",
|
1738 |
"\n",
|
1739 |
"# Show\n",
|
1740 |
"df_dominant_topic.head(10)"
|
|
|
1808 |
"# Show the plot\n",
|
1809 |
"plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
|
1810 |
"plt.tight_layout()\n",
|
1811 |
+
"plt.show()"
|
1812 |
]
|
1813 |
},
|
1814 |
{
|
|
|
1829 |
}
|
1830 |
],
|
1831 |
"source": [
|
1832 |
+
"df_dominant_topic.sort_values(by=\"Topic_Perc_Contrib\", ascending=False).head(20)"
|
1833 |
]
|
1834 |
},
|
1835 |
{
|
|
|
1851 |
],
|
1852 |
"source": [
|
1853 |
"# Sample 100 rows, can change the random_state for different samples\n",
|
1854 |
+
"sampled_data = df_dominant_topic.sample(n=100, random_state=42)\n",
|
1855 |
"sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
|
1856 |
+
"sampled_df.to_csv(\"data/sample_severe.csv\")"
|
1857 |
]
|
1858 |
},
|
1859 |
{
|
notebooks/10_LLM_evaluation.ipynb
CHANGED
@@ -872,7 +872,7 @@
|
|
872 |
}
|
873 |
],
|
874 |
"source": [
|
875 |
-
"df_sorted = df.groupby(\"Category\").count().sort_values(by
|
876 |
"df_sorted"
|
877 |
]
|
878 |
},
|
@@ -885,8 +885,8 @@
|
|
885 |
"source": [
|
886 |
"# Function to determine the value for the new column\n",
|
887 |
"def categorize(value):\n",
|
888 |
-
" if
|
889 |
-
" return
|
890 |
" else:\n",
|
891 |
" return value"
|
892 |
]
|
@@ -898,7 +898,7 @@
|
|
898 |
"metadata": {},
|
899 |
"outputs": [],
|
900 |
"source": [
|
901 |
-
"df[
|
902 |
]
|
903 |
},
|
904 |
{
|
@@ -1182,7 +1182,9 @@
|
|
1182 |
}
|
1183 |
],
|
1184 |
"source": [
|
1185 |
-
"df_sorted1 =
|
|
|
|
|
1186 |
"df_sorted1"
|
1187 |
]
|
1188 |
},
|
@@ -1531,7 +1533,7 @@
|
|
1531 |
}
|
1532 |
],
|
1533 |
"source": [
|
1534 |
-
"eva = pd.read_csv(
|
1535 |
"eva"
|
1536 |
]
|
1537 |
},
|
@@ -1567,8 +1569,10 @@
|
|
1567 |
}
|
1568 |
],
|
1569 |
"source": [
|
1570 |
-
"eva[
|
1571 |
-
"
|
|
|
|
|
1572 |
"\n",
|
1573 |
"result_gpt = result.sort_values(ascending=False)\n",
|
1574 |
"result_gpt"
|
@@ -1778,7 +1782,9 @@
|
|
1778 |
}
|
1779 |
],
|
1780 |
"source": [
|
1781 |
-
"test =
|
|
|
|
|
1782 |
"test"
|
1783 |
]
|
1784 |
},
|
@@ -1806,8 +1812,10 @@
|
|
1806 |
}
|
1807 |
],
|
1808 |
"source": [
|
1809 |
-
"eva[
|
1810 |
-
"
|
|
|
|
|
1811 |
"\n",
|
1812 |
"# If you want to sort the result by the count in descending order:\n",
|
1813 |
"result_golden = result.sort_values(ascending=False)\n",
|
|
|
872 |
}
|
873 |
],
|
874 |
"source": [
|
875 |
+
"df_sorted = df.groupby(\"Category\").count().sort_values(by=\"id\", ascending=False)\n",
|
876 |
"df_sorted"
|
877 |
]
|
878 |
},
|
|
|
885 |
"source": [
|
886 |
"# Function to determine the value for the new column\n",
|
887 |
"def categorize(value):\n",
|
888 |
+
" if \"/\" in str(value) or \",\" in str(value):\n",
|
889 |
+
" return \"Miscellaneous Events\"\n",
|
890 |
" else:\n",
|
891 |
" return value"
|
892 |
]
|
|
|
898 |
"metadata": {},
|
899 |
"outputs": [],
|
900 |
"source": [
|
901 |
+
"df[\"GPT Generated Result\"] = df[\"Category\"].apply(categorize)"
|
902 |
]
|
903 |
},
|
904 |
{
|
|
|
1182 |
}
|
1183 |
],
|
1184 |
"source": [
|
1185 |
+
"df_sorted1 = (\n",
|
1186 |
+
" df.groupby(\"GPT Generated Result\").count().sort_values(by=\"id\", ascending=False)\n",
|
1187 |
+
")\n",
|
1188 |
"df_sorted1"
|
1189 |
]
|
1190 |
},
|
|
|
1533 |
}
|
1534 |
],
|
1535 |
"source": [
|
1536 |
+
"eva = pd.read_csv(\"data/evaluation_result.csv\")\n",
|
1537 |
"eva"
|
1538 |
]
|
1539 |
},
|
|
|
1569 |
}
|
1570 |
],
|
1571 |
"source": [
|
1572 |
+
"eva[\"Result_GPT_True_Count\"] = eva[\"Result_GPT\"].astype(\n",
|
1573 |
+
" int\n",
|
1574 |
+
") # Convert boolean values to integers\n",
|
1575 |
+
"result = eva.groupby(\"GPT Generated Result\")[\"Result_GPT_True_Count\"].sum()\n",
|
1576 |
"\n",
|
1577 |
"result_gpt = result.sort_values(ascending=False)\n",
|
1578 |
"result_gpt"
|
|
|
1782 |
}
|
1783 |
],
|
1784 |
"source": [
|
1785 |
+
"test = (\n",
|
1786 |
+
" eva.groupby(\"Category_GoldenResult\").count().sort_values(by=\"id\", ascending=False)\n",
|
1787 |
+
")\n",
|
1788 |
"test"
|
1789 |
]
|
1790 |
},
|
|
|
1812 |
}
|
1813 |
],
|
1814 |
"source": [
|
1815 |
+
"eva[\"Result_Golden_True_Count\"] = eva[\"Result_Golden\"].astype(\n",
|
1816 |
+
" int\n",
|
1817 |
+
") # Convert boolean values to integers\n",
|
1818 |
+
"result = eva.groupby(\"Category_GoldenResult\")[\"Result_Golden_True_Count\"].sum()\n",
|
1819 |
"\n",
|
1820 |
"# If you want to sort the result by the count in descending order:\n",
|
1821 |
"result_golden = result.sort_values(ascending=False)\n",
|
requirements.txt
CHANGED
@@ -22,4 +22,7 @@ black[jupyter]
|
|
22 |
lxml_html_clean
|
23 |
newspaper3k
|
24 |
gnews
|
25 |
-
|
|
|
|
|
|
|
|
22 |
lxml_html_clean
|
23 |
newspaper3k
|
24 |
gnews
|
25 |
+
googlenewsdecoder
|
26 |
+
tqdm
|
27 |
+
bert_score
|
28 |
+
evaluate
|