{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ddf1e32e-7751-43db-9b5a-22cb08e35c6c", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from gnews import GNews\n", "import nltk\n", "from nltk.corpus import PlaintextCorpusReader\n", "from nltk.corpus import stopwords\n", "from nltk.stem.porter import *\n", "from nltk import pos_tag, word_tokenize\n", "from nltk.stem import WordNetLemmatizer\n", "from nltk.probability import FreqDist\n", "from nltk.tokenize import sent_tokenize\n", "from nltk.tokenize import word_tokenize\n", "import contractions\n", "\n", "import gensim\n", "from gensim import corpora\n", "from gensim import similarities\n", "from gensim import models\n", "from gensim.models import CoherenceModel\n", "\n", "# from wordcloud import WordCloud, ImageColorGenerator\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "import re\n", "import os\n", "import glob\n", "import json\n", "\n", "import psycopg2\n", "import pickle\n", "from datetime import datetime\n", "import datetime" ] }, { "cell_type": "code", "execution_count": 2, "id": "aca214bc-6426-4fc8-8574-23375f7f46f8", "metadata": {}, "outputs": [], "source": [ "# !pip3 install GNews" ] }, { "cell_type": "code", "execution_count": 3, "id": "a9c6ed07-a7d0-4aaa-b571-038919c75e05", "metadata": {}, "outputs": [], "source": [ "# Function to get full news article by headline\n", "def get_news_article(headline):\n", " # Initialize GNews client\n", " gnews = GNews()\n", "\n", " articles = gnews.get_news(headline)\n", "\n", " # Check if any articles were found\n", " results = []\n", "\n", " if articles:\n", " for link in range(len(articles)):\n", " try:\n", " article = gnews.get_full_article(articles[link][\"url\"])\n", " if article.text:\n", " results.append([article.url, article.title, article.text])\n", " break\n", "\n", " except Exception as e:\n", " # link cannot be scraped\n", " continue\n", "\n", " if not results:\n", " # if blocked by the website\n", " results.append(\n", " [\n", " \"cannot scrape the url\",\n", " \"cannot scrape the title\",\n", " \"cannot scrape the content\",\n", " ]\n", " )\n", "\n", " # No articles found for the given title.\n", " else:\n", " results.append([\"no url found\", \"no title found\", \"no content found\"])\n", "\n", " return results" ] }, { "cell_type": "code", "execution_count": 4, "id": "0acaf899-cd96-4424-b825-c71bf042355a", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Read the CSV file with news titles\n", "csv_file_path = \"LDA/cleaned_data.csv\"\n", "df = pd.read_csv(csv_file_path)" ] }, { "cell_type": "code", "execution_count": 5, "id": "465acb47-5f57-4d90-a272-7972d56e002b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5782, 17)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 6, "id": "801d892d-3b87-4ecf-805e-3441815d00f1", "metadata": {}, "outputs": [], "source": [ "# drop empty news\n", "df.dropna(subset=[\"Headline\"], inplace=True)" ] }, { "cell_type": "code", "execution_count": 7, "id": "1267bf68-4726-49bf-ae21-9fed09682945", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[[\"Year\", \"Headline\", \"Region\"]].duplicated().any()" ] }, { "cell_type": "code", "execution_count": 8, "id": "2a8c4229-adc3-42e7-83e1-6593be1977ec", "metadata": {}, "outputs": [], "source": [ "# drop the duplicated news\n", "duplicates = df.duplicated(subset=[\"Year\", \"Headline\", \"Region\"], keep=\"first\")\n", "df_uni = df[~duplicates]" ] }, { "cell_type": "code", "execution_count": 9, "id": "eb663cdd-b6c7-4468-9f69-f6e1c818ccd2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5710, 17)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_uni.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "8520b8bf-e6d0-4533-9b0a-d5003a5e2fe3", "metadata": {}, "outputs": [], "source": [ "print(datetime.datetime.now())" ] }, { "cell_type": "code", "execution_count": 10, "id": "f3c35a3c-d852-4285-86bf-3f7db7df78e7", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tes.at[index, 'url'] = results[0][0]\n", ":11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tes.at[index, 'title'] = results[0][1]\n", ":12: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " tes.at[index, 'content'] = results[0][2]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/china/major-container-ports-eastern-china-see-worsening-congestion-after-covid-cases-2021-08-12/ on URL https://news.google.com/rss/articles/CBMifmh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2NoaW5hL21ham9yLWNvbnRhaW5lci1wb3J0cy1lYXN0ZXJuLWNoaW5hLXNlZS13b3JzZW5pbmctY29uZ2VzdGlvbi1hZnRlci1jb3ZpZC1jYXNlcy0yMDIxLTA4LTEyL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/article/britain-equinor-oil-strike-idUKL8N1UG4QA/ on URL https://news.google.com/rss/articles/CBMiSWh0dHBzOi8vd3d3LnJldXRlcnMuY29tL2FydGljbGUvYnJpdGFpbi1lcXVpbm9yLW9pbC1zdHJpa2UtaWRVS0w4TjFVRzRRQS_SAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.moneyweb.co.za/news/south-africa/it-will-take-months-to-clear-durban-port-backlog/ on URL https://news.google.com/rss/articles/CBMiXmh0dHBzOi8vd3d3Lm1vbmV5d2ViLmNvLnphL25ld3Mvc291dGgtYWZyaWNhL2l0LXdpbGwtdGFrZS1tb250aHMtdG8tY2xlYXItZHVyYmFuLXBvcnQtYmFja2xvZy_SAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "03/11/2024 11:42:16 AM - fromstring() returned an invalid string: \n", "\n", "\n", " ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/europe/one-dead-cargo-ship-fire-electric-car-suspected-source-dutch-coastguard-2023-07-26/ on URL https://news.google.com/rss/articles/CBMieGh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2V1cm9wZS9vbmUtZGVhZC1jYXJnby1zaGlwLWZpcmUtZWxlY3RyaWMtY2FyLXN1c3BlY3RlZC1zb3VyY2UtZHV0Y2gtY29hc3RndWFyZC0yMDIzLTA3LTI2L9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://news.usni.org/2023/05/30/authorities-detain-chinese-ship-suspected-of-salvaging-u-k-wwii-wrecks on URL https://news.google.com/rss/articles/CBMiZ2h0dHBzOi8vbmV3cy51c25pLm9yZy8yMDIzLzA1LzMwL2F1dGhvcml0aWVzLWRldGFpbi1jaGluZXNlLXNoaXAtc3VzcGVjdGVkLW9mLXNhbHZhZ2luZy11LWstd3dpaS13cmVja3PSAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "03/11/2024 11:47:56 AM - fromstring() returned an invalid string: \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\t...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.abc.net.au/news/2023-02-18/sydney-storm-outages/101994636 on URL https://news.google.com/rss/articles/CBMiRWh0dHBzOi8vd3d3LmFiYy5uZXQuYXUvbmV3cy8yMDIzLTAyLTE4L3N5ZG5leS1zdG9ybS1vdXRhZ2VzLzEwMTk5NDYzNtIBKGh0dHBzOi8vYW1wLmFiYy5uZXQuYXUvYXJ0aWNsZS8xMDE5OTQ2MzY?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with HTTPSConnectionPool(host='www.aljazeera.com', port=443): Read timed out. (read timeout=7) on URL https://news.google.com/rss/articles/CBMiYGh0dHBzOi8vd3d3LmFsamF6ZWVyYS5jb20vbmV3cy8yMDE4LzEwLzIyL3llbWVuLWRlYXRoLXRvbGwtZnJvbS10cm9waWNhbC1zdG9ybS1sdWJhbi1yaXNlcy10by0xMtIBZGh0dHBzOi8vd3d3LmFsamF6ZWVyYS5jb20vYW1wL25ld3MvMjAxOC8xMC8yMi95ZW1lbi1kZWF0aC10b2xsLWZyb20tdHJvcGljYWwtc3Rvcm0tbHViYW4tcmlzZXMtdG8tMTI?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.iol.co.za/business-report/economy/durban-container-terminal-pier-2-backlog-reduced-to-five-vessels-at-anchor-transnet-fa3b7dbe-3275-4afc-9244-4b8ef0b88822 on URL https://news.google.com/rss/articles/CBMipgFodHRwczovL3d3dy5pb2wuY28uemEvYnVzaW5lc3MtcmVwb3J0L2Vjb25vbXkvZHVyYmFuLWNvbnRhaW5lci10ZXJtaW5hbC1waWVyLTItYmFja2xvZy1yZWR1Y2VkLXRvLWZpdmUtdmVzc2Vscy1hdC1hbmNob3ItdHJhbnNuZXQtZmEzYjdkYmUtMzI3NS00YWZjLTkyNDQtNGI4ZWYwYjg4ODIy0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/sustainable-business/south-africas-busiest-port-durban-hobbled-by-strike-2022-10-12/ on URL https://news.google.com/rss/articles/CBMidWh0dHBzOi8vd3d3LnJldXRlcnMuY29tL2J1c2luZXNzL3N1c3RhaW5hYmxlLWJ1c2luZXNzL3NvdXRoLWFmcmljYXMtYnVzaWVzdC1wb3J0LWR1cmJhbi1ob2JibGVkLWJ5LXN0cmlrZS0yMDIyLTEwLTEyL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "03/11/2024 12:05:17 PM - fromstring() returned an invalid string: \n", "\n", "\t...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/asia-pacific/pakistan-election-candidate-shot-dead-while-campaigning-2024-01-10/ on URL https://news.google.com/rss/articles/CBMibmh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2FzaWEtcGFjaWZpYy9wYWtpc3Rhbi1lbGVjdGlvbi1jYW5kaWRhdGUtc2hvdC1kZWFkLXdoaWxlLWNhbXBhaWduaW5nLTIwMjQtMDEtMTAv0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/autos-transportation/toyota-resume-operations-remaining-halted-plants-tuesday-2023-10-23/ on URL https://news.google.com/rss/articles/CBMiemh0dHBzOi8vd3d3LnJldXRlcnMuY29tL2J1c2luZXNzL2F1dG9zLXRyYW5zcG9ydGF0aW9uL3RveW90YS1yZXN1bWUtb3BlcmF0aW9ucy1yZW1haW5pbmctaGFsdGVkLXBsYW50cy10dWVzZGF5LTIwMjMtMTAtMjMv0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.timesofisrael.com/liveblog-december-18-2013/ on URL https://news.google.com/rss/articles/CBMiOGh0dHBzOi8vd3d3LnRpbWVzb2Zpc3JhZWwuY29tL2xpdmVibG9nLWRlY2VtYmVyLTE4LTIwMTMv0gE8aHR0cHM6Ly93d3cudGltZXNvZmlzcmFlbC5jb20vbGl2ZWJsb2ctZGVjZW1iZXItMTgtMjAxMy9hbXAv?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "03/11/2024 12:09:23 PM - fromstring() returned an invalid string: \n", "\n", "\t...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/autos-transportation/uaw-expands-strike-against-gm-walking-out-texas-suv-plant-2023-10-24/ on URL https://news.google.com/rss/articles/CBMie2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL2J1c2luZXNzL2F1dG9zLXRyYW5zcG9ydGF0aW9uL3Vhdy1leHBhbmRzLXN0cmlrZS1hZ2FpbnN0LWdtLXdhbGtpbmctb3V0LXRleGFzLXN1di1wbGFudC0yMDIzLTEwLTI0L9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/autos-transportation/hyundai-motor-south-korea-union-reach-tentative-wage-deal-union-official-2023-09-12/ on URL https://news.google.com/rss/articles/CBMiigFodHRwczovL3d3dy5yZXV0ZXJzLmNvbS9idXNpbmVzcy9hdXRvcy10cmFuc3BvcnRhdGlvbi9oeXVuZGFpLW1vdG9yLXNvdXRoLWtvcmVhLXVuaW9uLXJlYWNoLXRlbnRhdGl2ZS13YWdlLWRlYWwtdW5pb24tb2ZmaWNpYWwtMjAyMy0wOS0xMi_SAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://news.usni.org/2018/06/13/trump-kim-summit-statement-casts-doubt-on-us-navy-korea-visits-in-doubt on URL https://news.google.com/rss/articles/CBMiaGh0dHBzOi8vbmV3cy51c25pLm9yZy8yMDE4LzA2LzEzL3RydW1wLWtpbS1zdW1taXQtc3RhdGVtZW50LWNhc3RzLWRvdWJ0LW9uLXVzLW5hdnkta29yZWEtdmlzaXRzLWluLWRvdWJ00gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/asia-pacific/philippines-lifts-tsunami-alert-after-magnitude-74-earthquake-2023-12-03/ on URL https://news.google.com/rss/articles/CBMidGh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2FzaWEtcGFjaWZpYy9waGlsaXBwaW5lcy1saWZ0cy10c3VuYW1pLWFsZXJ0LWFmdGVyLW1hZ25pdHVkZS03NC1lYXJ0aHF1YWtlLTIwMjMtMTItMDMv0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "03/11/2024 12:13:45 PM - fromstring() returned an invalid string: \n", "\n", "\n", "\n", "\n", "\t...\n", "03/11/2024 12:19:12 PM - fromstring() returned an invalid string: \n", "\n", " ...\n", "03/11/2024 12:21:15 PM - fromstring() returned an invalid string: \n", "\n", " ...\n", "03/11/2024 12:21:24 PM - fromstring() returned an invalid string: \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\t...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.houstonchronicle.com/news/houston-weather/article/Live-weather-updates-Houston-snow-winter-storm-15951522.php on URL https://news.google.com/rss/articles/CBMieWh0dHBzOi8vd3d3LmhvdXN0b25jaHJvbmljbGUuY29tL25ld3MvaG91c3Rvbi13ZWF0aGVyL2FydGljbGUvTGl2ZS13ZWF0aGVyLXVwZGF0ZXMtSG91c3Rvbi1zbm93LXdpbnRlci1zdG9ybS0xNTk1MTUyMi5waHDSAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "03/11/2024 12:38:54 PM - fromstring() returned an invalid string: \n", "\n", "\n", "\n", "\n", " ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.news.com.au/technology/environment/smoke-settles-over-sydney-as-firefighters-rush-to-reduce-fire-risk-ahead-of-warmer-weather/news-story/3c641dedab3af67e9fd9847176783500 on URL https://news.google.com/rss/articles/CBMitQFodHRwczovL3d3dy5uZXdzLmNvbS5hdS90ZWNobm9sb2d5L2Vudmlyb25tZW50L3Ntb2tlLXNldHRsZXMtb3Zlci1zeWRuZXktYXMtZmlyZWZpZ2h0ZXJzLXJ1c2gtdG8tcmVkdWNlLWZpcmUtcmlzay1haGVhZC1vZi13YXJtZXItd2VhdGhlci9uZXdzLXN0b3J5LzNjNjQxZGVkYWIzYWY2N2U5ZmQ5ODQ3MTc2NzgzNTAw0gG5AWh0dHBzOi8vd3d3Lm5ld3MuY29tLmF1L3RlY2hub2xvZ3kvZW52aXJvbm1lbnQvc21va2Utc2V0dGxlcy1vdmVyLXN5ZG5leS1hcy1maXJlZmlnaHRlcnMtcnVzaC10by1yZWR1Y2UtZmlyZS1yaXNrLWFoZWFkLW9mLXdhcm1lci13ZWF0aGVyL25ld3Mtc3RvcnkvM2M2NDFkZWRhYjNhZjY3ZTlmZDk4NDcxNzY3ODM1MDA_YW1w?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.news.com.au/travel/travel-updates/incidents/fresh-chaos-at-sydney-airport-as-16-more-flights-cancelled/news-story/6d860f4f59f990bdc3fb5c57b7e72798 on URL https://news.google.com/rss/articles/CBMingFodHRwczovL3d3dy5uZXdzLmNvbS5hdS90cmF2ZWwvdHJhdmVsLXVwZGF0ZXMvaW5jaWRlbnRzL2ZyZXNoLWNoYW9zLWF0LXN5ZG5leS1haXJwb3J0LWFzLTE2LW1vcmUtZmxpZ2h0cy1jYW5jZWxsZWQvbmV3cy1zdG9yeS82ZDg2MGY0ZjU5Zjk5MGJkYzNmYjVjNTdiN2U3Mjc5ONIBogFodHRwczovL3d3dy5uZXdzLmNvbS5hdS90cmF2ZWwvdHJhdmVsLXVwZGF0ZXMvaW5jaWRlbnRzL2ZyZXNoLWNoYW9zLWF0LXN5ZG5leS1haXJwb3J0LWFzLTE2LW1vcmUtZmxpZ2h0cy1jYW5jZWxsZWQvbmV3cy1zdG9yeS82ZDg2MGY0ZjU5Zjk5MGJkYzNmYjVjNTdiN2U3Mjc5OD9hbXA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "03/11/2024 12:46:02 PM - fromstring() returned an invalid string: \n", "\n", "\n", "\n", "\n", "\n", " ...\n", "03/11/2024 12:50:13 PM - fromstring() returned an invalid string: \n", " ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://english.ahram.org.eg/NewsContent/1/64/404393/Egypt/Politics-/The-Little-Sun-gale-to-drench-Alexandria-for-three.aspx on URL https://news.google.com/rss/articles/CBMifGh0dHBzOi8vZW5nbGlzaC5haHJhbS5vcmcuZWcvTmV3c0NvbnRlbnQvMS82NC80MDQzOTMvRWd5cHQvUG9saXRpY3MtL1RoZS1MaXR0bGUtU3VuLWdhbGUtdG8tZHJlbmNoLUFsZXhhbmRyaWEtZm9yLXRocmVlLmFzcHjSAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.legacy.com/us/obituaries/stltoday/name/james-bagby-obituary?id=54529291 on URL https://news.google.com/rss/articles/CBMiU2h0dHBzOi8vd3d3LmxlZ2FjeS5jb20vdXMvb2JpdHVhcmllcy9zdGx0b2RheS9uYW1lL2phbWVzLWJhZ2J5LW9iaXR1YXJ5P2lkPTU0NTI5Mjkx0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/uk/advisor/travel-insurance/2024/01/02/travel-latest-news/ on URL https://news.google.com/rss/articles/CBMiUWh0dHBzOi8vd3d3LmZvcmJlcy5jb20vdWsvYWR2aXNvci90cmF2ZWwtaW5zdXJhbmNlLzIwMjQvMDEvMDIvdHJhdmVsLWxhdGVzdC1uZXdzL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "03/11/2024 12:50:37 PM - fromstring() returned an invalid string: \n", "\n", " ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/europe/port-antwerp-disrupted-by-belgian-farmers-protests-2024-02-13/ on URL https://news.google.com/rss/articles/CBMiY2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2V1cm9wZS9wb3J0LWFudHdlcnAtZGlzcnVwdGVkLWJ5LWJlbGdpYW4tZmFybWVycy1wcm90ZXN0cy0yMDI0LTAyLTEzL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://newsbook.com.mt/en/bridge-collapse-blocks-brussels-scheldt-canal-traffic/ on URL https://news.google.com/rss/articles/CBMiUWh0dHBzOi8vbmV3c2Jvb2suY29tLm10L2VuL2JyaWRnZS1jb2xsYXBzZS1ibG9ja3MtYnJ1c3NlbHMtc2NoZWxkdC1jYW5hbC10cmFmZmljL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.ajot.com/news/msc-adds-new-container-service-through-jaxport on URL https://news.google.com/rss/articles/CBMiSGh0dHBzOi8vd3d3LmFqb3QuY29tL25ld3MvbXNjLWFkZHMtbmV3LWNvbnRhaW5lci1zZXJ2aWNlLXRocm91Z2gtamF4cG9ydNIBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://newsbook.com.mt/en/cargo-ship-runs-aground-in-istanbuls-bosphorus-strait/ on URL https://news.google.com/rss/articles/CBMiUWh0dHBzOi8vbmV3c2Jvb2suY29tLm10L2VuL2NhcmdvLXNoaXAtcnVucy1hZ3JvdW5kLWluLWlzdGFuYnVscy1ib3NwaG9ydXMtc3RyYWl0L9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.timesofisrael.com/liveblog-february-18-2024/ on URL https://news.google.com/rss/articles/CBMiOGh0dHBzOi8vd3d3LnRpbWVzb2Zpc3JhZWwuY29tL2xpdmVibG9nLWZlYnJ1YXJ5LTE4LTIwMjQv0gE8aHR0cHM6Ly93d3cudGltZXNvZmlzcmFlbC5jb20vbGl2ZWJsb2ctZmVicnVhcnktMTgtMjAyNC9hbXAv?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://sunnewsonline.com/piracy-navy-omsl-rescue-chinese-cargo-ship-off-nigerias-coast/ on URL https://news.google.com/rss/articles/CBMiWGh0dHBzOi8vc3VubmV3c29ubGluZS5jb20vcGlyYWN5LW5hdnktb21zbC1yZXNjdWUtY2hpbmVzZS1jYXJnby1zaGlwLW9mZi1uaWdlcmlhcy1jb2FzdC_SAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.caranddriver.com/features/a35877638/golden-ray-final-voyage/ on URL https://news.google.com/rss/articles/CBMiSGh0dHBzOi8vd3d3LmNhcmFuZGRyaXZlci5jb20vZmVhdHVyZXMvYTM1ODc3NjM4L2dvbGRlbi1yYXktZmluYWwtdm95YWdlL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.newsweek.com/watch-massive-container-ships-collide-losing-cargo-overboard-852319 on URL https://news.google.com/rss/articles/CBMiXGh0dHBzOi8vd3d3Lm5ld3N3ZWVrLmNvbS93YXRjaC1tYXNzaXZlLWNvbnRhaW5lci1zaGlwcy1jb2xsaWRlLWxvc2luZy1jYXJnby1vdmVyYm9hcmQtODUyMzE50gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.hapag-lloyd.com/es/company/about-us/newsletter/2023/09/-i-love-the-sailor-s-life-----jan-rusch.html on URL https://news.google.com/rss/articles/CBMib2h0dHBzOi8vd3d3LmhhcGFnLWxsb3lkLmNvbS9lcy9jb21wYW55L2Fib3V0LXVzL25ld3NsZXR0ZXIvMjAyMy8wOS8taS1sb3ZlLXRoZS1zYWlsb3Itcy1saWZlLS0tLS1qYW4tcnVzY2guaHRtbNIBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 429 Client Error: for url: https://www.rivieramm.com/opinion/opinion/lof-award-supports-french-etv-services-77561 on URL https://news.google.com/rss/articles/CBMiVmh0dHBzOi8vd3d3LnJpdmllcmFtbS5jb20vb3Bpbmlvbi9vcGluaW9uL2xvZi1hd2FyZC1zdXBwb3J0cy1mcmVuY2gtZXR2LXNlcnZpY2VzLTc3NTYx0gEA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://english.alarabiya.net/business/economy/2023/06/21/Suez-Canal-revenues-reach-all-time-high-of-9-4-bln-Official on URL https://news.google.com/rss/articles/CBMidWh0dHBzOi8vZW5nbGlzaC5hbGFyYWJpeWEubmV0L2J1c2luZXNzL2Vjb25vbXkvMjAyMy8wNi8yMS9TdWV6LUNhbmFsLXJldmVudWVzLXJlYWNoLWFsbC10aW1lLWhpZ2gtb2YtOS00LWJsbi1PZmZpY2lhbNIBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "03/11/2024 12:55:18 PM - fromstring() returned an invalid string: \n", "\n", "\n", "\n", "\n", "\n", "\t...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "An error occurred while fetching the article: Article `download()` failed with ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) on URL https://news.google.com/rss/articles/CBMiP2h0dHBzOi8vd3d3LmFuZHJvaWRwb2xpY2UuY29tL2F2b2lkLWZhY2Vib29rLW1hcmtldHBsYWNlLXNjYW1zL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n", "An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.fbi.gov/stats-services/publications/financial-crimes-report-2009 on URL https://news.google.com/rss/articles/CBMiTGh0dHBzOi8vd3d3LmZiaS5nb3Yvc3RhdHMtc2VydmljZXMvcHVibGljYXRpb25zL2ZpbmFuY2lhbC1jcmltZXMtcmVwb3J0LTIwMDnSAQA?oc=5&hl=en-SG&gl=SG&ceid=SG:en\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "03/11/2024 01:06:27 PM - fromstring() returned an invalid string: \n", "\n", "\n", "\n", "<...\n", "03/11/2024 01:14:37 PM - fromstring() returned an invalid string: \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\t...\n", "03/11/2024 01:18:21 PM - fromstring() returned an invalid string: \n", "\n", "\n", "\n", "\n", "