import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests as r
import regex as re
from dateutil import parser
import logging
import multiprocessing
from datetime import date
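# Fetches articles from a list of news RSS feeds in parallel, cleans the titles,
# parses the publish dates, and aggregates everything into a single data frame.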
def get_time_delta(dt):
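    """
    Returns the number of days between BASE_DATE (2024-01-01) and dt.
    :param dt: datetime of the news item.
    :return: int, number of days.
    """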
BASE_DATE = date(2024, 1, 1)
time_delta = dt.date() - BASE_DATE
return time_delta.days
def text_clean(desc):
"""
    Cleans the text by unescaping HTML entities and removing tags and special characters.
:param desc: string containing description
:return: str, cleaned description.
"""
desc = desc.replace("<", "<")
desc = desc.replace(">", ">")
desc = re.sub("<.*?>", "", desc)
desc = desc.replace("#39;", "'")
desc = desc.replace('"', '"')
desc = desc.replace(' ', ' ')
desc = desc.replace('#32;', ' ')
return desc
def rss_parser(i):
"""
    Returns a data frame for a single parsed news item.
    :param i: a single <item> element from an RSS feed.
    :return: Data frame with the item's title, URL, and parsed date.
"""
b1 = BeautifulSoup(str(i), "xml")
title = "" if b1.find("title") is None else b1.find("title").get_text()
title = text_clean(title)
url = "" if b1.find("link") is None else b1.find("link").get_text()
date = "Sat, 12 Aug 2000 13:39:15 +05:30" if ((b1.find("pubDate") == "") or (b1.find("pubDate") is None)) else b1.find("pubDate").get_text()
if url.find("businesstoday.in") >= 0:
date = date.replace("GMT", "+0530")
date1 = parser.parse(date)
return pd.DataFrame({"title": title,
"url": url,
"parsed_date": date1}, index=[0])
def src_parse(rss):
"""
    Returns the root domain name (e.g. livemint.com is extracted from www.livemint.com).
:param rss: RSS URL
:return: str, string containing the source name
"""
if rss.find('ndtv') >= 0:
rss = 'ndtv.com'
if rss.find('bbc') >= 0:
rss = 'bbc.com'
if rss.find('huffpost.') >= 0:
rss = 'huffpost.com'
if rss.find('nytimes.') >= 0:
rss = 'nytimes.com'
rss = rss.replace("https://www.", "")
rss = rss.split("/")
return rss[0]
def news_agg(rss):
"""
    Fetches and parses the articles from a single 'rss' feed URL.
:param rss: RSS URL.
:return: Data frame of processed articles.
"""
try:
rss_df = pd.DataFrame()
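        # Browser-like request headers; some feed servers reject requests without a User-Agent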
headers = {
'authority': 'www.google.com',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'en-US,en;q=0.9',
'cache-control': 'max-age=0',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
timeout = 5
resp = r.get(rss, timeout=timeout, headers=headers)
logging.info(f'{rss}: {resp.status_code}')
b = BeautifulSoup(resp.content, "xml")
items = b.find_all("item")
for i in items:
rss_df = pd.concat([rss_df, rss_parser(i)], axis=0)
rss_df.reset_index(drop=True, inplace=True)
rss_df["src"] = src_parse(rss)
rss_df['news_age'] = rss_df["parsed_date"].apply(get_time_delta)
rss_df["parsed_date"] = rss_df["parsed_date"].map(lambda x: x.date).astype("str")
    except Exception as e:
        # resp may not exist if the request itself failed, so don't reference it here
        logging.warning(f"Couldn't process {rss}\nREASON: {e}")
return rss_df
# List of RSS feeds
rss = ['https://chaski.huffpost.com/us/auto/vertical/world-news',
'https://feeds.bbci.co.uk/news/world/rss.xml',
'https://rss.nytimes.com/services/xml/rss/nyt/World.xml',
'https://www.economictimes.indiatimes.com/rssfeedstopstories.cms',
'https://www.thehindu.com/news/feeder/default.rss',
'https://www.businesstoday.in/rssfeeds/?id=225346',
'https://feeds.feedburner.com/ndtvnews-latest',
'https://www.hindustantimes.com/feeds/rss/world-news/rssfeed.xml',
'https://www.indiatoday.in/rss/1206578',
'https://www.timesofindia.indiatimes.com/rssfeedmostrecent.cms']
def get_news_rss(url):
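    """
    Fetches a single RSS feed and returns a de-duplicated data frame of its articles.
    :param url: RSS feed URL.
    :return: Data frame of parsed articles with empty titles removed.
    """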
final_df = news_agg(url)
final_df.reset_index(drop=True, inplace=True)
final_df.drop_duplicates(subset='url', inplace=True)
final_df = final_df.loc[(final_df["title"] != ""), :].copy()
return final_df
def get_news_multi_process(urls):
    '''
    Fetches and parses all RSS feed URLs in parallel (one worker per feed)
    and concatenates the per-feed results into a single data frame.
    '''
pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
results = []
for url in urls:
        f = pool.apply_async(get_news_rss, [url])  # submit each feed URL to the pool; workers fetch and parse feeds in parallel
        results.append(f)  # keep the AsyncResult handle so the output can be collected below
final_df = pd.DataFrame()
for f in results:
final_df = pd.concat([final_df, f.get(timeout=120)], axis=0) # getting output of each parallel job
final_df.reset_index(drop=True, inplace=True)
logging.info(final_df['src'].unique())
pool.close()
pool.join()
return final_df
def get_news():
    return get_news_multi_process(rss)
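
# Example usage (a minimal sketch; the logging setup and printed summary are illustrative, not part of the original script).
# The __main__ guard also keeps multiprocessing's spawn start method from re-running this block in worker processes.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    news_df = get_news()
    print(news_df.head())
    print(f"{len(news_df)} articles from {news_df['src'].nunique()} sources")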