ksvmuralidhar's picture
Add update and delete services code base
329d383 verified
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests as r
import regex as re
from dateutil import parser
import logging
import multiprocessing
from datetime import date
def get_time_delta(dt):
    """
    Returns the number of days between the fixed base date (2024-01-01) and 'dt'.
    :param dt: object exposing a .date() method (e.g. datetime or pandas Timestamp).
    :return: int, day difference; negative when 'dt' falls before the base date.
    """
    reference_day = date(2024, 1, 1)
    return (dt.date() - reference_day).days
def text_clean(desc):
    """
    Strips markup and leftover HTML entity fragments from a piece of text.
    :param desc: string containing description
    :return: str, the description without tags and with entities replaced.
    """
    # Decode escaped angle brackets first so that escaped markup is also
    # caught by the tag-stripping pass below.
    unescaped = desc.replace("&lt;", "<").replace("&gt;", ">")
    cleaned = re.sub("<.*?>", "", unescaped)

    # Remaining entity fragments, applied in this exact order.
    leftovers = (
        ("#39;", "'"),
        ("&quot;", '"'),
        ("&nbsp;", " "),
        ("#32;", " "),
    )
    for fragment, replacement in leftovers:
        cleaned = cleaned.replace(fragment, replacement)
    return cleaned
def rss_parser(i):
    """
    Returns a data frame of parsed news item.
    :param i: single news item in RSS feed.
    :return: Single-row data frame with columns 'title', 'url' and 'parsed_date'.
    """
    # Placeholder timestamp used when the item carries no usable publication date.
    fallback_date = "Sat, 12 Aug 2000 13:39:15 +05:30"

    item = BeautifulSoup(str(i), "xml")

    title_tag = item.find("title")
    title = text_clean("" if title_tag is None else title_tag.get_text())

    link_tag = item.find("link")
    url = "" if link_tag is None else link_tag.get_text()

    # Check the tag's TEXT for emptiness: comparing the tag object itself to ""
    # is never true, so an empty <pubDate/> used to reach parser.parse("") and raise.
    pub_tag = item.find("pubDate")
    pub_date = "" if pub_tag is None else pub_tag.get_text()
    if not pub_date.strip():
        pub_date = fallback_date

    # businesstoday.in timestamps labelled GMT are re-read as +0530
    # (presumably the feed publishes IST times under a GMT label - TODO confirm).
    if url.find("businesstoday.in") >= 0:
        pub_date = pub_date.replace("GMT", "+0530")

    parsed_date = parser.parse(pub_date)
    return pd.DataFrame({"title": title,
                         "url": url,
                         "parsed_date": parsed_date}, index=[0])
def src_parse(rss):
    """
    Returns the source name of an RSS URL (e.g. livemint.com is extracted from www.livemint.com).
    :param rss: RSS URL
    :return: str, string containing the source name
    """
    # Feeds whose URL does not reveal the publisher's own domain are mapped by
    # keyword; the first matching keyword wins.
    known_sources = (
        ('ndtv', 'ndtv.com'),
        ('bbc', 'bbc.com'),
        ('huffpost.', 'huffpost.com'),
        ('nytimes.', 'nytimes.com'),
    )
    for marker, source in known_sources:
        if rss.find(marker) >= 0:
            return source

    # Drop the scheme (any scheme, or none at all) and keep only the host part.
    # Previously only the literal prefix "https://www." was handled, so every
    # other URL shape produced 'https:' or 'http:' as the source name.
    host = rss.split("://", 1)[-1].split("/", 1)[0]
    if host.startswith("www."):
        host = host[len("www."):]
    return host
def news_agg(rss):
    """
    Returns feeds from each 'rss' URL.
    Any failure (request error, unparsable feed, feed without items) is logged as a
    warning and an empty data frame is returned instead of raising.
    :param rss: RSS URL.
    :return: Data frame of processed articles with columns 'title', 'url',
             'parsed_date' (date as string), 'src' and 'news_age'; an empty data
             frame without columns when the feed could not be processed.
    """
    # Browser-like request headers (presumably so that feed servers do not
    # reject the request as coming from a script).
    headers = {
        'authority': 'www.google.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    timeout = 5
    # Kept outside the try block so the warning below can always be formatted,
    # even when the request itself fails and no response object exists.
    status_code = None
    try:
        resp = r.get(rss, timeout=timeout, headers=headers)
        status_code = resp.status_code
        logging.info(f'{rss}: {status_code}')
        items = BeautifulSoup(resp.content, "xml").find_all("item")
        if not items:
            raise ValueError("feed contains no <item> entries")
        # Parse every item first and concatenate once instead of growing the
        # frame inside the loop.
        rss_df = pd.concat([rss_parser(item) for item in items], axis=0, ignore_index=True)
        rss_df["src"] = src_parse(rss)
        rss_df['news_age'] = rss_df["parsed_date"].apply(get_time_delta)
        # .date() must be CALLED: mapping the bare attribute only yields dates when
        # pandas applies the mapper to the whole index; element-wise it produces
        # bound-method objects that stringify to garbage.
        rss_df["parsed_date"] = rss_df["parsed_date"].map(lambda ts: ts.date()).astype("str")
    except Exception as e:
        logging.warning(f"Couldn't process {rss}\nSTATUS CODE: {status_code}\nREASON: {e}")
        # Never hand back a half-built frame with an inconsistent schema.
        return pd.DataFrame()
    return rss_df
# List of RSS feeds fetched by get_news(). news_agg() derives the 'src' value of
# every article from these URLs via src_parse().
rss = ['https://chaski.huffpost.com/us/auto/vertical/world-news',
'https://feeds.bbci.co.uk/news/world/rss.xml',
'https://rss.nytimes.com/services/xml/rss/nyt/World.xml',
'https://www.economictimes.indiatimes.com/rssfeedstopstories.cms',
'https://www.thehindu.com/news/feeder/default.rss',
'https://www.businesstoday.in/rssfeeds/?id=225346',
'https://feeds.feedburner.com/ndtvnews-latest',
'https://www.hindustantimes.com/feeds/rss/world-news/rssfeed.xml',
'https://www.indiatoday.in/rss/1206578',
'https://www.timesofindia.indiatimes.com/rssfeedmostrecent.cms']
def get_news_rss(url):
    """
    Returns the de-duplicated articles of a single RSS feed.
    :param url: RSS URL.
    :return: Data frame of articles with unique 'url' values and non-empty titles;
             an empty data frame when the feed produced nothing.
    """
    final_df = news_agg(url)
    # news_agg returns a column-less empty frame when a feed cannot be processed;
    # selecting the 'title' column on it would raise KeyError, so hand it back as is.
    if final_df.empty:
        return final_df
    final_df.reset_index(drop=True, inplace=True)
    final_df.drop_duplicates(subset='url', inplace=True)
    final_df = final_df.loc[(final_df["title"] != ""), :].copy()
    return final_df
def get_news_multi_process(urls):
    '''
    Fetches every feed in 'urls' in parallel worker processes and combines the results.
    :param urls: iterable of RSS URLs.
    :return: Data frame holding the articles of all feeds with a fresh index;
             an empty data frame when 'urls' is empty.
    '''
    # The context manager shuts the pool down even when a job raises or times
    # out, so worker processes are not leaked on the error path.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        # Submit all feeds first so the workers run concurrently ...
        pending = [pool.apply_async(get_news_rss, [url]) for url in urls]
        # ... then collect each result, waiting at most 120 seconds per job.
        frames = [job.get(timeout=120) for job in pending]

    if not frames:
        return pd.DataFrame()

    # Single concatenation instead of growing the frame once per feed.
    final_df = pd.concat(frames, axis=0, ignore_index=True)
    # 'src' is absent when no feed produced any article.
    if 'src' in final_df.columns:
        logging.info(final_df['src'].unique())
    return final_df
def get_news():
    """
    Returns the news gathered from every feed in the module-level 'rss' list.
    :return: Data frame produced by get_news_multi_process.
    """
    feed_urls = rss
    return get_news_multi_process(feed_urls)