File size: 5,724 Bytes
e857da4 127d8ad e857da4 4842df9 e857da4 8271332 4d0f06f e857da4 8567825 54f031d 4aab4a6 18c9a32 e857da4 196cf86 c3b44d2 8271332 c3b44d2 194bc0b e857da4 68c6e68 e857da4 7a67d7f 4d0f06f 7a67d7f f899b3b 4260545 e857da4 68c6e68 e857da4 b125f18 e857da4 88291b7 c5ed3ab b125f18 e857da4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests as r
import regex as re
from dateutil import parser
import logging
def date_time_parser(dt):
    """
    Computes the minutes elapsed since published time.

    The current time is taken in the same timezone as ``dt`` so that an
    aware value is compared with an aware "now" and a naive value with a
    naive "now".

    :param dt: publication time; a pandas ``Timestamp`` or a standard-library
        ``datetime.datetime``, timezone-aware or naive.
    :return: int, minutes elapsed (rounded).
    """
    # ``tzinfo`` exists on both datetime and Timestamp objects, whereas ``tz``
    # is pandas-only; using it keeps plain datetime inputs working too.
    current = dt.now(dt.tzinfo)
    elapsed_minutes = (current - dt).total_seconds() / 60
    return int(np.round(elapsed_minutes, 0))
def text_clean(desc):
    """
    Cleans the text by stripping HTML tags and decoding a few common entities.

    :param desc: string containing description
    :return: str, cleaned description.
    """
    # Turn escaped angle brackets into real ones first, so that markup which
    # arrived entity-encoded is removed by the tag-stripping step below.
    desc = desc.replace("&lt;", "<").replace("&gt;", ">")
    desc = re.sub("<.*?>", "", desc)
    # Full entities are listed before their bare "#NN;" fragments so that the
    # leading ampersand is not left behind in the output.
    replacements = (
        ("&#39;", "'"),
        ("#39;", "'"),
        ("&quot;", '"'),
        ("&nbsp;", " "),
        ("\xa0", " "),
        ("&#32;", " "),
        ("#32;", " "),
    )
    for token, plain in replacements:
        desc = desc.replace(token, plain)
    return desc
def rss_parser(i):
    """
    Returns a data frame of parsed news item.

    :param i: single news item in RSS feed.
    :return: Data frame of parsed news item with the columns ``title``,
        ``url``, ``description`` and ``parsed_date`` (one row, index 0).
    """
    item = BeautifulSoup(str(i), "xml")

    def _text_of(tag_name):
        # Text of the first matching child tag, or "" when the tag is absent.
        node = item.find(tag_name)
        return "" if node is None else node.get_text()

    title = text_clean(_text_of("title"))
    url = _text_of("link")
    desc = text_clean(_text_of("description"))
    # Long descriptions are cut to 300 characters and marked with an ellipsis.
    desc = f'{desc[:300]}...' if len(desc) >= 300 else desc
    # A missing OR empty pubDate falls back to a fixed old date; an empty
    # string would otherwise make the date parser raise.
    date = _text_of("pubDate").strip() or "Sat, 12 Aug 2000 13:39:15 +05:30"
    if url.find("businesstoday.in") >= 0:
        # For this source the "GMT" label is rewritten to a +0530 offset
        # before parsing.
        date = date.replace("GMT", "+0530")
    date1 = parser.parse(date)
    return pd.DataFrame({"title": title,
                         "url": url,
                         "description": desc,
                         "parsed_date": date1}, index=[0])
def src_parse(rss):
    """
    Returns the source name for an RSS URL.

    A few sources are mapped to fixed names; for every other URL the host is
    returned without its scheme and leading ``www.`` (e.g. ``livemint.com`` is
    extracted from ``https://www.livemint.com/rss/news``).

    :param rss: RSS URL
    :return: str, string containing the source name
    """
    # The more specific 'ndtvprofit' must be tested before 'ndtv', otherwise
    # the generic match would always win.
    if 'ndtvprofit' in rss:
        return 'ndtv profit'
    if 'ndtv' in rss:
        return 'ndtv.com'
    if 'telanganatoday' in rss:
        return 'telanganatoday.com'
    # Drop the scheme (if any), keep only the host part, then strip "www.".
    _, scheme_sep, remainder = rss.partition("://")
    host = (remainder if scheme_sep else rss).split("/")[0]
    return host[len("www."):] if host.startswith("www.") else host
def news_agg(rss):
    """
    Returns feeds from each 'rss' URL.

    The feed is downloaded, every ``<item>`` is parsed with ``rss_parser``,
    rows with a missing value are dropped, and the ``src`` and
    ``elapsed_time`` columns are added. This is best-effort: any failure is
    logged and an empty data frame is returned instead of raising.

    :param rss: RSS URL.
    :return: Data frame of processed articles (empty when the feed has no
        items or could not be processed).
    """
    headers = {
        'authority': 'www.google.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    timeout = 5
    try:
        resp = r.get(rss, timeout=timeout, headers=headers)
        logging.warning(f'{rss}: {resp.status_code}')
        soup = BeautifulSoup(resp.content, "xml")
        # Parse all items first and concatenate once; growing a frame inside
        # the loop re-copies every earlier row on each iteration.
        frames = [rss_parser(item) for item in soup.find_all("item")]
        if not frames:
            # Nothing to process; the column operations below need at least
            # one parsed item to exist.
            return pd.DataFrame()
        rss_df = pd.concat(frames, axis=0, ignore_index=True)
        # Placeholder/blank descriptions become NaN so dropna removes them.
        rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)
        rss_df.dropna(inplace=True)
        rss_df["src"] = src_parse(rss)
        rss_df["elapsed_time"] = rss_df["parsed_date"].apply(date_time_parser)
        rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
    except Exception:
        # Best-effort boundary: one broken feed must not abort the whole run,
        # but the failure is recorded with its traceback.
        logging.exception("Failed to aggregate feed %s", rss)
        return pd.DataFrame()
    return rss_df
# RSS feed URLs that are polled for news, one entry per source.
rss = [
    "https://www.economictimes.indiatimes.com/rssfeedstopstories.cms",
    "https://www.thehindu.com/news/feeder/default.rss",
    "https://telanganatoday.com/feed",
    "https://www.businesstoday.in/rssfeeds/?id=225346",
    "https://feeds.feedburner.com/ndtvnews-latest",
    "https://www.hindustantimes.com/feeds/rss/world-news/rssfeed.xml",
    "https://www.indiatoday.in/rss/1206578",
    "https://www.moneycontrol.com/rss/latestnews.xml",
    "https://www.livemint.com/rss/news",
    "https://www.zeebiz.com/latest.xml/feed",
    "https://www.timesofindia.indiatimes.com/rssfeedmostrecent.cms",
]
def get_news():
    """
    Collects articles from every feed in ``rss`` into one data frame.

    Articles are ordered by ``elapsed_time`` (smallest first), de-duplicated
    on ``description``, rows with an empty title are removed, and a blank
    description is replaced by the article's title.

    :return: Data frame with the columns ``title``, ``url``, ``description``,
        ``parsed_date`` and ``src``; empty when no feed produced any article.
    """
    # Collect per-feed frames and concatenate once instead of growing the
    # result inside the loop. Empty frames (failed or item-less feeds) are
    # skipped.
    frames = [feed_df for feed_df in (news_agg(feed_url) for feed_url in rss)
              if not feed_df.empty]
    if not frames:
        # No feed delivered anything: return an empty frame with the usual
        # columns rather than failing on the column lookups below.
        return pd.DataFrame(columns=["title", "url", "description", "parsed_date", "src"])

    final_df = pd.concat(frames, axis=0, ignore_index=True)
    logging.warning(final_df['src'].unique())
    final_df.sort_values(by="elapsed_time", inplace=True)
    final_df.drop(columns=['elapsed_time'], inplace=True)
    # Sorting happened first, so the copy kept for each duplicated
    # description is the one with the smallest elapsed time.
    final_df.drop_duplicates(subset='description', inplace=True)
    final_df = final_df.loc[(final_df["title"] != ""), :].copy()
    # Fall back to the title wherever the description is missing or blank.
    blank_desc = final_df['description'].isna() | final_df['description'].isin(['', ' '])
    final_df.loc[blank_desc, 'description'] = final_df.loc[blank_desc, 'title']
    return final_df
|