Commit
•
960de68
1
Parent(s):
24f1253
Delete news_extractor
Browse files- news_extractor/__init__.py +0 -1
- news_extractor/news_extractor.py +0 -192
news_extractor/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
from news_extractor.news_extractor import *
|
|
|
|
news_extractor/news_extractor.py
DELETED
@@ -1,192 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
import numpy as np
|
3 |
-
from bs4 import BeautifulSoup
|
4 |
-
import requests as r
|
5 |
-
import regex as re
|
6 |
-
from dateutil import parser
|
7 |
-
import logging
|
8 |
-
import multiprocessing
|
9 |
-
|
10 |
-
|
11 |
-
def date_time_parser(dt):
    """
    Computes the minutes elapsed since the published time.

    :param dt: timezone-aware publication time (``datetime`` or ``pandas.Timestamp``).
    :return: int, minutes elapsed between now and ``dt``.
    """
    # ``dt.now(...)`` resolves to the classmethod of dt's own type, so this works
    # for both ``datetime`` and ``pandas.Timestamp``. Using ``tzinfo`` (instead of
    # the pandas-only ``.tz`` attribute) keeps the "now" anchor in dt's own
    # timezone while also accepting plain datetime objects.
    return int(np.round((dt.now(dt.tzinfo) - dt).total_seconds() / 60, 0))
|
18 |
-
|
19 |
-
def text_clean(desc):
    """
    Cleans a feed description: decodes common HTML entities and strips tags.

    :param desc: string containing the raw description markup.
    :return: str, cleaned plain-text description.
    """
    # NOTE(review): in the deleted revision these calls had degenerated into
    # no-ops (e.g. replace("<", "<"), replace(' ', ' ')) — the HTML-entity
    # names were evidently unescaped by a rendering step. The entity forms
    # below restore the evident intent; confirm against the original history.
    #
    # Decode angle-bracket entities first so embedded markup such as
    # "&lt;b&gt;" becomes a real tag and is removed by the tag-stripping regex.
    desc = desc.replace("&lt;", "<")
    desc = desc.replace("&gt;", ">")
    desc = re.sub("<.*?>", "", desc)                        # drop remaining HTML tags
    desc = desc.replace("&#39;", "'").replace("#39;", "'")  # apostrophe (incl. '&'-stripped form)
    desc = desc.replace("&quot;", '"')                      # double quote
    desc = desc.replace("&nbsp;", " ")                      # non-breaking space
    desc = desc.replace("&#32;", " ").replace("#32;", " ")  # explicit space
    return desc
|
33 |
-
|
34 |
-
|
35 |
-
def rss_parser(i):
    """
    Parses a single RSS ``<item>`` element into a one-row data frame.

    :param i: single news item (element) from an RSS feed.
    :return: Data frame with columns title/url/description/parsed_date.
    """
    soup = BeautifulSoup(str(i), "xml")

    def _tag_text(tag_name):
        # Empty string when the tag is absent so downstream cleaning never
        # has to deal with None.
        node = soup.find(tag_name)
        return "" if node is None else node.get_text()

    title = text_clean(_tag_text("title"))
    url = _tag_text("link")

    desc = text_clean(_tag_text("description"))
    if len(desc) >= 300:
        # Truncate long descriptions with an ellipsis.
        desc = f'{desc[:300]}...'

    pub = soup.find("pubDate")
    if pub is None or pub == "":
        # Sentinel date for items that carry no publication time.
        date = "Sat, 12 Aug 2000 13:39:15 +05:30"
    else:
        date = pub.get_text()
    if url.find("businesstoday.in") >= 0:
        # businesstoday.in labels IST timestamps as GMT; rewrite the offset.
        date = date.replace("GMT", "+0530")

    parsed = parser.parse(date)
    return pd.DataFrame({"title": title,
                         "url": url,
                         "description": desc,
                         "parsed_date": parsed}, index=[0])
|
57 |
-
|
58 |
-
|
59 |
-
def src_parse(rss):
    """
    Returns the root domain name (eg. livemint.com is extracted from www.livemint.com).

    :param rss: RSS URL.
    :return: str, string containing the source name.
    """
    # Early returns fix a bug in the original chain of independent ifs: the
    # 'ndtvprofit' result ('ndtv profit') was immediately clobbered by the
    # following 'ndtv' branch, making the first branch dead code.
    if rss.find('ndtvprofit') >= 0:
        return 'ndtv profit'
    if rss.find('ndtv') >= 0:
        return 'ndtv.com'
    if rss.find('telanganatoday') >= 0:
        return 'telanganatoday.com'

    # Strip the scheme whether or not the host carries the "www." prefix, so a
    # feed like https://feeds.feedburner.com/... yields the host, not "https:".
    rss = rss.replace("https://www.", "")
    rss = rss.replace("https://", "")
    return rss.split("/")[0]
|
75 |
-
|
76 |
-
|
77 |
-
def news_agg(rss):
    """
    Downloads one RSS feed and returns its items as a processed data frame.

    :param rss: RSS URL.
    :return: Data frame of processed articles (columns: title, url,
             description, parsed_date, src, elapsed_time). May be empty or
             partial if the request or parsing fails.
    """
    # Created before the try block so the function can always return a frame,
    # even when the network call below fails.
    rss_df = pd.DataFrame()
    try:
        # Browser-like headers: some feeds reject obviously non-browser clients.
        headers = {
            'authority': 'www.google.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'en-US,en;q=0.9',
            'cache-control': 'max-age=0',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
        }

        timeout = 5

        resp = r.get(rss, timeout=timeout, headers=headers)
        logging.warning(f'{rss}: {resp.status_code}')
        b = BeautifulSoup(resp.content, "xml")
        items = b.find_all("item")
        for i in items:
            rss_df = pd.concat([rss_df, rss_parser(i)], axis=0)
        rss_df.reset_index(drop=True, inplace=True)
        # Normalise placeholder descriptions to NaN so they can be backfilled
        # from the title later.
        rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)

        #### UNCOMMENT IN CASE OF OOM ERROR IN RENDER
        # rss_df.dropna(inplace=True)
        ####

        rss_df["src"] = src_parse(rss)
        rss_df["elapsed_time"] = rss_df["parsed_date"].apply(date_time_parser)
        rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
    except Exception:
        # Best-effort: one failing feed must not break the aggregation, but
        # log the full traceback (the original bare print(e) hid it).
        logging.exception(f'news_agg failed for {rss}')
    return rss_df
|
121 |
-
|
122 |
-
|
123 |
-
# List of RSS feeds polled by get_news(); one URL per news source.
rss = ['https://www.economictimes.indiatimes.com/rssfeedstopstories.cms',
       'https://www.thehindu.com/news/feeder/default.rss',
       # 'https://telanganatoday.com/feed',
       'https://www.businesstoday.in/rssfeeds/?id=225346',
       'https://feeds.feedburner.com/ndtvnews-latest',
       'https://www.hindustantimes.com/feeds/rss/world-news/rssfeed.xml',
       'https://www.indiatoday.in/rss/1206578',

       'https://www.moneycontrol.com/rss/latestnews.xml',
       'https://www.livemint.com/rss/news',

       'https://www.zeebiz.com/latest.xml/feed',
       'https://www.timesofindia.indiatimes.com/rssfeedmostrecent.cms']
|
137 |
-
|
138 |
-
|
139 |
-
def get_news_rss(url):
    """
    Fetches one RSS feed and returns a cleaned, recency-ordered data frame.

    :param url: RSS feed URL.
    :return: Data frame of articles, newest first, deduplicated by URL.
    """
    feed_df = news_agg(url)
    feed_df.reset_index(drop=True, inplace=True)

    # Newest items (smallest elapsed time) first; the helper column is then
    # no longer needed.
    feed_df.sort_values(by="elapsed_time", inplace=True)
    feed_df.drop(columns=['elapsed_time'], inplace=True)

    #### UNCOMMENT 1ST STATEMENT AND REMOVE 2ND STATEMENT IN CASE OF OOM ERROR IN RENDER
    # feed_df.drop_duplicates(subset='description', inplace=True)
    feed_df.drop_duplicates(subset='url', inplace=True)
    ####

    # Items without a title are unusable downstream.
    feed_df = feed_df.loc[(feed_df["title"] != ""), :].copy()

    # Backfill missing/blank descriptions from the title.
    blank_desc = (feed_df['description'].isna()
                  | (feed_df['description'] == '')
                  | (feed_df['description'] == ' '))
    feed_df.loc[blank_desc, 'description'] = feed_df.loc[blank_desc, 'title']

    return feed_df
|
166 |
-
|
167 |
-
def get_news_multi_process(urls):
    '''
    Fetches every RSS feed in ``urls`` in parallel worker processes and
    concatenates the per-feed data frames into one.

    :param urls: iterable of RSS feed URLs.
    :return: Data frame of all fetched articles.
    '''
    # One worker per CPU. The context manager guarantees the pool is torn
    # down even if a worker raises (the original leaked the pool on errors);
    # close()/join() inside still give the normal graceful shutdown.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        # Submit every feed up front so the downloads overlap.
        results = [pool.apply_async(get_news_rss, [url]) for url in urls]

        final_df = pd.DataFrame()
        for f in results:
            # 120 s cap per feed so one hung source cannot stall the whole run.
            final_df = pd.concat([final_df, f.get(timeout=120)], axis=0)

        pool.close()
        pool.join()

    final_df.reset_index(drop=True, inplace=True)
    logging.warning(final_df['src'].unique())
    return final_df
|
189 |
-
|
190 |
-
|
191 |
-
def get_news():
    """
    Fetches all feeds in the module-level ``rss`` list in parallel and
    returns the combined data frame of articles.
    """
    return get_news_multi_process(rss)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|