lalithadevi committed
Commit a92fa68
1 Parent(s): c9e162f

Update news_extractor/news_extractor.py

news_extractor/news_extractor.py CHANGED (+64 -48)
@@ -18,7 +18,10 @@ def date_time_parser(dt):
     :param dt: date
     :return: int, minutes elapsed.
     """
-    return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
+    try:
+        return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
+    except:
+        return 100000
 
 def text_clean(desc):
     """
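The happy path that the new try/except guards computes minutes elapsed since publication; the 100000-minute fallback sorts unparseable dates last in the later elapsed_time sort. A standalone sketch of that computation (the sample date is arbitrary, not from the repo):

    import numpy as np
    from dateutil import parser

    dt = parser.parse("Sat, 12 Aug 2023 13:39:15 +0530")
    # dt.now(dt.tz) reaches the datetime.now() classmethod through the
    # instance and reuses its tzinfo, so the subtraction stays tz-aware
    elapsed = int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))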
@@ -26,13 +29,16 @@ def text_clean(desc):
     :param desc: string containing description
     :return: str, cleaned description.
     """
-    desc = desc.replace("&lt;", "<")
-    desc = desc.replace("&gt;", ">")
-    desc = re.sub("<.*?>", "", desc)
-    desc = desc.replace("#39;", "'")
-    desc = desc.replace('&quot;', '"')
-    desc = desc.replace('&nbsp;', ' ')
-    desc = desc.replace('#32;', ' ')
+    try:
+        desc = desc.replace("&lt;", "<")
+        desc = desc.replace("&gt;", ">")
+        desc = re.sub("<.*?>", "", desc)
+        desc = desc.replace("#39;", "'")
+        desc = desc.replace('&quot;', '"')
+        desc = desc.replace('&nbsp;', ' ')
+        desc = desc.replace('#32;', ' ')
+    except:
+        desc = ""
     return desc
 
 
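A quick check of what the cleaning pass does to a typical entity-laden RSS description (hypothetical input; assumes the module is importable under the path shown in the commit):

    from news_extractor.news_extractor import text_clean

    raw = "&lt;p&gt;Sensex ends higher&nbsp;#39;today#32;&lt;/p&gt;"
    print(text_clean(raw))  # -> "Sensex ends higher 'today "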
@@ -42,18 +48,22 @@ def rss_parser(i):
     :param i: single news item in RSS feed.
     :return: Data frame of parsed news item.
     """
-    b1 = BeautifulSoup(str(i), "xml")
-    title = "" if b1.find("title") is None else b1.find("title").get_text()
-    title = text_clean(title)
-    url = "" if b1.find("link") is None else b1.find("link").get_text()
-    desc = "" if b1.find("description") is None else b1.find("description").get_text()
-    desc = text_clean(desc)
-    desc = f'{desc[:300]}...' if len(desc) >= 300 else desc
-    date = "Sat, 12 Aug 2000 13:39:15 +05:30" if ((b1.find("pubDate") == "") or (b1.find("pubDate") is None)) else b1.find("pubDate").get_text()
-    if url.find("businesstoday.in") >= 0:
-        date = date.replace("GMT", "+0530")
-
-    date1 = parser.parse(date)
+    try:
+        b1 = BeautifulSoup(str(i), "xml")
+        title = "" if b1.find("title") is None else b1.find("title").get_text()
+        title = text_clean(title)
+        url = "" if b1.find("link") is None else b1.find("link").get_text()
+        desc = "" if b1.find("description") is None else b1.find("description").get_text()
+        desc = text_clean(desc)
+        desc = f'{desc[:300]}...' if len(desc) >= 300 else desc
+        date = "Sat, 12 Aug 2000 13:39:15 +05:30" if ((b1.find("pubDate") == "") or (b1.find("pubDate") is None)) else b1.find("pubDate").get_text()
+        if url.find("businesstoday.in") >= 0:
+            date = date.replace("GMT", "+0530")
+
+        date1 = parser.parse(date)
+    except Exception as e:
+        logger.warning(f'Skipping item {i} due to an error {e}')
+        return None
     return pd.DataFrame({"title": title,
                          "url": url,
                          "description": desc,
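With the new error path, rss_parser() returns None instead of raising on a malformed item, so callers must treat the result as optional. A hand-rolled probe (hypothetical snippet, assuming the module import path):

    from bs4 import BeautifulSoup
    from news_extractor.news_extractor import rss_parser

    item = BeautifulSoup(
        "<item><title>Markets rally</title>"
        "<link>https://example.com/a</link>"
        "<description>Indices closed higher.</description>"
        "<pubDate>Sat, 12 Aug 2023 13:39:15 +0530</pubDate></item>",
        "xml").find("item")
    df = rss_parser(item)  # DataFrame on success, None if parsing raised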
@@ -100,23 +110,21 @@ def news_agg(rss):
         b = BeautifulSoup(resp.content, "xml")
         items = b.find_all("item")
         for i in items:
-            rss_df = pd.concat([rss_df, rss_parser(i)], axis=0)
+            parsed_item = rss_parser(i)
+            if parsed_item is not None:
+                rss_df = pd.concat([rss_df, parsed_item], axis=0)
         rss_df.reset_index(drop=True, inplace=True)
         rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)
-
-
-        #### UNCOMMENT IN CASE OF OOM ERROR IN RENDER
-        # rss_df.dropna(inplace=True)
-
-        ####
-
 
         rss_df["src"] = src_parse(rss)
         rss_df["elapsed_time"] = rss_df["parsed_date"].apply(date_time_parser)
         rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
+
+        if len(rss_df) == 0:
+            rss_df = None
     except Exception as e:
-        logger.warning(f'Skipping {rss} due to an error {e}')
-
+        logger.warning(f'Skipping {rss} feed extraction due to an error {e}')
+        return None
     return rss_df
 
 
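The replace() call normalises the feed's placeholder descriptions (" NULL" or empty) to NaN, which the description backfill in get_news_rss() keys on. In isolation, with sample data:

    import numpy as np
    import pandas as pd

    rss_df = pd.DataFrame({"description": [" NULL", "", "Indices closed higher."]})
    rss_df["description"] = rss_df["description"].replace([" NULL", ""], np.nan)
    print(rss_df["description"].isna().tolist())  # [True, True, False]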
@@ -125,25 +133,28 @@ rss = RSS_FEEDS_TO_EXTRACT
 
 
 def get_news_rss(url):
-    final_df = news_agg(url)
-    final_df.reset_index(drop=True, inplace=True)
-
-    final_df.sort_values(by="elapsed_time", inplace=True)
-    final_df.drop(columns=['elapsed_time'], inplace=True)
-
-    final_df.drop_duplicates(subset='url', inplace=True)
-
-    #### UNCOMMENT IN CASE OF OOM ERROR IN RENDER
-    # final_df.dropna(inplace=True)
-
-    ####
-
-    final_df = final_df.loc[(final_df["title"] != ""), :].copy()
+    '''
+    Function that is used in multiprocessing
+    '''
+    try:
+        final_df = news_agg(url)
+        if final_df is not None:
+            final_df.reset_index(drop=True, inplace=True)
+
+            final_df.sort_values(by="elapsed_time", inplace=True)
+            final_df.drop(columns=['elapsed_time'], inplace=True)
 
-    final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'description'] = final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'title']
-
+            final_df.drop_duplicates(subset='url', inplace=True)
+
+            final_df = final_df.loc[(final_df["title"] != ""), :].copy()
+
+            final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'description'] = final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'title']
+    except Exception as e:
+        logger.warning(f'Skipping {url} feed processing due to an error {e}')
+        return None
     return final_df
 
+
 def get_news_multi_process(urls):
     logger.warning('Entering get_news_multi_process() to read news')
     '''
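The long backfill line copies the title into rows whose description is NaN, empty, or a single space. The same logic reads easier with the boolean mask named once (equivalent sketch on sample data):

    import numpy as np
    import pandas as pd

    final_df = pd.DataFrame({"title": ["A", "B", "C"],
                             "description": [np.nan, "", "Indices closed higher."]})
    mask = (final_df["description"].isna()
            | (final_df["description"] == "")
            | (final_df["description"] == " "))
    final_df.loc[mask, "description"] = final_df.loc[mask, "title"]
    print(final_df["description"].tolist())  # ['A', 'B', 'Indices closed higher.']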
@@ -159,13 +170,18 @@ def get_news_multi_process(urls):
 
     final_df = pd.DataFrame()
     for f in results:
-        final_df = pd.concat([final_df, f.get(timeout=120)], axis=0)
+        rss_df = f.get(timeout=120)
+        if rss_df is not None:
+            final_df = pd.concat([final_df, rss_df], axis=0) # getting output of each parallel job
 
     final_df.reset_index(drop=True, inplace=True)
-    logging.warning(final_df['src'].unique())
     pool.close()
     pool.join()
     logger.warning('Exiting get_news_multi_process()')
+
+    if len(final_df) == 0:
+        final_df = None
+
     return final_df
 
 
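The hunk shows only the collection loop; pool and results come from setup code outside the diff. A self-contained sketch of the presumed apply_async pattern (the Pool setup and the fetch stub are assumptions, standing in for get_news_rss):

    from multiprocessing import Pool

    import pandas as pd

    def fetch(url):  # stub worker standing in for get_news_rss
        return None if "bad" in url else pd.DataFrame({"url": [url]})

    if __name__ == "__main__":
        urls = ["https://example.com/a.rss", "https://bad.example/b.rss"]
        with Pool(2) as pool:
            results = [pool.apply_async(fetch, (u,)) for u in urls]
            # get(timeout=120) bounds the wait per feed; None results are skipped
            frames = [f.get(timeout=120) for f in results]
        final_df = pd.concat([f for f in frames if f is not None], ignore_index=True)
        print(len(final_df))  # 1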