lalithadevi committed on
Commit a92fa68
1 Parent(s): c9e162f

Update news_extractor/news_extractor.py

Files changed (1)
  1. news_extractor/news_extractor.py +64 -48
news_extractor/news_extractor.py CHANGED
@@ -18,7 +18,10 @@ def date_time_parser(dt):
     :param dt: date
     :return: int, minutes elapsed.
     """
-    return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
+    try:
+        return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
+    except:
+        return 100000
 
 def text_clean(desc):
     """
@@ -26,13 +29,16 @@ def text_clean(desc):
     :param desc: string containing description
     :return: str, cleaned description.
     """
-    desc = desc.replace("&lt;", "<")
-    desc = desc.replace("&gt;", ">")
-    desc = re.sub("<.*?>", "", desc)
-    desc = desc.replace("#39;", "'")
-    desc = desc.replace('&quot;', '"')
-    desc = desc.replace('&nbsp;', ' ')
-    desc = desc.replace('#32;', ' ')
+    try:
+        desc = desc.replace("&lt;", "<")
+        desc = desc.replace("&gt;", ">")
+        desc = re.sub("<.*?>", "", desc)
+        desc = desc.replace("#39;", "'")
+        desc = desc.replace('&quot;', '"')
+        desc = desc.replace('&nbsp;', ' ')
+        desc = desc.replace('#32;', ' ')
+    except:
+        desc = ""
     return desc
 
 
@@ -42,18 +48,22 @@ def rss_parser(i):
     :param i: single news item in RSS feed.
     :return: Data frame of parsed news item.
     """
-    b1 = BeautifulSoup(str(i), "xml")
-    title = "" if b1.find("title") is None else b1.find("title").get_text()
-    title = text_clean(title)
-    url = "" if b1.find("link") is None else b1.find("link").get_text()
-    desc = "" if b1.find("description") is None else b1.find("description").get_text()
-    desc = text_clean(desc)
-    desc = f'{desc[:300]}...' if len(desc) >= 300 else desc
-    date = "Sat, 12 Aug 2000 13:39:15 +05:30" if ((b1.find("pubDate") == "") or (b1.find("pubDate") is None)) else b1.find("pubDate").get_text()
-    if url.find("businesstoday.in") >= 0:
-        date = date.replace("GMT", "+0530")
-
-    date1 = parser.parse(date)
+    try:
+        b1 = BeautifulSoup(str(i), "xml")
+        title = "" if b1.find("title") is None else b1.find("title").get_text()
+        title = text_clean(title)
+        url = "" if b1.find("link") is None else b1.find("link").get_text()
+        desc = "" if b1.find("description") is None else b1.find("description").get_text()
+        desc = text_clean(desc)
+        desc = f'{desc[:300]}...' if len(desc) >= 300 else desc
+        date = "Sat, 12 Aug 2000 13:39:15 +05:30" if ((b1.find("pubDate") == "") or (b1.find("pubDate") is None)) else b1.find("pubDate").get_text()
+        if url.find("businesstoday.in") >= 0:
+            date = date.replace("GMT", "+0530")
+
+        date1 = parser.parse(date)
+    except Exception as e:
+        logger.warning(f'Skipping item {i} due to an error {e}')
+        return None
     return pd.DataFrame({"title": title,
                          "url": url,
                          "description": desc,
@@ -100,23 +110,21 @@ def news_agg(rss):
         b = BeautifulSoup(resp.content, "xml")
         items = b.find_all("item")
         for i in items:
-            rss_df = pd.concat([rss_df, rss_parser(i)], axis=0)
+            parsed_item = rss_parser(i)
+            if parsed_item is not None:
+                rss_df = pd.concat([rss_df, parsed_item], axis=0)
         rss_df.reset_index(drop=True, inplace=True)
         rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)
-
-
-        #### UNCOMMENT IN CASE OF OOM ERROR IN RENDER
-        # rss_df.dropna(inplace=True)
-
-        ####
-
 
         rss_df["src"] = src_parse(rss)
         rss_df["elapsed_time"] = rss_df["parsed_date"].apply(date_time_parser)
         rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
+
+        if len(rss_df) == 0:
+            rss_df = None
     except Exception as e:
-        logger.warning(f'Skipping {rss} due to an error {e}')
-        pass
+        logger.warning(f'Skipping {rss} feed extraction due to an error {e}')
+        return None
     return rss_df
 
 
@@ -125,25 +133,28 @@ rss = RSS_FEEDS_TO_EXTRACT
 
 
 def get_news_rss(url):
-    final_df = news_agg(url)
-    final_df.reset_index(drop=True, inplace=True)
-
-    final_df.sort_values(by="elapsed_time", inplace=True)
-    final_df.drop(columns=['elapsed_time'], inplace=True)
-
-
-    #### UNCOMMENT 1ST STATEMENT AND REMOVE 2ND STATEMENT IN CASE OF OOM ERROR IN RENDER
-    # final_df.drop_duplicates(subset='description', inplace=True)
-    final_df.drop_duplicates(subset='url', inplace=True)
-
-    ####
-
-    final_df = final_df.loc[(final_df["title"] != ""), :].copy()
+    '''
+    Function that is used in multiprocessing
+    '''
+    try:
+        final_df = news_agg(url)
+        if final_df is not None:
+            final_df.reset_index(drop=True, inplace=True)
+
+            final_df.sort_values(by="elapsed_time", inplace=True)
+            final_df.drop(columns=['elapsed_time'], inplace=True)
 
-    final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'description'] = final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'title']
-
+            final_df.drop_duplicates(subset='url', inplace=True)
+
+            final_df = final_df.loc[(final_df["title"] != ""), :].copy()
+
+            final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'description'] = final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'title']
+    except Exception as e:
+        logger.warning(f'Skipping {url} feed processing due to an error {e}')
+        return None
     return final_df
 
+
 def get_news_multi_process(urls):
     logger.warning('Entering get_news_multi_process() to read news')
     '''
@@ -159,13 +170,18 @@ def get_news_multi_process(urls):
 
     final_df = pd.DataFrame()
     for f in results:
-        final_df = pd.concat([final_df, f.get(timeout=120)], axis=0) # getting output of each parallel job
+        rss_df = f.get(timeout=120)
+        if rss_df is not None:
+            final_df = pd.concat([final_df, rss_df], axis=0) # getting output of each parallel job
 
     final_df.reset_index(drop=True, inplace=True)
-    logging.warning(final_df['src'].unique())
     pool.close()
     pool.join()
     logger.warning('Exiting get_news_multi_process()')
+
+    if len(final_df) == 0:
+        final_df = None
+
     return final_df
 
 
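
Taken together, the commit makes every stage of the extractor fail soft: rss_parser(), news_agg(), and get_news_rss() now return None when parsing fails, and get_news_multi_process() skips those None results and itself returns None if nothing was extracted. The sketch below is not part of the commit; it is a minimal, hypothetical caller that only illustrates the None checks the reworked functions now require, assuming the module exposes get_news_multi_process and RSS_FEEDS_TO_EXTRACT as importable names.

# Hypothetical caller script -- not part of this commit.
from news_extractor.news_extractor import get_news_multi_process, RSS_FEEDS_TO_EXTRACT

if __name__ == "__main__":  # guard needed because get_news_multi_process() spawns a process pool
    news_df = get_news_multi_process(RSS_FEEDS_TO_EXTRACT)

    if news_df is None:
        # Every feed failed or produced no items, so there is nothing to show.
        print("No news could be extracted")
    else:
        # Columns built by rss_parser()/news_agg() per the diff: title, url, description, parsed_date, src.
        print(news_df[["title", "src", "parsed_date"]].head())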