lalithadevi committed on
Commit
8a35125
1 Parent(s): 29b3836

Update news_extractor/news_extractor.py

Browse files
Files changed (1) hide show
  1. news_extractor/news_extractor.py +33 -5
news_extractor/news_extractor.py CHANGED
@@ -5,6 +5,7 @@ import requests as r
5
  import regex as re
6
  from dateutil import parser
7
  import logging
 
8
 
9
 
10
  def date_time_parser(dt):
@@ -128,11 +129,12 @@ rss = ['https://www.economictimes.indiatimes.com/rssfeedstopstories.cms',
128
  'https://www.timesofindia.indiatimes.com/rssfeedmostrecent.cms']
129
 
130
 
131
- def get_news():
132
- final_df = pd.DataFrame()
133
- for i in rss:
134
- # final_df = final_df.append(news_agg(i))
135
- final_df = pd.concat([final_df, news_agg(i)], axis=0)
 
136
  final_df.reset_index(drop=True, inplace=True)
137
 
138
  logging.warning(final_df['src'].unique())
@@ -147,3 +149,29 @@ def get_news():
147
  final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'description'] = final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'title']
148
 
149
  return final_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import regex as re
6
  from dateutil import parser
7
  import logging
8
+ import multiprocessing
9
 
10
 
11
  def date_time_parser(dt):
 
129
  'https://www.timesofindia.indiatimes.com/rssfeedmostrecent.cms']
130
 
131
 
132
+ def get_news_rss(url):
133
+ # final_df = pd.DataFrame()
134
+ # for i in rss:
135
+ # # final_df = final_df.append(news_agg(i))
136
+ # final_df = pd.concat([final_df, news_agg(i)], axis=0)
137
+ final_df = news_agg(url)
138
  final_df.reset_index(drop=True, inplace=True)
139
 
140
  logging.warning(final_df['src'].unique())
 
149
  final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'description'] = final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'title']
150
 
151
  return final_df
152
+
153
+ def get_news_multi_process(urls):
154
+ '''
155
+ Get the data shape by parallely calculating lenght of each chunk and
156
+ aggregating them to get lenght of complete training dataset
157
+ '''
158
+ pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
159
+
160
+ results = []
161
+ for url in urls:
162
+ f = pool.apply_async(get_news, [url]) # asynchronously applying function to chunk. Each worker parallely begins to work on the job
163
+ results.append(f) # appending result to results
164
+
165
+ final_df = pd.DataFrame()
166
+ for f in results:
167
+ # print(f.get())
168
+ final_df = pd.concat([final_df, f.get(timeout=120)], axis=0) # getting output of each parallel job
169
+
170
+ final_df.reset_index(drop=True, inplace=True)
171
+ pool.close()
172
+ pool.join()
173
+ return final_df
174
+
175
+
176
+ def get_news():
177
+ return get_data(rss)