lalithadevi committed on
Commit
8fbf7cb
1 Parent(s): f740095

Update news_extractor/news_extractor.py

Browse files
Files changed (1) hide show
  1. news_extractor/news_extractor.py +16 -2
news_extractor/news_extractor.py CHANGED
@@ -102,7 +102,14 @@ def news_agg(rss):
102
  rss_df = pd.concat([rss_df, rss_parser(i)], axis=0)
103
  rss_df.reset_index(drop=True, inplace=True)
104
  rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)
105
- rss_df.dropna(inplace=True)
 
 
 
 
 
 
 
106
  rss_df["src"] = src_parse(rss)
107
  rss_df["elapsed_time"] = rss_df["parsed_date"].apply(date_time_parser)
108
  rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
@@ -143,7 +150,14 @@ def get_news_rss(url):
143
  # final_df['src_time'] = final_df['src'] + (" " * 5) + final_df["elapsed_time_str"]
144
  # final_df.drop(columns=['date', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
145
  final_df.drop(columns=['elapsed_time'], inplace=True)
146
- final_df.drop_duplicates(subset='description', inplace=True)
 
 
 
 
 
 
 
147
  final_df = final_df.loc[(final_df["title"] != ""), :].copy()
148
 
149
  final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'description'] = final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'title']
 
102
  rss_df = pd.concat([rss_df, rss_parser(i)], axis=0)
103
  rss_df.reset_index(drop=True, inplace=True)
104
  rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)
105
+
106
+
107
+ #### UNCOMMENT IN CASE OF OOM ERROR IN RENDER
108
+ # rss_df.dropna(inplace=True)
109
+
110
+ ####
111
+
112
+
113
  rss_df["src"] = src_parse(rss)
114
  rss_df["elapsed_time"] = rss_df["parsed_date"].apply(date_time_parser)
115
  rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
 
150
  # final_df['src_time'] = final_df['src'] + (" " * 5) + final_df["elapsed_time_str"]
151
  # final_df.drop(columns=['date', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
152
  final_df.drop(columns=['elapsed_time'], inplace=True)
153
+
154
+
155
+ #### UNCOMMENT 1ST AND REMOVE 2ND STATEMENT IN CASE OF OOM ERROR IN RENDER
156
+ # final_df.drop_duplicates(subset='description', inplace=True)
157
+ final_df.drop_duplicates(subset='url', inplace=True)
158
+
159
+ ####
160
+
161
  final_df = final_df.loc[(final_df["title"] != ""), :].copy()
162
 
163
  final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'description'] = final_df.loc[(final_df['description'].isna()) | (final_df['description']=='')| (final_df['description']==' '), 'title']