lalithadevi committed on
Commit
4e08fbd
1 Parent(s): 24bd232

Update news_category_similar_news_prediction.py

Browse files
news_category_similar_news_prediction.py CHANGED
@@ -81,6 +81,22 @@ def inference(text, interpreter, label_encoder, tokenizer):
81
  def cols_check(new_cols, old_cols):
82
  return all([new_col==old_col for new_col, old_col in zip(new_cols, old_cols)])
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.DataFrame, interpreter, label_encoder, tokenizer,
86
  collection, vectorizer, sent_model, ce_model):
@@ -142,12 +158,16 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
142
  final_df['category'] = label
143
  final_df['pred_proba'] = prob
144
  final_df['similar_news'] = sim_news
145
- final_df.reset_index(drop=True, inplace=True)
146
- final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
147
- final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
148
- logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
149
- final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
150
- logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
 
 
 
 
151
  else:
152
  logger.warning('Prior predictions found in old news')
153
  if not cols_check([*new_news.columns], [*old_news.columns][:-3]):
@@ -170,17 +190,23 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
170
  new_news['pred_proba'] = prob
171
  new_news['similar_news'] = sim_news
172
  final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
173
- final_df.drop_duplicates(subset='url', keep='first', inplace=True)
174
- final_df.reset_index(drop=True, inplace=True)
175
- final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
176
- final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
177
- logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
178
- final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
179
- logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
 
 
 
 
 
180
  else:
181
  logger.warning('INFO: Old & New Articles are the same. There is no requirement of updating them in the database. Database is not updated.')
182
  db_updation_required = 0
183
  final_df = old_news.copy()
 
184
 
185
 
186
  if len(final_df) == 0:
@@ -190,6 +216,7 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
190
  except Exception as e:
191
  logger.warning(f'Unexcpected error in predict_news_category()\n{e}')
192
  final_df = None
 
193
  db_updation_required = 0
194
- return final_df, db_updation_required
195
 
 
81
  def cols_check(new_cols, old_cols):
82
  return all([new_col==old_col for new_col, old_col in zip(new_cols, old_cols)])
83
 
84
def process_prediction_df(df: pd.DataFrame, df_type: str = "production") -> pd.DataFrame:
    """Deduplicate a prediction frame and apply the category override rules.

    The input is copied first, so the caller's frame is never mutated.

    Steps, in order:
      1. Drop rows with a repeated ``url`` (first occurrence wins) and
         renumber the index.
      2. Rows whose ``pred_proba`` is below ``CLASSIFIER_THRESHOLD`` get
         ``category = 'NATION'``.
      3. Rows with ``'Pakistan'`` in ``title`` (case-sensitive) that are
         currently ``'NATION'`` become ``'WORLD'``.
      4. Rows with ``'Zodiac Sign'`` or ``'Horoscope'`` in ``title``
         (case-insensitive) become ``'SCIENCE'``.

    Args:
        df: Frame with at least the columns ``url``, ``title``,
            ``category`` and ``pred_proba``.
        df_type: Free-text label used only in the entry/exit log lines.

    Returns:
        A new, processed DataFrame.
    """
    logger.warning(f"Entering process_prediction_df(): {df_type}")
    df = df.copy()
    df.drop_duplicates(subset='url', keep='first', inplace=True)
    # Renumber once after dropping rows so the masks below share a clean index.
    df.reset_index(drop=True, inplace=True)

    # The mask must come from `df` itself; the earlier `final_df` reference
    # was a leftover from the inlined code and is not defined in this scope.
    df.loc[df['pred_proba'] < CLASSIFIER_THRESHOLD, 'category'] = 'NATION'

    # na=False: a missing title simply does not match, instead of producing
    # an NA mask that boolean indexing rejects.
    mentions_pakistan = df['title'].str.contains('Pakistan', na=False)
    df.loc[mentions_pakistan & (df['category'] == 'NATION'), 'category'] = 'WORLD'
    logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')

    mentions_zodiac = df['title'].str.contains('Zodiac Sign', case=False, na=False)
    mentions_horoscope = df['title'].str.contains('Horoscope', case=False, na=False)
    df.loc[mentions_zodiac | mentions_horoscope, 'category'] = 'SCIENCE'
    logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')

    logger.warning(f"Exiting process_prediction_df(): {df_type}")
    return df
99
+
100
 
101
  def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.DataFrame, interpreter, label_encoder, tokenizer,
102
  collection, vectorizer, sent_model, ce_model):
 
158
  final_df['category'] = label
159
  final_df['pred_proba'] = prob
160
  final_df['similar_news'] = sim_news
161
+ # final_df.reset_index(drop=True, inplace=True)
162
+ # final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
163
+ # final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
164
+ # logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
165
+ # final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
166
+ # logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
167
+
168
+ final_df = process_prediction_df(final_df, df_type="production & archive")
169
+ prediction_df = final_df.copy()
170
+
171
  else:
172
  logger.warning('Prior predictions found in old news')
173
  if not cols_check([*new_news.columns], [*old_news.columns][:-3]):
 
190
  new_news['pred_proba'] = prob
191
  new_news['similar_news'] = sim_news
192
  final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
193
+
194
+ final_df = process_prediction_df(final_df, df_type="production")
195
+ archive_df = new_news.copy()
196
+ archive_df = process_prediction_df(archive_df, df_type="archive")
197
+
198
+ # final_df.drop_duplicates(subset='url', keep='first', inplace=True)
199
+ # final_df.reset_index(drop=True, inplace=True)
200
+ # final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
201
+ # final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
202
+ # logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
203
+ # final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
204
+ # logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
205
  else:
206
  logger.warning('INFO: Old & New Articles are the same. There is no requirement of updating them in the database. Database is not updated.')
207
  db_updation_required = 0
208
  final_df = old_news.copy()
209
+ archive_df = final_df.copy()
210
 
211
 
212
  if len(final_df) == 0:
 
216
  except Exception as e:
217
  logger.warning(f'Unexcpected error in predict_news_category()\n{e}')
218
  final_df = None
219
+ archive_df = final_df.copy()
220
  db_updation_required = 0
221
+ return final_df, archive_df, db_updation_required
222