latest_news_backend_with_cat_pred_similar_news

Running

App Files Files Community

lalithadevi committed on May 28

Commit

4e08fbd

•

1 Parent(s): 24bd232

Update news_category_similar_news_prediction.py

Browse files

Files changed (1) hide show

news_category_similar_news_prediction.py +41 -14

news_category_similar_news_prediction.py CHANGED Viewed

@@ -81,6 +81,22 @@ def inference(text, interpreter, label_encoder, tokenizer):
 def cols_check(new_cols, old_cols):
     """Return True when both column sequences agree position by position.

     Pairs are formed with ``zip``, so only the leading
     ``min(len(new_cols), len(old_cols))`` positions are compared; a length
     difference alone never yields False, and two empty inputs yield True.
     """
     position_matches = [
         new_name == old_name
         for new_name, old_name in zip(new_cols, old_cols)
     ]
     return all(position_matches)
 def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.DataFrame, interpreter, label_encoder, tokenizer,
                          collection, vectorizer, sent_model, ce_model):
@@ -142,12 +158,16 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
             final_df['category'] = label
             final_df['pred_proba'] = prob
             final_df['similar_news'] = sim_news
-            final_df.reset_index(drop=True, inplace=True)
-            final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
-            final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
-            logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
-            final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
-            logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
         else:
             logger.warning('Prior predictions found in old news')
             if not cols_check([*new_news.columns], [*old_news.columns][:-3]):
@@ -170,17 +190,23 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
                 new_news['pred_proba'] = prob
                 new_news['similar_news'] = sim_news
                 final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
-                final_df.drop_duplicates(subset='url', keep='first', inplace=True)
-                final_df.reset_index(drop=True, inplace=True)
-                final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
-                final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
-                logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
-                final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
-                logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
             else:
                 logger.warning('INFO: Old & New Articles are the same. There is no requirement of updating them in the database. Database is not updated.')
                 db_updation_required = 0
                 final_df = old_news.copy()
         if len(final_df) == 0:
@@ -190,6 +216,7 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
     except Exception as e:
         logger.warning(f'Unexcpected error in predict_news_category()\n{e}')
         final_df = None
         db_updation_required = 0
-    return final_df, db_updation_required

 def cols_check(new_cols, old_cols):
     """Return True when both column sequences agree position by position.

     Pairs are formed with ``zip``, so only the leading
     ``min(len(new_cols), len(old_cols))`` positions are compared; a length
     difference alone never yields False, and two empty inputs yield True.
     """
     position_matches = [
         new_name == old_name
         for new_name, old_name in zip(new_cols, old_cols)
     ]
     return all(position_matches)
def process_prediction_df(df: pd.DataFrame, df_type: str = "production") -> pd.DataFrame:
    """Deduplicate a prediction frame and apply the category override rules.

    The input frame is not modified; all changes are made on a new frame.

    Args:
        df: Frame holding at least the 'url', 'title', 'category' and
            'pred_proba' columns.
        df_type: Label that only appears in the entry/exit log messages.

    Returns:
        A new frame with duplicate URLs dropped (first occurrence kept), a
        fresh 0..n-1 index, and 'category' rewritten by these rules, applied
        in order:
          1. 'pred_proba' below CLASSIFIER_THRESHOLD -> 'NATION'
          2. title contains 'Pakistan' and category is 'NATION' -> 'WORLD'
          3. title contains 'Zodiac Sign' or 'Horoscope' (any case) -> 'SCIENCE'
    """
    logger.warning(f"Entering process_prediction_df(): {df_type}")
    # drop_duplicates/reset_index return new objects, so the caller's frame
    # stays untouched and a single index reset is enough.
    df = df.drop_duplicates(subset='url', keep='first').reset_index(drop=True)

    # Every mask is built from `df` itself so its index lines up with the
    # freshly reset index of the frame being updated.
    low_confidence = df['pred_proba'] < CLASSIFIER_THRESHOLD
    df.loc[low_confidence, 'category'] = 'NATION'

    # na=False: a missing title matches no rule instead of putting NaN into
    # a boolean mask.
    title = df['title'].str
    mentions_pakistan = title.contains('Pakistan', na=False)
    # Rule 2 must run after rule 1: it also catches rows that were just
    # demoted to 'NATION' by the threshold fallback.
    df.loc[mentions_pakistan & (df['category'] == 'NATION'), 'category'] = 'WORLD'
    logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')

    is_astrology = (
        title.contains('Zodiac Sign', case=False, na=False)
        | title.contains('Horoscope', case=False, na=False)
    )
    df.loc[is_astrology, 'category'] = 'SCIENCE'
    logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
    logger.warning(f"Exiting process_prediction_df(): {df_type}")
    return df
 def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.DataFrame, interpreter, label_encoder, tokenizer,
                          collection, vectorizer, sent_model, ce_model):
             final_df['category'] = label
             final_df['pred_proba'] = prob
             final_df['similar_news'] = sim_news
+            # final_df.reset_index(drop=True, inplace=True)
+            # final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
+            # final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
+            # logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
+            # final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
+            # logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
+            final_df = process_prediction_df(final_df, df_type="production & archive")
+            prediction_df = final_df.copy()
         else:
             logger.warning('Prior predictions found in old news')
             if not cols_check([*new_news.columns], [*old_news.columns][:-3]):
                 new_news['pred_proba'] = prob
                 new_news['similar_news'] = sim_news
                 final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
+                final_df = process_prediction_df(final_df, df_type="production")
+                archive_df = new_news.copy()
+                archive_df = process_prediction_df(archive_df, df_type="archive")
+                # final_df.drop_duplicates(subset='url', keep='first', inplace=True)
+                # final_df.reset_index(drop=True, inplace=True)
+                # final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
+                # final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
+                # logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
+                # final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
+                # logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
             else:
                 logger.warning('INFO: Old & New Articles are the same. There is no requirement of updating them in the database. Database is not updated.')
                 db_updation_required = 0
                 final_df = old_news.copy()
+                archive_df = final_df.copy()
         if len(final_df) == 0:
     except Exception as e:
         logger.warning(f'Unexcpected error in predict_news_category()\n{e}')
         final_df = None
+        archive_df = final_df.copy()
         db_updation_required = 0
+    return final_df, archive_df, db_updation_required