lalithadevi
committed on
Commit
•
4e08fbd
1
Parent(s):
24bd232
Update news_category_similar_news_prediction.py
Browse files
news_category_similar_news_prediction.py
CHANGED
@@ -81,6 +81,22 @@ def inference(text, interpreter, label_encoder, tokenizer):
|
|
81 |
def cols_check(new_cols, old_cols):
    """Return True when the two column sequences agree position by position.

    Only the overlapping prefix is compared: ``zip`` stops at the shorter
    sequence, so a difference in length alone never makes this return False.
    Two empty sequences compare as equal.
    """
    return all(new_col == old_col for new_col, old_col in zip(new_cols, old_cols))
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.DataFrame, interpreter, label_encoder, tokenizer,
|
86 |
collection, vectorizer, sent_model, ce_model):
|
@@ -142,12 +158,16 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
|
|
142 |
final_df['category'] = label
|
143 |
final_df['pred_proba'] = prob
|
144 |
final_df['similar_news'] = sim_news
|
145 |
-
final_df.reset_index(drop=True, inplace=True)
|
146 |
-
final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
|
147 |
-
final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
|
148 |
-
logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
|
149 |
-
final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
|
150 |
-
logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
|
|
|
|
|
|
|
|
|
151 |
else:
|
152 |
logger.warning('Prior predictions found in old news')
|
153 |
if not cols_check([*new_news.columns], [*old_news.columns][:-3]):
|
@@ -170,17 +190,23 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
|
|
170 |
new_news['pred_proba'] = prob
|
171 |
new_news['similar_news'] = sim_news
|
172 |
final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
|
173 |
-
|
174 |
-
final_df
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
final_df.
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
180 |
else:
|
181 |
logger.warning('INFO: Old & New Articles are the same. There is no requirement of updating them in the database. Database is not updated.')
|
182 |
db_updation_required = 0
|
183 |
final_df = old_news.copy()
|
|
|
184 |
|
185 |
|
186 |
if len(final_df) == 0:
|
@@ -190,6 +216,7 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
|
|
190 |
except Exception as e:
|
191 |
logger.warning(f'Unexcpected error in predict_news_category()\n{e}')
|
192 |
final_df = None
|
|
|
193 |
db_updation_required = 0
|
194 |
-
return final_df, db_updation_required
|
195 |
|
|
|
81 |
def cols_check(new_cols, old_cols):
    """Return True when the two column sequences agree position by position.

    Only the overlapping prefix is compared: ``zip`` stops at the shorter
    sequence, so a difference in length alone never makes this return False.
    Two empty sequences compare as equal.
    """
    return all(new_col == old_col for new_col, old_col in zip(new_cols, old_cols))
|
83 |
|
84 |
+
def process_prediction_df(df, df_type: str = "production"):
    """Deduplicate a prediction frame and apply the category override rules.

    Works on a copy, so the caller's frame is left untouched.

    Steps, in order:
      1. Drop rows with a duplicate ``url`` (first occurrence wins) and
         renumber the index.
      2. Set ``category`` to ``'NATION'`` where ``pred_proba`` is below
         ``CLASSIFIER_THRESHOLD``.
      3. Set ``category`` to ``'WORLD'`` where the title contains
         ``'Pakistan'`` (case-sensitive) and the category is ``'NATION'``.
      4. Set ``category`` to ``'SCIENCE'`` where the title contains
         ``'Zodiac Sign'`` or ``'Horoscope'`` (case-insensitive).

    Args:
        df: Frame with at least ``url``, ``title``, ``category`` and
            ``pred_proba`` columns.
        df_type: Label used only in the entry/exit log messages.

    Returns:
        The processed copy of ``df``.
    """
    logger.warning(f"Entering process_prediction_df(): {df_type}")
    df = df.copy()
    df.drop_duplicates(subset='url', keep='first', inplace=True)
    # Renumber once after dropping rows so the boolean masks below align.
    df.reset_index(drop=True, inplace=True)

    # The mask must be built from `df` itself; a mask taken from any other
    # frame would be undefined here or misaligned with this frame's rows.
    low_confidence = df['pred_proba'] < CLASSIFIER_THRESHOLD
    df.loc[low_confidence, 'category'] = 'NATION'

    # na=False: a missing title matches nothing instead of putting NaN in the mask.
    mentions_pakistan = df['title'].str.contains('Pakistan', na=False)
    df.loc[mentions_pakistan & (df['category'] == 'NATION'), 'category'] = 'WORLD'
    logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')

    is_astrology = (
        df['title'].str.contains('Zodiac Sign', case=False, na=False)
        | df['title'].str.contains('Horoscope', case=False, na=False)
    )
    df.loc[is_astrology, 'category'] = 'SCIENCE'
    logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')

    logger.warning(f"Exiting process_prediction_df(): {df_type}")
    return df
|
99 |
+
|
100 |
|
101 |
def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.DataFrame, interpreter, label_encoder, tokenizer,
|
102 |
collection, vectorizer, sent_model, ce_model):
|
|
|
158 |
final_df['category'] = label
|
159 |
final_df['pred_proba'] = prob
|
160 |
final_df['similar_news'] = sim_news
|
161 |
+
# final_df.reset_index(drop=True, inplace=True)
|
162 |
+
# final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
|
163 |
+
# final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
|
164 |
+
# logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
|
165 |
+
# final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
|
166 |
+
# logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
|
167 |
+
|
168 |
+
final_df = process_prediction_df(final_df, df_type="production & archive")
|
169 |
+
prediction_df = final_df.copy()
|
170 |
+
|
171 |
else:
|
172 |
logger.warning('Prior predictions found in old news')
|
173 |
if not cols_check([*new_news.columns], [*old_news.columns][:-3]):
|
|
|
190 |
new_news['pred_proba'] = prob
|
191 |
new_news['similar_news'] = sim_news
|
192 |
final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
|
193 |
+
|
194 |
+
final_df = process_prediction_df(final_df, df_type="production")
|
195 |
+
archive_df = new_news.copy()
|
196 |
+
archive_df = process_prediction_df(archive_df, df_type="archive")
|
197 |
+
|
198 |
+
# final_df.drop_duplicates(subset='url', keep='first', inplace=True)
|
199 |
+
# final_df.reset_index(drop=True, inplace=True)
|
200 |
+
# final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'NATION'
|
201 |
+
# final_df.loc[(final_df['title'].str.contains('Pakistan')) & (final_df['category'] == 'NATION'), 'category'] = 'WORLD'
|
202 |
+
# logger.warning('Updated category of articles having Pakistan in title and category=NATION to WORLD')
|
203 |
+
# final_df.loc[(final_df['title'].str.contains('Zodiac Sign', case=False)) | (final_df['title'].str.contains('Horoscope', case=False)), 'category'] = 'SCIENCE'
|
204 |
+
# logger.warning('Updated category of articles having Zodiac Sign in title to SCIENCE')
|
205 |
else:
|
206 |
logger.warning('INFO: Old & New Articles are the same. There is no requirement of updating them in the database. Database is not updated.')
|
207 |
db_updation_required = 0
|
208 |
final_df = old_news.copy()
|
209 |
+
archive_df = final_df.copy()
|
210 |
|
211 |
|
212 |
if len(final_df) == 0:
|
|
|
216 |
except Exception as e:
|
217 |
logger.warning(f'Unexcpected error in predict_news_category()\n{e}')
|
218 |
final_df = None
|
219 |
+
archive_df = final_df.copy()
|
220 |
db_updation_required = 0
|
221 |
+
return final_df, archive_df, db_updation_required
|
222 |
|