Commit
•
4b6249a
1
Parent(s):
890a875
Update news_category_similar_news_prediction.py
Browse files
news_category_similar_news_prediction.py
CHANGED
@@ -76,22 +76,26 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
|
|
76 |
headlines = [*final_df['title']].copy()
|
77 |
label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
78 |
sent_embs = vectorizer.vectorize(headlines)
|
79 |
-
sim_news = [find_similar_news(
|
80 |
final_df['category'] = label
|
81 |
final_df['pred_proba'] = prob
|
|
|
82 |
final_df.reset_index(drop=True, inplace=True)
|
83 |
final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'OTHERS'
|
84 |
else:
|
85 |
logger.warning('Prior predictions found in old news')
|
86 |
-
if not cols_check([*new_news.columns], [*old_news.columns][:-2]):
|
87 |
raise Exception("New and old cols don't match")
|
88 |
old_urls = [*old_news['url']]
|
89 |
new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :]
|
90 |
if len(new_news) > 0:
|
91 |
headlines = [*new_news['title']].copy()
|
92 |
label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
|
|
|
|
93 |
new_news['category'] = label
|
94 |
new_news['pred_proba'] = prob
|
|
|
95 |
final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
|
96 |
final_df.drop_duplicates(subset='url', keep='first', inplace=True)
|
97 |
final_df.reset_index(drop=True, inplace=True)
|
|
|
76 |
headlines = [*final_df['title']].copy()
|
77 |
label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
78 |
sent_embs = vectorizer.vectorize(headlines)
|
79 |
+
sim_news = [find_similar_news(search_vec, collection, vectorizer, sent_model, ce_model) for search_vec in sent_embs]
|
80 |
final_df['category'] = label
|
81 |
final_df['pred_proba'] = prob
|
82 |
+
final_df['similar_news'] = sim_news
|
83 |
final_df.reset_index(drop=True, inplace=True)
|
84 |
final_df.loc[final_df['pred_proba']<CLASSIFIER_THRESHOLD, 'category'] = 'OTHERS'
|
85 |
else:
|
86 |
logger.warning('Prior predictions found in old news')
|
87 |
+
if not cols_check([*new_news.columns], [*old_news.columns][:-3]):
|
88 |
raise Exception("New and old cols don't match")
|
89 |
old_urls = [*old_news['url']]
|
90 |
new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :]
|
91 |
if len(new_news) > 0:
|
92 |
headlines = [*new_news['title']].copy()
|
93 |
label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
94 |
+
sent_embs = vectorizer.vectorize(headlines)
|
95 |
+
sim_news = [find_similar_news(search_vec, collection, vectorizer, sent_model, ce_model) for search_vec in sent_embs]
|
96 |
new_news['category'] = label
|
97 |
new_news['pred_proba'] = prob
|
98 |
+
final_df['similar_news'] = sim_news
|
99 |
final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
|
100 |
final_df.drop_duplicates(subset='url', keep='first', inplace=True)
|
101 |
final_df.reset_index(drop=True, inplace=True)
|