Commit
•
f9f964a
1
Parent(s):
ad4ed94
Update news_category_similar_news_prediction.py
Browse files
news_category_similar_news_prediction.py
CHANGED
@@ -128,8 +128,15 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
|
|
128 |
raise Exception("New and old cols don't match")
|
129 |
final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
|
130 |
final_df.drop_duplicates(subset='url', keep='first', inplace=True)
|
|
|
|
|
131 |
headlines = [*final_df['title']].copy()
|
132 |
-
label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
|
|
|
|
|
|
|
|
|
|
133 |
sent_embs = vectorizer.vectorize_(headlines, sent_model)
|
134 |
sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
|
135 |
final_df['category'] = label
|
@@ -146,8 +153,15 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
|
|
146 |
old_urls = [*old_news['url']]
|
147 |
new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :]
|
148 |
if len(new_news) > 0:
|
|
|
|
|
149 |
headlines = [*new_news['title']].copy()
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
151 |
sent_embs = vectorizer.vectorize_(headlines, sent_model)
|
152 |
sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
|
153 |
new_news['category'] = label
|
|
|
128 |
raise Exception("New and old cols don't match")
|
129 |
final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
|
130 |
final_df.drop_duplicates(subset='url', keep='first', inplace=True)
|
131 |
+
|
132 |
+
|
133 |
headlines = [*final_df['title']].copy()
|
134 |
+
# label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
135 |
+
|
136 |
+
headlines_desc = [*final_df['title'] + ". " + final_df['description']].copy()
|
137 |
+
label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
|
138 |
+
|
139 |
+
|
140 |
sent_embs = vectorizer.vectorize_(headlines, sent_model)
|
141 |
sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
|
142 |
final_df['category'] = label
|
|
|
153 |
old_urls = [*old_news['url']]
|
154 |
new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :]
|
155 |
if len(new_news) > 0:
|
156 |
+
|
157 |
+
|
158 |
headlines = [*new_news['title']].copy()
|
159 |
+
|
160 |
+
headlines_desc = [*final_df['title'] + ". " + final_df['description']].copy()
|
161 |
+
label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
|
162 |
+
|
163 |
+
|
164 |
+
# label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
165 |
sent_embs = vectorizer.vectorize_(headlines, sent_model)
|
166 |
sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
|
167 |
new_news['category'] = label
|