Commit
•
1bda5ba
1
Parent(s):
1e46c79
Update news_category_similar_news_prediction.py
Browse files
news_category_similar_news_prediction.py
CHANGED
@@ -130,10 +130,10 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
|
|
130 |
final_df.drop_duplicates(subset='url', keep='first', inplace=True)
|
131 |
|
132 |
|
133 |
-
headlines = [*final_df['title']].copy()
|
|
|
134 |
# label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
135 |
-
|
136 |
-
headlines_desc = [*final_df['title'] + ". " + final_df['description']].copy()
|
137 |
label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
|
138 |
|
139 |
|
@@ -154,14 +154,14 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
|
|
154 |
new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :]
|
155 |
if len(new_news) > 0:
|
156 |
|
157 |
-
|
158 |
-
headlines = [*new_news['title']].copy()
|
159 |
|
160 |
-
|
|
|
|
|
|
|
161 |
label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
|
162 |
|
163 |
|
164 |
-
# label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
165 |
sent_embs = vectorizer.vectorize_(headlines, sent_model)
|
166 |
sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
|
167 |
new_news['category'] = label
|
|
|
130 |
final_df.drop_duplicates(subset='url', keep='first', inplace=True)
|
131 |
|
132 |
|
133 |
+
headlines = [*final_df['title'].fillna("").str.strip()]
|
134 |
+
descriptions = [*final_df['description'].fillna("").str.strip()]
|
135 |
# label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
136 |
+
headlines_desc = [h if (h == d) else f"{h}. {d}" for h, d in zip(headlines, descriptions)]
|
|
|
137 |
label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
|
138 |
|
139 |
|
|
|
154 |
new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :]
|
155 |
if len(new_news) > 0:
|
156 |
|
|
|
|
|
157 |
|
158 |
+
headlines = [*new_news['title'].fillna("").str.strip()]
|
159 |
+
descriptions = [*new_news['description'].fillna("").str.strip()]
|
160 |
+
# label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
|
161 |
+
headlines_desc = [h if (h == d) else f"{h}. {d}" for h, d in zip(headlines, descriptions)]
|
162 |
label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
|
163 |
|
164 |
|
|
|
165 |
sent_embs = vectorizer.vectorize_(headlines, sent_model)
|
166 |
sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
|
167 |
new_news['category'] = label
|