lalithadevi committed on
Commit
f9f964a
1 Parent(s): ad4ed94

Update news_category_similar_news_prediction.py

Browse files
news_category_similar_news_prediction.py CHANGED
@@ -128,8 +128,15 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
128
  raise Exception("New and old cols don't match")
129
  final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
130
  final_df.drop_duplicates(subset='url', keep='first', inplace=True)
 
 
131
  headlines = [*final_df['title']].copy()
132
- label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
 
 
 
 
 
133
  sent_embs = vectorizer.vectorize_(headlines, sent_model)
134
  sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
135
  final_df['category'] = label
@@ -146,8 +153,15 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
146
  old_urls = [*old_news['url']]
147
  new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :]
148
  if len(new_news) > 0:
 
 
149
  headlines = [*new_news['title']].copy()
150
- label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
 
 
 
 
 
151
  sent_embs = vectorizer.vectorize_(headlines, sent_model)
152
  sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
153
  new_news['category'] = label
 
128
  raise Exception("New and old cols don't match")
129
  final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
130
  final_df.drop_duplicates(subset='url', keep='first', inplace=True)
131
+
132
+
133
  headlines = [*final_df['title']].copy()
134
+ # label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
135
+
136
+ headlines_desc = [*final_df['title'] + ". " + final_df['description']].copy()
137
+ label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
138
+
139
+
140
  sent_embs = vectorizer.vectorize_(headlines, sent_model)
141
  sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
142
  final_df['category'] = label
 
153
  old_urls = [*old_news['url']]
154
  new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :]
155
  if len(new_news) > 0:
156
+
157
+
158
  headlines = [*new_news['title']].copy()
159
+
160
+ headlines_desc = [*final_df['title'] + ". " + final_df['description']].copy()
161
+ label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
162
+
163
+
164
+ # label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
165
  sent_embs = vectorizer.vectorize_(headlines, sent_model)
166
  sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
167
  new_news['category'] = label