lalithadevi committed on
Commit
1bda5ba
1 Parent(s): 1e46c79

Update news_category_similar_news_prediction.py

Browse files
news_category_similar_news_prediction.py CHANGED
@@ -130,10 +130,10 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
130
  final_df.drop_duplicates(subset='url', keep='first', inplace=True)
131
 
132
 
133
- headlines = [*final_df['title']].copy()
 
134
  # label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
135
-
136
- headlines_desc = [*final_df['title'] + ". " + final_df['description']].copy()
137
  label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
138
 
139
 
@@ -154,14 +154,14 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
154
  new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :]
155
  if len(new_news) > 0:
156
 
157
-
158
- headlines = [*new_news['title']].copy()
159
 
160
- headlines_desc = [*new_news['title'] + ". " + new_news['description']].copy()
 
 
 
161
  label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
162
 
163
 
164
- # label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
165
  sent_embs = vectorizer.vectorize_(headlines, sent_model)
166
  sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
167
  new_news['category'] = label
 
130
  final_df.drop_duplicates(subset='url', keep='first', inplace=True)
131
 
132
 
133
+ headlines = [*final_df['title'].fillna("").str.strip()]
134
+ descriptions = [*final_df['description'].fillna("").str.strip()]
135
  # label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
136
+ headlines_desc = [h if (h == d) else f"{h}. {d}" for h, d in zip(headlines, descriptions)]
 
137
  label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
138
 
139
 
 
154
  new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :]
155
  if len(new_news) > 0:
156
 
 
 
157
 
158
+ headlines = [*new_news['title'].fillna("").str.strip()]
159
+ descriptions = [*new_news['description'].fillna("").str.strip()]
160
+ # label, prob = inference(headlines, interpreter, label_encoder, tokenizer)
161
+ headlines_desc = [h if (h == d) else f"{h}. {d}" for h, d in zip(headlines, descriptions)]
162
  label, prob = inference(headlines_desc, interpreter, label_encoder, tokenizer)
163
 
164
 
 
165
  sent_embs = vectorizer.vectorize_(headlines, sent_model)
166
  sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
167
  new_news['category'] = label