lalithadevi committed on
Commit
60be9e8
1 Parent(s): 06d4f50

Update news_category_similar_news_prediction.py

Browse files
news_category_similar_news_prediction.py CHANGED
@@ -10,6 +10,39 @@ from find_similar_news import find_similar_news
10
 
11
  logger = get_logger()
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def parse_prediction(tflite_pred, label_encoder):
14
  tflite_pred_argmax = np.argmax(tflite_pred, axis=1)
15
  tflite_pred_label = label_encoder.inverse_transform(tflite_pred_argmax)
@@ -53,12 +86,14 @@ def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.Data
53
  if not isinstance(new_news, pd.DataFrame):
54
  raise Exception('No New News Found')
55
  else:
56
- new_news = new_news.copy()
 
57
  logger.warning(f'new news columns: {[*new_news.columns]}')
58
  logger.warning(f'{len(new_news)} new news items found')
59
 
60
  if isinstance(old_news, pd.DataFrame):
61
- old_news = old_news.copy()
 
62
  logger.warning(f'old news columns: {[*old_news.columns]}')
63
  logger.warning(f'{len(old_news)} old news items found')
64
  old_news.drop(columns='_id', inplace=True)
 
10
 
11
  logger = get_logger()
12
 
13
+ ##########################
14
+
15
+ from dateutil import parser
16
def correct_date(x, fallback="2020-11-07 00:36:44+05:30"):
    """
    Returns ``x`` when it looks like a timestamp string, otherwise ``fallback``.

    A value is kept only if it is a ``str`` that contains a colon (i.e. it
    carries a time-of-day component). Everything else -- ``None``, NaN,
    numbers, date-only strings, empty strings -- is replaced by ``fallback``.

    :param x: raw date value of any type.
    :param fallback: replacement returned for unusable values. Defaults to a
        fixed timestamp in November 2020.
    :return: str, either ``x`` unchanged or ``fallback``.
    """
    if isinstance(x, str) and ":" in x:
        return x
    return fallback
20
+
21
def date_time_parser(dt):
    """
    Computes the minutes elapsed since published time.

    :param dt: date, a ``datetime.datetime`` or a subclass of it (naive or
        timezone-aware). "Now" is taken in the same timezone as ``dt`` so the
        subtraction never mixes naive and aware values.
    :return: int, minutes elapsed, rounded to the nearest minute. Returns
        100000 when the elapsed time cannot be computed (e.g. ``dt`` is not
        a datetime, or the result is NaN).
    """
    try:
        # `tzinfo` is defined on datetime.datetime itself, so this works for
        # plain datetimes as well as subclasses; `tz` is not a datetime
        # attribute and made plain datetimes fall into the error branch.
        now = dt.now(dt.tzinfo)
        return int(round((now - dt).total_seconds() / 60))
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Sentinel "very old" value used when the input is unusable.
        return 100000
31
+
32
def delete_outdated_news(final_df: pd.DataFrame, max_age_minutes: int = 720):
    """
    Drops rows whose ``parsed_date`` is older than ``max_age_minutes``.

    Works on a copy; the caller's frame is not modified. In the returned
    frame the ``parsed_date`` column holds the parsed datetime values
    instead of the original strings.

    :param final_df: news items; must contain a ``parsed_date`` column.
    :param max_age_minutes: maximum allowed age in minutes
        (720 = 12 hrs, 1440 = 24 hrs).
    :return: pd.DataFrame, a new frame with only the recent rows.
    """
    recent_df = final_df.copy()
    # Replace unusable date values first so that parsing does not see them.
    cleaned_dates = recent_df["parsed_date"].map(correct_date)
    recent_df["parsed_date"] = cleaned_dates.map(parser.parse)
    # Keep the ages in a separate Series rather than a temporary column.
    elapsed_minutes = recent_df["parsed_date"].apply(date_time_parser)
    return recent_df.loc[elapsed_minutes <= max_age_minutes, :].copy()
40
+
41
+
42
+
43
+ ############################################
44
+
45
+
46
  def parse_prediction(tflite_pred, label_encoder):
47
  tflite_pred_argmax = np.argmax(tflite_pred, axis=1)
48
  tflite_pred_label = label_encoder.inverse_transform(tflite_pred_argmax)
 
86
  if not isinstance(new_news, pd.DataFrame):
87
  raise Exception('No New News Found')
88
  else:
89
+ # new_news = new_news.copy()
90
+ new_news = delete_outdated_news(new_news)
91
  logger.warning(f'new news columns: {[*new_news.columns]}')
92
  logger.warning(f'{len(new_news)} new news items found')
93
 
94
  if isinstance(old_news, pd.DataFrame):
95
+ # old_news = old_news.copy()
96
+ old_news = delete_outdated_news(old_news)
97
  logger.warning(f'old news columns: {[*old_news.columns]}')
98
  logger.warning(f'{len(old_news)} old news items found')
99
  old_news.drop(columns='_id', inplace=True)