File size: 11,311 Bytes
c2c5fc6 e35644f 74a2149 1462b37 74a2149 c2c5fc6 60be9e8 07d0294 60be9e8 07d0294 60be9e8 2431f23 60be9e8 561ed17 ad4ed94 07d0294 7404293 2431f23 60be9e8 2e9336a 60be9e8 c2c5fc6 2e9336a 74a2149 e35644f c2c5fc6 2e9336a 77aa593 74a2149 c2c5fc6 4e08fbd e35644f 4e08fbd c7a5729 8407a74 c7a5729 163c84f 8407a74 9c90a67 163c84f 62bffd5 4e5c75f c7a5729 c2c5fc6 2e9336a f442802 0b44a23 f442802 60be9e8 132f624 60be9e8 132f624 f442802 59792ae 132f624 f442802 60be9e8 132f624 60be9e8 132f624 f442802 132f624 f97e412 59792ae f442802 59792ae f442802 0cb5606 f442802 f9f964a 1bda5ba 2e9336a f9f964a 61f5068 e48e7d5 f442802 4b6249a 4e08fbd 68c3650 4e08fbd c7a5729 4e08fbd f442802 4b6249a 0cb5606 f442802 d5f669f f9f964a 1bda5ba 2e9336a f9f964a 61f5068 e48e7d5 d5f669f 06d4f50 d5f669f 4e08fbd 68c3650 e35644f c7a5729 4e08fbd d5f669f 0b44a23 68c3650 d5f669f 2536052 f442802 73a423f 68c3650 73a423f 68c3650 c2c5fc6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
import pandas as pd
import numpy as np
import tensorflow as tf
from config import CLASSIFIER_THRESHOLD, NEWS_RETENTION_SECONDS
from logger import get_logger
from find_similar_news import find_similar_news
logger = get_logger()
##########################
from dateutil import parser
def correct_date(x):
    """Return *x* unchanged when it looks like a parseable datetime string.

    A value qualifies when it is a ``str`` containing a ``":"`` (the time
    separator).  Anything else is logged and replaced with a fixed fallback
    timestamp so downstream parsing never crashes.
    """
    looks_like_date = isinstance(x, str) and ":" in x
    if looks_like_date:
        return x
    logger.warning(f'correct_date() error: {x} is not the right date format')
    return "2020-11-07 00:36:44+05:30"
def date_time_parser(dt):
    """
    Computes the minutes elapsed since published time.

    :param dt: timezone-aware timestamp exposing ``.now`` and ``.tz``
        (e.g. ``pd.Timestamp``). NOTE(review): a plain ``datetime`` has no
        ``.tz`` attribute and therefore always falls through to the error
        path — callers rely on pandas converting the column to Timestamps.
    :return: int, minutes elapsed; 100000 sentinel when elapsed time cannot
        be computed (treated as "very old" by the retention filter).
    """
    try:
        return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; any data error still hits the sentinel.
        logger.warning(f'date_time_parser() error: {dt} is not the right date format')
        return 100000
def delete_outdated_news(final_df: pd.DataFrame):
    """Drop rows whose article age exceeds the retention window.

    Dates are first normalised via ``correct_date``, parsed with dateutil,
    and converted to elapsed minutes, which are compared against
    ``NEWS_RETENTION_SECONDS``.  NOTE(review): the elapsed value is in
    minutes while the constant's name says seconds — confirm the config
    semantics before changing either side.

    :param final_df: frame with a ``parsed_date`` string column.
    :return: filtered copy with helper columns removed and a fresh index.
    """
    logger.warning("Entering delete_outdated_news()")
    frame = final_df.copy()
    # List assignments (not Series.map) so pandas re-infers the column dtype,
    # converting uniform tz-aware datetimes into Timestamps with a .tz attr.
    frame["parsed_date_1"] = [correct_date(raw) for raw in frame['parsed_date']]
    frame["parsed_date_1"] = [parser.parse(raw) for raw in frame["parsed_date_1"]]
    frame["elapsed_time"] = [date_time_parser(ts) for ts in frame["parsed_date_1"]]
    fresh_enough = frame["elapsed_time"] <= NEWS_RETENTION_SECONDS
    frame = frame.loc[fresh_enough, :].copy()
    frame = frame.drop(columns=['elapsed_time', 'parsed_date_1']).reset_index(drop=True)
    logger.warning("Exiting delete_outdated_news()")
    return frame
def find_path(url):
    """Extract a readable topic path from a news-article URL.

    Strips the scheme and ``www.`` prefix, splits on ``/`` and discards
    article-identifier segments (``articleshow``, ``*.cms``/``*.ece``/
    ``*.htm`` pages, slugs with more than 5 hyphenated words, segments
    containing spaces).

    :param url: article URL; the empty string yields ''.
    :return: '/'-joined path after the domain when more than two segments
        survive, otherwise the last surviving segment, or '-' when none do.
    """
    if url == '':
        return ''
    # Normalise an accidental "-/-" separator, then drop scheme/www noise.
    cleaned = (url.replace("-/-", "-")
                  .replace("https://", "")
                  .replace("www.", "")
                  .strip())
    segments = [
        seg for seg in cleaned.split("/")
        if seg != ""
        and seg != "articleshow"
        and ".cms" not in seg
        and ".ece" not in seg
        and ".htm" not in seg
        and len(seg.split('-')) <= 5
        and " " not in seg
    ]
    # Fix: the original also rewrote `url` ("//"->"/", a timesofindia prefix)
    # AFTER the segments were derived; that result was never read, so the
    # dead code is removed with no behavior change.
    if len(segments) > 2:
        return "/".join(segments[1:])
    return segments[-1] if segments else '-'
############################################
def parse_prediction(tflite_pred, label_encoder):
    """Turn class-probability rows into (labels, max-probability) arrays.

    :param tflite_pred: 2-D array-like of per-class probabilities.
    :param label_encoder: object exposing ``inverse_transform`` mapping
        class indices back to label strings.
    :return: tuple of (decoded labels, row-wise max probability).
    """
    best_indices = np.argmax(tflite_pred, axis=1)
    decoded_labels = label_encoder.inverse_transform(best_indices)
    confidences = np.max(tflite_pred, axis=1)
    return decoded_labels, confidences
def inference(text, calibrated_model, label_encoder):
    """Predict class labels and probabilities for *text*.

    :param text: sequence of strings to classify (callers pass a list; the
        ``!= ""`` guard only triggers for a bare empty string).
    :param calibrated_model: classifier exposing ``predict_proba``.
    :param label_encoder: encoder used by ``parse_prediction`` to decode labels.
    :return: ``(labels, probabilities)`` tuple, or ``None`` when *text* is "".
    """
    logger.warning('Entering inference()')
    logger.warning(f'Samples to predict: {len(text)}')
    # Fix: previously unbound when text == "", causing a NameError on return.
    tflite_pred = None
    if text != "":
        tflite_pred = calibrated_model.predict_proba(text)
        tflite_pred = parse_prediction(tflite_pred, label_encoder)
    logger.warning('Exiting inference()')
    return tflite_pred
def cols_check(new_cols, old_cols):
    """Return True when the two column sequences match position-by-position.

    NOTE(review): comparison stops at the shorter sequence (zip semantics),
    so a matching prefix with extra trailing columns still passes — confirm
    the callers rely on that (they slice with ``[:-3]`` before calling).
    """
    for new_col, old_col in zip(new_cols, old_cols):
        if new_col != old_col:
            return False
    return True
def process_prediction_df(df, df_type: str="production"):
    """Deduplicate a prediction frame by URL (first occurrence wins).

    :param df: frame containing a ``url`` column.
    :param df_type: tag used only in the entry/exit log messages.
    :return: deduplicated copy with a fresh index.
    """
    logger.warning(f"Entering process_prediction_df(): {df_type}")
    deduped = (df.copy()
                 .drop_duplicates(subset='url', keep='first')
                 .reset_index(drop=True))
    # NOTE: historical category-override rules (low-confidence -> NATION,
    # Pakistan-in-title -> WORLD, horoscope/zodiac -> ASTROLOGY) were
    # previously applied here and are intentionally disabled.
    logger.warning(f"Exiting process_prediction_df(): {df_type}")
    return deduped
def add_flags_to_prediction_df(df):
    """Reshape a prediction frame into the feedback/training schema.

    Builds a ``text`` column from title + description (title alone when the
    two are identical), initialises bookkeeping flags, renames the
    prediction columns to ``y_pred``/``y_pred_proba``/``date_time``, drops
    raw text columns and seeds ``y_true``/``y_true_proba`` from the
    predictions.

    :param df: frame with title/description/category/parsed_date/pred_proba/
        similar_news/src columns.
    :return: reshaped copy; the input frame is left untouched.
    """
    out = df.copy()
    titles = out['title'].fillna("").str.strip().tolist()
    blurbs = out['description'].fillna("").str.strip().tolist()
    out['text'] = [t if t == b else f"{t}. {b}" for t, b in zip(titles, blurbs)]
    out['used_for_training'] = 0
    out['allocated'] = 0
    out = out.rename(columns={'category': 'y_pred',
                              'parsed_date': 'date_time',
                              'pred_proba': 'y_pred_proba'})
    out = out.drop(columns=['similar_news', 'src', 'title', 'description'])
    out['y_true_proba'] = out['y_pred_proba']
    out['y_true'] = out['y_pred']
    out['updated_prediction'] = 1
    return out
def predict_news_category_similar_news(old_news: pd.DataFrame, new_news: pd.DataFrame, calibrated_model, label_encoder,
                                       collection, vectorizer, sent_model, ce_model):
    """Merge old/new news frames, predict categories, attach similar news.

    Drops outdated rows from both frames, classifies only the rows without
    prior predictions (full reclassification when the old frame has no
    ``category`` column), and looks up similar news via the vector store.

    :param old_news: previously stored articles (may already carry the
        trailing ``category``/``pred_proba``/``similar_news`` columns), or
        any non-DataFrame value when there is no history.
    :param new_news: freshly scraped articles; a non-DataFrame value aborts.
    :param calibrated_model: classifier exposing ``predict_proba``.
    :param label_encoder: decoder used by ``inference``/``parse_prediction``.
    :param collection: similar-news store passed to ``find_similar_news``.
    :param vectorizer: object exposing ``vectorize_`` for sentence embeddings.
    :param sent_model: sentence-embedding model.
    :param ce_model: cross-encoder model for similarity re-ranking.
    :return: ``(final_df, prediction_df, db_updation_required)`` —
        merged frame (``None`` on failure/empty), rows reshaped for
        feedback storage (``None`` on failure), and a 0/1 update flag.
    """
    try:
        db_updation_required = 1
        logger.warning('Entering predict_news_category()')
        logger.warning(f'old news: {old_news}\nnew_news: {new_news}')
        if not isinstance(new_news, pd.DataFrame):
            raise Exception('No New News Found')
        else:
            logger.warning(f'{len(new_news)} new news items found before deleting outdated news')
            new_news = delete_outdated_news(new_news)
            logger.warning(f'{len(new_news)} new news items found after deleting outdated news')
            logger.warning(f'new news columns: {[*new_news.columns]}')
            # A single surviving row is treated the same as "no news".
            if len(new_news) <= 1:
                new_news = None
        if not isinstance(new_news, pd.DataFrame):
            raise Exception('No New News Found after deleting outdated news')
        if isinstance(old_news, pd.DataFrame):
            logger.warning(f'{len(old_news)} old news items found before deleting outdated news')
            old_news = delete_outdated_news(old_news)
            logger.warning(f'{len(old_news)} old news items found after deleting outdated news')
            logger.warning(f'old news columns: {[*old_news.columns]}')
            # The Mongo identifier is not a feature column; drop it so the
            # column sets of old and new frames can be compared.
            old_news.drop(columns='_id', inplace=True)
            logger.warning('Dropped _id column from old news data frame.')
            if len(old_news) <= 1:
                old_news = None
        else:
            logger.warning('No old news is found')
            old_news = new_news.copy()
        if not isinstance(old_news, pd.DataFrame):
            logger.warning('No old news is found after deleting outdate news')
            old_news = new_news.copy()
        if 'category' not in [*old_news.columns]:
            # First run: no prior predictions — classify every row.
            logger.warning('No prior predictions found in old news')
            if not cols_check([*new_news.columns], [*old_news.columns]):
                raise Exception("New and old cols don't match")
            final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
            final_df.drop_duplicates(subset='url', keep='first', inplace=True)
            headlines = [*final_df['title'].fillna("").str.strip()]
            descriptions = [*final_df['description'].fillna("").str.strip()]
            paths = [*final_df['url'].map(find_path).fillna("").str.strip()]
            # "path: headline[. description]" — description only when distinct.
            headlines_desc = [f"{p}: {h}" if (h == d) else f"{p}: {h}. {d}" for p, h, d in zip(paths, headlines, descriptions)]
            label, prob = inference(headlines_desc, calibrated_model, label_encoder)
            sent_embs = vectorizer.vectorize_(headlines, sent_model)
            sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
            final_df['category'] = label
            final_df['pred_proba'] = prob
            final_df['similar_news'] = sim_news
            final_df = process_prediction_df(final_df, df_type="production & prediction")
            prediction_df = final_df.copy()
            prediction_df = add_flags_to_prediction_df(prediction_df)
        else:
            logger.warning('Prior predictions found in old news')
            # Old frame carries 3 extra prediction columns at the tail.
            if not cols_check([*new_news.columns], [*old_news.columns][:-3]):
                raise Exception("New and old cols don't match")
            old_urls = [*old_news['url']]
            # Fix: take an explicit copy — the original mutated a .loc slice
            # view below, triggering pandas SettingWithCopyWarning.
            new_news = new_news.loc[new_news['url'].isin(old_urls) == False, :].copy()
            if len(new_news) > 0:
                headlines = [*new_news['title'].fillna("").str.strip()]
                descriptions = [*new_news['description'].fillna("").str.strip()]
                paths = [*new_news['url'].map(find_path).fillna("").str.strip()]
                headlines_desc = [f"{p}: {h}" if (h == d) else f"{p}: {h}. {d}" for p, h, d in zip(paths, headlines, descriptions)]
                label, prob = inference(headlines_desc, calibrated_model, label_encoder)
                sent_embs = vectorizer.vectorize_(headlines, sent_model)
                sim_news = [find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model) for search_vec, text in zip(sent_embs, headlines)]
                new_news['category'] = label
                new_news['pred_proba'] = prob
                new_news['similar_news'] = sim_news
                final_df = pd.concat([old_news, new_news], axis=0, ignore_index=True)
                final_df = process_prediction_df(final_df, df_type="production")
                prediction_df = new_news.copy()
                # NOTE: historical category-override rules (Pakistan -> WORLD,
                # horoscope/zodiac -> ASTROLOGY) were disabled here.
                prediction_df = add_flags_to_prediction_df(prediction_df)
            else:
                logger.warning('INFO: Old & New Articles are the same. There is no requirement of updating them in the database. Database is not updated.')
                db_updation_required = 0
                final_df = old_news.copy()
                prediction_df = final_df.copy()
        if len(final_df) == 0:
            final_df = None
        logger.warning('Exiting predict_news_category()')
    except Exception as e:
        logger.warning(f'Unexpected error in predict_news_category()\n{e}')
        final_df = None
        # Fix: was `final_df.copy()`, which raised AttributeError on None
        # inside the handler and masked the original exception.
        prediction_df = None
        db_updation_required = 0
    return final_df, prediction_df, db_updation_required
|