Commit
•
4db51e3
1
Parent(s):
03d3a0e
Update news_extractor/news_extractor.py
Browse files
news_extractor/news_extractor.py
CHANGED
@@ -12,16 +12,6 @@ from logger import get_logger
|
|
12 |
logger = get_logger()
|
13 |
|
14 |
|
15 |
-
def date_time_parser(dt):
|
16 |
-
"""
|
17 |
-
Computes the minutes elapsed since published time.
|
18 |
-
:param dt: date
|
19 |
-
:return: int, minutes elapsed.
|
20 |
-
"""
|
21 |
-
try:
|
22 |
-
return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
|
23 |
-
except:
|
24 |
-
return 100000
|
25 |
|
26 |
def text_clean(desc):
|
27 |
"""
|
@@ -117,7 +107,6 @@ def news_agg(rss):
|
|
117 |
rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)
|
118 |
|
119 |
rss_df["src"] = src_parse(rss)
|
120 |
-
rss_df["elapsed_time"] = rss_df["parsed_date"].apply(date_time_parser)
|
121 |
rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
|
122 |
|
123 |
if len(rss_df) == 0:
|
@@ -141,8 +130,7 @@ def get_news_rss(url):
|
|
141 |
if final_df is not None:
|
142 |
final_df.reset_index(drop=True, inplace=True)
|
143 |
|
144 |
-
|
145 |
-
final_df.drop(columns=['elapsed_time'], inplace=True)
|
146 |
|
147 |
final_df.drop_duplicates(subset='url', inplace=True)
|
148 |
|
|
|
12 |
logger = get_logger()
|
13 |
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
def text_clean(desc):
|
17 |
"""
|
|
|
107 |
rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)
|
108 |
|
109 |
rss_df["src"] = src_parse(rss)
|
|
|
110 |
rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
|
111 |
|
112 |
if len(rss_df) == 0:
|
|
|
130 |
if final_df is not None:
|
131 |
final_df.reset_index(drop=True, inplace=True)
|
132 |
|
133 |
+
|
|
|
134 |
|
135 |
final_df.drop_duplicates(subset='url', inplace=True)
|
136 |
|