lalithadevi committed on
Commit
960de68
1 Parent(s): 24f1253

Delete news_extractor

news_extractor/__init__.py DELETED
@@ -1 +0,0 @@
- from news_extractor.news_extractor import *
 
 
news_extractor/news_extractor.py DELETED
@@ -1,192 +0,0 @@
- import pandas as pd
- import numpy as np
- from bs4 import BeautifulSoup
- import requests as r
- import regex as re
- from dateutil import parser
- import logging
- import multiprocessing
-
-
- def date_time_parser(dt):
-     """
-     Computes the minutes elapsed since published time.
-     :param dt: date
-     :return: int, minutes elapsed.
-     """
-     return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
-
- def text_clean(desc):
-     """
-     Cleans the text by removing special chars.
-     :param desc: string containing description
-     :return: str, cleaned description.
-     """
-     desc = desc.replace("&lt;", "<")
-     desc = desc.replace("&gt;", ">")
-     desc = re.sub("<.*?>", "", desc)
-     desc = desc.replace("#39;", "'")
-     desc = desc.replace('&quot;', '"')
-     desc = desc.replace('&nbsp;', ' ')
-     desc = desc.replace('#32;', ' ')
-     return desc
-
-
- def rss_parser(i):
-     """
-     Returns a data frame of a parsed news item.
-     :param i: single news item from an RSS feed.
-     :return: Data frame of the parsed news item.
-     """
-     b1 = BeautifulSoup(str(i), "xml")
-     title = "" if b1.find("title") is None else b1.find("title").get_text()
-     title = text_clean(title)
-     url = "" if b1.find("link") is None else b1.find("link").get_text()
-     desc = "" if b1.find("description") is None else b1.find("description").get_text()
-     desc = text_clean(desc)
-     desc = f'{desc[:300]}...' if len(desc) >= 300 else desc
-     date = "Sat, 12 Aug 2000 13:39:15 +05:30" if ((b1.find("pubDate") == "") or (b1.find("pubDate") is None)) else b1.find("pubDate").get_text()
-     if url.find("businesstoday.in") >= 0:
-         date = date.replace("GMT", "+0530")
-
-     date1 = parser.parse(date)
-     return pd.DataFrame({"title": title,
-                          "url": url,
-                          "description": desc,
-                          "parsed_date": date1}, index=[0])
-
-
- def src_parse(rss):
-     """
-     Returns the root domain name (e.g. livemint.com is extracted from www.livemint.com).
-     :param rss: RSS URL
-     :return: str, string containing the source name
-     """
-     # Check the more specific 'ndtvprofit' before the generic 'ndtv' match.
-     if rss.find('ndtvprofit') >= 0:
-         rss = 'ndtv profit'
-     elif rss.find('ndtv') >= 0:
-         rss = 'ndtv.com'
-     elif rss.find('telanganatoday') >= 0:
-         rss = 'telanganatoday.com'
-
-     rss = rss.replace("https://www.", "")
-     rss = rss.split("/")
-     return rss[0]
-
-
- def news_agg(rss):
-     """
-     Returns the processed articles from a single 'rss' URL.
-     :param rss: RSS URL.
-     :return: Data frame of processed articles.
-     """
-     try:
-         rss_df = pd.DataFrame()
-         # user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
-         headers = {
-             'authority': 'www.google.com',
-             'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-             'accept-language': 'en-US,en;q=0.9',
-             'cache-control': 'max-age=0',
-             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
-         }
-
-         timeout = 5
-
-         resp = r.get(rss, timeout=timeout, headers=headers)
-         logging.warning(f'{rss}: {resp.status_code}')
-         b = BeautifulSoup(resp.content, "xml")
-         items = b.find_all("item")
-         for i in items:
-             # rss_df = rss_df.append(rss_parser(i)).copy()
-             rss_df = pd.concat([rss_df, rss_parser(i)], axis=0)
-         rss_df.reset_index(drop=True, inplace=True)
-         rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)
-
-         #### UNCOMMENT IN CASE OF OOM ERROR IN RENDER
-         # rss_df.dropna(inplace=True)
-         ####
-
-         rss_df["src"] = src_parse(rss)
-         rss_df["elapsed_time"] = rss_df["parsed_date"].apply(date_time_parser)
-         rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
-         # rss_df["elapsed_time_str"] = rss_df["elapsed_time"].apply(elapsed_time_str)
-     except Exception as e:
-         print(e)
-     return rss_df
-
-
- # List of RSS feeds
- rss = ['https://www.economictimes.indiatimes.com/rssfeedstopstories.cms',
-        'https://www.thehindu.com/news/feeder/default.rss',
-        # 'https://telanganatoday.com/feed',
-        'https://www.businesstoday.in/rssfeeds/?id=225346',
-        'https://feeds.feedburner.com/ndtvnews-latest',
-        'https://www.hindustantimes.com/feeds/rss/world-news/rssfeed.xml',
-        'https://www.indiatoday.in/rss/1206578',
-        'https://www.moneycontrol.com/rss/latestnews.xml',
-        'https://www.livemint.com/rss/news',
-        'https://www.zeebiz.com/latest.xml/feed',
-        'https://www.timesofindia.indiatimes.com/rssfeedmostrecent.cms']
-
-
- def get_news_rss(url):
-     """
-     Fetches and cleans the articles from a single RSS feed URL.
-     :param url: RSS URL.
-     :return: Data frame of de-duplicated articles sorted by recency.
-     """
-     # final_df = pd.DataFrame()
-     # for i in rss:
-     #     # final_df = final_df.append(news_agg(i))
-     #     final_df = pd.concat([final_df, news_agg(i)], axis=0)
-     final_df = news_agg(url)
-     final_df.reset_index(drop=True, inplace=True)
-
-     final_df.sort_values(by="elapsed_time", inplace=True)
-     # final_df['src_time'] = final_df['src'] + ("&nbsp;" * 5) + final_df["elapsed_time_str"]
-     # final_df.drop(columns=['date', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
-     final_df.drop(columns=['elapsed_time'], inplace=True)
-
-     #### UNCOMMENT 1ST STATEMENT AND REMOVE 2ND STATEMENT IN CASE OF OOM ERROR IN RENDER
-     # final_df.drop_duplicates(subset='description', inplace=True)
-     final_df.drop_duplicates(subset='url', inplace=True)
-     ####
-
-     final_df = final_df.loc[(final_df["title"] != ""), :].copy()
-
-     # Fall back to the title when the description is missing or blank.
-     mask = (final_df['description'].isna()) | (final_df['description'] == '') | (final_df['description'] == ' ')
-     final_df.loc[mask, 'description'] = final_df.loc[mask, 'title']
-
-     return final_df
-
- def get_news_multi_process(urls):
-     '''
-     Fetches all RSS feeds in parallel (one worker per URL) and
-     concatenates the results into a single data frame.
-     '''
-     pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
-
-     results = []
-     for url in urls:
-         f = pool.apply_async(get_news_rss, [url])  # each worker begins fetching a feed asynchronously
-         results.append(f)  # collect the AsyncResult handles
-
-     final_df = pd.DataFrame()
-     for f in results:
-         # print(f.get())
-         final_df = pd.concat([final_df, f.get(timeout=120)], axis=0)  # gather the output of each parallel job
-
-     final_df.reset_index(drop=True, inplace=True)
-     logging.warning(final_df['src'].unique())
-     pool.close()
-     pool.join()
-     return final_df
-
-
- def get_news():
-     return get_news_multi_process(rss)
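
For reference, a minimal usage sketch of the package this commit removes, assuming the pre-deletion layout in which news_extractor/__init__.py re-exported everything from news_extractor.news_extractor; the script wrapper and the column selection are illustrative, not part of the repository.

# Hypothetical usage of the deleted module (valid only before this commit).
from news_extractor import get_news  # re-exported via the deleted __init__.py

if __name__ == "__main__":
    news_df = get_news()  # fetches all configured RSS feeds in parallel
    print(news_df[["title", "src", "parsed_date"]].head())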