Upload 7 files
- app.py +26 -0
- db_operations/__init__.py +1 -0
- db_operations/db_operations.py +50 -0
- docker/Dockerfile +7 -0
- news_extractor/__init__.py +1 -0
- news_extractor/news_extractor.py +118 -0
- requirements.txt +1 -1
app.py
ADDED
@@ -0,0 +1,26 @@
from news_extractor import get_news
from db_operations import DBOperations
import json
from flask import Flask
from flask_cors import cross_origin, CORS

app = Flask(__name__)
CORS(app)


@app.route("/")
@cross_origin()
def update_news():
    status = "success"
    try:
        news_df = get_news()
        news_json = [*json.loads(news_df.reset_index(drop=True).to_json(orient="index")).values()]
        db = DBOperations()
        db.insert_news_into_db(news_json)
    except Exception:
        status = "failure"
    return status


if __name__ == "__main__":
    # Flask's built-in server does not accept gunicorn options such as timeout,
    # workers or threads; those are passed to gunicorn in docker/Dockerfile instead.
    app.run(host="0.0.0.0", port=5002)
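A quick way to exercise this endpoint, assuming the service is running locally on port 5002; the use of the requests library here is illustrative and not part of this commit:

import requests

# Triggers a scrape-and-store cycle; the endpoint returns the plain string
# "success" or "failure".
resp = requests.get("http://localhost:5002/")
print(resp.text)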
db_operations/__init__.py
ADDED
@@ -0,0 +1 @@
from db_operations.db_operations import *
db_operations/db_operations.py
ADDED
@@ -0,0 +1,50 @@
import pymongo
import os


class DBOperations:
    """
    Inserts processed news into MongoDB.
    """
    def __init__(self):
        self.url = os.getenv('DB_URL')
        self.database = "rss_news_db"
        self.collection = "rss_news"
        self.__client = None
        self.__error = 0

    def __connect(self):
        try:
            self.__client = pymongo.MongoClient(self.url)
            # Force a round trip to the server so a bad connection string fails fast.
            _ = self.__client.list_database_names()
        except Exception:
            self.__error = 1
            self.__client = None
            raise

    def __insert(self, documents):
        try:
            db = self.__client[self.database]
            coll = db[self.collection]
            # Replace the previous batch with the freshly scraped one.
            coll.drop()
            coll.insert_many(documents=documents)
        except Exception:
            self.__error = 1
            raise

    def __close_connection(self):
        if self.__client is not None:
            self.__client.close()
            self.__client = None

    def insert_news_into_db(self, documents: list):
        if self.url is not None:
            if self.__error == 0:
                self.__connect()
            if self.__error == 0:
                self.__insert(documents=documents)
            if self.__error == 0:
                print("Insertion Successful")
            if self.__client is not None:
                self.__close_connection()
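A minimal sketch of driving DBOperations on its own, assuming a reachable MongoDB instance whose connection string is exported as DB_URL; the URL and sample document below are illustrative only:

import os
from db_operations import DBOperations

# DBOperations reads the connection string from the DB_URL environment
# variable when it is constructed.
os.environ.setdefault("DB_URL", "mongodb://localhost:27017")

db = DBOperations()
db.insert_news_into_db([
    {"title": "Sample headline",
     "url": "https://example.com/article",
     "description": "Sample description",
     "parsed_date": "2023-01-01 00:00:00+05:30",
     "src": "example.com"},
])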
docker/Dockerfile
ADDED
@@ -0,0 +1,7 @@
FROM python:3.9-slim
WORKDIR /webapp
COPY . .
RUN rm -r ./docker
RUN pip install --no-cache-dir -r requirements.txt
EXPOSE 5002
# Bind gunicorn to the exposed port; without --bind it would listen on 127.0.0.1:8000.
CMD ["gunicorn", "app:app", "--bind", "0.0.0.0:5002", "--timeout", "120", "--workers=3", "--threads=3", "--worker-connections=1000"]
news_extractor/__init__.py
ADDED
@@ -0,0 +1 @@
from news_extractor.news_extractor import *
news_extractor/news_extractor.py
ADDED
@@ -0,0 +1,118 @@
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests as r
import regex as re
from dateutil import parser


def date_time_parser(dt):
    """
    Computes the minutes elapsed since the published time.
    :param dt: published date
    :return: int, minutes elapsed.
    """
    # tzinfo works for both datetime objects and pandas Timestamps.
    return int(np.round((dt.now(dt.tzinfo) - dt).total_seconds() / 60, 0))


def text_clean(desc):
    """
    Cleans the text by removing HTML tags and entities.
    :param desc: string containing the description
    :return: str, cleaned description.
    """
    desc = desc.replace("&lt;", "<")
    desc = desc.replace("&gt;", ">")
    desc = re.sub("<.*?>", "", desc)
    desc = desc.replace("#39;", "'")
    desc = desc.replace('&quot;', '"')
    desc = desc.replace('&nbsp;', ' ')
    desc = desc.replace('#32;', ' ')
    return desc


def rss_parser(i):
    """
    Returns a data frame of a parsed news item.
    :param i: single news item in an RSS feed.
    :return: DataFrame of the parsed news item.
    """
    b1 = BeautifulSoup(str(i), "xml")
    title = "" if b1.find("title") is None else b1.find("title").get_text()
    title = text_clean(title)
    url = "" if b1.find("link") is None else b1.find("link").get_text()
    desc = "" if b1.find("description") is None else b1.find("description").get_text()
    desc = text_clean(desc)
    desc = f'{desc[:300]}...' if len(desc) >= 300 else desc
    # Fall back to a fixed date when the item carries no pubDate.
    date = "Sat, 12 Aug 2000 13:39:15 +0530" if b1.find("pubDate") is None else b1.find("pubDate").get_text()
    if url.find("businesstoday.in") >= 0:
        date = date.replace("GMT", "+0530")
    date1 = parser.parse(date)
    return pd.DataFrame({"title": title,
                         "url": url,
                         "description": desc,
                         "parsed_date": date1}, index=[0])


def src_parse(rss):
    """
    Returns the root domain name (e.g. livemint.com is extracted from www.livemint.com).
    :param rss: RSS URL
    :return: str, the source name
    """
    if rss.find('ndtvprofit') >= 0:
        rss = 'ndtv profit'
    rss = rss.replace("https://www.", "")
    rss = rss.split("/")
    return rss[0]


def news_agg(rss):
    """
    Returns feeds from each 'rss' URL.
    :param rss: RSS URL.
    :return: DataFrame of processed articles.
    """
    rss_df = pd.DataFrame()
    try:
        resp = r.get(rss, headers={
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
                          "(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"})
        b = BeautifulSoup(resp.content, "xml")
        items = b.find_all("item")
        for i in items:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
            rss_df = pd.concat([rss_df, rss_parser(i)], ignore_index=True)
        rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)
        rss_df.dropna(inplace=True)
        rss_df["src"] = src_parse(rss)
        rss_df["elapsed_time"] = rss_df["parsed_date"].apply(date_time_parser)
        rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
        # rss_df["elapsed_time_str"] = rss_df["elapsed_time"].apply(elapsed_time_str)
    except Exception as e:
        print(e)
    return rss_df


# List of RSS feeds
rss = ['https://www.economictimes.indiatimes.com/rssfeedstopstories.cms',
       'https://www.moneycontrol.com/rss/latestnews.xml',
       'https://www.livemint.com/rss/news',
       'https://www.zeebiz.com/latest.xml/feed',
       'https://www.timesofindia.indiatimes.com/rssfeedmostrecent.cms']


def get_news():
    """
    Aggregates, cleans and de-duplicates articles from every feed in `rss`.
    """
    final_df = pd.DataFrame()
    for i in rss:
        final_df = pd.concat([final_df, news_agg(i)], ignore_index=True)

    final_df.sort_values(by="elapsed_time", inplace=True)
    # final_df['src_time'] = final_df['src'] + (" " * 5) + final_df["elapsed_time_str"]
    # final_df.drop(columns=['date', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
    final_df.drop(columns=['elapsed_time'], inplace=True)
    final_df.drop_duplicates(subset='description', inplace=True)
    final_df = final_df.loc[(final_df["title"] != ""), :].copy()
    return final_df
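As a quick standalone check of the extractor (no database needed, though it does fetch the live feeds over the network), the module can be imported and run directly; the resulting DataFrame carries the title, url, description, parsed_date and src columns built above:

from news_extractor import get_news

# Fetches every feed in the rss list, cleans and de-duplicates the items.
df = get_news()
print(df.head())
print(len(df), "articles")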
requirements.txt
CHANGED
@@ -9,4 +9,4 @@ flask==2.2.2
 flask_cors==3.0.10
 gunicorn==20.1.0
 pymongo==4.3.3
-Werkzeug==2.2.2
+Werkzeug==2.2.2