lalithadevi committed on
Commit e857da4
1 Parent(s): 9720d76

Upload 7 files

app.py ADDED
@@ -0,0 +1,26 @@
+ from news_extractor import get_news
+ from db_operations import DBOperations
+ import json
+ from flask import Flask
+ from flask_cors import cross_origin, CORS
+
+ app = Flask(__name__)
+ CORS(app)
+
+
+ @app.route("/")
+ @cross_origin()
+ def update_news():
+     status = "success"
+     try:
+         news_df = get_news()
+         news_json = [*json.loads(news_df.reset_index(drop=True).to_json(orient="index")).values()]
+         db = DBOperations()
+         db.insert_news_into_db(news_json)
+     except Exception:
+         status = "failure"
+     return status
+
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=5002)  # timeout/workers/threads belong to gunicorn (see docker/Dockerfile)
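The single route re-extracts every feed and rewrites the MongoDB collection on each request. The trickiest line is the JSON conversion: to_json(orient="index") keys each row by its positional index, and unpacking .values() discards those keys, leaving one dict per article, which is the shape insert_many expects. A minimal sketch of that transformation on a toy frame (the frame below is illustrative, not real feed output):

import json
import pandas as pd

# Illustrative stand-in for get_news() output
news_df = pd.DataFrame({"title": ["a", "b"], "url": ["u1", "u2"]})

# orient="index" -> {"0": {"title": "a", ...}, "1": {...}}; .values() drops the row keys
news_json = [*json.loads(news_df.reset_index(drop=True).to_json(orient="index")).values()]
print(news_json)  # [{'title': 'a', 'url': 'u1'}, {'title': 'b', 'url': 'u2'}]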
db_operations/__init__.py ADDED
@@ -0,0 +1 @@
+ from db_operations.db_operations import *
db_operations/db_operations.py ADDED
@@ -0,0 +1,50 @@
+ import pymongo
+ import os
+
+
+ class DBOperations:
+     """
+     Inserts processed news into MongoDB
+     """
+     def __init__(self):
+         self.url = os.getenv('DB_URL')
+         self.database = "rss_news_db"
+         self.collection = "rss_news"
+         self.__client = None
+         self.__error = 0
+
+     def __connect(self):
+         try:
+             self.__client = pymongo.MongoClient(self.url)
+             _ = self.__client.list_database_names()  # round trip to validate the connection
+         except Exception:
+             self.__error = 1
+             self.__client = None
+             raise
+
+     def __insert(self, documents):
+         try:
+             # Replace the previous snapshot: drop the collection, then bulk-insert
+             db = self.__client[self.database]
+             coll = db[self.collection]
+             coll.drop()
+             coll.insert_many(documents=documents)
+         except Exception:
+             self.__error = 1
+             raise
+
+     def __close_connection(self):
+         if self.__client is not None:
+             self.__client.close()
+             self.__client = None
+
+     def insert_news_into_db(self, documents: list):
+         if self.url is not None:
+             if self.__error == 0:
+                 self.__connect()
+             if self.__error == 0:
+                 self.__insert(documents=documents)
+             if self.__error == 0:
+                 print("Insertion Successful")
+             if self.__client is not None:
+                 self.__close_connection()
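insert_news_into_db is the only public method: it connects, replaces the collection wholesale, and closes the client, short-circuiting the later steps once __error is set. A minimal usage sketch, assuming DB_URL is set in the environment (the URI below is a placeholder, not the real deployment value):

import os
os.environ["DB_URL"] = "mongodb://localhost:27017"  # placeholder connection string

from db_operations import DBOperations

db = DBOperations()
db.insert_news_into_db([{"title": "sample", "url": "https://example.com", "description": "..."}])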
docker/Dockerfile ADDED
@@ -0,0 +1,7 @@
+ FROM python:3.9-slim
+ WORKDIR /webapp
+ COPY . .
+ RUN rm -r ./docker
+ RUN pip install --no-cache-dir -r requirements.txt
+ CMD ["gunicorn", "app:app", "--bind", "0.0.0.0:5002", "--timeout", "120", "--workers=3", "--threads=3", "--worker-connections=1000"]
+ EXPOSE 5002
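The image copies the repository in, removes the now-redundant docker/ directory, and serves the Flask app through gunicorn on port 5002. Assuming the build runs from the repository root (the image tag below is arbitrary), a typical build-and-run sequence would be:

docker build -t rss-news -f docker/Dockerfile .
docker run -p 5002:5002 -e DB_URL="mongodb://<mongo-host>:27017" rss-news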
news_extractor/__init__.py ADDED
@@ -0,0 +1 @@
+ from news_extractor.news_extractor import *
news_extractor/news_extractor.py ADDED
@@ -0,0 +1,112 @@
+ import pandas as pd
+ import numpy as np
+ from bs4 import BeautifulSoup
+ import requests as r
+ import regex as re
+ from dateutil import parser
+
+
+ def date_time_parser(dt):
+     """
+     Computes the minutes elapsed since the published time.
+     :param dt: published date
+     :return: int, minutes elapsed.
+     """
+     return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
+
+
+ def text_clean(desc):
+     """
+     Cleans the text by removing special chars and HTML entities.
+     :param desc: string containing the description
+     :return: str, cleaned description.
+     """
+     desc = desc.replace("&lt;", "<")
+     desc = desc.replace("&gt;", ">")
+     desc = re.sub("<.*?>", "", desc)
+     desc = desc.replace("&#39;", "'")
+     desc = desc.replace('&quot;', '"')
+     desc = desc.replace('&nbsp;', ' ')
+     desc = desc.replace('&#32;', ' ')
+     return desc
+
+
+ def rss_parser(i):
+     """
+     Returns a data frame of a parsed news item.
+     :param i: single news item in an RSS feed.
+     :return: data frame of the parsed news item.
+     """
+     b1 = BeautifulSoup(str(i), "xml")
+     title = "" if b1.find("title") is None else b1.find("title").get_text()
+     title = text_clean(title)
+     url = "" if b1.find("link") is None else b1.find("link").get_text()
+     desc = "" if b1.find("description") is None else b1.find("description").get_text()
+     desc = text_clean(desc)
+     desc = f'{desc[:300]}...' if len(desc) >= 300 else desc
+     # Items without a pubDate fall back to a fixed old date so they sort last
+     date = "Sat, 12 Aug 2000 13:39:15 +0530" if b1.find("pubDate") is None else b1.find("pubDate").get_text()
+     if url.find("businesstoday.in") >= 0:
+         date = date.replace("GMT", "+0530")
+     date1 = parser.parse(date)
+     return pd.DataFrame({"title": title,
+                          "url": url,
+                          "description": desc,
+                          "parsed_date": date1}, index=[0])
+
+
+ def src_parse(rss):
+     """
+     Returns the root domain name (e.g. livemint.com is extracted from https://www.livemint.com/rss/news).
+     :param rss: RSS URL
+     :return: str, the source name
+     """
+     if rss.find('ndtvprofit') >= 0:
+         rss = 'ndtv profit'
+     rss = rss.replace("https://www.", "")
+     rss = rss.split("/")
+     return rss[0]
+
+
+ def news_agg(rss):
+     """
+     Returns feeds from each 'rss' URL.
+     :param rss: RSS URL.
+     :return: data frame of processed articles.
+     """
+     rss_df = pd.DataFrame()
+     try:
+         resp = r.get(rss, headers={
+             "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
+                           "(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"})
+         b = BeautifulSoup(resp.content, "xml")
+         items = b.find_all("item")
+         rss_df = pd.concat([rss_parser(i) for i in items], ignore_index=True)
+         rss_df["description"] = rss_df["description"].replace([" NULL", ''], np.nan)
+         rss_df.dropna(inplace=True)
+         rss_df["src"] = src_parse(rss)
+         rss_df["elapsed_time"] = rss_df["parsed_date"].apply(date_time_parser)
+         rss_df["parsed_date"] = rss_df["parsed_date"].astype("str")
+         # rss_df["elapsed_time_str"] = rss_df["elapsed_time"].apply(elapsed_time_str)
+     except Exception as e:
+         print(e)
+     return rss_df
+
+
+ # List of RSS feeds
+ rss = ['https://www.economictimes.indiatimes.com/rssfeedstopstories.cms',
+        'https://www.moneycontrol.com/rss/latestnews.xml',
+        'https://www.livemint.com/rss/news',
+        'https://www.zeebiz.com/latest.xml/feed',
+        'https://www.timesofindia.indiatimes.com/rssfeedmostrecent.cms']
+
+
+ def get_news():
+     final_df = pd.concat([news_agg(i) for i in rss], ignore_index=True)
+     final_df.sort_values(by="elapsed_time", inplace=True)
+     # final_df['src_time'] = final_df['src'] + ("&nbsp;" * 5) + final_df["elapsed_time_str"]
+     # final_df.drop(columns=['date', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
+     final_df.drop(columns=['elapsed_time'], inplace=True)
+     final_df.drop_duplicates(subset='description', inplace=True)
+     final_df = final_df.loc[(final_df["title"] != ""), :].copy()
+     return final_df
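get_news is the function app.py imports: it aggregates all five feeds, sorts newest-first by elapsed_time, drops that helper column, de-duplicates on description, and filters out untitled items. A quick local check, assuming network access to the feeds:

from news_extractor import get_news

df = get_news()
# Remaining columns after processing: title, url, description, parsed_date, src
print(df[["title", "src", "parsed_date"]].head())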
requirements.txt CHANGED
@@ -9,4 +9,4 @@ flask==2.2.2
  flask_cors==3.0.10
  gunicorn==20.1.0
  pymongo==4.3.3
- Werkzeug==2.2.2
+ Werkzeug==2.2.2