Randima-Silva committed on
Commit
ca53fb7
1 Parent(s): 9caaa67

initial commit

Browse files
app.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from flask import Flask, render_template, request
2
+ # from weather import get_current_weather
3
+ # from waitress import serve
4
+
5
+ # from transformers import PegasusForConditionalGeneration, PegasusTokenizer
6
+
7
+ # app = Flask(__name__)
8
+
9
+
10
+ # @app.route('/')
11
+ # @app.route('/index')
12
+ # def index():
13
+ # return render_template('index.html')
14
+
15
+ # @app.route('/test')
16
+ # def test():
17
+ # tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
18
+ # # Load model
19
+ # model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
20
+ # return "Hello World!..."
21
+
22
+
23
+ # @app.route('/weather')
24
+ # def get_weather():
25
+ # city = request.args.get('city')
26
+
27
+ # print("working...")
28
+
29
+ # # Check for empty strings or string with only spaces
30
+ # if not bool(city.strip()):
31
+ # # You could render "City Not Found" instead like we do below
32
+ # city = "Kansas City"
33
+
34
+ # weather_data = get_current_weather(city)
35
+
36
+ # # City is not found by API
37
+ # if not weather_data['cod'] == 200:
38
+ # return render_template('city-not-found.html')
39
+
40
+ # return render_template(
41
+ # "weather.html",
42
+ # title=weather_data["name"],
43
+ # status=weather_data["weather"][0]["description"].capitalize(),
44
+ # temp=f"{weather_data['main']['temp']:.1f}",
45
+ # feels_like=f"{weather_data['main']['feels_like']:.1f}"
46
+ # )
47
+
48
+
49
+ # if __name__ == "__main__":
50
+ # serve(app, host="0.0.0.0", port=8000)
51
+
52
+
53
+
54
+
55
+
56
+ # ---------------------------------------------------------------------------------
57
+
58
+
59
+ # from flask import Flask, render_template, request, jsonify
60
+ # from waitress import serve
61
+ # from transformers import PegasusForConditionalGeneration, PegasusTokenizer
62
+ # import time
63
+
64
+ # app = Flask(__name__)
65
+
66
+ # # Assuming the rest of your Flask app code remains unchanged
67
+
68
+ # @app.route('/')
69
+ # @app.route('/index')
70
+ # def index():
71
+ # return render_template('index.html')
72
+
73
+ # @app.route('/test', methods=['POST'])
74
+ # def test():
75
+ # # Extract text from the request body
76
+ # content = request.json.get('content', '')
77
+
78
+ # if not content:
79
+ # return jsonify({"error": "No content provided"}), 400
80
+
81
+ # start_time = time.time()
82
+
83
+ # # Specify the directory where you have saved the model
84
+ # model_save_directory = "./my_project_folder/pegasus_model"
85
+
86
+ # # Load the model and tokenizer from the directory
87
+ # model = PegasusForConditionalGeneration.from_pretrained(model_save_directory)
88
+ # tokenizer = PegasusTokenizer.from_pretrained(model_save_directory)
89
+
90
+ # # Create tokens - number representation of our text
91
+ # tokens = tokenizer(content, truncation=True, padding="longest", return_tensors="pt")
92
+
93
+ # # Summarize
94
+ # summary = model.generate(**tokens, min_length=60, max_length=100)
95
+
96
+ # # Decode summary
97
+ # summarized_text = tokenizer.decode(summary[0], skip_special_tokens=True)
98
+
99
+ # end_time = time.time()
100
+ # execution_time = end_time - start_time
101
+
102
+ # # Return the summarized text and execution time
103
+ # return jsonify({
104
+ # "summarized_text": summarized_text,
105
+ # "execution_time": f"{execution_time} seconds"
106
+ # })
107
+
108
+ # # Assuming you have the `if __name__ == "__main__"` block to run the app
109
+ # if __name__ == "__main__":
110
+ # serve(app, host="0.0.0.0", port=8000)
111
+
112
+
113
+
114
+ # ======================================================================================
115
+
116
+
117
+ # from flask import Flask, request, jsonify
118
+ # from waitress import serve
119
+ from pymongo import MongoClient
120
+ # from transformers import PegasusForConditionalGeneration, PegasusTokenizer
121
+
122
+ from flask import Flask, render_template, request, jsonify
123
+ from flask_cors import CORS
124
+ from waitress import serve
125
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizer
126
+ from transformers import BartForConditionalGeneration, BartTokenizer
127
+
128
+ import torch
129
+ import time
130
+ import time
131
+ from datetime import datetime, timedelta
132
+
133
import os

# Flask application with CORS enabled for every route.
app = Flask(__name__)
CORS(app)

# The MongoDB Atlas connection string embeds a username and password, so it is
# read from the environment rather than being committed to the repository.
# NOTE(review): the credentials that were previously hard-coded here were
# published with this commit and should be rotated.
mongo_conn_str = os.environ.get("MONGODB_URI")
if not mongo_conn_str:
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB Atlas connection string "
        "before starting the app"
    )
client = MongoClient(mongo_conn_str)

# Database and collections used by the routes in this file:
#   summaries_collection - receives the generated summary documents
#   scraped_collection   - scraped articles, flagged via their "summarized" field
db = client['news_scraping_site']
summaries_collection = db.articles
scraped_collection = db.scrapedarticles
144
+
145
+
146
@app.route('/')
def hello():
    """Return a fixed greeting mapping for the root URL."""
    greeting = {"hello": "its fucking working..."}
    return greeting
149
@app.route('/index')
def index():
    """Render the ``index.html`` template.

    NOTE(review): the template's form submits to ``/weather``; the only
    ``/weather`` route visible in this file is commented out - confirm the
    page is still wanted.
    """
    template_name = 'index.html'
    return render_template(template_name)
152
+
153
@app.route('/test', methods=['POST'])
def test():
    """Summarize the posted ``content`` with the local Pegasus model.

    Reads ``content`` from the JSON request body. When it is missing or empty
    the response is a JSON error with HTTP status 400. Otherwise the model and
    tokenizer are loaded from ``./models/pegasus_model``, a summary is
    generated, the original text and summary are inserted into
    ``summaries_collection``, and the summary, the elapsed time and the
    inserted document id are returned as JSON.
    """
    content = request.json.get('content', '')
    if not content:
        return jsonify({"error": "No content provided"}), 400

    started_at = time.time()

    # Model and tokenizer are loaded from disk on every request.
    pegasus_dir = "./models/pegasus_model"
    model = PegasusForConditionalGeneration.from_pretrained(pegasus_dir)
    tokenizer = PegasusTokenizer.from_pretrained(pegasus_dir)

    encoded = tokenizer(
        content,
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )
    generated = model.generate(**encoded, min_length=60, max_length=100)
    summarized_text = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Persist the summary next to its source text.
    insert_result = summaries_collection.insert_one({
        "original_text": content,
        "summarized_text": summarized_text,
        "timestamp": time.time(),
    })

    # The reported time includes the database insert above.
    elapsed = time.time() - started_at

    payload = {
        "summarized_text": summarized_text,
        "execution_time": f"{elapsed} seconds",
        "mongodb_object_id": str(insert_result.inserted_id),
    }
    return jsonify(payload)
188
+
189
+
190
@app.route('/bart', methods=['POST'])
def bart():
    """Summarize the posted ``content`` with the local BART model, chunk by chunk.

    Reads ``content`` from the JSON request body and answers with a JSON error
    and HTTP status 400 when it is missing or empty. The text is tokenized
    without truncation, split into consecutive windows of
    ``tokenizer.model_max_length`` token ids, and each window is summarized
    separately; the per-window summaries are joined with newlines. The result
    is inserted into ``summaries_collection`` and returned as JSON together
    with the elapsed time and the inserted document id.
    """
    print("bart route called")
    content = request.json.get('content', '')
    print(content)

    if not content:
        return jsonify({"error": "No content provided"}), 400

    start_time = time.time()

    # Model and tokenizer are loaded from disk on every request.
    model_save_directory = "./models/bart-large-cnn"
    tokenizer = BartTokenizer.from_pretrained(model_save_directory)
    model = BartForConditionalGeneration.from_pretrained(model_save_directory)

    # Tokenize the whole text once, without truncation.
    inputs_no_trunc = tokenizer(content, max_length=None, return_tensors='pt', truncation=False)
    token_ids = inputs_no_trunc['input_ids'][0]

    # Window size; presumably 1024 for BART, but it depends on the saved
    # tokenizer configuration - TODO confirm.
    window = tokenizer.model_max_length

    # Split into consecutive windows. Stepping with range() stops before
    # len(token_ids), so an input whose length is an exact multiple of the
    # window no longer produces a trailing empty chunk (the previous
    # `chunk_start <= len(...)` loop did, and fed it to model.generate).
    inputs_batch_lst = [
        torch.unsqueeze(token_ids[chunk_start:chunk_start + window], 0)
        for chunk_start in range(0, len(token_ids), window)
    ]

    # Generate one summary per window.
    summary_ids_lst = [
        model.generate(inputs, num_beams=4, max_length=100, early_stopping=True)
        for inputs in inputs_batch_lst
    ]

    # Decode every generated sequence and join the pieces with newlines.
    summary_batch_lst = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for summary_id in summary_ids_lst
        for g in summary_id
    ]
    summary_all = '\n'.join(summary_batch_lst)

    # Measured before the database insert, so it excludes the insert.
    execution_time = time.time() - start_time

    summary_document = {
        "original_text": content,
        "summarized_text": summary_all,
        "timestamp": time.time(),
    }
    result = summaries_collection.insert_one(summary_document)

    return jsonify({
        "summarized_text": summary_all,
        "execution_time": f"{execution_time} seconds",
        "mongodb_article_id": f"{result.inserted_id}",
    })
246
+
247
+
248
@app.route('/one', methods=['POST'])
def one():
    """Summarize a batch of scraped articles that are not yet summarized.

    Reads an optional ``limit`` (default 5) from the JSON request body, fetches
    up to that many documents from ``scraped_collection`` whose ``summarized``
    field is the string ``"false"``, summarizes each article's ``content``
    with the local BART model, inserts the summary into
    ``summaries_collection`` and sets ``summarized`` to ``"true"`` on the
    scraped document. Returns a fixed JSON completion message.
    """
    print("one route called")
    limit = request.json.get('limit', 5)

    # Materialize the cursor exactly once. The previous code built this list
    # and then looped over the original (already exhausted) cursor, so the
    # loop body never ran and no article was summarized.
    # Note: the "summarized" flag is stored as the strings "true"/"false".
    # A filter on "fetched_time" was already disabled in the original query.
    articles_list = list(
        scraped_collection.find({"summarized": "false"}).limit(limit)
    )
    print(articles_list)

    # Model and tokenizer are loaded from disk on every request.
    model_save_directory = "./models/bart-large-cnn"
    tokenizer = BartTokenizer.from_pretrained(model_save_directory)
    model = BartForConditionalGeneration.from_pretrained(model_save_directory)

    for article in articles_list:
        content = article['content']
        start_time = time.time()

        # Inputs longer than 1024 tokens are truncated before summarization.
        inputs = tokenizer(content, return_tensors='pt', max_length=1024, truncation=True)
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        execution_time = time.time() - start_time

        # Store the summary as its own document.
        summaries_collection.insert_one({
            "summary": summary_text,
            "summarized": "true",
        })

        # Flag the source article so it is not picked up again.
        scraped_collection.update_one(
            {"_id": article['_id']},
            {"$set": {"summarized": "true"}},
        )

        print(f"Summarized and updated article ID {article['_id']}, Execution time: {execution_time} seconds")

    return jsonify({"message": "Summarization completed for requested articles"})
305
+
306
+
307
+
308
if __name__ == "__main__":
    # Script entry point: start Flask's own server on all interfaces, port 7860.
    listen_on = {"host": "0.0.0.0", "port": 7860}
    app.run(**listen_on)
310
+
311
+
312
+ # if __name__ == "__main__":
313
+ # # serve(app, host="0.0.0.0", port=9000)
314
+ # app.run(host="0.0.0.0", port=9000, debug=True)
requirements.txt ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ blinker==1.6.2
3
+ certifi==2023.5.7
4
+ charset-normalizer==3.2.0
5
+ click==8.1.5
6
+ colorama==0.4.6
7
+ dnspython==2.6.1
8
+ filelock==3.13.1
9
+ Flask==2.3.2
10
+ Flask-Cors==4.0.0
11
+ fsspec==2024.3.0
12
+ huggingface-hub==0.21.4
13
+ idna==3.4
14
+ itsdangerous==2.1.2
15
+ Jinja2==3.1.2
16
+ MarkupSafe==2.1.3
17
+ mpmath==1.3.0
18
+ networkx==3.2.1
19
+ numpy==1.26.4
20
+ packaging==24.0
21
+ pillow==10.2.0
22
+ pip==24.0
23
+ pymongo==4.6.2
24
+ python-dotenv==1.0.0
25
+ PyYAML==6.0.1
26
+ regex==2023.12.25
27
+ requests==2.31.0
28
+ safetensors==0.4.2
29
+ sentencepiece==0.2.0
30
+ sympy==1.12
31
+ tokenizers==0.15.2
32
+ torch==2.2.1
33
+ torchaudio==2.2.1
34
+ torchvision==0.17.1
35
+ tqdm==4.66.2
36
+ transformers==4.38.2
37
+ typing_extensions==4.10.0
38
+ urllib3==2.0.3
39
+ waitress==2.1.2
40
+ Werkzeug==2.3.6
41
+ gunicorn
static/styles/style.css ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ * {
2
+ margin: 0;
3
+ padding: 0;
4
+ box-sizing: border-box;
5
+ }
6
+
7
+ body {
8
+ padding: 2rem;
9
+ background-color: #333;
10
+ color: whitesmoke;
11
+ min-height: 100vh;
12
+ display: flex;
13
+ flex-direction: column;
14
+ align-items: center;
15
+ gap: 2rem;
16
+ font-size: 2rem;
17
+ }
18
+
19
+ input, button {
20
+ font-size: 2rem;
21
+ padding: 1rem;
22
+ border-radius: 10px;
23
+ }
templates/city-not-found.html ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>City Not Found</title>
8
+ <link href="{{ url_for('static', filename='styles/style.css') }}" rel="stylesheet" />
9
+ </head>
10
+
11
+ <body>
12
+ <h1>City Not Found</h1>
13
+ <h2>Try Again?</h2>
14
+ <form action="/weather">
15
+ <input type="text" name="city" id="city" placeholder="Enter a City" />
16
+ <button type="submit">Submit</button>
17
+ </form>
18
+ </body>
19
+
20
+ </html>
templates/index.html ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>Get Weather Conditions</title>
8
+ <link href="{{ url_for('static', filename='styles/style.css') }}" rel="stylesheet" />
9
+ </head>
10
+
11
+ <body>
12
+ <h1>Get Weather Conditions</h1>
13
+ <form action="/weather">
14
+ <input type="text" name="city" id="city" placeholder="Enter a City" />
15
+ <button type="submit">Submit</button>
16
+ </form>
17
+ </body>
18
+
19
+ </html>
templates/weather.html ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>{{ title }} Weather</title>
8
+ <link href="{{ url_for('static', filename='styles/style.css') }}" rel="stylesheet" />
9
+ </head>
10
+
11
+ <body>
12
+ <h1>{{ title }} Weather</h1>
13
+ <p>{{ status }} and {{ temp }} &deg;</p>
14
+ <p>Feels like {{ feels_like }} &deg;</p>
15
+
16
+ <form action="/weather">
17
+ <input type="text" name="city" id="city" placeholder="Enter a City" />
18
+ <button type="submit">Submit</button>
19
+ </form>
20
+ </body>
21
+
22
+ </html>