from pymongo import MongoClient
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from waitress import serve
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
import time
from datetime import datetime, timedelta
# Flask application instance used by the route decorators in this module.
app = Flask(__name__)

# MongoDB Atlas connection string -- replace with the URI of your own cluster.
mongo_conn_str = 'mongodb+srv://'
client = MongoClient(mongo_conn_str)

# Database and collection handles; rename these to match your deployment.
db = client['news_scraping_site']
summaries_collection = db['articles']
scraped_collection = db['scrapedarticles']
def hello():
    """Return a fixed greeting mapping with the single key ``"hello"``."""
    greeting = {"hello": "its fucking working..."}
    return greeting
def index():
    """Render the ``index.html`` template and return the result."""
    page = render_template('index.html')
    return page
# Hugging Face identifier of the Pegasus checkpoint used by the /test route.
# A local directory such as "./my_project_folder/pegasus_model" can be used
# here instead.
_PEGASUS_CHECKPOINT = "google/pegasus-xsum"

# Holds the loaded (tokenizer, model) pair so the weights are read once per
# process instead of on every request.
_pegasus_cache = {}


def _load_pegasus():
    """Return the Pegasus ``(tokenizer, model)`` pair, loading it on first use."""
    if "pair" not in _pegasus_cache:
        tokenizer = PegasusTokenizer.from_pretrained(_PEGASUS_CHECKPOINT)
        model = PegasusForConditionalGeneration.from_pretrained(_PEGASUS_CHECKPOINT)
        # Publish both objects in one assignment so a concurrent request never
        # observes a half-filled cache.
        _pegasus_cache["pair"] = (tokenizer, model)
    return _pegasus_cache["pair"]


@app.route('/test', methods=['POST'])
def test():
    """Summarize the posted text with Pegasus and store the result in MongoDB.

    Expects a JSON object body with a non-empty string under ``content``.

    Returns:
        A JSON response with ``summarized_text``, ``execution_time`` and
        ``mongodb_object_id``; or a JSON ``error`` with status 400 when no
        usable content was supplied.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so every bad request ends in the same 400 response below.
    payload = request.get_json(silent=True)
    content = payload.get('content', '') if isinstance(payload, dict) else ''
    if not isinstance(content, str) or not content:
        return jsonify({"error": "No content provided"}), 400

    start_time = time.time()

    tokenizer, model = _load_pegasus()
    tokens = tokenizer(content, truncation=True, padding="longest", return_tensors="pt")
    summary = model.generate(**tokens, min_length=60, max_length=100)
    summarized_text = tokenizer.decode(summary[0], skip_special_tokens=True)

    # Save the summary to MongoDB Atlas.
    summary_document = {
        "original_text": content,
        "summarized_text": summarized_text,
        "timestamp": time.time(),
    }
    result = summaries_collection.insert_one(summary_document)

    # The reported time covers model inference and the database insert.
    execution_time = time.time() - start_time

    return jsonify({
        "summarized_text": summarized_text,
        "execution_time": f"{execution_time} seconds",
        # MongoDB ObjectId of the inserted document, as a string.
        "mongodb_object_id": str(result.inserted_id),
    })
# Hugging Face identifier of the BART checkpoint used by the /bart route.
_BART_CHECKPOINT = "facebook/bart-large-cnn"

# Holds the loaded (tokenizer, model) pair so the weights are read once per
# process instead of on every request.
_bart_cache = {}


def _load_bart():
    """Return the BART ``(tokenizer, model)`` pair, loading it on first use."""
    if "pair" not in _bart_cache:
        tokenizer = BartTokenizer.from_pretrained(_BART_CHECKPOINT)
        model = BartForConditionalGeneration.from_pretrained(_BART_CHECKPOINT)
        # Publish both objects in one assignment so a concurrent request never
        # observes a half-filled cache.
        _bart_cache["pair"] = (tokenizer, model)
    return _bart_cache["pair"]


@app.route('/bart', methods=['POST'])
def bart():
    """Summarize the posted text with BART, chunk by chunk, and store it.

    The input is tokenized without truncation and split into windows of
    ``tokenizer.model_max_length`` tokens. Each window is summarized
    separately and the partial summaries are joined with newlines.

    Expects a JSON object body with a non-empty string under ``content``.

    Returns:
        A JSON response with ``summarized_text``, ``execution_time`` and
        ``mongodb_object_id``; or a JSON ``error`` with status 400 when no
        usable content was supplied.
    """
    print("bart route called")

    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so every bad request ends in the same 400 response below.
    payload = request.get_json(silent=True)
    content = payload.get('content', '') if isinstance(payload, dict) else ''
    if not isinstance(content, str) or not content:
        return jsonify({"error": "No content provided"}), 400

    start_time = time.time()

    tokenizer, model = _load_bart()

    # Tokenize the whole text without truncation, then cut the ids into
    # model-sized windows.
    inputs_no_trunc = tokenizer(content, max_length=None, return_tensors='pt', truncation=False)
    token_ids = inputs_no_trunc['input_ids'][0]
    chunk_size = tokenizer.model_max_length  # 1024 for BART

    # range() stops before len(token_ids), so no empty trailing window is
    # produced when the length is an exact multiple of chunk_size.
    inputs_batch_lst = [
        torch.unsqueeze(token_ids[chunk_start:chunk_start + chunk_size], 0)
        for chunk_start in range(0, len(token_ids), chunk_size)
    ]

    # Generate a summary for each window of tokens.
    summary_ids_lst = [
        model.generate(inputs, num_beams=4, max_length=100, early_stopping=True)
        for inputs in inputs_batch_lst
    ]

    # Decode every generated sequence and combine the partial summaries.
    summary_batch_lst = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for summary_id in summary_ids_lst
        for g in summary_id
    ]
    summary_all = '\n'.join(summary_batch_lst)

    # The reported time covers tokenization and generation, not the insert.
    execution_time = time.time() - start_time

    summary_document = {
        "original_text": content,
        "summarized_text": summary_all,
        "timestamp": time.time(),
    }
    result = summaries_collection.insert_one(summary_document)

    # Return the summarized text, execution time and stored document id.
    return jsonify({
        "summarized_text": summary_all,
        "execution_time": f"{execution_time} seconds",
        "mongodb_object_id": str(result.inserted_id),
    })
@app.route('/one', methods=['POST'])
def one():
    """Summarize pending scraped articles with BART and mark them as done.

    Reads an optional ``limit`` (default 5) from the JSON body and fetches
    at most that many documents whose ``summarized`` field is the string
    ``"false"``. Each article's ``content`` is summarized, the summary is
    inserted into the summaries collection, and the article's
    ``summarized`` field is set to the string ``"true"``.

    Returns:
        A JSON ``message`` on completion; or a JSON ``error`` with status
        400 when ``limit`` is not a positive integer.
    """
    print("bart route called")

    # silent=True yields None instead of raising on a missing or malformed
    # JSON body; in that case the default limit applies.
    payload = request.get_json(silent=True)
    limit = payload.get('limit', 5) if isinstance(payload, dict) else 5
    try:
        limit = int(limit)
    except (TypeError, ValueError):
        return jsonify({"error": "limit must be a positive integer"}), 400
    if limit <= 0:
        return jsonify({"error": "limit must be a positive integer"}), 400

    # Query for articles that have not been summarized yet. The cursor is
    # materialized once because it can only be iterated a single time.
    # (A "fetched_time" lower bound is not applied.)
    cursor = scraped_collection.find({"summarized": "false"}).limit(limit)
    articles_list = list(cursor)

    # Nothing pending: skip loading the model.
    if not articles_list:
        return jsonify({"message": "Summarization completed for requested articles"})

    # Load the tokenizer and model once for the whole batch.
    model_save_directory = "facebook/bart-large-cnn"
    tokenizer = BartTokenizer.from_pretrained(model_save_directory)
    model = BartForConditionalGeneration.from_pretrained(model_save_directory)

    for article in articles_list:
        content = article['content']
        start_time = time.time()

        # Summarize the content; input is truncated to 1024 tokens.
        inputs = tokenizer(content, return_tensors='pt', max_length=1024, truncation=True)
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        execution_time = time.time() - start_time

        summaries_collection.insert_one({"summary": summary_text})

        # Flag the source article so it is not picked up again.
        scraped_collection.update_one(
            {"_id": article['_id']},
            {"$set": {"summarized": "true"}},
        )

        print(f"Summarized and updated article ID {article['_id']}, Execution time: {execution_time} seconds")

    return jsonify({"message": "Summarization completed for requested articles"})
if __name__ == "__main__":
    # Start Flask's built-in server on port 7860 when run as a script.
    # NOTE(review): the host string is empty in this file; confirm the
    # intended bind address before deploying.
    app.run(host="", port=7860)
    # Alternative entry points kept for reference:
    #   serve(app, host="", port=9000)            # waitress WSGI server
    #   app.run(host="", port=9000, debug=True)   # Flask with debug enabled