# NOTE(review): the original file began with stray "Spaces: Sleeping" status
# text pasted from the Hugging Face Spaces UI. It is not valid Python and
# broke the module at import time, so it is preserved here as a comment only.
# from flask import Flask, render_template, request | |
# from weather import get_current_weather | |
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer | |
# app = Flask(__name__) | |
# @app.route('/') | |
# @app.route('/index') | |
# def index(): | |
# return render_template('index.html') | |
# @app.route('/test') | |
# def test(): | |
# tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum") | |
# # Load model | |
# model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum") | |
# return "Hello World!..." | |
# @app.route('/weather') | |
# def get_weather(): | |
# city = request.args.get('city') | |
# print("working...") | |
# # Check for empty strings or string with only spaces | |
# if not bool(city.strip()): | |
# # You could render "City Not Found" instead like we do below | |
# city = "Kansas City" | |
# weather_data = get_current_weather(city) | |
# # City is not found by API | |
# if not weather_data['cod'] == 200: | |
# return render_template('city-not-found.html') | |
# return render_template( | |
# "weather.html", | |
# title=weather_data["name"], | |
# status=weather_data["weather"][0]["description"].capitalize(), | |
# temp=f"{weather_data['main']['temp']:.1f}", | |
# feels_like=f"{weather_data['main']['feels_like']:.1f}" | |
# ) | |
# if __name__ == "__main__": | |
# serve(app, host="0.0.0.0", port=8000) | |
# --------------------------------------------------------------------------------- | |
# from flask import Flask, render_template, request, jsonify | |
# from waitress import serve | |
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer | |
# import time | |
# app = Flask(__name__) | |
# # Assuming the rest of your Flask app code remains unchanged | |
# @app.route('/') | |
# @app.route('/index') | |
# def index(): | |
# return render_template('index.html') | |
# @app.route('/test', methods=['POST']) | |
# def test(): | |
# # Extract text from the request body | |
# content = request.json.get('content', '') | |
# if not content: | |
# return jsonify({"error": "No content provided"}), 400 | |
# start_time = time.time() | |
# # Specify the directory where you have saved the model | |
# model_save_directory = "./my_project_folder/pegasus_model" | |
# # Load the model and tokenizer from the directory | |
# model = PegasusForConditionalGeneration.from_pretrained(model_save_directory) | |
# tokenizer = PegasusTokenizer.from_pretrained(model_save_directory) | |
# # Create tokens - number representation of our text | |
# tokens = tokenizer(content, truncation=True, padding="longest", return_tensors="pt") | |
# # Summarize | |
# summary = model.generate(**tokens, min_length=60, max_length=100) | |
# # Decode summary | |
# summarized_text = tokenizer.decode(summary[0], skip_special_tokens=True) | |
# end_time = time.time() | |
# execution_time = end_time - start_time | |
# # Return the summarized text and execution time | |
# return jsonify({ | |
# "summarized_text": summarized_text, | |
# "execution_time": f"{execution_time} seconds" | |
# }) | |
# # Assuming you have the `if __name__ == "__main__"` block to run the app | |
# if __name__ == "__main__": | |
# serve(app, host="0.0.0.0", port=8000) | |
# ====================================================================================== | |
# from flask import Flask, request, jsonify | |
# from waitress import serve | |
from pymongo import MongoClient | |
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer | |
from flask import Flask, render_template, request, jsonify | |
from flask_cors import CORS | |
from waitress import serve | |
from transformers import PegasusForConditionalGeneration, PegasusTokenizer | |
from transformers import BartForConditionalGeneration, BartTokenizer | |
import torch | |
import time | |
import time | |
from datetime import datetime, timedelta | |
# Flask application setup. CORS is enabled for all routes so a browser
# front end served from another origin can call this API.
app = Flask(__name__)
CORS(app)
# Use your MongoDB Atlas connection string
# SECURITY NOTE(review): database credentials are hard-coded in source.
# Move this connection string into an environment variable and rotate the
# exposed password.
mongo_conn_str = 'mongodb+srv://final_year_project:Ngd2jIj9PpvQfb5i@cluster0.3mhko.mongodb.net/news_scraping_site?retryWrites=true&w=majority&appName=Cluster0'
client = MongoClient(mongo_conn_str)
# Adjust these to match your specific database and collection names
db = client['news_scraping_site']
# 'articles' holds generated summaries; 'scrapedarticles' holds raw
# scraped articles awaiting summarization.
summaries_collection = db.articles
scraped_collection = db.scrapedarticles
def hello():
    """Health-check helper: return a small JSON-serializable status dict.

    NOTE(review): the original response string contained profanity, which is
    a defect in a user-facing payload; replaced with a professional message.
    No @app.route decorator is attached here (it exists only in the
    commented-out code above) — confirm and re-add if this should be served.
    """
    return {"hello": "it's working..."}
def index():
    """Render and return the landing-page template.

    NOTE(review): no @app.route decorator is attached here (the decorators
    exist only in the commented-out code above) — confirm and re-add if this
    page should be served.
    """
    return render_template('index.html')
def test():
    """Summarize the posted text with Pegasus and persist the result.

    Expects a JSON body with a 'content' field. Returns JSON containing the
    summary, the wall-clock execution time, and the MongoDB ObjectId of the
    stored document; returns 400 when no content is supplied.

    NOTE(review): no @app.route decorator is attached here (the POST route
    exists only in the commented-out code above) — confirm and re-add if
    this endpoint should be served.
    """
    content = request.json.get('content', '')
    if not content:
        return jsonify({"error": "No content provided"}), 400
    start_time = time.time()
    # The original reloaded the multi-GB model on every request; the cached
    # loader below makes every call after the first essentially free.
    model, tokenizer = _load_pegasus()
    # Tokenize (truncating to the model's max input length) and summarize.
    tokens = tokenizer(content, truncation=True, padding="longest", return_tensors="pt")
    summary = model.generate(**tokens, min_length=60, max_length=100)
    summarized_text = tokenizer.decode(summary[0], skip_special_tokens=True)
    # Save the summary to MongoDB Atlas alongside the original text.
    summary_document = {
        "original_text": content,
        "summarized_text": summarized_text,
        "timestamp": time.time()
    }
    result = summaries_collection.insert_one(summary_document)
    end_time = time.time()
    execution_time = end_time - start_time
    return jsonify({
        "summarized_text": summarized_text,
        "execution_time": f"{execution_time} seconds",
        "mongodb_object_id": str(result.inserted_id)  # Return the MongoDB Object ID of the inserted document
    })


def _load_pegasus():
    """Load the Pegasus model and tokenizer once and cache them.

    Uses a function attribute as the cache so no new imports are needed.
    Returns a (model, tokenizer) tuple.
    """
    if not hasattr(_load_pegasus, "_cache"):
        # model_save_directory = "./my_project_folder/pegasus_model"
        model_save_directory = "google/pegasus-xsum"
        _load_pegasus._cache = (
            PegasusForConditionalGeneration.from_pretrained(model_save_directory),
            PegasusTokenizer.from_pretrained(model_save_directory),
        )
    return _load_pegasus._cache
def bart():
    """Summarize posted text with BART, chunking inputs longer than the model limit.

    Expects a JSON body with a 'content' field. Text longer than the model's
    max input length (1024 tokens for BART) is split into chunks that are
    summarized independently; per-chunk summaries are joined with newlines.
    The combined summary is stored in MongoDB and returned with timing info;
    returns 400 when no content is supplied.

    NOTE(review): no @app.route decorator is attached here — confirm and
    re-add (as a POST route) if this endpoint should be served.
    """
    print("bart route called")
    # Get the content from the request
    content = request.json.get('content', '')
    print(content)
    if not content:
        return jsonify({"error": "No content provided"}), 400
    start_time = time.time()
    # Cached load: the original reloaded the large model on every request.
    tokenizer, model = _load_bart()
    # Tokenize WITHOUT truncation so the full text can be chunked manually.
    inputs_no_trunc = tokenizer(content, max_length=None, return_tensors='pt', truncation=False)
    input_ids = inputs_no_trunc['input_ids'][0]
    max_len = tokenizer.model_max_length  # 1024 for BART
    # BUGFIX: the original `while chunk_start <= len(...)` produced an empty
    # trailing chunk whenever the token count was an exact multiple of
    # max_len; range() with a strict bound avoids that.
    inputs_batch_lst = []
    for chunk_start in range(0, len(input_ids), max_len):
        chunk = input_ids[chunk_start:chunk_start + max_len]
        inputs_batch_lst.append(torch.unsqueeze(chunk, 0))
    # Generate summaries for each batch of tokens
    summary_ids_lst = [model.generate(inputs, num_beams=4, max_length=100, early_stopping=True) for inputs in inputs_batch_lst]
    # Combine the batched summaries
    summary_batch_lst = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for summary_id in summary_ids_lst for g in summary_id]
    summary_all = '\n'.join(summary_batch_lst)
    # Calculate the execution time
    execution_time = time.time() - start_time
    summary_document = {
        "original_text": content,
        "summarized_text": summary_all,
        "timestamp": time.time()
    }
    result = summaries_collection.insert_one(summary_document)
    # Return the summarized text and execution time
    return jsonify({
        "summarized_text": summary_all,
        "execution_time": f"{execution_time} seconds",
        "mongodb_article_id": f"{result.inserted_id}"
    })


def _load_bart():
    """Load the BART tokenizer and model once and cache them.

    Uses a function attribute as the cache so no new imports are needed.
    Returns a (tokenizer, model) tuple.
    """
    if not hasattr(_load_bart, "_cache"):
        model_save_directory = "facebook/bart-large-cnn"
        _load_bart._cache = (
            BartTokenizer.from_pretrained(model_save_directory),
            BartForConditionalGeneration.from_pretrained(model_save_directory),
        )
    return _load_bart._cache
def one():
    """Summarize up to `limit` not-yet-summarized scraped articles with BART.

    Reads an optional 'limit' (default 5) from the JSON body, fetches that
    many documents whose 'summarized' flag is the string "false", summarizes
    each article's 'content', stores the summary in the summaries collection,
    and marks the source document as summarized.

    NOTE(review): no @app.route decorator is attached here — confirm and
    re-add if this endpoint should be served.
    """
    print("one route called")  # BUGFIX: was "bart route called" (copy-paste slip)
    # Get the limit from the request
    limit = request.json.get('limit', 5)
    # Time threshold is computed but its query filter is currently disabled below.
    time_threshold = datetime.now() - timedelta(hours=1)
    # Query for articles
    articles = scraped_collection.find({
        "summarized": "false"
        # "fetched_time": {"$gte": time_threshold}
    }).limit(limit)
    # BUGFIX: materialize the cursor exactly once. The original called
    # list(articles) and then iterated the already-consumed cursor with
    # `for article in articles:`, so the summarization loop never ran.
    articles_list = list(articles)
    print(articles_list)
    # Path to your BART model
    model_save_directory = "facebook/bart-large-cnn"
    # Load the tokenizer and model
    tokenizer = BartTokenizer.from_pretrained(model_save_directory)
    model = BartForConditionalGeneration.from_pretrained(model_save_directory)
    for article in articles_list:
        content = article['content']
        start_time = time.time()
        # Summarize the content (truncated to the model's 1024-token max).
        inputs = tokenizer(content, return_tensors='pt', max_length=1024, truncation=True)
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        execution_time = time.time() - start_time
        summary_document = {
            # "original_text": content,
            "summary": summary_text,
            "summarized": "true"
            # "timestamp": time.time()
        }
        result = summaries_collection.insert_one(summary_document)
        # Save the summarized text back to the database
        result_scraped = scraped_collection.update_one(
            {"_id": article['_id']},
            {"$set": {"summarized": "true"}}
        )
        print(f"Summarized and updated article ID {article['_id']}, Execution time: {execution_time} seconds")
    return jsonify({"message": "Summarization completed for requested articles"})
if __name__ == "__main__":
    # Development entry point: Flask's built-in server on all interfaces.
    # NOTE(review): port 7860 is presumably chosen for Hugging Face Spaces
    # (its expected port) — confirm; the commented-out code above used
    # waitress on other ports instead.
    app.run(host="0.0.0.0", port=7860)
# if __name__ == "__main__": | |
# # serve(app, host="0.0.0.0", port=9000) | |
# app.run(host="0.0.0.0", port=9000, debug=True) | |