# from flask import Flask, render_template, request
# from weather import get_current_weather
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# app = Flask(__name__)

# @app.route('/')
# @app.route('/index')
# def index():
#     return render_template('index.html')

# @app.route('/test')
# def test():
#     tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
#     # Load model
#     model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
#     return "Hello World!..."

# @app.route('/weather')
# def get_weather():
#     city = request.args.get('city')
#     print("working...")
#     # Check for empty strings or strings with only spaces
#     if not bool(city.strip()):
#         # You could render "City Not Found" instead like we do below
#         city = "Kansas City"
#     weather_data = get_current_weather(city)
#     # City is not found by API
#     if not weather_data['cod'] == 200:
#         return render_template('city-not-found.html')
#     return render_template(
#         "weather.html",
#         title=weather_data["name"],
#         status=weather_data["weather"][0]["description"].capitalize(),
#         temp=f"{weather_data['main']['temp']:.1f}",
#         feels_like=f"{weather_data['main']['feels_like']:.1f}"
#     )

# if __name__ == "__main__":
#     serve(app, host="0.0.0.0", port=8000)
# ---------------------------------------------------------------------------------
# from flask import Flask, render_template, request, jsonify
# from waitress import serve
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# import time

# app = Flask(__name__)

# # Assuming the rest of your Flask app code remains unchanged
# @app.route('/')
# @app.route('/index')
# def index():
#     return render_template('index.html')

# @app.route('/test', methods=['POST'])
# def test():
#     # Extract text from the request body
#     content = request.json.get('content', '')
#     if not content:
#         return jsonify({"error": "No content provided"}), 400

#     start_time = time.time()

#     # Specify the directory where you have saved the model
#     model_save_directory = "./my_project_folder/pegasus_model"

#     # Load the model and tokenizer from the directory
#     model = PegasusForConditionalGeneration.from_pretrained(model_save_directory)
#     tokenizer = PegasusTokenizer.from_pretrained(model_save_directory)

#     # Create tokens - number representation of our text
#     tokens = tokenizer(content, truncation=True, padding="longest", return_tensors="pt")

#     # Summarize
#     summary = model.generate(**tokens, min_length=60, max_length=100)

#     # Decode summary
#     summarized_text = tokenizer.decode(summary[0], skip_special_tokens=True)

#     end_time = time.time()
#     execution_time = end_time - start_time

#     # Return the summarized text and execution time
#     return jsonify({
#         "summarized_text": summarized_text,
#         "execution_time": f"{execution_time} seconds"
#     })

# # Assuming you have the `if __name__ == "__main__"` block to run the app
# if __name__ == "__main__":
#     serve(app, host="0.0.0.0", port=8000)
# ======================================================================================
# from flask import Flask, request, jsonify
# from waitress import serve
from pymongo import MongoClient
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from waitress import serve
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
import time
from datetime import datetime, timedelta

app = Flask(__name__)
CORS(app)

# Use your MongoDB Atlas connection string
mongo_conn_str = 'mongodb+srv://final_year_project:Ngd2jIj9PpvQfb5i@cluster0.3mhko.mongodb.net/news_scraping_site?retryWrites=true&w=majority&appName=Cluster0'
client = MongoClient(mongo_conn_str)

# Adjust these to match your specific database and collection names
db = client['news_scraping_site']
summaries_collection = db.articles        # stores generated summaries
scraped_collection = db.scrapedarticles   # stores scraped articles awaiting summarization
# NOTE: the route decorators below were missing in the source. The paths for index()
# are restored from the commented-out versions above; the path for hello() is assumed.
@app.route('/hello')
def hello():
    return {"hello": "it's working..."}

@app.route('/')
@app.route('/index')
def index():
    return render_template('index.html')
# Summarize a single piece of text with Pegasus and store the result in MongoDB.
# The route and method are restored from the commented-out version above.
@app.route('/test', methods=['POST'])
def test():
    content = request.json.get('content', '')
    if not content:
        return jsonify({"error": "No content provided"}), 400

    start_time = time.time()

    # model_save_directory = "./my_project_folder/pegasus_model"
    model_save_directory = "google/pegasus-xsum"
    model = PegasusForConditionalGeneration.from_pretrained(model_save_directory)
    tokenizer = PegasusTokenizer.from_pretrained(model_save_directory)

    # Tokenize the input text and generate a summary
    tokens = tokenizer(content, truncation=True, padding="longest", return_tensors="pt")
    summary = model.generate(**tokens, min_length=60, max_length=100)
    summarized_text = tokenizer.decode(summary[0], skip_special_tokens=True)

    # Save the summary to MongoDB Atlas
    summary_document = {
        "original_text": content,
        "summarized_text": summarized_text,
        "timestamp": time.time()
    }
    result = summaries_collection.insert_one(summary_document)

    end_time = time.time()
    execution_time = end_time - start_time

    return jsonify({
        "summarized_text": summarized_text,
        "execution_time": f"{execution_time} seconds",
        "mongodb_object_id": str(result.inserted_id)  # MongoDB ObjectId of the inserted document
    })
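# Example client call for the handler above (illustrative sketch; assumes the app is
# served on port 7860 as in the __main__ block below, and that the route is POST /test
# as in the earlier commented-out version):
#
#   import requests
#   resp = requests.post("http://localhost:7860/test",
#                        json={"content": "Long article text to summarize..."})
#   print(resp.json()["summarized_text"])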
# Summarize text with BART, splitting long inputs into model-sized chunks.
# The route decorator was missing in the source; the path below is assumed.
@app.route('/bart', methods=['POST'])
def bart():
    print("bart route called")

    # Get the content from the request
    content = request.json.get('content', '')
    print(content)

    # Check if content is provided
    if not content:
        return jsonify({"error": "No content provided"}), 400

    start_time = time.time()

    # Path to your BART model, adjust as necessary
    model_save_directory = "facebook/bart-large-cnn"

    # Load the tokenizer and model
    tokenizer = BartTokenizer.from_pretrained(model_save_directory)
    model = BartForConditionalGeneration.from_pretrained(model_save_directory)

    # Tokenize without truncation, then split the token ids into windows the model can handle
    inputs_no_trunc = tokenizer(content, max_length=None, return_tensors='pt', truncation=False)

    chunk_start = 0
    chunk_end = tokenizer.model_max_length  # 1024 for BART
    inputs_batch_lst = []
    # Use < (not <=) so no empty trailing chunk is produced when the input length
    # is an exact multiple of the window size
    while chunk_start < len(inputs_no_trunc['input_ids'][0]):
        inputs_batch = inputs_no_trunc['input_ids'][0][chunk_start:chunk_end]
        inputs_batch = torch.unsqueeze(inputs_batch, 0)
        inputs_batch_lst.append(inputs_batch)
        chunk_start += tokenizer.model_max_length
        chunk_end += tokenizer.model_max_length

    # Generate a summary for each chunk of tokens
    summary_ids_lst = [model.generate(inputs, num_beams=4, max_length=100, early_stopping=True) for inputs in inputs_batch_lst]

    # Decode and combine the per-chunk summaries
    summary_batch_lst = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for summary_id in summary_ids_lst for g in summary_id]
    summary_all = '\n'.join(summary_batch_lst)

    # Calculate the execution time
    execution_time = time.time() - start_time

    # Save the combined summary to MongoDB Atlas
    summary_document = {
        "original_text": content,
        "summarized_text": summary_all,
        "timestamp": time.time()
    }
    result = summaries_collection.insert_one(summary_document)

    # Return the summarized text and execution time
    return jsonify({
        "summarized_text": summary_all,
        "execution_time": f"{execution_time} seconds",
        "mongodb_article_id": f"{result.inserted_id}"
    })
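# Example client call for the handler above (illustrative sketch; the /bart path is an
# assumption, since the source carried no route decorator for this function):
#
#   import requests
#   resp = requests.post("http://localhost:7860/bart",
#                        json={"content": "Long article text to summarize..."})
#   print(resp.json()["summarized_text"], resp.json()["mongodb_article_id"])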
# Batch-summarize up to `limit` scraped articles that have not been summarized yet.
# The route decorator was missing in the source; the path below is assumed.
@app.route('/one', methods=['POST'])
def one():
    print("batch summarization route called")

    # Get the limit from the request
    limit = request.json.get('limit', 5)

    # Calculate the time threshold (1 hour ago)
    time_threshold = datetime.now() - timedelta(hours=1)

    # Query for articles that have not been summarized yet
    articles = scraped_collection.find({
        "summarized": "false"
        # "fetched_time": {"$gte": time_threshold}
    }).limit(limit)

    # print(len(articles))
    # Materialize the cursor once; iterating the cursor itself again later would yield nothing
    articles_list = list(articles)
    print(articles_list)

    # Path to your BART model
    model_save_directory = "facebook/bart-large-cnn"

    # Load the tokenizer and model
    tokenizer = BartTokenizer.from_pretrained(model_save_directory)
    model = BartForConditionalGeneration.from_pretrained(model_save_directory)

    for article in articles_list:
        content = article['content']
        start_time = time.time()

        # Summarize the content (truncated to the 1024-token BART window)
        inputs = tokenizer(content, return_tensors='pt', max_length=1024, truncation=True)
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        execution_time = time.time() - start_time

        summary_document = {
            # "original_text": content,
            "summary": summary_text,
            "summarized": "true"
            # "timestamp": time.time()
        }
        result = summaries_collection.insert_one(summary_document)

        # Mark the scraped article as summarized
        result_scraped = scraped_collection.update_one(
            {"_id": article['_id']},
            {"$set": {"summarized": "true"}}
        )
        print(f"Summarized and updated article ID {article['_id']}, Execution time: {execution_time} seconds")

    return jsonify({"message": "Summarization completed for requested articles"})
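# Example client call for the handler above (illustrative sketch; the /one path is an
# assumption, since the source carried no route decorator for this function):
#
#   import requests
#   resp = requests.post("http://localhost:7860/one", json={"limit": 5})
#   print(resp.json()["message"])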
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)

# if __name__ == "__main__":
#     # serve(app, host="0.0.0.0", port=9000)
#     app.run(host="0.0.0.0", port=9000, debug=True)