import os
import re
import heapq

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from flask import Flask, request, jsonify, render_template
from werkzeug.utils import secure_filename
import PyPDF2
import docx

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')  # required by newer NLTK releases
nltk.download('stopwords')

app = Flask(__name__)


class SimpleDocumentAgent:
    def __init__(self):
        """Initialize a simple document processing agent using free libraries."""
        self.current_document_text = ""
        self.document_name = ""
        self.stop_words = set(stopwords.words('english'))

    def load_document(self, file_path):
        """Load document text from a PDF or DOCX file."""
        try:
            if file_path.endswith('.pdf'):
                self.document_name = os.path.basename(file_path)
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    self.current_document_text = ""
                    for page in pdf_reader.pages:
                        # extract_text() can return None for image-only pages
                        self.current_document_text += page.extract_text() or ""
            elif file_path.endswith('.docx'):
                self.document_name = os.path.basename(file_path)
                doc = docx.Document(file_path)
                self.current_document_text = "\n".join(
                    para.text for para in doc.paragraphs
                )
            else:
                return "Unsupported file format. Please use PDF or DOCX."
            return f"Successfully loaded {self.document_name}"
        except Exception as e:
            return f"Error loading document: {str(e)}"

    def summarize_document(self, sentences_count=5):
        """Generate a summary using frequency-based sentence extraction."""
        if not self.current_document_text:
            return "No document loaded. Please load a document first."

        # Tokenize the text into sentences
        sentences = sent_tokenize(self.current_document_text)

        # Calculate word frequencies, ignoring stop words and punctuation
        words = word_tokenize(self.current_document_text.lower())
        words = [word for word in words
                 if word.isalnum() and word not in self.stop_words]
        freq_dist = FreqDist(words)

        # Score each sentence by the summed frequency of its words
        sentence_scores = {}
        for i, sentence in enumerate(sentences):
            for word in word_tokenize(sentence.lower()):
                if word in freq_dist:
                    sentence_scores[i] = sentence_scores.get(i, 0) + freq_dist[word]

        # Get the top sentences, then sort the indices to preserve original order
        summary_sentences_indices = heapq.nlargest(
            sentences_count, sentence_scores, key=sentence_scores.get
        )
        summary_sentences_indices.sort()

        # Create the summary
        summary = [sentences[i] for i in summary_sentences_indices]
        return " ".join(summary)
    def extract_information(self, info_type):
        """Extract specific information like dates, emails, or phone numbers."""
        if not self.current_document_text:
            return "No document loaded. Please load a document first."

        info_type_lower = info_type.lower()
        results = []

        if info_type_lower in ("email", "emails"):
            # Pattern for email addresses
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            results = re.findall(email_pattern, self.current_document_text)
        elif info_type_lower in ("phone", "phones", "phone numbers"):
            # Pattern for phone numbers; the country-code group is non-capturing
            # so that findall returns the full match rather than just the group
            phone_pattern = r'\b(?:\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b'
            results = re.findall(phone_pattern, self.current_document_text)
        elif info_type_lower in ("date", "dates"):
            # Pattern for numeric dates (simple pattern, can be improved)
            date_pattern = r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b'
            results = re.findall(date_pattern, self.current_document_text)
        elif info_type_lower in ("url", "urls", "website", "websites"):
            # Pattern for URLs
            url_pattern = r'https?://[^\s]+'
            results = re.findall(url_pattern, self.current_document_text)
        else:
            # If not a known pattern, return sentences that mention the term
            results = [sentence
                       for sentence in sent_tokenize(self.current_document_text)
                       if info_type_lower in sentence.lower()]

        if not results:
            return f"No {info_type} found in the document."
        return results

    def answer_question(self, question):
        """Attempt to answer questions about the document using keyword matching."""
        if not self.current_document_text:
            return "No document loaded. Please load a document first."

        # Tokenize the question and remove stop words
        question_words = [w.lower() for w in word_tokenize(question)
                          if w.lower() not in self.stop_words and w.isalnum()]

        # Tokenize the document into sentences
        sentences = sent_tokenize(self.current_document_text)

        # Score sentences based on how many question words they contain
        sentence_scores = {}
        for i, sentence in enumerate(sentences):
            words = [w.lower() for w in word_tokenize(sentence)]
            score = sum(1 for word in question_words if word in words)
            if score > 0:
                sentence_scores[i] = score

        # If no matches found
        if not sentence_scores:
            return "I couldn't find information related to your question in the document."

        # Get the top 3 most relevant sentences, in document order
        top_indices = heapq.nlargest(3, sentence_scores, key=sentence_scores.get)
        relevant_sentences = [sentences[i] for i in sorted(top_indices)]
        return " ".join(relevant_sentences)
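# A minimal sketch of driving the agent directly, without the web UI.
# The filename and question below are illustrative, not part of the app:
#
#   agent = SimpleDocumentAgent()
#   print(agent.load_document("report.pdf"))
#   print(agent.summarize_document(sentences_count=3))
#   print(agent.extract_information("emails"))
#   print(agent.answer_question("What is the main conclusion?"))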
# Set up Flask routes
@app.route('/')
def home():
    return render_template('index.html')


@app.route('/upload', methods=['POST'])
def upload_file():
    # Check if the post request has the file part
    if 'file' not in request.files:
        return jsonify({"error": "No file part"})
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"})

    # Save the file temporarily, sanitizing the client-supplied name
    os.makedirs("temp", exist_ok=True)
    file_path = os.path.join("temp", secure_filename(file.filename))
    file.save(file_path)

    # Process the file, then remove the temporary copy
    result = agent.load_document(file_path)
    os.remove(file_path)

    return jsonify({"message": result})


@app.route('/summarize', methods=['POST'])
def summarize():
    sentences = request.json.get('sentences', 5)
    result = agent.summarize_document(sentences)
    return jsonify({"summary": result})


@app.route('/extract', methods=['POST'])
def extract():
    info_type = request.json.get('info_type', '')
    result = agent.extract_information(info_type)
    return jsonify({"extracted": result})


@app.route('/question', methods=['POST'])
def question():
    query = request.json.get('question', '')
    result = agent.answer_question(query)
    return jsonify({"answer": result})


# Initialize the agent
agent = SimpleDocumentAgent()


# Create a basic HTML template wired to the routes above
def get_index_template():
    html_content = """<!DOCTYPE html>
<html>
<head>
    <title>Document Processing Agent</title>
</head>
<body>
    <h1>Document Processing Agent</h1>

    <h2>Upload Document</h2>
    <input type="file" id="file-input">
    <button onclick="uploadFile()">Upload</button>
    <p id="upload-status"></p>

    <h2>Summarize Document</h2>
    <input type="number" id="sentence-count" value="5" min="1">
    <button onclick="summarize()">Summarize</button>
    <p id="summary"></p>

    <h2>Extract Information</h2>
    <input type="text" id="info-type" placeholder="emails, phones, dates, urls...">
    <button onclick="extract()">Extract</button>
    <pre id="extracted"></pre>

    <h2>Ask Questions</h2>
    <input type="text" id="question-input" placeholder="Ask about the document">
    <button onclick="ask()">Ask</button>
    <p id="answer"></p>

    <script>
        function uploadFile() {
            const formData = new FormData();
            formData.append("file", document.getElementById("file-input").files[0]);
            fetch("/upload", {method: "POST", body: formData})
                .then(r => r.json())
                .then(d => document.getElementById("upload-status").textContent =
                    d.message || d.error);
        }
        function postJson(url, payload, handler) {
            fetch(url, {
                method: "POST",
                headers: {"Content-Type": "application/json"},
                body: JSON.stringify(payload)
            }).then(r => r.json()).then(handler);
        }
        function summarize() {
            const n = parseInt(document.getElementById("sentence-count").value);
            postJson("/summarize", {sentences: n},
                d => document.getElementById("summary").textContent = d.summary);
        }
        function extract() {
            postJson("/extract", {info_type: document.getElementById("info-type").value},
                d => document.getElementById("extracted").textContent =
                    JSON.stringify(d.extracted, null, 2));
        }
        function ask() {
            postJson("/question", {question: document.getElementById("question-input").value},
                d => document.getElementById("answer").textContent = d.answer);
        }
    </script>
</body>
</html>
""" return html_content if __name__ == "__main__": # Create a templates folder and index.html os.makedirs("templates", exist_ok=True) with open("templates/index.html", "w") as f: f.write(get_index_template()) # Run the app app.run(debug=True)