Spaces:
Sleeping
Sleeping
File size: 14,912 Bytes
1f73729 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 |
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from flask import Flask, request, jsonify, render_template
import PyPDF2
import docx
import re
import heapq
# Download the NLTK resources used below: the Punkt sentence tokenizer
# (sent_tokenize/word_tokenize) and the English stopword list.
nltk.download('punkt')
# NLTK >= 3.8.2 moved the Punkt models to 'punkt_tab'; on older NLTK this
# call just reports an unknown package and returns False, so it is safe.
nltk.download('punkt_tab')
nltk.download('stopwords')

app = Flask(__name__)
class SimpleDocumentAgent:
def __init__(self):
"""Initialize a simple document processing agent using free libraries."""
self.current_document_text = ""
self.document_name = ""
self.stop_words = set(stopwords.words('english'))
def load_document(self, file_path):
"""Load document from PDF or DOCX file."""
try:
if file_path.endswith('.pdf'):
self.document_name = os.path.basename(file_path)
with open(file_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
self.current_document_text = ""
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
self.current_document_text += page.extract_text()
elif file_path.endswith('.docx'):
self.document_name = os.path.basename(file_path)
doc = docx.Document(file_path)
self.current_document_text = "\n".join([para.text for para in doc.paragraphs])
else:
return "Unsupported file format. Please use PDF or DOCX."
return f"Successfully loaded {self.document_name}"
except Exception as e:
return f"Error loading document: {str(e)}"
def summarize_document(self, sentences_count=5):
"""Generate a summary using frequency-based extraction."""
if not self.current_document_text:
return "No document loaded. Please load a document first."
# Tokenize the text into sentences
sentences = sent_tokenize(self.current_document_text)
# Calculate word frequencies
words = word_tokenize(self.current_document_text.lower())
words = [word for word in words if word.isalnum() and word not in self.stop_words]
freq_dist = FreqDist(words)
# Calculate sentence scores based on word frequencies
sentence_scores = {}
for i, sentence in enumerate(sentences):
for word in word_tokenize(sentence.lower()):
if word in freq_dist:
if i in sentence_scores:
sentence_scores[i] += freq_dist[word]
else:
sentence_scores[i] = freq_dist[word]
# Get top sentences
summary_sentences_indices = heapq.nlargest(sentences_count,
sentence_scores,
key=sentence_scores.get)
# Sort the indices to preserve original order
summary_sentences_indices.sort()
# Create the summary
summary = [sentences[i] for i in summary_sentences_indices]
return " ".join(summary)
def extract_information(self, info_type):
"""Extract specific information like dates, emails, or phone numbers."""
if not self.current_document_text:
return "No document loaded. Please load a document first."
results = []
if info_type.lower() == "email" or info_type.lower() == "emails":
# Pattern for emails
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
results = re.findall(email_pattern, self.current_document_text)
elif info_type.lower() == "phone" or info_type.lower() == "phones" or info_type.lower() == "phone numbers":
# Pattern for phone numbers
phone_pattern = r'\b(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b'
results = re.findall(phone_pattern, self.current_document_text)
elif info_type.lower() == "date" or info_type.lower() == "dates":
# Pattern for dates (simple pattern, can be improved)
date_pattern = r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b'
results = re.findall(date_pattern, self.current_document_text)
elif info_type.lower() == "url" or info_type.lower() == "urls" or info_type.lower() == "website" or info_type.lower() == "websites":
# Pattern for URLs
url_pattern = r'https?://[^\s]+'
results = re.findall(url_pattern, self.current_document_text)
else:
# If not a specific pattern, search for occurrences of the term
results = [sentence for sentence in sent_tokenize(self.current_document_text)
if info_type.lower() in sentence.lower()]
if not results:
return f"No {info_type} found in the document."
return results
def answer_question(self, question):
"""Attempt to answer questions about the document using keyword matching."""
if not self.current_document_text:
return "No document loaded. Please load a document first."
# Tokenize the question and remove stop words
question_words = [w.lower() for w in word_tokenize(question)
if w.lower() not in self.stop_words and w.isalnum()]
# Tokenize the document into sentences
sentences = sent_tokenize(self.current_document_text)
# Score sentences based on the question words they contain
sentence_scores = {}
for i, sentence in enumerate(sentences):
words = [w.lower() for w in word_tokenize(sentence)]
score = sum(1 for word in question_words if word in words)
if score > 0:
sentence_scores[i] = score
# If no matches found
if not sentence_scores:
return "I couldn't find information related to your question in the document."
# Get the top 3 most relevant sentences
top_indices = heapq.nlargest(3, sentence_scores, key=sentence_scores.get)
relevant_sentences = [sentences[i] for i in sorted(top_indices)]
return " ".join(relevant_sentences)
# Set up Flask routes
@app.route('/')
def home():
    """Serve the single-page UI from templates/index.html."""
    return render_template('index.html')
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a document upload, load it into the agent, and report status.

    Returns:
        JSON with either an "error" key (bad request) or a "message" key
        (load result from the agent).
    """
    # Check if the post request has the file part
    if 'file' not in request.files:
        return jsonify({"error": "No file part"})
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"})
    # basename() strips any client-supplied directory components so a crafted
    # filename such as "../../etc/passwd" cannot escape the temp directory.
    safe_name = os.path.basename(file.filename)
    os.makedirs("temp", exist_ok=True)
    file_path = os.path.join("temp", safe_name)
    file.save(file_path)
    try:
        # Process the file
        result = agent.load_document(file_path)
    finally:
        # Always remove the temporary file, even if processing raises.
        os.remove(file_path)
    return jsonify({"message": result})
@app.route('/summarize', methods=['POST'])
def summarize():
    """Return a frequency-based summary of the loaded document as JSON."""
    # get_json(silent=True) returns None instead of raising on a missing or
    # non-JSON body; fall back to the default sentence count in that case.
    payload = request.get_json(silent=True) or {}
    sentences = payload.get('sentences', 5)
    result = agent.summarize_document(sentences)
    return jsonify({"summary": result})
@app.route('/extract', methods=['POST'])
def extract():
    """Extract the requested information type from the loaded document."""
    # Tolerate a missing or non-JSON request body instead of erroring out.
    payload = request.get_json(silent=True) or {}
    info_type = payload.get('info_type', '')
    result = agent.extract_information(info_type)
    return jsonify({"extracted": result})
@app.route('/question', methods=['POST'])
def question():
    """Answer a question about the loaded document via keyword matching."""
    # Tolerate a missing or non-JSON request body instead of erroring out.
    payload = request.get_json(silent=True) or {}
    query = payload.get('question', '')
    result = agent.answer_question(query)
    return jsonify({"answer": result})
# Initialize the agent: a single module-level instance shared by all routes,
# so uploaded document state persists across requests.
agent = SimpleDocumentAgent()
# Create a basic HTML template
@app.route('/get_index_template')
def get_index_template():
    """Return the single-page UI as an HTML string.

    Exposed at /get_index_template and also called directly at startup to
    write templates/index.html (see the __main__ block).
    """
    # NOTE: the template is kept inline so the app is fully self-contained;
    # the embedded <script> talks to the /upload, /summarize, /extract and
    # /question endpoints defined above.
    html_content = """
<!DOCTYPE html>
<html>
<head>
<title>Document Processing Agent</title>
<style>
body { font-family: Arial, sans-serif; margin: 0; padding: 20px; line-height: 1.6; }
h1 { color: #333; }
.container { max-width: 800px; margin: 0 auto; }
.section { margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }
button { background-color: #4CAF50; color: white; padding: 10px 15px; border: none; border-radius: 4px; cursor: pointer; }
button:hover { background-color: #45a049; }
input, select { padding: 8px; margin: 10px 0; width: 100%; }
textarea { width: 100%; height: 150px; }
.result { background-color: #f9f9f9; padding: 10px; border-radius: 5px; margin-top: 10px; }
</style>
</head>
<body>
<div class="container">
<h1>Document Processing Agent</h1>
<div class="section">
<h2>Upload Document</h2>
<form id="uploadForm">
<input type="file" id="documentFile" accept=".pdf,.docx">
<button type="submit">Upload</button>
</form>
<div id="uploadResult" class="result"></div>
</div>
<div class="section">
<h2>Summarize Document</h2>
<label for="sentenceCount">Number of sentences:</label>
<input type="number" id="sentenceCount" value="5" min="1" max="20">
<button onclick="summarizeDocument()">Generate Summary</button>
<div id="summaryResult" class="result"></div>
</div>
<div class="section">
<h2>Extract Information</h2>
<select id="infoType">
<option value="email">Emails</option>
<option value="phone">Phone Numbers</option>
<option value="date">Dates</option>
<option value="url">URLs</option>
</select>
<button onclick="extractInfo()">Extract</button>
<div id="extractResult" class="result"></div>
</div>
<div class="section">
<h2>Ask Questions</h2>
<input type="text" id="question" placeholder="Enter your question about the document">
<button onclick="askQuestion()">Ask</button>
<div id="questionResult" class="result"></div>
</div>
</div>
<script>
// Upload document
document.getElementById('uploadForm').addEventListener('submit', function(event) {
event.preventDefault();
const fileInput = document.getElementById('documentFile');
const file = fileInput.files[0];
if (!file) {
alert('Please select a file to upload');
return;
}
const formData = new FormData();
formData.append('file', file);
fetch('/upload', {
method: 'POST',
body: formData
})
.then(response => response.json())
.then(data => {
document.getElementById('uploadResult').textContent = data.message;
})
.catch(error => {
console.error('Error:', error);
document.getElementById('uploadResult').textContent = 'Error uploading file';
});
});
// Summarize
function summarizeDocument() {
const sentences = document.getElementById('sentenceCount').value;
fetch('/summarize', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ sentences: parseInt(sentences) })
})
.then(response => response.json())
.then(data => {
document.getElementById('summaryResult').textContent = data.summary;
})
.catch(error => {
console.error('Error:', error);
document.getElementById('summaryResult').textContent = 'Error generating summary';
});
}
// Extract info
function extractInfo() {
const infoType = document.getElementById('infoType').value;
fetch('/extract', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ info_type: infoType })
})
.then(response => response.json())
.then(data => {
if (Array.isArray(data.extracted)) {
document.getElementById('extractResult').textContent = data.extracted.join('\\n');
} else {
document.getElementById('extractResult').textContent = data.extracted;
}
})
.catch(error => {
console.error('Error:', error);
document.getElementById('extractResult').textContent = 'Error extracting information';
});
}
// Ask question
function askQuestion() {
const question = document.getElementById('question').value;
fetch('/question', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ question: question })
})
.then(response => response.json())
.then(data => {
document.getElementById('questionResult').textContent = data.answer;
})
.catch(error => {
console.error('Error:', error);
document.getElementById('questionResult').textContent = 'Error processing question';
});
}
</script>
</body>
</html>
"""
    return html_content
if __name__ == "__main__":
# Create a templates folder and index.html
os.makedirs("templates", exist_ok=True)
with open("templates/index.html", "w") as f:
f.write(get_index_template())
# Run the app
app.run(debug=True) |