Spaces:
Running
Running
Commit
·
e93c61d
1
Parent(s):
5c627aa
Upd summariser Llama prompting. Upd status handler, max size, num_file
Browse files- app.py +55 -2
- static/index.html +1 -1
- static/script.js +35 -30
- utils/chunker.py +10 -7
- utils/summarizer.py +72 -27
app.py
CHANGED
|
@@ -53,6 +53,7 @@ class MessageResponse(BaseModel):
|
|
| 53 |
class UploadResponse(BaseModel):
|
| 54 |
job_id: str
|
| 55 |
status: str
|
|
|
|
| 56 |
|
| 57 |
class FileSummaryResponse(BaseModel):
|
| 58 |
filename: str
|
|
@@ -81,6 +82,9 @@ app.add_middleware(
|
|
| 81 |
# Serve static files (index.html, scripts.js, styles.css)
|
| 82 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
| 83 |
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
# ────────────────────────────── Global Clients ──────────────────────────────
|
| 86 |
# API rotators (round robin + auto failover on quota errors)
|
|
@@ -374,12 +378,29 @@ async def upload_files(
|
|
| 374 |
"""
|
| 375 |
job_id = str(uuid.uuid4())
|
| 376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
# Read file bytes upfront to avoid reading from closed streams in background task
|
| 378 |
preloaded_files = []
|
| 379 |
for uf in files:
|
| 380 |
raw = await uf.read()
|
|
|
|
|
|
|
| 381 |
preloaded_files.append((uf.filename, raw))
|
| 382 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
# Single background task: process files sequentially with isolation
|
| 384 |
async def _process_all():
|
| 385 |
for idx, (fname, raw) in enumerate(preloaded_files, start=1):
|
|
@@ -411,7 +432,7 @@ async def upload_files(
|
|
| 411 |
p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in caps])).strip()
|
| 412 |
|
| 413 |
# Build cards
|
| 414 |
-
cards = build_cards_from_pages(pages, filename=fname, user_id=user_id, project_id=project_id)
|
| 415 |
logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")
|
| 416 |
|
| 417 |
# Embed & store
|
|
@@ -426,16 +447,48 @@ async def upload_files(
|
|
| 426 |
file_summary = await cheap_summarize(full_text, max_sentences=6)
|
| 427 |
rag.upsert_file_summary(user_id=user_id, project_id=project_id, filename=fname, summary=file_summary)
|
| 428 |
logger.info(f"[{job_id}] Completed {fname}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
except Exception as e:
|
| 430 |
logger.error(f"[{job_id}] Failed processing {fname}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
finally:
|
| 432 |
# Yield control between files to keep loop responsive
|
| 433 |
await asyncio.sleep(0)
|
| 434 |
|
| 435 |
logger.info(f"[{job_id}] Ingestion complete for {len(preloaded_files)} files")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
|
| 437 |
background_tasks.add_task(_process_all)
|
| 438 |
-
return UploadResponse(job_id=job_id, status="processing")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
|
| 440 |
|
| 441 |
@app.get("/cards")
|
|
|
|
| 53 |
class UploadResponse(BaseModel):
|
| 54 |
job_id: str
|
| 55 |
status: str
|
| 56 |
+
total_files: Optional[int] = None
|
| 57 |
|
| 58 |
class FileSummaryResponse(BaseModel):
|
| 59 |
filename: str
|
|
|
|
| 82 |
# Serve static files (index.html, script.js, styles.css)
|
| 83 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
| 84 |
|
| 85 |
+
# In-memory job tracker (for progress queries)
|
| 86 |
+
app.state.jobs = {}
|
| 87 |
+
|
| 88 |
|
| 89 |
# ────────────────────────────── Global Clients ──────────────────────────────
|
| 90 |
# API rotators (round robin + auto failover on quota errors)
|
|
|
|
| 378 |
"""
|
| 379 |
job_id = str(uuid.uuid4())
|
| 380 |
|
| 381 |
+
# Basic upload policy limits
|
| 382 |
+
max_files = int(os.getenv("MAX_FILES_PER_UPLOAD", "15"))
|
| 383 |
+
max_mb = int(os.getenv("MAX_FILE_MB", "50"))
|
| 384 |
+
if len(files) > max_files:
|
| 385 |
+
raise HTTPException(400, detail=f"Too many files. Max {max_files} allowed per upload.")
|
| 386 |
+
|
| 387 |
# Read file bytes upfront to avoid reading from closed streams in background task
|
| 388 |
preloaded_files = []
|
| 389 |
for uf in files:
|
| 390 |
raw = await uf.read()
|
| 391 |
+
if len(raw) > max_mb * 1024 * 1024:
|
| 392 |
+
raise HTTPException(400, detail=f"{uf.filename} exceeds {max_mb} MB limit")
|
| 393 |
preloaded_files.append((uf.filename, raw))
|
| 394 |
|
| 395 |
+
# Initialize job status
|
| 396 |
+
app.state.jobs[job_id] = {
|
| 397 |
+
"created_at": time.time(),
|
| 398 |
+
"total": len(preloaded_files),
|
| 399 |
+
"completed": 0,
|
| 400 |
+
"status": "processing",
|
| 401 |
+
"last_error": None,
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
# Single background task: process files sequentially with isolation
|
| 405 |
async def _process_all():
|
| 406 |
for idx, (fname, raw) in enumerate(preloaded_files, start=1):
|
|
|
|
| 432 |
p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in caps])).strip()
|
| 433 |
|
| 434 |
# Build cards
|
| 435 |
+
cards = await build_cards_from_pages(pages, filename=fname, user_id=user_id, project_id=project_id)
|
| 436 |
logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")
|
| 437 |
|
| 438 |
# Embed & store
|
|
|
|
| 447 |
file_summary = await cheap_summarize(full_text, max_sentences=6)
|
| 448 |
rag.upsert_file_summary(user_id=user_id, project_id=project_id, filename=fname, summary=file_summary)
|
| 449 |
logger.info(f"[{job_id}] Completed {fname}")
|
| 450 |
+
# Update job progress
|
| 451 |
+
job = app.state.jobs.get(job_id)
|
| 452 |
+
if job:
|
| 453 |
+
job["completed"] = idx
|
| 454 |
+
job["status"] = "processing" if idx < job.get("total", 0) else "completed"
|
| 455 |
except Exception as e:
|
| 456 |
logger.error(f"[{job_id}] Failed processing {fname}: {e}")
|
| 457 |
+
job = app.state.jobs.get(job_id)
|
| 458 |
+
if job:
|
| 459 |
+
job["last_error"] = str(e)
|
| 460 |
+
job["completed"] = idx # count as completed attempt
|
| 461 |
finally:
|
| 462 |
# Yield control between files to keep loop responsive
|
| 463 |
await asyncio.sleep(0)
|
| 464 |
|
| 465 |
logger.info(f"[{job_id}] Ingestion complete for {len(preloaded_files)} files")
|
| 466 |
+
# Finalize job status
|
| 467 |
+
job = app.state.jobs.get(job_id)
|
| 468 |
+
if job:
|
| 469 |
+
job["status"] = "completed"
|
| 470 |
|
| 471 |
background_tasks.add_task(_process_all)
|
| 472 |
+
return UploadResponse(job_id=job_id, status="processing", total_files=len(preloaded_files))
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
@app.get("/upload/status")
async def upload_status(job_id: str):
    """Report the progress of an upload job held in ``app.state.jobs``.

    Args:
        job_id: Identifier handed out by the upload endpoint.

    Returns:
        A dict with the job id, its status, completed/total counters, an
        integer percentage, the last recorded error and the creation time.

    Raises:
        HTTPException: 404 when no (non-empty) record exists for ``job_id``.
    """
    record = app.state.jobs.get(job_id)
    if not record:
        raise HTTPException(404, detail="Job not found")

    # A missing or zero total reports 0% instead of dividing by zero.
    total = record.get("total")
    done = record.get("completed", 0)
    percent = int(round((done / total) * 100)) if total else 0

    return {
        "job_id": job_id,
        "status": record.get("status"),
        "completed": record.get("completed"),
        "total": record.get("total"),
        "percent": percent,
        "last_error": record.get("last_error"),
        "created_at": record.get("created_at"),
    }
|
| 492 |
|
| 493 |
|
| 494 |
@app.get("/cards")
|
static/index.html
CHANGED
|
@@ -159,7 +159,7 @@
|
|
| 159 |
<span class="btn-text">Upload Documents</span>
|
| 160 |
<span class="btn-loading" style="display:none;">
|
| 161 |
<div class="spinner"></div>
|
| 162 |
-
Processing
|
| 163 |
</span>
|
| 164 |
</button>
|
| 165 |
</form>
|
|
|
|
| 159 |
<span class="btn-text">Upload Documents</span>
|
| 160 |
<span class="btn-loading" style="display:none;">
|
| 161 |
<div class="spinner"></div>
|
| 162 |
+
Processing...
|
| 163 |
</span>
|
| 164 |
</button>
|
| 165 |
</form>
|
static/script.js
CHANGED
|
@@ -205,8 +205,8 @@
|
|
| 205 |
logProgress(`Job ID: ${data.job_id}`);
|
| 206 |
logProgress('Files uploaded successfully');
|
| 207 |
|
| 208 |
-
//
|
| 209 |
-
|
| 210 |
} else {
|
| 211 |
throw new Error(data.detail || 'Upload failed');
|
| 212 |
}
|
|
@@ -248,35 +248,40 @@
|
|
| 248 |
progressLog.scrollTop = progressLog.scrollHeight;
|
| 249 |
}
|
| 250 |
|
| 251 |
-
function
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
}
|
| 267 |
-
|
| 268 |
-
const currentTarget = targets[completed];
|
| 269 |
-
updateProgressFill(currentTarget);
|
| 270 |
-
updateProgressStatus(`Processing documents... ${currentTarget}%`);
|
| 271 |
-
logProgress(`Finished processing file ${completed + 1}/${totalFiles}`);
|
| 272 |
-
completed += 1;
|
| 273 |
-
|
| 274 |
-
// Wait a short time before next step (simulated, since backend is background)
|
| 275 |
-
setTimeout(advance, 1200);
|
| 276 |
-
}
|
| 277 |
-
|
| 278 |
-
// kick off first step after a short delay to show feedback
|
| 279 |
-
setTimeout(advance, 800);
|
| 280 |
}
|
| 281 |
|
| 282 |
function enableChat() {
|
|
|
|
| 205 |
logProgress(`Job ID: ${data.job_id}`);
|
| 206 |
logProgress('Files uploaded successfully');
|
| 207 |
|
| 208 |
+
// Poll backend for real progress
|
| 209 |
+
startUploadStatusPolling(data.job_id, data.total_files || selectedFiles.length);
|
| 210 |
} else {
|
| 211 |
throw new Error(data.detail || 'Upload failed');
|
| 212 |
}
|
|
|
|
| 248 |
progressLog.scrollTop = progressLog.scrollHeight;
|
| 249 |
}
|
| 250 |
|
| 251 |
+
/**
 * Poll `/upload/status` for an upload job and mirror its progress in the UI.
 *
 * Polling stops when the backend reports `completed` (or 100%), or after
 * several consecutive failed polls. A single transient failure no longer
 * aborts polling, requests never overlap, and a given backend error is
 * logged only once.
 *
 * @param {string} jobId - Job identifier returned by the upload endpoint.
 * @param {number} totalFiles - Fallback file count used when the status
 *   payload carries no `total`.
 */
function startUploadStatusPolling(jobId, totalFiles) {
    const POLL_INTERVAL_MS = 1200;
    const MAX_CONSECUTIVE_FAILURES = 3;

    let stopped = false;
    let inFlight = false;           // prevents overlapping requests on slow responses
    let consecutiveFailures = 0;
    let lastLoggedError = null;     // avoids repeating the same warning on every tick

    const stop = () => {
        stopped = true;
        clearInterval(interval);
    };

    const interval = setInterval(async () => {
        if (stopped || inFlight) return;
        inFlight = true;
        try {
            const res = await fetch(`/upload/status?job_id=${encodeURIComponent(jobId)}`);
            if (!res.ok) {
                throw new Error('Status not available');
            }
            const status = await res.json();
            consecutiveFailures = 0;

            const percent = Math.max(0, Math.min(100, parseInt(status.percent || 0, 10)));
            const completed = status.completed || 0;
            const total = status.total || totalFiles || 1;
            updateProgressFill(percent);
            updateProgressStatus(`Processing documents... ${percent}% (${completed}/${total})`);

            if (status.last_error && status.last_error !== lastLoggedError) {
                lastLoggedError = status.last_error;
                logProgress(`Warning: ${status.last_error}`);
            }

            if (status.status === 'completed' || percent >= 100) {
                stop();
                updateProgressFill(100);
                updateProgressStatus('Processing complete!');
                // The backend marks a job completed even when a file failed,
                // so do not claim full success if an error was reported.
                logProgress(status.last_error
                    ? 'Processing finished, but at least one document reported an error'
                    : 'All documents processed successfully');
                logProgress('You can now start chatting with your documents');
                setTimeout(() => hideUploadProgress(), 1500);
                enableChat();
            }
        } catch (e) {
            consecutiveFailures += 1;
            // Give up only after repeated failures; one dropped request is not fatal.
            if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                stop();
                logProgress(`Error reading job status: ${e.message}`);
            }
        } finally {
            inFlight = false;
        }
    }, POLL_INTERVAL_MS);
}
|
| 286 |
|
| 287 |
function enableChat() {
|
utils/chunker.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# ────────────────────────────── utils/chunker.py ──────────────────────────────
|
| 2 |
import re
|
| 3 |
from typing import List, Dict, Any
|
| 4 |
-
from .summarizer import cheap_summarize
|
| 5 |
from .common import split_sentences, slugify
|
| 6 |
from .logger import get_logger
|
| 7 |
|
|
@@ -14,6 +14,7 @@ MAX_WORDS = 500
|
|
| 14 |
MIN_WORDS = 150
|
| 15 |
logger = get_logger("CHUNKER", __name__)
|
| 16 |
|
|
|
|
| 17 |
def _by_headings(text: str):
|
| 18 |
# split on markdown-like or outline headings
|
| 19 |
pattern = r"(?m)^(#{1,6}\s.*|[0-9]+\.\s+[^\n]+|[A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$"
|
|
@@ -32,7 +33,7 @@ def _by_headings(text: str):
|
|
| 32 |
return parts
|
| 33 |
|
| 34 |
|
| 35 |
-
def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
|
| 36 |
# Concatenate pages but keep page spans for metadata
|
| 37 |
full = ""
|
| 38 |
page_markers = []
|
|
@@ -64,11 +65,13 @@ def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id:
|
|
| 64 |
|
| 65 |
# Build card dicts
|
| 66 |
out = []
|
| 67 |
-
for i,
|
| 68 |
-
|
|
|
|
|
|
|
| 69 |
if not topic:
|
| 70 |
-
topic =
|
| 71 |
-
summary = cheap_summarize(
|
| 72 |
# Estimate page span
|
| 73 |
first_page = pages[0]['page_num'] if pages else 1
|
| 74 |
last_page = pages[-1]['page_num'] if pages else 1
|
|
@@ -78,7 +81,7 @@ def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id:
|
|
| 78 |
"filename": filename,
|
| 79 |
"topic_name": topic[:120],
|
| 80 |
"summary": summary,
|
| 81 |
-
"content":
|
| 82 |
"page_span": [first_page, last_page],
|
| 83 |
"card_id": f"{slugify(filename)}-c{i:04d}"
|
| 84 |
})
|
|
|
|
| 1 |
# ────────────────────────────── utils/chunker.py ──────────────────────────────
|
| 2 |
import re
|
| 3 |
from typing import List, Dict, Any
|
| 4 |
+
from .summarizer import cheap_summarize, clean_chunk_text
|
| 5 |
from .common import split_sentences, slugify
|
| 6 |
from .logger import get_logger
|
| 7 |
|
|
|
|
| 14 |
MIN_WORDS = 150
|
| 15 |
logger = get_logger("CHUNKER", __name__)
|
| 16 |
|
| 17 |
+
|
| 18 |
def _by_headings(text: str):
|
| 19 |
# split on markdown-like or outline headings
|
| 20 |
pattern = r"(?m)^(#{1,6}\s.*|[0-9]+\.\s+[^\n]+|[A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$"
|
|
|
|
| 33 |
return parts
|
| 34 |
|
| 35 |
|
| 36 |
+
async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
|
| 37 |
# Concatenate pages but keep page spans for metadata
|
| 38 |
full = ""
|
| 39 |
page_markers = []
|
|
|
|
| 65 |
|
| 66 |
# Build card dicts
|
| 67 |
out = []
|
| 68 |
+
for i, raw_content in enumerate(cards, 1):
|
| 69 |
+
# Clean with LLM to remove headers/footers and IDs
|
| 70 |
+
cleaned = await clean_chunk_text(raw_content)
|
| 71 |
+
topic = await cheap_summarize(cleaned, max_sentences=1)
|
| 72 |
if not topic:
|
| 73 |
+
topic = cleaned[:80] + "..."
|
| 74 |
+
summary = await cheap_summarize(cleaned, max_sentences=3)
|
| 75 |
# Estimate page span
|
| 76 |
first_page = pages[0]['page_num'] if pages else 1
|
| 77 |
last_page = pages[-1]['page_num'] if pages else 1
|
|
|
|
| 81 |
"filename": filename,
|
| 82 |
"topic_name": topic[:120],
|
| 83 |
"summary": summary,
|
| 84 |
+
"content": cleaned,
|
| 85 |
"page_span": [first_page, last_page],
|
| 86 |
"card_id": f"{slugify(filename)}-c{i:04d}"
|
| 87 |
})
|
utils/summarizer.py
CHANGED
|
@@ -1,44 +1,38 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
import os
|
| 3 |
import asyncio
|
|
|
|
| 4 |
from .logger import get_logger
|
| 5 |
from utils.rotator import robust_post_json
|
| 6 |
|
| 7 |
logger = get_logger("SUM", __name__)
|
| 8 |
|
| 9 |
|
| 10 |
-
async def
|
| 11 |
-
"""Summarize text using NVIDIA Llama via /v1/chat/completions. Returns plain text."""
|
| 12 |
-
text = (text or "").strip()
|
| 13 |
-
if not text:
|
| 14 |
-
return ""
|
| 15 |
model = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")
|
| 16 |
key = os.getenv("NVIDIA_API_1", "") or os.getenv("NVIDIA_API_KEY", "")
|
| 17 |
if not key:
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
system_prompt = (
|
| 22 |
-
"You are a precise summarizer. Produce a concise summary of the user's text. "
|
| 23 |
-
f"Return about {max_sentences} sentences, no preface, no markdown."
|
| 24 |
-
)
|
| 25 |
-
user_prompt = f"Summarize this:\n\n{text}"
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
try:
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
"temperature": 0.2,
|
| 33 |
-
"messages": [
|
| 34 |
-
{"role": "system", "content": system_prompt},
|
| 35 |
-
{"role": "user", "content": user_prompt},
|
| 36 |
-
]
|
| 37 |
-
}
|
| 38 |
-
# Using rotator helper for retries; if not available, simple fetch could be used
|
| 39 |
-
data = await robust_post_json(url, headers, payload)
|
| 40 |
-
content = data["choices"][0]["message"]["content"].strip()
|
| 41 |
-
return content
|
| 42 |
except Exception as e:
|
| 43 |
logger.warning(f"LLAMA summarization failed: {e}; using fallback")
|
| 44 |
return naive_fallback(text, max_sentences)
|
|
@@ -49,6 +43,57 @@ def naive_fallback(text: str, max_sentences: int = 3) -> str:
|
|
| 49 |
return '. '.join(parts[:max_sentences])
|
| 50 |
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
# Backward-compatible name used by app.py
|
| 53 |
async def cheap_summarize(text: str, max_sentences: int = 3) -> str:
|
| 54 |
return await llama_summarize(text, max_sentences)
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import asyncio
|
| 3 |
+
from typing import List
|
| 4 |
from .logger import get_logger
|
| 5 |
from utils.rotator import robust_post_json
|
| 6 |
|
| 7 |
logger = get_logger("SUM", __name__)
|
| 8 |
|
| 9 |
|
| 10 |
+
async def llama_chat(messages, temperature: float = 0.2) -> str:
    """Send chat messages to the NVIDIA chat-completions endpoint and return the reply.

    The model name comes from ``NVIDIA_SMALL`` (default
    ``meta/llama-3.1-8b-instruct``); the API key comes from ``NVIDIA_API_1``,
    falling back to ``NVIDIA_API_KEY``.

    Args:
        messages: Chat messages forwarded unchanged in the request payload.
        temperature: Sampling temperature forwarded in the payload.

    Returns:
        The stripped ``content`` of the first choice in the response.

    Raises:
        RuntimeError: If neither API key environment variable is set.
    """
    api_key = os.environ.get("NVIDIA_API_1", "") or os.environ.get("NVIDIA_API_KEY", "")
    if not api_key:
        raise RuntimeError("NVIDIA API key not set")

    response = await robust_post_json(
        "https://integrate.api.nvidia.com/v1/chat/completions",
        {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
        {
            "model": os.environ.get("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct"),
            "temperature": temperature,
            "messages": messages,
        },
    )
    first_choice = response["choices"][0]
    return first_choice["message"]["content"].strip()
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
async def llama_summarize(text: str, max_sentences: int = 3) -> str:
    """Summarise ``text`` through ``llama_chat``, using ``naive_fallback`` on failure.

    Args:
        text: Text to summarise; ``None`` or whitespace-only input yields ``""``.
        max_sentences: Approximate sentence count requested in the prompt and
            passed to the fallback.

    Returns:
        The model's reply, or the ``naive_fallback`` result if the chat call raises.
    """
    body = (text or "").strip()
    if body == "":
        return ""

    messages = [
        {
            "role": "system",
            "content": (
                "You are a precise summarizer. Produce a clear, faithful summary of the user's text. "
                f"Return ~{max_sentences} sentences, no preface, no markdown."
            ),
        },
        {"role": "user", "content": f"Summarize this text:\n\n{body}"},
    ]

    try:
        reply = await llama_chat(messages)
    except Exception as e:
        # Best-effort: any failure in the remote call degrades to the local fallback.
        logger.warning(f"LLAMA summarization failed: {e}; using fallback")
        return naive_fallback(body, max_sentences)
    return reply
|
|
|
|
| 43 |
return '. '.join(parts[:max_sentences])
|
| 44 |
|
| 45 |
|
| 46 |
+
def _split_oversized(paragraph: str, chunk_size: int) -> List[str]:
    """Break a paragraph longer than ``chunk_size`` into pieces of at most that length."""
    if chunk_size <= 0 or len(paragraph) <= chunk_size:
        return [paragraph]
    pieces: List[str] = []
    rest = paragraph
    while len(rest) > chunk_size:
        # Prefer cutting at the last space inside the window; otherwise hard-cut.
        cut = rest.rfind(" ", 0, chunk_size + 1)
        if cut <= 0:
            cut = chunk_size
        piece = rest[:cut].rstrip()
        if piece:
            pieces.append(piece)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def _pack_chunks(text: str, chunk_size: int) -> List[str]:
    """Greedily pack non-blank paragraphs of ``text`` into chunks of about ``chunk_size`` chars."""
    sep = "\n\n"
    chunks: List[str] = []
    buf: List[str] = []
    size = 0
    for paragraph in text.split(sep):
        if not paragraph.strip():
            continue  # blank paragraphs carry nothing worth summarising
        for piece in _split_oversized(paragraph, chunk_size):
            # Count the separator that join() inserts between buffered pieces.
            added = len(piece) + (len(sep) if buf else 0)
            if buf and size + added > chunk_size:
                chunks.append(sep.join(buf))
                buf, size = [], 0
                added = len(piece)
            buf.append(piece)
            size += added
    if buf:
        chunks.append(sep.join(buf))
    return chunks


async def summarize_text(text: str, max_sentences: int = 6, chunk_size: int = 2500) -> str:
    """Hierarchical summarization for long texts using NVIDIA Llama.

    Texts up to ``chunk_size`` characters are summarised in one call. Longer
    texts are split on blank-line paragraph boundaries into chunks, each chunk
    is summarised to about three sentences, and the partial summaries are
    summarised once more.

    Paragraphs longer than ``chunk_size`` are split further so no chunk grows
    without bound. A text that still packs into a single chunk is summarised
    directly rather than twice.

    Args:
        text: Text to summarise; falsy input yields ``""``.
        max_sentences: Approximate sentence count of the final summary.
        chunk_size: Target maximum chunk length in characters.

    Returns:
        The final summary as returned by ``llama_summarize``.
    """
    if not text:
        return ""
    if len(text) <= chunk_size:
        return await llama_summarize(text, max_sentences=max_sentences)

    chunks = _pack_chunks(text, chunk_size)
    if len(chunks) <= 1:
        # Nothing to merge: a second summarisation pass would only lose detail.
        return await llama_summarize(text, max_sentences=max_sentences)

    partials: List[str] = []
    for chunk in chunks:
        partials.append(await llama_summarize(chunk, max_sentences=3))
        await asyncio.sleep(0)  # yield to the event loop between remote calls
    combined = "\n".join(part for part in partials if part)
    return await llama_summarize(combined, max_sentences=max_sentences)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
async def clean_chunk_text(text: str) -> str:
    """Use NVIDIA LLM to remove headers/footers and personally identifying/institution boilerplate.

    Keep the core academic content intact. Do not remove page numbers or section titles.

    The stripped original text is returned whenever cleaning cannot be trusted:
    when the chat call raises, or when the model replies with empty text.
    Without the second guard an empty reply would become the card content.

    Args:
        text: Raw chunk text; ``None`` or whitespace-only input yields ``""``.

    Returns:
        The cleaned text, or the stripped original text as a fallback.
    """
    content = (text or "").strip()
    if not content:
        return content
    system = (
        "You are a content cleaner. Remove boilerplate headers/footers like institution names, course codes, student IDs, "
        "emails, author IDs, document footers/headers repeated across pages. Keep headings and the main body content. "
        "Preserve meaningful section titles. Keep pagination references in the natural text if present. Return only cleaned text."
    )
    user = f"Clean this content by removing headers/footers and IDs, keep core content:\n\n{content}"
    try:
        cleaned = await llama_chat([
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ], temperature=0.0)
    except Exception as e:
        logger.warning(f"LLAMA cleaning failed: {e}; returning original text")
        return content
    if not cleaned or not cleaned.strip():
        # An empty reply must not erase a non-empty chunk.
        logger.warning("LLAMA cleaning returned empty text; returning original text")
        return content
    return cleaned
|
| 95 |
+
|
| 96 |
+
|
| 97 |
# Backward-compatible name used by app.py
|
| 98 |
async def cheap_summarize(text: str, max_sentences: int = 3) -> str:
    """Delegate to ``llama_summarize``; this name is kept for existing callers."""
    summary = await llama_summarize(text, max_sentences)
    return summary
|