LiamKhoaLe committed on
Commit
e93c61d
·
1 Parent(s): 5c627aa

Update summariser Llama prompting. Update upload status handler, max file size, and file-count limits

Browse files
Files changed (5)
  1. app.py +55 -2
  2. static/index.html +1 -1
  3. static/script.js +35 -30
  4. utils/chunker.py +10 -7
  5. utils/summarizer.py +72 -27
app.py CHANGED
@@ -53,6 +53,7 @@ class MessageResponse(BaseModel):
53
  class UploadResponse(BaseModel):
54
  job_id: str
55
  status: str
 
56
 
57
  class FileSummaryResponse(BaseModel):
58
  filename: str
@@ -81,6 +82,9 @@ app.add_middleware(
81
  # Serve static files (index.html, scripts.js, styles.css)
82
  app.mount("/static", StaticFiles(directory="static"), name="static")
83
 
 
 
 
84
 
85
  # ────────────────────────────── Global Clients ──────────────────────────────
86
  # API rotators (round robin + auto failover on quota errors)
@@ -374,12 +378,29 @@ async def upload_files(
374
  """
375
  job_id = str(uuid.uuid4())
376
 
 
 
 
 
 
 
377
  # Read file bytes upfront to avoid reading from closed streams in background task
378
  preloaded_files = []
379
  for uf in files:
380
  raw = await uf.read()
 
 
381
  preloaded_files.append((uf.filename, raw))
382
 
 
 
 
 
 
 
 
 
 
383
  # Single background task: process files sequentially with isolation
384
  async def _process_all():
385
  for idx, (fname, raw) in enumerate(preloaded_files, start=1):
@@ -411,7 +432,7 @@ async def upload_files(
411
  p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in caps])).strip()
412
 
413
  # Build cards
414
- cards = build_cards_from_pages(pages, filename=fname, user_id=user_id, project_id=project_id)
415
  logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")
416
 
417
  # Embed & store
@@ -426,16 +447,48 @@ async def upload_files(
426
  file_summary = await cheap_summarize(full_text, max_sentences=6)
427
  rag.upsert_file_summary(user_id=user_id, project_id=project_id, filename=fname, summary=file_summary)
428
  logger.info(f"[{job_id}] Completed {fname}")
 
 
 
 
 
429
  except Exception as e:
430
  logger.error(f"[{job_id}] Failed processing {fname}: {e}")
 
 
 
 
431
  finally:
432
  # Yield control between files to keep loop responsive
433
  await asyncio.sleep(0)
434
 
435
  logger.info(f"[{job_id}] Ingestion complete for {len(preloaded_files)} files")
 
 
 
 
436
 
437
  background_tasks.add_task(_process_all)
438
- return UploadResponse(job_id=job_id, status="processing")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
 
440
 
441
  @app.get("/cards")
 
53
  class UploadResponse(BaseModel):
54
  job_id: str
55
  status: str
56
+ total_files: Optional[int] = None
57
 
58
  class FileSummaryResponse(BaseModel):
59
  filename: str
 
82
  # Serve static files (index.html, scripts.js, styles.css)
83
  app.mount("/static", StaticFiles(directory="static"), name="static")
84
 
85
+ # In-memory job tracker (for progress queries)
86
+ app.state.jobs = {}
87
+
88
 
89
  # ────────────────────────────── Global Clients ──────────────────────────────
90
  # API rotators (round robin + auto failover on quota errors)
 
378
  """
379
  job_id = str(uuid.uuid4())
380
 
381
+ # Basic upload policy limits
382
+ max_files = int(os.getenv("MAX_FILES_PER_UPLOAD", "15"))
383
+ max_mb = int(os.getenv("MAX_FILE_MB", "50"))
384
+ if len(files) > max_files:
385
+ raise HTTPException(400, detail=f"Too many files. Max {max_files} allowed per upload.")
386
+
387
  # Read file bytes upfront to avoid reading from closed streams in background task
388
  preloaded_files = []
389
  for uf in files:
390
  raw = await uf.read()
391
+ if len(raw) > max_mb * 1024 * 1024:
392
+ raise HTTPException(400, detail=f"{uf.filename} exceeds {max_mb} MB limit")
393
  preloaded_files.append((uf.filename, raw))
394
 
395
+ # Initialize job status
396
+ app.state.jobs[job_id] = {
397
+ "created_at": time.time(),
398
+ "total": len(preloaded_files),
399
+ "completed": 0,
400
+ "status": "processing",
401
+ "last_error": None,
402
+ }
403
+
404
  # Single background task: process files sequentially with isolation
405
  async def _process_all():
406
  for idx, (fname, raw) in enumerate(preloaded_files, start=1):
 
432
  p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in caps])).strip()
433
 
434
  # Build cards
435
+ cards = await build_cards_from_pages(pages, filename=fname, user_id=user_id, project_id=project_id)
436
  logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")
437
 
438
  # Embed & store
 
447
  file_summary = await cheap_summarize(full_text, max_sentences=6)
448
  rag.upsert_file_summary(user_id=user_id, project_id=project_id, filename=fname, summary=file_summary)
449
  logger.info(f"[{job_id}] Completed {fname}")
450
+ # Update job progress
451
+ job = app.state.jobs.get(job_id)
452
+ if job:
453
+ job["completed"] = idx
454
+ job["status"] = "processing" if idx < job.get("total", 0) else "completed"
455
  except Exception as e:
456
  logger.error(f"[{job_id}] Failed processing {fname}: {e}")
457
+ job = app.state.jobs.get(job_id)
458
+ if job:
459
+ job["last_error"] = str(e)
460
+ job["completed"] = idx # count as completed attempt
461
  finally:
462
  # Yield control between files to keep loop responsive
463
  await asyncio.sleep(0)
464
 
465
  logger.info(f"[{job_id}] Ingestion complete for {len(preloaded_files)} files")
466
+ # Finalize job status
467
+ job = app.state.jobs.get(job_id)
468
+ if job:
469
+ job["status"] = "completed"
470
 
471
  background_tasks.add_task(_process_all)
472
+ return UploadResponse(job_id=job_id, status="processing", total_files=len(preloaded_files))
473
+
474
+
475
+ @app.get("/upload/status")
476
+ async def upload_status(job_id: str):
477
+ job = app.state.jobs.get(job_id)
478
+ if not job:
479
+ raise HTTPException(404, detail="Job not found")
480
+ percent = 0
481
+ if job.get("total"):
482
+ percent = int(round((job.get("completed", 0) / job.get("total", 1)) * 100))
483
+ return {
484
+ "job_id": job_id,
485
+ "status": job.get("status"),
486
+ "completed": job.get("completed"),
487
+ "total": job.get("total"),
488
+ "percent": percent,
489
+ "last_error": job.get("last_error"),
490
+ "created_at": job.get("created_at"),
491
+ }
492
 
493
 
494
  @app.get("/cards")
static/index.html CHANGED
@@ -159,7 +159,7 @@
159
  <span class="btn-text">Upload Documents</span>
160
  <span class="btn-loading" style="display:none;">
161
  <div class="spinner"></div>
162
- Processing (DO NOT REFRESH)...
163
  </span>
164
  </button>
165
  </form>
 
159
  <span class="btn-text">Upload Documents</span>
160
  <span class="btn-loading" style="display:none;">
161
  <div class="spinner"></div>
162
+ Processing...
163
  </span>
164
  </button>
165
  </form>
static/script.js CHANGED
@@ -205,8 +205,8 @@
205
  logProgress(`Job ID: ${data.job_id}`);
206
  logProgress('Files uploaded successfully');
207
 
208
- // Deterministic per-file progression
209
- simulateProcessing(selectedFiles.length);
210
  } else {
211
  throw new Error(data.detail || 'Upload failed');
212
  }
@@ -248,35 +248,40 @@
248
  progressLog.scrollTop = progressLog.scrollHeight;
249
  }
250
 
251
- function simulateProcessing(totalFiles) {
252
- // Split 100% evenly across files. Round to nearest integer.
253
- let completed = 0;
254
- const step = Math.round(100 / Math.max(totalFiles, 1));
255
- const targets = Array.from({ length: totalFiles }, (_, i) => Math.min(100, Math.round(((i + 1) / totalFiles) * 100)));
256
-
257
- function advance() {
258
- if (completed >= totalFiles) {
259
- updateProgressFill(100);
260
- updateProgressStatus('Processing complete!');
261
- logProgress('All documents processed successfully');
262
- logProgress('You can now start chatting with your documents');
263
- setTimeout(() => hideUploadProgress(), 1500);
264
- enableChat();
265
- return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  }
267
-
268
- const currentTarget = targets[completed];
269
- updateProgressFill(currentTarget);
270
- updateProgressStatus(`Processing documents... ${currentTarget}%`);
271
- logProgress(`Finished processing file ${completed + 1}/${totalFiles}`);
272
- completed += 1;
273
-
274
- // Wait a short time before next step (simulated, since backend is background)
275
- setTimeout(advance, 1200);
276
- }
277
-
278
- // kick off first step after a short delay to show feedback
279
- setTimeout(advance, 800);
280
  }
281
 
282
  function enableChat() {
 
205
  logProgress(`Job ID: ${data.job_id}`);
206
  logProgress('Files uploaded successfully');
207
 
208
+ // Poll backend for real progress
209
+ startUploadStatusPolling(data.job_id, data.total_files || selectedFiles.length);
210
  } else {
211
  throw new Error(data.detail || 'Upload failed');
212
  }
 
248
  progressLog.scrollTop = progressLog.scrollHeight;
249
  }
250
 
251
+ function startUploadStatusPolling(jobId, totalFiles) {
252
+ let stopped = false;
253
+ const interval = setInterval(async () => {
254
+ if (stopped) return;
255
+ try {
256
+ const res = await fetch(`/upload/status?job_id=${encodeURIComponent(jobId)}`);
257
+ if (!res.ok) {
258
+ throw new Error('Status not available');
259
+ }
260
+ const status = await res.json();
261
+ const percent = Math.max(0, Math.min(100, parseInt(status.percent || 0, 10)));
262
+ const completed = status.completed || 0;
263
+ const total = status.total || totalFiles || 1;
264
+ updateProgressFill(percent);
265
+ updateProgressStatus(`Processing documents... ${percent}% (${completed}/${total})`);
266
+ if (status.last_error) {
267
+ logProgress(`Warning: ${status.last_error}`);
268
+ }
269
+ if (status.status === 'completed' || percent >= 100) {
270
+ clearInterval(interval);
271
+ stopped = true;
272
+ updateProgressFill(100);
273
+ updateProgressStatus('Processing complete!');
274
+ logProgress('All documents processed successfully');
275
+ logProgress('You can now start chatting with your documents');
276
+ setTimeout(() => hideUploadProgress(), 1500);
277
+ enableChat();
278
+ }
279
+ } catch (e) {
280
+ clearInterval(interval);
281
+ stopped = true;
282
+ logProgress(`Error reading job status: ${e.message}`);
283
  }
284
+ }, 1200);
 
 
 
 
 
 
 
 
 
 
 
 
285
  }
286
 
287
  function enableChat() {
utils/chunker.py CHANGED
@@ -1,7 +1,7 @@
1
  # ────────────────────────────── utils/chunker.py ──────────────────────────────
2
  import re
3
  from typing import List, Dict, Any
4
- from .summarizer import cheap_summarize
5
  from .common import split_sentences, slugify
6
  from .logger import get_logger
7
 
@@ -14,6 +14,7 @@ MAX_WORDS = 500
14
  MIN_WORDS = 150
15
  logger = get_logger("CHUNKER", __name__)
16
 
 
17
  def _by_headings(text: str):
18
  # split on markdown-like or outline headings
19
  pattern = r"(?m)^(#{1,6}\s.*|[0-9]+\.\s+[^\n]+|[A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$"
@@ -32,7 +33,7 @@ def _by_headings(text: str):
32
  return parts
33
 
34
 
35
- def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
36
  # Concatenate pages but keep page spans for metadata
37
  full = ""
38
  page_markers = []
@@ -64,11 +65,13 @@ def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id:
64
 
65
  # Build card dicts
66
  out = []
67
- for i, content in enumerate(cards, 1):
68
- topic = cheap_summarize(content, max_sentences=1)
 
 
69
  if not topic:
70
- topic = content[:80] + "..."
71
- summary = cheap_summarize(content, max_sentences=3)
72
  # Estimate page span
73
  first_page = pages[0]['page_num'] if pages else 1
74
  last_page = pages[-1]['page_num'] if pages else 1
@@ -78,7 +81,7 @@ def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id:
78
  "filename": filename,
79
  "topic_name": topic[:120],
80
  "summary": summary,
81
- "content": content,
82
  "page_span": [first_page, last_page],
83
  "card_id": f"{slugify(filename)}-c{i:04d}"
84
  })
 
1
  # ────────────────────────────── utils/chunker.py ──────────────────────────────
2
  import re
3
  from typing import List, Dict, Any
4
+ from .summarizer import cheap_summarize, clean_chunk_text
5
  from .common import split_sentences, slugify
6
  from .logger import get_logger
7
 
 
14
  MIN_WORDS = 150
15
  logger = get_logger("CHUNKER", __name__)
16
 
17
+
18
  def _by_headings(text: str):
19
  # split on markdown-like or outline headings
20
  pattern = r"(?m)^(#{1,6}\s.*|[0-9]+\.\s+[^\n]+|[A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$"
 
33
  return parts
34
 
35
 
36
+ async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
37
  # Concatenate pages but keep page spans for metadata
38
  full = ""
39
  page_markers = []
 
65
 
66
  # Build card dicts
67
  out = []
68
+ for i, raw_content in enumerate(cards, 1):
69
+ # Clean with LLM to remove headers/footers and IDs
70
+ cleaned = await clean_chunk_text(raw_content)
71
+ topic = await cheap_summarize(cleaned, max_sentences=1)
72
  if not topic:
73
+ topic = cleaned[:80] + "..."
74
+ summary = await cheap_summarize(cleaned, max_sentences=3)
75
  # Estimate page span
76
  first_page = pages[0]['page_num'] if pages else 1
77
  last_page = pages[-1]['page_num'] if pages else 1
 
81
  "filename": filename,
82
  "topic_name": topic[:120],
83
  "summary": summary,
84
+ "content": cleaned,
85
  "page_span": [first_page, last_page],
86
  "card_id": f"{slugify(filename)}-c{i:04d}"
87
  })
utils/summarizer.py CHANGED
@@ -1,44 +1,38 @@
1
- from typing import List
2
  import os
3
  import asyncio
 
4
  from .logger import get_logger
5
  from utils.rotator import robust_post_json
6
 
7
  logger = get_logger("SUM", __name__)
8
 
9
 
10
- async def llama_summarize(text: str, max_sentences: int = 3) -> str:
11
- """Summarize text using NVIDIA Llama via /v1/chat/completions. Returns plain text."""
12
- text = (text or "").strip()
13
- if not text:
14
- return ""
15
  model = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")
16
  key = os.getenv("NVIDIA_API_1", "") or os.getenv("NVIDIA_API_KEY", "")
17
  if not key:
18
- logger.warning("NVIDIA API key not set; returning naive fallback summary")
19
- return naive_fallback(text, max_sentences)
 
 
 
 
20
 
21
- system_prompt = (
22
- "You are a precise summarizer. Produce a concise summary of the user's text. "
23
- f"Return about {max_sentences} sentences, no preface, no markdown."
24
- )
25
- user_prompt = f"Summarize this:\n\n{text}"
26
 
 
 
 
 
 
 
 
 
 
27
  try:
28
- url = "https://integrate.api.nvidia.com/v1/chat/completions"
29
- headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
30
- payload = {
31
- "model": model,
32
- "temperature": 0.2,
33
- "messages": [
34
- {"role": "system", "content": system_prompt},
35
- {"role": "user", "content": user_prompt},
36
- ]
37
- }
38
- # Using rotator helper for retries; if not available, simple fetch could be used
39
- data = await robust_post_json(url, headers, payload)
40
- content = data["choices"][0]["message"]["content"].strip()
41
- return content
42
  except Exception as e:
43
  logger.warning(f"LLAMA summarization failed: {e}; using fallback")
44
  return naive_fallback(text, max_sentences)
@@ -49,6 +43,57 @@ def naive_fallback(text: str, max_sentences: int = 3) -> str:
49
  return '. '.join(parts[:max_sentences])
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  # Backward-compatible name used by app.py
53
  async def cheap_summarize(text: str, max_sentences: int = 3) -> str:
54
  return await llama_summarize(text, max_sentences)
 
 
1
  import os
2
  import asyncio
3
+ from typing import List
4
  from .logger import get_logger
5
  from utils.rotator import robust_post_json
6
 
7
  logger = get_logger("SUM", __name__)
8
 
9
 
10
+ async def llama_chat(messages, temperature: float = 0.2) -> str:
 
 
 
 
11
  model = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")
12
  key = os.getenv("NVIDIA_API_1", "") or os.getenv("NVIDIA_API_KEY", "")
13
  if not key:
14
+ raise RuntimeError("NVIDIA API key not set")
15
+ url = "https://integrate.api.nvidia.com/v1/chat/completions"
16
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
17
+ payload = {"model": model, "temperature": temperature, "messages": messages}
18
+ data = await robust_post_json(url, headers, payload)
19
+ return data["choices"][0]["message"]["content"].strip()
20
 
 
 
 
 
 
21
 
22
+ async def llama_summarize(text: str, max_sentences: int = 3) -> str:
23
+ text = (text or "").strip()
24
+ if not text:
25
+ return ""
26
+ system = (
27
+ "You are a precise summarizer. Produce a clear, faithful summary of the user's text. "
28
+ f"Return ~{max_sentences} sentences, no preface, no markdown."
29
+ )
30
+ user = f"Summarize this text:\n\n{text}"
31
  try:
32
+ return await llama_chat([
33
+ {"role": "system", "content": system},
34
+ {"role": "user", "content": user},
35
+ ])
 
 
 
 
 
 
 
 
 
 
36
  except Exception as e:
37
  logger.warning(f"LLAMA summarization failed: {e}; using fallback")
38
  return naive_fallback(text, max_sentences)
 
43
  return '. '.join(parts[:max_sentences])
44
 
45
 
46
+ async def summarize_text(text: str, max_sentences: int = 6, chunk_size: int = 2500) -> str:
47
+ """Hierarchical summarization for long texts using NVIDIA Llama."""
48
+ if not text:
49
+ return ""
50
+ if len(text) <= chunk_size:
51
+ return await llama_summarize(text, max_sentences=max_sentences)
52
+ # Split into chunks on paragraph boundaries if possible
53
+ paragraphs = text.split('\n\n')
54
+ chunks: List[str] = []
55
+ buf = []
56
+ total = 0
57
+ for p in paragraphs:
58
+ if total + len(p) > chunk_size and buf:
59
+ chunks.append('\n\n'.join(buf))
60
+ buf, total = [], 0
61
+ buf.append(p)
62
+ total += len(p)
63
+ if buf:
64
+ chunks.append('\n\n'.join(buf))
65
+
66
+ partials = []
67
+ for ch in chunks:
68
+ partials.append(await llama_summarize(ch, max_sentences=3))
69
+ await asyncio.sleep(0)
70
+ combined = '\n'.join(partials)
71
+ return await llama_summarize(combined, max_sentences=max_sentences)
72
+
73
+
74
+ async def clean_chunk_text(text: str) -> str:
75
+ """Use NVIDIA LLM to remove headers/footers and personally identifying/institution boilerplate.
76
+ Keep the core academic content intact. Do not remove page numbers or section titles.
77
+ """
78
+ content = (text or "").strip()
79
+ if not content:
80
+ return content
81
+ system = (
82
+ "You are a content cleaner. Remove boilerplate headers/footers like institution names, course codes, student IDs, "
83
+ "emails, author IDs, document footers/headers repeated across pages. Keep headings and the main body content. "
84
+ "Preserve meaningful section titles. Keep pagination references in the natural text if present. Return only cleaned text."
85
+ )
86
+ user = f"Clean this content by removing headers/footers and IDs, keep core content:\n\n{content}"
87
+ try:
88
+ return await llama_chat([
89
+ {"role": "system", "content": system},
90
+ {"role": "user", "content": user},
91
+ ], temperature=0.0)
92
+ except Exception as e:
93
+ logger.warning(f"LLAMA cleaning failed: {e}; returning original text")
94
+ return content
95
+
96
+
97
  # Backward-compatible name used by app.py
98
  async def cheap_summarize(text: str, max_sentences: int = 3) -> str:
99
  return await llama_summarize(text, max_sentences)