LiamKhoaLe committed on
Commit
e93c61d
·
1 Parent(s): 5c627aa

Update summariser Llama prompting. Update upload status handler, max file size, and file-count limits

Browse files
Files changed (5)
  1. app.py +55 -2
  2. static/index.html +1 -1
  3. static/script.js +35 -30
  4. utils/chunker.py +10 -7
  5. utils/summarizer.py +72 -27
app.py CHANGED
@@ -53,6 +53,7 @@ class MessageResponse(BaseModel):
53
  class UploadResponse(BaseModel):
54
  job_id: str
55
  status: str
 
56
 
57
  class FileSummaryResponse(BaseModel):
58
  filename: str
@@ -81,6 +82,9 @@ app.add_middleware(
81
  # Serve static files (index.html, scripts.js, styles.css)
82
  app.mount("/static", StaticFiles(directory="static"), name="static")
83
 
 
 
 
84
 
85
  # ────────────────────────────── Global Clients ──────────────────────────────
86
  # API rotators (round robin + auto failover on quota errors)
@@ -374,12 +378,29 @@ async def upload_files(
374
  """
375
  job_id = str(uuid.uuid4())
376
 
 
 
 
 
 
 
377
  # Read file bytes upfront to avoid reading from closed streams in background task
378
  preloaded_files = []
379
  for uf in files:
380
  raw = await uf.read()
 
 
381
  preloaded_files.append((uf.filename, raw))
382
 
 
 
 
 
 
 
 
 
 
383
  # Single background task: process files sequentially with isolation
384
  async def _process_all():
385
  for idx, (fname, raw) in enumerate(preloaded_files, start=1):
@@ -411,7 +432,7 @@ async def upload_files(
411
  p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in caps])).strip()
412
 
413
  # Build cards
414
- cards = build_cards_from_pages(pages, filename=fname, user_id=user_id, project_id=project_id)
415
  logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")
416
 
417
  # Embed & store
@@ -426,16 +447,48 @@ async def upload_files(
426
  file_summary = await cheap_summarize(full_text, max_sentences=6)
427
  rag.upsert_file_summary(user_id=user_id, project_id=project_id, filename=fname, summary=file_summary)
428
  logger.info(f"[{job_id}] Completed {fname}")
 
 
 
 
 
429
  except Exception as e:
430
  logger.error(f"[{job_id}] Failed processing {fname}: {e}")
 
 
 
 
431
  finally:
432
  # Yield control between files to keep loop responsive
433
  await asyncio.sleep(0)
434
 
435
  logger.info(f"[{job_id}] Ingestion complete for {len(preloaded_files)} files")
 
 
 
 
436
 
437
  background_tasks.add_task(_process_all)
438
- return UploadResponse(job_id=job_id, status="processing")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
 
440
 
441
  @app.get("/cards")
 
53
  class UploadResponse(BaseModel):
54
  job_id: str
55
  status: str
56
+ total_files: Optional[int] = None
57
 
58
  class FileSummaryResponse(BaseModel):
59
  filename: str
 
82
  # Serve static files (index.html, scripts.js, styles.css)
83
  app.mount("/static", StaticFiles(directory="static"), name="static")
84
 
85
+ # In-memory job tracker (for progress queries)
86
+ app.state.jobs = {}
87
+
88
 
89
  # ────────────────────────────── Global Clients ──────────────────────────────
90
  # API rotators (round robin + auto failover on quota errors)
 
378
  """
379
  job_id = str(uuid.uuid4())
380
 
381
+ # Basic upload policy limits
382
+ max_files = int(os.getenv("MAX_FILES_PER_UPLOAD", "15"))
383
+ max_mb = int(os.getenv("MAX_FILE_MB", "50"))
384
+ if len(files) > max_files:
385
+ raise HTTPException(400, detail=f"Too many files. Max {max_files} allowed per upload.")
386
+
387
  # Read file bytes upfront to avoid reading from closed streams in background task
388
  preloaded_files = []
389
  for uf in files:
390
  raw = await uf.read()
391
+ if len(raw) > max_mb * 1024 * 1024:
392
+ raise HTTPException(400, detail=f"{uf.filename} exceeds {max_mb} MB limit")
393
  preloaded_files.append((uf.filename, raw))
394
 
395
+ # Initialize job status
396
+ app.state.jobs[job_id] = {
397
+ "created_at": time.time(),
398
+ "total": len(preloaded_files),
399
+ "completed": 0,
400
+ "status": "processing",
401
+ "last_error": None,
402
+ }
403
+
404
  # Single background task: process files sequentially with isolation
405
  async def _process_all():
406
  for idx, (fname, raw) in enumerate(preloaded_files, start=1):
 
432
  p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in caps])).strip()
433
 
434
  # Build cards
435
+ cards = await build_cards_from_pages(pages, filename=fname, user_id=user_id, project_id=project_id)
436
  logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")
437
 
438
  # Embed & store
 
447
  file_summary = await cheap_summarize(full_text, max_sentences=6)
448
  rag.upsert_file_summary(user_id=user_id, project_id=project_id, filename=fname, summary=file_summary)
449
  logger.info(f"[{job_id}] Completed {fname}")
450
+ # Update job progress
451
+ job = app.state.jobs.get(job_id)
452
+ if job:
453
+ job["completed"] = idx
454
+ job["status"] = "processing" if idx < job.get("total", 0) else "completed"
455
  except Exception as e:
456
  logger.error(f"[{job_id}] Failed processing {fname}: {e}")
457
+ job = app.state.jobs.get(job_id)
458
+ if job:
459
+ job["last_error"] = str(e)
460
+ job["completed"] = idx # count as completed attempt
461
  finally:
462
  # Yield control between files to keep loop responsive
463
  await asyncio.sleep(0)
464
 
465
  logger.info(f"[{job_id}] Ingestion complete for {len(preloaded_files)} files")
466
+ # Finalize job status
467
+ job = app.state.jobs.get(job_id)
468
+ if job:
469
+ job["status"] = "completed"
470
 
471
  background_tasks.add_task(_process_all)
472
+ return UploadResponse(job_id=job_id, status="processing", total_files=len(preloaded_files))
473
+
474
+
475
+ @app.get("/upload/status")
476
+ async def upload_status(job_id: str):
477
+ job = app.state.jobs.get(job_id)
478
+ if not job:
479
+ raise HTTPException(404, detail="Job not found")
480
+ percent = 0
481
+ if job.get("total"):
482
+ percent = int(round((job.get("completed", 0) / job.get("total", 1)) * 100))
483
+ return {
484
+ "job_id": job_id,
485
+ "status": job.get("status"),
486
+ "completed": job.get("completed"),
487
+ "total": job.get("total"),
488
+ "percent": percent,
489
+ "last_error": job.get("last_error"),
490
+ "created_at": job.get("created_at"),
491
+ }
492
 
493
 
494
  @app.get("/cards")
static/index.html CHANGED
@@ -159,7 +159,7 @@
159
  <span class="btn-text">Upload Documents</span>
160
  <span class="btn-loading" style="display:none;">
161
  <div class="spinner"></div>
162
- Processing (DO NOT REFRESH)...
163
  </span>
164
  </button>
165
  </form>
 
159
  <span class="btn-text">Upload Documents</span>
160
  <span class="btn-loading" style="display:none;">
161
  <div class="spinner"></div>
162
+ Processing...
163
  </span>
164
  </button>
165
  </form>
static/script.js CHANGED
@@ -205,8 +205,8 @@
205
  logProgress(`Job ID: ${data.job_id}`);
206
  logProgress('Files uploaded successfully');
207
 
208
- // Deterministic per-file progression
209
- simulateProcessing(selectedFiles.length);
210
  } else {
211
  throw new Error(data.detail || 'Upload failed');
212
  }
@@ -248,35 +248,40 @@
248
  progressLog.scrollTop = progressLog.scrollHeight;
249
  }
250
 
251
- function simulateProcessing(totalFiles) {
252
- // Split 100% evenly across files. Round to nearest integer.
253
- let completed = 0;
254
- const step = Math.round(100 / Math.max(totalFiles, 1));
255
- const targets = Array.from({ length: totalFiles }, (_, i) => Math.min(100, Math.round(((i + 1) / totalFiles) * 100)));
256
-
257
- function advance() {
258
- if (completed >= totalFiles) {
259
- updateProgressFill(100);
260
- updateProgressStatus('Processing complete!');
261
- logProgress('All documents processed successfully');
262
- logProgress('You can now start chatting with your documents');
263
- setTimeout(() => hideUploadProgress(), 1500);
264
- enableChat();
265
- return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  }
267
-
268
- const currentTarget = targets[completed];
269
- updateProgressFill(currentTarget);
270
- updateProgressStatus(`Processing documents... ${currentTarget}%`);
271
- logProgress(`Finished processing file ${completed + 1}/${totalFiles}`);
272
- completed += 1;
273
-
274
- // Wait a short time before next step (simulated, since backend is background)
275
- setTimeout(advance, 1200);
276
- }
277
-
278
- // kick off first step after a short delay to show feedback
279
- setTimeout(advance, 800);
280
  }
281
 
282
  function enableChat() {
 
205
  logProgress(`Job ID: ${data.job_id}`);
206
  logProgress('Files uploaded successfully');
207
 
208
+ // Poll backend for real progress
209
+ startUploadStatusPolling(data.job_id, data.total_files || selectedFiles.length);
210
  } else {
211
  throw new Error(data.detail || 'Upload failed');
212
  }
 
248
  progressLog.scrollTop = progressLog.scrollHeight;
249
  }
250
 
251
+ function startUploadStatusPolling(jobId, totalFiles) {
252
+ let stopped = false;
253
+ const interval = setInterval(async () => {
254
+ if (stopped) return;
255
+ try {
256
+ const res = await fetch(`/upload/status?job_id=${encodeURIComponent(jobId)}`);
257
+ if (!res.ok) {
258
+ throw new Error('Status not available');
259
+ }
260
+ const status = await res.json();
261
+ const percent = Math.max(0, Math.min(100, parseInt(status.percent || 0, 10)));
262
+ const completed = status.completed || 0;
263
+ const total = status.total || totalFiles || 1;
264
+ updateProgressFill(percent);
265
+ updateProgressStatus(`Processing documents... ${percent}% (${completed}/${total})`);
266
+ if (status.last_error) {
267
+ logProgress(`Warning: ${status.last_error}`);
268
+ }
269
+ if (status.status === 'completed' || percent >= 100) {
270
+ clearInterval(interval);
271
+ stopped = true;
272
+ updateProgressFill(100);
273
+ updateProgressStatus('Processing complete!');
274
+ logProgress('All documents processed successfully');
275
+ logProgress('You can now start chatting with your documents');
276
+ setTimeout(() => hideUploadProgress(), 1500);
277
+ enableChat();
278
+ }
279
+ } catch (e) {
280
+ clearInterval(interval);
281
+ stopped = true;
282
+ logProgress(`Error reading job status: ${e.message}`);
283
  }
284
+ }, 1200);
 
 
 
 
 
 
 
 
 
 
 
 
285
  }
286
 
287
  function enableChat() {
utils/chunker.py CHANGED
@@ -1,7 +1,7 @@
1
  # ────────────────────────────── utils/chunker.py ──────────────────────────────
2
  import re
3
  from typing import List, Dict, Any
4
- from .summarizer import cheap_summarize
5
  from .common import split_sentences, slugify
6
  from .logger import get_logger
7
 
@@ -14,6 +14,7 @@ MAX_WORDS = 500
14
  MIN_WORDS = 150
15
  logger = get_logger("CHUNKER", __name__)
16
 
 
17
  def _by_headings(text: str):
18
  # split on markdown-like or outline headings
19
  pattern = r"(?m)^(#{1,6}\s.*|[0-9]+\.\s+[^\n]+|[A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$"
@@ -32,7 +33,7 @@ def _by_headings(text: str):
32
  return parts
33
 
34
 
35
- def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
36
  # Concatenate pages but keep page spans for metadata
37
  full = ""
38
  page_markers = []
@@ -64,11 +65,13 @@ def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id:
64
 
65
  # Build card dicts
66
  out = []
67
- for i, content in enumerate(cards, 1):
68
- topic = cheap_summarize(content, max_sentences=1)
 
 
69
  if not topic:
70
- topic = content[:80] + "..."
71
- summary = cheap_summarize(content, max_sentences=3)
72
  # Estimate page span
73
  first_page = pages[0]['page_num'] if pages else 1
74
  last_page = pages[-1]['page_num'] if pages else 1
@@ -78,7 +81,7 @@ def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id:
78
  "filename": filename,
79
  "topic_name": topic[:120],
80
  "summary": summary,
81
- "content": content,
82
  "page_span": [first_page, last_page],
83
  "card_id": f"{slugify(filename)}-c{i:04d}"
84
  })
 
1
  # ────────────────────────────── utils/chunker.py ──────────────────────────────
2
  import re
3
  from typing import List, Dict, Any
4
+ from .summarizer import cheap_summarize, clean_chunk_text
5
  from .common import split_sentences, slugify
6
  from .logger import get_logger
7
 
 
14
  MIN_WORDS = 150
15
  logger = get_logger("CHUNKER", __name__)
16
 
17
+
18
  def _by_headings(text: str):
19
  # split on markdown-like or outline headings
20
  pattern = r"(?m)^(#{1,6}\s.*|[0-9]+\.\s+[^\n]+|[A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$"
 
33
  return parts
34
 
35
 
36
+ async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
37
  # Concatenate pages but keep page spans for metadata
38
  full = ""
39
  page_markers = []
 
65
 
66
  # Build card dicts
67
  out = []
68
+ for i, raw_content in enumerate(cards, 1):
69
+ # Clean with LLM to remove headers/footers and IDs
70
+ cleaned = await clean_chunk_text(raw_content)
71
+ topic = await cheap_summarize(cleaned, max_sentences=1)
72
  if not topic:
73
+ topic = cleaned[:80] + "..."
74
+ summary = await cheap_summarize(cleaned, max_sentences=3)
75
  # Estimate page span
76
  first_page = pages[0]['page_num'] if pages else 1
77
  last_page = pages[-1]['page_num'] if pages else 1
 
81
  "filename": filename,
82
  "topic_name": topic[:120],
83
  "summary": summary,
84
+ "content": cleaned,
85
  "page_span": [first_page, last_page],
86
  "card_id": f"{slugify(filename)}-c{i:04d}"
87
  })
utils/summarizer.py CHANGED
@@ -1,44 +1,38 @@
1
- from typing import List
2
  import os
3
  import asyncio
 
4
  from .logger import get_logger
5
  from utils.rotator import robust_post_json
6
 
7
  logger = get_logger("SUM", __name__)
8
 
9
 
10
- async def llama_summarize(text: str, max_sentences: int = 3) -> str:
11
- """Summarize text using NVIDIA Llama via /v1/chat/completions. Returns plain text."""
12
- text = (text or "").strip()
13
- if not text:
14
- return ""
15
  model = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")
16
  key = os.getenv("NVIDIA_API_1", "") or os.getenv("NVIDIA_API_KEY", "")
17
  if not key:
18
- logger.warning("NVIDIA API key not set; returning naive fallback summary")
19
- return naive_fallback(text, max_sentences)
 
 
 
 
20
 
21
- system_prompt = (
22
- "You are a precise summarizer. Produce a concise summary of the user's text. "
23
- f"Return about {max_sentences} sentences, no preface, no markdown."
24
- )
25
- user_prompt = f"Summarize this:\n\n{text}"
26
 
 
 
 
 
 
 
 
 
 
27
  try:
28
- url = "https://integrate.api.nvidia.com/v1/chat/completions"
29
- headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
30
- payload = {
31
- "model": model,
32
- "temperature": 0.2,
33
- "messages": [
34
- {"role": "system", "content": system_prompt},
35
- {"role": "user", "content": user_prompt},
36
- ]
37
- }
38
- # Using rotator helper for retries; if not available, simple fetch could be used
39
- data = await robust_post_json(url, headers, payload)
40
- content = data["choices"][0]["message"]["content"].strip()
41
- return content
42
  except Exception as e:
43
  logger.warning(f"LLAMA summarization failed: {e}; using fallback")
44
  return naive_fallback(text, max_sentences)
@@ -49,6 +43,57 @@ def naive_fallback(text: str, max_sentences: int = 3) -> str:
49
  return '. '.join(parts[:max_sentences])
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  # Backward-compatible name used by app.py
53
  async def cheap_summarize(text: str, max_sentences: int = 3) -> str:
54
  return await llama_summarize(text, max_sentences)
 
 
1
  import os
2
  import asyncio
3
+ from typing import List
4
  from .logger import get_logger
5
  from utils.rotator import robust_post_json
6
 
7
  logger = get_logger("SUM", __name__)
8
 
9
 
10
+ async def llama_chat(messages, temperature: float = 0.2) -> str:
 
 
 
 
11
  model = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")
12
  key = os.getenv("NVIDIA_API_1", "") or os.getenv("NVIDIA_API_KEY", "")
13
  if not key:
14
+ raise RuntimeError("NVIDIA API key not set")
15
+ url = "https://integrate.api.nvidia.com/v1/chat/completions"
16
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
17
+ payload = {"model": model, "temperature": temperature, "messages": messages}
18
+ data = await robust_post_json(url, headers, payload)
19
+ return data["choices"][0]["message"]["content"].strip()
20
 
 
 
 
 
 
21
 
22
+ async def llama_summarize(text: str, max_sentences: int = 3) -> str:
23
+ text = (text or "").strip()
24
+ if not text:
25
+ return ""
26
+ system = (
27
+ "You are a precise summarizer. Produce a clear, faithful summary of the user's text. "
28
+ f"Return ~{max_sentences} sentences, no preface, no markdown."
29
+ )
30
+ user = f"Summarize this text:\n\n{text}"
31
  try:
32
+ return await llama_chat([
33
+ {"role": "system", "content": system},
34
+ {"role": "user", "content": user},
35
+ ])
 
 
 
 
 
 
 
 
 
 
36
  except Exception as e:
37
  logger.warning(f"LLAMA summarization failed: {e}; using fallback")
38
  return naive_fallback(text, max_sentences)
 
43
  return '. '.join(parts[:max_sentences])
44
 
45
 
46
+ async def summarize_text(text: str, max_sentences: int = 6, chunk_size: int = 2500) -> str:
47
+ """Hierarchical summarization for long texts using NVIDIA Llama."""
48
+ if not text:
49
+ return ""
50
+ if len(text) <= chunk_size:
51
+ return await llama_summarize(text, max_sentences=max_sentences)
52
+ # Split into chunks on paragraph boundaries if possible
53
+ paragraphs = text.split('\n\n')
54
+ chunks: List[str] = []
55
+ buf = []
56
+ total = 0
57
+ for p in paragraphs:
58
+ if total + len(p) > chunk_size and buf:
59
+ chunks.append('\n\n'.join(buf))
60
+ buf, total = [], 0
61
+ buf.append(p)
62
+ total += len(p)
63
+ if buf:
64
+ chunks.append('\n\n'.join(buf))
65
+
66
+ partials = []
67
+ for ch in chunks:
68
+ partials.append(await llama_summarize(ch, max_sentences=3))
69
+ await asyncio.sleep(0)
70
+ combined = '\n'.join(partials)
71
+ return await llama_summarize(combined, max_sentences=max_sentences)
72
+
73
+
74
+ async def clean_chunk_text(text: str) -> str:
75
+ """Use NVIDIA LLM to remove headers/footers and personally identifying/institution boilerplate.
76
+ Keep the core academic content intact. Do not remove page numbers or section titles.
77
+ """
78
+ content = (text or "").strip()
79
+ if not content:
80
+ return content
81
+ system = (
82
+ "You are a content cleaner. Remove boilerplate headers/footers like institution names, course codes, student IDs, "
83
+ "emails, author IDs, document footers/headers repeated across pages. Keep headings and the main body content. "
84
+ "Preserve meaningful section titles. Keep pagination references in the natural text if present. Return only cleaned text."
85
+ )
86
+ user = f"Clean this content by removing headers/footers and IDs, keep core content:\n\n{content}"
87
+ try:
88
+ return await llama_chat([
89
+ {"role": "system", "content": system},
90
+ {"role": "user", "content": user},
91
+ ], temperature=0.0)
92
+ except Exception as e:
93
+ logger.warning(f"LLAMA cleaning failed: {e}; returning original text")
94
+ return content
95
+
96
+
97
  # Backward-compatible name used by app.py
98
  async def cheap_summarize(text: str, max_sentences: int = 3) -> str:
99
  return await llama_summarize(text, max_sentences)