Spaces:

Peterase
/

rag-api-node-1

Running

Peterase committed 5 days ago

Commit

d13f5bc

1 Parent(s): 54dfb7e

fix: live doc dedup + Jina boilerplate stripping

rag_chat_use_case.py:
- Fix dedup: live docs have doc_id=None, which caused all but the first to be dropped.
Now uses the URL as a fallback dedup key for live results.
- Result: temporal queries now correctly return 3-4 live docs instead of 1

jina_reader_adapter.py:
- Add _strip_boilerplate() method to remove nav/footer/archives from extracted content
- Cuts at: Post navigation, Archives, Categories, Recent Posts, Social links, Copyright
- Hard cap at 8,000 chars to protect LLM context window
- Prevents 43K-176K char pages from flooding the LLM with navigation HTML

Files changed (2) hide show

src/core/use_cases/rag_chat_use_case.py +4 -1
src/infrastructure/adapters/jina_reader_adapter.py +52 -4

src/core/use_cases/rag_chat_use_case.py CHANGED Viewed

@@ -722,11 +722,14 @@ JSON:"""
             else:
                 final_pool = quality_docs[:top_k]
-        # Deduplicate by doc_id
         seen: set = set()
         deduped_final: List[Dict[str, Any]] = []
         for d in final_pool:
             did = d.get("doc_id")
             if did in seen:
                 continue
             seen.add(did)

             else:
                 final_pool = quality_docs[:top_k]
+        # Deduplicate by doc_id — live results use URL as fallback key
         seen: set = set()
         deduped_final: List[Dict[str, Any]] = []
         for d in final_pool:
             did = d.get("doc_id")
+            # Live results have no doc_id — use URL as dedup key instead
+            if did is None:
+                did = d.get("url") or d.get("metadata", {}).get("url") or id(d)
             if did in seen:
                 continue
             seen.add(did)

src/infrastructure/adapters/jina_reader_adapter.py CHANGED Viewed

@@ -133,9 +133,14 @@ class JinaReaderAdapter:
                     if line.strip():  # Skip empty lines at start
                         body_lines = lines[i:]
                         break
                 body = '\n'.join(body_lines).strip()
                 # Validate content
                 if not body or len(body) < 100:
                     logger.warning(
@@ -147,11 +152,11 @@ class JinaReaderAdapter:
                         "url": url,
                         "error": "Insufficient content extracted"
                     }
                 logger.info(
                     f"✅ Jina extracted {len(body):,} chars from {url[:50]}"
                 )
                 return {
                     "success": True,
                     "url": url,
@@ -355,6 +360,49 @@ class JinaReaderAdapter:
             self.client = None
             logger.debug("Jina Reader client closed")
     def is_available(self) -> bool:
         """Check if Jina Reader is available"""
         # Jina Reader is always available (no API key required)

                     if line.strip():  # Skip empty lines at start
                         body_lines = lines[i:]
                         break
                 body = '\n'.join(body_lines).strip()
+                # ── Strip boilerplate: navigation, footer, archives ───────────
+                # Jina extracts the full page markdown including nav/footer.
+                # We cut at the first sign of boilerplate to keep only the article.
+                body = self._strip_boilerplate(body)
                 # Validate content
                 if not body or len(body) < 100:
                     logger.warning(
                         "url": url,
                         "error": "Insufficient content extracted"
                     }
                 logger.info(
                     f"✅ Jina extracted {len(body):,} chars from {url[:50]}"
                 )
                 return {
                     "success": True,
                     "url": url,
             self.client = None
             logger.debug("Jina Reader client closed")
+    def _strip_boilerplate(self, content: str, max_chars: int = 8000) -> str:
+        """
+        Strip navigation, footer, archives and other boilerplate from
+        Jina-extracted markdown. Keeps only the article body.
+        Strategy:
+        1. Cut at common boilerplate section markers
+        2. Hard cap at max_chars to avoid sending 176K chars to the LLM
+        """
+        import re
+        # Markers that indicate end of article content
+        # Everything after these is navigation/footer/boilerplate
+        CUTOFF_PATTERNS = [
+            r'\n## (Post navigation|Archives|Categories|Recent Posts|Search|Newsletter|Socials|Tags|Related)',
+            r'\n### (Post navigation|Archives|Categories|Recent Posts|Related)',
+            r'\n\* \[Home\]\(',          # Navigation list starting with Home
+            r'\n\* \[Facebook\]\(',      # Social links
+            r'\nCopyright ©',
+            r'\n---\n.*\n---',           # Horizontal rules often mark footer
+            r'\nShare on (Facebook|Twitter|X|LinkedIn)',
+            r'\n## Search\n',
+            r'\n## Newsletter\n',
+            r'\n## Socials\n',
+        ]
+        for pattern in CUTOFF_PATTERNS:
+            match = re.search(pattern, content, re.IGNORECASE)
+            if match:
+                content = content[:match.start()].strip()
+                break
+        # Hard cap — LLM context window protection
+        if len(content) > max_chars:
+            # Try to cut at a paragraph boundary
+            cutoff = content[:max_chars].rfind('\n\n')
+            if cutoff > max_chars * 0.7:
+                content = content[:cutoff].strip()
+            else:
+                content = content[:max_chars].strip()
+        return content
     def is_available(self) -> bool:
         """Check if Jina Reader is available"""
         # Jina Reader is always available (no API key required)