Peterase committed on
Commit
d13f5bc
·
1 Parent(s): 54dfb7e

fix: live doc dedup + Jina boilerplate stripping

Browse files

rag_chat_use_case.py:
- Fix dedup: live docs have doc_id=None, which caused all but the first to be dropped.
Now uses the URL as a fallback dedup key for live results.
- Result: temporal queries now correctly return 3-4 live docs instead of 1

jina_reader_adapter.py:
- Add _strip_boilerplate() method to remove nav/footer/archives from extracted content
- Cuts at: Post navigation, Archives, Categories, Recent Posts, Social links, Copyright
- Hard cap at 8,000 chars to protect LLM context window
- Prevents 43K-176K char pages from flooding the LLM with navigation HTML

src/core/use_cases/rag_chat_use_case.py CHANGED
@@ -722,11 +722,14 @@ JSON:"""
722
  else:
723
  final_pool = quality_docs[:top_k]
724
 
725
- # Deduplicate by doc_id
726
  seen: set = set()
727
  deduped_final: List[Dict[str, Any]] = []
728
  for d in final_pool:
729
  did = d.get("doc_id")
 
 
 
730
  if did in seen:
731
  continue
732
  seen.add(did)
 
722
  else:
723
  final_pool = quality_docs[:top_k]
724
 
725
+ # Deduplicate by doc_id — live results use URL as fallback key
726
  seen: set = set()
727
  deduped_final: List[Dict[str, Any]] = []
728
  for d in final_pool:
729
  did = d.get("doc_id")
730
+ # Live results have no doc_id — use URL as dedup key instead
731
+ if did is None:
732
+ did = d.get("url") or d.get("metadata", {}).get("url") or id(d)
733
  if did in seen:
734
  continue
735
  seen.add(did)
src/infrastructure/adapters/jina_reader_adapter.py CHANGED
@@ -133,9 +133,14 @@ class JinaReaderAdapter:
133
  if line.strip(): # Skip empty lines at start
134
  body_lines = lines[i:]
135
  break
136
-
137
  body = '\n'.join(body_lines).strip()
138
-
 
 
 
 
 
139
  # Validate content
140
  if not body or len(body) < 100:
141
  logger.warning(
@@ -147,11 +152,11 @@ class JinaReaderAdapter:
147
  "url": url,
148
  "error": "Insufficient content extracted"
149
  }
150
-
151
  logger.info(
152
  f"βœ… Jina extracted {len(body):,} chars from {url[:50]}"
153
  )
154
-
155
  return {
156
  "success": True,
157
  "url": url,
@@ -355,6 +360,49 @@ class JinaReaderAdapter:
355
  self.client = None
356
  logger.debug("Jina Reader client closed")
357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  def is_available(self) -> bool:
359
  """Check if Jina Reader is available"""
360
  # Jina Reader is always available (no API key required)
 
133
  if line.strip(): # Skip empty lines at start
134
  body_lines = lines[i:]
135
  break
136
+
137
  body = '\n'.join(body_lines).strip()
138
+
139
+ # ── Strip boilerplate: navigation, footer, archives ───────────
140
+ # Jina extracts the full page markdown including nav/footer.
141
+ # We cut at the first sign of boilerplate to keep only the article.
142
+ body = self._strip_boilerplate(body)
143
+
144
  # Validate content
145
  if not body or len(body) < 100:
146
  logger.warning(
 
152
  "url": url,
153
  "error": "Insufficient content extracted"
154
  }
155
+
156
  logger.info(
157
  f"βœ… Jina extracted {len(body):,} chars from {url[:50]}"
158
  )
159
+
160
  return {
161
  "success": True,
162
  "url": url,
 
360
  self.client = None
361
  logger.debug("Jina Reader client closed")
362
 
363
+ def _strip_boilerplate(self, content: str, max_chars: int = 8000) -> str:
364
+ """
365
+ Strip navigation, footer, archives and other boilerplate from
366
+ Jina-extracted markdown. Keeps only the article body.
367
+
368
+ Strategy:
369
+ 1. Cut at common boilerplate section markers
370
+ 2. Hard cap at max_chars to avoid sending 176K chars to the LLM
371
+ """
372
+ import re
373
+
374
+ # Markers that indicate end of article content
375
+ # Everything after these is navigation/footer/boilerplate
376
+ CUTOFF_PATTERNS = [
377
+ r'\n## (Post navigation|Archives|Categories|Recent Posts|Search|Newsletter|Socials|Tags|Related)',
378
+ r'\n### (Post navigation|Archives|Categories|Recent Posts|Related)',
379
+ r'\n\* \[Home\]\(', # Navigation list starting with Home
380
+ r'\n\* \[Facebook\]\(', # Social links
381
+ r'\nCopyright Β©',
382
+ r'\n---\n.*\n---', # Horizontal rules often mark footer
383
+ r'\nShare on (Facebook|Twitter|X|LinkedIn)',
384
+ r'\n## Search\n',
385
+ r'\n## Newsletter\n',
386
+ r'\n## Socials\n',
387
+ ]
388
+
389
+ for pattern in CUTOFF_PATTERNS:
390
+ match = re.search(pattern, content, re.IGNORECASE)
391
+ if match:
392
+ content = content[:match.start()].strip()
393
+ break
394
+
395
+ # Hard cap β€” LLM context window protection
396
+ if len(content) > max_chars:
397
+ # Try to cut at a paragraph boundary
398
+ cutoff = content[:max_chars].rfind('\n\n')
399
+ if cutoff > max_chars * 0.7:
400
+ content = content[:cutoff].strip()
401
+ else:
402
+ content = content[:max_chars].strip()
403
+
404
+ return content
405
+
406
  def is_available(self) -> bool:
407
  """Check if Jina Reader is available"""
408
  # Jina Reader is always available (no API key required)