Peterase committed on
Commit
54dfb7e
·
1 Parent(s): 4d63907

fix: NewsAPI query precision + non-news domain filtering

Browse files

NewsAPI (newsapi_adapter.py):
- Single-word queries now anchored: 'Ethiopia' → 'Ethiopia' AND ('Ethiopia' OR 'Africa' OR 'Horn of Africa')
- Fetch 2x results then filter, return max_results clean articles
- Block non-news domains at source: pypi.org, github, arxiv, plos, stemlynsblog, etc.
- Log count of filtered non-news articles

Ranker (hybrid_result_ranker.py):
- Add _NON_NEWS_DOMAINS blocklist to ranker as second filter
- Filter before deduplication so reranker never sees pypi/github/academic results
- Log count of filtered results

src/core/ranking/hybrid_result_ranker.py CHANGED
@@ -60,6 +60,14 @@ class HybridResultRanker:
60
  "waltainfo.com": 0.7,
61
  }
62
 
 
 
 
 
 
 
 
 
63
  def __init__(self, reranker):
64
  """
65
  Initialize hybrid result ranker.
@@ -101,6 +109,18 @@ class HybridResultRanker:
101
  f"(temporal_boost={'ON' if is_temporal else 'OFF'})"
102
  )
103
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  unique_results = self._deduplicate(all_results)
105
  logger.info(f"After deduplication: {len(unique_results)} unique results")
106
 
 
60
  "waltainfo.com": 0.7,
61
  }
62
 
63
+ # Non-news domains to filter out before reranking
64
+ _NON_NEWS_DOMAINS = {
65
+ "pypi.org", "github.com", "stackoverflow.com", "reddit.com",
66
+ "wikipedia.org", "arxiv.org", "researchgate.net", "academia.edu",
67
+ "plos.org", "pubmed.ncbi.nlm.nih.gov", "springer.com",
68
+ "stemlynsblog.org", "linkedin.com", "youtube.com",
69
+ }
70
+
71
  def __init__(self, reranker):
72
  """
73
  Initialize hybrid result ranker.
 
109
  f"(temporal_boost={'ON' if is_temporal else 'OFF'})"
110
  )
111
 
112
+ # Filter non-news domains from live results
113
+ before = len(all_results)
114
+ all_results = [
115
+ r for r in all_results
116
+ if not any(
117
+ nd in (r.get("url") or r.get("metadata", {}).get("url") or "").lower()
118
+ for nd in self._NON_NEWS_DOMAINS
119
+ )
120
+ ]
121
+ if len(all_results) < before:
122
+ logger.info(f"Filtered {before - len(all_results)} non-news domain results")
123
+
124
  unique_results = self._deduplicate(all_results)
125
  logger.info(f"After deduplication: {len(unique_results)} unique results")
126
 
src/infrastructure/adapters/newsapi_adapter.py CHANGED
@@ -69,8 +69,19 @@ class NewsAPIAdapter:
69
  }
70
  )
71
 
 
 
 
 
 
 
 
 
 
 
 
72
  async def search(
73
- self,
74
  query: str,
75
  language: str = "en",
76
  sort_by: str = "publishedAt",
@@ -79,51 +90,69 @@ class NewsAPIAdapter:
79
  ) -> List[Dict[str, Any]]:
80
  """
81
  Search NewsAPI for the given query.
82
- Automatically wraps multi-word queries in quotes for exact matching.
 
83
  """
84
  if not self.api_key:
85
  logger.warning("NewsAPI unavailable - returning empty results")
86
  return []
87
-
88
  await self._ensure_client()
89
-
90
  max_results = max_results or self.max_results
91
-
92
- # Wrap in quotes if multi-word and not already quoted — improves precision
93
- search_q = query
94
  words = query.strip().split()
95
- if len(words) > 1 and not query.startswith('"'):
96
- # Use AND logic: all key terms must appear
 
 
 
 
 
 
97
  search_q = " AND ".join(f'"{w}"' for w in words[:3])
98
-
99
  try:
100
  url = f"{self.BASE_URL}/everything"
101
  params = {
102
  "q": search_q,
103
  "language": language,
104
  "sortBy": sort_by,
105
- "pageSize": max_results
106
  }
107
  if from_date:
108
  params["from"] = from_date
109
-
110
  logger.info(f"[NewsAPI] Searching: '{search_q}' (lang={language})")
111
-
112
  response = await self.client.get(url, params=params)
113
-
114
  if response.status_code == 200:
115
  data = response.json()
116
  if data.get("status") != "ok":
117
  logger.warning(f"NewsAPI error: {data.get('message', 'unknown')}")
118
  return []
119
-
120
  articles = data.get("articles", [])
121
  results = []
 
122
  for article in articles:
 
 
 
 
 
 
 
123
  normalized = self._normalize_result(article)
124
  if normalized:
125
  results.append(normalized)
126
-
 
 
 
 
 
127
  logger.info(
128
  f"[NewsAPI] '{query[:50]}' → {len(results)} results "
129
  f"(total available: {data.get('totalResults', 0)})"
 
69
  }
70
  )
71
 
72
+ # Domains that are NOT news sources — filter these out
73
+ _NON_NEWS_DOMAINS = {
74
+ "pypi.org", "github.com", "stackoverflow.com", "reddit.com",
75
+ "wikipedia.org", "arxiv.org", "researchgate.net", "academia.edu",
76
+ "linkedin.com", "facebook.com", "twitter.com", "x.com",
77
+ "youtube.com", "instagram.com", "tiktok.com",
78
+ "amazon.com", "ebay.com", "etsy.com",
79
+ "plos.org", "pubmed.ncbi.nlm.nih.gov", "springer.com",
80
+ "stemlynsblog.org",
81
+ }
82
+
83
  async def search(
84
+ self,
85
  query: str,
86
  language: str = "en",
87
  sort_by: str = "publishedAt",
 
90
  ) -> List[Dict[str, Any]]:
91
  """
92
  Search NewsAPI for the given query.
93
+ Always anchors to Ethiopia/Africa context for single-word queries.
94
+ Filters out non-news domains (pypi, github, academic, social media).
95
  """
96
  if not self.api_key:
97
  logger.warning("NewsAPI unavailable - returning empty results")
98
  return []
99
+
100
  await self._ensure_client()
 
101
  max_results = max_results or self.max_results
102
+
103
+ # Build search query — always ensure Ethiopia/Africa context
 
104
  words = query.strip().split()
105
+ if len(words) == 1:
106
+ # Single word: anchor to Ethiopia news explicitly
107
+ search_q = f'"{query}" AND ("Ethiopia" OR "Africa" OR "Horn of Africa")'
108
+ elif len(words) <= 3:
109
+ # Short query: AND all terms
110
+ search_q = " AND ".join(f'"{w}"' for w in words)
111
+ else:
112
+ # Longer query: use first 3 key terms
113
  search_q = " AND ".join(f'"{w}"' for w in words[:3])
114
+
115
  try:
116
  url = f"{self.BASE_URL}/everything"
117
  params = {
118
  "q": search_q,
119
  "language": language,
120
  "sortBy": sort_by,
121
+ "pageSize": min(max_results * 2, 100), # Fetch extra to allow filtering
122
  }
123
  if from_date:
124
  params["from"] = from_date
125
+
126
  logger.info(f"[NewsAPI] Searching: '{search_q}' (lang={language})")
127
+
128
  response = await self.client.get(url, params=params)
129
+
130
  if response.status_code == 200:
131
  data = response.json()
132
  if data.get("status") != "ok":
133
  logger.warning(f"NewsAPI error: {data.get('message', 'unknown')}")
134
  return []
135
+
136
  articles = data.get("articles", [])
137
  results = []
138
+ filtered_out = 0
139
  for article in articles:
140
+ # Filter non-news domains
141
+ url_str = article.get("url", "")
142
+ domain = self._extract_domain(url_str)
143
+ if any(nd in domain for nd in self._NON_NEWS_DOMAINS):
144
+ filtered_out += 1
145
+ logger.debug(f"[NewsAPI] Filtered non-news: {domain}")
146
+ continue
147
  normalized = self._normalize_result(article)
148
  if normalized:
149
  results.append(normalized)
150
+ if len(results) >= max_results:
151
+ break
152
+
153
+ if filtered_out:
154
+ logger.info(f"[NewsAPI] Filtered {filtered_out} non-news articles")
155
+
156
  logger.info(
157
  f"[NewsAPI] '{query[:50]}' → {len(results)} results "
158
  f"(total available: {data.get('totalResults', 0)})"