Spaces:
Running
Running
fix: NewsAPI query precision + non-news domain filtering
Browse files
NewsAPI (newsapi_adapter.py):
- Single-word queries now anchored: 'Ethiopia' → 'Ethiopia' AND ('Ethiopia' OR 'Africa' OR 'Horn of Africa')
- Fetch 2x max_results (capped at 100 per request), filter, then return at most max_results clean articles
- Block non-news domains at source: pypi.org, github, arxiv, plos, stemlynsblog, etc.
- Log count of filtered non-news articles
Ranker (hybrid_result_ranker.py):
- Add _NON_NEWS_DOMAINS blocklist to ranker as second filter
- Filter before deduplication so reranker never sees pypi/github/academic results
- Log count of filtered results
src/core/ranking/hybrid_result_ranker.py
CHANGED
|
@@ -60,6 +60,14 @@ class HybridResultRanker:
|
|
| 60 |
"waltainfo.com": 0.7,
|
| 61 |
}
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def __init__(self, reranker):
|
| 64 |
"""
|
| 65 |
Initialize hybrid result ranker.
|
|
@@ -101,6 +109,18 @@ class HybridResultRanker:
|
|
| 101 |
f"(temporal_boost={'ON' if is_temporal else 'OFF'})"
|
| 102 |
)
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
unique_results = self._deduplicate(all_results)
|
| 105 |
logger.info(f"After deduplication: {len(unique_results)} unique results")
|
| 106 |
|
|
|
|
| 60 |
"waltainfo.com": 0.7,
|
| 61 |
}
|
| 62 |
|
| 63 |
+
# Non-news domains to filter out before reranking
|
| 64 |
+
_NON_NEWS_DOMAINS = {
|
| 65 |
+
"pypi.org", "github.com", "stackoverflow.com", "reddit.com",
|
| 66 |
+
"wikipedia.org", "arxiv.org", "researchgate.net", "academia.edu",
|
| 67 |
+
"plos.org", "pubmed.ncbi.nlm.nih.gov", "springer.com",
|
| 68 |
+
"stemlynsblog.org", "linkedin.com", "youtube.com",
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
def __init__(self, reranker):
|
| 72 |
"""
|
| 73 |
Initialize hybrid result ranker.
|
|
|
|
| 109 |
f"(temporal_boost={'ON' if is_temporal else 'OFF'})"
|
| 110 |
)
|
| 111 |
|
| 112 |
+
# Filter non-news domains from live results
|
| 113 |
+
before = len(all_results)
|
| 114 |
+
all_results = [
|
| 115 |
+
r for r in all_results
|
| 116 |
+
if not any(
|
| 117 |
+
nd in (r.get("url") or r.get("metadata", {}).get("url") or "").lower()
|
| 118 |
+
for nd in self._NON_NEWS_DOMAINS
|
| 119 |
+
)
|
| 120 |
+
]
|
| 121 |
+
if len(all_results) < before:
|
| 122 |
+
logger.info(f"Filtered {before - len(all_results)} non-news domain results")
|
| 123 |
+
|
| 124 |
unique_results = self._deduplicate(all_results)
|
| 125 |
logger.info(f"After deduplication: {len(unique_results)} unique results")
|
| 126 |
|
src/infrastructure/adapters/newsapi_adapter.py
CHANGED
|
@@ -69,8 +69,19 @@ class NewsAPIAdapter:
|
|
| 69 |
}
|
| 70 |
)
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
async def search(
|
| 73 |
-
self,
|
| 74 |
query: str,
|
| 75 |
language: str = "en",
|
| 76 |
sort_by: str = "publishedAt",
|
|
@@ -79,51 +90,69 @@ class NewsAPIAdapter:
|
|
| 79 |
) -> List[Dict[str, Any]]:
|
| 80 |
"""
|
| 81 |
Search NewsAPI for the given query.
|
| 82 |
-
|
|
|
|
| 83 |
"""
|
| 84 |
if not self.api_key:
|
| 85 |
logger.warning("NewsAPI unavailable - returning empty results")
|
| 86 |
return []
|
| 87 |
-
|
| 88 |
await self._ensure_client()
|
| 89 |
-
|
| 90 |
max_results = max_results or self.max_results
|
| 91 |
-
|
| 92 |
-
#
|
| 93 |
-
search_q = query
|
| 94 |
words = query.strip().split()
|
| 95 |
-
if len(words)
|
| 96 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
search_q = " AND ".join(f'"{w}"' for w in words[:3])
|
| 98 |
-
|
| 99 |
try:
|
| 100 |
url = f"{self.BASE_URL}/everything"
|
| 101 |
params = {
|
| 102 |
"q": search_q,
|
| 103 |
"language": language,
|
| 104 |
"sortBy": sort_by,
|
| 105 |
-
"pageSize": max_results
|
| 106 |
}
|
| 107 |
if from_date:
|
| 108 |
params["from"] = from_date
|
| 109 |
-
|
| 110 |
logger.info(f"[NewsAPI] Searching: '{search_q}' (lang={language})")
|
| 111 |
-
|
| 112 |
response = await self.client.get(url, params=params)
|
| 113 |
-
|
| 114 |
if response.status_code == 200:
|
| 115 |
data = response.json()
|
| 116 |
if data.get("status") != "ok":
|
| 117 |
logger.warning(f"NewsAPI error: {data.get('message', 'unknown')}")
|
| 118 |
return []
|
| 119 |
-
|
| 120 |
articles = data.get("articles", [])
|
| 121 |
results = []
|
|
|
|
| 122 |
for article in articles:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
normalized = self._normalize_result(article)
|
| 124 |
if normalized:
|
| 125 |
results.append(normalized)
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
logger.info(
|
| 128 |
f"[NewsAPI] '{query[:50]}' → {len(results)} results "
|
| 129 |
f"(total available: {data.get('totalResults', 0)})"
|
|
|
|
| 69 |
}
|
| 70 |
)
|
| 71 |
|
| 72 |
+
# Domains that are NOT news sources — filter these out
|
| 73 |
+
_NON_NEWS_DOMAINS = {
|
| 74 |
+
"pypi.org", "github.com", "stackoverflow.com", "reddit.com",
|
| 75 |
+
"wikipedia.org", "arxiv.org", "researchgate.net", "academia.edu",
|
| 76 |
+
"linkedin.com", "facebook.com", "twitter.com", "x.com",
|
| 77 |
+
"youtube.com", "instagram.com", "tiktok.com",
|
| 78 |
+
"amazon.com", "ebay.com", "etsy.com",
|
| 79 |
+
"plos.org", "pubmed.ncbi.nlm.nih.gov", "springer.com",
|
| 80 |
+
"stemlynsblog.org",
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
async def search(
|
| 84 |
+
self,
|
| 85 |
query: str,
|
| 86 |
language: str = "en",
|
| 87 |
sort_by: str = "publishedAt",
|
|
|
|
| 90 |
) -> List[Dict[str, Any]]:
|
| 91 |
"""
|
| 92 |
Search NewsAPI for the given query.
|
| 93 |
+
Always anchors to Ethiopia/Africa context for single-word queries.
|
| 94 |
+
Filters out non-news domains (pypi, github, academic, social media).
|
| 95 |
"""
|
| 96 |
if not self.api_key:
|
| 97 |
logger.warning("NewsAPI unavailable - returning empty results")
|
| 98 |
return []
|
| 99 |
+
|
| 100 |
await self._ensure_client()
|
|
|
|
| 101 |
max_results = max_results or self.max_results
|
| 102 |
+
|
| 103 |
+
# Build search query — always ensure Ethiopia/Africa context
|
|
|
|
| 104 |
words = query.strip().split()
|
| 105 |
+
if len(words) == 1:
|
| 106 |
+
# Single word: anchor to Ethiopia news explicitly
|
| 107 |
+
search_q = f'"{query}" AND ("Ethiopia" OR "Africa" OR "Horn of Africa")'
|
| 108 |
+
elif len(words) <= 3:
|
| 109 |
+
# Short query: AND all terms
|
| 110 |
+
search_q = " AND ".join(f'"{w}"' for w in words)
|
| 111 |
+
else:
|
| 112 |
+
# Longer query: use first 3 key terms
|
| 113 |
search_q = " AND ".join(f'"{w}"' for w in words[:3])
|
| 114 |
+
|
| 115 |
try:
|
| 116 |
url = f"{self.BASE_URL}/everything"
|
| 117 |
params = {
|
| 118 |
"q": search_q,
|
| 119 |
"language": language,
|
| 120 |
"sortBy": sort_by,
|
| 121 |
+
"pageSize": min(max_results * 2, 100), # Fetch extra to allow filtering
|
| 122 |
}
|
| 123 |
if from_date:
|
| 124 |
params["from"] = from_date
|
| 125 |
+
|
| 126 |
logger.info(f"[NewsAPI] Searching: '{search_q}' (lang={language})")
|
| 127 |
+
|
| 128 |
response = await self.client.get(url, params=params)
|
| 129 |
+
|
| 130 |
if response.status_code == 200:
|
| 131 |
data = response.json()
|
| 132 |
if data.get("status") != "ok":
|
| 133 |
logger.warning(f"NewsAPI error: {data.get('message', 'unknown')}")
|
| 134 |
return []
|
| 135 |
+
|
| 136 |
articles = data.get("articles", [])
|
| 137 |
results = []
|
| 138 |
+
filtered_out = 0
|
| 139 |
for article in articles:
|
| 140 |
+
# Filter non-news domains
|
| 141 |
+
url_str = article.get("url", "")
|
| 142 |
+
domain = self._extract_domain(url_str)
|
| 143 |
+
if any(nd in domain for nd in self._NON_NEWS_DOMAINS):
|
| 144 |
+
filtered_out += 1
|
| 145 |
+
logger.debug(f"[NewsAPI] Filtered non-news: {domain}")
|
| 146 |
+
continue
|
| 147 |
normalized = self._normalize_result(article)
|
| 148 |
if normalized:
|
| 149 |
results.append(normalized)
|
| 150 |
+
if len(results) >= max_results:
|
| 151 |
+
break
|
| 152 |
+
|
| 153 |
+
if filtered_out:
|
| 154 |
+
logger.info(f"[NewsAPI] Filtered {filtered_out} non-news articles")
|
| 155 |
+
|
| 156 |
logger.info(
|
| 157 |
f"[NewsAPI] '{query[:50]}' → {len(results)} results "
|
| 158 |
f"(total available: {data.get('totalResults', 0)})"
|