Peterase committed on
Commit
54dfb7e
·
1 Parent(s): 4d63907

fix: NewsAPI query precision + non-news domain filtering

Browse files

NewsAPI (newsapi_adapter.py):
- Single-word queries now anchored: 'Ethiopia' → 'Ethiopia' AND ('Ethiopia' OR 'Africa' OR 'Horn of Africa')
- Fetch 2x results then filter, return max_results clean articles
- Block non-news domains at source: pypi.org, github, arxiv, plos, stemlynsblog, etc.
- Log count of filtered non-news articles

Ranker (hybrid_result_ranker.py):
- Add _NON_NEWS_DOMAINS blocklist to ranker as second filter
- Filter before deduplication so reranker never sees pypi/github/academic results
- Log count of filtered results

src/core/ranking/hybrid_result_ranker.py CHANGED
@@ -60,6 +60,14 @@ class HybridResultRanker:
60
  "waltainfo.com": 0.7,
61
  }
62
 
 
 
 
 
 
 
 
 
63
  def __init__(self, reranker):
64
  """
65
  Initialize hybrid result ranker.
@@ -101,6 +109,18 @@ class HybridResultRanker:
101
  f"(temporal_boost={'ON' if is_temporal else 'OFF'})"
102
  )
103
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  unique_results = self._deduplicate(all_results)
105
  logger.info(f"After deduplication: {len(unique_results)} unique results")
106
 
 
60
  "waltainfo.com": 0.7,
61
  }
62
 
63
+ # Non-news domains to filter out before reranking
64
+ _NON_NEWS_DOMAINS = {
65
+ "pypi.org", "github.com", "stackoverflow.com", "reddit.com",
66
+ "wikipedia.org", "arxiv.org", "researchgate.net", "academia.edu",
67
+ "plos.org", "pubmed.ncbi.nlm.nih.gov", "springer.com",
68
+ "stemlynsblog.org", "linkedin.com", "youtube.com",
69
+ }
70
+
71
  def __init__(self, reranker):
72
  """
73
  Initialize hybrid result ranker.
 
109
  f"(temporal_boost={'ON' if is_temporal else 'OFF'})"
110
  )
111
 
112
+ # Filter non-news domains from live results
113
+ before = len(all_results)
114
+ all_results = [
115
+ r for r in all_results
116
+ if not any(
117
+ nd in (r.get("url") or r.get("metadata", {}).get("url") or "").lower()
118
+ for nd in self._NON_NEWS_DOMAINS
119
+ )
120
+ ]
121
+ if len(all_results) < before:
122
+ logger.info(f"Filtered {before - len(all_results)} non-news domain results")
123
+
124
  unique_results = self._deduplicate(all_results)
125
  logger.info(f"After deduplication: {len(unique_results)} unique results")
126
 
src/infrastructure/adapters/newsapi_adapter.py CHANGED
@@ -69,8 +69,19 @@ class NewsAPIAdapter:
69
  }
70
  )
71
 
 
 
 
 
 
 
 
 
 
 
 
72
  async def search(
73
- self,
74
  query: str,
75
  language: str = "en",
76
  sort_by: str = "publishedAt",
@@ -79,51 +90,69 @@ class NewsAPIAdapter:
79
  ) -> List[Dict[str, Any]]:
80
  """
81
  Search NewsAPI for the given query.
82
- Automatically wraps multi-word queries in quotes for exact matching.
 
83
  """
84
  if not self.api_key:
85
  logger.warning("NewsAPI unavailable - returning empty results")
86
  return []
87
-
88
  await self._ensure_client()
89
-
90
  max_results = max_results or self.max_results
91
-
92
- # Wrap in quotes if multi-word and not already quoted — improves precision
93
- search_q = query
94
  words = query.strip().split()
95
- if len(words) > 1 and not query.startswith('"'):
96
- # Use AND logic: all key terms must appear
 
 
 
 
 
 
97
  search_q = " AND ".join(f'"{w}"' for w in words[:3])
98
-
99
  try:
100
  url = f"{self.BASE_URL}/everything"
101
  params = {
102
  "q": search_q,
103
  "language": language,
104
  "sortBy": sort_by,
105
- "pageSize": max_results
106
  }
107
  if from_date:
108
  params["from"] = from_date
109
-
110
  logger.info(f"[NewsAPI] Searching: '{search_q}' (lang={language})")
111
-
112
  response = await self.client.get(url, params=params)
113
-
114
  if response.status_code == 200:
115
  data = response.json()
116
  if data.get("status") != "ok":
117
  logger.warning(f"NewsAPI error: {data.get('message', 'unknown')}")
118
  return []
119
-
120
  articles = data.get("articles", [])
121
  results = []
 
122
  for article in articles:
 
 
 
 
 
 
 
123
  normalized = self._normalize_result(article)
124
  if normalized:
125
  results.append(normalized)
126
-
 
 
 
 
 
127
  logger.info(
128
  f"[NewsAPI] '{query[:50]}' → {len(results)} results "
129
  f"(total available: {data.get('totalResults', 0)})"
 
69
  }
70
  )
71
 
72
+ # Domains that are NOT news sources — filter these out
73
+ _NON_NEWS_DOMAINS = {
74
+ "pypi.org", "github.com", "stackoverflow.com", "reddit.com",
75
+ "wikipedia.org", "arxiv.org", "researchgate.net", "academia.edu",
76
+ "linkedin.com", "facebook.com", "twitter.com", "x.com",
77
+ "youtube.com", "instagram.com", "tiktok.com",
78
+ "amazon.com", "ebay.com", "etsy.com",
79
+ "plos.org", "pubmed.ncbi.nlm.nih.gov", "springer.com",
80
+ "stemlynsblog.org",
81
+ }
82
+
83
  async def search(
84
+ self,
85
  query: str,
86
  language: str = "en",
87
  sort_by: str = "publishedAt",
 
90
  ) -> List[Dict[str, Any]]:
91
  """
92
  Search NewsAPI for the given query.
93
+ Always anchors to Ethiopia/Africa context for single-word queries.
94
+ Filters out non-news domains (pypi, github, academic, social media).
95
  """
96
  if not self.api_key:
97
  logger.warning("NewsAPI unavailable - returning empty results")
98
  return []
99
+
100
  await self._ensure_client()
 
101
  max_results = max_results or self.max_results
102
+
103
+ # Build search query — always ensure Ethiopia/Africa context
 
104
  words = query.strip().split()
105
+ if len(words) == 1:
106
+ # Single word: anchor to Ethiopia news explicitly
107
+ search_q = f'"{query}" AND ("Ethiopia" OR "Africa" OR "Horn of Africa")'
108
+ elif len(words) <= 3:
109
+ # Short query: AND all terms
110
+ search_q = " AND ".join(f'"{w}"' for w in words)
111
+ else:
112
+ # Longer query: use first 3 key terms
113
  search_q = " AND ".join(f'"{w}"' for w in words[:3])
114
+
115
  try:
116
  url = f"{self.BASE_URL}/everything"
117
  params = {
118
  "q": search_q,
119
  "language": language,
120
  "sortBy": sort_by,
121
+ "pageSize": min(max_results * 2, 100), # Fetch extra to allow filtering
122
  }
123
  if from_date:
124
  params["from"] = from_date
125
+
126
  logger.info(f"[NewsAPI] Searching: '{search_q}' (lang={language})")
127
+
128
  response = await self.client.get(url, params=params)
129
+
130
  if response.status_code == 200:
131
  data = response.json()
132
  if data.get("status") != "ok":
133
  logger.warning(f"NewsAPI error: {data.get('message', 'unknown')}")
134
  return []
135
+
136
  articles = data.get("articles", [])
137
  results = []
138
+ filtered_out = 0
139
  for article in articles:
140
+ # Filter non-news domains
141
+ url_str = article.get("url", "")
142
+ domain = self._extract_domain(url_str)
143
+ if any(nd in domain for nd in self._NON_NEWS_DOMAINS):
144
+ filtered_out += 1
145
+ logger.debug(f"[NewsAPI] Filtered non-news: {domain}")
146
+ continue
147
  normalized = self._normalize_result(article)
148
  if normalized:
149
  results.append(normalized)
150
+ if len(results) >= max_results:
151
+ break
152
+
153
+ if filtered_out:
154
+ logger.info(f"[NewsAPI] Filtered {filtered_out} non-news articles")
155
+
156
  logger.info(
157
  f"[NewsAPI] '{query[:50]}' → {len(results)} results "
158
  f"(total available: {data.get('totalResults', 0)})"