| """ |
| Query Builder Utility (Phase 20 — Dynamic Round-Robin Query Builder) |
| ===================================================================== |
| |
| PURPOSE |
| ------- |
| When we ask a news API for articles, we send a "query string" — a list of |
| keywords that tells the API what topics we want. Our Phase 19 taxonomy can |
| have up to 28 keywords per category. Stuffing all 28 into one API call would: |
| 1. Be rejected by the API with an HTTP 400 "query too long" error. |
| 2. Return the same broad results every single hour — wasting our paid quota. |
| |
| SOLUTION: The Anchor + Round-Robin Strategy |
| ------------------------------------------- |
| For every category we split the keyword list into two parts: |
| |
| ANCHORS — The first 3 keywords. These are the most important, core terms |
| (e.g. "artificial intelligence", "machine learning", "openai"). |
| They are ALWAYS included in every query, every hour. |
| This guarantees we never miss breaking news on a core topic. |
| |
| ROTATORS — The remaining keywords (e.g. "anthropic", "mistral", "llama"...). |
| These are divided into chunks of 4. |
| Each hour of the day, one chunk is added to the anchors. |
| So over 24 hours, we cycle through all chunks, covering every |
| niche keyword without ever exceeding the URL character limit. |
| |
| CLOCK MATH (Stateless & Restart-Safe) |
| -------------------------------------- |
| chunk_index = datetime.now(UTC).hour % number_of_chunks |
| |
| - Uses UTC so the rotation is identical everywhere — Hugging Face, local, |
| AWS — regardless of which timezone the server is in. |
| - No Redis, no database, no file. Just Python's clock. If the server |
| restarts, the correct chunk for the current hour is immediately selected. |
| |
| SINGLE SOURCE OF TRUTH |
| ----------------------- |
| We IMPORT CATEGORY_KEYWORDS from data_validation.py. We never copy it here. |
| One dict, one place. Phase 21 expansions will automatically be picked up. |
| |
| SUPPORTED API TYPES |
| ------------------- |
| "newsapi" → Multi-word phrases quoted, terms joined with " OR " |
| Example: '"artificial intelligence" OR openai OR llm' |
| "gnews" → All terms joined with a single space |
| Example: 'artificial intelligence openai llm' |
| "newsdata" → All terms joined with a comma |
| Example: 'artificial intelligence,openai,llm' |
| """ |
|
|
| from datetime import datetime, timezone |
| from typing import List |
|
|
| |
| |
| |
| |
| from app.utils.data_validation import CATEGORY_KEYWORDS |
|
|
| |
| _ANCHOR_COUNT = 3 |
| _CHUNK_SIZE = 4 |
|
|
|
|
| def _chunk_list(items: List[str], size: int) -> List[List[str]]: |
| """ |
| Splits a flat list into groups of `size`. |
| |
| Example: |
| _chunk_list(['a','b','c','d','e','f'], 3) |
| β [['a','b','c'], ['d','e','f']] |
| |
| If the list divides unevenly, the last chunk is shorter β that is fine. |
| """ |
| return [items[i : i + size] for i in range(0, len(items), size)] |
|
|
|
|
| def _format_for_api(keywords: List[str], api_type: str) -> str: |
| """ |
| Converts a list of keywords into the query string format a specific API expects. |
| |
| Rules by api_type: |
| "newsapi" β Wrap any keyword that contains a space in double-quotes so |
| the API treats it as an exact phrase. Then join with " OR ". |
| Example output: '"artificial intelligence" OR openai OR llm' |
| |
| "gnews" β Just join everything with spaces. GNews search is tolerant |
| of natural language. |
| Example output: 'artificial intelligence openai llm' |
| |
| "newsdata" β Join with commas. NewsData.io uses comma-separated terms. |
| Example output: 'artificial intelligence,openai,llm' |
| |
| Any unknown api_type falls back to the newsapi format (safest default). |
| """ |
| if not keywords: |
| return "" |
|
|
| if api_type == "newsapi": |
| |
| |
| formatted = [ |
| f'"{kw}"' if ' ' in kw else kw |
| for kw in keywords |
| ] |
| return " OR ".join(formatted) |
|
|
| elif api_type == "gnews": |
| |
| return " ".join(keywords) |
|
|
| elif api_type == "newsdata": |
| |
| return ",".join(keywords) |
|
|
| else: |
| |
| formatted = [f'"{kw}"' if ' ' in kw else kw for kw in keywords] |
| return " OR ".join(formatted) |
|
|
|
|
def build_dynamic_query(category: str, api_type: str = "newsapi") -> str:
    """
    Assemble the query string for `category` from its anchor keywords plus
    the rotating keyword chunk selected by the current UTC hour.

    Args:
        category: key into CATEGORY_KEYWORDS, e.g. "ai", "cloud-aws".
        api_type: one of "newsapi", "gnews", "newsdata"; decides both the
                  anchor/chunk sizes and the output format.

    Returns:
        The formatted query string. If the category is missing from
        CATEGORY_KEYWORDS (or its keyword list is empty), the category
        name itself is returned unchanged.
    """
    keywords = CATEGORY_KEYWORDS.get(category)
    if not keywords:
        # Unknown or empty category: use the raw category name as the query.
        return category

    # "newsdata" uses 2 anchors and chunks of 2; every other API uses the
    # module-level defaults.
    # NOTE(review): presumably NewsData.io has a tighter query length
    # limit - confirm.
    if api_type == "newsdata":
        n_anchors, per_chunk = 2, 2
    else:
        n_anchors, per_chunk = _ANCHOR_COUNT, _CHUNK_SIZE

    always_on = keywords[:n_anchors]
    rotating_groups = _chunk_list(keywords[n_anchors:], per_chunk)

    # Stateless rotation: the UTC hour alone decides which group is active.
    hour_utc = datetime.now(timezone.utc).hour
    this_hour = (
        rotating_groups[hour_utc % len(rotating_groups)]
        if rotating_groups
        else []
    )

    return _format_for_api(always_on + this_hour, api_type)
|
|