scraper refinements
- components/fetchers/scraper.py +123 -22
- components/generators/daily_feed.py +65 -129
- pipeline/news_ingest.py +4 -1
components/fetchers/scraper.py
CHANGED
@@ -3,6 +3,12 @@ import trafilatura
 from newspaper import Article
 from typing import Optional
 from bs4 import BeautifulSoup
+import logging
+import re  # For regex in clean_text
+# from tenacity import retry, wait_exponential, stop_after_attempt  # If you want to add retries
+
+# Configure logging at the beginning of your script or module
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 HEADERS = {
     "User-Agent": (
@@ -13,50 +19,145 @@ HEADERS = {
 }
 
 def clean_text(text: str) -> str:
+    """
+    Cleans extracted text by removing HTML tags, normalizing whitespace,
+    and optionally removing common non-content patterns.
+    """
+    if not text:
+        return ""
+
     soup = BeautifulSoup(text, "html.parser")
+
+    # Add double newlines after paragraphs to preserve some structure
+    for p in soup.find_all('p'):
+        p.append('\n\n')
+
     cleaned = soup.get_text(separator=" ", strip=True)
+
+    # Normalize all whitespace characters to single spaces, then strip leading/trailing
+    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+
+    # Optional: Remove common non-content patterns (e.g., "Image: ...", "Photo by ...")
+    # This might be too aggressive for some articles, test carefully
+    # cleaned = re.sub(r'(?:Image|Photo) by [^\n]*\n*', '', cleaned, flags=re.IGNORECASE)
+    # cleaned = re.sub(r'\[\s*\d+\s*[/\\-]\s*\d+\s*\]', '', cleaned)  # e.g., [1/5], [2-3]
+
     return cleaned
 
 def is_low_quality(text: str) -> bool:
+    """
+    Detect navigation garbage, footers, or low-word-count dumps.
+    Uses an expanded list of junk markers and word count checks.
+    """
+    if not text:
+        logging.debug("Text is empty, considered low quality.")
         return True
+
+    words = text.split()
+    if len(words) < 150:  # Increased minimum word count slightly for better content
+        logging.debug(f"Text has only {len(words)} words, considered low quality (min 150).")
+        return True
+
+    # Expanded list of common junk phrases/markers
     junk_markers = [
+        "subscribe to our newsletter", "cookie policy", "terms and conditions",
+        "privacy statement", "all rights reserved", "contact us", "about us",
+        "careers", "sitemap", "advertisement", "sponsored content",
+        "read more", "view all", "back to top", "connect with us",
+        "follow us on", "email us", "download our app", "footer",
+        "comments policy", "disclaimer", "affiliate links", "related posts",
+        "latest updates", "breaking news", "trending topics", "more news",
+        "featured stories", "sign up", "login", "register", "join us",
+        "newsletter signup", "skip to content", "navigation", "main menu",
+        "sidebar", "archive", "categories", "tags", "go to top", "licence",
+        "unlimited access", "support us", "exclusive content", "follow @",
+        "copyright", "imprint", "impressum", "legal notice"
     ]
-    return any(marker in text.lower() for marker in junk_markers)
 
+    low_quality_score = 0
+    lower_text = text.lower()
+
+    for marker in junk_markers:
+        if marker in lower_text:
+            low_quality_score += 1
+
+    # Heuristic: if a significant portion of the text appears to be junk markers,
+    # or if too many different markers are present
+    if low_quality_score >= 4:  # If 4 or more distinct markers are found
+        logging.debug(f"Detected {low_quality_score} junk markers, considered low quality.")
+        return True
+
+    # More advanced heuristic for very short lines, indicating lists/tables/boilerplate
+    lines = text.split('\n')
+    if len(lines) > 15:  # Only apply if there are enough lines to make sense
+        short_lines_count = sum(1 for line in lines if 0 < len(line.split()) < 7)  # Lines with 1-6 words
+        if short_lines_count / len(lines) > 0.4:  # If more than 40% of lines are very short
+            logging.debug(f"Detected {short_lines_count}/{len(lines)} ({short_lines_count/len(lines):.1%}) short lines, potential low quality.")
+            return True
+
+    return False
+
+# @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
+# def _make_request_with_retry(url, timeout, headers):
+#     """Helper for retries if tenacity is enabled."""
+#     response = requests.get(url, timeout=timeout, headers=headers)
+#     response.raise_for_status()
+#     return response
+
+def scrape_url(url: str, timeout: int = 15) -> Optional[str]:  # Increased default timeout
+    """
+    Scrapes content from a given URL using Trafilatura and falls back to Newspaper3k.
+    Includes robust error handling and quality checks.
+    """
+    logging.info(f"Attempting to scrape: {url}")
+
     # Try Trafilatura first
     try:
+        # Use _make_request_with_retry if retries are enabled
         response = requests.get(url, timeout=timeout, headers=HEADERS)
+        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+
+        # Handle encoding more robustly
+        try:
+            html = response.content.decode(response.apparent_encoding)
+        except UnicodeDecodeError:
+            html = response.content.decode('utf-8', errors='ignore')  # Fallback to UTF-8 with ignore
+
+        extracted = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
+
+        if extracted:
+            text = clean_text(extracted)
+            if not is_low_quality(text):
+                logging.info(f"Successfully extracted content using Trafilatura for: {url}")
+                return text
+            else:
+                logging.warning(f"Trafilatura: Content identified as low quality for {url}.")
+        else:
+            logging.info(f"Trafilatura returned no main content for: {url}. Trying fallback.")
+
+    except requests.exceptions.RequestException as req_err:
+        logging.error(f"Trafilatura (Requests) failed for {url}: {req_err}")
     except Exception as e:
+        logging.error(f"Trafilatura (Extraction/Processing) failed for {url}: {e}", exc_info=False)  # Keep exc_info=False for less verbose logging unless deep debug is needed
 
     # Fallback to newspaper3k
     try:
-        article = Article(url)
+        article = Article(url, headers=HEADERS, keep_article_html=False)  # Pass headers, no need for raw HTML
         article.download()
         article.parse()
         if article.text:
             text = clean_text(article.text)
             if not is_low_quality(text):
+                logging.info(f"Successfully extracted content using Newspaper3k for: {url}")
                 return text
             else:
+                logging.warning(f"Newspaper3k: Content identified as low quality for {url}.")
+        else:
+            logging.info(f"Newspaper3k returned no main content for: {url}.")
+    except requests.exceptions.RequestException as req_err:
+        logging.error(f"Newspaper3k (Requests) failed for {url}: {req_err}")
     except Exception as e:
+        logging.error(f"Newspaper3k (Parsing/Processing) failed for {url}: {e}", exc_info=False)
 
+    logging.error(f"Failed to extract quality content from: {url} using both methods.")
+    return None
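For quick manual checks, a minimal usage sketch of the refined helpers is below. It assumes the repository root is on PYTHONPATH so components.fetchers.scraper imports cleanly; the URL is a placeholder, not a tested fixture.

# Minimal sketch, not part of the change itself.
from components.fetchers.scraper import scrape_url, is_low_quality

# A short junk-only snippet trips the 150-word minimum in is_low_quality.
print(is_low_quality("Subscribe to our newsletter. Cookie policy. Sign up. Login."))  # True

# scrape_url tries Trafilatura first, falls back to Newspaper3k, and returns None on failure.
text = scrape_url("https://example.com/some-article", timeout=15)
if text:
    print(f"Extracted {len(text.split())} words")
else:
    print("No usable content extracted by either extractor.")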
components/generators/daily_feed.py
CHANGED
@@ -5,179 +5,115 @@ import numpy as np
 from typing import List, Dict
 from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
-from llama_index.core import StorageContext
 from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
 
 # Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
+REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 # ✅ Redis client
+redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
 
+# 📰 Topics
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
 
+# 🧠 Prompt for summarization
 BASE_PROMPT = (
     "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
+    "Return up to 3 punchy headlines, each under 20 words, and include why the story matters as the second half of the line."
 )
 
+# 📥 Load documents by topic and collect references
+def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
     topic_docs = {key: [] for key in TOPIC_KEYS}
     try:
         vector_store = get_upstash_vector_store()
+        for full_topic, topic_key in zip(TOPICS, TOPIC_KEYS):
+            filters = MetadataFilters(
+                filters=[MetadataFilter(key="topic", value=topic_key, operator=FilterOperator.EQ)]
+            )
+            dummy_vector = np.random.rand(384).tolist()
+            query = VectorStoreQuery(query_embedding=dummy_vector, similarity_top_k=50, filters=filters)
+            result = vector_store.query(query)
+            for node in result.nodes:
+                content = node.get_content().strip()
+                ref_id = node.node_id or node.id_ or ""
+                if content and ref_id:
+                    topic_docs[topic_key].append({"text": content, "ref": ref_id})
     except Exception as e:
+        print("❌ [load_docs_by_topic_with_refs Error]", e)
     return topic_docs
 
+# 🧪 Summarize topic with reference IDs
+def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[Dict]:
     if not docs:
+        print(f"⚠️ No docs for topic: {topic_key}")
         return []
 
     try:
+        content = "\n\n---\n\n".join([d["text"] for d in docs])[:12000]
         client = OpenAI(api_key=OPENAI_API_KEY)
+        response = client.chat.completions.create(
+            model="gpt-4",
             messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
             ],
+            max_tokens=512,
+            temperature=0.7,
         )
+        headlines = response.choices[0].message.content.strip().splitlines()
+        result = []
+        for i, line in enumerate(headlines):
+            clean_line = line.strip("-–• ")
+            if clean_line:
+                ref_id = docs[i]["ref"] if i < len(docs) else ""
+                result.append({
+                    "summary": f"{start_index + i}. {clean_line}",
+                    "ref": ref_id,
+                    "image_url": "https://source.unsplash.com/800x600/?news",
+                    "article_link": f"https://google.com/search?q={topic_key}+news"
                 })
+        return result
     except Exception as e:
+        print(f"❌ [Summarize topic '{topic_key}' Error]", e)
         return []
 
+# Generate and cache full feed
 def generate_and_cache_daily_feed():
+    print("Starting daily feed generation with OpenAI...")
+    docs_by_topic = load_docs_by_topic_with_refs()
+    all_feed = []
+    counter = 1
+    for topic, topic_key in zip(TOPICS, TOPIC_KEYS):
         try:
+            summaries = summarize_topic(topic_key, docs_by_topic[topic_key], start_index=counter)
+            counter += len(summaries)
+            all_feed.append({"topic": topic, "feed": summaries})
         except Exception as e:
+            print(f"❌ [Feed generation error for {topic_key}]", e)
+            all_feed.append({"topic": topic, "feed": []})
 
+    try:
+        redis_client.set("daily_news_feed_cache", json.dumps(all_feed, ensure_ascii=False))
+        redis_client.expire("daily_news_feed_cache", 86400)
+        print("✅ Cached final feed.")
     except Exception as e:
+        print("❌ [Redis caching error]", e)
+
+    return all_feed
 
+# Fetch from cache
 def get_cached_daily_feed():
     try:
+        data = redis_client.get("daily_news_feed_cache")
+        return json.loads(data) if data else []
     except Exception as e:
+        print("❌ [Cache fetch error]", e)
         return []
 
 if __name__ == "__main__":
+    feed = generate_and_cache_daily_feed()
+    print(json.dumps(feed, indent=2, ensure_ascii=False))
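For reference, generate_and_cache_daily_feed() writes one entry per topic to the Redis key "daily_news_feed_cache"; a sketch of the payload shape is below (field values are illustrative, not real output).

# Illustrative shape of the cached payload (values are made up).
example_feed = [
    {
        "topic": "India news",
        "feed": [
            {
                "summary": "1. Example headline under 20 words, plus why it matters",
                "ref": "vector-store-node-id",
                "image_url": "https://source.unsplash.com/800x600/?news",
                "article_link": "https://google.com/search?q=india+news",
            }
        ],
    },
    # ...one dict per entry in TOPICS
]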
pipeline/news_ingest.py
CHANGED
@@ -58,6 +58,7 @@ async def main():
     print("Fetching news URLs from Google...")
 
     all_articles = []
+    counter = 1  # ✅ Initialize global counter
 
     for query in QUERIES:
         print(f"Searching for: {query}")
@@ -76,13 +77,15 @@ async def main():
             article_text = scrape_url(url)
 
             if article_text:
+                numbered_title = f"{counter}. {title}"  # ✅ Add headline number
                 all_articles.append({
                     "topic": query,
-                    "title": title,
+                    "title": numbered_title,
                     "url": url,
                     "source": source,
                     "content": article_text
                 })
+                counter += 1
             else:
                 print(f"⚠️ Skipped: {url}")
 