princeta3011 committed on
Commit 69a077e · verified · 1 Parent(s): f451b98

Upload 11 files

Files changed (11)
  1. .gitignore +2 -0
  2. app.py +36 -0
  3. chat_agent.py +43 -0
  4. data_extractor.py +118 -0
  5. dom_analyzer.py +162 -0
  6. html_loader.py +64 -0
  7. main.py +186 -0
  8. mongo_storage.py +143 -0
  9. neo4j_storage.py +216 -0
  10. requirements.txt +67 -0
  11. settings.py +57 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ .venv
app.py ADDED
@@ -0,0 +1,36 @@
+ # app.py
+ import gradio as gr
+ import asyncio
+ from agent.chat_agent import SimpleChatAgent
+
+ agent = SimpleChatAgent()
+
+ async def async_chat_fn(message, history):
+     return await agent.handle_query(message, history)
+
+ def chat_fn(message, history):
+     return asyncio.run(async_chat_fn(message, history))
+
+ with gr.Blocks(css="""
+ #title { font-size: 2.2rem; font-weight: bold; text-align: center; margin-bottom: 0.5em; color: #2e3a59; }
+ #desc { font-size: 1.1rem; text-align: center; color: #6c7a92; margin-bottom: 2em; }
+ footer { text-align: center; font-size: 0.9rem; color: #999; margin-top: 2em; }
+ .gradio-container { background-color: #f9fbfc; }
+ """) as demo:
+
+     gr.Markdown("<div id='title'>Chat + Web Scraper Agent</div>")
+     gr.Markdown("<div id='desc'>Ask anything, or tell me to scrape a webpage using your custom agent logic.</div>")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.ChatInterface(
+                 fn=chat_fn,
+                 chatbot=gr.Chatbot(show_copy_button=True),
+                 textbox=gr.Textbox(placeholder="Type your question or paste a URL to scrape...", show_label=False),
+                 title=None,
+                 theme="soft",
+             )
+
+     gr.Markdown("<footer>Built with ❤️ using LLM + Gradio UI</footer>")
+
+ demo.launch()
chat_agent.py ADDED
@@ -0,0 +1,43 @@
+ # chat_agent.py
+ import os
+ import re
+ from openai import OpenAI
+ from main import WebScrapingOrchestrator
+
+ class SimpleChatAgent:
+     def __init__(self):
+         self.client = OpenAI(
+             base_url="https://api.studio.nebius.com/v1/",
+             api_key=os.environ.get("NEBIUS_API_KEY"),
+         )
+         self.model = "meta-llama/Meta-Llama-3.1-70B-Instruct"
+         self.orchestrator = WebScrapingOrchestrator()
+
+     async def handle_query(self, user_input, history):
+         # Web scraping check
+         url_match = re.search(r"(https?://[^\s]+)", user_input)
+         if "scrape" in user_input.lower() and url_match:
+             url = url_match.group(1)
+             result = await self.orchestrator.process_url(url)
+             if "error" in result:
+                 return f"❌ Error scraping {url}: {result['error']}"
+             return (
+                 f"✅ Scraped Data from {result['title']}:\n"
+                 f"- Topics: {', '.join(result['llm_ready_data']['main_topics'])}\n"
+                 f"- Summary: {result['llm_ready_data']['text_summary'][:500]}..."
+             )
+
+         # Build full chat history
+         messages = []
+         for user_msg, bot_msg in history:
+             messages.append({"role": "user", "content": user_msg})
+             messages.append({"role": "assistant", "content": bot_msg})
+         messages.append({"role": "user", "content": user_input})
+
+         # Call Nebius LLM
+         response = self.client.chat.completions.create(
+             model=self.model,
+             messages=messages,
+             temperature=0.6,
+         )
+         return response.choices[0].message.content
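A minimal way to exercise the agent outside the Gradio UI (a sketch, assuming NEBIUS_API_KEY is exported and the package layout these modules import from is in place):

```python
# sketch: call SimpleChatAgent directly, bypassing the Gradio UI
import asyncio
from chat_agent import SimpleChatAgent  # adjust the import path to your layout

async def demo():
    agent = SimpleChatAgent()
    # plain question with an empty (user, assistant) pair history
    print(await agent.handle_query("What is web scraping?", []))
    # a message containing "scrape" plus a URL is routed to the orchestrator
    print(await agent.handle_query("Please scrape https://example.com", []))

asyncio.run(demo())
```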
data_extractor.py ADDED
@@ -0,0 +1,118 @@
+ from bs4 import BeautifulSoup, Comment
+ from typing import Dict, List, Optional
+ import re
+ from urllib.parse import urljoin, urlparse
+ from config.settings import settings
+
+ class DataExtractor:
+     def __init__(self):
+         self.config = settings.extraction
+
+     def extract_structured_data(self, html: str, url: str) -> Dict:
+         """Extract structured data from HTML for LLM consumption"""
+         soup = BeautifulSoup(html, 'lxml')
+
+         # Remove unwanted elements
+         self._clean_html(soup)
+
+         return {
+             "content": self._extract_content(soup),
+             "metadata": self._extract_metadata(soup, url),
+             "structure": self._extract_structure(soup),
+             "links": self._extract_links(soup, url),
+             "images": self._extract_images(soup, url),
+             "text_summary": self._extract_text_summary(soup)
+         }
+
+     def _clean_html(self, soup: BeautifulSoup):
+         """Remove unwanted elements for cleaner extraction"""
+         for selector in self.config.ignore_selectors:
+             for element in soup.select(selector):
+                 element.decompose()
+
+         # Remove HTML comments (scripts/styles are handled by ignore_selectors)
+         for element in soup(string=lambda text: isinstance(text, Comment)):
+             element.extract()
+
+     def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
+         """Extract main content blocks"""
+         content_blocks = []
+
+         for selector in self.config.content_selectors:
+             elements = soup.select(selector)
+             for elem in elements:
+                 text = elem.get_text(strip=True)
+                 if len(text) >= self.config.min_text_length:
+                     content_blocks.append({
+                         "tag": elem.name,
+                         "text": text,
+                         "html": str(elem),
+                         "attributes": dict(elem.attrs) if elem.attrs else {}
+                     })
+
+         return content_blocks
+
+     def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
+         """Extract page metadata"""
+         title = soup.find('title')
+         meta_desc = soup.find('meta', attrs={'name': 'description'})
+
+         return {
+             "title": title.get_text().strip() if title else "",
+             "description": meta_desc.get('content', '') if meta_desc else "",
+             "url": url,
+             "domain": urlparse(url).netloc,
+             "headings": self._extract_headings(soup)
+         }
+
+     def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
+         """Extract heading hierarchy for structure"""
+         headings = []
+         for i in range(1, 7):
+             for heading in soup.find_all(f'h{i}'):
+                 headings.append({
+                     "level": i,
+                     "text": heading.get_text().strip(),
+                     "id": heading.get('id', '')
+                 })
+         return headings
+
+     def _extract_structure(self, soup: BeautifulSoup) -> Dict:
+         """Extract DOM structure for relationships"""
+         return {
+             "sections": len(soup.find_all(['section', 'article', 'div'])),
+             "paragraphs": len(soup.find_all('p')),
+             "lists": len(soup.find_all(['ul', 'ol'])),
+             "tables": len(soup.find_all('table')),
+             "forms": len(soup.find_all('form'))
+         }
+
+     def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
+         """Extract all links for relationship mapping"""
+         links = []
+         for link in soup.find_all('a', href=True):
+             href = urljoin(base_url, link['href'])
+             links.append({
+                 "url": href,
+                 "text": link.get_text().strip(),
+                 "internal": urlparse(href).netloc == urlparse(base_url).netloc
+             })
+         return links[:50]  # Limit for performance
+
+     def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
+         """Extract images with context"""
+         images = []
+         for img in soup.find_all('img', src=True):
+             images.append({
+                 "src": urljoin(base_url, img['src']),
+                 "alt": img.get('alt', ''),
+                 "caption": img.get('title', '')
+             })
+         return images[:20]  # Limit for performance
+
+     def _extract_text_summary(self, soup: BeautifulSoup) -> str:
+         """Extract clean text for LLM processing"""
+         text = soup.get_text()
+         # Clean whitespace and normalize
+         text = re.sub(r'\s+', ' ', text).strip()
+         return text[:5000]  # Limit for token efficiency
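A small sanity check for the extractor (a sketch; the import path assumes the `config` package these modules reference is importable):

```python
# sketch: run DataExtractor on an inline HTML snippet
from data_extractor import DataExtractor  # adjust the import path to your layout

html = """
<html><head><title>Demo page</title></head>
<body><article><h1>Heading</h1>
<p>A paragraph that is long enough to clear the min_text_length threshold
configured in ExtractionConfig, so it is kept as a content block.</p>
</article></body></html>
"""

data = DataExtractor().extract_structured_data(html, "https://example.com/demo")
print(data["metadata"]["title"])    # "Demo page"
print(len(data["content"]))         # content blocks that passed the filter
print(data["text_summary"][:80])    # whitespace-normalized text
```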
dom_analyzer.py ADDED
@@ -0,0 +1,162 @@
+ from bs4 import BeautifulSoup
+ from typing import Dict, List
+ import hashlib
+
+ class DOMAnalyzer:
+     def __init__(self):
+         pass
+
+     def analyze_structure(self, html: str) -> Dict:
+         """Analyze DOM structure and create tree representation"""
+         soup = BeautifulSoup(html, 'lxml')
+
+         return {
+             "tree": self._build_dom_tree(soup.body if soup.body else soup),
+             "statistics": self._get_dom_statistics(soup),
+             "semantic_structure": self._analyze_semantic_structure(soup),
+             "content_blocks": self._identify_content_blocks(soup)
+         }
+
+     def _build_dom_tree(self, element, depth=0, max_depth=5) -> Dict:
+         """Build hierarchical DOM tree structure"""
+         if depth > max_depth or not element or not hasattr(element, 'name'):
+             return {}
+
+         node = {
+             "tag": element.name if element.name else "text",
+             "id": element.get('id', ''),
+             "classes": element.get('class', []),
+             "text_content": element.get_text()[:100] if element.get_text() else "",
+             "children": [],
+             "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
+             "depth": depth,
+             "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8]
+         }
+
+         # Add children (limit to prevent huge trees)
+         if hasattr(element, 'children') and depth < max_depth:
+             child_count = 0
+             for child in element.children:
+                 if child_count >= 10:  # Limit children per node
+                     break
+                 if hasattr(child, 'name') and child.name:
+                     child_node = self._build_dom_tree(child, depth + 1, max_depth)
+                     if child_node:
+                         node["children"].append(child_node)
+                         child_count += 1
+
+         return node
+
+     def _get_dom_statistics(self, soup: BeautifulSoup) -> Dict:
+         """Get DOM statistics for analysis"""
+         all_tags = soup.find_all()
+         tag_counts = {}
+
+         for tag in all_tags:
+             tag_name = tag.name
+             tag_counts[tag_name] = tag_counts.get(tag_name, 0) + 1
+
+         return {
+             "total_elements": len(all_tags),
+             "tag_distribution": tag_counts,
+             "max_depth": self._calculate_max_depth(soup),
+             "text_content_ratio": self._calculate_text_ratio(soup)
+         }
+
+     def _analyze_semantic_structure(self, soup: BeautifulSoup) -> Dict:
+         """Analyze semantic HTML structure"""
+         semantic_tags = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer']
+         semantic_elements = {}
+
+         for tag in semantic_tags:
+             elements = soup.find_all(tag)
+             semantic_elements[tag] = len(elements)
+
+         return {
+             "semantic_elements": semantic_elements,
+             "has_semantic_structure": sum(semantic_elements.values()) > 0,
+             "content_hierarchy": self._analyze_heading_hierarchy(soup)
+         }
+
+     def _identify_content_blocks(self, soup: BeautifulSoup) -> List[Dict]:
+         """Identify main content blocks for LLM processing"""
+         content_blocks = []
+
+         # Look for common content containers
+         selectors = ['article', 'main', '.content', '#content', '.post', '.entry']
+
+         for selector in selectors:
+             elements = soup.select(selector)
+             for elem in elements:
+                 if elem.get_text(strip=True):
+                     content_blocks.append({
+                         "selector": selector,
+                         "tag": elem.name,
+                         "text_length": len(elem.get_text()),
+                         "element_id": elem.get('id', ''),
+                         "classes": elem.get('class', []),
+                         "priority": self._calculate_content_priority(elem)
+                     })
+
+         return sorted(content_blocks, key=lambda x: x['priority'], reverse=True)[:5]
+
+     def _calculate_max_depth(self, soup: BeautifulSoup) -> int:
+         """Calculate maximum DOM depth"""
+         def get_depth(element, current_depth=0):
+             if not hasattr(element, 'children'):
+                 return current_depth
+
+             max_child_depth = current_depth
+             for child in element.children:
+                 if hasattr(child, 'name') and child.name:
+                     depth = get_depth(child, current_depth + 1)
+                     max_child_depth = max(max_child_depth, depth)
+
+             return max_child_depth
+
+         return get_depth(soup)
+
+     def _calculate_text_ratio(self, soup: BeautifulSoup) -> float:
+         """Calculate ratio of text content to HTML tags"""
+         text_length = len(soup.get_text())
+         html_length = len(str(soup))
+         return text_length / html_length if html_length > 0 else 0
+
+     def _analyze_heading_hierarchy(self, soup: BeautifulSoup) -> List[Dict]:
+         """Analyze heading structure for content organization"""
+         headings = []
+         for i in range(1, 7):
+             for heading in soup.find_all(f'h{i}'):
+                 headings.append({
+                     "level": i,
+                     "text": heading.get_text().strip(),
+                     "position": len(headings)
+                 })
+         return headings
+
+     def _calculate_content_priority(self, element) -> int:
+         """Calculate priority score for content blocks"""
+         score = 0
+         text_length = len(element.get_text())
+
+         # Text length scoring
+         score += min(text_length // 100, 10)
+
+         # Semantic tag bonus
+         if element.name in ['article', 'main']:
+             score += 5
+         elif element.name in ['section', 'div']:
+             score += 2
+
+         # Class/ID based scoring
+         classes = element.get('class', [])
+         element_id = element.get('id', '')
+
+         content_indicators = ['content', 'article', 'post', 'main', 'body']
+         for indicator in content_indicators:
+             if any(indicator in str(c).lower() for c in classes):
+                 score += 3
+             if indicator in element_id.lower():
+                 score += 3
+
+         return score
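A quick look at what the analyzer returns (a sketch with a toy snippet):

```python
# sketch: inspect DOMAnalyzer output for a tiny document
from dom_analyzer import DOMAnalyzer  # adjust the import path to your layout

html = "<html><body><main><h1>Title</h1><p>Some text</p></main></body></html>"
analysis = DOMAnalyzer().analyze_structure(html)

print(analysis["statistics"]["total_elements"])                      # tag count
print(analysis["statistics"]["max_depth"])                           # nesting depth
print(analysis["semantic_structure"]["semantic_elements"]["main"])   # 1
```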
html_loader.py ADDED
@@ -0,0 +1,64 @@
+ import asyncio
+ from playwright.async_api import async_playwright
+ from typing import Dict, Optional
+ import time
+ from config.settings import settings
+
+ class HTMLLoader:
+     def __init__(self):
+         self.browser = None
+         self.context = None
+
+     async def __aenter__(self):
+         self.playwright = await async_playwright().start()
+         self.browser = await self.playwright.chromium.launch(
+             headless=settings.scraping.headless
+         )
+         self.context = await self.browser.new_context(
+             user_agent=settings.scraping.user_agent
+         )
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         if self.context:
+             await self.context.close()
+         if self.browser:
+             await self.browser.close()
+         if self.playwright:
+             await self.playwright.stop()
+
+     async def load_page(self, url: str) -> Dict[str, str]:
+         """Load HTML content from URL handling both static and dynamic sites"""
+         for attempt in range(settings.scraping.max_retries):
+             try:
+                 page = await self.context.new_page()
+                 await page.goto(url, timeout=settings.scraping.timeout)
+
+                 # Wait for body to load
+                 await page.wait_for_selector(
+                     settings.scraping.wait_for_selector,
+                     timeout=10000
+                 )
+
+                 # Additional wait for dynamic content
+                 await page.wait_for_timeout(2000)
+
+                 html_content = await page.content()
+                 title = await page.title()
+                 url_final = page.url
+
+                 await page.close()
+
+                 return {
+                     "html": html_content,
+                     "title": title,
+                     "url": url_final,
+                     "timestamp": int(time.time())
+                 }
+
+             except Exception as e:
+                 if attempt == settings.scraping.max_retries - 1:
+                     raise Exception(f"Failed to load {url}: {str(e)}")
+                 await asyncio.sleep(settings.scraping.delay_between_requests)
+
+         return None
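Fetching a single page with the loader (a sketch; Playwright's Chromium build must already be installed, e.g. via `playwright install chromium`):

```python
# sketch: load one page through HTMLLoader's async context manager
import asyncio
from html_loader import HTMLLoader  # adjust the import path to your layout

async def fetch(url: str):
    async with HTMLLoader() as loader:
        page = await loader.load_page(url)
    print(page["title"], len(page["html"]), page["timestamp"])

asyncio.run(fetch("https://example.com"))
```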
main.py ADDED
@@ -0,0 +1,186 @@
+ import asyncio
+ from typing import Dict, Optional, List
+ from scraper.html_loader import HTMLLoader
+ from scraper.data_extractor import DataExtractor
+ from scraper.dom_analyzer import DOMAnalyzer
+ from storage.mongo_storage import MongoStorage
+ # from storage.neo4j_storage import Neo4jStorage
+ from config.settings import settings
+
+ class WebScrapingOrchestrator:
+     def __init__(self):
+         self.data_extractor = DataExtractor()
+         self.dom_analyzer = DOMAnalyzer()
+         self.mongo_storage = MongoStorage()
+         # self.neo4j_storage = Neo4jStorage()
+
+     async def process_url(self, url: str) -> Dict:
+         """Complete pipeline to process a URL for LLM consumption"""
+         try:
+             print(f"Processing URL: {url}")
+
+             # Step 1: Load HTML content
+             async with HTMLLoader() as loader:
+                 html_data = await loader.load_page(url)
+
+             if not html_data:
+                 return {"error": "Failed to load page"}
+
+             print("✓ HTML loaded successfully")
+
+             # Step 2: Extract structured data
+             extracted_data = self.data_extractor.extract_structured_data(
+                 html_data["html"],
+                 html_data["url"]
+             )
+
+             print("✓ Data extracted successfully")
+
+             # Step 3: Analyze DOM structure
+             dom_structure = self.dom_analyzer.analyze_structure(html_data["html"])
+
+             print("✓ DOM structure analyzed")
+
+             # Step 4: Store in MongoDB
+             mongo_id = self.mongo_storage.store_page_data(
+                 html_data["url"],
+                 extracted_data,
+                 dom_structure
+             )
+
+             print("✓ Data stored in MongoDB")
+
+             # Step 5: Store relationships in Neo4j (currently disabled)
+             # self.neo4j_storage.store_relationships(
+             #     html_data["url"],
+             #     extracted_data,
+             #     dom_structure
+             # )
+
+             # print("✓ Relationships stored in Neo4j")
+
+             # Return LLM-ready summary
+             return {
+                 "success": True,
+                 "url": html_data["url"],
+                 "title": html_data["title"],
+                 "mongo_id": mongo_id,
+                 "summary": {
+                     "content_blocks": len(extracted_data["content"]),
+                     "text_length": len(extracted_data["text_summary"]),
+                     "links_found": len(extracted_data["links"]),
+                     "images_found": len(extracted_data["images"]),
+                     "dom_depth": dom_structure["statistics"]["max_depth"],
+                     "content_type": self._identify_content_type(extracted_data)
+                 },
+                 "llm_ready_data": {
+                     "text_summary": extracted_data["text_summary"],
+                     "key_headings": [h["text"] for h in extracted_data["metadata"]["headings"][:5]],
+                     "main_topics": self._extract_main_topics(extracted_data),
+                     "study_hints": self._generate_study_hints(extracted_data, dom_structure)
+                 }
+             }
+
+         except Exception as e:
+             print(f"✗ Error processing {url}: {str(e)}")
+             return {"error": str(e), "url": url}
+
+     def agent_for_chat(self):
+         pass
+
+     def get_page_for_llm(self, url: str) -> Optional[Dict]:
+         """Retrieve page data optimized for LLM consumption"""
+         # Get from MongoDB
+         mongo_data = self.mongo_storage.get_page_data(url)
+         if not mongo_data:
+             return None
+
+         # Get relationships from Neo4j (currently disabled)
+         # neo4j_data = self.neo4j_storage.get_page_relationships(url)
+
+         # Combine for LLM
+         return {
+             "content": mongo_data["content"]["text_summary"],
+             "title": mongo_data["title"],
+             "headings": [h["text"] for h in mongo_data["content"]["headings"]],
+             "structure": mongo_data["study_metadata"],
+             "relationships": {
+                 "related_pages": mongo_data.get("internal_links", [])[:5],
+                 "external_references": mongo_data.get("external_links", [])[:3]
+             },
+             "study_metadata": mongo_data["study_metadata"]
+         }
+
+     def search_for_llm(self, query: str, limit: int = 5) -> List[Dict]:
+         """Search content for LLM context"""
+         results = self.mongo_storage.search_pages(query, limit)
+
+         llm_ready_results = []
+         for result in results:
+             llm_ready_results.append({
+                 "url": result["url"],
+                 "title": result["title"],
+                 "summary": result["content"]["text_summary"][:500],
+                 "content_type": result["study_metadata"]["content_type"],
+                 "complexity": result["study_metadata"]["complexity_score"],
+                 "key_topics": result["study_metadata"]["key_topics"][:5]
+             })
+
+         return llm_ready_results
+
+     def _identify_content_type(self, data: Dict) -> str:
+         """Identify content type for processing hints"""
+         title = data["metadata"]["title"].lower()
+         text = data["text_summary"].lower()
+
+         if any(word in title for word in ["tutorial", "guide", "how to"]):
+             return "tutorial"
+         elif any(word in title for word in ["documentation", "docs", "api"]):
+             return "documentation"
+         elif any(word in title for word in ["blog", "article", "news"]):
+             return "article"
+         elif any(word in text for word in ["research", "study", "analysis"]):
+             return "research"
+         return "general"
+
+     def _extract_main_topics(self, data: Dict) -> List[str]:
+         """Extract main topics for LLM understanding"""
+         topics = set()
+
+         # From title
+         title_words = [word for word in data["metadata"]["title"].split() if len(word) > 3]
+         topics.update(title_words[:3])
+
+         # From headings
+         for heading in data["metadata"]["headings"][:3]:
+             heading_words = [word for word in heading["text"].split() if len(word) > 3]
+             topics.update(heading_words[:2])
+
+         return list(topics)[:5]
+
+     def _generate_study_hints(self, extracted_data: Dict, dom_structure: Dict) -> Dict:
+         """Generate study hints for LLM processing"""
+         return {
+             "difficulty_level": "beginner" if len(extracted_data["text_summary"]) < 2000 else "intermediate",
+             "estimated_study_time": f"{len(extracted_data['text_summary'].split()) // 250} minutes",
+             "content_structure": "well_structured" if len(extracted_data["metadata"]["headings"]) > 3 else "basic",
+             "has_examples": "code" in extracted_data["text_summary"].lower(),
+             "interactive_elements": dom_structure["statistics"]["tag_distribution"].get("form", 0) > 0
+         }
+
+     def close_connections(self):
+         """Close all database connections"""
+         # self.neo4j_storage.close()
+
+ # Main execution function
+ async def main():
+     orchestrator = WebScrapingOrchestrator()
+
+     # Example usage
+     test_url = "https://en.wikipedia.org/wiki/Virat_Kohli"
+     result = await orchestrator.process_url(test_url)
+     print(f"Processing result: {result}")
+
+     # Clean up
+     orchestrator.close_connections()
+
+ if __name__ == "__main__":
+     asyncio.run(main())
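Beyond the built-in `main()` example, the stored data can be pulled back for LLM context (a sketch, assuming MongoDB is reachable and the final URL equals the requested one):

```python
# sketch: scrape a page, then retrieve and search the stored data
import asyncio
from main import WebScrapingOrchestrator

async def demo():
    orchestrator = WebScrapingOrchestrator()
    url = "https://en.wikipedia.org/wiki/Virat_Kohli"
    await orchestrator.process_url(url)

    page = orchestrator.get_page_for_llm(url)            # keyed by the stored URL
    print(page["title"], page["structure"]["content_type"])

    hits = orchestrator.search_for_llm("cricket", limit=3)  # example query string
    print([hit["title"] for hit in hits])

    orchestrator.close_connections()

asyncio.run(demo())
```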
mongo_storage.py ADDED
@@ -0,0 +1,143 @@
+ from pymongo import MongoClient
+ from typing import Dict, List, Optional
+ import datetime
+ from config.settings import settings
+
+ class MongoStorage:
+     def __init__(self):
+         self.client = MongoClient(settings.database.mongo_uri)
+         self.db = self.client[settings.database.mongo_db]
+         self.collection = self.db.scraped_pages
+         self._create_indexes()
+
+     def _create_indexes(self):
+         """Create indexes for better query performance"""
+         self.collection.create_index("url", unique=True)
+         self.collection.create_index("domain")
+         self.collection.create_index("timestamp")
+         self.collection.create_index("content.metadata.title")
+
+     def store_page_data(self, url: str, extracted_data: Dict, dom_structure: Dict) -> str:
+         """Store complete page data optimized for LLM consumption"""
+         document = {
+             "url": url,
+             "domain": extracted_data["metadata"]["domain"],
+             "timestamp": datetime.datetime.utcnow(),
+             "title": extracted_data["metadata"]["title"],
+             "description": extracted_data["metadata"]["description"],
+
+             # LLM-optimized content structure
+             "content": {
+                 "text_summary": extracted_data["text_summary"],
+                 "content_blocks": extracted_data["content"],
+                 "headings": extracted_data["metadata"]["headings"],
+                 "structure_info": extracted_data["structure"]
+             },
+
+             # Relationship data
+             "relationships": {
+                 "internal_links": [link for link in extracted_data["links"] if link["internal"]],
+                 "external_links": [link for link in extracted_data["links"] if not link["internal"]],
+                 "images": extracted_data["images"]
+             },
+
+             # DOM analysis for advanced processing
+             "dom_analysis": {
+                 "tree_structure": dom_structure["tree"],
+                 "statistics": dom_structure["statistics"],
+                 "semantic_structure": dom_structure["semantic_structure"],
+                 "content_blocks": dom_structure["content_blocks"]
+             },
+
+             # Study-friendly metadata
+             "study_metadata": {
+                 "reading_time": self._estimate_reading_time(extracted_data["text_summary"]),
+                 "complexity_score": self._calculate_complexity_score(extracted_data),
+                 "content_type": self._identify_content_type(extracted_data),
+                 "key_topics": self._extract_key_topics(extracted_data)
+             }
+         }
+
+         # Upsert document
+         result = self.collection.replace_one(
+             {"url": url},
+             document,
+             upsert=True
+         )
+
+         return str(result.upserted_id or result.matched_count)
+
+     def get_page_data(self, url: str) -> Optional[Dict]:
+         """Retrieve page data by URL"""
+         return self.collection.find_one({"url": url})
+
+     def get_pages_by_domain(self, domain: str) -> List[Dict]:
+         """Get all pages from a specific domain"""
+         return list(self.collection.find({"domain": domain}))
+
+     def search_pages(self, query: str, limit: int = 10) -> List[Dict]:
+         """Search pages by content for LLM queries"""
+         search_filter = {
+             "$or": [
+                 {"title": {"$regex": query, "$options": "i"}},
+                 {"description": {"$regex": query, "$options": "i"}},
+                 {"content.text_summary": {"$regex": query, "$options": "i"}}
+             ]
+         }
+
+         return list(self.collection.find(search_filter).limit(limit))
+
+     def _estimate_reading_time(self, text: str) -> int:
+         """Estimate reading time in minutes (250 words per minute)"""
+         word_count = len(text.split())
+         return max(1, word_count // 250)
+
+     def _calculate_complexity_score(self, data: Dict) -> float:
+         """Calculate content complexity for LLM processing hints"""
+         score = 0.0
+
+         # Text length factor
+         text_length = len(data["text_summary"])
+         score += min(text_length / 1000, 5.0)
+
+         # Structure complexity
+         content_blocks = len(data["content"])
+         score += min(content_blocks / 10, 3.0)
+
+         # Link density
+         total_links = len(data["links"])
+         score += min(total_links / 20, 2.0)
+
+         return round(score, 2)
+
+     def _identify_content_type(self, data: Dict) -> str:
+         """Identify content type for LLM processing strategy"""
+         title = data["metadata"]["title"].lower()
+         text = data["text_summary"].lower()
+
+         if any(word in title or word in text for word in ["tutorial", "guide", "how to"]):
+             return "tutorial"
+         elif any(word in title or word in text for word in ["news", "article", "report"]):
+             return "article"
+         elif any(word in title or word in text for word in ["documentation", "docs", "reference"]):
+             return "documentation"
+         elif any(word in title or word in text for word in ["blog", "post", "opinion"]):
+             return "blog_post"
+         else:
+             return "general"
+
+     def _extract_key_topics(self, data: Dict) -> List[str]:
+         """Extract key topics for study organization"""
+         # Simple keyword extraction from headings and title
+         topics = set()
+
+         # From title
+         title_words = data["metadata"]["title"].split()
+         topics.update([word.lower() for word in title_words if len(word) > 3])
+
+         # From headings
+         for heading in data["metadata"]["headings"]:
+             heading_words = heading["text"].split()
+             topics.update([word.lower() for word in heading_words if len(word) > 3])
+
+         return list(topics)[:10]  # Limit to top 10 topics
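Querying the collection directly through this class (a sketch, assuming `mongo_uri` and `mongo_db` are set as settings.py expects):

```python
# sketch: regex search over stored pages via MongoStorage
from mongo_storage import MongoStorage  # adjust the import path to your layout

store = MongoStorage()
for page in store.search_pages("tutorial", limit=3):
    meta = page["study_metadata"]
    print(page["title"], meta["content_type"], f'{meta["reading_time"]} min')
```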
neo4j_storage.py ADDED
@@ -0,0 +1,216 @@
+ # from neo4j import GraphDatabase
+ # from typing import Dict, List
+ # from urllib.parse import urlparse
+ # from config.settings import settings
+
+ # class Neo4jStorage:
+ #     def __init__(self):
+ #         self.driver = GraphDatabase.driver(
+ #             settings.database.neo4j_uri,
+ #             auth=(settings.database.neo4j_user, settings.database.neo4j_password)
+ #         )
+ #         self._create_constraints()
+
+ #     def _create_constraints(self):
+ #         """Create constraints and indexes for better performance"""
+ #         with self.driver.session() as session:
+ #             try:
+ #                 session.run("CREATE CONSTRAINT page_url IF NOT EXISTS FOR (p:Page) REQUIRE p.url IS UNIQUE")
+ #                 session.run("CREATE CONSTRAINT domain_name IF NOT EXISTS FOR (d:Domain) REQUIRE d.name IS UNIQUE")
+ #                 session.run("CREATE INDEX page_title IF NOT EXISTS FOR (p:Page) ON (p.title)")
+ #             except Exception as e:
+ #                 pass  # Constraints might already exist
+
+ #     def store_relationships(self, url: str, extracted_data: Dict, dom_structure: Dict):
+ #         """Store page relationships and structure in Neo4j"""
+ #         with self.driver.session() as session:
+ #             # Create main page node
+ #             self._create_page_node(session, url, extracted_data)
+
+ #             # Create domain relationships
+ #             self._create_domain_relationships(session, url, extracted_data)
+
+ #             # Create content relationships
+ #             self._create_content_relationships(session, url, extracted_data)
+
+ #             # Create link relationships
+ #             self._create_link_relationships(session, url, extracted_data["links"])
+
+ #             # Create DOM structure relationships
+ #             self._create_dom_relationships(session, url, dom_structure)
+
+ #     def _create_page_node(self, session, url: str, data: Dict):
+ #         """Create or update page node with LLM-friendly properties"""
+ #         query = """
+ #         MERGE (p:Page {url: $url})
+ #         SET p.title = $title,
+ #             p.description = $description,
+ #             p.domain = $domain,
+ #             p.content_type = $content_type,
+ #             p.complexity_score = $complexity_score,
+ #             p.reading_time = $reading_time,
+ #             p.word_count = $word_count,
+ #             p.last_scraped = datetime()
+ #         """
+
+ #         session.run(query, {
+ #             "url": url,
+ #             "title": data["metadata"]["title"],
+ #             "description": data["metadata"]["description"],
+ #             "domain": data["metadata"]["domain"],
+ #             "content_type": self._identify_content_type(data),
+ #             "complexity_score": self._calculate_complexity_score(data),
+ #             "reading_time": len(data["text_summary"].split()) // 250,
+ #             "word_count": len(data["text_summary"].split())
+ #         })
+
+ #     def _create_domain_relationships(self, session, url: str, data: Dict):
+ #         """Create domain nodes and relationships"""
+ #         domain = data["metadata"]["domain"]
+
+ #         # Create domain node
+ #         session.run("""
+ #             MERGE (d:Domain {name: $domain})
+ #             SET d.last_updated = datetime()
+ #         """, {"domain": domain})
+
+ #         # Link page to domain
+ #         session.run("""
+ #             MATCH (p:Page {url: $url})
+ #             MATCH (d:Domain {name: $domain})
+ #             MERGE (p)-[:BELONGS_TO]->(d)
+ #         """, {"url": url, "domain": domain})
+
+ #     def _create_content_relationships(self, session, url: str, data: Dict):
+ #         """Create content structure relationships for LLM understanding"""
+ #         # Create topic nodes from headings
+ #         for i, heading in enumerate(data["metadata"]["headings"]):
+ #             session.run("""
+ #                 MATCH (p:Page {url: $url})
+ #                 MERGE (h:Heading {text: $text, level: $level, page_url: $url})
+ #                 SET h.position = $position
+ #                 MERGE (p)-[:HAS_HEADING]->(h)
+ #             """, {
+ #                 "url": url,
+ #                 "text": heading["text"],
+ #                 "level": heading["level"],
+ #                 "position": i
+ #             })
+
+ #         # Create content block relationships
+ #         for i, block in enumerate(data["content"][:10]):  # Limit for performance
+ #             session.run("""
+ #                 MATCH (p:Page {url: $url})
+ #                 MERGE (c:ContentBlock {text: $text, page_url: $url, position: $position})
+ #                 SET c.tag = $tag,
+ #                     c.length = $length
+ #                 MERGE (p)-[:HAS_CONTENT]->(c)
+ #             """, {
+ #                 "url": url,
+ #                 "text": block["text"][:500],  # Truncate for storage
+ #                 "tag": block["tag"],
+ #                 "length": len(block["text"]),
+ #                 "position": i
+ #             })
+
+ #     def _create_link_relationships(self, session, url: str, links: List[Dict]):
+ #         """Create link relationships for navigation understanding"""
+ #         for link in links[:20]:  # Limit for performance
+ #             target_url = link["url"]
+ #             link_text = link["text"]
+ #             is_internal = link["internal"]
+
+ #             # Create target page node (minimal)
+ #             session.run("""
+ #                 MERGE (target:Page {url: $target_url})
+ #                 SET target.discovered_via = $source_url
+ #             """, {"target_url": target_url, "source_url": url})
+
+ #             # Create relationship
+ #             relationship_type = "LINKS_TO_INTERNAL" if is_internal else "LINKS_TO_EXTERNAL"
+ #             session.run(f"""
+ #                 MATCH (source:Page {{url: $source_url}})
+ #                 MATCH (target:Page {{url: $target_url}})
+ #                 MERGE (source)-[r:{relationship_type}]->(target)
+ #                 SET r.link_text = $link_text,
+ #                     r.is_internal = $is_internal
+ #             """, {
+ #                 "source_url": url,
+ #                 "target_url": target_url,
+ #                 "link_text": link_text,
+ #                 "is_internal": is_internal
+ #             })
+
+ #     def _create_dom_relationships(self, session, url: str, dom_structure: Dict):
+ #         """Create DOM structure relationships for content hierarchy"""
+ #         # Create semantic structure nodes
+ #         semantic_elements = dom_structure["semantic_structure"]["semantic_elements"]
+ #         for tag, count in semantic_elements.items():
+ #             if count > 0:
+ #                 session.run("""
+ #                     MATCH (p:Page {url: $url})
+ #                     MERGE (s:SemanticElement {tag: $tag, page_url: $url})
+ #                     SET s.count = $count
+ #                     MERGE (p)-[:HAS_SEMANTIC_ELEMENT]->(s)
+ #                 """, {"url": url, "tag": tag, "count": count})
+
+ #     def get_page_relationships(self, url: str) -> Dict:
+ #         """Get all relationships for a page for LLM context"""
+ #         with self.driver.session() as session:
+ #             result = session.run("""
+ #                 MATCH (p:Page {url: $url})
+ #                 OPTIONAL MATCH (p)-[:LINKS_TO_INTERNAL]->(internal:Page)
+ #                 OPTIONAL MATCH (p)-[:LINKS_TO_EXTERNAL]->(external:Page)
+ #                 OPTIONAL MATCH (p)-[:HAS_HEADING]->(h:Heading)
+ #                 RETURN p, collect(DISTINCT internal.url) as internal_links,
+ #                        collect(DISTINCT external.url) as external_links,
+ #                        collect(DISTINCT {text: h.text, level: h.level}) as headings
+ #             """, {"url": url})
+
+ #             record = result.single()
+ #             if record:
+ #                 return {
+ #                     "page": dict(record["p"]),
+ #                     "internal_links": record["internal_links"],
+ #                     "external_links": record["external_links"],
+ #                     "headings": record["headings"]
+ #                 }
+ #             return {}
+
+ #     def get_related_pages(self, url: str, limit: int = 5) -> List[Dict]:
+ #         """Find related pages for LLM context and study suggestions"""
+ #         with self.driver.session() as session:
+ #             result = session.run("""
+ #                 MATCH (p:Page {url: $url})
+ #                 MATCH (p)-[:BELONGS_TO]->(d:Domain)
+ #                 MATCH (related:Page)-[:BELONGS_TO]->(d)
+ #                 WHERE related.url <> $url
+ #                 RETURN related.url as url, related.title as title,
+ #                        related.content_type as content_type,
+ #                        related.complexity_score as complexity_score
+ #                 ORDER BY related.complexity_score DESC
+ #                 LIMIT $limit
+ #             """, {"url": url, "limit": limit})
+
+ #             return [dict(record) for record in result]
+
+ #     def _identify_content_type(self, data: Dict) -> str:
+ #         """Identify content type for graph relationships"""
+ #         title = data["metadata"]["title"].lower()
+ #         if "tutorial" in title or "guide" in title:
+ #             return "tutorial"
+ #         elif "documentation" in title or "docs" in title:
+ #             return "documentation"
+ #         elif "blog" in title or "article" in title:
+ #             return "article"
+ #         return "general"
+
+ #     def _calculate_complexity_score(self, data: Dict) -> float:
+ #         """Calculate complexity score for relationship weighting"""
+ #         text_length = len(data["text_summary"])
+ #         content_blocks = len(data["content"])
+ #         return min(text_length / 1000 + content_blocks / 10, 10.0)
+
+ #     def close(self):
+ #         """Close database connection"""
+ #         self.driver.close()
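The whole module is committed commented out. If it is re-enabled later, a connectivity check like the one below helps before wiring it back into the orchestrator (a sketch; note the `neo4j` driver is not pinned in requirements.txt):

```python
# sketch: verify Neo4j credentials before uncommenting Neo4jStorage
from neo4j import GraphDatabase
from config.settings import settings

driver = GraphDatabase.driver(
    settings.database.neo4j_uri,
    auth=(settings.database.neo4j_user, settings.database.neo4j_password),
)
driver.verify_connectivity()  # raises if the URI or credentials are wrong
driver.close()
```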
requirements.txt ADDED
@@ -0,0 +1,67 @@
+ aiofiles==24.1.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ beautifulsoup4==4.13.4
+ bs4==0.0.2
+ certifi==2025.4.26
+ charset-normalizer==3.4.2
+ click==8.2.1
+ colorama==0.4.6
+ distro==1.9.0
+ dnspython==2.7.0
+ dotenv==0.9.9
+ fastapi==0.115.12
+ ffmpy==0.6.0
+ filelock==3.18.0
+ fsspec==2025.5.1
+ gradio==5.33.0
+ gradio_client==1.10.2
+ greenlet==3.2.3
+ groovy==0.1.2
+ h11==0.16.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.32.4
+ idna==3.10
+ Jinja2==3.1.6
+ jiter==0.10.0
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ numpy==2.3.0
+ openai==1.84.0
+ orjson==3.10.18
+ packaging==25.0
+ pandas==2.3.0
+ pillow==11.2.1
+ playwright==1.52.0
+ pydantic==2.11.5
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ pyee==13.0.0
+ Pygments==2.19.1
+ pymongo==4.13.0
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.0
+ python-multipart==0.0.20
+ pytz==2025.2
+ PyYAML==6.0.2
+ requests==2.32.3
+ rich==14.0.0
+ ruff==0.11.13
+ safehttpx==0.1.6
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ soupsieve==2.7
+ starlette==0.46.2
+ tomlkit==0.13.3
+ tqdm==4.67.1
+ typer==0.16.0
+ typing-inspection==0.4.1
+ typing_extensions==4.14.0
+ tzdata==2025.2
+ urllib3==2.4.0
+ uvicorn==0.34.3
+ websockets==15.0.1
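A likely setup sequence for this pinned dependency set (a sketch; Playwright's browser binaries are installed separately from the wheel):

```python
# sketch: install the pinned requirements and the Chromium build Playwright needs
import subprocess
import sys

subprocess.run([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"], check=True)
subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True)
```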
settings.py ADDED
@@ -0,0 +1,57 @@
+ import os
+ from pydantic import BaseModel
+ from typing import Dict, List
+ from dotenv import load_dotenv
+
+
+ load_dotenv()
+
+ class DatabaseConfig(BaseModel):
+     mongo_uri: str = os.getenv("mongo_uri")
+     mongo_db: str = os.getenv("mongo_db")
+     neo4j_uri: str = os.getenv("neo4j_uri")
+     neo4j_user: str = os.getenv("neo4j_user")
+     neo4j_password: str = os.getenv("neo4j_password")
+
+ class ScrapingConfig(BaseModel):
+     timeout: int = 30000
+     wait_for_selector: str = "body"
+     headless: bool = True
+     user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+     max_retries: int = 3
+     delay_between_requests: float = 1.0
+
+ class ExtractionConfig(BaseModel):
+     content_selectors: List[str] = [
+         "article", "main", ".content", "#content",
+         ".post", ".article-body", "p", "h1", "h2", "h3"
+     ]
+     ignore_selectors: List[str] = [
+         "script", "style", "nav", "footer", "header",
+         ".advertisement", ".ads", ".sidebar"
+     ]
+     min_text_length: int = 50
+     extract_images: bool = True
+     extract_links: bool = True
+
+ class Settings:
+     def __init__(self):
+         self.database = DatabaseConfig()
+         self.scraping = ScrapingConfig()
+         self.extraction = ExtractionConfig()
+
+     def update_from_env(self):
+         # Update from environment variables if available
+         if os.getenv("mongo_uri"):
+             self.database.mongo_uri = os.getenv("mongo_uri")
+         if os.getenv("mongo_db"):
+             self.database.mongo_db = os.getenv("mongo_db")
+         if os.getenv("neo4j_uri"):
+             self.database.neo4j_uri = os.getenv("neo4j_uri")
+         if os.getenv("neo4j_user"):
+             self.database.neo4j_user = os.getenv("neo4j_user")
+         if os.getenv("neo4j_password"):
+             self.database.neo4j_password = os.getenv("neo4j_password")
+
+ settings = Settings()
+ settings.update_from_env()
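DatabaseConfig reads lowercase variable names, and chat_agent.py additionally expects NEBIUS_API_KEY, so a matching `.env` (git-ignored by the first file in this commit) would look roughly like the placeholder values below:

```
mongo_uri=mongodb://localhost:27017
mongo_db=web_scraper
neo4j_uri=bolt://localhost:7687
neo4j_user=neo4j
neo4j_password=changeme
NEBIUS_API_KEY=your-nebius-api-key
```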