Spaces: Sleeping
Upload 11 files
- .gitignore +2 -0
- app.py +36 -0
- chat_agent.py +43 -0
- data_extractor.py +118 -0
- dom_analyzer.py +162 -0
- html_loader.py +64 -0
- main.py +186 -0
- mongo_storage.py +143 -0
- neo4j_storage.py +216 -0
- requirements.txt +67 -0
- settings.py +57 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
.env
.venv
app.py
ADDED
@@ -0,0 +1,36 @@
# app.py
import gradio as gr
import asyncio
from agent.chat_agent import SimpleChatAgent

agent = SimpleChatAgent()

async def async_chat_fn(message, history):
    return await agent.handle_query(message, history)

def chat_fn(message, history):
    return asyncio.run(async_chat_fn(message, history))

with gr.Blocks(css="""
#title { font-size: 2.2rem; font-weight: bold; text-align: center; margin-bottom: 0.5em; color: #2e3a59; }
#desc { font-size: 1.1rem; text-align: center; color: #6c7a92; margin-bottom: 2em; }
footer { text-align: center; font-size: 0.9rem; color: #999; margin-top: 2em; }
.gradio-container { background-color: #f9fbfc; }
""") as demo:

    gr.Markdown("<div id='title'>Chat + Web Scraper Agent</div>")
    gr.Markdown("<div id='desc'>Ask anything, or tell me to scrape a webpage with the custom agent logic.</div>")

    with gr.Row():
        with gr.Column(scale=1):
            gr.ChatInterface(
                fn=chat_fn,
                chatbot=gr.Chatbot(show_copy_button=True),
                textbox=gr.Textbox(placeholder="Type your question or paste a URL to scrape...", show_label=False),
                title=None,
                theme="soft",
            )

    gr.Markdown("<footer>Built with ❤️ using LLM + Gradio UI</footer>")

demo.launch()
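The asyncio.run wrapper in chat_fn works because Gradio runs synchronous callbacks in a worker thread that has no event loop of its own. Gradio also accepts coroutine functions directly, so an equivalent entry point could skip the wrapper entirely; the sketch below is not part of this upload and assumes the same SimpleChatAgent import.

import gradio as gr
from agent.chat_agent import SimpleChatAgent

agent = SimpleChatAgent()

async def chat_fn(message, history):
    # Gradio awaits async callbacks itself, so no asyncio.run wrapper is needed
    return await agent.handle_query(message, history)

gr.ChatInterface(fn=chat_fn, theme="soft").launch()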
chat_agent.py
ADDED
@@ -0,0 +1,43 @@
# chat_agent.py
import os
import re
from openai import OpenAI
from main import WebScrapingOrchestrator

class SimpleChatAgent:
    def __init__(self):
        self.client = OpenAI(
            base_url="https://api.studio.nebius.com/v1/",
            api_key=os.environ.get("NEBIUS_API_KEY"),
        )
        self.model = "meta-llama/Meta-Llama-3.1-70B-Instruct"
        self.orchestrator = WebScrapingOrchestrator()

    async def handle_query(self, user_input, history):
        # Web scraping check
        url_match = re.search(r"(https?://[^\s]+)", user_input)
        if "scrape" in user_input.lower() and url_match:
            url = url_match.group(1)
            result = await self.orchestrator.process_url(url)
            if "error" in result:
                return f"❌ Error scraping {url}: {result['error']}"
            return (
                f"✅ Scraped Data from {result['title']}:\n"
                f"- Topics: {', '.join(result['llm_ready_data']['main_topics'])}\n"
                f"- Summary: {result['llm_ready_data']['text_summary'][:500]}..."
            )

        # Build full chat history
        messages = []
        for user_msg, bot_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": user_input})

        # Call Nebius LLM
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.6,
        )
        return response.choices[0].message.content
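A minimal way to exercise SimpleChatAgent outside the Gradio UI (a sketch, not part of this upload; it assumes NEBIUS_API_KEY is set and that the MongoDB instance from config.settings is reachable, since WebScrapingOrchestrator opens that connection on construction):

import asyncio
from agent.chat_agent import SimpleChatAgent

async def demo():
    agent = SimpleChatAgent()
    # The word "scrape" plus a URL routes the query to the orchestrator
    print(await agent.handle_query("scrape https://example.com", history=[]))
    # Anything else goes straight to the Nebius-hosted Llama model
    print(await agent.handle_query("Summarize what a web scraper does.", history=[]))

asyncio.run(demo())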
data_extractor.py
ADDED
@@ -0,0 +1,118 @@
from bs4 import BeautifulSoup, Comment
from typing import Dict, List, Optional
import re
from urllib.parse import urljoin, urlparse
from config.settings import settings

class DataExtractor:
    def __init__(self):
        self.config = settings.extraction

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption"""
        soup = BeautifulSoup(html, 'lxml')

        # Remove unwanted elements
        self._clean_html(soup)

        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url),
            "structure": self._extract_structure(soup),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup)
        }

    def _clean_html(self, soup: BeautifulSoup):
        """Remove unwanted elements for cleaner extraction"""
        for selector in self.config.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()

        # Remove HTML comments (scripts/styles are already dropped via ignore_selectors);
        # `string=` replaces the deprecated `text=` argument in recent BeautifulSoup
        for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
            element.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks"""
        content_blocks = []

        for selector in self.config.content_selectors:
            elements = soup.select(selector)
            for elem in elements:
                text = elem.get_text(strip=True)
                if len(text) >= self.config.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {}
                    })

        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata"""
        title = soup.find('title')
        meta_desc = soup.find('meta', attrs={'name': 'description'})

        return {
            "title": title.get_text().strip() if title else "",
            "description": meta_desc.get('content', '') if meta_desc else "",
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup)
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract heading hierarchy for structure"""
        headings = []
        for i in range(1, 7):
            for heading in soup.find_all(f'h{i}'):
                headings.append({
                    "level": i,
                    "text": heading.get_text().strip(),
                    "id": heading.get('id', '')
                })
        return headings

    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
        """Extract DOM structure for relationships"""
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": len(soup.find_all('form'))
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract all links for relationship mapping"""
        links = []
        for link in soup.find_all('a', href=True):
            href = urljoin(base_url, link['href'])
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                "internal": urlparse(href).netloc == urlparse(base_url).netloc
            })
        return links[:50]  # Limit for performance

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract images with context"""
        images = []
        for img in soup.find_all('img', src=True):
            images.append({
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                "caption": img.get('title', '')
            })
        return images[:20]  # Limit for performance

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Extract clean text for LLM processing"""
        text = soup.get_text()
        # Clean whitespace and normalize
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:5000]  # Limit for token efficiency
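Because DataExtractor only depends on the selector lists in settings.extraction, it can be exercised on a static HTML string without launching a browser. A sketch (not part of this upload; it assumes lxml is installed, since BeautifulSoup is invoked with the 'lxml' parser):

from scraper.data_extractor import DataExtractor

html = """
<html><head><title>Demo page</title>
<meta name="description" content="A tiny fixture for the extractor"></head>
<body><article><h1>Heading</h1>
<p>A paragraph long enough to clear the fifty character min_text_length threshold used by the extractor.</p>
</article></body></html>
"""

data = DataExtractor().extract_structured_data(html, "https://example.com/demo")
print(data["metadata"]["title"])        # Demo page
print(data["metadata"]["description"])  # A tiny fixture for the extractor
print(len(data["content"]), "content blocks above the length threshold")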
dom_analyzer.py
ADDED
@@ -0,0 +1,162 @@
from bs4 import BeautifulSoup
from typing import Dict, List
import hashlib

class DOMAnalyzer:
    def __init__(self):
        pass

    def analyze_structure(self, html: str) -> Dict:
        """Analyze DOM structure and create tree representation"""
        soup = BeautifulSoup(html, 'lxml')

        return {
            "tree": self._build_dom_tree(soup.body if soup.body else soup),
            "statistics": self._get_dom_statistics(soup),
            "semantic_structure": self._analyze_semantic_structure(soup),
            "content_blocks": self._identify_content_blocks(soup)
        }

    def _build_dom_tree(self, element, depth=0, max_depth=5) -> Dict:
        """Build hierarchical DOM tree structure"""
        if depth > max_depth or not element or not hasattr(element, 'name'):
            return {}

        node = {
            "tag": element.name if element.name else "text",
            "id": element.get('id', ''),
            "classes": element.get('class', []),
            "text_content": element.get_text()[:100] if element.get_text() else "",
            "children": [],
            "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
            "depth": depth,
            "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8]
        }

        # Add children (limit to prevent huge trees)
        if hasattr(element, 'children') and depth < max_depth:
            child_count = 0
            for child in element.children:
                if child_count >= 10:  # Limit children per node
                    break
                if hasattr(child, 'name') and child.name:
                    child_node = self._build_dom_tree(child, depth + 1, max_depth)
                    if child_node:
                        node["children"].append(child_node)
                        child_count += 1

        return node

    def _get_dom_statistics(self, soup: BeautifulSoup) -> Dict:
        """Get DOM statistics for analysis"""
        all_tags = soup.find_all()
        tag_counts = {}

        for tag in all_tags:
            tag_name = tag.name
            tag_counts[tag_name] = tag_counts.get(tag_name, 0) + 1

        return {
            "total_elements": len(all_tags),
            "tag_distribution": tag_counts,
            "max_depth": self._calculate_max_depth(soup),
            "text_content_ratio": self._calculate_text_ratio(soup)
        }

    def _analyze_semantic_structure(self, soup: BeautifulSoup) -> Dict:
        """Analyze semantic HTML structure"""
        semantic_tags = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer']
        semantic_elements = {}

        for tag in semantic_tags:
            elements = soup.find_all(tag)
            semantic_elements[tag] = len(elements)

        return {
            "semantic_elements": semantic_elements,
            "has_semantic_structure": sum(semantic_elements.values()) > 0,
            "content_hierarchy": self._analyze_heading_hierarchy(soup)
        }

    def _identify_content_blocks(self, soup: BeautifulSoup) -> List[Dict]:
        """Identify main content blocks for LLM processing"""
        content_blocks = []

        # Look for common content containers
        selectors = ['article', 'main', '.content', '#content', '.post', '.entry']

        for selector in selectors:
            elements = soup.select(selector)
            for elem in elements:
                if elem.get_text(strip=True):
                    content_blocks.append({
                        "selector": selector,
                        "tag": elem.name,
                        "text_length": len(elem.get_text()),
                        "element_id": elem.get('id', ''),
                        "classes": elem.get('class', []),
                        "priority": self._calculate_content_priority(elem)
                    })

        return sorted(content_blocks, key=lambda x: x['priority'], reverse=True)[:5]

    def _calculate_max_depth(self, soup: BeautifulSoup) -> int:
        """Calculate maximum DOM depth"""
        def get_depth(element, current_depth=0):
            if not hasattr(element, 'children'):
                return current_depth

            max_child_depth = current_depth
            for child in element.children:
                if hasattr(child, 'name') and child.name:
                    depth = get_depth(child, current_depth + 1)
                    max_child_depth = max(max_child_depth, depth)

            return max_child_depth

        return get_depth(soup)

    def _calculate_text_ratio(self, soup: BeautifulSoup) -> float:
        """Calculate ratio of text content to HTML tags"""
        text_length = len(soup.get_text())
        html_length = len(str(soup))
        return text_length / html_length if html_length > 0 else 0

    def _analyze_heading_hierarchy(self, soup: BeautifulSoup) -> List[Dict]:
        """Analyze heading structure for content organization"""
        headings = []
        for i in range(1, 7):
            for heading in soup.find_all(f'h{i}'):
                headings.append({
                    "level": i,
                    "text": heading.get_text().strip(),
                    "position": len(headings)
                })
        return headings

    def _calculate_content_priority(self, element) -> int:
        """Calculate priority score for content blocks"""
        score = 0
        text_length = len(element.get_text())

        # Text length scoring
        score += min(text_length // 100, 10)

        # Semantic tag bonus
        if element.name in ['article', 'main']:
            score += 5
        elif element.name in ['section', 'div']:
            score += 2

        # Class/ID based scoring
        classes = element.get('class', [])
        element_id = element.get('id', '')

        content_indicators = ['content', 'article', 'post', 'main', 'body']
        for indicator in content_indicators:
            if any(indicator in str(c).lower() for c in classes):
                score += 3
            if indicator in element_id.lower():
                score += 3

        return score
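DOMAnalyzer has no configuration dependencies at all, so a quick structural report can be produced from any HTML string (a sketch, not part of this upload; it likewise assumes the lxml parser is available):

from scraper.dom_analyzer import DOMAnalyzer

html = (
    "<html><body><main><article>"
    "<h1>Title</h1><h2>Section</h2><p>Body text for the demo article.</p>"
    "</article></main></body></html>"
)

report = DOMAnalyzer().analyze_structure(html)
print(report["statistics"]["total_elements"])             # element count
print(report["statistics"]["max_depth"])                   # deepest nesting level
print(report["semantic_structure"]["semantic_elements"])   # per-tag counts, e.g. main/article
print([b["selector"] for b in report["content_blocks"]])   # e.g. ['article', 'main']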
html_loader.py
ADDED
@@ -0,0 +1,64 @@
import asyncio
from playwright.async_api import async_playwright
from typing import Dict, Optional
import time
from config.settings import settings

class HTMLLoader:
    def __init__(self):
        self.browser = None
        self.context = None

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=settings.scraping.headless
        )
        self.context = await self.browser.new_context(
            user_agent=settings.scraping.user_agent
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def load_page(self, url: str) -> Dict[str, str]:
        """Load HTML content from URL handling both static and dynamic sites"""
        for attempt in range(settings.scraping.max_retries):
            try:
                page = await self.context.new_page()
                await page.goto(url, timeout=settings.scraping.timeout)

                # Wait for body to load
                await page.wait_for_selector(
                    settings.scraping.wait_for_selector,
                    timeout=10000
                )

                # Additional wait for dynamic content
                await page.wait_for_timeout(2000)

                html_content = await page.content()
                title = await page.title()
                url_final = page.url

                await page.close()

                return {
                    "html": html_content,
                    "title": title,
                    "url": url_final,
                    "timestamp": int(time.time())
                }

            except Exception as e:
                if attempt == settings.scraping.max_retries - 1:
                    raise Exception(f"Failed to load {url}: {str(e)}")
                await asyncio.sleep(settings.scraping.delay_between_requests)

        return None
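HTMLLoader is an async context manager, so it can also be used on its own. The sketch below is not part of this upload and assumes Playwright's Chromium build has been installed beforehand (for example with the command playwright install chromium):

import asyncio
from scraper.html_loader import HTMLLoader

async def fetch(url: str) -> dict:
    # The context manager starts and stops the Playwright browser around each use
    async with HTMLLoader() as loader:
        return await loader.load_page(url)

page = asyncio.run(fetch("https://example.com"))
print(page["title"], page["url"], page["timestamp"])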
main.py
ADDED
@@ -0,0 +1,186 @@
import asyncio
from typing import Dict, Optional, List
from scraper.html_loader import HTMLLoader
from scraper.data_extractor import DataExtractor
from scraper.dom_analyzer import DOMAnalyzer
from storage.mongo_storage import MongoStorage
# from storage.neo4j_storage import Neo4jStorage
from config.settings import settings

class WebScrapingOrchestrator:
    def __init__(self):
        self.data_extractor = DataExtractor()
        self.dom_analyzer = DOMAnalyzer()
        self.mongo_storage = MongoStorage()
        # self.neo4j_storage = Neo4jStorage()

    async def process_url(self, url: str) -> Dict:
        """Complete pipeline to process a URL for LLM consumption"""
        try:
            print(f"Processing URL: {url}")

            # Step 1: Load HTML content
            async with HTMLLoader() as loader:
                html_data = await loader.load_page(url)

            if not html_data:
                return {"error": "Failed to load page"}

            print("✓ HTML loaded successfully")

            # Step 2: Extract structured data
            extracted_data = self.data_extractor.extract_structured_data(
                html_data["html"],
                html_data["url"]
            )

            print("✓ Data extracted successfully")

            # Step 3: Analyze DOM structure
            dom_structure = self.dom_analyzer.analyze_structure(html_data["html"])

            print("✓ DOM structure analyzed")

            # Step 4: Store in MongoDB
            mongo_id = self.mongo_storage.store_page_data(
                html_data["url"],
                extracted_data,
                dom_structure
            )

            print("✓ Data stored in MongoDB")

            # Step 5: Store relationships in Neo4j (currently disabled)
            # self.neo4j_storage.store_relationships(
            #     html_data["url"],
            #     extracted_data,
            #     dom_structure
            # )

            print("✓ Relationships stored in Neo4j")

            # Return LLM-ready summary
            return {
                "success": True,
                "url": html_data["url"],
                "title": html_data["title"],
                "mongo_id": mongo_id,
                "summary": {
                    "content_blocks": len(extracted_data["content"]),
                    "text_length": len(extracted_data["text_summary"]),
                    "links_found": len(extracted_data["links"]),
                    "images_found": len(extracted_data["images"]),
                    "dom_depth": dom_structure["statistics"]["max_depth"],
                    "content_type": self._identify_content_type(extracted_data)
                },
                "llm_ready_data": {
                    "text_summary": extracted_data["text_summary"],
                    "key_headings": [h["text"] for h in extracted_data["metadata"]["headings"][:5]],
                    "main_topics": self._extract_main_topics(extracted_data),
                    "study_hints": self._generate_study_hints(extracted_data, dom_structure)
                }
            }

        except Exception as e:
            print(f"✗ Error processing {url}: {str(e)}")
            return {"error": str(e), "url": url}

    def agent_for_chat(self):
        pass

    def get_page_for_llm(self, url: str) -> Optional[Dict]:
        """Retrieve page data optimized for LLM consumption"""
        # Get from MongoDB
        mongo_data = self.mongo_storage.get_page_data(url)
        if not mongo_data:
            return None

        # Get relationships from Neo4j (currently disabled)
        # neo4j_data = self.neo4j_storage.get_page_relationships(url)

        # Combine for LLM (MongoStorage nests link lists under "relationships")
        return {
            "content": mongo_data["content"]["text_summary"],
            "title": mongo_data["title"],
            "headings": [h["text"] for h in mongo_data["content"]["headings"]],
            "structure": mongo_data["study_metadata"],
            "relationships": {
                "related_pages": mongo_data.get("relationships", {}).get("internal_links", [])[:5],
                "external_references": mongo_data.get("relationships", {}).get("external_links", [])[:3]
            },
            "study_metadata": mongo_data["study_metadata"]
        }

    def search_for_llm(self, query: str, limit: int = 5) -> List[Dict]:
        """Search content for LLM context"""
        results = self.mongo_storage.search_pages(query, limit)

        llm_ready_results = []
        for result in results:
            llm_ready_results.append({
                "url": result["url"],
                "title": result["title"],
                "summary": result["content"]["text_summary"][:500],
                "content_type": result["study_metadata"]["content_type"],
                "complexity": result["study_metadata"]["complexity_score"],
                "key_topics": result["study_metadata"]["key_topics"][:5]
            })

        return llm_ready_results

    def _identify_content_type(self, data: Dict) -> str:
        """Identify content type for processing hints"""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title for word in ["documentation", "docs", "api"]):
            return "documentation"
        elif any(word in title for word in ["blog", "article", "news"]):
            return "article"
        elif any(word in text for word in ["research", "study", "analysis"]):
            return "research"
        return "general"

    def _extract_main_topics(self, data: Dict) -> List[str]:
        """Extract main topics for LLM understanding"""
        topics = set()

        # From title
        title_words = [word for word in data["metadata"]["title"].split() if len(word) > 3]
        topics.update(title_words[:3])

        # From headings
        for heading in data["metadata"]["headings"][:3]:
            heading_words = [word for word in heading["text"].split() if len(word) > 3]
            topics.update(heading_words[:2])

        return list(topics)[:5]

    def _generate_study_hints(self, extracted_data: Dict, dom_structure: Dict) -> Dict:
        """Generate study hints for LLM processing"""
        return {
            "difficulty_level": "beginner" if len(extracted_data["text_summary"]) < 2000 else "intermediate",
            "estimated_study_time": f"{len(extracted_data['text_summary'].split()) // 250} minutes",
            "content_structure": "well_structured" if len(extracted_data["metadata"]["headings"]) > 3 else "basic",
            "has_examples": "code" in extracted_data["text_summary"].lower(),
            "interactive_elements": dom_structure["statistics"]["tag_distribution"].get("form", 0) > 0
        }

    def close_connections(self):
        """Close all database connections"""
        # self.neo4j_storage.close()

# Main execution function
async def main():
    orchestrator = WebScrapingOrchestrator()

    # Example usage
    test_url = "https://en.wikipedia.org/wiki/Virat_Kohli"
    result = await orchestrator.process_url(test_url)
    print(f"Processing result: {result}")

    # Clean up
    orchestrator.close_connections()

if __name__ == "__main__":
    asyncio.run(main())
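After process_url has stored a page, the same orchestrator can serve it back in an LLM-friendly shape without re-scraping (a sketch, not part of this upload; it assumes the MongoDB instance configured in settings is running):

import asyncio
from main import WebScrapingOrchestrator

async def demo():
    orchestrator = WebScrapingOrchestrator()
    url = "https://en.wikipedia.org/wiki/Virat_Kohli"

    await orchestrator.process_url(url)            # scrape, extract, analyze, store

    page = orchestrator.get_page_for_llm(url)      # one stored page, LLM-shaped
    hits = orchestrator.search_for_llm("cricket")  # regex search over stored pages
    print(page["title"], page["study_metadata"]["content_type"], len(hits))

asyncio.run(demo())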
mongo_storage.py
ADDED
@@ -0,0 +1,143 @@
from pymongo import MongoClient
from typing import Dict, List, Optional
import datetime
from config.settings import settings

class MongoStorage:
    def __init__(self):
        self.client = MongoClient(settings.database.mongo_uri)
        self.db = self.client[settings.database.mongo_db]
        self.collection = self.db.scraped_pages
        self._create_indexes()

    def _create_indexes(self):
        """Create indexes for better query performance"""
        self.collection.create_index("url", unique=True)
        self.collection.create_index("domain")
        self.collection.create_index("timestamp")
        self.collection.create_index("content.metadata.title")

    def store_page_data(self, url: str, extracted_data: Dict, dom_structure: Dict) -> str:
        """Store complete page data optimized for LLM consumption"""
        document = {
            "url": url,
            "domain": extracted_data["metadata"]["domain"],
            "timestamp": datetime.datetime.utcnow(),
            "title": extracted_data["metadata"]["title"],
            "description": extracted_data["metadata"]["description"],

            # LLM-optimized content structure
            "content": {
                "text_summary": extracted_data["text_summary"],
                "content_blocks": extracted_data["content"],
                "headings": extracted_data["metadata"]["headings"],
                "structure_info": extracted_data["structure"]
            },

            # Relationship data
            "relationships": {
                "internal_links": [link for link in extracted_data["links"] if link["internal"]],
                "external_links": [link for link in extracted_data["links"] if not link["internal"]],
                "images": extracted_data["images"]
            },

            # DOM analysis for advanced processing
            "dom_analysis": {
                "tree_structure": dom_structure["tree"],
                "statistics": dom_structure["statistics"],
                "semantic_structure": dom_structure["semantic_structure"],
                "content_blocks": dom_structure["content_blocks"]
            },

            # Study-friendly metadata
            "study_metadata": {
                "reading_time": self._estimate_reading_time(extracted_data["text_summary"]),
                "complexity_score": self._calculate_complexity_score(extracted_data),
                "content_type": self._identify_content_type(extracted_data),
                "key_topics": self._extract_key_topics(extracted_data)
            }
        }

        # Upsert document
        result = self.collection.replace_one(
            {"url": url},
            document,
            upsert=True
        )

        return str(result.upserted_id or result.matched_count)

    def get_page_data(self, url: str) -> Optional[Dict]:
        """Retrieve page data by URL"""
        return self.collection.find_one({"url": url})

    def get_pages_by_domain(self, domain: str) -> List[Dict]:
        """Get all pages from a specific domain"""
        return list(self.collection.find({"domain": domain}))

    def search_pages(self, query: str, limit: int = 10) -> List[Dict]:
        """Search pages by content for LLM queries"""
        search_filter = {
            "$or": [
                {"title": {"$regex": query, "$options": "i"}},
                {"description": {"$regex": query, "$options": "i"}},
                {"content.text_summary": {"$regex": query, "$options": "i"}}
            ]
        }

        return list(self.collection.find(search_filter).limit(limit))

    def _estimate_reading_time(self, text: str) -> int:
        """Estimate reading time in minutes (250 words per minute)"""
        word_count = len(text.split())
        return max(1, word_count // 250)

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Calculate content complexity for LLM processing hints"""
        score = 0.0

        # Text length factor
        text_length = len(data["text_summary"])
        score += min(text_length / 1000, 5.0)

        # Structure complexity
        content_blocks = len(data["content"])
        score += min(content_blocks / 10, 3.0)

        # Link density
        total_links = len(data["links"])
        score += min(total_links / 20, 2.0)

        return round(score, 2)

    def _identify_content_type(self, data: Dict) -> str:
        """Identify content type for LLM processing strategy"""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title or word in text for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title or word in text for word in ["news", "article", "report"]):
            return "article"
        elif any(word in title or word in text for word in ["documentation", "docs", "reference"]):
            return "documentation"
        elif any(word in title or word in text for word in ["blog", "post", "opinion"]):
            return "blog_post"
        else:
            return "general"

    def _extract_key_topics(self, data: Dict) -> List[str]:
        """Extract key topics for study organization"""
        # Simple keyword extraction from headings and title
        topics = set()

        # From title
        title_words = data["metadata"]["title"].split()
        topics.update([word.lower() for word in title_words if len(word) > 3])

        # From headings
        for heading in data["metadata"]["headings"]:
            heading_words = heading["text"].split()
            topics.update([word.lower() for word in heading_words if len(word) > 3])

        return list(topics)[:10]  # Limit to top 10 topics
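search_pages relies on case-insensitive regex filters, which scan every document; if the collection grows, a MongoDB text index over the same three fields is the usual swap. A sketch (not part of this upload) using standard pymongo calls:

from pymongo import MongoClient, TEXT
from config.settings import settings

client = MongoClient(settings.database.mongo_uri)
collection = client[settings.database.mongo_db].scraped_pages

# One-time: weighted text index over the fields search_pages currently regex-scans
collection.create_index(
    [("title", TEXT), ("description", TEXT), ("content.text_summary", TEXT)],
    name="page_text_search",
    weights={"title": 5, "description": 3, "content.text_summary": 1},
)

# Query side: $text uses the index and returns a relevance score to sort by
hits = collection.find(
    {"$text": {"$search": "playwright scraping"}},
    {"score": {"$meta": "textScore"}, "title": 1, "url": 1},
).sort([("score", {"$meta": "textScore"})]).limit(5)

for doc in hits:
    print(doc["url"], doc["title"])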
neo4j_storage.py
ADDED
@@ -0,0 +1,216 @@
# from neo4j import GraphDatabase
# from typing import Dict, List
# from urllib.parse import urlparse
# from config.settings import settings

# class Neo4jStorage:
#     def __init__(self):
#         self.driver = GraphDatabase.driver(
#             settings.database.neo4j_uri,
#             auth=(settings.database.neo4j_user, settings.database.neo4j_password)
#         )
#         self._create_constraints()

#     def _create_constraints(self):
#         """Create constraints and indexes for better performance"""
#         with self.driver.session() as session:
#             try:
#                 session.run("CREATE CONSTRAINT page_url IF NOT EXISTS FOR (p:Page) REQUIRE p.url IS UNIQUE")
#                 session.run("CREATE CONSTRAINT domain_name IF NOT EXISTS FOR (d:Domain) REQUIRE d.name IS UNIQUE")
#                 session.run("CREATE INDEX page_title IF NOT EXISTS FOR (p:Page) ON (p.title)")
#             except Exception as e:
#                 pass  # Constraints might already exist

#     def store_relationships(self, url: str, extracted_data: Dict, dom_structure: Dict):
#         """Store page relationships and structure in Neo4j"""
#         with self.driver.session() as session:
#             # Create main page node
#             self._create_page_node(session, url, extracted_data)

#             # Create domain relationships
#             self._create_domain_relationships(session, url, extracted_data)

#             # Create content relationships
#             self._create_content_relationships(session, url, extracted_data)

#             # Create link relationships
#             self._create_link_relationships(session, url, extracted_data["links"])

#             # Create DOM structure relationships
#             self._create_dom_relationships(session, url, dom_structure)

#     def _create_page_node(self, session, url: str, data: Dict):
#         """Create or update page node with LLM-friendly properties"""
#         query = """
#         MERGE (p:Page {url: $url})
#         SET p.title = $title,
#             p.description = $description,
#             p.domain = $domain,
#             p.content_type = $content_type,
#             p.complexity_score = $complexity_score,
#             p.reading_time = $reading_time,
#             p.word_count = $word_count,
#             p.last_scraped = datetime()
#         """

#         session.run(query, {
#             "url": url,
#             "title": data["metadata"]["title"],
#             "description": data["metadata"]["description"],
#             "domain": data["metadata"]["domain"],
#             "content_type": self._identify_content_type(data),
#             "complexity_score": self._calculate_complexity_score(data),
#             "reading_time": len(data["text_summary"].split()) // 250,
#             "word_count": len(data["text_summary"].split())
#         })

#     def _create_domain_relationships(self, session, url: str, data: Dict):
#         """Create domain nodes and relationships"""
#         domain = data["metadata"]["domain"]

#         # Create domain node
#         session.run("""
#             MERGE (d:Domain {name: $domain})
#             SET d.last_updated = datetime()
#         """, {"domain": domain})

#         # Link page to domain
#         session.run("""
#             MATCH (p:Page {url: $url})
#             MATCH (d:Domain {name: $domain})
#             MERGE (p)-[:BELONGS_TO]->(d)
#         """, {"url": url, "domain": domain})

#     def _create_content_relationships(self, session, url: str, data: Dict):
#         """Create content structure relationships for LLM understanding"""
#         # Create topic nodes from headings
#         for i, heading in enumerate(data["metadata"]["headings"]):
#             session.run("""
#                 MATCH (p:Page {url: $url})
#                 MERGE (h:Heading {text: $text, level: $level, page_url: $url})
#                 SET h.position = $position
#                 MERGE (p)-[:HAS_HEADING]->(h)
#             """, {
#                 "url": url,
#                 "text": heading["text"],
#                 "level": heading["level"],
#                 "position": i
#             })

#         # Create content block relationships
#         for i, block in enumerate(data["content"][:10]):  # Limit for performance
#             session.run("""
#                 MATCH (p:Page {url: $url})
#                 MERGE (c:ContentBlock {text: $text, page_url: $url, position: $position})
#                 SET c.tag = $tag,
#                     c.length = $length
#                 MERGE (p)-[:HAS_CONTENT]->(c)
#             """, {
#                 "url": url,
#                 "text": block["text"][:500],  # Truncate for storage
#                 "tag": block["tag"],
#                 "length": len(block["text"]),
#                 "position": i
#             })

#     def _create_link_relationships(self, session, url: str, links: List[Dict]):
#         """Create link relationships for navigation understanding"""
#         for link in links[:20]:  # Limit for performance
#             target_url = link["url"]
#             link_text = link["text"]
#             is_internal = link["internal"]

#             # Create target page node (minimal)
#             session.run("""
#                 MERGE (target:Page {url: $target_url})
#                 SET target.discovered_via = $source_url
#             """, {"target_url": target_url, "source_url": url})

#             # Create relationship
#             relationship_type = "LINKS_TO_INTERNAL" if is_internal else "LINKS_TO_EXTERNAL"
#             session.run(f"""
#                 MATCH (source:Page {{url: $source_url}})
#                 MATCH (target:Page {{url: $target_url}})
#                 MERGE (source)-[r:{relationship_type}]->(target)
#                 SET r.link_text = $link_text,
#                     r.is_internal = $is_internal
#             """, {
#                 "source_url": url,
#                 "target_url": target_url,
#                 "link_text": link_text,
#                 "is_internal": is_internal
#             })

#     def _create_dom_relationships(self, session, url: str, dom_structure: Dict):
#         """Create DOM structure relationships for content hierarchy"""
#         # Create semantic structure nodes
#         semantic_elements = dom_structure["semantic_structure"]["semantic_elements"]
#         for tag, count in semantic_elements.items():
#             if count > 0:
#                 session.run("""
#                     MATCH (p:Page {url: $url})
#                     MERGE (s:SemanticElement {tag: $tag, page_url: $url})
#                     SET s.count = $count
#                     MERGE (p)-[:HAS_SEMANTIC_ELEMENT]->(s)
#                 """, {"url": url, "tag": tag, "count": count})

#     def get_page_relationships(self, url: str) -> Dict:
#         """Get all relationships for a page for LLM context"""
#         with self.driver.session() as session:
#             result = session.run("""
#                 MATCH (p:Page {url: $url})
#                 OPTIONAL MATCH (p)-[:LINKS_TO_INTERNAL]->(internal:Page)
#                 OPTIONAL MATCH (p)-[:LINKS_TO_EXTERNAL]->(external:Page)
#                 OPTIONAL MATCH (p)-[:HAS_HEADING]->(h:Heading)
#                 RETURN p, collect(DISTINCT internal.url) as internal_links,
#                        collect(DISTINCT external.url) as external_links,
#                        collect(DISTINCT {text: h.text, level: h.level}) as headings
#             """, {"url": url})

#             record = result.single()
#             if record:
#                 return {
#                     "page": dict(record["p"]),
#                     "internal_links": record["internal_links"],
#                     "external_links": record["external_links"],
#                     "headings": record["headings"]
#                 }
#             return {}

#     def get_related_pages(self, url: str, limit: int = 5) -> List[Dict]:
#         """Find related pages for LLM context and study suggestions"""
#         with self.driver.session() as session:
#             result = session.run("""
#                 MATCH (p:Page {url: $url})
#                 MATCH (p)-[:BELONGS_TO]->(d:Domain)
#                 MATCH (related:Page)-[:BELONGS_TO]->(d)
#                 WHERE related.url <> $url
#                 RETURN related.url as url, related.title as title,
#                        related.content_type as content_type,
#                        related.complexity_score as complexity_score
#                 ORDER BY related.complexity_score DESC
#                 LIMIT $limit
#             """, {"url": url, "limit": limit})

#             return [dict(record) for record in result]

#     def _identify_content_type(self, data: Dict) -> str:
#         """Identify content type for graph relationships"""
#         title = data["metadata"]["title"].lower()
#         if "tutorial" in title or "guide" in title:
#             return "tutorial"
#         elif "documentation" in title or "docs" in title:
#             return "documentation"
#         elif "blog" in title or "article" in title:
#             return "article"
#         return "general"

#     def _calculate_complexity_score(self, data: Dict) -> float:
#         """Calculate complexity score for relationship weighting"""
#         text_length = len(data["text_summary"])
#         content_blocks = len(data["content"])
#         return min(text_length / 1000 + content_blocks / 10, 10.0)

#     def close(self):
#         """Close database connection"""
#         self.driver.close()
requirements.txt
ADDED
@@ -0,0 +1,67 @@
aiofiles==24.1.0
annotated-types==0.7.0
anyio==4.9.0
beautifulsoup4==4.13.4
bs4==0.0.2
certifi==2025.4.26
charset-normalizer==3.4.2
click==8.2.1
colorama==0.4.6
distro==1.9.0
dnspython==2.7.0
dotenv==0.9.9
fastapi==0.115.12
ffmpy==0.6.0
filelock==3.18.0
fsspec==2025.5.1
gradio==5.33.0
gradio_client==1.10.2
greenlet==3.2.3
groovy==0.1.2
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.32.4
idna==3.10
Jinja2==3.1.6
jiter==0.10.0
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
numpy==2.3.0
openai==1.84.0
orjson==3.10.18
packaging==25.0
pandas==2.3.0
pillow==11.2.1
playwright==1.52.0
pydantic==2.11.5
pydantic_core==2.33.2
pydub==0.25.1
pyee==13.0.0
Pygments==2.19.1
pymongo==4.13.0
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
python-multipart==0.0.20
pytz==2025.2
PyYAML==6.0.2
requests==2.32.3
rich==14.0.0
ruff==0.11.13
safehttpx==0.1.6
semantic-version==2.10.0
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
soupsieve==2.7
starlette==0.46.2
tomlkit==0.13.3
tqdm==4.67.1
typer==0.16.0
typing-inspection==0.4.1
typing_extensions==4.14.0
tzdata==2025.2
urllib3==2.4.0
uvicorn==0.34.3
websockets==15.0.1
settings.py
ADDED
@@ -0,0 +1,57 @@
import os
from pydantic import BaseModel
from typing import Dict, List
from dotenv import load_dotenv


load_dotenv()

class DatabaseConfig(BaseModel):
    mongo_uri: str = os.getenv("mongo_uri")
    mongo_db: str = os.getenv("mongo_db")
    neo4j_uri: str = os.getenv("neo4j_uri")
    neo4j_user: str = os.getenv("neo4j_user")
    neo4j_password: str = os.getenv("neo4j_password")

class ScrapingConfig(BaseModel):
    timeout: int = 30000
    wait_for_selector: str = "body"
    headless: bool = True
    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    max_retries: int = 3
    delay_between_requests: float = 1.0

class ExtractionConfig(BaseModel):
    content_selectors: List[str] = [
        "article", "main", ".content", "#content",
        ".post", ".article-body", "p", "h1", "h2", "h3"
    ]
    ignore_selectors: List[str] = [
        "script", "style", "nav", "footer", "header",
        ".advertisement", ".ads", ".sidebar"
    ]
    min_text_length: int = 50
    extract_images: bool = True
    extract_links: bool = True

class Settings:
    def __init__(self):
        self.database = DatabaseConfig()
        self.scraping = ScrapingConfig()
        self.extraction = ExtractionConfig()

    def update_from_env(self):
        # Update from environment variables if available
        if os.getenv("mongo_uri"):
            self.database.mongo_uri = os.getenv("mongo_uri")
        if os.getenv("mongo_db"):
            self.database.mongo_db = os.getenv("mongo_db")
        if os.getenv("neo4j_uri"):
            self.database.neo4j_uri = os.getenv("neo4j_uri")
        if os.getenv("neo4j_user"):
            self.database.neo4j_user = os.getenv("neo4j_user")
        if os.getenv("neo4j_password"):
            self.database.neo4j_password = os.getenv("neo4j_password")

settings = Settings()
settings.update_from_env()
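Settings reads lowercase variable names from the environment (via load_dotenv), and chat_agent.py separately reads NEBIUS_API_KEY, so the .env file excluded by .gitignore is expected to provide roughly the following keys. The values below are placeholders, not real endpoints or credentials, and the Neo4j entries only matter if neo4j_storage.py is re-enabled.

mongo_uri=mongodb://localhost:27017
mongo_db=scraper_db
NEBIUS_API_KEY=<your-nebius-api-key>
# Only needed if storage/neo4j_storage.py is uncommented:
neo4j_uri=bolt://localhost:7687
neo4j_user=neo4j
neo4j_password=<your-password>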