Spaces: Sleeping
Upload 11 files
- .gitignore +2 -0
- app.py +36 -0
- chat_agent.py +43 -0
- data_extractor.py +118 -0
- dom_analyzer.py +162 -0
- html_loader.py +64 -0
- main.py +186 -0
- mongo_storage.py +143 -0
- neo4j_storage.py +216 -0
- requirements.txt +67 -0
- settings.py +57 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
.env
.venv
app.py
ADDED
@@ -0,0 +1,36 @@
# app.py
import gradio as gr
import asyncio
from agent.chat_agent import SimpleChatAgent

agent = SimpleChatAgent()

async def async_chat_fn(message, history):
    return await agent.handle_query(message, history)

def chat_fn(message, history):
    return asyncio.run(async_chat_fn(message, history))

with gr.Blocks(css="""
#title { font-size: 2.2rem; font-weight: bold; text-align: center; margin-bottom: 0.5em; color: #2e3a59; }
#desc { font-size: 1.1rem; text-align: center; color: #6c7a92; margin-bottom: 2em; }
footer { text-align: center; font-size: 0.9rem; color: #999; margin-top: 2em; }
.gradio-container { background-color: #f9fbfc; }
""") as demo:

    gr.Markdown("<div id='title'>Chat + Web Scraper Agent</div>")
    gr.Markdown("<div id='desc'>Ask anything, or tell me to scrape a webpage with the custom agent logic.</div>")

    with gr.Row():
        with gr.Column(scale=1):
            gr.ChatInterface(
                fn=chat_fn,
                chatbot=gr.Chatbot(show_copy_button=True),
                textbox=gr.Textbox(placeholder="Type your question or paste a URL to scrape...", show_label=False),
                title=None,
                theme="soft",
            )

    gr.Markdown("<footer>Built with ❤️ using LLM + Gradio UI</footer>")

demo.launch()
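The asyncio.run wrapper in chat_fn works because Gradio runs synchronous callbacks in a worker thread that has no event loop of its own. Gradio also accepts coroutine functions directly, so an equivalent entry point could skip the wrapper entirely; the sketch below is not part of this upload and assumes the same SimpleChatAgent import.

import gradio as gr
from agent.chat_agent import SimpleChatAgent

agent = SimpleChatAgent()

async def chat_fn(message, history):
    # Gradio awaits async callbacks itself, so no asyncio.run wrapper is needed
    return await agent.handle_query(message, history)

gr.ChatInterface(fn=chat_fn, theme="soft").launch()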
chat_agent.py
ADDED
@@ -0,0 +1,43 @@
# chat_agent.py
import os
import re
from openai import OpenAI
from main import WebScrapingOrchestrator

class SimpleChatAgent:
    def __init__(self):
        self.client = OpenAI(
            base_url="https://api.studio.nebius.com/v1/",
            api_key=os.environ.get("NEBIUS_API_KEY"),
        )
        self.model = "meta-llama/Meta-Llama-3.1-70B-Instruct"
        self.orchestrator = WebScrapingOrchestrator()

    async def handle_query(self, user_input, history):
        # Web scraping check
        url_match = re.search(r"(https?://[^\s]+)", user_input)
        if "scrape" in user_input.lower() and url_match:
            url = url_match.group(1)
            result = await self.orchestrator.process_url(url)
            if "error" in result:
                return f"❌ Error scraping {url}: {result['error']}"
            return (
                f"✅ Scraped Data from {result['title']}:\n"
                f"- Topics: {', '.join(result['llm_ready_data']['main_topics'])}\n"
                f"- Summary: {result['llm_ready_data']['text_summary'][:500]}..."
            )

        # Build full chat history
        messages = []
        for user_msg, bot_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": user_input})

        # Call Nebius LLM
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.6,
        )
        return response.choices[0].message.content
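A minimal way to exercise SimpleChatAgent outside the Gradio UI (a sketch, not part of this upload; it assumes NEBIUS_API_KEY is set and that the MongoDB instance from config.settings is reachable, since WebScrapingOrchestrator opens that connection on construction):

import asyncio
from agent.chat_agent import SimpleChatAgent

async def demo():
    agent = SimpleChatAgent()
    # The word "scrape" plus a URL routes the query to the orchestrator
    print(await agent.handle_query("scrape https://example.com", history=[]))
    # Anything else goes straight to the Nebius-hosted Llama model
    print(await agent.handle_query("Summarize what a web scraper does.", history=[]))

asyncio.run(demo())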
data_extractor.py
ADDED
@@ -0,0 +1,118 @@
from bs4 import BeautifulSoup, Comment
from typing import Dict, List, Optional
import re
from urllib.parse import urljoin, urlparse
from config.settings import settings

class DataExtractor:
    def __init__(self):
        self.config = settings.extraction

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption"""
        soup = BeautifulSoup(html, 'lxml')

        # Remove unwanted elements
        self._clean_html(soup)

        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url),
            "structure": self._extract_structure(soup),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup)
        }

    def _clean_html(self, soup: BeautifulSoup):
        """Remove unwanted elements for cleaner extraction"""
        for selector in self.config.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()

        # Remove HTML comments (scripts/styles are already dropped via ignore_selectors);
        # `string=` replaces the deprecated `text=` argument in recent BeautifulSoup
        for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
            element.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks"""
        content_blocks = []

        for selector in self.config.content_selectors:
            elements = soup.select(selector)
            for elem in elements:
                text = elem.get_text(strip=True)
                if len(text) >= self.config.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {}
                    })

        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata"""
        title = soup.find('title')
        meta_desc = soup.find('meta', attrs={'name': 'description'})

        return {
            "title": title.get_text().strip() if title else "",
            "description": meta_desc.get('content', '') if meta_desc else "",
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup)
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract heading hierarchy for structure"""
        headings = []
        for i in range(1, 7):
            for heading in soup.find_all(f'h{i}'):
                headings.append({
                    "level": i,
                    "text": heading.get_text().strip(),
                    "id": heading.get('id', '')
                })
        return headings

    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
        """Extract DOM structure for relationships"""
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": len(soup.find_all('form'))
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract all links for relationship mapping"""
        links = []
        for link in soup.find_all('a', href=True):
            href = urljoin(base_url, link['href'])
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                "internal": urlparse(href).netloc == urlparse(base_url).netloc
            })
        return links[:50]  # Limit for performance

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract images with context"""
        images = []
        for img in soup.find_all('img', src=True):
            images.append({
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                "caption": img.get('title', '')
            })
        return images[:20]  # Limit for performance

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Extract clean text for LLM processing"""
        text = soup.get_text()
        # Clean whitespace and normalize
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:5000]  # Limit for token efficiency
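Because DataExtractor only depends on the selector lists in settings.extraction, it can be exercised on a static HTML string without launching a browser. A sketch (not part of this upload; it assumes lxml is installed, since BeautifulSoup is invoked with the 'lxml' parser):

from scraper.data_extractor import DataExtractor

html = """
<html><head><title>Demo page</title>
<meta name="description" content="A tiny fixture for the extractor"></head>
<body><article><h1>Heading</h1>
<p>A paragraph long enough to clear the fifty character min_text_length threshold used by the extractor.</p>
</article></body></html>
"""

data = DataExtractor().extract_structured_data(html, "https://example.com/demo")
print(data["metadata"]["title"])        # Demo page
print(data["metadata"]["description"])  # A tiny fixture for the extractor
print(len(data["content"]), "content blocks above the length threshold")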
dom_analyzer.py
ADDED
@@ -0,0 +1,162 @@
from bs4 import BeautifulSoup
from typing import Dict, List
import hashlib

class DOMAnalyzer:
    def __init__(self):
        pass

    def analyze_structure(self, html: str) -> Dict:
        """Analyze DOM structure and create tree representation"""
        soup = BeautifulSoup(html, 'lxml')

        return {
            "tree": self._build_dom_tree(soup.body if soup.body else soup),
            "statistics": self._get_dom_statistics(soup),
            "semantic_structure": self._analyze_semantic_structure(soup),
            "content_blocks": self._identify_content_blocks(soup)
        }

    def _build_dom_tree(self, element, depth=0, max_depth=5) -> Dict:
        """Build hierarchical DOM tree structure"""
        if depth > max_depth or not element or not hasattr(element, 'name'):
            return {}

        node = {
            "tag": element.name if element.name else "text",
            "id": element.get('id', ''),
            "classes": element.get('class', []),
            "text_content": element.get_text()[:100] if element.get_text() else "",
            "children": [],
            "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
            "depth": depth,
            "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8]
        }

        # Add children (limit to prevent huge trees)
        if hasattr(element, 'children') and depth < max_depth:
            child_count = 0
            for child in element.children:
                if child_count >= 10:  # Limit children per node
                    break
                if hasattr(child, 'name') and child.name:
                    child_node = self._build_dom_tree(child, depth + 1, max_depth)
                    if child_node:
                        node["children"].append(child_node)
                        child_count += 1

        return node

    def _get_dom_statistics(self, soup: BeautifulSoup) -> Dict:
        """Get DOM statistics for analysis"""
        all_tags = soup.find_all()
        tag_counts = {}

        for tag in all_tags:
            tag_name = tag.name
            tag_counts[tag_name] = tag_counts.get(tag_name, 0) + 1

        return {
            "total_elements": len(all_tags),
            "tag_distribution": tag_counts,
            "max_depth": self._calculate_max_depth(soup),
            "text_content_ratio": self._calculate_text_ratio(soup)
        }

    def _analyze_semantic_structure(self, soup: BeautifulSoup) -> Dict:
        """Analyze semantic HTML structure"""
        semantic_tags = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer']
        semantic_elements = {}

        for tag in semantic_tags:
            elements = soup.find_all(tag)
            semantic_elements[tag] = len(elements)

        return {
            "semantic_elements": semantic_elements,
            "has_semantic_structure": sum(semantic_elements.values()) > 0,
            "content_hierarchy": self._analyze_heading_hierarchy(soup)
        }

    def _identify_content_blocks(self, soup: BeautifulSoup) -> List[Dict]:
        """Identify main content blocks for LLM processing"""
        content_blocks = []

        # Look for common content containers
        selectors = ['article', 'main', '.content', '#content', '.post', '.entry']

        for selector in selectors:
            elements = soup.select(selector)
            for elem in elements:
                if elem.get_text(strip=True):
                    content_blocks.append({
                        "selector": selector,
                        "tag": elem.name,
                        "text_length": len(elem.get_text()),
                        "element_id": elem.get('id', ''),
                        "classes": elem.get('class', []),
                        "priority": self._calculate_content_priority(elem)
                    })

        return sorted(content_blocks, key=lambda x: x['priority'], reverse=True)[:5]

    def _calculate_max_depth(self, soup: BeautifulSoup) -> int:
        """Calculate maximum DOM depth"""
        def get_depth(element, current_depth=0):
            if not hasattr(element, 'children'):
                return current_depth

            max_child_depth = current_depth
            for child in element.children:
                if hasattr(child, 'name') and child.name:
                    depth = get_depth(child, current_depth + 1)
                    max_child_depth = max(max_child_depth, depth)

            return max_child_depth

        return get_depth(soup)

    def _calculate_text_ratio(self, soup: BeautifulSoup) -> float:
        """Calculate ratio of text content to HTML tags"""
        text_length = len(soup.get_text())
        html_length = len(str(soup))
        return text_length / html_length if html_length > 0 else 0

    def _analyze_heading_hierarchy(self, soup: BeautifulSoup) -> List[Dict]:
        """Analyze heading structure for content organization"""
        headings = []
        for i in range(1, 7):
            for heading in soup.find_all(f'h{i}'):
                headings.append({
                    "level": i,
                    "text": heading.get_text().strip(),
                    "position": len(headings)
                })
        return headings

    def _calculate_content_priority(self, element) -> int:
        """Calculate priority score for content blocks"""
        score = 0
        text_length = len(element.get_text())

        # Text length scoring
        score += min(text_length // 100, 10)

        # Semantic tag bonus
        if element.name in ['article', 'main']:
            score += 5
        elif element.name in ['section', 'div']:
            score += 2

        # Class/ID based scoring
        classes = element.get('class', [])
        element_id = element.get('id', '')

        content_indicators = ['content', 'article', 'post', 'main', 'body']
        for indicator in content_indicators:
            if any(indicator in str(c).lower() for c in classes):
                score += 3
            if indicator in element_id.lower():
                score += 3

        return score
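DOMAnalyzer has no configuration dependencies at all, so a quick structural report can be produced from any HTML string (a sketch, not part of this upload; it likewise assumes the lxml parser is available):

from scraper.dom_analyzer import DOMAnalyzer

html = (
    "<html><body><main><article>"
    "<h1>Title</h1><h2>Section</h2><p>Body text for the demo article.</p>"
    "</article></main></body></html>"
)

report = DOMAnalyzer().analyze_structure(html)
print(report["statistics"]["total_elements"])             # element count
print(report["statistics"]["max_depth"])                   # deepest nesting level
print(report["semantic_structure"]["semantic_elements"])   # per-tag counts, e.g. main/article
print([b["selector"] for b in report["content_blocks"]])   # e.g. ['article', 'main']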
html_loader.py
ADDED
@@ -0,0 +1,64 @@
import asyncio
from playwright.async_api import async_playwright
from typing import Dict, Optional
import time
from config.settings import settings

class HTMLLoader:
    def __init__(self):
        self.browser = None
        self.context = None

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=settings.scraping.headless
        )
        self.context = await self.browser.new_context(
            user_agent=settings.scraping.user_agent
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def load_page(self, url: str) -> Dict[str, str]:
        """Load HTML content from URL handling both static and dynamic sites"""
        for attempt in range(settings.scraping.max_retries):
            try:
                page = await self.context.new_page()
                await page.goto(url, timeout=settings.scraping.timeout)

                # Wait for body to load
                await page.wait_for_selector(
                    settings.scraping.wait_for_selector,
                    timeout=10000
                )

                # Additional wait for dynamic content
                await page.wait_for_timeout(2000)

                html_content = await page.content()
                title = await page.title()
                url_final = page.url

                await page.close()

                return {
                    "html": html_content,
                    "title": title,
                    "url": url_final,
                    "timestamp": int(time.time())
                }

            except Exception as e:
                if attempt == settings.scraping.max_retries - 1:
                    raise Exception(f"Failed to load {url}: {str(e)}")
                await asyncio.sleep(settings.scraping.delay_between_requests)

        return None
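HTMLLoader is an async context manager, so it can also be used on its own. The sketch below is not part of this upload and assumes Playwright's Chromium build has been installed beforehand (for example with the command playwright install chromium):

import asyncio
from scraper.html_loader import HTMLLoader

async def fetch(url: str) -> dict:
    # The context manager starts and stops the Playwright browser around each use
    async with HTMLLoader() as loader:
        return await loader.load_page(url)

page = asyncio.run(fetch("https://example.com"))
print(page["title"], page["url"], page["timestamp"])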
main.py
ADDED
@@ -0,0 +1,186 @@
import asyncio
from typing import Dict, Optional, List
from scraper.html_loader import HTMLLoader
from scraper.data_extractor import DataExtractor
from scraper.dom_analyzer import DOMAnalyzer
from storage.mongo_storage import MongoStorage
# from storage.neo4j_storage import Neo4jStorage
from config.settings import settings

class WebScrapingOrchestrator:
    def __init__(self):
        self.data_extractor = DataExtractor()
        self.dom_analyzer = DOMAnalyzer()
        self.mongo_storage = MongoStorage()
        # self.neo4j_storage = Neo4jStorage()

    async def process_url(self, url: str) -> Dict:
        """Complete pipeline to process a URL for LLM consumption"""
        try:
            print(f"Processing URL: {url}")

            # Step 1: Load HTML content
            async with HTMLLoader() as loader:
                html_data = await loader.load_page(url)

            if not html_data:
                return {"error": "Failed to load page"}

            print("✓ HTML loaded successfully")

            # Step 2: Extract structured data
            extracted_data = self.data_extractor.extract_structured_data(
                html_data["html"],
                html_data["url"]
            )

            print("✓ Data extracted successfully")

            # Step 3: Analyze DOM structure
            dom_structure = self.dom_analyzer.analyze_structure(html_data["html"])

            print("✓ DOM structure analyzed")

            # Step 4: Store in MongoDB
            mongo_id = self.mongo_storage.store_page_data(
                html_data["url"],
                extracted_data,
                dom_structure
            )

            print("✓ Data stored in MongoDB")

            # Step 5: Store relationships in Neo4j (currently disabled)
            # self.neo4j_storage.store_relationships(
            #     html_data["url"],
            #     extracted_data,
            #     dom_structure
            # )

            print("✓ Relationships stored in Neo4j")

            # Return LLM-ready summary
            return {
                "success": True,
                "url": html_data["url"],
                "title": html_data["title"],
                "mongo_id": mongo_id,
                "summary": {
                    "content_blocks": len(extracted_data["content"]),
                    "text_length": len(extracted_data["text_summary"]),
                    "links_found": len(extracted_data["links"]),
                    "images_found": len(extracted_data["images"]),
                    "dom_depth": dom_structure["statistics"]["max_depth"],
                    "content_type": self._identify_content_type(extracted_data)
                },
                "llm_ready_data": {
                    "text_summary": extracted_data["text_summary"],
                    "key_headings": [h["text"] for h in extracted_data["metadata"]["headings"][:5]],
                    "main_topics": self._extract_main_topics(extracted_data),
                    "study_hints": self._generate_study_hints(extracted_data, dom_structure)
                }
            }

        except Exception as e:
            print(f"✗ Error processing {url}: {str(e)}")
            return {"error": str(e), "url": url}

    def agent_for_chat(self):
        pass

    def get_page_for_llm(self, url: str) -> Optional[Dict]:
        """Retrieve page data optimized for LLM consumption"""
        # Get from MongoDB
        mongo_data = self.mongo_storage.get_page_data(url)
        if not mongo_data:
            return None

        # Get relationships from Neo4j (currently disabled)
        # neo4j_data = self.neo4j_storage.get_page_relationships(url)

        # Combine for LLM (MongoStorage nests link lists under "relationships")
        return {
            "content": mongo_data["content"]["text_summary"],
            "title": mongo_data["title"],
            "headings": [h["text"] for h in mongo_data["content"]["headings"]],
            "structure": mongo_data["study_metadata"],
            "relationships": {
                "related_pages": mongo_data.get("relationships", {}).get("internal_links", [])[:5],
                "external_references": mongo_data.get("relationships", {}).get("external_links", [])[:3]
            },
            "study_metadata": mongo_data["study_metadata"]
        }

    def search_for_llm(self, query: str, limit: int = 5) -> List[Dict]:
        """Search content for LLM context"""
        results = self.mongo_storage.search_pages(query, limit)

        llm_ready_results = []
        for result in results:
            llm_ready_results.append({
                "url": result["url"],
                "title": result["title"],
                "summary": result["content"]["text_summary"][:500],
                "content_type": result["study_metadata"]["content_type"],
                "complexity": result["study_metadata"]["complexity_score"],
                "key_topics": result["study_metadata"]["key_topics"][:5]
            })

        return llm_ready_results

    def _identify_content_type(self, data: Dict) -> str:
        """Identify content type for processing hints"""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title for word in ["documentation", "docs", "api"]):
            return "documentation"
        elif any(word in title for word in ["blog", "article", "news"]):
            return "article"
        elif any(word in text for word in ["research", "study", "analysis"]):
            return "research"
        return "general"

    def _extract_main_topics(self, data: Dict) -> List[str]:
        """Extract main topics for LLM understanding"""
        topics = set()

        # From title
        title_words = [word for word in data["metadata"]["title"].split() if len(word) > 3]
        topics.update(title_words[:3])

        # From headings
        for heading in data["metadata"]["headings"][:3]:
            heading_words = [word for word in heading["text"].split() if len(word) > 3]
            topics.update(heading_words[:2])

        return list(topics)[:5]

    def _generate_study_hints(self, extracted_data: Dict, dom_structure: Dict) -> Dict:
        """Generate study hints for LLM processing"""
        return {
            "difficulty_level": "beginner" if len(extracted_data["text_summary"]) < 2000 else "intermediate",
            "estimated_study_time": f"{len(extracted_data['text_summary'].split()) // 250} minutes",
            "content_structure": "well_structured" if len(extracted_data["metadata"]["headings"]) > 3 else "basic",
            "has_examples": "code" in extracted_data["text_summary"].lower(),
            "interactive_elements": dom_structure["statistics"]["tag_distribution"].get("form", 0) > 0
        }

    def close_connections(self):
        """Close all database connections"""
        # self.neo4j_storage.close()

# Main execution function
async def main():
    orchestrator = WebScrapingOrchestrator()

    # Example usage
    test_url = "https://en.wikipedia.org/wiki/Virat_Kohli"
    result = await orchestrator.process_url(test_url)
    print(f"Processing result: {result}")

    # Clean up
    orchestrator.close_connections()

if __name__ == "__main__":
    asyncio.run(main())
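After process_url has stored a page, the same orchestrator can serve it back in an LLM-friendly shape without re-scraping (a sketch, not part of this upload; it assumes the MongoDB instance configured in settings is running):

import asyncio
from main import WebScrapingOrchestrator

async def demo():
    orchestrator = WebScrapingOrchestrator()
    url = "https://en.wikipedia.org/wiki/Virat_Kohli"

    await orchestrator.process_url(url)            # scrape, extract, analyze, store

    page = orchestrator.get_page_for_llm(url)      # one stored page, LLM-shaped
    hits = orchestrator.search_for_llm("cricket")  # regex search over stored pages
    print(page["title"], page["study_metadata"]["content_type"], len(hits))

asyncio.run(demo())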
mongo_storage.py
ADDED
@@ -0,0 +1,143 @@
from pymongo import MongoClient
from typing import Dict, List, Optional
import datetime
from config.settings import settings

class MongoStorage:
    def __init__(self):
        self.client = MongoClient(settings.database.mongo_uri)
        self.db = self.client[settings.database.mongo_db]
        self.collection = self.db.scraped_pages
        self._create_indexes()

    def _create_indexes(self):
        """Create indexes for better query performance"""
        self.collection.create_index("url", unique=True)
        self.collection.create_index("domain")
        self.collection.create_index("timestamp")
        self.collection.create_index("content.metadata.title")

    def store_page_data(self, url: str, extracted_data: Dict, dom_structure: Dict) -> str:
        """Store complete page data optimized for LLM consumption"""
        document = {
            "url": url,
            "domain": extracted_data["metadata"]["domain"],
            "timestamp": datetime.datetime.utcnow(),
            "title": extracted_data["metadata"]["title"],
            "description": extracted_data["metadata"]["description"],

            # LLM-optimized content structure
            "content": {
                "text_summary": extracted_data["text_summary"],
                "content_blocks": extracted_data["content"],
                "headings": extracted_data["metadata"]["headings"],
                "structure_info": extracted_data["structure"]
            },

            # Relationship data
            "relationships": {
                "internal_links": [link for link in extracted_data["links"] if link["internal"]],
                "external_links": [link for link in extracted_data["links"] if not link["internal"]],
                "images": extracted_data["images"]
            },

            # DOM analysis for advanced processing
            "dom_analysis": {
                "tree_structure": dom_structure["tree"],
                "statistics": dom_structure["statistics"],
                "semantic_structure": dom_structure["semantic_structure"],
                "content_blocks": dom_structure["content_blocks"]
            },

            # Study-friendly metadata
            "study_metadata": {
                "reading_time": self._estimate_reading_time(extracted_data["text_summary"]),
                "complexity_score": self._calculate_complexity_score(extracted_data),
                "content_type": self._identify_content_type(extracted_data),
                "key_topics": self._extract_key_topics(extracted_data)
            }
        }

        # Upsert document
        result = self.collection.replace_one(
            {"url": url},
            document,
            upsert=True
        )

        return str(result.upserted_id or result.matched_count)

    def get_page_data(self, url: str) -> Optional[Dict]:
        """Retrieve page data by URL"""
        return self.collection.find_one({"url": url})

    def get_pages_by_domain(self, domain: str) -> List[Dict]:
        """Get all pages from a specific domain"""
        return list(self.collection.find({"domain": domain}))

    def search_pages(self, query: str, limit: int = 10) -> List[Dict]:
        """Search pages by content for LLM queries"""
        search_filter = {
            "$or": [
                {"title": {"$regex": query, "$options": "i"}},
                {"description": {"$regex": query, "$options": "i"}},
                {"content.text_summary": {"$regex": query, "$options": "i"}}
            ]
        }

        return list(self.collection.find(search_filter).limit(limit))

    def _estimate_reading_time(self, text: str) -> int:
        """Estimate reading time in minutes (250 words per minute)"""
        word_count = len(text.split())
        return max(1, word_count // 250)

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Calculate content complexity for LLM processing hints"""
        score = 0.0

        # Text length factor
        text_length = len(data["text_summary"])
        score += min(text_length / 1000, 5.0)

        # Structure complexity
        content_blocks = len(data["content"])
        score += min(content_blocks / 10, 3.0)

        # Link density
        total_links = len(data["links"])
        score += min(total_links / 20, 2.0)

        return round(score, 2)

    def _identify_content_type(self, data: Dict) -> str:
        """Identify content type for LLM processing strategy"""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title or word in text for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title or word in text for word in ["news", "article", "report"]):
            return "article"
        elif any(word in title or word in text for word in ["documentation", "docs", "reference"]):
            return "documentation"
        elif any(word in title or word in text for word in ["blog", "post", "opinion"]):
            return "blog_post"
        else:
            return "general"

    def _extract_key_topics(self, data: Dict) -> List[str]:
        """Extract key topics for study organization"""
        # Simple keyword extraction from headings and title
        topics = set()

        # From title
        title_words = data["metadata"]["title"].split()
        topics.update([word.lower() for word in title_words if len(word) > 3])

        # From headings
        for heading in data["metadata"]["headings"]:
            heading_words = heading["text"].split()
            topics.update([word.lower() for word in heading_words if len(word) > 3])

        return list(topics)[:10]  # Limit to top 10 topics
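search_pages relies on case-insensitive regex filters, which scan every document; if the collection grows, a MongoDB text index over the same three fields is the usual swap. A sketch (not part of this upload) using standard pymongo calls:

from pymongo import MongoClient, TEXT
from config.settings import settings

client = MongoClient(settings.database.mongo_uri)
collection = client[settings.database.mongo_db].scraped_pages

# One-time: weighted text index over the fields search_pages currently regex-scans
collection.create_index(
    [("title", TEXT), ("description", TEXT), ("content.text_summary", TEXT)],
    name="page_text_search",
    weights={"title": 5, "description": 3, "content.text_summary": 1},
)

# Query side: $text uses the index and returns a relevance score to sort by
hits = collection.find(
    {"$text": {"$search": "playwright scraping"}},
    {"score": {"$meta": "textScore"}, "title": 1, "url": 1},
).sort([("score", {"$meta": "textScore"})]).limit(5)

for doc in hits:
    print(doc["url"], doc["title"])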
neo4j_storage.py
ADDED
@@ -0,0 +1,216 @@
# from neo4j import GraphDatabase
# from typing import Dict, List
# from urllib.parse import urlparse
# from config.settings import settings

# class Neo4jStorage:
#     def __init__(self):
#         self.driver = GraphDatabase.driver(
#             settings.database.neo4j_uri,
#             auth=(settings.database.neo4j_user, settings.database.neo4j_password)
#         )
#         self._create_constraints()

#     def _create_constraints(self):
#         """Create constraints and indexes for better performance"""
#         with self.driver.session() as session:
#             try:
#                 session.run("CREATE CONSTRAINT page_url IF NOT EXISTS FOR (p:Page) REQUIRE p.url IS UNIQUE")
#                 session.run("CREATE CONSTRAINT domain_name IF NOT EXISTS FOR (d:Domain) REQUIRE d.name IS UNIQUE")
#                 session.run("CREATE INDEX page_title IF NOT EXISTS FOR (p:Page) ON (p.title)")
#             except Exception as e:
#                 pass  # Constraints might already exist

#     def store_relationships(self, url: str, extracted_data: Dict, dom_structure: Dict):
#         """Store page relationships and structure in Neo4j"""
#         with self.driver.session() as session:
#             # Create main page node
#             self._create_page_node(session, url, extracted_data)

#             # Create domain relationships
#             self._create_domain_relationships(session, url, extracted_data)

#             # Create content relationships
#             self._create_content_relationships(session, url, extracted_data)

#             # Create link relationships
#             self._create_link_relationships(session, url, extracted_data["links"])

#             # Create DOM structure relationships
#             self._create_dom_relationships(session, url, dom_structure)

#     def _create_page_node(self, session, url: str, data: Dict):
#         """Create or update page node with LLM-friendly properties"""
#         query = """
#         MERGE (p:Page {url: $url})
#         SET p.title = $title,
#             p.description = $description,
#             p.domain = $domain,
#             p.content_type = $content_type,
#             p.complexity_score = $complexity_score,
#             p.reading_time = $reading_time,
#             p.word_count = $word_count,
#             p.last_scraped = datetime()
#         """

#         session.run(query, {
#             "url": url,
#             "title": data["metadata"]["title"],
#             "description": data["metadata"]["description"],
#             "domain": data["metadata"]["domain"],
#             "content_type": self._identify_content_type(data),
#             "complexity_score": self._calculate_complexity_score(data),
#             "reading_time": len(data["text_summary"].split()) // 250,
#             "word_count": len(data["text_summary"].split())
#         })

#     def _create_domain_relationships(self, session, url: str, data: Dict):
#         """Create domain nodes and relationships"""
#         domain = data["metadata"]["domain"]

#         # Create domain node
#         session.run("""
#             MERGE (d:Domain {name: $domain})
#             SET d.last_updated = datetime()
#         """, {"domain": domain})

#         # Link page to domain
#         session.run("""
#             MATCH (p:Page {url: $url})
#             MATCH (d:Domain {name: $domain})
#             MERGE (p)-[:BELONGS_TO]->(d)
#         """, {"url": url, "domain": domain})

#     def _create_content_relationships(self, session, url: str, data: Dict):
#         """Create content structure relationships for LLM understanding"""
#         # Create topic nodes from headings
#         for i, heading in enumerate(data["metadata"]["headings"]):
#             session.run("""
#                 MATCH (p:Page {url: $url})
#                 MERGE (h:Heading {text: $text, level: $level, page_url: $url})
#                 SET h.position = $position
#                 MERGE (p)-[:HAS_HEADING]->(h)
#             """, {
#                 "url": url,
#                 "text": heading["text"],
#                 "level": heading["level"],
#                 "position": i
#             })

#         # Create content block relationships
#         for i, block in enumerate(data["content"][:10]):  # Limit for performance
#             session.run("""
#                 MATCH (p:Page {url: $url})
#                 MERGE (c:ContentBlock {text: $text, page_url: $url, position: $position})
#                 SET c.tag = $tag,
#                     c.length = $length
#                 MERGE (p)-[:HAS_CONTENT]->(c)
#             """, {
#                 "url": url,
#                 "text": block["text"][:500],  # Truncate for storage
#                 "tag": block["tag"],
#                 "length": len(block["text"]),
#                 "position": i
#             })

#     def _create_link_relationships(self, session, url: str, links: List[Dict]):
#         """Create link relationships for navigation understanding"""
#         for link in links[:20]:  # Limit for performance
#             target_url = link["url"]
#             link_text = link["text"]
#             is_internal = link["internal"]

#             # Create target page node (minimal)
#             session.run("""
#                 MERGE (target:Page {url: $target_url})
#                 SET target.discovered_via = $source_url
#             """, {"target_url": target_url, "source_url": url})

#             # Create relationship
#             relationship_type = "LINKS_TO_INTERNAL" if is_internal else "LINKS_TO_EXTERNAL"
#             session.run(f"""
#                 MATCH (source:Page {{url: $source_url}})
#                 MATCH (target:Page {{url: $target_url}})
#                 MERGE (source)-[r:{relationship_type}]->(target)
#                 SET r.link_text = $link_text,
#                     r.is_internal = $is_internal
#             """, {
#                 "source_url": url,
#                 "target_url": target_url,
#                 "link_text": link_text,
#                 "is_internal": is_internal
#             })

#     def _create_dom_relationships(self, session, url: str, dom_structure: Dict):
#         """Create DOM structure relationships for content hierarchy"""
#         # Create semantic structure nodes
#         semantic_elements = dom_structure["semantic_structure"]["semantic_elements"]
#         for tag, count in semantic_elements.items():
#             if count > 0:
#                 session.run("""
#                     MATCH (p:Page {url: $url})
#                     MERGE (s:SemanticElement {tag: $tag, page_url: $url})
#                     SET s.count = $count
#                     MERGE (p)-[:HAS_SEMANTIC_ELEMENT]->(s)
#                 """, {"url": url, "tag": tag, "count": count})

#     def get_page_relationships(self, url: str) -> Dict:
#         """Get all relationships for a page for LLM context"""
#         with self.driver.session() as session:
#             result = session.run("""
#                 MATCH (p:Page {url: $url})
#                 OPTIONAL MATCH (p)-[:LINKS_TO_INTERNAL]->(internal:Page)
#                 OPTIONAL MATCH (p)-[:LINKS_TO_EXTERNAL]->(external:Page)
#                 OPTIONAL MATCH (p)-[:HAS_HEADING]->(h:Heading)
#                 RETURN p, collect(DISTINCT internal.url) as internal_links,
#                        collect(DISTINCT external.url) as external_links,
#                        collect(DISTINCT {text: h.text, level: h.level}) as headings
#             """, {"url": url})

#             record = result.single()
#             if record:
#                 return {
#                     "page": dict(record["p"]),
#                     "internal_links": record["internal_links"],
#                     "external_links": record["external_links"],
#                     "headings": record["headings"]
#                 }
#             return {}

#     def get_related_pages(self, url: str, limit: int = 5) -> List[Dict]:
#         """Find related pages for LLM context and study suggestions"""
#         with self.driver.session() as session:
#             result = session.run("""
#                 MATCH (p:Page {url: $url})
#                 MATCH (p)-[:BELONGS_TO]->(d:Domain)
#                 MATCH (related:Page)-[:BELONGS_TO]->(d)
#                 WHERE related.url <> $url
#                 RETURN related.url as url, related.title as title,
#                        related.content_type as content_type,
#                        related.complexity_score as complexity_score
#                 ORDER BY related.complexity_score DESC
#                 LIMIT $limit
#             """, {"url": url, "limit": limit})

#             return [dict(record) for record in result]

#     def _identify_content_type(self, data: Dict) -> str:
#         """Identify content type for graph relationships"""
#         title = data["metadata"]["title"].lower()
#         if "tutorial" in title or "guide" in title:
#             return "tutorial"
#         elif "documentation" in title or "docs" in title:
#             return "documentation"
#         elif "blog" in title or "article" in title:
#             return "article"
#         return "general"

#     def _calculate_complexity_score(self, data: Dict) -> float:
#         """Calculate complexity score for relationship weighting"""
#         text_length = len(data["text_summary"])
#         content_blocks = len(data["content"])
#         return min(text_length / 1000 + content_blocks / 10, 10.0)

#     def close(self):
#         """Close database connection"""
#         self.driver.close()
requirements.txt
ADDED
@@ -0,0 +1,67 @@
aiofiles==24.1.0
annotated-types==0.7.0
anyio==4.9.0
beautifulsoup4==4.13.4
bs4==0.0.2
certifi==2025.4.26
charset-normalizer==3.4.2
click==8.2.1
colorama==0.4.6
distro==1.9.0
dnspython==2.7.0
dotenv==0.9.9
fastapi==0.115.12
ffmpy==0.6.0
filelock==3.18.0
fsspec==2025.5.1
gradio==5.33.0
gradio_client==1.10.2
greenlet==3.2.3
groovy==0.1.2
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.32.4
idna==3.10
Jinja2==3.1.6
jiter==0.10.0
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
numpy==2.3.0
openai==1.84.0
orjson==3.10.18
packaging==25.0
pandas==2.3.0
pillow==11.2.1
playwright==1.52.0
pydantic==2.11.5
pydantic_core==2.33.2
pydub==0.25.1
pyee==13.0.0
Pygments==2.19.1
pymongo==4.13.0
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
python-multipart==0.0.20
pytz==2025.2
PyYAML==6.0.2
requests==2.32.3
rich==14.0.0
ruff==0.11.13
safehttpx==0.1.6
semantic-version==2.10.0
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
soupsieve==2.7
starlette==0.46.2
tomlkit==0.13.3
tqdm==4.67.1
typer==0.16.0
typing-inspection==0.4.1
typing_extensions==4.14.0
tzdata==2025.2
urllib3==2.4.0
uvicorn==0.34.3
websockets==15.0.1
settings.py
ADDED
@@ -0,0 +1,57 @@
import os
from pydantic import BaseModel
from typing import Dict, List
from dotenv import load_dotenv


load_dotenv()

class DatabaseConfig(BaseModel):
    mongo_uri: str = os.getenv("mongo_uri")
    mongo_db: str = os.getenv("mongo_db")
    neo4j_uri: str = os.getenv("neo4j_uri")
    neo4j_user: str = os.getenv("neo4j_user")
    neo4j_password: str = os.getenv("neo4j_password")

class ScrapingConfig(BaseModel):
    timeout: int = 30000
    wait_for_selector: str = "body"
    headless: bool = True
    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    max_retries: int = 3
    delay_between_requests: float = 1.0

class ExtractionConfig(BaseModel):
    content_selectors: List[str] = [
        "article", "main", ".content", "#content",
        ".post", ".article-body", "p", "h1", "h2", "h3"
    ]
    ignore_selectors: List[str] = [
        "script", "style", "nav", "footer", "header",
        ".advertisement", ".ads", ".sidebar"
    ]
    min_text_length: int = 50
    extract_images: bool = True
    extract_links: bool = True

class Settings:
    def __init__(self):
        self.database = DatabaseConfig()
        self.scraping = ScrapingConfig()
        self.extraction = ExtractionConfig()

    def update_from_env(self):
        # Update from environment variables if available
        if os.getenv("mongo_uri"):
            self.database.mongo_uri = os.getenv("mongo_uri")
        if os.getenv("mongo_db"):
            self.database.mongo_db = os.getenv("mongo_db")
        if os.getenv("neo4j_uri"):
            self.database.neo4j_uri = os.getenv("neo4j_uri")
        if os.getenv("neo4j_user"):
            self.database.neo4j_user = os.getenv("neo4j_user")
        if os.getenv("neo4j_password"):
            self.database.neo4j_password = os.getenv("neo4j_password")

settings = Settings()
settings.update_from_env()
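Settings reads lowercase variable names from the environment (via load_dotenv), and chat_agent.py separately reads NEBIUS_API_KEY, so the .env file excluded by .gitignore is expected to provide roughly the following keys. The values below are placeholders, not real endpoints or credentials, and the Neo4j entries only matter if neo4j_storage.py is re-enabled.

mongo_uri=mongodb://localhost:27017
mongo_db=scraper_db
NEBIUS_API_KEY=<your-nebius-api-key>
# Only needed if storage/neo4j_storage.py is uncommented:
neo4j_uri=bolt://localhost:7687
neo4j_user=neo4j
neo4j_password=<your-password>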