Spaces: Running

Mokshith Salian committed
Commit · f937fdb
Parent(s): 644f29b
initial commit

Browse files
- app.py +128 -0
- llm_processor.py +34 -0
- privacy_manager.py +65 -0
- requirements.txt +7 -0
- secure_scraper.py +88 -0
app.py
ADDED
@@ -0,0 +1,128 @@
import streamlit as st
import json
import logging
import os
import time

# Import our custom classes
from secure_scraper import SecureScraper
from llm_processor import LLMProcessor

# Set up logging
logging.basicConfig(level=logging.INFO)

def main():
    st.set_page_config(
        page_title="LLM Web Scraper",
        page_icon="🕸️",
        layout="wide",
    )

    st.title("🕸️ Free LLM Web Scraper")
    st.write("Scrape web content with privacy protection and open-source LLM processing")

    # Configuration section
    with st.sidebar:
        st.header("Configuration")

        st.subheader("LLM Model Selection")
        model_option = st.selectbox(
            "Choose LLM Model",
            [
                "microsoft/phi-2 (fastest, 2.7B)",
                "google/gemma-2b (balanced)",
                "mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)"
            ],
            index=0
        )

        # Convert selection to model name
        model_name = model_option.split(" ")[0]

        st.subheader("Privacy Settings")
        use_proxy = st.checkbox("Use Proxy Rotation", value=False)
        use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)

    # Input section
    st.header("Scraping Target")
    url = st.text_input("Enter the URL to scrape", placeholder="https://oceanofgames.com/")

    with st.expander("Advanced Scraping Options"):
        css_selectors_text = st.text_area(
            "CSS Selectors (JSON format)",
            placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}'
        )

    # Parse CSS selectors
    css_selectors = None
    if css_selectors_text:
        try:
            css_selectors = json.loads(css_selectors_text)
        except json.JSONDecodeError:
            st.error("Invalid JSON for CSS selectors")

    st.header("LLM Processing")
    llm_instruction = st.text_area(
        "What do you want the LLM to do with the scraped data?",
        placeholder="Extract the main product features and summarize them in bullet points"
    )

    # Initialize on button click
    if st.button("Scrape and Process"):
        if not url:
            st.error("Please enter a URL to scrape")
            return

        # Show progress
        with st.spinner("Initializing scraper..."):
            proxies = None
            if use_proxy:
                st.warning("Using public proxies - in a production system, you'd want to use paid proxies")
                # In a real app, we'd use better proxies or load from a file
                proxies = [
                    "http://public-proxy1.example.com:8080",
                    "http://public-proxy2.example.com:8080"
                ]

            scraper = SecureScraper(proxy_list=proxies if use_proxy else None)

        # Perform scraping
        with st.spinner("Scraping website with privacy protection..."):
            result = scraper.scrape_url(url, css_selectors)

            if result['status'] == 'error':
                st.error(f"Scraping failed: {result['message']}")
                return

            st.success("Scraping completed successfully!")

            # Display privacy measures used
            st.subheader("Privacy Measures Used")
            st.json(result['privacy'])

            # Display raw scraped data
            with st.expander("Raw Scraped Data"):
                st.json(result['data'])

        # Process with LLM
        with st.spinner(f"Processing with {model_name}..."):
            try:
                llm = LLMProcessor(model_name=model_name)

                # Prepare data for LLM (convert to string if it's a dict)
                scraped_data_str = json.dumps(result['data'], indent=2) if isinstance(result['data'], dict) else result['data']

                processed_result = llm.process_data(
                    scraped_data_str,
                    llm_instruction if llm_instruction else "Summarize this information"
                )

                st.subheader("LLM Processing Result")
                st.write(processed_result)

            except Exception as e:
                st.error(f"Error in LLM processing: {str(e)}")
                st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues")

if __name__ == "__main__":
    main()
llm_processor.py
ADDED
@@ -0,0 +1,34 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

class LLMProcessor:
    # Default to the standard Hugging Face weights; GGUF checkpoints (e.g. the
    # TheBloke/...-GGUF repos) cannot be loaded through this pipeline call.
    def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.2"):
        # Option 1: Use HuggingFace pipeline for simplicity
        self.pipe = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )

    def process_data(self, scraped_data, task_instruction):
        # Create prompt
        prompt = f"""
Task: {task_instruction}

Data:
{scraped_data}

Please process the above data according to the task instruction.
"""

        # Generate response (max_length caps prompt + completion tokens)
        response = self.pipe(
            prompt,
            max_length=2048,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

        return response[0]['generated_text']
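For a quick sanity check outside Streamlit, LLMProcessor can be driven directly. The sketch below is illustrative rather than part of the commit: the sample payload and instruction are made up, and it assumes the requirements.txt dependencies are installed and the chosen model fits in memory.

# Hypothetical usage sketch for LLMProcessor (not part of the commit).
import json

from llm_processor import LLMProcessor

if __name__ == "__main__":
    # A small model keeps the memory footprint manageable
    llm = LLMProcessor(model_name="microsoft/phi-2")

    # Fabricated payload standing in for scraped data
    sample_data = json.dumps({
        "title": "Example Product",
        "description": "A lightweight widget with a two-year warranty."
    }, indent=2)

    summary = llm.process_data(sample_data, "Summarize this information in one sentence.")
    print(summary)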
privacy_manager.py
ADDED
@@ -0,0 +1,65 @@
import requests
from fake_useragent import UserAgent
import random
import time
import os
from bs4 import BeautifulSoup
import logging

class PrivacyManager:
    def __init__(self, proxy_list=None):
        # Initialize User-Agent rotator
        self.ua = UserAgent()

        # Initialize proxies
        self.proxies = []
        if proxy_list:
            self.proxies = proxy_list
        else:
            # No proxies by default; requests go direct. Placeholder entries such as
            # "http://public-proxy1.example.com:8080" would make every request fail,
            # so pass real proxies via proxy_list when proxy rotation is needed.
            self.proxies = []

        logging.info(f"Initialized PrivacyManager with {len(self.proxies)} proxies")

    def get_random_proxy(self):
        if not self.proxies:
            return None
        return random.choice(self.proxies)

    def get_random_user_agent(self):
        return self.ua.random

    def handle_captcha(self, response):
        """
        Basic CAPTCHA detection - in a real implementation, you'd need
        more sophisticated handling or a dedicated service
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        captcha_indicators = ['captcha', 'CAPTCHA', 'robot', 'verify']

        for indicator in captcha_indicators:
            if indicator in response.text:
                logging.warning(f"CAPTCHA detected: {indicator} found on page")
                return True

        return False

    def get_request_params(self):
        # Random delay to avoid detection
        time.sleep(random.uniform(1, 3))

        params = {
            'headers': {'User-Agent': self.get_random_user_agent()}
        }

        proxy = self.get_random_proxy()
        if proxy:
            params['proxies'] = {
                'http': proxy,
                'https': proxy
            }

        return params
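The dict returned by get_request_params() is shaped for direct use with requests. The following is a minimal sketch, not part of the commit; the target URL is a placeholder, and with no proxies configured the params carry only a rotated User-Agent header.

# Illustrative sketch of PrivacyManager in isolation (not part of the commit).
import requests

from privacy_manager import PrivacyManager

pm = PrivacyManager(proxy_list=None)   # no proxies: requests go direct
params = pm.get_request_params()       # rotated User-Agent plus a small random delay

resp = requests.get(
    "https://example.com/",
    headers=params["headers"],
    proxies=params.get("proxies"),     # None when no proxy is configured
    timeout=10,
)
print(resp.status_code, params["headers"]["User-Agent"][:40])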
requirements.txt
ADDED
@@ -0,0 +1,7 @@
streamlit==1.30.0
torch==2.1.1
transformers==4.36.2
requests==2.31.0
fake-useragent==1.4.0
beautifulsoup4==4.12.2
accelerate==0.25.0
secure_scraper.py
ADDED
@@ -0,0 +1,88 @@
import random
import requests
from bs4 import BeautifulSoup
import logging
from privacy_manager import PrivacyManager
import time
import json

class SecureScraper:
    def __init__(self, proxy_list=None):
        self.privacy_manager = PrivacyManager(proxy_list)
        self.session = requests.Session()
        logging.basicConfig(level=logging.INFO)

    def scrape_url(self, url, css_selectors=None):
        """
        Scrape a URL with privacy protection measures

        Args:
            url: URL to scrape
            css_selectors: Dict of elements to extract, e.g. {'title': 'h1', 'content': '.main-text'}
        """
        # Get privacy parameters
        params = self.privacy_manager.get_request_params()
        max_retries = 3
        current_retry = 0

        while current_retry < max_retries:
            try:
                # Configure request with privacy measures
                headers = params.get('headers', {})
                proxies = params.get('proxies', None)

                # Log attempt details (but mask proxy details for security)
                proxy_log = "using proxy" if proxies else "without proxy"
                logging.info(f"Scraping {url} (Attempt {current_retry+1}/{max_retries}) {proxy_log}")

                # Make the request
                response = self.session.get(url, headers=headers, proxies=proxies, timeout=10)
                response.raise_for_status()

                # Check for CAPTCHA
                if self.privacy_manager.handle_captcha(response):
                    logging.warning("CAPTCHA detected, retrying with new identity")
                    params = self.privacy_manager.get_request_params()  # Get new privacy params
                    current_retry += 1
                    time.sleep(random.uniform(3, 7))  # Longer delay after CAPTCHA
                    continue

                # Extract content
                soup = BeautifulSoup(response.content, 'html.parser')

                # If no selectors provided, return general page info
                if not css_selectors:
                    result = {
                        'title': soup.title.string if soup.title else 'No title found',
                        'text': soup.get_text(strip=True)[:10000],  # Limit text size
                        'links': [a.get('href') for a in soup.find_all('a', href=True)][:20]  # Limit links
                    }
                else:
                    # Extract requested elements
                    result = {}
                    for key, selector in css_selectors.items():
                        elements = soup.select(selector)
                        if elements:
                            # If multiple elements match, create a list
                            if len(elements) > 1:
                                result[key] = [elem.get_text(strip=True) for elem in elements]
                            else:
                                result[key] = elements[0].get_text(strip=True)
                        else:
                            result[key] = f"No match for selector: {selector}"

                return {
                    'status': 'success',
                    'data': result,
                    'privacy': {
                        'user_agent_type': headers.get('User-Agent', 'Unknown')[:30] + '...'  # Truncate for privacy
                    }
                }
            except requests.exceptions.RequestException as e:
                logging.error(f"Request failed: {str(e)}")
                current_retry += 1
                time.sleep(random.uniform(2, 5))  # Random backoff before retrying
                params = self.privacy_manager.get_request_params()  # Get new privacy params

        # If we've exhausted retries
        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}
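Putting the pieces together outside the Streamlit UI, a hypothetical end-to-end run might look like the sketch below. It is not part of the commit: the URL and CSS selectors are examples, and microsoft/phi-2 is chosen only because it is the smallest model offered in app.py.

# Hypothetical end-to-end sketch combining SecureScraper and LLMProcessor.
import json

from secure_scraper import SecureScraper
from llm_processor import LLMProcessor

scraper = SecureScraper(proxy_list=None)   # direct requests with a rotated User-Agent
result = scraper.scrape_url(
    "https://example.com/",
    css_selectors={"title": "h1", "paragraphs": "p"},
)

if result["status"] == "success":
    data_str = json.dumps(result["data"], indent=2)
    llm = LLMProcessor(model_name="microsoft/phi-2")
    print(llm.process_data(data_str, "List the key points as bullets."))
else:
    print("Scraping failed:", result["message"])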