Rename services.py to web_scraper.py
Browse files- services.py +0 -111
 - web_scraper.py +237 -0
 
    	
        services.py
    DELETED
    
    | 
         @@ -1,111 +0,0 @@ 
     | 
|
| 1 | 
         
            -
            # /services.py
         
     | 
| 2 | 
         
            -
            """ Manages interactions with all external LLM and search APIs. """
         
     | 
| 3 | 
         
            -
             
     | 
| 4 | 
         
            -
            import os
         
     | 
| 5 | 
         
            -
            import logging
         
     | 
| 6 | 
         
            -
            from typing import Dict, Any, Generator, List
         
     | 
| 7 | 
         
            -
             
     | 
| 8 | 
         
            -
            from dotenv import load_dotenv
         
     | 
| 9 | 
         
            -
            from huggingface_hub import InferenceClient
         
     | 
| 10 | 
         
            -
            from tavily import TavilyClient
         
     | 
| 11 | 
         
            -
            from groq import Groq
         
     | 
| 12 | 
         
            -
            import fireworks.client as Fireworks
         
     | 
| 13 | 
         
            -
            import openai
         
     | 
| 14 | 
         
            -
            import google.generativeai as genai
         
     | 
| 15 | 
         
            -
             
     | 
| 16 | 
         
            -
            logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
         
     | 
| 17 | 
         
            -
            load_dotenv()
         
     | 
| 18 | 
         
            -
             
     | 
| 19 | 
         
            -
            # --- API Keys from .env ---
         
     | 
| 20 | 
         
            -
            HF_TOKEN = os.getenv("HF_TOKEN")
         
     | 
| 21 | 
         
            -
            TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
         
     | 
| 22 | 
         
            -
            GROQ_API_KEY = os.getenv("GROQ_API_KEY")
         
     | 
| 23 | 
         
            -
            FIREWORKS_API_KEY = os.getenv("FIREWORKS_API_KEY")
         
     | 
| 24 | 
         
            -
            OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
         
     | 
| 25 | 
         
            -
            GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
         
     | 
| 26 | 
         
            -
            DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
         
     | 
| 27 | 
         
            -
             
     | 
| 28 | 
         
            -
            # Chat history type: a list of message dicts. The code in this file reads
            # the 'role' and 'content' keys of each message.
            Messages = List[Dict[str, Any]]
         
     | 
| 29 | 
         
            -
             
     | 
| 30 | 
         
            -
            class LLMService:
                """A multi-provider wrapper for LLM Inference APIs.

                One client is created per provider whose API key is set in the
                environment; providers without a key get ``None`` and report a
                configuration error when a request is dispatched to them.
                """

                def __init__(self):
                    """Create the provider clients for which an API key is configured."""
                    self.hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None
                    self.groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
                    self.openai_client = openai.OpenAI(api_key=OPENAI_API_KEY) if OPENAI_API_KEY else None

                    # DeepSeek is reached through the OpenAI client pointed at its own base URL.
                    if DEEPSEEK_API_KEY:
                        self.deepseek_client = openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com/v1")
                    else:
                        self.deepseek_client = None

                    # The Fireworks SDK is configured at module level, so the "client"
                    # stored here is the module itself.
                    if FIREWORKS_API_KEY:
                        Fireworks.api_key = FIREWORKS_API_KEY
                        self.fireworks_client = Fireworks
                    else:
                        self.fireworks_client = None

                    if GEMINI_API_KEY:
                        genai.configure(api_key=GEMINI_API_KEY)
                        self.gemini_model = genai.GenerativeModel('gemini-1.5-pro-latest')
                    else:
                        self.gemini_model = None

                @staticmethod
                def _delta_text(chunk) -> str:
                    """Return the text carried by an OpenAI-style stream chunk, or ''."""
                    if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
                        return chunk.choices[0].delta.content
                    return ""

                def _prepare_messages_for_gemini(self, messages: "Messages") -> List[Dict[str, Any]]:
                    """Convert chat messages to Gemini's ``{'role', 'parts'}`` format.

                    System messages are dropped (the caller merges the system prompt
                    separately), 'assistant' becomes 'model', and every other role is
                    sent as 'user'. The input list is not modified.
                    """
                    gemini_messages = []
                    for msg in messages:
                        if msg['role'] == 'system':
                            continue  # Gemini doesn't use a system role in this way
                        role = 'model' if msg['role'] == 'assistant' else 'user'
                        gemini_messages.append({'role': role, 'parts': [msg['content']]})
                    return gemini_messages

                def generate_code_stream(self, model_id: str, messages: "Messages", max_tokens: int = 8192) -> Generator[str, None, None]:
                    """Stream a completion from the provider encoded in ``model_id``.

                    Args:
                        model_id: ``"<provider>/<model name>"``; only the first '/' is
                            treated as the separator.
                        messages: Chat history as a list of role/content dicts.
                        max_tokens: Upper bound on generated tokens, forwarded to the
                            provider.

                    Yields:
                        Text fragments as they arrive. Failures are never raised to the
                        caller: they are logged and a single ``"Error ..."`` string is
                        yielded instead.
                    """
                    provider, separator, model_name = model_id.partition('/')
                    if not separator or not provider or not model_name:
                        # A malformed id previously escaped as a bare unpacking
                        # ValueError; report it the same way as every other failure.
                        logging.error("Invalid model id %r: expected '<provider>/<model>'.", model_id)
                        yield f"Error: Invalid model id {model_id!r}: expected '<provider>/<model>'."
                        return

                    logging.info("Dispatching to provider: %s for model: %s", provider, model_name)

                    try:
                        if provider in ('openai', 'groq', 'deepseek', 'fireworks'):
                            if provider == 'fireworks':
                                # The Fireworks module exposes ChatCompletion.create directly.
                                create = self.fireworks_client.ChatCompletion.create if self.fireworks_client else None
                            else:
                                client = {
                                    'openai': self.openai_client,
                                    'groq': self.groq_client,
                                    'deepseek': self.deepseek_client,
                                }[provider]
                                create = client.chat.completions.create if client else None
                            if create is None:
                                raise ValueError(f"{provider.capitalize()} API key not configured.")

                            stream = create(model=model_name, messages=messages, stream=True, max_tokens=max_tokens)
                            for chunk in stream:
                                text = self._delta_text(chunk)
                                if text:
                                    yield text

                        elif provider == 'gemini':
                            if not self.gemini_model:
                                raise ValueError("Gemini API key not configured.")
                            # NOTE(review): the model fixed in __init__ is always used here;
                            # the model name in model_id is ignored for Gemini.
                            system_prompt = next((msg['content'] for msg in messages if msg['role'] == 'system'), "")
                            gemini_messages = self._prepare_messages_for_gemini(messages)
                            # Prepend system prompt to first user message for Gemini
                            if system_prompt and gemini_messages and gemini_messages[0]['role'] == 'user':
                                gemini_messages[0]['parts'][0] = f"{system_prompt}\n\n{gemini_messages[0]['parts'][0]}"
                            # Forward max_tokens so the limit applies to Gemini as it
                            # does to the other providers.
                            stream = self.gemini_model.generate_content(
                                gemini_messages,
                                generation_config={'max_output_tokens': max_tokens},
                                stream=True,
                            )
                            for chunk in stream:
                                yield chunk.text

                        elif provider == 'huggingface':
                            if not self.hf_client:
                                raise ValueError("Hugging Face API token not configured.")
                            stream = self.hf_client.chat_completion(model=model_name, messages=messages, stream=True, max_tokens=max_tokens)
                            for chunk in stream:
                                text = self._delta_text(chunk)
                                if text:
                                    yield text
                        else:
                            raise ValueError(f"Unknown provider: {provider}")
                    except Exception as e:
                        # Boundary handler: surface the failure in-band so a consumer
                        # of the stream sees it as text.
                        logging.error("LLM API Error with provider %s: %s", provider, e)
                        yield f"Error from {provider.capitalize()}: {str(e)}"
         
     | 
| 97 | 
         
            -
             
     | 
| 98 | 
         
            -
            class SearchService:
                """Wrapper around the Tavily client that returns search results as text."""

                def __init__(self, api_key: str = TAVILY_API_KEY):
                    """Create the Tavily client, or disable search when no key is given.

                    Args:
                        api_key: Tavily API key. Defaults to the module-level
                            TAVILY_API_KEY; a falsy value leaves the service disabled.
                    """
                    self.client = TavilyClient(api_key=api_key) if api_key else None
                    if not self.client:
                        logging.warning("TAVILY_API_KEY not set. Web search will be disabled.")

                def is_available(self) -> bool:
                    """Return True when a Tavily client was created."""
                    return self.client is not None

                def search(self, query: str, max_results: int = 5) -> str:
                    """Run a web search and format the hits as one text block.

                    Args:
                        query: Search query passed to Tavily.
                        max_results: Requested number of results, clamped to 1..10.

                    Returns:
                        A "Web Search Results:" block listing the title, URL and
                        content of each hit, or a message string when search is
                        disabled or the request fails. Exceptions are not raised.
                    """
                    if not self.is_available():
                        return "Web search is not available."
                    try:
                        clamped = min(max(1, max_results), 10)
                        response = self.client.search(query, search_depth="advanced", max_results=clamped)
                        entries = [
                            f"Title: {res.get('title', 'N/A')}\nURL: {res.get('url', 'N/A')}\nContent: {res.get('content', 'N/A')}"
                            for res in response.get('results', [])
                        ]
                        return "Web Search Results:\n\n" + "\n---\n".join(entries)
                    except Exception as e:
                        # Log the failure (as LLMService does) before reporting it
                        # in-band, so it is not silently swallowed.
                        logging.error("Web search failed: %s", e)
                        return f"Search error: {str(e)}"
         
     | 
| 109 | 
         
            -
             
     | 
| 110 | 
         
            -
            # Module-level LLMService instance, constructed when this module is imported.
            llm_service = LLMService()
         
     | 
| 111 | 
         
            -
            # Module-level SearchService instance, constructed when this module is imported.
            search_service = SearchService()
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
    	
        web_scraper.py
    ADDED
    
    | 
         @@ -0,0 +1,237 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            def extract_text_from_image(image_path):
                """Extract text from an image file using Tesseract OCR.

                The image is read with OpenCV, converted to grayscale and binarized
                with Otsu's threshold before being passed to Tesseract (``--psm 6``).

                Args:
                    image_path: Path of the image file to read.

                Returns:
                    The recognized text with surrounding whitespace removed,
                    "No text found in image" when nothing was recognized, or an
                    "Error..." string describing the failure. Exceptions are not raised.
                """
                try:
                    try:
                        pytesseract.get_tesseract_version()
                    except pytesseract.TesseractNotFoundError:
                        # Only a missing Tesseract binary gets the install hint; any
                        # other failure is reported accurately by the outer handler.
                        return "Error: Tesseract OCR is not installed. Please install Tesseract to extract text from images. See install_tesseract.md for instructions."
                    image = cv2.imread(image_path)
                    if image is None:
                        return "Error: Could not read image file"
                    # imread returns BGR; convert straight to grayscale instead of
                    # going through an intermediate RGB copy.
                    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
                    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
                    text = pytesseract.image_to_string(binary, config='--psm 6').strip()
                    return text or "No text found in image"
                except Exception as e:
                    return f"Error extracting text from image: {e}"
         
     | 
| 18 | 
         
            +
             
     | 
| 19 | 
         
            +
            def extract_text_from_file(file_path):
                """Extract the text content of a file, chosen by its extension.

                Supported: ``.pdf`` (PyPDF2), ``.txt`` / ``.md`` / ``.csv`` (read as
                UTF-8), ``.docx`` (paragraph text via python-docx) and common image
                formats (OCR via ``extract_text_from_image``).

                Args:
                    file_path: Path of the file to read; falsy values are accepted.

                Returns:
                    The extracted text, ``""`` for a falsy path or an unsupported
                    extension, or an "Error extracting text: ..." string when reading
                    fails. Exceptions are not raised.
                """
                if not file_path:
                    return ""
                plain_text_exts = (".txt", ".md", ".csv")
                image_exts = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp")
                ext = os.path.splitext(file_path)[1].lower()
                try:
                    if ext == ".pdf":
                        with open(file_path, "rb") as f:
                            reader = PyPDF2.PdfReader(f)
                            # extract_text() may return None for a page; treat it as empty.
                            return "\n".join(page.extract_text() or "" for page in reader.pages)
                    if ext in plain_text_exts:
                        # errors="replace" keeps the readable text when a few bytes are
                        # not valid UTF-8 instead of failing the whole file.
                        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                            return f.read()
                    if ext == ".docx":
                        doc = docx.Document(file_path)
                        return "\n".join(para.text for para in doc.paragraphs)
                    if ext in image_exts:
                        return extract_text_from_image(file_path)
                    return ""
                except Exception as e:
                    return f"Error extracting text: {e}"
         
     | 
| 44 | 
         
            +
             
     | 
| 45 | 
         
            +
            def extract_website_content(url: str) -> str:
         
     | 
| 46 | 
         
            +
                """Extract HTML code and content from a website URL"""
         
     | 
| 47 | 
         
            +
                try:
         
     | 
| 48 | 
         
            +
                    parsed_url=urlparse(url)
         
     | 
| 49 | 
         
            +
                    if not parsed_url.scheme:
         
     | 
| 50 | 
         
            +
                        url="https://"+url
         
     | 
| 51 | 
         
            +
                        parsed_url=urlparse(url)
         
     | 
| 52 | 
         
            +
                    if not parsed_url.netloc:
         
     | 
| 53 | 
         
            +
                        return "Error: Invalid URL provided"
         
     | 
| 54 | 
         
            +
                    headers={
         
     | 
| 55 | 
         
            +
                        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
         
     | 
| 56 | 
         
            +
                        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
         
     | 
| 57 | 
         
            +
                        'Accept-Language':'en-US,en;q=0.9',
         
     | 
| 58 | 
         
            +
                        'Accept-Encoding':'gzip, deflate, br',
         
     | 
| 59 | 
         
            +
                        'DNT':'1','Connection':'keep-alive','Upgrade-Insecure-Requests':'1',
         
     | 
| 60 | 
         
            +
                        'Sec-Fetch-Dest':'document','Sec-Fetch-Mode':'navigate','Sec-Fetch-Site':'none','Sec-Fetch-User':'?1','Cache-Control':'max-age=0'
         
     | 
| 61 | 
         
            +
                    }
         
     | 
| 62 | 
         
            +
                    session=requests.Session()
         
     | 
| 63 | 
         
            +
                    session.headers.update(headers)
         
     | 
| 64 | 
         
            +
                    max_retries=3
         
     | 
| 65 | 
         
            +
                    for attempt in range(max_retries):
         
     | 
| 66 | 
         
            +
                        try:
         
     | 
| 67 | 
         
            +
                            response=session.get(url,timeout=15,allow_redirects=True)
         
     | 
| 68 | 
         
            +
                            response.raise_for_status()
         
     | 
| 69 | 
         
            +
                            break
         
     | 
| 70 | 
         
            +
                        except requests.exceptions.HTTPError as e:
         
     | 
| 71 | 
         
            +
                            if e.response.status_code==403 and attempt<max_retries-1:
         
     | 
| 72 | 
         
            +
                                session.headers['User-Agent']='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
         
     | 
| 73 | 
         
            +
                                continue
         
     | 
| 74 | 
         
            +
                            else:
         
     | 
| 75 | 
         
            +
                                raise
         
     | 
| 76 | 
         
            +
                    try:
         
     | 
| 77 | 
         
            +
                        response.encoding=response.apparent_encoding
         
     | 
| 78 | 
         
            +
                        raw_html=response.text
         
     | 
| 79 | 
         
            +
                    except:
         
     | 
| 80 | 
         
            +
                        raw_html=response.content.decode('utf-8',errors='ignore')
         
     | 
| 81 | 
         
            +
                    if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'):
         
     | 
| 82 | 
         
            +
                        try:
         
     | 
| 83 | 
         
            +
                            raw_html=response.content.decode('latin-1',errors='ignore')
         
     | 
| 84 | 
         
            +
                        except:
         
     | 
| 85 | 
         
            +
                            try:
         
     | 
| 86 | 
         
            +
                                raw_html=response.content.decode('utf-8',errors='ignore')
         
     | 
| 87 | 
         
            +
                            except:
         
     | 
| 88 | 
         
            +
                                raw_html=response.content.decode('cp1252',errors='ignore')
         
     | 
| 89 | 
         
            +
                    soup=BeautifulSoup(raw_html,'html.parser')
         
     | 
| 90 | 
         
            +
                    title=soup.find('title')
         
     | 
| 91 | 
         
            +
                    title_text=title.get_text().strip() if title else "No title found"
         
     | 
| 92 | 
         
            +
                    meta_desc=soup.find('meta',attrs={'name':'description'})
         
     | 
| 93 | 
         
            +
                    description=meta_desc.get('content','') if meta_desc else ""
         
     | 
| 94 | 
         
            +
                    content_sections=[]
         
     | 
| 95 | 
         
            +
                    main_selectors=['main','article','.content','.main-content','.post-content','#content','#main','.entry-content','.post-body']
         
     | 
| 96 | 
         
            +
                    for selector in main_selectors:
         
     | 
| 97 | 
         
            +
                        elements=soup.select(selector)
         
     | 
| 98 | 
         
            +
                        for element in elements:
         
     | 
| 99 | 
         
            +
                            text=element.get_text().strip()
         
     | 
| 100 | 
         
            +
                            if len(text)>100:
         
     | 
| 101 | 
         
            +
                                content_sections.append(text)
         
     | 
| 102 | 
         
            +
                    nav_links=[]
         
     | 
| 103 | 
         
            +
                    nav_elements=soup.find_all(['nav','header'])
         
     | 
| 104 | 
         
            +
                    for nav in nav_elements:
         
     | 
| 105 | 
         
            +
                        links=nav.find_all('a')
         
     | 
| 106 | 
         
            +
                        for link in links:
         
     | 
| 107 | 
         
            +
                            link_text=link.get_text().strip()
         
     | 
| 108 | 
         
            +
                            link_href=link.get('href','')
         
     | 
| 109 | 
         
            +
                            if link_text and link_href:
         
     | 
| 110 | 
         
            +
                                nav_links.append(f"{link_text}: {link_href}")
         
     | 
| 111 | 
         
            +
                    img_elements=soup.find_all('img')
         
     | 
| 112 | 
         
            +
                    for img in img_elements:
         
     | 
| 113 | 
         
            +
                        src=img.get('src','')
         
     | 
| 114 | 
         
            +
                        if src:
         
     | 
| 115 | 
         
            +
                            if src.startswith('//'):
         
     | 
| 116 | 
         
            +
                                absolute_src='https:'+src
         
     | 
| 117 | 
         
            +
                                img['src']=absolute_src
         
     | 
| 118 | 
         
            +
                            elif src.startswith('/'):
         
     | 
| 119 | 
         
            +
                                absolute_src=urljoin(url,src)
         
     | 
| 120 | 
         
            +
                                img['src']=absolute_src
         
     | 
| 121 | 
         
            +
                            elif not src.startswith(('http://','https://')):
         
     | 
| 122 | 
         
            +
                                absolute_src=urljoin(url,src)
         
     | 
| 123 | 
         
            +
                                img['src']=absolute_src
         
     | 
| 124 | 
         
            +
                            data_src=img.get('data-src','')
         
     | 
| 125 | 
         
            +
                            if data_src and not src:
         
     | 
| 126 | 
         
            +
                                if data_src.startswith('//'):
         
     | 
| 127 | 
         
            +
                                    absolute_data_src='https:'+data_src
         
     | 
| 128 | 
         
            +
                                    img['src']=absolute_data_src
         
     | 
| 129 | 
         
            +
                                elif data_src.startswith('/'):
         
     | 
| 130 | 
         
            +
                                    absolute_data_src=urljoin(url,data_src)
         
     | 
| 131 | 
         
            +
                                    img['src']=absolute_data_src
         
     | 
| 132 | 
         
            +
                                elif not data_src.startswith(('http://','https://')):
         
     | 
| 133 | 
         
            +
                                    absolute_data_src=urljoin(url,data_src)
         
     | 
| 134 | 
         
            +
                                    img['src']=absolute_data_src
         
     | 
| 135 | 
         
            +
                                else:
         
     | 
| 136 | 
         
            +
                                    img['src']=data_src
         
     | 
| 137 | 
         
            +
                    elements_with_style=soup.find_all(attrs={'style':True})
         
     | 
| 138 | 
         
            +
                    for element in elements_with_style:
         
     | 
| 139 | 
         
            +
                        style_attr=element.get('style','')
         
     | 
| 140 | 
         
            +
                        import re
         
     | 
| 141 | 
         
            +
                        bg_pattern=r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
         
     | 
| 142 | 
         
            +
                        matches=re.findall(bg_pattern,style_attr, re.IGNORECASE)
         
     | 
| 143 | 
         
            +
                        for match in matches:
         
     | 
| 144 | 
         
            +
                            if match.startswith('//'):
         
     | 
| 145 | 
         
            +
                                absolute_bg='https:'+match
         
     | 
| 146 | 
         
            +
                                style_attr=style_attr.replace(match,absolute_bg)
         
     | 
| 147 | 
         
            +
                            elif match.startswith('/'):
         
     | 
| 148 | 
         
            +
                                absolute_bg=urljoin(url,match)
         
     | 
| 149 | 
         
            +
                                style_attr=style_attr.replace(match,absolute_bg)
         
     | 
| 150 | 
         
            +
                            elif not match.startswith(('http://','https://')):
         
     | 
| 151 | 
         
            +
                                absolute_bg=urljoin(url,match)
         
     | 
| 152 | 
         
            +
                                style_attr=style_attr.replace(match,absolute_bg)
         
     | 
| 153 | 
         
            +
                        element['style']=style_attr
         
     | 
| 154 | 
         
            +
                    style_elements=soup.find_all('style')
         
     | 
| 155 | 
         
            +
                    for style in style_elements:
         
     | 
| 156 | 
         
            +
                        if style.string:
         
     | 
| 157 | 
         
            +
                            style_content=style.string
         
     | 
| 158 | 
         
            +
                            bg_pattern=r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
         
     | 
| 159 | 
         
            +
                            matches=re.findall(bg_pattern,style_content, re.IGNORECASE)
         
     | 
| 160 | 
         
            +
                            for match in matches:
         
     | 
| 161 | 
         
            +
                                if match.startswith('//'):
         
     | 
| 162 | 
         
            +
                                    absolute_bg='https:'+match
         
     | 
| 163 | 
         
            +
                                    style_content=style_content.replace(match,absolute_bg)
         
     | 
| 164 | 
         
            +
                                elif match.startswith('/'):
         
     | 
| 165 | 
         
            +
                                    absolute_bg=urljoin(url,match)
         
     | 
| 166 | 
         
            +
                                    style_content=style_content.replace(match,absolute_bg)
         
     | 
| 167 | 
         
            +
                                elif not match.startswith(('http://','https://')):
         
     | 
| 168 | 
         
            +
                                    absolute_bg=urljoin(url,match)
         
     | 
| 169 | 
         
            +
                                    style_content=style_content.replace(match,absolute_bg)
         
     | 
| 170 | 
         
            +
                            style.string=style_content
         
     | 
| 171 | 
         
            +
                    images=[]
         
     | 
| 172 | 
         
            +
                    img_elements=soup.find_all('img')
         
     | 
| 173 | 
         
            +
                    for img in img_elements:
         
     | 
| 174 | 
         
            +
                        src=img.get('src','')
         
     | 
| 175 | 
         
            +
                        alt=img.get('alt','')
         
     | 
| 176 | 
         
            +
                        if src:
         
     | 
| 177 | 
         
            +
                            images.append({'src':src,'alt':alt})
         
     | 
| 178 | 
         
            +
                    def test_image_url(img_url):
         
     | 
| 179 | 
         
            +
                        try:
         
     | 
| 180 | 
         
            +
                            test_response=requests.head(img_url,timeout=5,allow_redirects=True)
         
     | 
| 181 | 
         
            +
                            return test_response.status_code==200
         
     | 
| 182 | 
         
            +
                        except:
         
     | 
| 183 | 
         
            +
                            return False
         
     | 
| 184 | 
         
            +
                    working_images=[]
         
     | 
| 185 | 
         
            +
                    for img in images[:10]:
         
     | 
| 186 | 
         
            +
                        if test_image_url(img['src']):
         
     | 
| 187 | 
         
            +
                            working_images.append(img)
         
     | 
| 188 | 
         
            +
                    modified_html=str(soup)
         
     | 
| 189 | 
         
            +
                    import re
         
     | 
| 190 | 
         
            +
                    cleaned_html=re.sub(r'<!--.*?-->','',modified_html,flags=re.DOTALL)
         
     | 
| 191 | 
         
            +
                    cleaned_html=re.sub(r'\s+',' ',cleaned_html)
         
     | 
| 192 | 
         
            +
                    cleaned_html=re.sub(r'>\s+<','><',cleaned_html)
         
     | 
| 193 | 
         
            +
                    if len(cleaned_html)>15000:
         
     | 
| 194 | 
         
            +
                        cleaned_html=cleaned_html[:15000]+"\n<!-- ... HTML truncated for length ... -->"
         
     | 
| 195 | 
         
            +
                    if not title_text or title_text=="No title found":
         
     | 
| 196 | 
         
            +
                        title_text=url.split('/')[-1] or url.split('/')[-2] or "Website"
         
     | 
| 197 | 
         
            +
                    if len(cleaned_html.strip())<100:
         
     | 
| 198 | 
         
            +
                        website_content=f"""
         
     | 
| 199 | 
         
            +
            WEBSITE REDESIGN - EXTRACTION FAILED
         
     | 
| 200 | 
         
            +
            ====================================
         
     | 
| 201 | 
         
            +
            URL: {url}
         
     | 
| 202 | 
         
            +
            Title: {title_text}
         
     | 
| 203 | 
         
            +
            ERROR: Could not extract meaningful HTML content from this website. This could be due to:
         
     | 
| 204 | 
         
            +
            1. The website uses heavy JavaScript to load content dynamically
         
     | 
| 205 | 
         
            +
            2. The website has anti-bot protection
         
     | 
| 206 | 
         
            +
            3. The website requires authentication
         
     | 
| 207 | 
         
            +
            4. The website is using advanced compression or encoding
         
     | 
| 208 | 
         
            +
            FALLBACK APPROACH:
         
     | 
| 209 | 
         
            +
            Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:
         
     | 
| 210 | 
         
            +
            1. Create a typical layout for this type of website
         
     | 
| 211 | 
         
            +
            2. Use placeholder content that would be appropriate
         
     | 
| 212 | 
         
            +
            3. Include modern design elements and responsive features
         
     | 
| 213 | 
         
            +
            4. Use a clean, professional design with good typography
         
     | 
| 214 | 
         
            +
            5. Make it mobile-friendly and accessible
         
     | 
| 215 | 
         
            +
            This will help me create a better design for you."""
         
     | 
| 216 | 
         
            +
                        return website_content.strip()
         
     | 
| 217 | 
         
            +
                    website_content=f"""
         
     | 
| 218 | 
         
            +
            WEBSITE REDESIGN - ORIGINAL HTML CODE
         
     | 
| 219 | 
         
            +
            ===[TRUNCATED FOR BREVITY]==="""
         
     | 
| 220 | 
         
            +
                    return website_content.strip()
         
     | 
| 221 | 
         
            +
                except requests.exceptions.HTTPError as e:
         
     | 
| 222 | 
         
            +
                    if e.response.status_code==403:
         
     | 
| 223 | 
         
            +
                        return f"Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
         
     | 
| 224 | 
         
            +
                    elif e.response.status_code==404:
         
     | 
| 225 | 
         
            +
                        return f"Error: Website not found (404). Please check the URL and try again."
         
     | 
| 226 | 
         
            +
                    elif e.response.status_code>=500:
         
     | 
| 227 | 
         
            +
                        return f"Error: Website server error ({e.response.status_code}). Please try again later."
         
     | 
| 228 | 
         
            +
                    else:
         
     | 
| 229 | 
         
            +
                        return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
         
     | 
| 230 | 
         
            +
                except requests.exceptions.Timeout:
         
     | 
| 231 | 
         
            +
                    return "Error: Request timed out. The website may be slow or unavailable."
         
     | 
| 232 | 
         
            +
                except requests.exceptions.ConnectionError:
         
     | 
| 233 | 
         
            +
                    return "Error: Could not connect to the website. Please check your internet connection and the URL."
         
     | 
| 234 | 
         
            +
                except requests.exceptions.RequestException as e:
         
     | 
| 235 | 
         
            +
                    return f"Error accessing website: {str(e)}"
         
     | 
| 236 | 
         
            +
                except Exception as e:
         
     | 
| 237 | 
         
            +
                    return f"Error extracting website content: {str(e)}"
         
     |