"""Utility helpers for ankigen: logging setup, API response caching, and
webpage text extraction."""

import hashlib
import logging
import sys
from functools import lru_cache
from logging.handlers import RotatingFileHandler
from typing import Any, Optional

import requests
from bs4 import BeautifulSoup, FeatureNotFound

_logger_instance = None


def setup_logging():
    """Configure logging to both file and console."""
    global _logger_instance
    if _logger_instance:
        return _logger_instance

    logger = logging.getLogger("ankigen")
    logger.setLevel(logging.DEBUG)

    # Clear any handlers left over from a previous configuration so that
    # reconfiguring does not duplicate output.
    if logger.hasHandlers():
        logger.handlers.clear()

    detailed_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(module)s:%(lineno)d - %(message)s"
    )
    simple_formatter = logging.Formatter("%(levelname)s: %(message)s")

    # File handler: full DEBUG detail, rotated at 1 MiB with 5 backups.
    file_handler = RotatingFileHandler(
        "ankigen.log", maxBytes=1024 * 1024, backupCount=5
    )
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(detailed_formatter)

    # Console handler: concise INFO-and-above messages on stdout.
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(simple_formatter)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    _logger_instance = logger
    return logger
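
# The two handlers produce different output formats; illustratively (the
# timestamp, module name, and line number below are made up):
#
#   ankigen.log: 2024-01-01 12:00:00,000 - ankigen - DEBUG - utils:42 - message
#   console:     INFO: message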


def get_logger():
    """Return the shared logger instance, initializing it on first use."""
    if _logger_instance is None:
        return setup_logging()
    return _logger_instance


logger = get_logger()
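
# Illustrative usage of the module-level logger (message text is made up):
#
#     logger.info("Generating cards...")    # goes to stdout and ankigen.log
#     logger.debug("Raw API payload: ...")  # goes to ankigen.log only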


class ResponseCache:
    """A simple cache for API responses using LRU for get operations."""

    def __init__(self, maxsize=128):
        # Wrap the plain dictionary lookup in a per-instance lru_cache so
        # repeated gets for the same key are served by the LRU layer.
        self._internal_get_from_dict = self._get_from_dict_actual
        self._lru_cached_get = lru_cache(maxsize=maxsize)(self._internal_get_from_dict)
        self._dict_cache = {}

    def _get_from_dict_actual(self, cache_key: str):
        """Actual dictionary lookup, intended to be wrapped by lru_cache."""
        logger.debug(f"Cache DICT GET: key={cache_key}")
        return self._dict_cache.get(cache_key)

    def get(self, prompt: str, model: str) -> Optional[Any]:
        """Retrieve an item from the cache. Uses LRU for this get path."""
        cache_key = self._create_key(prompt, model)
        return self._lru_cached_get(cache_key)

    def set(self, prompt: str, model: str, response: Any):
        """Set an item in the cache."""
        cache_key = self._create_key(prompt, model)
        logger.debug(f"Cache SET: key={cache_key}, type={type(response)}")
        self._dict_cache[cache_key] = response
        # Invalidate the LRU layer: an earlier get() for this key may have
        # cached a None miss, which would otherwise shadow the new value.
        self._lru_cached_get.cache_clear()

    def _create_key(self, prompt: str, model: str) -> str:
        """Create a unique MD5 hash key for caching (not security-sensitive)."""
        return hashlib.md5(f"{model}:{prompt}".encode("utf-8")).hexdigest()
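
# Illustrative round trip (prompt, model, and response values are made up):
#
#     cache = ResponseCache(maxsize=64)
#     cache.set("Explain spaced repetition", "gpt-4o", {"cards": []})
#     assert cache.get("Explain spaced repetition", "gpt-4o") == {"cards": []}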


def fetch_webpage_text(url: str) -> str:
    """Fetch and extract the main text content from a URL."""
    logger_util = get_logger()
    try:
        logger_util.info(f"Fetching content from URL: {url}")
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        logger_util.debug(f"Parsing HTML content for {url}")
        try:
            soup = BeautifulSoup(response.text, "lxml")
        except FeatureNotFound:
            # bs4 raises FeatureNotFound (a ValueError subclass), not
            # ImportError, when the lxml parser is unavailable; fall back
            # to the stdlib parser.
            logger_util.warning("lxml not found, using html.parser instead.")
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            logger_util.error(
                f"BeautifulSoup initialization failed for {url}: {e}", exc_info=True
            )
            raise RuntimeError(f"Failed to parse HTML content for {url}.") from e

        # Drop script and style elements so their contents do not end up
        # in the extracted text.
        for script_or_style in soup(["script", "style"]):
            script_or_style.extract()

        # Prefer the semantic <main> container, then <article>, then <body>.
        main_content = soup.find("main")
        if not main_content:
            main_content = soup.find("article")

        if main_content:
            text = main_content.get_text()
            logger_util.debug(f"Extracted text from <{main_content.name}> tag.")
        else:
            body = soup.find("body")
            if body:
                text = body.get_text()
                logger_util.debug("Extracted text from <body> tag (fallback).")
            else:
                text = ""
                logger_util.warning(f"Could not find <body> tag in {url}")

        # Normalize whitespace: strip each line and drop the empty ones.
        lines = (line.strip() for line in text.splitlines())
        cleaned_text = "\n".join(line for line in lines if line)

        if not cleaned_text:
            logger_util.warning(f"Could not extract meaningful text from {url}")
            return ""

        logger_util.info(
            f"Successfully extracted text from {url} (Length: {len(cleaned_text)} chars)"
        )
        return cleaned_text

    except requests.exceptions.RequestException as e:
        logger_util.error(f"Network error fetching URL {url}: {e}", exc_info=True)
        raise ConnectionError(f"Could not fetch URL: {e}") from e
    except Exception as e:
        logger_util.error(f"Error processing URL {url}: {e}", exc_info=True)
        if isinstance(e, (ValueError, ConnectionError, RuntimeError)):
            raise
        raise RuntimeError(
            f"An unexpected error occurred while processing the URL: {e}"
        ) from e
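

if __name__ == "__main__":
    # Minimal smoke test; the URL below is a placeholder and the call
    # requires network access.
    sample = fetch_webpage_text("https://example.com")
    logger.info(f"Fetched {len(sample)} characters from example.com")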