"""Generate short title/description summaries for RCS messaging via Gemini."""

import hashlib
import json
import logging
import os
import re
from typing import Callable, Dict, Optional
from urllib.parse import urlparse

import google.generativeai as genai
from cachetools import TTLCache
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# In-memory cache: 1000 items, 1-hour TTL
cache = TTLCache(maxsize=1000, ttl=3600)

_DEFAULT_URL = "https://example.com"
_DEFAULT_DOMAIN = "example.com"
_CHUNK_SIZE = 1000            # characters of input sent per Gemini request
_MIN_TEXT_LENGTH = 20         # shorter inputs are replaced by a URL-based placeholder
_MAX_DESCRIPTION_LENGTH = 100  # upper bound applied to generated fallback descriptions

# Last-resort extraction when the response is not strictly valid JSON.
_SUMMARY_PATTERN = re.compile(
    r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}'
)


def _domain_from_url(url: str) -> str:
    """Return the network location of *url*, or the default domain if unusable."""
    try:
        return urlparse(url).netloc or _DEFAULT_DOMAIN
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for e.g. malformed IPv6 literals.
        logging.warning(f"Invalid URL: {url}. Using default domain.")
        return _DEFAULT_DOMAIN


def _make_cache_key(prefix: str, text: str, url: str) -> str:
    """Build a cache key that is unambiguous for every (text, url) pair.

    The text length is part of the hashed payload so that shifting characters
    between ``text`` and ``url`` (("a", "bc") vs ("ab", "c")) cannot collide.
    """
    payload = f"{len(text)}:{text}{url}".encode("utf-8", "surrogatepass")
    return f"{prefix}_{hashlib.sha256(payload).hexdigest()}"


def _fallback_summary(title: str, description_template: str, domain: str) -> Dict[str, str]:
    """Return the generic summary used when no model output is available."""
    description = description_template.format(domain=domain)
    return {"title": title, "description": description[:_MAX_DESCRIPTION_LENGTH]}


def _parse_summary(raw_content: str) -> Optional[Dict[str, str]]:
    """Extract ``title``/``description`` from a raw model response.

    Tries real JSON decoding first (handles escaped quotes, either key order and
    surrounding markdown fences), then falls back to the legacy regex.
    Returns ``None`` when neither approach yields both fields.
    """
    start, end = raw_content.find("{"), raw_content.rfind("}")
    if 0 <= start < end:
        try:
            payload = json.loads(raw_content[start:end + 1])
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            payload = None
        if isinstance(payload, dict):
            title = payload.get("title")
            description = payload.get("description")
            if (
                isinstance(title, str)
                and isinstance(description, str)
                and title.strip()
                and description.strip()
            ):
                return {"title": title, "description": description}

    match = _SUMMARY_PATTERN.search(raw_content)
    if match:
        return {"title": match.group(1), "description": match.group(2)}
    return None


def _fill_blanks(summary: Optional[Dict[str, str]], fallback: Dict[str, str]) -> Dict[str, str]:
    """Return *summary* with blank fields replaced by *fallback* (or all of it)."""
    if summary is None:
        logging.warning("No valid summaries generated. Using fallback.")
        return dict(fallback)
    return {
        field: summary[field] if summary[field].strip() else fallback[field]
        for field in ("title", "description")
    }


def _full_prompt(url: str, chunk: str) -> str:
    """Build the detailed prompt used by ``summarize_text``."""
    return (
        f"Summarize the following text into a title (30-50 characters) and a description (80-100 characters) "
        f"for RCS messaging. Ensure titles are catchy and descriptions are engaging, relevant to the content, "
        f"and suitable for a news, product, or service context inferred from the URL ({url}). "
        f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
    )


def _quick_prompt(url: str, chunk: str) -> str:
    """Build the lightweight prompt used by ``quick_summarize``."""
    return (
        f"Create a title (30-50 chars) and description (80-100 chars) for RCS messaging from this text. "
        f"Keep it engaging and relevant to {url}. "
        f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
    )


async def _generate_summary(
    text: str,
    url: str,
    *,
    model_name: str,
    max_chunks: int,
    build_prompt: Callable[[str, str], str],
    raw_log_label: str,
) -> Optional[Dict[str, str]]:
    """Ask Gemini for a summary and return the first chunk result that parses.

    Raises:
        ValueError: if ``GEMINI_API_KEY`` is not set.
        Exception: anything raised by the Gemini client propagates to the caller.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
        raise ValueError("Gemini API key is required for summarization.")

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name)

    # Handle short or empty text
    if len(text) < _MIN_TEXT_LENGTH:
        logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
        text = f"Content from {url} about news, products, or services."

    # Only the first ``max_chunks`` chunks are ever sent, to limit quota usage.
    limit = min(len(text), max_chunks * _CHUNK_SIZE)
    for start in range(0, limit, _CHUNK_SIZE):
        chunk = text[start:start + _CHUNK_SIZE]
        response = await model.generate_content_async(build_prompt(url, chunk))
        raw_content = response.text.strip()
        logging.info(f"{raw_log_label}: {raw_content}")

        summary = _parse_summary(raw_content)
        if summary is not None:
            # Only the first valid summary is used, so stop here instead of
            # paying for (and risking a failure on) further requests.
            return summary
        logging.warning(
            "Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk."
        )
    return None


async def summarize_text(text: Optional[str], url: str = "") -> Dict[str, str]:
    """Summarize text into a title and description using Gemini-1.5 Flash.

    Args:
        text: Source text; ``None`` or very short text is replaced by a
            URL-based placeholder before prompting.
        url: Page the text came from; defaults to ``https://example.com``.

    Returns:
        A dict with ``"title"`` and ``"description"`` keys. Never raises: on any
        failure a generic summary mentioning the URL's domain is returned.
    """
    fallback_title = "News Summary"
    fallback_template = "Discover news and insights from {domain}."
    domain = _DEFAULT_DOMAIN  # bound up front so the error path can always use it
    try:
        # Validate inputs
        text = text.strip() if text else ""
        url = url or _DEFAULT_URL
        domain = _domain_from_url(url)

        # Check cache (single lookup: an entry may expire between `in` and `[]`)
        cache_key = _make_cache_key("summarize", text, url)
        cached = cache.get(cache_key)
        if cached is not None:
            logging.info(f"Cache hit for {cache_key}")
            return dict(cached)

        summary = await _generate_summary(
            text,
            url,
            model_name='gemini-1.5-flash',
            max_chunks=2,  # first 2000 chars
            build_prompt=_full_prompt,
            raw_log_label="Raw Gemini response",
        )
        result = _fill_blanks(summary, _fallback_summary(fallback_title, fallback_template, domain))

        # Store a copy so callers mutating the result cannot alter the cache.
        cache[cache_key] = dict(result)
        logging.info(f"Summary - Title: {result['title']}, Description: {result['description']}")
        return result
    except Exception as e:  # boundary: this function must not raise
        logging.error(f"Error summarizing text: {e}")
        # Not cached: a transient failure must not pin the placeholder for an hour.
        return _fallback_summary(fallback_title, fallback_template, domain)


async def quick_summarize(text: Optional[str], url: str = "") -> Dict[str, str]:
    """Quickly summarize the first 1000 characters of text with a lightweight prompt.

    NOTE(review): the request is sent to the 'gemini-1.5-pro' model although this
    function was previously documented as using Gemini-1.5 Flash; confirm which
    model is intended.

    Args:
        text: Source text; ``None`` or very short text is replaced by a
            URL-based placeholder before prompting.
        url: Page the text came from; defaults to ``https://example.com``.

    Returns:
        A dict with ``"title"`` and ``"description"`` keys. Never raises: on any
        failure a generic summary mentioning the URL's domain is returned.
    """
    fallback_title = "Quick Summary"
    fallback_template = "Check out content from {domain}."
    domain = _DEFAULT_DOMAIN  # bound up front so the error path can always use it
    try:
        # Validate inputs
        text = text.strip() if text else ""
        url = url or _DEFAULT_URL
        domain = _domain_from_url(url)

        # Check cache (single lookup: an entry may expire between `in` and `[]`)
        cache_key = _make_cache_key("quick_summarize", text, url)
        cached = cache.get(cache_key)
        if cached is not None:
            logging.info(f"Cache hit for {cache_key}")
            return dict(cached)

        summary = await _generate_summary(
            text,
            url,
            model_name='gemini-1.5-pro',
            max_chunks=1,  # first 1000 chars
            build_prompt=_quick_prompt,
            raw_log_label="Raw Gemini response (quick)",
        )
        result = _fill_blanks(summary, _fallback_summary(fallback_title, fallback_template, domain))

        # Store a copy so callers mutating the result cannot alter the cache.
        cache[cache_key] = dict(result)
        logging.info(f"Quick summary - Title: {result['title']}, Description: {result['description']}")
        return result
    except Exception as e:  # boundary: this function must not raise
        logging.error(f"Error in quick summarize: {e}")
        # Not cached: a transient failure must not pin the placeholder for an hour.
        return _fallback_summary(fallback_title, fallback_template, domain)