Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -53,7 +53,6 @@ except ImportError:
|
|
53 |
# --- Google Gemini ---
|
54 |
try:
|
55 |
import google.generativeai as genai
|
56 |
-
# Import specific types needed, check library for exact names if errors occur
|
57 |
from google.generativeai.types import HarmCategory, HarmBlockThreshold, GenerateContentResponse
|
58 |
_gemini_available = True
|
59 |
except ImportError:
|
@@ -74,7 +73,6 @@ logging.getLogger('gunicorn.error').setLevel(logging.INFO)
|
|
74 |
logging.getLogger('uvicorn').setLevel(logging.INFO)
|
75 |
logging.getLogger('starlette').setLevel(logging.INFO)
|
76 |
if _gemini_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING)
|
77 |
-
# Suppress noisy crawl4ai/playwright logs if needed
|
78 |
logging.getLogger("crawl4ai").setLevel(logging.INFO) # Adjust level as needed
|
79 |
logging.getLogger("playwright").setLevel(logging.WARNING)
|
80 |
|
@@ -113,7 +111,6 @@ GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # Primary Summarizer
|
|
113 |
# Models (User can still configure via env vars)
|
114 |
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model
|
115 |
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
|
116 |
-
# *** Reverted Model Name as requested ***
|
117 |
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001") # Primary Model
|
118 |
|
119 |
# --- Configuration Checks ---
|
@@ -159,14 +156,10 @@ if _gemini_primary_enabled:
|
|
159 |
_gemini_primary_enabled = False
|
160 |
|
161 |
# --- Constants ---
|
162 |
-
MAX_SUMMARY_CHUNK_SIZE = 4000
|
163 |
-
#
|
164 |
-
# Let's use a more conservative estimate for 2.0 Flash, e.g., 128k tokens ~ 500k chars? Check Gemini docs.
|
165 |
-
# Sticking with a large number for now, but be aware this might need adjustment.
|
166 |
-
MAX_INPUT_TOKEN_APPROX = 500000
|
167 |
|
168 |
# --- Retry Decorator ---
|
169 |
-
# (Remains the same)
|
170 |
@retry(
|
171 |
stop=stop_after_attempt(4),
|
172 |
wait=wait_exponential(multiplier=1, min=2, max=15),
|
@@ -175,26 +168,20 @@ MAX_INPUT_TOKEN_APPROX = 500000
|
|
175 |
reraise=True
|
176 |
)
|
177 |
async def retry_bot_operation(func, *args, **kwargs):
|
|
|
178 |
try:
|
179 |
return await func(*args, **kwargs)
|
180 |
except BadRequest as e:
|
181 |
-
ignore_errors = [
|
182 |
-
"message is not modified", "query is too old", "message to edit not found",
|
183 |
-
"chat not found", "bot was blocked by the user",
|
184 |
-
]
|
185 |
if any(err in str(e).lower() for err in ignore_errors):
|
186 |
logger.warning(f"Ignoring non-critical BadRequest: {e}")
|
187 |
return None
|
188 |
-
# Only raise if it's not an ignored error
|
189 |
logger.error(f"Potentially critical BadRequest: {e}")
|
190 |
raise
|
191 |
except TelegramError as e:
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
else:
|
196 |
-
logger.error(f"Unhandled TelegramError: {e}") # Log other Telegram errors
|
197 |
-
raise # Reraise to allow tenacity to handle retry
|
198 |
except Exception as e:
|
199 |
logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
|
200 |
raise
|
@@ -202,9 +189,11 @@ async def retry_bot_operation(func, *args, **kwargs):
|
|
202 |
# --- Helper Functions ---
|
203 |
# (is_youtube_url, extract_youtube_id remain the same)
|
204 |
def is_youtube_url(url):
|
|
|
205 |
youtube_regex = re.compile( r'(?:https?://)?(?:www.)?(?:m.)?(?:youtube(?:-nocookie)?.com|youtu.be)/' r'(?:watch?v=|embed/|v/|shorts/|live/|attribution_link?a=.&u=/watch?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
|
206 |
match = youtube_regex.search(url); logger.debug(f"is_youtube_url '{url}': {bool(match)}"); return bool(match)
|
207 |
def extract_youtube_id(url):
|
|
|
208 |
youtube_regex = re.compile( r'(?:https?://)?(?:www.)?(?:m.)?(?:youtube(?:-nocookie)?.com|youtu.be)/' r'(?:watch?v=|embed/|v/|shorts/|live/|attribution_link?a=.&u=/watch?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
|
209 |
match = youtube_regex.search(url)
|
210 |
if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
|
@@ -246,7 +235,6 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
|
|
246 |
return None
|
247 |
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
|
248 |
|
249 |
-
|
250 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
251 |
# ... (Keep existing implementation) ...
|
252 |
global APIFY_ACTOR_ID
|
@@ -295,7 +283,6 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
|
|
295 |
except httpx.RequestError as e: logger.error(f"[Apify SyncItems] Request error during API interaction for {video_url}: {e}"); return None
|
296 |
except Exception as e: logger.error(f"[Apify SyncItems] Unexpected error during Apify SyncItems REST call for {video_url}: {e}", exc_info=True); return None
|
297 |
|
298 |
-
|
299 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
300 |
# ... (Keep existing implementation) ...
|
301 |
global SUPADATA_API_KEY, APIFY_API_TOKEN
|
@@ -304,11 +291,9 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
|
|
304 |
transcript_text = None
|
305 |
logger.info("[Primary YT] Attempting youtube-transcript-api...")
|
306 |
try:
|
307 |
-
# Run the blocking call in a separate thread
|
308 |
transcript_list = await asyncio.to_thread(
|
309 |
YouTubeTranscriptApi.list_transcripts(video_id).find_generated_transcript(['en', 'en-GB', 'en-US']).fetch
|
310 |
)
|
311 |
-
# transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] ) # Old way
|
312 |
if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
|
313 |
if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text.strip()
|
314 |
else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None
|
@@ -320,14 +305,13 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
|
|
320 |
transcript_text = None
|
321 |
except Exception as e:
|
322 |
logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
|
323 |
-
# if isinstance(e, (NoTranscriptFound, TranscriptsDisabled)): logger.warning(f"[Primary YT] Known issue: {type(e).__name__}") # Handled above
|
324 |
transcript_text = None
|
325 |
|
326 |
if transcript_text is None:
|
327 |
logger.info("[Fallback YT 1] Trying Supadata API...")
|
328 |
if SUPADATA_API_KEY:
|
329 |
transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
|
330 |
-
if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
|
331 |
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
|
332 |
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
|
333 |
|
@@ -335,12 +319,11 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
|
|
335 |
logger.info("[Fallback YT 2] Trying Apify REST API (SyncItems)...")
|
336 |
if APIFY_API_TOKEN:
|
337 |
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
|
338 |
-
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify SyncItems REST for {video_url}"); return transcript_text
|
339 |
else: logger.warning(f"[Fallback YT 2] Apify SyncItems REST failed or no content for {video_url}.")
|
340 |
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
|
341 |
|
342 |
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
|
343 |
-
# This return was missing, ensuring the final value is returned if fallbacks succeed
|
344 |
return transcript_text
|
345 |
|
346 |
|
@@ -358,22 +341,11 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
|
|
358 |
return None
|
359 |
|
360 |
logger.info(f"[Crawl4AI Primary] Attempting to crawl URL: {url}")
|
361 |
-
# Define a writable cache directory (use /tmp in container environments)
|
362 |
-
# No longer creating dir here, relying on HOME=/tmp from Dockerfile
|
363 |
-
# cache_dir_path = "/tmp/.crawl4ai"
|
364 |
-
# try:
|
365 |
-
# os.makedirs(cache_dir_path, exist_ok=True)
|
366 |
-
# logger.info(f"[Crawl4AI Primary] Ensured cache directory exists: {cache_dir_path}")
|
367 |
-
# except OSError as e:
|
368 |
-
# logger.error(f"[Crawl4AI Primary] Failed to create cache directory {cache_dir_path}: {e}. Crawl may fail.")
|
369 |
-
# except Exception as e:
|
370 |
-
# logger.error(f"[Crawl4AI Primary] Unexpected error creating cache directory {cache_dir_path}: {e}")
|
371 |
|
372 |
try:
|
373 |
-
#
|
374 |
-
|
375 |
-
|
376 |
-
logger.info(f"[Crawl4AI Primary] Initialized crawler (HOME should be /tmp).")
|
377 |
# Use arun for a single URL crawl
|
378 |
result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90) # 90 sec timeout
|
379 |
|
@@ -384,14 +356,13 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
|
|
384 |
return content
|
385 |
else:
|
386 |
logger.warning(f"[Crawl4AI Primary] Crawl successful for {url}, but extracted markdown content is empty.")
|
387 |
-
# Try result.text as a fallback within crawl4ai success
|
388 |
if result.text:
|
389 |
content = result.text.strip()
|
390 |
if content:
|
391 |
logger.info(f"[Crawl4AI Primary] Using .text fallback after empty markdown. Length: {len(content)}")
|
392 |
return content
|
393 |
return None # Return None if both markdown and text are empty
|
394 |
-
elif result and result.text:
|
395 |
content = result.text.strip()
|
396 |
if content:
|
397 |
logger.info(f"[Crawl4AI Primary] Success crawling {url} (using .text, markdown missing). Length: {len(content)}")
|
@@ -405,13 +376,11 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
|
|
405 |
except asyncio.TimeoutError:
|
406 |
logger.error(f"[Crawl4AI Primary] Timeout occurred while crawling {url}")
|
407 |
return None
|
408 |
-
except PermissionError as e: #
|
409 |
-
|
410 |
-
|
411 |
-
return None # Fail gracefully for this method
|
412 |
except Exception as e:
|
413 |
logger.error(f"[Crawl4AI Primary] Unexpected error during crawl for {url}: {e}", exc_info=True)
|
414 |
-
# Log specific crawl4ai errors if they become apparent
|
415 |
return None
|
416 |
|
417 |
|
@@ -427,7 +396,7 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[
|
|
427 |
response.raise_for_status()
|
428 |
content_type = response.headers.get('content-type', '').lower()
|
429 |
if 'html' not in content_type: logger.warning(f"[Web Scrape BS4] Non-HTML content type from {url}: {content_type}"); return None
|
430 |
-
try: return response.text
|
431 |
except Exception as e: logger.error(f"[Web Scrape BS4] Error getting response text for {url}: {e}"); return None
|
432 |
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape BS4] HTTP error {e.response.status_code} fetching {url}: {e}")
|
433 |
except httpx.TimeoutException: logger.error(f"[Web Scrape BS4] Timeout error fetching {url}")
|
@@ -437,7 +406,6 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[
|
|
437 |
return None
|
438 |
|
439 |
async def get_website_content_bs4(url: str) -> Optional[str]:
|
440 |
-
"""Fetches and parses website content using BeautifulSoup (Fallback 1)."""
|
441 |
# ... (Keep existing implementation) ...
|
442 |
if not url: logger.error("[BS4 Fallback] get_website_content_bs4: No URL"); return None
|
443 |
logger.info(f"[BS4 Fallback] Attempting basic fetch & parse for: {url}")
|
@@ -445,39 +413,24 @@ async def get_website_content_bs4(url: str) -> Optional[str]:
|
|
445 |
if not html_content:
|
446 |
logger.warning(f"[BS4 Fallback] Failed to fetch HTML for {url}")
|
447 |
return None
|
448 |
-
|
449 |
try:
|
450 |
-
# Inner function for parsing to use asyncio.to_thread
|
451 |
def parse_html(content):
|
452 |
soup = BeautifulSoup(content, DEFAULT_PARSER)
|
453 |
-
|
454 |
-
|
455 |
-
element.extract()
|
456 |
-
# Try to find main content areas more broadly
|
457 |
-
selectors = ['main', 'article', '[role="main"]', '#content', '.content', '#main-content', '.main-content', '#body', '.body', '#article-body', '.article-body', '.post-content', '.entry-content'] # Added more common selectors
|
458 |
target_element = None
|
459 |
for selector in selectors:
|
460 |
try:
|
461 |
target_element = soup.select_one(selector)
|
462 |
if target_element: break
|
463 |
-
except Exception as sel_e:
|
464 |
-
|
465 |
-
continue
|
466 |
-
|
467 |
-
|
468 |
-
if not target_element: target_element = soup.body # Fallback to body
|
469 |
if not target_element: logger.warning(f"[BS4 Fallback] Could not find body/main for parsing {url}"); return None
|
470 |
-
|
471 |
-
# Extract text, clean up whitespace aggressively
|
472 |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
|
473 |
-
text = " ".join(lines)
|
474 |
-
|
475 |
-
# Basic post-cleaning
|
476 |
-
text = re.sub(r'\s{2,}', ' ', text).strip() # Replace multiple spaces with single space
|
477 |
-
|
478 |
if not text: logger.warning(f"[BS4 Fallback] Extracted text is empty after cleaning for {url}"); return None
|
479 |
return text
|
480 |
-
|
481 |
text_content = await asyncio.to_thread(parse_html, html_content)
|
482 |
if text_content:
|
483 |
logger.info(f"[BS4 Fallback] Success scrape/parse for {url} (final len: {len(text_content)})")
|
@@ -491,7 +444,6 @@ async def get_website_content_bs4(url: str) -> Optional[str]:
|
|
491 |
|
492 |
# Fallback 2: urltotext.com API
|
493 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
494 |
-
"""Fetches website content using urltotext.com API (Fallback 2)."""
|
495 |
# ... (Keep existing implementation) ...
|
496 |
if not url: logger.error("[API Fallback] No URL"); return None
|
497 |
if not api_key: logger.error("[API Fallback] urltotext.com API key missing."); return None
|
@@ -513,8 +465,10 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
|
513 |
else: logger.warning(f"[API Fallback] urltotext.com API success but content empty for {url}. Resp: {data}"); return None
|
514 |
except json.JSONDecodeError: logger.error(f"[API Fallback] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
|
515 |
except Exception as e: logger.error(f"[API Fallback] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
|
516 |
-
elif response.status_code == 402:
|
517 |
-
|
|
|
|
|
518 |
elif response.status_code in [400, 401, 403, 422, 500]: logger.error(f"[API Fallback] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
519 |
else: logger.error(f"[API Fallback] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
520 |
except httpx.TimeoutException: logger.error(f"[API Fallback] Timeout connecting to urltotext.com API for {url}"); return None
|
@@ -529,188 +483,150 @@ async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Opt
|
|
529 |
logger.error("[Gemini Primary] Called but is disabled.");
|
530 |
return None, "Error: Primary AI service (Gemini) not configured/available."
|
531 |
|
532 |
-
# Truncate input text if it exceeds the approximate limit
|
533 |
if len(text) > MAX_INPUT_TOKEN_APPROX:
|
534 |
logger.warning(f"[Gemini Primary] Input text length ({len(text)}) exceeds limit ({MAX_INPUT_TOKEN_APPROX}). Truncating.")
|
535 |
text = text[:MAX_INPUT_TOKEN_APPROX]
|
536 |
|
537 |
logger.info(f"[Gemini Primary] Generating {summary_type} summary using {GEMINI_MODEL}. Input length: {len(text)}")
|
538 |
|
539 |
-
# Define prompts
|
540 |
if summary_type == "paragraph":
|
541 |
-
prompt = f"""Please summarise the following text into a concise paragraph
|
542 |
-
|
543 |
-
Text to summarise:
|
544 |
-
---
|
545 |
-
{text}
|
546 |
-
---
|
547 |
-
|
548 |
-
Concise Paragraph Summary:"""
|
549 |
elif summary_type == "points":
|
550 |
-
prompt = f"""Please summarise the following text into a list of key bullet points
|
551 |
-
|
552 |
-
Text to summarise:
|
553 |
-
---
|
554 |
-
{text}
|
555 |
-
---
|
556 |
-
|
557 |
-
Key Bullet Points Summary:"""
|
558 |
else:
|
559 |
logger.error(f"[Gemini Primary] Invalid summary_type: {summary_type}")
|
560 |
return None, f"Error: Invalid summary type '{summary_type}' specified."
|
561 |
|
562 |
-
#
|
563 |
-
safety_settings = {
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
|
|
|
|
|
|
|
|
569 |
|
570 |
-
# Configure generation settings (optional)
|
571 |
generation_config = genai.types.GenerationConfig(
|
572 |
-
|
573 |
-
|
574 |
-
max_output_tokens=2048, # Increased max tokens
|
575 |
-
temperature=0.7, # Adjust creativity vs factualness
|
576 |
-
# top_p=1.0, # Default
|
577 |
-
# top_k=None # Default
|
578 |
)
|
579 |
|
580 |
try:
|
581 |
model = genai.GenerativeModel(GEMINI_MODEL)
|
582 |
logger.debug(f"[Gemini Primary] Sending request to model {GEMINI_MODEL}")
|
583 |
-
response: GenerateContentResponse = await model.generate_content_async(
|
584 |
prompt,
|
585 |
generation_config=generation_config,
|
586 |
safety_settings=safety_settings,
|
587 |
-
# request_options={"timeout": 100} # Optional: Add timeout for the API call itself
|
588 |
)
|
589 |
|
590 |
-
# Check for
|
591 |
if not response.candidates:
|
592 |
block_reason = "Unknown"
|
|
|
593 |
if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
|
594 |
block_reason = response.prompt_feedback.block_reason or "Reason not specified"
|
595 |
-
|
|
|
|
|
596 |
logger.error(f"[Gemini Primary] {error_msg}")
|
597 |
return None, error_msg
|
598 |
|
599 |
-
#
|
600 |
-
|
601 |
-
finish_reason_val =
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
606 |
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
|
611 |
-
|
612 |
-
safety_ratings_str = "N/A"
|
613 |
-
if hasattr(response.candidates[0], 'safety_ratings') and response.candidates[0].safety_ratings:
|
614 |
-
safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in response.candidates[0].safety_ratings])
|
615 |
-
|
616 |
-
error_msg = f"Error: Gemini generation finished unexpectedly. Reason: {finish_reason_str}. Safety: {safety_ratings_str}"
|
617 |
-
logger.error(f"[Gemini Primary] {error_msg}")
|
618 |
|
619 |
-
# Decide how to handle specific non-STOP reasons
|
620 |
-
if finish_reason_str == "SAFETY":
|
621 |
-
return None, error_msg # Return specific error for safety blocks
|
622 |
-
# For RECITATION, OTHER, etc., it's safer to return an error than potentially broken output
|
623 |
-
return None, f"Error: Gemini generation finished unexpectedly ({finish_reason_str})."
|
624 |
|
625 |
-
# Extract text
|
626 |
summary_text = ""
|
627 |
try:
|
628 |
-
|
629 |
-
summary_text = response.text
|
630 |
except ValueError as e:
|
631 |
-
# Handle cases where .text might raise error (e.g., if blocked, but caught above ideally)
|
632 |
logger.warning(f"[Gemini Primary] Error accessing response.text: {e}. Trying parts manually.")
|
633 |
-
|
634 |
-
|
635 |
-
summary_text = "".join(part.text for part in response.candidates[0].content.parts)
|
636 |
|
637 |
if not summary_text or not summary_text.strip():
|
638 |
-
|
|
|
639 |
return None, "Error: AI generated an empty summary."
|
640 |
|
641 |
-
logger.info(f"[Gemini Primary] Summary
|
642 |
return summary_text.strip(), None
|
643 |
|
644 |
except AttributeError as e:
|
645 |
-
# Catch potential errors if response structure is different than expected
|
646 |
logger.error(f"[Gemini Primary] Attribute error accessing Gemini response: {e}. Response structure might have changed.", exc_info=True)
|
647 |
return None, f"Error: Failed to parse Gemini response. Details: {e}"
|
648 |
except Exception as e:
|
649 |
logger.error(f"[Gemini Primary] Error during API call to {GEMINI_MODEL}: {e}", exc_info=True)
|
650 |
-
# Check for specific Google API errors if needed
|
651 |
-
# from google.api_core import exceptions as google_exceptions
|
652 |
-
# if isinstance(e, google_exceptions.GoogleAPIError): ...
|
653 |
return None, f"Error: Failed to communicate with the primary AI service (Gemini). Details: {e}"
|
654 |
|
655 |
|
656 |
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
657 |
""" Calls the OpenRouter API to generate a summary. """
|
658 |
-
# ... (Keep existing implementation
|
659 |
global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
|
660 |
if not _openrouter_fallback_enabled:
|
661 |
logger.error("[OpenRouter Fallback] Called but is disabled.");
|
662 |
return None, "Error: Fallback AI service (OpenRouter) not configured/available."
|
663 |
|
664 |
-
max_input_len_openrouter = 100000
|
665 |
if len(text) > max_input_len_openrouter:
|
666 |
logger.warning(f"[OpenRouter Fallback] Input text length ({len(text)}) exceeds approx limit ({max_input_len_openrouter}) for {OPENROUTER_MODEL}. Truncating.")
|
667 |
text = text[:max_input_len_openrouter]
|
668 |
|
669 |
logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
|
670 |
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
-
Text:
|
676 |
-
---
|
677 |
-
{text}
|
678 |
-
---
|
679 |
-
|
680 |
-
Concise Paragraph Summary:"""
|
681 |
-
elif summary_type == "points":
|
682 |
-
prompt_content = f"""Please summarise the following text into a list of key bullet points. Each point should capture a distinct main idea.
|
683 |
-
|
684 |
-
Text:
|
685 |
-
---
|
686 |
-
{text}
|
687 |
-
---
|
688 |
-
|
689 |
-
Key Bullet Points Summary:"""
|
690 |
-
else:
|
691 |
-
logger.error(f"[OpenRouter Fallback] Invalid summary_type: {summary_type}")
|
692 |
-
return None, f"Error: Invalid summary type '{summary_type}' specified."
|
693 |
-
|
694 |
-
headers = {
|
695 |
-
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
|
696 |
-
"Content-Type": "application/json",
|
697 |
-
"HTTP-Referer": "https://github.com/fmab777/telegram-summary-bot", # Optional: Identify your app
|
698 |
-
"X-Title": "Telegram Summary Bot", # Optional: Identify your app
|
699 |
-
}
|
700 |
-
payload = {
|
701 |
-
"model": OPENROUTER_MODEL,
|
702 |
-
"messages": [
|
703 |
-
{"role": "system", "content": "You are an expert summarizer. Provide summaries as requested."},
|
704 |
-
{"role": "user", "content": prompt_content}
|
705 |
-
],
|
706 |
-
"max_tokens": 2048, # Adjust as needed
|
707 |
-
"temperature": 0.7,
|
708 |
-
}
|
709 |
|
|
|
|
|
710 |
api_url = "https://openrouter.ai/api/v1/chat/completions"
|
711 |
|
712 |
try:
|
713 |
-
async with httpx.AsyncClient(timeout=120.0) as client:
|
714 |
logger.debug(f"[OpenRouter Fallback] Sending request to {api_url} for model {OPENROUTER_MODEL}")
|
715 |
response = await client.post(api_url, headers=headers, json=payload)
|
716 |
logger.debug(f"[OpenRouter Fallback] Received status code {response.status_code}")
|
@@ -721,101 +637,64 @@ Key Bullet Points Summary:"""
|
|
721 |
if data.get("choices") and len(data["choices"]) > 0:
|
722 |
choice = data["choices"][0]
|
723 |
message = choice.get("message")
|
724 |
-
finish_reason = choice.get("finish_reason", "N/A")
|
725 |
-
|
726 |
if message and message.get("content"):
|
727 |
summary_text = message["content"].strip()
|
728 |
if summary_text:
|
729 |
logger.info(f"[OpenRouter Fallback] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason}")
|
730 |
-
|
731 |
-
if finish_reason == 'length':
|
732 |
-
logger.warning("[OpenRouter Fallback] Summary may be truncated due to max_tokens limit.")
|
733 |
return summary_text, None
|
734 |
-
else:
|
735 |
-
|
736 |
-
return None, "Error: Fallback AI generated an empty summary."
|
737 |
-
else:
|
738 |
-
logger.error(f"[OpenRouter Fallback] Invalid response structure (missing message/content). Data: {data}")
|
739 |
-
return None, "Error: Fallback AI returned an invalid response format."
|
740 |
else:
|
741 |
logger.error(f"[OpenRouter Fallback] Invalid response structure (missing choices). Data: {data}")
|
742 |
-
|
743 |
-
|
744 |
-
|
745 |
-
|
746 |
-
except json.JSONDecodeError:
|
747 |
-
logger.error(f"[OpenRouter Fallback] Failed to decode JSON response. Status: {response.status_code}, Text: {response.text[:500]}")
|
748 |
-
return None, "Error: Fallback AI sent an invalid JSON response."
|
749 |
-
except Exception as e:
|
750 |
-
logger.error(f"[OpenRouter Fallback] Error processing success response: {e}", exc_info=True)
|
751 |
-
return None, f"Error: Failed to process Fallback AI response. Details: {e}"
|
752 |
-
|
753 |
else:
|
754 |
-
# Handle API errors (rate limits, auth, etc.)
|
755 |
error_message = f"Error: Fallback AI service ({OPENROUTER_MODEL}) returned status {response.status_code}."
|
756 |
-
try:
|
757 |
-
|
758 |
-
|
759 |
-
|
760 |
-
|
761 |
-
|
762 |
-
return None, error_message
|
763 |
-
|
764 |
-
except httpx.TimeoutException:
|
765 |
-
logger.error(f"[OpenRouter Fallback] Timeout connecting to OpenRouter API for {OPENROUTER_MODEL}")
|
766 |
-
return None, "Error: Timed out connecting to the fallback AI service."
|
767 |
-
except httpx.RequestError as e:
|
768 |
-
logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}")
|
769 |
-
return None, f"Error: Network error connecting to the fallback AI service. Details: {e}"
|
770 |
-
except Exception as e:
|
771 |
-
logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter API call: {e}", exc_info=True)
|
772 |
-
return None, f"Error: Unexpected issue with the fallback AI service. Details: {e}"
|
773 |
|
774 |
|
775 |
async def generate_summary(text: str, summary_type: str) -> str:
|
776 |
""" Generates a summary using the primary AI (Gemini) and falling back to OpenRouter. """
|
777 |
-
# ... (Keep existing implementation
|
778 |
global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
|
779 |
logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")
|
780 |
-
|
781 |
-
error_message: Optional[str] = None # Accumulates errors
|
782 |
-
|
783 |
-
# --- Attempt Primary AI (Gemini) ---
|
784 |
if _gemini_primary_enabled:
|
785 |
logger.info(f"[Summary Generation] Attempting primary AI: Gemini ({GEMINI_MODEL})")
|
786 |
primary_summary, primary_error = await _call_gemini(text, summary_type)
|
787 |
if primary_summary:
|
788 |
logger.info(f"[Summary Generation] Success with primary AI (Gemini).")
|
789 |
-
return primary_summary
|
790 |
else:
|
791 |
logger.warning(f"[Summary Generation] Primary AI (Gemini) failed. Error: {primary_error}. Proceeding to fallback.")
|
792 |
-
error_message = f"Primary AI ({GEMINI_MODEL}) failed: {primary_error}"
|
793 |
else:
|
794 |
logger.warning("[Summary Generation] Primary AI (Gemini) disabled. Proceeding to fallback.")
|
795 |
error_message = "Primary AI (Gemini) unavailable."
|
796 |
|
797 |
-
# --- Attempt Fallback AI (OpenRouter) ---
|
798 |
if _openrouter_fallback_enabled:
|
799 |
logger.info(f"[Summary Generation] Attempting fallback AI: OpenRouter ({OPENROUTER_MODEL})")
|
800 |
fallback_summary, fallback_error = await _call_openrouter(text, summary_type)
|
801 |
if fallback_summary:
|
802 |
logger.info(f"[Summary Generation] Success with fallback AI (OpenRouter).")
|
803 |
-
return fallback_summary
|
804 |
else:
|
805 |
logger.error(f"[Summary Generation] Fallback AI (OpenRouter) also failed. Error: {fallback_error}")
|
806 |
-
|
807 |
-
|
808 |
-
return f"{error_message}\nFallback AI ({OPENROUTER_MODEL}) also failed: {fallback_error}"
|
809 |
-
else: # Should not happen if logic is correct, but fallback just in case
|
810 |
-
return f"Fallback AI ({OPENROUTER_MODEL}) failed: {fallback_error}"
|
811 |
else:
|
812 |
logger.error("[Summary Generation] Fallback AI (OpenRouter) is disabled. Cannot proceed.")
|
813 |
-
if error_message:
|
814 |
-
|
815 |
-
else: # Primary disabled AND fallback disabled
|
816 |
-
return "Error: Both primary and fallback AI services are unavailable."
|
817 |
|
818 |
-
# This part should ideally not be reached if the logic above is sound
|
819 |
logger.error("[Summary Generation] Reached end of function unexpectedly. No summary generated.")
|
820 |
final_error = error_message or "Unknown summary generation error."
|
821 |
return f"Sorry, an error occurred: {final_error}"
|
@@ -831,7 +710,7 @@ async def process_summary_task(
|
|
831 |
bot_token: str
|
832 |
) -> None:
|
833 |
"""Handles fetching content, generating summary, and sending results."""
|
834 |
-
# ... (Keep existing implementation
|
835 |
task_id = f"{user_id}-{message_id_to_edit or 'new'}"
|
836 |
logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
|
837 |
background_request: Optional[BaseRequest] = None
|
@@ -840,43 +719,25 @@ async def process_summary_task(
|
|
840 |
user_feedback_message: Optional[str] = None
|
841 |
success = False
|
842 |
status_message_id = message_id_to_edit # Keep track of the original button message ID
|
843 |
-
message_to_delete_later_id: Optional[int] = None # For temporary "Processing..." messages
|
844 |
|
845 |
try:
|
846 |
-
# Initialize background bot
|
847 |
background_request = HTTPXRequest(connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0)
|
848 |
bot = Bot(token=bot_token, request=background_request)
|
849 |
except Exception as e:
|
850 |
-
logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True)
|
851 |
-
# Cannot send feedback without bot
|
852 |
-
return
|
853 |
|
854 |
try:
|
855 |
-
#
|
856 |
-
# We *always* edit the original button message first
|
857 |
processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n{html.escape(url)}\n\nThis might take a moment..."
|
858 |
-
|
859 |
-
if status_message_id: # If we have the ID of the message with buttons
|
860 |
try:
|
861 |
-
await retry_bot_operation(
|
862 |
-
bot.edit_message_text,
|
863 |
-
chat_id=chat_id,
|
864 |
-
message_id=status_message_id,
|
865 |
-
text=processing_message_text,
|
866 |
-
parse_mode=ParseMode.HTML, # Use HTML for escaped URL
|
867 |
-
reply_markup=None, # Remove buttons
|
868 |
-
link_preview_options={'is_disabled': True} # Disable preview here too
|
869 |
-
)
|
870 |
logger.debug(f"[Task {task_id}] Edited original button message {status_message_id} to 'Processing'")
|
871 |
-
edited_original_msg = True
|
872 |
except Exception as e:
|
873 |
-
logger.warning(f"[Task {task_id}] Could not edit original button message {status_message_id}: {e}.
|
874 |
-
# Don't send a new message here, the final result/error will handle it.
|
875 |
-
# Keep status_message_id so we know the original message still exists.
|
876 |
|
877 |
-
# Indicate activity
|
878 |
try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
879 |
-
except Exception: pass
|
880 |
|
881 |
# --- Determine URL Type and Fetch Content ---
|
882 |
is_youtube = is_youtube_url(url)
|
@@ -884,43 +745,28 @@ async def process_summary_task(
|
|
884 |
|
885 |
if is_youtube:
|
886 |
video_id = extract_youtube_id(url)
|
887 |
-
if video_id:
|
888 |
-
|
889 |
-
|
890 |
-
user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
|
891 |
-
|
892 |
-
if not content and not user_feedback_message:
|
893 |
-
user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
|
894 |
else:
|
895 |
-
# Website URL -
|
896 |
logger.info(f"[Task {task_id}] URL is website. Attempting Crawl4AI (Primary)...")
|
897 |
content = await get_website_content_via_crawl4ai(url)
|
898 |
-
|
899 |
if not content:
|
900 |
-
|
901 |
-
logger.warning(f"[Task {task_id}] Crawl4AI failed for {url}. Attempting BeautifulSoup (Fallback 1)...")
|
902 |
try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
903 |
except Exception: pass
|
904 |
content = await get_website_content_bs4(url)
|
905 |
-
|
906 |
if not content:
|
907 |
-
logger.warning(f"[Task {task_id}] BeautifulSoup
|
908 |
global URLTOTEXT_API_KEY, _urltotext_fallback_enabled
|
909 |
if _urltotext_fallback_enabled:
|
910 |
try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
911 |
except Exception: pass
|
912 |
content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
|
913 |
-
if not content:
|
914 |
-
|
915 |
-
|
916 |
-
else:
|
917 |
-
logger.warning(f"[Task {task_id}] API fallback is disabled. Cannot attempt Fallback 2.")
|
918 |
-
user_feedback_message = "Sorry, I couldn't fetch content from that website using Crawl4AI or BeautifulSoup, and the API fallback is not enabled." # Updated message
|
919 |
-
|
920 |
-
# Final check if all web methods failed
|
921 |
-
if not content and not user_feedback_message:
|
922 |
-
logger.error(f"[Task {task_id}] All website fetching methods seem to have failed without setting a specific user message.")
|
923 |
-
user_feedback_message = "Sorry, I couldn't fetch content from that website using any available method (blocked/inaccessible/empty?)."
|
924 |
|
925 |
|
926 |
# --- Generate Summary if Content was Fetched ---
|
@@ -928,175 +774,99 @@ async def process_summary_task(
|
|
928 |
logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary type: '{summary_type}'.")
|
929 |
try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
930 |
except Exception: pass
|
931 |
-
|
932 |
-
final_summary = await generate_summary(content, summary_type) # Calls Gemini -> OpenRouter
|
933 |
|
934 |
if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
|
935 |
-
user_feedback_message = final_summary
|
936 |
logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
|
937 |
-
success = False
|
938 |
else:
|
939 |
-
# Success - Split and Send
|
940 |
summary_parts = []
|
941 |
-
current_part = ""
|
942 |
-
# Split respecting newlines, ensure no part exceeds MAX_SUMMARY_CHUNK_SIZE
|
943 |
-
lines = final_summary.splitlines(keepends=True)
|
944 |
for line in lines:
|
945 |
if len(current_part) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
|
946 |
-
if current_part.strip():
|
947 |
-
|
948 |
-
|
949 |
-
if len(current_part) > MAX_SUMMARY_CHUNK_SIZE:
|
950 |
-
logger.warning(f"[Task {task_id}] Truncating overly long line in summary.")
|
951 |
-
current_part = current_part[:MAX_SUMMARY_CHUNK_SIZE]
|
952 |
-
else:
|
953 |
-
current_part += line
|
954 |
if current_part.strip(): summary_parts.append(current_part.strip())
|
955 |
-
if not summary_parts:
|
956 |
-
summary_parts.append("Summary generated, but it appears to be empty.")
|
957 |
-
logger.warning(f"[Task {task_id}] Summary was non-empty initially but splitting resulted in zero parts.")
|
958 |
|
959 |
logger.info(f"[Task {task_id}] Summary generated (orig len: {len(final_summary)}). Sending in {len(summary_parts)} part(s).")
|
960 |
|
961 |
-
#
|
962 |
edited_final_result = False
|
963 |
-
if status_message_id:
|
964 |
try:
|
965 |
-
await retry_bot_operation(
|
966 |
-
|
967 |
-
chat_id=chat_id,
|
968 |
-
message_id=status_message_id,
|
969 |
-
text=summary_parts[0],
|
970 |
-
parse_mode=None, # Send as plain text initially, safer
|
971 |
-
link_preview_options={'is_disabled': True}
|
972 |
-
)
|
973 |
-
logger.debug(f"[Task {task_id}] Edited original button message {status_message_id} with first summary part.")
|
974 |
edited_final_result = True
|
975 |
-
except Exception as edit_err:
|
976 |
-
logger.warning(f"[Task {task_id}] Failed to edit original button message {status_message_id} with summary part 1: {edit_err}. Sending as new message.")
|
977 |
-
# Original message remains as "Processing..." or its initial state if first edit failed
|
978 |
-
|
979 |
-
# If editing failed, or if there was no original message_id (shouldn't happen), send first part as new message
|
980 |
if not edited_final_result:
|
981 |
-
sent_msg = await retry_bot_operation(
|
982 |
-
|
983 |
-
|
984 |
-
|
985 |
-
|
986 |
-
link_preview_options={'is_disabled': True}
|
987 |
-
)
|
988 |
-
if not sent_msg:
|
989 |
-
logger.error(f"[Task {task_id}] Failed to send first summary part even as new message.")
|
990 |
-
user_feedback_message = "Sorry, failed to send the summary."
|
991 |
-
success = False
|
992 |
-
|
993 |
-
# Send remaining parts (if any and first part succeeded)
|
994 |
-
if success and len(summary_parts) > 1: # Check success flag before sending more parts
|
995 |
for i, part in enumerate(summary_parts[1:], start=2):
|
996 |
-
await asyncio.sleep(0.5)
|
997 |
try:
|
998 |
-
await retry_bot_operation(
|
999 |
-
bot.send_message,
|
1000 |
-
chat_id=chat_id,
|
1001 |
-
text=part,
|
1002 |
-
parse_mode=None,
|
1003 |
-
link_preview_options={'is_disabled': True}
|
1004 |
-
)
|
1005 |
logger.debug(f"[Task {task_id}] Sent summary part {i}/{len(summary_parts)}.")
|
1006 |
-
except Exception as part_err:
|
1007 |
-
|
1008 |
-
|
1009 |
-
|
1010 |
-
|
1011 |
-
|
1012 |
-
|
1013 |
-
if not user_feedback_message:
|
1014 |
-
success = True
|
1015 |
-
|
1016 |
-
# --- Handle Failure Cases (Content Fetching Failed or Summary Failed) ---
|
1017 |
-
if not success: # Check overall success flag
|
1018 |
-
if not user_feedback_message: # If success is false but no specific message, set a generic one
|
1019 |
-
user_feedback_message = "Sorry, an unknown error occurred during processing."
|
1020 |
-
logger.error(f"[Task {task_id}] Task failed but no specific user_feedback_message was set.")
|
1021 |
-
|
1022 |
-
logger.warning(f"[Task {task_id}] Sending failure/error feedback to user: {user_feedback_message}")
|
1023 |
try:
|
1024 |
-
#
|
1025 |
edited_final_error = False
|
1026 |
if status_message_id:
|
1027 |
try:
|
1028 |
-
await retry_bot_operation(
|
1029 |
-
|
1030 |
-
chat_id=chat_id,
|
1031 |
-
message_id=status_message_id,
|
1032 |
-
text=user_feedback_message,
|
1033 |
-
link_preview_options={'is_disabled': True},
|
1034 |
-
reply_markup=None # Remove buttons
|
1035 |
-
)
|
1036 |
-
logger.debug(f"[Task {task_id}] Edited original button message {status_message_id} with failure feedback.")
|
1037 |
edited_final_error = True
|
1038 |
-
except Exception as edit_err:
|
1039 |
-
logger.warning(f"[Task {task_id}] Failed to edit original button message {status_message_id} with failure feedback: {edit_err}. Sending as new message.")
|
1040 |
-
|
1041 |
-
# If editing failed or wasn't applicable, send error as a new message
|
1042 |
if not edited_final_error:
|
1043 |
-
await retry_bot_operation(
|
1044 |
-
bot.send_message,
|
1045 |
-
chat_id=chat_id,
|
1046 |
-
text=user_feedback_message,
|
1047 |
-
link_preview_options={'is_disabled': True}
|
1048 |
-
)
|
1049 |
logger.debug(f"[Task {task_id}] Sent failure feedback as new message.")
|
1050 |
-
except Exception as send_err:
|
1051 |
-
logger.error(f"[Task {task_id}] Failed even to send failure feedback message: {send_err}")
|
1052 |
|
1053 |
except Exception as e:
|
1054 |
-
# Catch-all for unexpected errors
|
1055 |
logger.error(f"[Task {task_id}] Unexpected error during processing task: {e}", exc_info=True)
|
1056 |
-
success = False
|
1057 |
-
|
1058 |
-
|
1059 |
-
|
1060 |
-
# Attempt to send a final error message (maybe edit original if possible?)
|
1061 |
-
edited_final_crash_error = False
|
1062 |
if status_message_id:
|
1063 |
-
try:
|
1064 |
-
|
1065 |
-
|
1066 |
-
|
1067 |
-
if not edited_final_crash_error:
|
1068 |
-
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message )
|
1069 |
-
except Exception as final_err:
|
1070 |
-
logger.error(f"[Task {task_id}] Failed to send the final unexpected error feedback: {final_err}")
|
1071 |
|
1072 |
finally:
|
1073 |
-
#
|
1074 |
-
# No automatic deletion needed now. The original message (status_message_id) is either
|
1075 |
-
# edited with the result/error, or left as is if editing failed. Temporary messages
|
1076 |
-
# were not introduced in this version of the logic.
|
1077 |
-
|
1078 |
-
# Close the background bot's HTTP client
|
1079 |
if background_request and hasattr(background_request, '_client') and background_request._client:
|
1080 |
-
try:
|
1081 |
-
|
1082 |
-
logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.")
|
1083 |
-
except Exception as close_err:
|
1084 |
-
logger.warning(f"[Task {task_id}] Error closing background bot's client: {close_err}")
|
1085 |
-
|
1086 |
logger.info(f"[Task {task_id}] Task completed. Success: {success}")
|
1087 |
|
1088 |
|
1089 |
# --- Telegram Handlers ---
|
1090 |
# (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler)
|
1091 |
-
# No changes needed in these handlers
|
1092 |
-
|
1093 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
|
|
1094 |
user = update.effective_user; mention = user.mention_html()
|
1095 |
if not user or not update.message: return
|
1096 |
logger.info(f"User {user.id} ({user.username or 'N/A'}) used /start.")
|
1097 |
await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" )
|
1098 |
|
1099 |
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
|
|
1100 |
user = update.effective_user
|
1101 |
if not user or not update.message: return
|
1102 |
logger.info(f"User {user.id} ({user.username or 'N/A'}) used /help.")
|
@@ -1104,9 +874,9 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
|
|
1104 |
"1. Send me any YouTube video link or website URL.\n"
|
1105 |
"2. I'll ask how you want it summarised (paragraph or points).\n"
|
1106 |
"3. Click the button for your choice.\n"
|
1107 |
-
"4. Wait while I process it!\n\n"
|
1108 |
"⚙️ **Behind the scenes:**\n"
|
1109 |
-
f"• **Websites:** I try `Crawl4AI` (smart crawl), then `BeautifulSoup` (basic scrape), and `urltotext.com` API (if configured & credits available).\n"
|
1110 |
"• **YouTube:** I use `youtube-transcript-api` first, then fall back to `Supadata` and `Apify` APIs if needed.\n"
|
1111 |
f"• **Summaries:** Generated using Google `{GEMINI_MODEL}` (primary) or `{OPENROUTER_MODEL}` (fallback, if configured).\n\n"
|
1112 |
"**Commands:**\n"
|
@@ -1115,82 +885,51 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
|
|
1115 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
1116 |
|
1117 |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
|
|
1118 |
if not update.message or not update.message.text: return
|
1119 |
url = update.message.text.strip(); user = update.effective_user
|
1120 |
if not user: return
|
1121 |
-
|
1122 |
-
url_pattern = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE) # Slightly improved regex
|
1123 |
if not url_pattern.match(url):
|
1124 |
logger.debug(f"Ignoring non-URL from {user.id}: {url}")
|
1125 |
-
await update.message.reply_text("Hmm, that doesn't look like a valid web URL
|
1126 |
-
return
|
1127 |
logger.info(f"User {user.id} ({user.username or 'N/A'}) sent potential URL: {url}")
|
1128 |
-
# Store URL and original message ID in user_data
|
1129 |
context.user_data['url_to_summarize'] = url
|
1130 |
context.user_data['original_message_id'] = update.message.message_id
|
1131 |
keyboard = [[ InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"), InlineKeyboardButton("Points Summary", callback_data="points") ]]
|
1132 |
reply_markup = InlineKeyboardMarkup(keyboard)
|
1133 |
-
escaped_url = html.escape(url)
|
1134 |
await update.message.reply_html( f"Okay, I see this link:\n<code>{escaped_url}</code>\n\nHow would you like it summarised?", reply_markup=reply_markup, disable_web_page_preview=True )
|
1135 |
|
1136 |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
|
|
1137 |
query = update.callback_query
|
1138 |
if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
|
1139 |
user = query.from_user; summary_type = query.data; query_id = query.id
|
1140 |
-
try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id}
|
1141 |
-
except Exception as e: logger.warning(f"Error answering callback {query_id}: {e}")
|
1142 |
|
1143 |
url = context.user_data.get('url_to_summarize')
|
1144 |
-
message_id_to_edit = query.message.message_id
|
1145 |
-
logger.info(f"User {user.id}
|
1146 |
|
1147 |
if not url:
|
1148 |
logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Expired?")
|
1149 |
-
try:
|
1150 |
-
await query.edit_message_text(text="Sorry, I seem to have lost the context for that link. 🤔 Please send the URL again.", reply_markup=None)
|
1151 |
-
except BadRequest as e:
|
1152 |
-
if "message is not modified" in str(e).lower(): pass # Ignore if text is the same
|
1153 |
-
else: logger.error(f"Failed edit 'URL not found' msg: {e}")
|
1154 |
except Exception as e: logger.error(f"Failed edit 'URL not found' msg: {e}")
|
1155 |
-
return
|
1156 |
|
1157 |
-
# Check necessary configurations before scheduling
|
1158 |
global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
|
1159 |
-
if not TELEGRAM_TOKEN:
|
1160 |
-
|
1161 |
-
|
1162 |
-
|
1163 |
-
return
|
1164 |
-
if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
|
1165 |
-
logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid! Cannot summarize.")
|
1166 |
-
try: await query.edit_message_text(text="❌ AI configuration error: No summarization models are available. Cannot proceed.", reply_markup=None)
|
1167 |
-
except Exception: pass
|
1168 |
-
return
|
1169 |
-
elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) is unavailable, will rely on fallback (OpenRouter).")
|
1170 |
-
elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) is unavailable, relying on primary (Gemini).")
|
1171 |
|
1172 |
-
# Schedule the background task
|
1173 |
logger.info(f"Scheduling task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit} (URL: {url})")
|
1174 |
-
|
1175 |
-
asyncio.ensure_future(
|
1176 |
-
process_summary_task(
|
1177 |
-
user_id=user.id,
|
1178 |
-
chat_id=query.message.chat_id,
|
1179 |
-
message_id_to_edit=message_id_to_edit, # Pass the ID of the message with buttons
|
1180 |
-
url=url,
|
1181 |
-
summary_type=summary_type,
|
1182 |
-
bot_token=TELEGRAM_TOKEN # Pass token explicitly
|
1183 |
-
)
|
1184 |
-
# Consider adding .set_name(...) if using Python 3.8+ for better debugging
|
1185 |
-
# .set_name(f"SummaryTask-{user.id}-{message_id_to_edit}")
|
1186 |
-
)
|
1187 |
-
|
1188 |
-
# Clear context AFTER scheduling the task
|
1189 |
context.user_data.pop('url_to_summarize', None)
|
1190 |
context.user_data.pop('original_message_id', None)
|
1191 |
-
logger.debug(f"Cleared URL context for user {user.id} after scheduling
|
1192 |
-
|
1193 |
-
# Do not edit the message here - let the task handle its own status updates by editing this message
|
1194 |
|
1195 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
|
1196 |
# ... (Keep existing implementation) ...
|
@@ -1211,7 +950,6 @@ async def setup_bot_config() -> Application:
|
|
1211 |
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
|
1212 |
custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 )
|
1213 |
application = ( Application.builder() .token(TELEGRAM_TOKEN) .request(custom_request) .build() )
|
1214 |
-
# Add Handlers
|
1215 |
application.add_handler(CommandHandler("start", start))
|
1216 |
application.add_handler(CommandHandler("help", help_command))
|
1217 |
url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
|
@@ -1221,21 +959,22 @@ async def setup_bot_config() -> Application:
|
|
1221 |
logger.info("Telegram application handlers configured."); return application
|
1222 |
|
1223 |
# --- ASGI Lifespan & Routes ---
|
1224 |
-
# (lifespan, telegram_webhook
|
1225 |
@contextlib.asynccontextmanager
|
1226 |
async def lifespan(app: Starlette):
|
|
|
1227 |
global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN
|
1228 |
logger.info("ASGI Lifespan: Startup initiated...");
|
1229 |
if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.")
|
1230 |
try:
|
1231 |
ptb_app = await setup_bot_config()
|
1232 |
-
await ptb_app.initialize()
|
1233 |
bot_info = await ptb_app.bot.get_me()
|
1234 |
logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})")
|
1235 |
|
1236 |
-
# Webhook setup
|
1237 |
current_webhook_info = await ptb_app.bot.get_webhook_info()
|
1238 |
-
webhook_delete_success = True
|
1239 |
if current_webhook_info and current_webhook_info.url:
|
1240 |
logger.info(f"Found existing webhook: {current_webhook_info.url}. Deleting...")
|
1241 |
try:
|
@@ -1243,212 +982,115 @@ async def lifespan(app: Starlette):
|
|
1243 |
else: logger.warning("Failed delete webhook (API returned False)."); webhook_delete_success = False
|
1244 |
except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1); webhook_delete_success = False
|
1245 |
|
1246 |
-
# Determine Webhook URL
|
1247 |
space_host = os.environ.get("SPACE_HOST")
|
1248 |
-
webhook_path = "/webhook"
|
1249 |
full_webhook_url = None
|
1250 |
if space_host:
|
1251 |
-
|
1252 |
-
host = space_host.split('://')[-1]
|
1253 |
-
full_webhook_url = f"{protocol}://{host.rstrip('/')}{webhook_path}"
|
1254 |
logger.info(f"Using SPACE_HOST to determine webhook URL: {full_webhook_url}")
|
1255 |
-
else:
|
1256 |
-
logger.critical("Could not construct webhook URL (SPACE_HOST env var missing or invalid).")
|
1257 |
-
raise RuntimeError("Webhook URL undetermined.")
|
1258 |
|
1259 |
-
# Set Webhook
|
1260 |
if full_webhook_url and webhook_delete_success:
|
1261 |
logger.info(f"Attempting to set webhook: {full_webhook_url}")
|
1262 |
-
set_webhook_args = {
|
1263 |
-
|
1264 |
-
|
1265 |
-
"drop_pending_updates": True
|
1266 |
-
}
|
1267 |
-
if WEBHOOK_SECRET:
|
1268 |
-
set_webhook_args["secret_token"] = WEBHOOK_SECRET
|
1269 |
-
logger.info("Webhook secret token will be used.")
|
1270 |
-
|
1271 |
-
await asyncio.sleep(1.0) # Short delay before setting
|
1272 |
-
|
1273 |
try:
|
1274 |
webhook_set = await ptb_app.bot.set_webhook(**set_webhook_args)
|
1275 |
-
if not webhook_set:
|
1276 |
-
|
1277 |
-
await asyncio.sleep(1.5) # Longer delay after setting before verification
|
1278 |
webhook_info = await ptb_app.bot.get_webhook_info()
|
1279 |
if webhook_info and webhook_info.url == full_webhook_url:
|
1280 |
logger.info(f"Webhook set and verified successfully: URL='{webhook_info.url}', Secret={'YES' if WEBHOOK_SECRET else 'NO'}")
|
1281 |
-
if webhook_info.last_error_message:
|
1282 |
-
|
1283 |
-
|
1284 |
-
logger.error(f"Webhook verification failed! Expected '{full_webhook_url}', Got info: {webhook_info}")
|
1285 |
-
raise RuntimeError("Webhook verification failed after setting.")
|
1286 |
-
|
1287 |
-
await ptb_app.start() # Start PTB processing updates AFTER successful webhook setup
|
1288 |
logger.info("PTB Application started (webhook mode).")
|
1289 |
-
except Exception as e:
|
1290 |
-
|
1291 |
-
raise RuntimeError(f"Failed to set/verify webhook: {e}") from e
|
1292 |
-
elif not webhook_delete_success:
|
1293 |
-
logger.error("FATAL: Failed to delete previous webhook. Cannot set new one.")
|
1294 |
-
raise RuntimeError("Failed to delete previous webhook.")
|
1295 |
-
# else: # Already handled by raising error if full_webhook_url is None
|
1296 |
|
1297 |
-
logger.info("ASGI Lifespan: Startup complete.");
|
1298 |
-
yield # Application runs here
|
1299 |
|
1300 |
except Exception as startup_err:
|
1301 |
logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
|
1302 |
-
# Ensure cleanup happens even if startup fails partially
|
1303 |
if ptb_app:
|
1304 |
try:
|
1305 |
if ptb_app.running: await ptb_app.stop()
|
1306 |
-
# Check if initialized before shutting down
|
1307 |
if ptb_app._initialized: await ptb_app.shutdown()
|
1308 |
-
except Exception as shutdown_err:
|
1309 |
-
|
1310 |
-
raise # Reraise the original startup error
|
1311 |
-
|
1312 |
finally:
|
1313 |
-
# Shutdown sequence
|
1314 |
logger.info("ASGI Lifespan: Shutdown initiated...")
|
1315 |
if ptb_app:
|
1316 |
try:
|
1317 |
-
if ptb_app.running:
|
1318 |
-
|
1319 |
-
|
1320 |
-
# Check if initialized before shutting down
|
1321 |
-
if ptb_app._initialized:
|
1322 |
-
logger.info("Shutting down PTB application...")
|
1323 |
-
await ptb_app.shutdown()
|
1324 |
-
logger.info("PTB Application shut down successfully.")
|
1325 |
-
else:
|
1326 |
-
logger.info("PTB Application was not fully initialized, skipping shutdown.")
|
1327 |
-
except Exception as e:
|
1328 |
-
logger.error(f"Error during PTB shutdown: {e}", exc_info=True)
|
1329 |
-
else:
|
1330 |
-
logger.info("PTB application object not created.")
|
1331 |
logger.info("ASGI Lifespan: Shutdown complete.")
|
1332 |
|
1333 |
-
|
1334 |
async def health_check(request: Request) -> PlainTextResponse:
|
|
|
1335 |
global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled, _crawl4ai_primary_web_enabled, _urltotext_fallback_enabled, SUPADATA_API_KEY
|
1336 |
-
bot_status = "Not Initialized"
|
1337 |
-
|
1338 |
-
# *** FIXED Attribute Name ***
|
1339 |
-
if ptb_app and ptb_app.bot and ptb_app._initialized: # Check if initialized using _initialized
|
1340 |
try:
|
1341 |
-
# Quick check if webhook seems ok, more reliable than get_me() sometimes
|
1342 |
wh_info = await ptb_app.bot.get_webhook_info()
|
1343 |
-
# Check running status as well
|
1344 |
if ptb_app.running and wh_info and wh_info.url:
|
1345 |
-
bot_info = await ptb_app.bot.get_me()
|
1346 |
-
bot_username = f"@{bot_info.username}"
|
1347 |
bot_status = f"Running (Webhook OK, {bot_username})"
|
1348 |
-
elif ptb_app.running:
|
1349 |
-
bot_status = f"Running (Webhook Status: {wh_info.url if wh_info else 'N/A'}, Last Error: {wh_info.last_error_message if wh_info else 'N/A'})"
|
1350 |
else: bot_status = "Initialized/Not running"
|
1351 |
-
except Exception as e:
|
1352 |
-
|
1353 |
-
bot_status = f"Error checking status: {e}"
|
1354 |
-
elif ptb_app:
|
1355 |
-
bot_status = "Initializing..." # Or Initialized but not running yet
|
1356 |
|
1357 |
health_info = [
|
1358 |
-
f"=== Telegram Summary Bot Status ===",
|
1359 |
-
f"
|
1360 |
-
"--- Services ---",
|
1361 |
-
f"Primary Web Scraper: {'Crawl4AI' if _crawl4ai_primary_web_enabled else 'DISABLED (Lib Missing)'}",
|
1362 |
f"Fallback Web Scraper 1: BeautifulSoup",
|
1363 |
f"Fallback Web Scraper 2: {'urltotext.com API' if _urltotext_fallback_enabled else 'DISABLED (No Key)'}",
|
1364 |
f"Primary Summarizer: {'Gemini (' + GEMINI_MODEL + ')' if _gemini_primary_enabled else 'DISABLED (No Key/Lib)'}",
|
1365 |
f"Fallback Summarizer: {'OpenRouter (' + OPENROUTER_MODEL + ')' if _openrouter_fallback_enabled else 'DISABLED (No Key)'}",
|
1366 |
f"Primary YT Transcript: youtube-transcript-api",
|
1367 |
f"Fallback YT Transcript 1: {'Supadata API' if SUPADATA_API_KEY else 'DISABLED (No Key)'}",
|
1368 |
-
f"Fallback YT Transcript 2: {'Apify (' + APIFY_ACTOR_ID + ')' if _apify_token_exists else 'DISABLED (No Key)'}"
|
1369 |
-
]
|
1370 |
return PlainTextResponse("\n".join(health_info))
|
1371 |
|
1372 |
-
|
1373 |
async def telegram_webhook(request: Request) -> Response:
|
1374 |
# ... (Keep existing implementation) ...
|
1375 |
global WEBHOOK_SECRET, ptb_app
|
1376 |
-
if not ptb_app:
|
1377 |
-
|
1378 |
-
|
1379 |
-
|
1380 |
-
|
1381 |
-
return PlainTextResponse('Bot not initialized', status_code=503)
|
1382 |
-
if not ptb_app.running:
|
1383 |
-
logger.warning("Webhook received but PTB application not running.")
|
1384 |
-
return PlainTextResponse('Bot not running', status_code=503) # Service Unavailable
|
1385 |
-
|
1386 |
-
# Validate webhook secret if configured
|
1387 |
if WEBHOOK_SECRET:
|
1388 |
token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
|
1389 |
-
if token_header != WEBHOOK_SECRET:
|
1390 |
-
logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'")
|
1391 |
-
return Response(content="Invalid secret token", status_code=403) # Forbidden
|
1392 |
|
1393 |
try:
|
1394 |
-
# Process update
|
1395 |
update_data = await request.json()
|
1396 |
update = Update.de_json(data=update_data, bot=ptb_app.bot)
|
1397 |
logger.debug(f"Processing update_id: {update.update_id} via webhook")
|
1398 |
-
# Use process_update which handles tasks in the background
|
1399 |
await ptb_app.process_update(update)
|
1400 |
-
return Response(status_code=200)
|
1401 |
-
|
1402 |
-
except
|
1403 |
-
logger.error("Webhook received invalid JSON.")
|
1404 |
-
return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
|
1405 |
-
except Exception as e:
|
1406 |
-
# Log error but return OK to Telegram to prevent retries for processing errors
|
1407 |
-
logger.error(f"Error processing webhook update: {e}", exc_info=True)
|
1408 |
-
return Response(status_code=200) # OK
|
1409 |
|
1410 |
|
1411 |
# --- ASGI App Definition ---
|
1412 |
-
|
1413 |
-
app = Starlette(
|
1414 |
-
debug=False, # Set to True for more verbose errors during development if needed
|
1415 |
-
lifespan=lifespan,
|
1416 |
-
routes=[
|
1417 |
-
Route("/", endpoint=health_check, methods=["GET"]),
|
1418 |
-
Route("/webhook", endpoint=telegram_webhook, methods=["POST"]),
|
1419 |
-
]
|
1420 |
-
)
|
1421 |
logger.info("Starlette ASGI application created with native routes.")
|
1422 |
|
1423 |
|
1424 |
# --- Development Runner ---
|
1425 |
-
# (Remains the same)
|
1426 |
if __name__ == '__main__':
|
|
|
1427 |
import uvicorn
|
1428 |
logger.warning("Running in development mode using Uvicorn directly - FOR LOCAL TESTING ONLY")
|
1429 |
log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
|
1430 |
-
# Use the PORT env var for local running too, defaulting to 8080
|
1431 |
local_port = int(os.environ.get('PORT', 8080))
|
1432 |
-
|
1433 |
-
|
1434 |
-
|
1435 |
-
|
1436 |
-
|
1437 |
-
load_dotenv()
|
1438 |
-
logger.info("Loaded environment variables from .env file for local development.")
|
1439 |
-
except ImportError:
|
1440 |
-
logger.info(".env file not found or python-dotenv not installed, using system environment variables.")
|
1441 |
-
|
1442 |
-
|
1443 |
-
# Re-check required tokens after potential .env load
|
1444 |
-
if not get_secret('TELEGRAM_TOKEN'): logger.critical("Local Dev: TELEGRAM_TOKEN not found.")
|
1445 |
-
if not get_secret('GEMINI_API_KEY'): logger.error("Local Dev: GEMINI_API_KEY not found.")
|
1446 |
-
# Add checks for other keys as needed for local testing
|
1447 |
-
|
1448 |
-
uvicorn.run(
|
1449 |
-
"main:app",
|
1450 |
-
host='0.0.0.0',
|
1451 |
-
port=local_port,
|
1452 |
-
log_level=log_level,
|
1453 |
-
reload=True # Enable auto-reload for development
|
1454 |
-
)
|
|
|
53 |
# --- Google Gemini ---
|
54 |
try:
|
55 |
import google.generativeai as genai
|
|
|
56 |
from google.generativeai.types import HarmCategory, HarmBlockThreshold, GenerateContentResponse
|
57 |
_gemini_available = True
|
58 |
except ImportError:
|
|
|
73 |
logging.getLogger('uvicorn').setLevel(logging.INFO)
|
74 |
logging.getLogger('starlette').setLevel(logging.INFO)
|
75 |
if _gemini_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING)
|
|
|
76 |
logging.getLogger("crawl4ai").setLevel(logging.INFO) # Adjust level as needed
|
77 |
logging.getLogger("playwright").setLevel(logging.WARNING)
|
78 |
|
|
|
111 |
# Models (User can still configure via env vars)
|
112 |
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model
|
113 |
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
|
|
|
114 |
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001") # Primary Model
|
115 |
|
116 |
# --- Configuration Checks ---
|
|
|
156 |
_gemini_primary_enabled = False
|
157 |
|
158 |
# --- Constants ---
|
159 |
+
MAX_SUMMARY_CHUNK_SIZE = 4000
|
160 |
+
MAX_INPUT_TOKEN_APPROX = 500000 # Keep conservative estimate for gemini-2.0-flash
|
|
|
|
|
|
|
161 |
|
162 |
# --- Retry Decorator ---
|
|
|
163 |
@retry(
|
164 |
stop=stop_after_attempt(4),
|
165 |
wait=wait_exponential(multiplier=1, min=2, max=15),
|
|
|
168 |
reraise=True
|
169 |
)
|
170 |
async def retry_bot_operation(func, *args, **kwargs):
|
171 |
+
# ... (Keep existing implementation) ...
|
172 |
try:
|
173 |
return await func(*args, **kwargs)
|
174 |
except BadRequest as e:
|
175 |
+
ignore_errors = [ "message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user", ]
|
|
|
|
|
|
|
176 |
if any(err in str(e).lower() for err in ignore_errors):
|
177 |
logger.warning(f"Ignoring non-critical BadRequest: {e}")
|
178 |
return None
|
|
|
179 |
logger.error(f"Potentially critical BadRequest: {e}")
|
180 |
raise
|
181 |
except TelegramError as e:
|
182 |
+
if isinstance(e, (TimedOut, NetworkError, RetryAfter)): logger.warning(f"Telegram transient error (will retry): {e}")
|
183 |
+
else: logger.error(f"Unhandled TelegramError: {e}")
|
184 |
+
raise
|
|
|
|
|
|
|
185 |
except Exception as e:
|
186 |
logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
|
187 |
raise
|
|
|
189 |
# --- Helper Functions ---
|
190 |
# (is_youtube_url, extract_youtube_id remain the same)
|
191 |
def is_youtube_url(url):
|
192 |
+
# ... (Keep existing implementation) ...
|
193 |
youtube_regex = re.compile( r'(?:https?://)?(?:www.)?(?:m.)?(?:youtube(?:-nocookie)?.com|youtu.be)/' r'(?:watch?v=|embed/|v/|shorts/|live/|attribution_link?a=.&u=/watch?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
|
194 |
match = youtube_regex.search(url); logger.debug(f"is_youtube_url '{url}': {bool(match)}"); return bool(match)
|
195 |
def extract_youtube_id(url):
|
196 |
+
# ... (Keep existing implementation) ...
|
197 |
youtube_regex = re.compile( r'(?:https?://)?(?:www.)?(?:m.)?(?:youtube(?:-nocookie)?.com|youtu.be)/' r'(?:watch?v=|embed/|v/|shorts/|live/|attribution_link?a=.&u=/watch?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
|
198 |
match = youtube_regex.search(url)
|
199 |
if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
|
|
|
235 |
return None
|
236 |
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
|
237 |
|
|
|
238 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
239 |
# ... (Keep existing implementation) ...
|
240 |
global APIFY_ACTOR_ID
|
|
|
283 |
except httpx.RequestError as e: logger.error(f"[Apify SyncItems] Request error during API interaction for {video_url}: {e}"); return None
|
284 |
except Exception as e: logger.error(f"[Apify SyncItems] Unexpected error during Apify SyncItems REST call for {video_url}: {e}", exc_info=True); return None
|
285 |
|
|
|
286 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
287 |
# ... (Keep existing implementation) ...
|
288 |
global SUPADATA_API_KEY, APIFY_API_TOKEN
|
|
|
291 |
transcript_text = None
|
292 |
logger.info("[Primary YT] Attempting youtube-transcript-api...")
|
293 |
try:
|
|
|
294 |
transcript_list = await asyncio.to_thread(
|
295 |
YouTubeTranscriptApi.list_transcripts(video_id).find_generated_transcript(['en', 'en-GB', 'en-US']).fetch
|
296 |
)
|
|
|
297 |
if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
|
298 |
if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text.strip()
|
299 |
else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None
|
|
|
305 |
transcript_text = None
|
306 |
except Exception as e:
|
307 |
logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
|
|
|
308 |
transcript_text = None
|
309 |
|
310 |
if transcript_text is None:
|
311 |
logger.info("[Fallback YT 1] Trying Supadata API...")
|
312 |
if SUPADATA_API_KEY:
|
313 |
transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
|
314 |
+
if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
|
315 |
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
|
316 |
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
|
317 |
|
|
|
319 |
logger.info("[Fallback YT 2] Trying Apify REST API (SyncItems)...")
|
320 |
if APIFY_API_TOKEN:
|
321 |
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
|
322 |
+
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify SyncItems REST for {video_url}"); return transcript_text
|
323 |
else: logger.warning(f"[Fallback YT 2] Apify SyncItems REST failed or no content for {video_url}.")
|
324 |
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
|
325 |
|
326 |
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
|
|
|
327 |
return transcript_text
|
328 |
|
329 |
|
|
|
341 |
return None
|
342 |
|
343 |
logger.info(f"[Crawl4AI Primary] Attempting to crawl URL: {url}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
344 |
|
345 |
try:
|
346 |
+
# *** MODIFIED: Instantiate with ignore_robots=True to bypass cache creation ***
|
347 |
+
async with AsyncWebCrawler(ignore_robots=True) as crawler:
|
348 |
+
logger.info(f"[Crawl4AI Primary] Initialized crawler (ignore_robots=True).")
|
|
|
349 |
# Use arun for a single URL crawl
|
350 |
result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90) # 90 sec timeout
|
351 |
|
|
|
356 |
return content
|
357 |
else:
|
358 |
logger.warning(f"[Crawl4AI Primary] Crawl successful for {url}, but extracted markdown content is empty.")
|
|
|
359 |
if result.text:
|
360 |
content = result.text.strip()
|
361 |
if content:
|
362 |
logger.info(f"[Crawl4AI Primary] Using .text fallback after empty markdown. Length: {len(content)}")
|
363 |
return content
|
364 |
return None # Return None if both markdown and text are empty
|
365 |
+
elif result and result.text:
|
366 |
content = result.text.strip()
|
367 |
if content:
|
368 |
logger.info(f"[Crawl4AI Primary] Success crawling {url} (using .text, markdown missing). Length: {len(content)}")
|
|
|
376 |
except asyncio.TimeoutError:
|
377 |
logger.error(f"[Crawl4AI Primary] Timeout occurred while crawling {url}")
|
378 |
return None
|
379 |
+
except PermissionError as e: # Should hopefully not happen now, but keep for logging
|
380 |
+
logger.error(f"[Crawl4AI Primary] Permission denied during crawl for {url}. Target path: '{e.filename}'. Error: {e}", exc_info=True)
|
381 |
+
return None
|
|
|
382 |
except Exception as e:
|
383 |
logger.error(f"[Crawl4AI Primary] Unexpected error during crawl for {url}: {e}", exc_info=True)
|
|
|
384 |
return None
|
385 |
|
386 |
|
|
|
396 |
response.raise_for_status()
|
397 |
content_type = response.headers.get('content-type', '').lower()
|
398 |
if 'html' not in content_type: logger.warning(f"[Web Scrape BS4] Non-HTML content type from {url}: {content_type}"); return None
|
399 |
+
try: return response.text
|
400 |
except Exception as e: logger.error(f"[Web Scrape BS4] Error getting response text for {url}: {e}"); return None
|
401 |
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape BS4] HTTP error {e.response.status_code} fetching {url}: {e}")
|
402 |
except httpx.TimeoutException: logger.error(f"[Web Scrape BS4] Timeout error fetching {url}")
|
|
|
406 |
return None
|
407 |
|
408 |
async def get_website_content_bs4(url: str) -> Optional[str]:
|
|
|
409 |
# ... (Keep existing implementation) ...
|
410 |
if not url: logger.error("[BS4 Fallback] get_website_content_bs4: No URL"); return None
|
411 |
logger.info(f"[BS4 Fallback] Attempting basic fetch & parse for: {url}")
|
|
|
413 |
if not html_content:
|
414 |
logger.warning(f"[BS4 Fallback] Failed to fetch HTML for {url}")
|
415 |
return None
|
|
|
416 |
try:
|
|
|
417 |
def parse_html(content):
|
418 |
soup = BeautifulSoup(content, DEFAULT_PARSER)
|
419 |
+
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]): element.extract()
|
420 |
+
selectors = ['main', 'article', '[role="main"]', '#content', '.content', '#main-content', '.main-content', '#body', '.body', '#article-body', '.article-body', '.post-content', '.entry-content']
|
|
|
|
|
|
|
421 |
target_element = None
|
422 |
for selector in selectors:
|
423 |
try:
|
424 |
target_element = soup.select_one(selector)
|
425 |
if target_element: break
|
426 |
+
except Exception as sel_e: logger.warning(f"[BS4 Fallback] Invalid selector '{selector}': {sel_e}"); continue
|
427 |
+
if not target_element: target_element = soup.body
|
|
|
|
|
|
|
|
|
428 |
if not target_element: logger.warning(f"[BS4 Fallback] Could not find body/main for parsing {url}"); return None
|
|
|
|
|
429 |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
|
430 |
+
text = " ".join(lines)
|
431 |
+
text = re.sub(r'\s{2,}', ' ', text).strip()
|
|
|
|
|
|
|
432 |
if not text: logger.warning(f"[BS4 Fallback] Extracted text is empty after cleaning for {url}"); return None
|
433 |
return text
|
|
|
434 |
text_content = await asyncio.to_thread(parse_html, html_content)
|
435 |
if text_content:
|
436 |
logger.info(f"[BS4 Fallback] Success scrape/parse for {url} (final len: {len(text_content)})")
|
|
|
444 |
|
445 |
# Fallback 2: urltotext.com API
|
446 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
|
|
447 |
# ... (Keep existing implementation) ...
|
448 |
if not url: logger.error("[API Fallback] No URL"); return None
|
449 |
if not api_key: logger.error("[API Fallback] urltotext.com API key missing."); return None
|
|
|
465 |
else: logger.warning(f"[API Fallback] urltotext.com API success but content empty for {url}. Resp: {data}"); return None
|
466 |
except json.JSONDecodeError: logger.error(f"[API Fallback] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
|
467 |
except Exception as e: logger.error(f"[API Fallback] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
|
468 |
+
elif response.status_code == 402: logger.error(f"[API Fallback] Error 402 (Insufficient Credits) from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
469 |
+
# Added check for 400 specifically mentioning bad URL
|
470 |
+
elif response.status_code == 400 and "url" in response.text.lower():
|
471 |
+
logger.error(f"[API Fallback] Error 400 (Likely Bad URL) from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
472 |
elif response.status_code in [400, 401, 403, 422, 500]: logger.error(f"[API Fallback] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
473 |
else: logger.error(f"[API Fallback] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
474 |
except httpx.TimeoutException: logger.error(f"[API Fallback] Timeout connecting to urltotext.com API for {url}"); return None
|
|
|
483 |
logger.error("[Gemini Primary] Called but is disabled.");
|
484 |
return None, "Error: Primary AI service (Gemini) not configured/available."
|
485 |
|
|
|
486 |
if len(text) > MAX_INPUT_TOKEN_APPROX:
|
487 |
logger.warning(f"[Gemini Primary] Input text length ({len(text)}) exceeds limit ({MAX_INPUT_TOKEN_APPROX}). Truncating.")
|
488 |
text = text[:MAX_INPUT_TOKEN_APPROX]
|
489 |
|
490 |
logger.info(f"[Gemini Primary] Generating {summary_type} summary using {GEMINI_MODEL}. Input length: {len(text)}")
|
491 |
|
|
|
492 |
if summary_type == "paragraph":
|
493 |
+
prompt = f"""Please summarise the following text into a concise paragraph...""" # Keep prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
494 |
elif summary_type == "points":
|
495 |
+
prompt = f"""Please summarise the following text into a list of key bullet points...""" # Keep prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
496 |
else:
|
497 |
logger.error(f"[Gemini Primary] Invalid summary_type: {summary_type}")
|
498 |
return None, f"Error: Invalid summary type '{summary_type}' specified."
|
499 |
|
500 |
+
# *** MODIFIED: Disable safety settings as requested ***
|
501 |
+
safety_settings = { category: HarmBlockThreshold.BLOCK_NONE for category in HarmCategory }
|
502 |
+
# safety_settings = {
|
503 |
+
# HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
|
504 |
+
# HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
|
505 |
+
# HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
|
506 |
+
# HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
|
507 |
+
# # HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY: HarmBlockThreshold.BLOCK_NONE # If supported by model
|
508 |
+
# }
|
509 |
+
logger.info("[Gemini Primary] Safety settings disabled (BLOCK_NONE).")
|
510 |
+
|
511 |
|
|
|
512 |
generation_config = genai.types.GenerationConfig(
|
513 |
+
max_output_tokens=2048,
|
514 |
+
temperature=0.7,
|
|
|
|
|
|
|
|
|
515 |
)
|
516 |
|
517 |
try:
|
518 |
model = genai.GenerativeModel(GEMINI_MODEL)
|
519 |
logger.debug(f"[Gemini Primary] Sending request to model {GEMINI_MODEL}")
|
520 |
+
response: GenerateContentResponse = await model.generate_content_async(
|
521 |
prompt,
|
522 |
generation_config=generation_config,
|
523 |
safety_settings=safety_settings,
|
|
|
524 |
)
|
525 |
|
526 |
+
# 1. Check for explicit blocks (even with BLOCK_NONE, core harms might be blocked)
|
527 |
if not response.candidates:
|
528 |
block_reason = "Unknown"
|
529 |
+
safety_ratings_str = "N/A"
|
530 |
if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
|
531 |
block_reason = response.prompt_feedback.block_reason or "Reason not specified"
|
532 |
+
if response.prompt_feedback.safety_ratings:
|
533 |
+
safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in response.prompt_feedback.safety_ratings])
|
534 |
+
error_msg = f"Error: Gemini response blocked or empty (Prompt Feedback). Reason: {block_reason}. Safety: {safety_ratings_str}"
|
535 |
logger.error(f"[Gemini Primary] {error_msg}")
|
536 |
return None, error_msg
|
537 |
|
538 |
+
# 2. Check candidate's finish reason and safety ratings
|
539 |
+
candidate = response.candidates[0]
|
540 |
+
finish_reason_val = candidate.finish_reason
|
541 |
+
finish_reason_str = str(finish_reason_val).upper() # Convert to upper string for reliable compare
|
542 |
+
logger.debug(f"[Gemini Primary] Received response. Finish reason value: {finish_reason_val} -> {finish_reason_str}")
|
543 |
+
|
544 |
+
# Check for safety issues in the *candidate* itself
|
545 |
+
candidate_safety_ratings_str = "N/A"
|
546 |
+
if hasattr(candidate, 'safety_ratings') and candidate.safety_ratings:
|
547 |
+
candidate_safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in candidate.safety_ratings])
|
548 |
+
# Check if any category was blocked, even with BLOCK_NONE (might indicate core harm)
|
549 |
+
# This logic might be too strict if BLOCK_NONE truly means nothing is blocked by threshold.
|
550 |
+
# Let's rely on finish_reason == SAFETY first.
|
551 |
+
# if any(r.probability.name in ["MEDIUM", "HIGH"] for r in candidate.safety_ratings):
|
552 |
+
# logger.warning(f"[Gemini Primary] Candidate has medium/high safety rating despite BLOCK_NONE. Ratings: {candidate_safety_ratings_str}")
|
553 |
+
|
554 |
+
|
555 |
+
# *** MODIFIED: Refined Finish Reason Check ***
|
556 |
+
# Reasons indicating success or acceptable completion:
|
557 |
+
success_reasons = ["STOP", "MAX_TOKENS"]
|
558 |
+
# Reasons indicating definite failure:
|
559 |
+
failure_reasons = ["SAFETY", "RECITATION", "OTHER"] # Treat OTHER as failure
|
560 |
+
|
561 |
+
if finish_reason_str in success_reasons:
|
562 |
+
# Proceed to extract text
|
563 |
+
logger.info(f"[Gemini Primary] Generation finished successfully or reached token limit. Reason: {finish_reason_str}")
|
564 |
+
if finish_reason_str == "MAX_TOKENS":
|
565 |
+
logger.warning("[Gemini Primary] Output may be truncated due to max_tokens limit.")
|
566 |
+
# Continue below to extract text...
|
567 |
+
|
568 |
+
elif finish_reason_str in failure_reasons:
|
569 |
+
error_msg = f"Error: Gemini generation failed. Reason: {finish_reason_str}. Safety: {candidate_safety_ratings_str}"
|
570 |
+
logger.error(f"[Gemini Primary] {error_msg}")
|
571 |
+
return None, error_msg # Return specific error
|
572 |
|
573 |
+
else:
|
574 |
+
# Unknown or unexpected finish reason
|
575 |
+
error_msg = f"Error: Gemini generation finished with unexpected reason: {finish_reason_str}. Safety: {candidate_safety_ratings_str}"
|
576 |
+
logger.error(f"[Gemini Primary] {error_msg}")
|
577 |
+
return None, error_msg # Return error
|
|
|
|
|
|
|
|
|
|
|
|
|
578 |
|
|
|
|
|
|
|
|
|
|
|
579 |
|
580 |
+
# 3. Extract text if finish reason was acceptable
|
581 |
summary_text = ""
|
582 |
try:
|
583 |
+
summary_text = response.text # Preferred way
|
|
|
584 |
except ValueError as e:
|
|
|
585 |
logger.warning(f"[Gemini Primary] Error accessing response.text: {e}. Trying parts manually.")
|
586 |
+
if candidate.content and candidate.content.parts:
|
587 |
+
summary_text = "".join(part.text for part in candidate.content.parts if hasattr(part, "text"))
|
|
|
588 |
|
589 |
if not summary_text or not summary_text.strip():
|
590 |
+
# If text is empty even after successful finish reason, log warning and return error
|
591 |
+
logger.warning(f"[Gemini Primary] Gemini returned empty summary despite finish reason '{finish_reason_str}'.")
|
592 |
return None, "Error: AI generated an empty summary."
|
593 |
|
594 |
+
logger.info(f"[Gemini Primary] Summary extracted successfully (len: {len(summary_text)}).")
|
595 |
return summary_text.strip(), None
|
596 |
|
597 |
except AttributeError as e:
|
|
|
598 |
logger.error(f"[Gemini Primary] Attribute error accessing Gemini response: {e}. Response structure might have changed.", exc_info=True)
|
599 |
return None, f"Error: Failed to parse Gemini response. Details: {e}"
|
600 |
except Exception as e:
|
601 |
logger.error(f"[Gemini Primary] Error during API call to {GEMINI_MODEL}: {e}", exc_info=True)
|
|
|
|
|
|
|
602 |
return None, f"Error: Failed to communicate with the primary AI service (Gemini). Details: {e}"
|
603 |
|
604 |
|
605 |
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
606 |
""" Calls the OpenRouter API to generate a summary. """
|
607 |
+
# ... (Keep existing implementation) ...
|
608 |
global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
|
609 |
if not _openrouter_fallback_enabled:
|
610 |
logger.error("[OpenRouter Fallback] Called but is disabled.");
|
611 |
return None, "Error: Fallback AI service (OpenRouter) not configured/available."
|
612 |
|
613 |
+
max_input_len_openrouter = 100000
|
614 |
if len(text) > max_input_len_openrouter:
|
615 |
logger.warning(f"[OpenRouter Fallback] Input text length ({len(text)}) exceeds approx limit ({max_input_len_openrouter}) for {OPENROUTER_MODEL}. Truncating.")
|
616 |
text = text[:max_input_len_openrouter]
|
617 |
|
618 |
logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
|
619 |
|
620 |
+
if summary_type == "paragraph": prompt_content = f"Please summarise... {text} ..." # Keep prompts
|
621 |
+
elif summary_type == "points": prompt_content = f"Please summarise... {text} ..."
|
622 |
+
else: logger.error(f"[OpenRouter Fallback] Invalid summary_type: {summary_type}"); return None, f"Error: Invalid summary type '{summary_type}' specified."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
623 |
|
624 |
+
headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json", "HTTP-Referer": "...", "X-Title": "..." }
|
625 |
+
payload = { "model": OPENROUTER_MODEL, "messages": [{"role": "system", "content": "..."}, {"role": "user", "content": prompt_content}], "max_tokens": 2048, "temperature": 0.7 }
|
626 |
api_url = "https://openrouter.ai/api/v1/chat/completions"
|
627 |
|
628 |
try:
|
629 |
+
async with httpx.AsyncClient(timeout=120.0) as client:
|
630 |
logger.debug(f"[OpenRouter Fallback] Sending request to {api_url} for model {OPENROUTER_MODEL}")
|
631 |
response = await client.post(api_url, headers=headers, json=payload)
|
632 |
logger.debug(f"[OpenRouter Fallback] Received status code {response.status_code}")
|
|
|
637 |
if data.get("choices") and len(data["choices"]) > 0:
|
638 |
choice = data["choices"][0]
|
639 |
message = choice.get("message")
|
640 |
+
finish_reason = choice.get("finish_reason", "N/A")
|
|
|
641 |
if message and message.get("content"):
|
642 |
summary_text = message["content"].strip()
|
643 |
if summary_text:
|
644 |
logger.info(f"[OpenRouter Fallback] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason}")
|
645 |
+
if finish_reason == 'length': logger.warning("[OpenRouter Fallback] Summary may be truncated due to max_tokens limit.")
|
|
|
|
|
646 |
return summary_text, None
|
647 |
+
else: logger.warning("[OpenRouter Fallback] OpenRouter returned an empty summary content."); return None, "Error: Fallback AI generated an empty summary."
|
648 |
+
else: logger.error(f"[OpenRouter Fallback] Invalid response structure (missing message/content). Data: {data}"); return None, "Error: Fallback AI returned an invalid response format."
|
|
|
|
|
|
|
|
|
649 |
else:
|
650 |
logger.error(f"[OpenRouter Fallback] Invalid response structure (missing choices). Data: {data}")
|
651 |
+
api_error = data.get("error", {}).get("message", "Unknown API error"); return None, f"Error: Fallback AI response missing summary. API msg: {api_error}"
|
652 |
+
except json.JSONDecodeError: logger.error(f"[OpenRouter Fallback] Failed to decode JSON response. Status: {response.status_code}, Text: {response.text[:500]}"); return None, "Error: Fallback AI sent an invalid JSON response."
|
653 |
+
except Exception as e: logger.error(f"[OpenRouter Fallback] Error processing success response: {e}", exc_info=True); return None, f"Error: Failed to process Fallback AI response. Details: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
654 |
else:
|
|
|
655 |
error_message = f"Error: Fallback AI service ({OPENROUTER_MODEL}) returned status {response.status_code}."
|
656 |
+
try: error_details = response.json().get("error", {}).get("message", response.text[:200]); error_message += f" Details: {error_details}"
|
657 |
+
except Exception: error_message += f" Response: {response.text[:200]}"
|
658 |
+
logger.error(f"[OpenRouter Fallback] {error_message}"); return None, error_message
|
659 |
+
except httpx.TimeoutException: logger.error(f"[OpenRouter Fallback] Timeout connecting to OpenRouter API for {OPENROUTER_MODEL}"); return None, "Error: Timed out connecting to the fallback AI service."
|
660 |
+
except httpx.RequestError as e: logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}"); return None, f"Error: Network error connecting to the fallback AI service. Details: {e}"
|
661 |
+
except Exception as e: logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter API call: {e}", exc_info=True); return None, f"Error: Unexpected issue with the fallback AI service. Details: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
662 |
|
663 |
|
664 |
async def generate_summary(text: str, summary_type: str) -> str:
    """ Generates a summary using the primary AI (Gemini) and falling back to OpenRouter. """
    # Strategy: try the primary service first; remember why it failed so the
    # fallback path can report both outcomes in a single error string.
    global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
    logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")

    error_message: Optional[str] = None

    # --- Primary attempt: Gemini ---
    if not _gemini_primary_enabled:
        logger.warning("[Summary Generation] Primary AI (Gemini) disabled. Proceeding to fallback.")
        error_message = "Primary AI (Gemini) unavailable."
    else:
        logger.info(f"[Summary Generation] Attempting primary AI: Gemini ({GEMINI_MODEL})")
        gemini_summary, gemini_error = await _call_gemini(text, summary_type)
        if gemini_summary:
            logger.info(f"[Summary Generation] Success with primary AI (Gemini).")
            return gemini_summary
        logger.warning(f"[Summary Generation] Primary AI (Gemini) failed. Error: {gemini_error}. Proceeding to fallback.")
        error_message = f"Primary AI ({GEMINI_MODEL}) failed: {gemini_error}"

    # --- Fallback attempt: OpenRouter ---
    if not _openrouter_fallback_enabled:
        # Neither service can run: return the combined failure description.
        logger.error("[Summary Generation] Fallback AI (OpenRouter) is disabled. Cannot proceed.")
        if error_message:
            return f"{error_message}\nFallback AI is also unavailable."
        return "Error: Both primary and fallback AI services are unavailable."

    logger.info(f"[Summary Generation] Attempting fallback AI: OpenRouter ({OPENROUTER_MODEL})")
    openrouter_summary, openrouter_error = await _call_openrouter(text, summary_type)
    if openrouter_summary:
        logger.info(f"[Summary Generation] Success with fallback AI (OpenRouter).")
        return openrouter_summary

    # Both services attempted; both failed.
    logger.error(f"[Summary Generation] Fallback AI (OpenRouter) also failed. Error: {openrouter_error}")
    if error_message:
        return f"{error_message}\nFallback AI ({OPENROUTER_MODEL}) also failed: {openrouter_error}"
    return f"Fallback AI ({OPENROUTER_MODEL}) failed: {openrouter_error}"
|
|
|
710 |
bot_token: str
|
711 |
) -> None:
|
712 |
"""Handles fetching content, generating summary, and sending results."""
|
713 |
+
# ... (Keep existing implementation) ...
|
714 |
task_id = f"{user_id}-{message_id_to_edit or 'new'}"
|
715 |
logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
|
716 |
background_request: Optional[BaseRequest] = None
|
|
|
719 |
user_feedback_message: Optional[str] = None
|
720 |
success = False
|
721 |
status_message_id = message_id_to_edit # Keep track of the original button message ID
|
|
|
722 |
|
723 |
try:
|
|
|
724 |
background_request = HTTPXRequest(connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0)
|
725 |
bot = Bot(token=bot_token, request=background_request)
|
726 |
except Exception as e:
|
727 |
+
logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True); return
|
|
|
|
|
728 |
|
729 |
try:
|
730 |
+
# Edit original button message to "Processing..."
|
|
|
731 |
processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n{html.escape(url)}\n\nThis might take a moment..."
|
732 |
+
if status_message_id:
|
|
|
733 |
try:
|
734 |
+
await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.HTML, reply_markup=None, link_preview_options={'is_disabled': True} )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
735 |
logger.debug(f"[Task {task_id}] Edited original button message {status_message_id} to 'Processing'")
|
|
|
736 |
except Exception as e:
|
737 |
+
logger.warning(f"[Task {task_id}] Could not edit original button message {status_message_id}: {e}.")
|
|
|
|
|
738 |
|
|
|
739 |
try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
740 |
+
except Exception: pass
|
741 |
|
742 |
# --- Determine URL Type and Fetch Content ---
|
743 |
is_youtube = is_youtube_url(url)
|
|
|
745 |
|
746 |
if is_youtube:
|
747 |
video_id = extract_youtube_id(url)
|
748 |
+
if video_id: content = await get_youtube_transcript(video_id, url)
|
749 |
+
else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
|
750 |
+
if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video..."
|
|
|
|
|
|
|
|
|
751 |
else:
|
752 |
+
# Website URL - Try methods in order
|
753 |
logger.info(f"[Task {task_id}] URL is website. Attempting Crawl4AI (Primary)...")
|
754 |
content = await get_website_content_via_crawl4ai(url)
|
|
|
755 |
if not content:
|
756 |
+
logger.warning(f"[Task {task_id}] Crawl4AI failed. Attempting BeautifulSoup (Fallback 1)...")
|
|
|
757 |
try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
758 |
except Exception: pass
|
759 |
content = await get_website_content_bs4(url)
|
|
|
760 |
if not content:
|
761 |
+
logger.warning(f"[Task {task_id}] BeautifulSoup failed. Attempting API (Fallback 2)...")
|
762 |
global URLTOTEXT_API_KEY, _urltotext_fallback_enabled
|
763 |
if _urltotext_fallback_enabled:
|
764 |
try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
765 |
except Exception: pass
|
766 |
content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
|
767 |
+
if not content: logger.error(f"[Task {task_id}] API fallback (urltotext) also failed."); user_feedback_message = "Sorry, couldn't fetch content (Crawl4AI/BS4 failed, API failed/credits?)."
|
768 |
+
else: logger.warning(f"[Task {task_id}] API fallback disabled."); user_feedback_message = "Sorry, couldn't fetch content (Crawl4AI/BS4 failed, API disabled)."
|
769 |
+
if not content and not user_feedback_message: user_feedback_message = "Sorry, couldn't fetch content using any method."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
770 |
|
771 |
|
772 |
# --- Generate Summary if Content was Fetched ---
|
|
|
774 |
logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary type: '{summary_type}'.")
|
775 |
try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
776 |
except Exception: pass
|
777 |
+
final_summary = await generate_summary(content, summary_type)
|
|
|
778 |
|
779 |
if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
|
780 |
+
user_feedback_message = final_summary
|
781 |
logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
|
782 |
+
success = False
|
783 |
else:
|
784 |
+
# Success - Split and Send
|
785 |
summary_parts = []
|
786 |
+
current_part = ""; lines = final_summary.splitlines(keepends=True)
|
|
|
|
|
787 |
for line in lines:
|
788 |
if len(current_part) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
|
789 |
+
if current_part.strip(): summary_parts.append(current_part.strip())
|
790 |
+
current_part = line[:MAX_SUMMARY_CHUNK_SIZE] if len(line) > MAX_SUMMARY_CHUNK_SIZE else line
|
791 |
+
else: current_part += line
|
|
|
|
|
|
|
|
|
|
|
792 |
if current_part.strip(): summary_parts.append(current_part.strip())
|
793 |
+
if not summary_parts: summary_parts.append("Summary generated, but empty."); logger.warning(...)
|
|
|
|
|
794 |
|
795 |
logger.info(f"[Task {task_id}] Summary generated (orig len: {len(final_summary)}). Sending in {len(summary_parts)} part(s).")
|
796 |
|
797 |
+
# Edit original message with first part
|
798 |
edited_final_result = False
|
799 |
+
if status_message_id:
|
800 |
try:
|
801 |
+
await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
|
802 |
+
logger.debug(f"[Task {task_id}] Edited original message {status_message_id} with first summary part.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
803 |
edited_final_result = True
|
804 |
+
except Exception as edit_err: logger.warning(f"[Task {task_id}] Failed edit original message {status_message_id} with summary part 1: {edit_err}. Sending new.")
|
|
|
|
|
|
|
|
|
805 |
if not edited_final_result:
|
806 |
+
sent_msg = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
|
807 |
+
if not sent_msg: user_feedback_message = "Sorry, failed to send summary."; success = False; logger.error(...)
|
808 |
+
|
809 |
+
# Send remaining parts if needed and success still true
|
810 |
+
if success and len(summary_parts) > 1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
811 |
for i, part in enumerate(summary_parts[1:], start=2):
|
812 |
+
await asyncio.sleep(0.5)
|
813 |
try:
|
814 |
+
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} )
|
|
|
|
|
|
|
|
|
|
|
|
|
815 |
logger.debug(f"[Task {task_id}] Sent summary part {i}/{len(summary_parts)}.")
|
816 |
+
except Exception as part_err: user_feedback_message = f"Sorry, failed to send part {i}."; success = False; logger.error(...); break
|
817 |
+
if not user_feedback_message: success = True # Confirm success if no errors occurred
|
818 |
+
|
819 |
+
# --- Handle Failure Cases ---
|
820 |
+
if not success:
|
821 |
+
if not user_feedback_message: user_feedback_message = "Sorry, unknown error."; logger.error(...)
|
822 |
+
logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
823 |
try:
|
824 |
+
# Edit original message with error
|
825 |
edited_final_error = False
|
826 |
if status_message_id:
|
827 |
try:
|
828 |
+
await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=user_feedback_message, link_preview_options={'is_disabled': True}, reply_markup=None )
|
829 |
+
logger.debug(f"[Task {task_id}] Edited original message {status_message_id} with failure feedback.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
830 |
edited_final_error = True
|
831 |
+
except Exception as edit_err: logger.warning(f"[Task {task_id}] Failed edit original message {status_message_id} with error: {edit_err}. Sending new.")
|
|
|
|
|
|
|
832 |
if not edited_final_error:
|
833 |
+
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )
|
|
|
|
|
|
|
|
|
|
|
834 |
logger.debug(f"[Task {task_id}] Sent failure feedback as new message.")
|
835 |
+
except Exception as send_err: logger.error(f"[Task {task_id}] Failed even to send failure feedback: {send_err}")
|
|
|
836 |
|
837 |
except Exception as e:
|
838 |
+
# Catch-all for unexpected errors in task
|
839 |
logger.error(f"[Task {task_id}] Unexpected error during processing task: {e}", exc_info=True)
|
840 |
+
success = False; user_feedback_message = "Oops! Something went wrong..."
|
841 |
+
if bot:
|
842 |
+
try: # Try sending final crash error
|
843 |
+
edited_crash = False
|
|
|
|
|
844 |
if status_message_id:
|
845 |
+
try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=user_feedback_message, reply_markup=None, link_preview_options={'is_disabled': True} ); edited_crash = True
|
846 |
+
except Exception: pass
|
847 |
+
if not edited_crash: await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message )
|
848 |
+
except Exception as final_err: logger.error(f"[Task {task_id}] Failed to send final crash error feedback: {final_err}")
|
|
|
|
|
|
|
|
|
849 |
|
850 |
finally:
|
851 |
+
# Close background bot client
|
|
|
|
|
|
|
|
|
|
|
852 |
if background_request and hasattr(background_request, '_client') and background_request._client:
|
853 |
+
try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot client closed.")
|
854 |
+
except Exception as close_err: logger.warning(f"[Task {task_id}] Error closing background bot client: {close_err}")
|
|
|
|
|
|
|
|
|
855 |
logger.info(f"[Task {task_id}] Task completed. Success: {success}")
|
856 |
|
857 |
|
858 |
# --- Telegram Handlers ---
|
859 |
# (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler)
|
860 |
+
# No changes needed in these handlers.
|
|
|
861 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle the /start command: greet the user with a short intro message."""
    # Guard BEFORE dereferencing: the original called user.mention_html() on the
    # line above the `if not user` check, raising AttributeError whenever an
    # update arrives without an effective_user (e.g. channel posts).
    user = update.effective_user
    if not user or not update.message:
        return
    mention = user.mention_html()
    logger.info(f"User {user.id} ({user.username or 'N/A'}) used /start.")
    await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" )
|
867 |
|
868 |
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
869 |
+
# ... (Keep existing implementation) ...
|
870 |
user = update.effective_user
|
871 |
if not user or not update.message: return
|
872 |
logger.info(f"User {user.id} ({user.username or 'N/A'}) used /help.")
|
|
|
874 |
"1. Send me any YouTube video link or website URL.\n"
|
875 |
"2. I'll ask how you want it summarised (paragraph or points).\n"
|
876 |
"3. Click the button for your choice.\n"
|
877 |
+
"4. Wait while I process it!\n\n"
|
878 |
"⚙️ **Behind the scenes:**\n"
|
879 |
+
f"• **Websites:** I try `Crawl4AI` (smart crawl, ignores robots.txt), then `BeautifulSoup` (basic scrape), and `urltotext.com` API (if configured & credits available).\n" # Mention ignore_robots
|
880 |
"• **YouTube:** I use `youtube-transcript-api` first, then fall back to `Supadata` and `Apify` APIs if needed.\n"
|
881 |
f"• **Summaries:** Generated using Google `{GEMINI_MODEL}` (primary) or `{OPENROUTER_MODEL}` (fallback, if configured).\n\n"
|
882 |
"**Commands:**\n"
|
|
|
885 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
886 |
|
887 |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Validate an incoming message as a URL and offer summary-type buttons."""
    message = update.message
    if not message or not message.text:
        return
    user = update.effective_user
    if not user:
        return
    url = message.text.strip()

    # Cheap sanity check — not a full URL parser, just "looks like http(s)://...".
    if not re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE).match(url):
        logger.debug(f"Ignoring non-URL from {user.id}: {url}")
        await message.reply_text("Hmm, that doesn't look like a valid web URL...", parse_mode=ParseMode.MARKDOWN)
        return

    logger.info(f"User {user.id} ({user.username or 'N/A'}) sent potential URL: {url}")
    # Stash the URL so the button-callback handler can retrieve it later.
    context.user_data['url_to_summarize'] = url
    context.user_data['original_message_id'] = message.message_id

    buttons = InlineKeyboardMarkup([[
        InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"),
        InlineKeyboardButton("Points Summary", callback_data="points"),
    ]])
    escaped_url = html.escape(url)
    await message.reply_html(
        f"Okay, I see this link:\n<code>{escaped_url}</code>\n\nHow would you like it summarised?",
        reply_markup=buttons,
        disable_web_page_preview=True,
    )
|
903 |
|
904 |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle the summary-type button press.

    Acknowledges the callback, recovers the URL stashed by handle_potential_url,
    verifies bot/AI configuration, then schedules process_summary_task in the
    background and clears the per-user context.
    """
    query = update.callback_query
    if not query or not query.message or not query.from_user:
        logger.warning("Callback query missing data.")
        return
    user = query.from_user
    summary_type = query.data
    query_id = query.id
    try:
        await query.answer()
        logger.debug(f"Ack callback {query_id} from {user.id}")
    except Exception as e:
        logger.warning(f"Error answering callback {query_id}: {e}")

    url = context.user_data.get('url_to_summarize')
    message_id_to_edit = query.message.message_id
    logger.info(f"User {user.id} chose '{summary_type}' for msg {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}")

    if not url:
        # Context is per-user/per-chat and may have been cleared or expired.
        logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Expired?")
        try:
            await query.edit_message_text(text="Sorry, context lost. Please send URL again.", reply_markup=None)
        except Exception as e:
            logger.error(f"Failed edit 'URL not found' msg: {e}")
        return

    global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
    # FIX: the original collapsed these checks onto single lines of the form
    # "if ...: logger...; try: await ...; except Exception: pass; return",
    # which is a SyntaxError — try/except cannot follow ';' in a simple-statement
    # list. Expanded to proper multi-line statements with identical behavior.
    if not TELEGRAM_TOKEN:
        logger.critical("TG TOKEN missing!")
        try:
            await query.edit_message_text("❌ Bot config error.", reply_markup=None)
        except Exception:
            pass
        return
    if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
        logger.critical("No AI models available!")
        try:
            await query.edit_message_text("❌ AI config error.", reply_markup=None)
        except Exception:
            pass
        return
    elif not _gemini_primary_enabled:
        logger.warning("Primary AI unavailable, using fallback.")
    elif not _openrouter_fallback_enabled:
        logger.warning("Fallback AI unavailable, using primary.")

    logger.info(f"Scheduling task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit} (URL: {url})")
    # Fire-and-forget: the task edits/sends its own status messages via a
    # dedicated bot client, so the handler can return immediately.
    asyncio.ensure_future( process_summary_task( user_id=user.id, chat_id=query.message.chat_id, message_id_to_edit=message_id_to_edit, url=url, summary_type=summary_type, bot_token=TELEGRAM_TOKEN ) )
    # Clear context now that the task owns the URL.
    context.user_data.pop('url_to_summarize', None)
    context.user_data.pop('original_message_id', None)
    logger.debug(f"Cleared URL context for user {user.id} after scheduling.")
|
|
|
|
|
933 |
|
934 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
|
935 |
# ... (Keep existing implementation) ...
|
|
|
950 |
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
|
951 |
custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 )
|
952 |
application = ( Application.builder() .token(TELEGRAM_TOKEN) .request(custom_request) .build() )
|
|
|
953 |
application.add_handler(CommandHandler("start", start))
|
954 |
application.add_handler(CommandHandler("help", help_command))
|
955 |
url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
|
|
|
959 |
logger.info("Telegram application handlers configured."); return application
|
960 |
|
961 |
# --- ASGI Lifespan & Routes ---
|
962 |
+
# (lifespan, health_check, telegram_webhook) - No changes needed here
|
963 |
@contextlib.asynccontextmanager
|
964 |
async def lifespan(app: Starlette):
|
965 |
+
# ... (Keep existing implementation) ...
|
966 |
global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN
|
967 |
logger.info("ASGI Lifespan: Startup initiated...");
|
968 |
if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.")
|
969 |
try:
|
970 |
ptb_app = await setup_bot_config()
|
971 |
+
await ptb_app.initialize()
|
972 |
bot_info = await ptb_app.bot.get_me()
|
973 |
logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})")
|
974 |
|
975 |
+
# Webhook setup
|
976 |
current_webhook_info = await ptb_app.bot.get_webhook_info()
|
977 |
+
webhook_delete_success = True
|
978 |
if current_webhook_info and current_webhook_info.url:
|
979 |
logger.info(f"Found existing webhook: {current_webhook_info.url}. Deleting...")
|
980 |
try:
|
|
|
982 |
else: logger.warning("Failed delete webhook (API returned False)."); webhook_delete_success = False
|
983 |
except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1); webhook_delete_success = False
|
984 |
|
|
|
985 |
space_host = os.environ.get("SPACE_HOST")
|
986 |
+
webhook_path = "/webhook"
|
987 |
full_webhook_url = None
|
988 |
if space_host:
|
989 |
+
full_webhook_url = f"https://{space_host.split('://')[-1].rstrip('/')}{webhook_path}"
|
|
|
|
|
990 |
logger.info(f"Using SPACE_HOST to determine webhook URL: {full_webhook_url}")
|
991 |
+
else: raise RuntimeError("Webhook URL undetermined (SPACE_HOST missing).")
|
|
|
|
|
992 |
|
|
|
993 |
if full_webhook_url and webhook_delete_success:
|
994 |
logger.info(f"Attempting to set webhook: {full_webhook_url}")
|
995 |
+
set_webhook_args = { "url": full_webhook_url, "allowed_updates": Update.ALL_TYPES, "drop_pending_updates": True }
|
996 |
+
if WEBHOOK_SECRET: set_webhook_args["secret_token"] = WEBHOOK_SECRET; logger.info("Webhook secret token will be used.")
|
997 |
+
await asyncio.sleep(1.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
998 |
try:
|
999 |
webhook_set = await ptb_app.bot.set_webhook(**set_webhook_args)
|
1000 |
+
if not webhook_set: raise RuntimeError("set_webhook API call returned False.")
|
1001 |
+
await asyncio.sleep(1.5)
|
|
|
1002 |
webhook_info = await ptb_app.bot.get_webhook_info()
|
1003 |
if webhook_info and webhook_info.url == full_webhook_url:
|
1004 |
logger.info(f"Webhook set and verified successfully: URL='{webhook_info.url}', Secret={'YES' if WEBHOOK_SECRET else 'NO'}")
|
1005 |
+
if webhook_info.last_error_message: logger.warning(f"Webhook status has last error: {webhook_info.last_error_message}")
|
1006 |
+
else: raise RuntimeError(f"Webhook verification failed! Expected '{full_webhook_url}', Got info: {webhook_info}")
|
1007 |
+
await ptb_app.start()
|
|
|
|
|
|
|
|
|
1008 |
logger.info("PTB Application started (webhook mode).")
|
1009 |
+
except Exception as e: logger.error(f"FATAL: Failed to set or verify webhook: {e}", exc_info=True); raise RuntimeError(f"Failed to set/verify webhook: {e}") from e
|
1010 |
+
elif not webhook_delete_success: raise RuntimeError("Failed to delete previous webhook.")
|
|
|
|
|
|
|
|
|
|
|
1011 |
|
1012 |
+
logger.info("ASGI Lifespan: Startup complete."); yield
|
|
|
1013 |
|
1014 |
except Exception as startup_err:
|
1015 |
logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
|
|
|
1016 |
if ptb_app:
|
1017 |
try:
|
1018 |
if ptb_app.running: await ptb_app.stop()
|
|
|
1019 |
if ptb_app._initialized: await ptb_app.shutdown()
|
1020 |
+
except Exception as shutdown_err: logger.error(f"Error during shutdown after startup failure: {shutdown_err}")
|
1021 |
+
raise
|
|
|
|
|
1022 |
finally:
|
|
|
1023 |
logger.info("ASGI Lifespan: Shutdown initiated...")
|
1024 |
if ptb_app:
|
1025 |
try:
|
1026 |
+
if ptb_app.running: await ptb_app.stop(); logger.info("PTB stopped.")
|
1027 |
+
if ptb_app._initialized: await ptb_app.shutdown(); logger.info("PTB shutdown.")
|
1028 |
+
except Exception as e: logger.error(f"Error during PTB shutdown: {e}", exc_info=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1029 |
logger.info("ASGI Lifespan: Shutdown complete.")
|
1030 |
|
|
|
1031 |
async def health_check(request: Request) -> PlainTextResponse:
    """GET / endpoint: report bot state and per-service availability as plain text."""
    global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled, _crawl4ai_primary_web_enabled, _urltotext_fallback_enabled, SUPADATA_API_KEY
    bot_status = "Not Initialized"
    bot_username = "N/A"
    if ptb_app and ptb_app.bot and ptb_app._initialized:
        try:
            wh_info = await ptb_app.bot.get_webhook_info()
            if ptb_app.running and wh_info and wh_info.url:
                me = await ptb_app.bot.get_me()
                bot_username = f"@{me.username}"
                bot_status = f"Running (Webhook OK, {bot_username})"
            elif ptb_app.running:
                bot_status = f"Running (Webhook Status: {wh_info.url if wh_info else 'N/A'}, Last Error: {wh_info.last_error_message if wh_info else 'N/A'})"
            else:
                bot_status = "Initialized/Not running"
        except Exception as e:
            logger.error(f"Error checking bot status: {e}", exc_info=True)
            bot_status = f"Error checking status: {e}"
    elif ptb_app:
        bot_status = "Initializing..."

    # One line per service, enabled/disabled according to module-level flags.
    report_lines = [
        f"=== Telegram Summary Bot Status ===",
        f"Bot Application: {bot_status}",
        "--- Services ---",
        f"Primary Web Scraper: {'Crawl4AI (ignore_robots=True)' if _crawl4ai_primary_web_enabled else 'DISABLED (Lib Missing)'}",
        f"Fallback Web Scraper 1: BeautifulSoup",
        f"Fallback Web Scraper 2: {'urltotext.com API' if _urltotext_fallback_enabled else 'DISABLED (No Key)'}",
        f"Primary Summarizer: {'Gemini (' + GEMINI_MODEL + ')' if _gemini_primary_enabled else 'DISABLED (No Key/Lib)'}",
        f"Fallback Summarizer: {'OpenRouter (' + OPENROUTER_MODEL + ')' if _openrouter_fallback_enabled else 'DISABLED (No Key)'}",
        f"Primary YT Transcript: youtube-transcript-api",
        f"Fallback YT Transcript 1: {'Supadata API' if SUPADATA_API_KEY else 'DISABLED (No Key)'}",
        f"Fallback YT Transcript 2: {'Apify (' + APIFY_ACTOR_ID + ')' if _apify_token_exists else 'DISABLED (No Key)'}",
    ]
    return PlainTextResponse("\n".join(report_lines))
|
1057 |
|
|
|
1058 |
async def telegram_webhook(request: Request) -> Response:
    """POST /webhook endpoint: verify the secret header, decode the update, hand it to PTB."""
    global WEBHOOK_SECRET, ptb_app
    if not ptb_app or not ptb_app._initialized or not ptb_app.running:
        if not ptb_app:
            status = "Not Initialized"
        elif not ptb_app.running:
            status = "Not Running (Initializing)"
        else:
            status = "Not Running"
        logger.error(f"Webhook received but PTB application {status}.")
        return PlainTextResponse(f'Bot {status}', status_code=503)

    if WEBHOOK_SECRET:
        # Telegram echoes the configured secret in this header; reject mismatches.
        token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
        if token_header != WEBHOOK_SECRET:
            logger.warning(f"Webhook invalid secret token.")
            return Response(content="Invalid secret token", status_code=403)

    try:
        payload = await request.json()
        update = Update.de_json(data=payload, bot=ptb_app.bot)
        logger.debug(f"Processing update_id: {update.update_id} via webhook")
        await ptb_app.process_update(update)
        return Response(status_code=200)
    except json.JSONDecodeError:
        logger.error("Webhook invalid JSON.")
        return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
    except Exception as e:
        # Still return 200 so Telegram does not endlessly retry the same update.
        logger.error(f"Error processing webhook update: {e}", exc_info=True)
        return Response(status_code=200)
|
|
|
|
|
|
|
|
|
|
|
|
|
1078 |
|
1079 |
|
1080 |
# --- ASGI App Definition ---
# Starlette wiring: health check at "/", Telegram webhook at "/webhook";
# `lifespan` owns PTB startup/shutdown and webhook registration.
app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )
logger.info("Starlette ASGI application created with native routes.")
|
1083 |
|
1084 |
|
1085 |
# --- Development Runner ---
if __name__ == '__main__':
    # Local-only entry point; production runs under the ASGI server configured
    # by the hosting environment.
    import uvicorn
    logger.warning("Running in development mode using Uvicorn directly - FOR LOCAL TESTING ONLY")
    log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
    local_port = int(os.environ.get('PORT', 8080))
    try:
        from dotenv import load_dotenv
        load_dotenv()
        logger.info("Loaded .env file.")
    except ImportError:
        logger.info(".env not loaded.")
    # Sanity-check the critical secrets before starting the dev server.
    if not get_secret('TELEGRAM_TOKEN'):
        logger.critical("Local Dev: TELEGRAM_TOKEN missing.")
    if not get_secret('GEMINI_API_KEY'):
        logger.error("Local Dev: GEMINI_API_KEY missing.")
    uvicorn.run( "main:app", host='0.0.0.0', port=local_port, log_level=log_level, reload=True )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|