Update main.py
main.py
CHANGED
@@ -53,6 +53,7 @@ except ImportError:
53 | # --- Google Gemini ---
54 | try:
55 | import google.generativeai as genai
56 | from google.generativeai.types import HarmCategory, HarmBlockThreshold, GenerateContentResponse
57 | _gemini_available = True
58 | except ImportError:

@@ -112,8 +113,8 @@ GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # Primary Summarizer
112 | # Models (User can still configure via env vars)
113 | OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model
114 | APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
115 | -
116 | -
117 |
118 | # --- Configuration Checks ---
119 | if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
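
Note: the `get_secret` helper used above (e.g. for GEMINI_API_KEY) is defined elsewhere in main.py and is not part of this diff. A minimal sketch of what such a helper typically looks like on Hugging Face Spaces, where secrets arrive as environment variables (the logging and exact behaviour here are assumptions):

    import os
    import logging
    from typing import Optional

    logger = logging.getLogger(__name__)

    def get_secret(name: str) -> Optional[str]:
        # On HF Spaces, repository secrets are injected as environment variables.
        value = os.environ.get(name)
        if not value:
            logger.warning(f"Secret '{name}' not found in environment.")
        return value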
@@ -159,7 +160,10 @@ if _gemini_primary_enabled:
159 |
160 | # --- Constants ---
161 | MAX_SUMMARY_CHUNK_SIZE = 4000 # Max characters per Telegram message (allow buffer)
162 | -
163 |
164 | # --- Retry Decorator ---
165 | # (Remains the same)

@@ -181,11 +185,16 @@ async def retry_bot_operation(func, *args, **kwargs):
181 | if any(err in str(e).lower() for err in ignore_errors):
182 | logger.warning(f"Ignoring non-critical BadRequest: {e}")
183 | return None
184 | logger.error(f"Potentially critical BadRequest: {e}")
185 | raise
186 | except TelegramError as e:
187 | -
188 | -
189 | except Exception as e:
190 | logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
191 | raise
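
Note: the retry decorator itself "remains the same" and is not shown in this diff; the new error handling (see the updated hunk later in this page) re-raises transient errors "to allow tenacity to handle retry". A minimal sketch of how such a wrapper could be expressed with tenacity, assuming the transient exception types named in the new code (this is not the project's actual implementation):

    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
    from telegram.error import TimedOut, NetworkError, RetryAfter

    # Hypothetical wrapper: retry transient Telegram errors with exponential backoff.
    @retry(
        retry=retry_if_exception_type((TimedOut, NetworkError, RetryAfter)),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        stop=stop_after_attempt(4),
        reraise=True,
    )
    async def _call_with_retry(func, *args, **kwargs):
        return await func(*args, **kwargs)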
@@ -205,7 +214,7 @@ def extract_youtube_id(url):
205 | # --- Content Fetching Functions ---
206 |
207 | # - YouTube Transcript Fetching (get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript) -
208 | - # (
209 | async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
210 | # ... (Keep existing implementation) ...
211 | if not video_id: logger.error("[Supadata] No video_id provided"); return None

@@ -295,20 +304,30 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
295 | transcript_text = None
296 | logger.info("[Primary YT] Attempting youtube-transcript-api...")
297 | try:
298 | -
299 | if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
300 | - if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text
301 | else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None
302 | except Exception as e:
303 | logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
304 | - if isinstance(e, (NoTranscriptFound, TranscriptsDisabled)): logger.warning(f"[Primary YT] Known issue: {type(e).__name__}")
305 | transcript_text = None
306 |
307 | if transcript_text is None:
308 | logger.info("[Fallback YT 1] Trying Supadata API...")
309 | if SUPADATA_API_KEY:
310 | transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
311 | - if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
312 | else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
313 | else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
314 |
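
Note on the new primary-YT code (see the updated hunk later on this page): it wraps the youtube-transcript-api call in asyncio.to_thread. Because arguments are evaluated before the thread starts, the expression `YouTubeTranscriptApi.list_transcripts(video_id).find_generated_transcript([...])` still runs on the event loop; only the bound `.fetch` is executed in the worker thread. If `list_transcripts()` also performs network I/O, a small helper keeps all of the blocking work off the loop — a sketch (helper name is illustrative):

    def _fetch_transcript_blocking(video_id: str) -> list:
        # All of this is blocking network I/O, so it should run inside the worker thread.
        transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
        return transcripts.find_generated_transcript(['en', 'en-GB', 'en-US']).fetch()

    # transcript_list = await asyncio.to_thread(_fetch_transcript_blocking, video_id)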
@@ -316,7 +335,7 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
316 | logger.info("[Fallback YT 2] Trying Apify REST API (SyncItems)...")
317 | if APIFY_API_TOKEN:
318 | transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
319 | - if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify SyncItems REST for {video_url}"); return transcript_text
320 | else: logger.warning(f"[Fallback YT 2] Apify SyncItems REST failed or no content for {video_url}.")
321 | else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
322 |

@@ -340,35 +359,21 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
340 |
341 | logger.info(f"[Crawl4AI Primary] Attempting to crawl URL: {url}")
342 | # Define a writable cache directory (use /tmp in container environments)
343 | - #
344 | - cache_dir_path = "/tmp/.crawl4ai"
345 | - try:
346 | -
347 | -
348 | - except OSError as e:
349 | -
350 | -
351 | -
352 | - logger.error(f"[Crawl4AI Primary] Unexpected error creating cache directory {cache_dir_path}: {e}")
353 | -
354 |
355 | try:
356 | - # Use AsyncWebCrawler context manager
357 | - #
358 | -
359 | -
360 | - # The PermissionError happens in RobotsParser -> get_home_folder -> os.makedirs.
361 | - # WORKAROUND: We might need to adjust the environment or hope setting HOME=/app in Dockerfile is enough
362 | - # *if* the library correctly uses HOME. Let's test *without* explicit cache_dir first,
363 | - # relying on HOME=/app and the prior os.makedirs call. If it still fails, we need a different approach.
364 | -
365 | - # UPDATE: The traceback shows it uses utils.get_home_folder(). Let's stick with HOME=/app for now
366 | - # and see if the permission error was transient or specific to the '.models' subdir.
367 | - # If it persists, we might need to fork/modify crawl4ai or find another way to configure its paths.
368 | -
369 | - # Let's *try* passing cache_dir anyway, maybe it's an undocumented/newer feature
370 | - async with AsyncWebCrawler(cache_dir=cache_dir_path) as crawler: # TRY passing cache_dir
371 | - logger.info(f"[Crawl4AI Primary] Initialized with explicit cache_dir: {cache_dir_path}")
372 | # Use arun for a single URL crawl
373 | result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90) # 90 sec timeout
374 |
@@ -379,14 +384,20 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
379 | return content
380 | else:
381 | logger.warning(f"[Crawl4AI Primary] Crawl successful for {url}, but extracted markdown content is empty.")
382 | -
383 | -
384 | content = result.text.strip()
385 | if content:
386 | - logger.info(f"[Crawl4AI Primary] Success crawling {url} (using .text). Length: {len(content)}")
387 | return content
388 | else:
389 | - logger.warning(f"[Crawl4AI Primary] Crawl successful for {url}, but extracted text content is empty.")
390 | return None
391 | else:
392 | logger.warning(f"[Crawl4AI Primary] Crawl failed or returned no result/content for {url}.")

@@ -395,14 +406,12 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
395 | logger.error(f"[Crawl4AI Primary] Timeout occurred while crawling {url}")
396 | return None
397 | except PermissionError as e: # Catch the specific error
398 | -
399 | return None # Fail gracefully for this method
400 | except Exception as e:
401 | -
402 | -
403 | - logger.error(f"[Crawl4AI Primary] AsyncWebCrawler does not accept 'cache_dir'. Remove this argument. Error: {e}")
404 | - else:
405 | - logger.error(f"[Crawl4AI Primary] Unexpected error during crawl for {url}: {e}", exc_info=True)
406 | return None
407 |
408 |
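
Note: the removed comments above discuss a PermissionError raised from RobotsParser -> get_home_folder -> os.makedirs, and the replacement code (later on this page) relies on HOME being set to a writable path in the Dockerfile. One alternative approach, sketched here as an assumption rather than the project's actual fix, is to point HOME at a writable directory in Python before crawl4ai is imported:

    import os

    # Assumption: crawl4ai derives its cache/model paths from the HOME directory.
    # Pointing HOME at a writable path *before* importing the library avoids the
    # PermissionError seen when it tries to create folders under a read-only home.
    os.environ.setdefault("HOME", "/tmp")
    os.makedirs("/tmp/.crawl4ai", exist_ok=True)

    from crawl4ai import AsyncWebCrawler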
@@ -445,11 +454,16 @@ async def get_website_content_bs4(url: str) -> Optional[str]:
445 | for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
446 | element.extract()
447 | # Try to find main content areas more broadly
448 | - selectors = ['main', 'article', '[role="main"]', '#content', '.content', '#main-content', '.main-content', '#body', '.body', '#article-body', '.article-body']
449 | target_element = None
450 | for selector in selectors:
451 | -
452 | -
453 |
454 | if not target_element: target_element = soup.body # Fallback to body
455 | if not target_element: logger.warning(f"[BS4 Fallback] Could not find body/main for parsing {url}"); return None

@@ -557,7 +571,7 @@ Key Bullet Points Summary:"""
557 | generation_config = genai.types.GenerationConfig(
558 | # candidate_count=1, # Default is 1
559 | # stop_sequences=["\n"],
560 | - max_output_tokens=2048, # Increased max tokens
561 | temperature=0.7, # Adjust creativity vs factualness
562 | # top_p=1.0, # Default
563 | # top_k=None # Default
@@ -569,44 +583,68 @@ Key Bullet Points Summary:"""
569 | response: GenerateContentResponse = await model.generate_content_async( # Use async version
570 | prompt,
571 | generation_config=generation_config,
572 | - safety_settings=safety_settings
573 | )
574 | - logger.debug(f"[Gemini Primary] Received response. Finish reason: {response.candidates[0].finish_reason if response.candidates else 'N/A'}")
575 |
576 | # Check for safety blocks or other issues in response
577 | if not response.candidates:
578 | - block_reason =
579 | error_msg = f"Error: Gemini response blocked or empty. Reason: {block_reason}"
580 | logger.error(f"[Gemini Primary] {error_msg}")
581 | return None, error_msg
582 |
583 | - #
584 | - finish_reason
585 | -
586 | # Log safety ratings if available
587 | safety_ratings_str = "N/A"
588 | - if hasattr(response.candidates[0], 'safety_ratings'):
589 | safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in response.candidates[0].safety_ratings])
590 | -
591 | logger.error(f"[Gemini Primary] {error_msg}")
592 | - # Return partial text if available and finish reason is MAX_TOKENS? Maybe not, could be truncated badly.
593 | - # If SAFETY, definitely return error.
594 | - if finish_reason == genai.types.FinishReason.SAFETY:
595 | - return None, error_msg # Return specific error for safety blocks
596 | - # For other reasons, maybe return partial, but safer to return error for now
597 | - # return response.text if hasattr(response, 'text') else None, error_msg # Optional: return partial text for RECITATION/OTHER
598 | - return None, f"Error: Gemini generation finished unexpectedly ({finish_reason.name})."
599 |
600 |
601 | - # Extract text
602 | - summary_text = response.text
603 | if not summary_text or not summary_text.strip():
604 | logger.warning("[Gemini Primary] Gemini returned an empty summary.")
605 | return None, "Error: AI generated an empty summary."
606 |
607 | - logger.info(f"[Gemini Primary] Summary generated successfully (len: {len(summary_text)}).")
608 | return summary_text.strip(), None
609 |
610 | except Exception as e:
611 | logger.error(f"[Gemini Primary] Error during API call to {GEMINI_MODEL}: {e}", exc_info=True)
612 | # Check for specific Google API errors if needed
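
Note: `safety_settings` is passed to generate_content_async but its construction is not visible in this diff. Given the HarmCategory and HarmBlockThreshold types imported at the top of the file, a typical construction looks like the sketch below (the specific categories and thresholds are assumptions, not necessarily what main.py uses):

    safety_settings = {
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    }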
@@ -617,14 +655,13 @@ Key Bullet Points Summary:"""
617 |
618 | async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
619 | """ Calls the OpenRouter API to generate a summary. """
620 | global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
621 | if not _openrouter_fallback_enabled:
622 | logger.error("[OpenRouter Fallback] Called but is disabled.");
623 | return None, "Error: Fallback AI service (OpenRouter) not configured/available."
624 |
625 | -
626 | - # Example: 32k tokens ~ 120k chars. Deepseek is large though. Check model specifics if issues arise.
627 | - max_input_len_openrouter = 100000 # Adjust based on OPENROUTER_MODEL limits if known
628 | if len(text) > max_input_len_openrouter:
629 | logger.warning(f"[OpenRouter Fallback] Input text length ({len(text)}) exceeds approx limit ({max_input_len_openrouter}) for {OPENROUTER_MODEL}. Truncating.")
630 | text = text[:max_input_len_openrouter]

@@ -682,11 +719,13 @@ Key Bullet Points Summary:"""
682 | try:
683 | data = response.json()
684 | if data.get("choices") and len(data["choices"]) > 0:
685 | -
686 | if message and message.get("content"):
687 | summary_text = message["content"].strip()
688 | if summary_text:
689 | - finish_reason = data["choices"][0].get("finish_reason", "N/A")
690 | logger.info(f"[OpenRouter Fallback] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason}")
691 | # Check for length finish reason?
692 | if finish_reason == 'length':
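
Note: only the response parsing of the OpenRouter call appears in this diff. The fields it reads (`choices[0].message.content`, `finish_reason`) match OpenRouter's OpenAI-compatible chat completions API, so the request side presumably looks roughly like the sketch below (variable names and the prompt assembly are illustrative, not taken from main.py):

    payload = {
        "model": OPENROUTER_MODEL,
        "messages": [
            {"role": "user", "content": f"{summary_prompt}\n\n{text}"},  # summary_prompt is hypothetical
        ],
    }
    headers = {"Authorization": f"Bearer {OPENROUTER_API_KEY}"}
    # response = await client.post("https://openrouter.ai/api/v1/chat/completions",
    #                              json=payload, headers=headers, timeout=120)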
@@ -735,6 +774,7 @@ Key Bullet Points Summary:"""
735 |
736 | async def generate_summary(text: str, summary_type: str) -> str:
737 | """ Generates a summary using the primary AI (Gemini) and falling back to OpenRouter. """
738 | global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
739 | logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")
740 | final_summary: Optional[str] = None

@@ -791,6 +831,7 @@ async def process_summary_task(
791 | bot_token: str
792 | ) -> None:
793 | """Handles fetching content, generating summary, and sending results."""
794 | task_id = f"{user_id}-{message_id_to_edit or 'new'}"
795 | logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
796 | background_request: Optional[BaseRequest] = None

@@ -798,8 +839,8 @@ async def process_summary_task(
798 | content: Optional[str] = None
799 | user_feedback_message: Optional[str] = None
800 | success = False
801 | - status_message_id = message_id_to_edit
802 | - message_to_delete_later_id: Optional[int] = None
803 |
804 | try:
805 | # Initialize background bot
@@ -807,12 +848,15 @@ async def process_summary_task(
807 | bot = Bot(token=bot_token, request=background_request)
808 | except Exception as e:
809 | logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True)
810 | return
811 |
812 | try:
813 | # Send/Edit "Processing..." message
814 | processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n{html.escape(url)}\n\nThis might take a moment..."
815 | -
816 | try:
817 | await retry_bot_operation(
818 | bot.edit_message_text,

@@ -820,32 +864,15 @@ async def process_summary_task(
820 | message_id=status_message_id,
821 | text=processing_message_text,
822 | parse_mode=ParseMode.HTML, # Use HTML for escaped URL
823 | - reply_markup=None,
824 | link_preview_options={'is_disabled': True} # Disable preview here too
825 | )
826 | - logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
827 | -
828 | - logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new.")
829 | - status_message_id = None # Flag to send a new message
830 | -
831 | - if not status_message_id: # If editing failed or wasn't applicable
832 | - try:
833 | - status_message = await retry_bot_operation(
834 | - bot.send_message,
835 | - chat_id=chat_id,
836 | - text=processing_message_text,
837 | - parse_mode=ParseMode.HTML, # Use HTML for escaped URL
838 | - link_preview_options={'is_disabled': True}
839 | - )
840 | - if status_message:
841 | - message_to_delete_later_id = status_message.message_id
842 | - logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}")
843 | - else:
844 | - raise RuntimeError("Failed to send status message after retries.")
845 | except Exception as e:
846 | - logger.
847 | -
848 | - #
849 |
850 | # Indicate activity
851 | try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
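
Note: the construction of `background_request` is not legible in this diff; the task later closes `background_request._client`, so it is presumably an HTTPXRequest tuned for the long-running background task. A sketch under that assumption (the timeout values are illustrative):

    background_request = HTTPXRequest(
        connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0
    )
    bot = Bot(token=bot_token, request=background_request)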
@@ -870,6 +897,7 @@ async def process_summary_task(
870 | content = await get_website_content_via_crawl4ai(url)
871 |
872 | if not content:
873 | logger.warning(f"[Task {task_id}] Crawl4AI failed for {url}. Attempting BeautifulSoup (Fallback 1)...")
874 | try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
875 | except Exception: pass

@@ -883,8 +911,6 @@ async def process_summary_task(
883 | except Exception: pass
884 | content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
885 | if not content:
886 | - # Check if the specific error was insufficient credits
887 | - # Note: get_website_content_via_api already logs the specific error
888 | logger.error(f"[Task {task_id}] API fallback (urltotext) also failed for {url}.")
889 | user_feedback_message = "Sorry, I couldn't fetch content from that website using any method (Crawl4AI/BS4 failed, API failed or ran out of credits)." # Updated message
890 | else:
@@ -908,68 +934,50 @@ async def process_summary_task(
908 | if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
909 | user_feedback_message = final_summary # Pass AI error message to user
910 | logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
911 | else:
912 | - # Success - Send the summary
913 | summary_parts = []
914 | current_part = ""
915 | # Split respecting newlines, ensure no part exceeds MAX_SUMMARY_CHUNK_SIZE
916 | lines = final_summary.splitlines(keepends=True)
917 | for line in lines:
918 | - # If adding the next line exceeds the limit, finalize the current part
919 | if len(current_part) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
920 | if current_part.strip(): # Don't add empty parts
921 | summary_parts.append(current_part.strip())
922 | - current_part = line
923 | - # If a single line itself is too long, truncate it (edge case)
924 | if len(current_part) > MAX_SUMMARY_CHUNK_SIZE:
925 | logger.warning(f"[Task {task_id}] Truncating overly long line in summary.")
926 | current_part = current_part[:MAX_SUMMARY_CHUNK_SIZE]
927 | else:
928 | current_part += line
929 | -
930 | - # Add the last part if it has content
931 | - if current_part.strip():
932 | - summary_parts.append(current_part.strip())
933 | -
934 | - # If somehow splitting resulted in nothing (e.g., empty summary initially?)
935 | if not summary_parts:
936 | summary_parts.append("Summary generated, but it appears to be empty.")
937 | logger.warning(f"[Task {task_id}] Summary was non-empty initially but splitting resulted in zero parts.")
938 |
939 | -
940 | logger.info(f"[Task {task_id}] Summary generated (orig len: {len(final_summary)}). Sending in {len(summary_parts)} part(s).")
941 |
942 | - #
943 | -
944 | -
945 | - message_sent = False
946 | -
947 | - if edit_target_id:
948 | try:
949 | - # Try editing the status message first
950 | await retry_bot_operation(
951 | bot.edit_message_text,
952 | chat_id=chat_id,
953 | - message_id=
954 | text=summary_parts[0],
955 | parse_mode=None, # Send as plain text initially, safer
956 | link_preview_options={'is_disabled': True}
957 | )
958 | - logger.debug(f"[Task {task_id}] Edited message {
959 | -
960 | - if message_to_delete_later_id == edit_target_id: message_to_delete_later_id = None
961 | - # If it was the *original* button message that we are editing, keep status_message_id
962 | - # so we know *not* to delete it in finally block if it's the only message left.
963 | - # However, it's clearer to just prevent deletion if edited.
964 | - if status_message_id == edit_target_id: status_message_id = None # Mark as handled
965 | -
966 | - message_sent = True
967 | except Exception as edit_err:
968 | -
969 | -
970 |
971 | - if
972 | -
973 | sent_msg = await retry_bot_operation(
974 | bot.send_message,
975 | chat_id=chat_id,
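
Note: the splitting loop above (old lines 913-937) is interleaved with the sending logic. An equivalent standalone helper, convenient for unit testing, reconstructed from the visible lines (the function name is hypothetical and the edge-case behaviour is inferred, not copied from main.py):

    from typing import List

    def split_summary(summary: str, limit: int = 4000) -> List[str]:
        parts: List[str] = []
        current = ""
        for line in summary.splitlines(keepends=True):
            if len(current) + len(line) > limit:
                if current.strip():          # don't add empty parts
                    parts.append(current.strip())
                current = line
                if len(current) > limit:     # a single over-long line: truncate it
                    current = current[:limit]
            else:
                current += line
        if current.strip():                  # add the last part if it has content
            parts.append(current.strip())
        return parts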
@@ -977,15 +985,13 @@ async def process_summary_task(
977 | parse_mode=None,
978 | link_preview_options={'is_disabled': True}
979 | )
980 | - if sent_msg:
981 | - logger.debug(f"[Task {task_id}] Sent first summary part as new message {sent_msg.message_id}.")
982 | - else: # Should be caught by retry, but log defensively
983 | logger.error(f"[Task {task_id}] Failed to send first summary part even as new message.")
984 | - user_feedback_message = "Sorry, failed to send the summary."
985 | -
986 |
987 | # Send remaining parts (if any and first part succeeded)
988 | - if
989 | for i, part in enumerate(summary_parts[1:], start=2):
990 | await asyncio.sleep(0.5) # Small delay between parts
991 | try:
@@ -1000,40 +1006,40 @@ async def process_summary_task(
1000 | except Exception as part_err:
1001 | logger.error(f"[Task {task_id}] Failed to send summary part {i}: {part_err}")
1002 | user_feedback_message = f"Sorry, failed to send part {i} of the summary."
1003 | -
1004 | break # Stop sending remaining parts
1005 |
1006 | - # Determine overall success based on whether feedback message is set
1007 | if not user_feedback_message:
1008 | success = True
1009 | - # user_feedback_message = None # Clear feedback message ONLY on full success
1010 |
1011 | - # --- Handle Cases
1012 | - if
1013 | logger.warning(f"[Task {task_id}] Sending failure/error feedback to user: {user_feedback_message}")
1014 | try:
1015 | - # Try editing the
1016 | -
1017 | -
1018 | - if feedback_target_id:
1019 | try:
1020 | await retry_bot_operation(
1021 | bot.edit_message_text,
1022 | chat_id=chat_id,
1023 | - message_id=
1024 | text=user_feedback_message,
1025 | link_preview_options={'is_disabled': True},
1026 | reply_markup=None # Remove buttons
1027 | )
1028 | - logger.debug(f"[Task {task_id}] Edited message {
1029 | -
1030 | - if message_to_delete_later_id == feedback_target_id: message_to_delete_later_id = None
1031 | - if status_message_id == feedback_target_id: status_message_id = None
1032 | - message_sent = True
1033 | except Exception as edit_err:
1034 | - logger.warning(f"[Task {task_id}] Failed to edit message {
1035 |
1036 | -
1037 | await retry_bot_operation(
1038 | bot.send_message,
1039 | chat_id=chat_id,
@@ -1046,36 +1052,28 @@ async def process_summary_task(
1046 |
1047 | except Exception as e:
1048 | # Catch-all for unexpected errors during the main processing logic
1049 | - logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True)
1050 | user_feedback_message = "Oops! Something went wrong while processing your request. Please try again later."
1051 | if bot: # Ensure bot exists before trying to send
1052 | try:
1053 | - # Attempt to send a final error message
1054 | -
1055 | -
1056 | -
1057 | -
1058 | -
1059 | except Exception as final_err:
1060 | logger.error(f"[Task {task_id}] Failed to send the final unexpected error feedback: {final_err}")
1061 |
1062 | finally:
1063 | # --- Cleanup ---
1064 | - #
1065 | - if
1066 | -
1067 | - await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=message_to_delete_later_id)
1068 | - logger.debug(f"[Task {task_id}] Deleted temporary status message {message_to_delete_later_id}")
1069 | - except Exception as del_e:
1070 | - logger.warning(f"[Task {task_id}] Failed to delete temporary status message {message_to_delete_later_id}: {del_e}")
1071 | -
1072 | - # Explicitly DO NOT delete the original message with buttons (status_message_id)
1073 | - # if it was successfully edited with the final result or error message.
1074 | - # The logic above sets status_message_id = None if it was edited.
1075 | - # If status_message_id still holds the ID here, it means editing failed and we sent a *new* message.
1076 | - # In that failure case, maybe we *should* delete the original button message? Or leave it?
1077 | - # Let's leave it for now to avoid deleting user context if things went very wrong.
1078 | - # Deleting message_to_delete_later_id covers the main cleanup case.
1079 |
1080 | # Close the background bot's HTTP client
1081 | if background_request and hasattr(background_request, '_client') and background_request._client:
@@ -1090,26 +1088,23 @@ async def process_summary_task(
1090 |
1091 | # --- Telegram Handlers ---
1092 | # (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler)
1093 | - #
1094 |
1095 | async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1096 | - # ... (Keep existing implementation) ...
1097 | user = update.effective_user; mention = user.mention_html()
1098 | if not user or not update.message: return
1099 | logger.info(f"User {user.id} ({user.username or 'N/A'}) used /start.")
1100 | await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" )
1101 |
1102 | async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1103 | - # ... (Keep existing implementation) ...
1104 | user = update.effective_user
1105 | if not user or not update.message: return
1106 | logger.info(f"User {user.id} ({user.username or 'N/A'}) used /help.")
1107 | - # Updated help text slightly
1108 | help_text = ( "🔍 **How to use:**\n\n"
1109 | "1. Send me any YouTube video link or website URL.\n"
1110 | "2. I'll ask how you want it summarised (paragraph or points).\n"
1111 | "3. Click the button for your choice.\n"
1112 | - "4. Wait
1113 | "⚙️ **Behind the scenes:**\n"
1114 | f"• **Websites:** I try `Crawl4AI` (smart crawl), then `BeautifulSoup` (basic scrape), and `urltotext.com` API (if configured & credits available).\n"
1115 | "• **YouTube:** I use `youtube-transcript-api` first, then fall back to `Supadata` and `Apify` APIs if needed.\n"
@@ -1120,15 +1115,14 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1120 | await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
1121 |
1122 | async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1123 | - # ... (Keep existing implementation) ...
1124 | if not update.message or not update.message.text: return
1125 | url = update.message.text.strip(); user = update.effective_user
1126 | if not user: return
1127 | # Basic URL validation
1128 | -
1129 | logger.debug(f"Ignoring non-URL from {user.id}: {url}")
1130 | -
1131 | - await update.message.reply_text("Hmm, that doesn't look like a valid web URL. Please make sure it starts with `http://` or `https://`.", parse_mode=ParseMode.MARKDOWN)
1132 | return
1133 | logger.info(f"User {user.id} ({user.username or 'N/A'}) sent potential URL: {url}")
1134 | # Store URL and original message ID in user_data

@@ -1140,12 +1134,11 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1140 | await update.message.reply_html( f"Okay, I see this link:\n<code>{escaped_url}</code>\n\nHow would you like it summarised?", reply_markup=reply_markup, disable_web_page_preview=True )
1141 |
1142 | async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1143 | - # ... (Keep existing implementation, check AI config) ...
1144 | query = update.callback_query
1145 | if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
1146 | user = query.from_user; summary_type = query.data; query_id = query.id
1147 | try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id} ({user.username or 'N/A'})")
1148 | - except Exception as e: logger.
1149 |
1150 | url = context.user_data.get('url_to_summarize')
1151 | message_id_to_edit = query.message.message_id # The message with the buttons
@@ -1161,10 +1154,6 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1161 | except Exception as e: logger.error(f"Failed edit 'URL not found' msg: {e}")
1162 | return # Do not proceed further
1163 |
1164 | - # Clear context *only after* successfully scheduling the task below
1165 | - # context.user_data.pop('url_to_summarize', None) # Moved clearing
1166 | - # context.user_data.pop('original_message_id', None) # Moved clearing
1167 | -
1168 | # Check necessary configurations before scheduling
1169 | global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
1170 | if not TELEGRAM_TOKEN:

@@ -1177,13 +1166,13 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1177 | try: await query.edit_message_text(text="❌ AI configuration error: No summarization models are available. Cannot proceed.", reply_markup=None)
1178 | except Exception: pass
1179 | return
1180 | - # Log warnings if one model is missing, but proceed if at least one is available
1181 | elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) is unavailable, will rely on fallback (OpenRouter).")
1182 | elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) is unavailable, relying on primary (Gemini).")
1183 |
1184 | # Schedule the background task
1185 | logger.info(f"Scheduling task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit} (URL: {url})")
1186 | - asyncio.create_task
1187 | process_summary_task(
1188 | user_id=user.id,
1189 | chat_id=query.message.chat_id,
@@ -1191,28 +1180,21 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1191 | url=url,
1192 | summary_type=summary_type,
1193 | bot_token=TELEGRAM_TOKEN # Pass token explicitly
1194 | - )
1195 | -
1196 | )
1197 |
1198 | - # Clear context AFTER scheduling the task
1199 | context.user_data.pop('url_to_summarize', None)
1200 | context.user_data.pop('original_message_id', None)
1201 | logger.debug(f"Cleared URL context for user {user.id} after scheduling task.")
1202 |
1203 | - #
1204 | - # This prevents the user clicking again while the task starts up.
1205 | - # try:
1206 | - # await query.edit_message_text(text=f"Okay, starting '{summary_type}' summary...", reply_markup=None)
1207 | - # except Exception as e:
1208 | - # logger.warning(f"Could not edit button message immediately after scheduling: {e}")
1209 | - # This initial edit will be quickly overwritten by the task's "Processing..." message.
1210 | -
1211 |
1212 | async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
1213 | # ... (Keep existing implementation) ...
1214 | -
1215 | - ignore_errors = (AttributeError, BadRequest, TimedOut, NetworkError, RetryAfter) # Add common transient errors
1216 | if isinstance(context.error, ignore_errors):
1217 | ignore_messages = ["message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user"]
1218 | err_str = str(context.error).lower()

@@ -1220,10 +1202,6 @@ async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
1220 | logger.warning(f"Ignoring known/handled/transient error in error_handler: {context.error}")
1221 | return
1222 | logger.error("Exception while handling an update:", exc_info=context.error)
1223 | - # Consider notifying the user about unexpected errors if appropriate and possible
1224 | - # if isinstance(update, Update) and update.effective_chat:
1225 | - # try: await context.bot.send_message(chat_id=update.effective_chat.id, text="An unexpected error occurred.")
1226 | - # except Exception: pass
1227 |
1228 |
1229 | # --- Application Setup ---
@@ -1231,22 +1209,19 @@ async def setup_bot_config() -> Application:
1231 | # ... (Keep existing implementation) ...
1232 | logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
1233 | if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
1234 | - # Configure HTTPX client with slightly longer timeouts for PTB internal requests if needed
1235 | custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 )
1236 | application = ( Application.builder() .token(TELEGRAM_TOKEN) .request(custom_request) .build() )
1237 | # Add Handlers
1238 | application.add_handler(CommandHandler("start", start))
1239 | application.add_handler(CommandHandler("help", help_command))
1240 | - # Use a slightly broader filter to catch URLs even without explicit entity type from Telegram
1241 | url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
1242 | application.add_handler(MessageHandler(url_filter, handle_potential_url))
1243 | application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
1244 | - # Error Handler
1245 | application.add_error_handler(error_handler)
1246 | logger.info("Telegram application handlers configured."); return application
1247 |
1248 | # --- ASGI Lifespan & Routes ---
1249 | - # (lifespan,
1250 | @contextlib.asynccontextmanager
1251 | async def lifespan(app: Starlette):
1252 | global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN
@@ -1254,33 +1229,39 @@ async def lifespan(app: Starlette):
1254 | if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.")
1255 | try:
1256 | ptb_app = await setup_bot_config()
1257 | - await ptb_app.initialize()
1258 | bot_info = await ptb_app.bot.get_me()
1259 | logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})")
1260 |
1261 | # Webhook setup logic (remains the same)
1262 | current_webhook_info = await ptb_app.bot.get_webhook_info()
1263 | if current_webhook_info and current_webhook_info.url:
1264 | logger.info(f"Found existing webhook: {current_webhook_info.url}. Deleting...")
1265 | try:
1266 | if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted.")
1267 | - else: logger.warning("Failed delete webhook (API returned False).")
1268 | - except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1)
1269 |
1270 | space_host = os.environ.get("SPACE_HOST")
1271 | webhook_path = "/webhook" # Standard path
1272 | full_webhook_url = None
1273 | if space_host:
1274 | -
1275 | -
1276 | -
1277 | -
1278 |
1279 | -
1280 | logger.info(f"Attempting to set webhook: {full_webhook_url}")
1281 | set_webhook_args = {
1282 | "url": full_webhook_url,
1283 | - "allowed_updates": Update.ALL_TYPES,
1284 | "drop_pending_updates": True
1285 | }
1286 | if WEBHOOK_SECRET:

@@ -1290,30 +1271,28 @@ async def lifespan(app: Starlette):
1290 | await asyncio.sleep(1.0) # Short delay before setting
1291 |
1292 | try:
1293 | - await ptb_app.bot.set_webhook(**set_webhook_args)
1294 | -
1295 | webhook_info = await ptb_app.bot.get_webhook_info()
1296 | if webhook_info and webhook_info.url == full_webhook_url:
1297 | - logger.info(f"Webhook set successfully: URL='{webhook_info.url}', Secret={'YES' if WEBHOOK_SECRET else 'NO'}")
1298 | - # Log other info if useful: e.g., webhook_info.last_error_message
1299 | if webhook_info.last_error_message:
1300 | logger.warning(f"Webhook status has last error: {webhook_info.last_error_message}")
1301 | else:
1302 | - logger.error(f"Webhook
1303 | - raise RuntimeError("Webhook
1304 |
1305 | - await ptb_app.start() # Start PTB processing updates
1306 | logger.info("PTB Application started (webhook mode).")
1307 | except Exception as e:
1308 | - logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True)
1309 | - raise RuntimeError(f"Failed to set webhook: {e}") from e
1310 | -
1311 | -
1312 | -
1313 | -
1314 | - # You might decide to raise an error or try to run in polling mode if webhook fails
1315 | - raise RuntimeError("Webhook URL undetermined (SPACE_HOST missing).")
1316 | -
1317 |
1318 | logger.info("ASGI Lifespan: Startup complete.");
1319 | yield # Application runs here
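
Note: the removed lines that build `full_webhook_url` from `space_host` and `webhook_path` are not legible in this diff. On Hugging Face Spaces, SPACE_HOST is a bare hostname, so the URL is typically assembled along these lines (a sketch, not the project's exact code):

    protocol = "https"
    host = space_host.split("://")[-1].rstrip("/")   # tolerate a host that already includes a scheme
    full_webhook_url = f"{protocol}://{host}{webhook_path}"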
@@ -1324,7 +1303,8 @@ async def lifespan(app: Starlette):
1324 | if ptb_app:
1325 | try:
1326 | if ptb_app.running: await ptb_app.stop()
1327 | -
1328 | except Exception as shutdown_err:
1329 | logger.error(f"Error during shutdown after startup failure: {shutdown_err}")
1330 | raise # Reraise the original startup error

@@ -1337,36 +1317,42 @@ async def lifespan(app: Starlette):
1337 | if ptb_app.running:
1338 | logger.info("Stopping PTB application...")
1339 | await ptb_app.stop()
1340 | -
1341 | -
1342 | -
1343 | except Exception as e:
1344 | logger.error(f"Error during PTB shutdown: {e}", exc_info=True)
1345 | else:
1346 | - logger.info("PTB application
1347 | logger.info("ASGI Lifespan: Shutdown complete.")
1348 |
1349 |
1350 | async def health_check(request: Request) -> PlainTextResponse:
1351 | - # ... (Keep existing implementation, updated with model names) ...
1352 | global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled, _crawl4ai_primary_web_enabled, _urltotext_fallback_enabled, SUPADATA_API_KEY
1353 | bot_status = "Not Initialized"
1354 | bot_username = "N/A"
1355 | -
1356 | try:
1357 | # Quick check if webhook seems ok, more reliable than get_me() sometimes
1358 | wh_info = await ptb_app.bot.get_webhook_info()
1359 | if ptb_app.running and wh_info and wh_info.url:
1360 | bot_info = await ptb_app.bot.get_me()
1361 | bot_username = f"@{bot_info.username}"
1362 | bot_status = f"Running (Webhook OK, {bot_username})"
1363 | elif ptb_app.running:
1364 | - bot_status = "Running (Webhook
1365 | else: bot_status = "Initialized/Not running"
1366 | - except Exception as e:
1367 | elif ptb_app:
1368 | - bot_status = "Initializing..."
1369 | -
1370 |
1371 | health_info = [
1372 | f"=== Telegram Summary Bot Status ===",
@@ -1390,6 +1376,9 @@ async def telegram_webhook(request: Request) -> Response:
1390 | if not ptb_app:
1391 | logger.error("Webhook received but PTB application not initialized.")
1392 | return PlainTextResponse('Bot not initialized', status_code=503) # Service Unavailable
1393 | if not ptb_app.running:
1394 | logger.warning("Webhook received but PTB application not running.")
1395 | return PlainTextResponse('Bot not running', status_code=503) # Service Unavailable

@@ -1443,9 +1432,13 @@ if __name__ == '__main__':
1443 |
1444 | # Make sure necessary env vars are loaded for local dev if not set system-wide
1445 | # Example using python-dotenv if you add it to requirements-dev.txt
1446 | -
1447 | -
1448 | -
1449 |
1450 | # Re-check required tokens after potential .env load
1451 | if not get_secret('TELEGRAM_TOKEN'): logger.critical("Local Dev: TELEGRAM_TOKEN not found.")
Updated file content (new line numbers; added lines marked with +):

53 | # --- Google Gemini ---
54 | try:
55 | import google.generativeai as genai
56 | + # Import specific types needed, check library for exact names if errors occur
57 | from google.generativeai.types import HarmCategory, HarmBlockThreshold, GenerateContentResponse
58 | _gemini_available = True
59 | except ImportError:

113 | # Models (User can still configure via env vars)
114 | OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model
115 | APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
116 | + # *** Reverted Model Name as requested ***
117 | + GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001") # Primary Model
118 |
119 | # --- Configuration Checks ---
120 | if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")

160 |
161 | # --- Constants ---
162 | MAX_SUMMARY_CHUNK_SIZE = 4000 # Max characters per Telegram message (allow buffer)
163 | + # Adjust based on gemini-2.0-flash context window if known, 1M was for 1.5 Flash
164 | + # Let's use a more conservative estimate for 2.0 Flash, e.g., 128k tokens ~ 500k chars? Check Gemini docs.
165 | + # Sticking with a large number for now, but be aware this might need adjustment.
166 | + MAX_INPUT_TOKEN_APPROX = 500000
167 |
168 | # --- Retry Decorator ---
169 | # (Remains the same)
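
Note: the new MAX_INPUT_TOKEN_APPROX constant is added here, but the code that uses it is not visible in this diff. Its presumed use, following the pattern of the OpenRouter fallback's character-level truncation (names other than the constant itself are assumptions):

    if len(text) > MAX_INPUT_TOKEN_APPROX:
        logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds approx limit ({MAX_INPUT_TOKEN_APPROX}). Truncating.")
        text = text[:MAX_INPUT_TOKEN_APPROX]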
185 | if any(err in str(e).lower() for err in ignore_errors):
186 | logger.warning(f"Ignoring non-critical BadRequest: {e}")
187 | return None
188 | + # Only raise if it's not an ignored error
189 | logger.error(f"Potentially critical BadRequest: {e}")
190 | raise
191 | except TelegramError as e:
192 | + # Log specific transient errors that might trigger retry
193 | + if isinstance(e, (TimedOut, NetworkError, RetryAfter)):
194 | + logger.warning(f"Telegram transient error (will retry): {e}")
195 | + else:
196 | + logger.error(f"Unhandled TelegramError: {e}") # Log other Telegram errors
197 | + raise # Reraise to allow tenacity to handle retry
198 | except Exception as e:
199 | logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
200 | raise

214 | # --- Content Fetching Functions ---
215 |
216 | # - YouTube Transcript Fetching (get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript) -
217 | + # (No changes needed in these functions)
218 | async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
219 | # ... (Keep existing implementation) ...
220 | if not video_id: logger.error("[Supadata] No video_id provided"); return None

304 | transcript_text = None
305 | logger.info("[Primary YT] Attempting youtube-transcript-api...")
306 | try:
307 | + # Run the blocking call in a separate thread
308 | + transcript_list = await asyncio.to_thread(
309 | + YouTubeTranscriptApi.list_transcripts(video_id).find_generated_transcript(['en', 'en-GB', 'en-US']).fetch
310 | + )
311 | + # transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] ) # Old way
312 | if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
313 | + if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text.strip()
314 | else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None
315 | + except TranscriptsDisabled:
316 | + logger.warning(f"[Primary YT] Transcripts are disabled for video {video_id}")
317 | + transcript_text = None
318 | + except NoTranscriptFound:
319 | + logger.warning(f"[Primary YT] No English transcript found for video {video_id}")
320 | + transcript_text = None
321 | except Exception as e:
322 | logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
323 | + # if isinstance(e, (NoTranscriptFound, TranscriptsDisabled)): logger.warning(f"[Primary YT] Known issue: {type(e).__name__}") # Handled above
324 | transcript_text = None
325 |
326 | if transcript_text is None:
327 | logger.info("[Fallback YT 1] Trying Supadata API...")
328 | if SUPADATA_API_KEY:
329 | transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
330 | + if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text # Already stripped
331 | else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
332 | else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
333 |

335 | logger.info("[Fallback YT 2] Trying Apify REST API (SyncItems)...")
336 | if APIFY_API_TOKEN:
337 | transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
338 | + if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify SyncItems REST for {video_url}"); return transcript_text # Already stripped
339 | else: logger.warning(f"[Fallback YT 2] Apify SyncItems REST failed or no content for {video_url}.")
340 | else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
341 |

359 |
360 | logger.info(f"[Crawl4AI Primary] Attempting to crawl URL: {url}")
361 | # Define a writable cache directory (use /tmp in container environments)
362 | + # No longer creating dir here, relying on HOME=/tmp from Dockerfile
363 | + # cache_dir_path = "/tmp/.crawl4ai"
364 | + # try:
365 | + # os.makedirs(cache_dir_path, exist_ok=True)
366 | + # logger.info(f"[Crawl4AI Primary] Ensured cache directory exists: {cache_dir_path}")
367 | + # except OSError as e:
368 | + # logger.error(f"[Crawl4AI Primary] Failed to create cache directory {cache_dir_path}: {e}. Crawl may fail.")
369 | + # except Exception as e:
370 | + # logger.error(f"[Crawl4AI Primary] Unexpected error creating cache directory {cache_dir_path}: {e}")
371 |
372 | try:
373 | + # Use AsyncWebCrawler context manager.
374 | + # Removed explicit cache_dir, relying on HOME=/tmp from Dockerfile
375 | + async with AsyncWebCrawler() as crawler:
376 | + logger.info(f"[Crawl4AI Primary] Initialized crawler (HOME should be /tmp).")
377 | # Use arun for a single URL crawl
378 | result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90) # 90 sec timeout
379 |

384 | return content
385 | else:
386 | logger.warning(f"[Crawl4AI Primary] Crawl successful for {url}, but extracted markdown content is empty.")
387 | + # Try result.text as a fallback within crawl4ai success
388 | + if result.text:
389 | + content = result.text.strip()
390 | + if content:
391 | + logger.info(f"[Crawl4AI Primary] Using .text fallback after empty markdown. Length: {len(content)}")
392 | + return content
393 | + return None # Return None if both markdown and text are empty
394 | + elif result and result.text: # Fallback to raw text if markdown is somehow missing entirely
395 | content = result.text.strip()
396 | if content:
397 | + logger.info(f"[Crawl4AI Primary] Success crawling {url} (using .text, markdown missing). Length: {len(content)}")
398 | return content
399 | else:
400 | + logger.warning(f"[Crawl4AI Primary] Crawl successful for {url}, but extracted text content is empty (markdown missing).")
401 | return None
402 | else:
403 | logger.warning(f"[Crawl4AI Primary] Crawl failed or returned no result/content for {url}.")

406 | logger.error(f"[Crawl4AI Primary] Timeout occurred while crawling {url}")
407 | return None
408 | except PermissionError as e: # Catch the specific error
409 | + # Log more detail if possible
410 | + logger.error(f"[Crawl4AI Primary] Permission denied during crawl for {url}. Target path: '{e.filename}'. Likely filesystem issue or HOME=/tmp not effective. Error: {e}", exc_info=True)
411 | return None # Fail gracefully for this method
412 | except Exception as e:
413 | + logger.error(f"[Crawl4AI Primary] Unexpected error during crawl for {url}: {e}", exc_info=True)
414 | + # Log specific crawl4ai errors if they become apparent
415 | return None
416 |
417 |
454 |
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
|
455 |
element.extract()
|
456 |
# Try to find main content areas more broadly
|
457 |
+
selectors = ['main', 'article', '[role="main"]', '#content', '.content', '#main-content', '.main-content', '#body', '.body', '#article-body', '.article-body', '.post-content', '.entry-content'] # Added more common selectors
|
458 |
target_element = None
|
459 |
for selector in selectors:
|
460 |
+
try:
|
461 |
+
target_element = soup.select_one(selector)
|
462 |
+
if target_element: break
|
463 |
+
except Exception as sel_e: # Catch potential invalid CSS selectors from list
|
464 |
+
logger.warning(f"[BS4 Fallback] Invalid selector '{selector}': {sel_e}")
|
465 |
+
continue
|
466 |
+
|
467 |
|
468 |
if not target_element: target_element = soup.body # Fallback to body
|
469 |
if not target_element: logger.warning(f"[BS4 Fallback] Could not find body/main for parsing {url}"); return None
|
|
|
571 |
generation_config = genai.types.GenerationConfig(
|
572 |
# candidate_count=1, # Default is 1
|
573 |
# stop_sequences=["\n"],
|
574 |
+
max_output_tokens=2048, # Increased max tokens
|
575 |
temperature=0.7, # Adjust creativity vs factualness
|
576 |
# top_p=1.0, # Default
|
577 |
# top_k=None # Default
|
|
|
583 |
response: GenerateContentResponse = await model.generate_content_async( # Use async version
|
584 |
prompt,
|
585 |
generation_config=generation_config,
|
586 |
+
safety_settings=safety_settings,
|
587 |
+
# request_options={"timeout": 100} # Optional: Add timeout for the API call itself
|
588 |
)
|
|
|
589 |
|
590 |
# Check for safety blocks or other issues in response
|
591 |
if not response.candidates:
|
592 |
+
block_reason = "Unknown"
|
593 |
+
if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
|
594 |
+
block_reason = response.prompt_feedback.block_reason or "Reason not specified"
|
595 |
error_msg = f"Error: Gemini response blocked or empty. Reason: {block_reason}"
|
596 |
logger.error(f"[Gemini Primary] {error_msg}")
|
597 |
return None, error_msg
|
598 |
|
599 |
+
# *** FIXED FinishReason Check ***
|
600 |
+
# Access finish_reason from the first candidate
|
601 |
+
finish_reason_val = response.candidates[0].finish_reason
|
602 |
+
logger.debug(f"[Gemini Primary] Received response. Finish reason value: {finish_reason_val}")
|
603 |
+
|
604 |
+
# Convert the finish reason (might be enum/int/string) to uppercase string for reliable comparison
|
605 |
+
finish_reason_str = str(finish_reason_val).upper()
|
606 |
+
|
607 |
+
# Check if it's NOT a successful or expected non-error finish
|
608 |
+
# Expected "good" finishes: STOP, MAX_TOKENS
|
609 |
+
# Problematic finishes: SAFETY, RECITATION, OTHER, etc.
|
610 |
+
if finish_reason_str not in ["STOP", "MAX_TOKENS"]:
|
611 |
# Log safety ratings if available
|
612 |
safety_ratings_str = "N/A"
|
613 |
+
if hasattr(response.candidates[0], 'safety_ratings') and response.candidates[0].safety_ratings:
|
614 |
safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in response.candidates[0].safety_ratings])
|
615 |
+
|
616 |
+
error_msg = f"Error: Gemini generation finished unexpectedly. Reason: {finish_reason_str}. Safety: {safety_ratings_str}"
|
617 |
logger.error(f"[Gemini Primary] {error_msg}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
618 |
|
619 |
+
# Decide how to handle specific non-STOP reasons
|
620 |
+
if finish_reason_str == "SAFETY":
|
621 |
+
return None, error_msg # Return specific error for safety blocks
|
622 |
+
# For RECITATION, OTHER, etc., it's safer to return an error than potentially broken output
|
623 |
+
return None, f"Error: Gemini generation finished unexpectedly ({finish_reason_str})."
|
624 |
+
|
625 |
+
# Extract text (use response.text shortcut if available and reliable)
|
626 |
+
summary_text = ""
|
627 |
+
try:
|
628 |
+
# response.text might combine parts automatically, safer if available
|
629 |
+
summary_text = response.text
|
630 |
+
except ValueError as e:
|
631 |
+
# Handle cases where .text might raise error (e.g., if blocked, but caught above ideally)
|
632 |
+
logger.warning(f"[Gemini Primary] Error accessing response.text: {e}. Trying parts manually.")
|
633 |
+
# Fallback to joining parts if .text fails (less common now)
|
634 |
+
if response.candidates and response.candidates[0].content and response.candidates[0].content.parts:
|
635 |
+
summary_text = "".join(part.text for part in response.candidates[0].content.parts)
|
636 |
|
|
|
|
|
637 |
if not summary_text or not summary_text.strip():
|
638 |
logger.warning("[Gemini Primary] Gemini returned an empty summary.")
|
639 |
return None, "Error: AI generated an empty summary."
|
640 |
|
641 |
+
logger.info(f"[Gemini Primary] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason_str}")
|
642 |
return summary_text.strip(), None
|
643 |
|
644 |
+
except AttributeError as e:
|
645 |
+
# Catch potential errors if response structure is different than expected
|
646 |
+
logger.error(f"[Gemini Primary] Attribute error accessing Gemini response: {e}. Response structure might have changed.", exc_info=True)
|
647 |
+
return None, f"Error: Failed to parse Gemini response. Details: {e}"
|
648 |
except Exception as e:
|
649 |
logger.error(f"[Gemini Primary] Error during API call to {GEMINI_MODEL}: {e}", exc_info=True)
|
650 |
# Check for specific Google API errors if needed
|
|
|
655 |
|
656 |
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
657 |
""" Calls the OpenRouter API to generate a summary. """
|
658 |
+
# ... (Keep existing implementation - no changes needed here based on logs) ...
|
659 |
global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
|
660 |
if not _openrouter_fallback_enabled:
|
661 |
logger.error("[OpenRouter Fallback] Called but is disabled.");
|
662 |
return None, "Error: Fallback AI service (OpenRouter) not configured/available."
|
663 |
|
664 |
+
max_input_len_openrouter = 100000 # Adjust based on OPENROUTER_MODEL limits if known (Deepseek is large)
|
|
|
|
|
665 |
if len(text) > max_input_len_openrouter:
|
666 |
logger.warning(f"[OpenRouter Fallback] Input text length ({len(text)}) exceeds approx limit ({max_input_len_openrouter}) for {OPENROUTER_MODEL}. Truncating.")
|
667 |
text = text[:max_input_len_openrouter]
|
|
|
719 |
try:
|
720 |
data = response.json()
|
721 |
if data.get("choices") and len(data["choices"]) > 0:
|
722 |
+
choice = data["choices"][0]
|
723 |
+
message = choice.get("message")
|
724 |
+
finish_reason = choice.get("finish_reason", "N/A") # Get finish reason
|
725 |
+
|
726 |
if message and message.get("content"):
|
727 |
summary_text = message["content"].strip()
|
728 |
if summary_text:
|
|
|
729 |
logger.info(f"[OpenRouter Fallback] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason}")
|
730 |
# Check for length finish reason?
|
731 |
if finish_reason == 'length':
|
|
|
774 |
|
775 |
async def generate_summary(text: str, summary_type: str) -> str:
|
776 |
""" Generates a summary using the primary AI (Gemini) and falling back to OpenRouter. """
|
777 |
+
# ... (Keep existing implementation - no changes needed here based on logs) ...
|
778 |
global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
|
779 |
logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")
|
780 |
final_summary: Optional[str] = None
|
|
|
831 |
    bot_token: str
) -> None:
    """Handles fetching content, generating summary, and sending results."""
    # ... (Keep existing implementation - no changes needed here based on logs) ...
    task_id = f"{user_id}-{message_id_to_edit or 'new'}"
    logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
    background_request: Optional[BaseRequest] = None
    # ...
    content: Optional[str] = None
    user_feedback_message: Optional[str] = None
    success = False
    status_message_id = message_id_to_edit  # Keep track of the original button message ID
    message_to_delete_later_id: Optional[int] = None  # For temporary "Processing..." messages

    try:
        # Initialize background bot
        # ...
        bot = Bot(token=bot_token, request=background_request)
    except Exception as e:
        logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True)
        # Cannot send feedback without bot
        return

    try:
        # Send/Edit "Processing..." message
        # We *always* edit the original button message first
        processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n{html.escape(url)}\n\nThis might take a moment..."
        edited_original_msg = False
        if status_message_id:  # If we have the ID of the message with buttons
            try:
                await retry_bot_operation(
                    bot.edit_message_text,
                    # ...
                    message_id=status_message_id,
                    text=processing_message_text,
                    parse_mode=ParseMode.HTML,  # Use HTML for escaped URL
                    reply_markup=None,  # Remove buttons
                    link_preview_options={'is_disabled': True}  # Disable preview here too
                )
                logger.debug(f"[Task {task_id}] Edited original button message {status_message_id} to 'Processing'")
                edited_original_msg = True
            except Exception as e:
                logger.warning(f"[Task {task_id}] Could not edit original button message {status_message_id}: {e}. Will proceed without initial status update on that message.")
                # Don't send a new message here, the final result/error will handle it.
                # Keep status_message_id so we know the original message still exists.

        # Indicate activity
        try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
        # ...
            content = await get_website_content_via_crawl4ai(url)

            if not content:
                # Log the failure reason from Crawl4AI if available (already logged in the function)
                logger.warning(f"[Task {task_id}] Crawl4AI failed for {url}. Attempting BeautifulSoup (Fallback 1)...")
                try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                except Exception: pass
                # ...
                    except Exception: pass
                    content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
                    if not content:
                        logger.error(f"[Task {task_id}] API fallback (urltotext) also failed for {url}.")
                        user_feedback_message = "Sorry, I couldn't fetch content from that website using any method (Crawl4AI/BS4 failed, API failed or ran out of credits)."  # Updated message
                    else:
                        # ...
            if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
                user_feedback_message = final_summary  # Pass AI error message to user
                logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
                success = False  # Explicitly mark as failure
            else:
                # Success - Split and Send the summary
                summary_parts = []
                current_part = ""
                # Split respecting newlines, ensure no part exceeds MAX_SUMMARY_CHUNK_SIZE
                lines = final_summary.splitlines(keepends=True)
                for line in lines:
                    if len(current_part) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
                        if current_part.strip():  # Don't add empty parts
                            summary_parts.append(current_part.strip())
                        current_part = line
                        if len(current_part) > MAX_SUMMARY_CHUNK_SIZE:
                            logger.warning(f"[Task {task_id}] Truncating overly long line in summary.")
                            current_part = current_part[:MAX_SUMMARY_CHUNK_SIZE]
                    else:
                        current_part += line
                if current_part.strip(): summary_parts.append(current_part.strip())
                if not summary_parts:
                    summary_parts.append("Summary generated, but it appears to be empty.")
                    logger.warning(f"[Task {task_id}] Summary was non-empty initially but splitting resulted in zero parts.")

                logger.info(f"[Task {task_id}] Summary generated (orig len: {len(final_summary)}). Sending in {len(summary_parts)} part(s).")

                # Try to edit the original button message with the *first* part of the summary
                edited_final_result = False
                if status_message_id:  # If we have the original button message ID
                    try:
                        await retry_bot_operation(
                            bot.edit_message_text,
                            chat_id=chat_id,
                            message_id=status_message_id,
                            text=summary_parts[0],
                            parse_mode=None,  # Send as plain text initially, safer
                            link_preview_options={'is_disabled': True}
                        )
                        logger.debug(f"[Task {task_id}] Edited original button message {status_message_id} with first summary part.")
                        edited_final_result = True
                    except Exception as edit_err:
                        logger.warning(f"[Task {task_id}] Failed to edit original button message {status_message_id} with summary part 1: {edit_err}. Sending as new message.")
                        # Original message remains as "Processing..." or its initial state if first edit failed

                # If editing failed, or if there was no original message_id (shouldn't happen), send first part as new message
                if not edited_final_result:
                    sent_msg = await retry_bot_operation(
                        bot.send_message,
                        chat_id=chat_id,
                        # ...
                        parse_mode=None,
                        link_preview_options={'is_disabled': True}
                    )
                    if not sent_msg:
                        logger.error(f"[Task {task_id}] Failed to send first summary part even as new message.")
                        user_feedback_message = "Sorry, failed to send the summary."
                        success = False

                # Send remaining parts (if any and first part succeeded)
                if success and len(summary_parts) > 1:  # Check success flag before sending more parts
                    for i, part in enumerate(summary_parts[1:], start=2):
                        await asyncio.sleep(0.5)  # Small delay between parts
                        try:
                            # ...
                        except Exception as part_err:
                            logger.error(f"[Task {task_id}] Failed to send summary part {i}: {part_err}")
                            user_feedback_message = f"Sorry, failed to send part {i} of the summary."
                            success = False  # Mark as failure
                            break  # Stop sending remaining parts

                # Determine overall success based on whether feedback message is set *during this block*
                if not user_feedback_message:
                    success = True

        # --- Handle Failure Cases (Content Fetching Failed or Summary Failed) ---
        if not success:  # Check overall success flag
            if not user_feedback_message:  # If success is false but no specific message, set a generic one
                user_feedback_message = "Sorry, an unknown error occurred during processing."
                logger.error(f"[Task {task_id}] Task failed but no specific user_feedback_message was set.")

            logger.warning(f"[Task {task_id}] Sending failure/error feedback to user: {user_feedback_message}")
            try:
                # Try editing the original button message with the error
                edited_final_error = False
                if status_message_id:
                    try:
                        await retry_bot_operation(
                            bot.edit_message_text,
                            chat_id=chat_id,
                            message_id=status_message_id,
                            text=user_feedback_message,
                            link_preview_options={'is_disabled': True},
                            reply_markup=None  # Remove buttons
                        )
                        logger.debug(f"[Task {task_id}] Edited original button message {status_message_id} with failure feedback.")
                        edited_final_error = True
                    except Exception as edit_err:
                        logger.warning(f"[Task {task_id}] Failed to edit original button message {status_message_id} with failure feedback: {edit_err}. Sending as new message.")

                # If editing failed or wasn't applicable, send error as a new message
                if not edited_final_error:
                    await retry_bot_operation(
                        bot.send_message,
                        chat_id=chat_id,
                        # ...

    except Exception as e:
        # Catch-all for unexpected errors during the main processing logic
        logger.error(f"[Task {task_id}] Unexpected error during processing task: {e}", exc_info=True)
        success = False
        user_feedback_message = "Oops! Something went wrong while processing your request. Please try again later."
        if bot:  # Ensure bot exists before trying to send
            try:
                # Attempt to send a final error message (maybe edit original if possible?)
                edited_final_crash_error = False
                if status_message_id:
                    try:
                        await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=user_feedback_message, reply_markup=None, link_preview_options={'is_disabled': True} )
                        edited_final_crash_error = True
                    except Exception: pass  # Ignore error editing here, just try sending new
                if not edited_final_crash_error:
                    await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message )
            except Exception as final_err:
                logger.error(f"[Task {task_id}] Failed to send the final unexpected error feedback: {final_err}")

    finally:
        # --- Cleanup ---
        # No automatic deletion needed now. The original message (status_message_id) is either
        # edited with the result/error, or left as is if editing failed. Temporary messages
        # were not introduced in this version of the logic.

        # Close the background bot's HTTP client
        if background_request and hasattr(background_request, '_client') and background_request._client:
            # ...

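# The newline-respecting splitter inside process_summary_task is easy to reuse and
# test in isolation. A standalone sketch of the same algorithm follows; the default
# of 4000 characters mirrors MAX_SUMMARY_CHUNK_SIZE defined near the top of this
# file, and the helper name is illustrative rather than part of the bot.
from typing import List

def _split_summary_sketch(summary: str, max_len: int = 4000) -> List[str]:
    parts: List[str] = []
    current = ""
    for line in summary.splitlines(keepends=True):
        if len(current) + len(line) > max_len:
            if current.strip():           # don't emit empty parts
                parts.append(current.strip())
            current = line[:max_len]      # a single over-long line is truncated, as in the task logic
        else:
            current += line
    if current.strip():
        parts.append(current.strip())
    return parts

# Usage: a long, multi-line summary comes back as Telegram-sized pieces.
# for part in _split_summary_sketch(final_summary):
#     await bot.send_message(chat_id=chat_id, text=part)
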
# --- Telegram Handlers ---
# (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler)
# No changes needed in these handlers based on logs.

async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    user = update.effective_user
    if not user or not update.message: return
    mention = user.mention_html()
    logger.info(f"User {user.id} ({user.username or 'N/A'}) used /start.")
    await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" )

async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    user = update.effective_user
    if not user or not update.message: return
    logger.info(f"User {user.id} ({user.username or 'N/A'}) used /help.")
    help_text = ( "🔍 **How to use:**\n\n"
                  "1. Send me any YouTube video link or website URL.\n"
                  "2. I'll ask how you want it summarised (paragraph or points).\n"
                  "3. Click the button for your choice.\n"
                  "4. Wait while I process it!\n\n"  # Slightly rephrased
                  "⚙️ **Behind the scenes:**\n"
                  f"• **Websites:** I try `Crawl4AI` (smart crawl), then `BeautifulSoup` (basic scrape), and `urltotext.com` API (if configured & credits available).\n"
                  "• **YouTube:** I use `youtube-transcript-api` first, then fall back to `Supadata` and `Apify` APIs if needed.\n"
                  # ...
    await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
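
# The help text above describes the website-content fallback chain (Crawl4AI, then
# BeautifulSoup, then the urltotext.com API). The real chain is implemented inline in
# process_summary_task; this is only an illustrative sketch, and the BeautifulSoup
# helper name used here is hypothetical.
async def _fetch_website_content_sketch(url: str) -> Optional[str]:
    content = await get_website_content_via_crawl4ai(url)                    # primary: smart crawl
    if not content:
        content = await get_website_content_via_bs4(url)                     # fallback 1 (hypothetical helper name)
    if not content and URLTOTEXT_API_KEY:
        content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)  # fallback 2: paid API, if configured
    return content
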
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    if not update.message or not update.message.text: return
    url = update.message.text.strip(); user = update.effective_user
    if not user: return
    # Basic URL validation
    url_pattern = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE)  # Slightly improved regex
    if not url_pattern.match(url):
        logger.debug(f"Ignoring non-URL from {user.id}: {url}")
        await update.message.reply_text("Hmm, that doesn't look like a valid web URL. Please make sure it starts with `http://` or `https://` and includes a domain.", parse_mode=ParseMode.MARKDOWN)
        return
    logger.info(f"User {user.id} ({user.username or 'N/A'}) sent potential URL: {url}")
    # Store URL and original message ID in user_data
    # ...
    await update.message.reply_html( f"Okay, I see this link:\n<code>{escaped_url}</code>\n\nHow would you like it summarised?", reply_markup=reply_markup, disable_web_page_preview=True )
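
# Illustrative check of how the URL pre-filter above behaves on sample inputs: the
# pattern requires an http(s) scheme followed by at least two more characters, the
# first of which may not be whitespace, '/', '$', '.', '?' or '#'.
_url_sketch_pattern = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE)
assert _url_sketch_pattern.match("https://example.com/article")   # accepted
assert _url_sketch_pattern.match("HTTP://EXAMPLE.COM")            # accepted (case-insensitive)
assert not _url_sketch_pattern.match("example.com")               # rejected: missing scheme
assert not _url_sketch_pattern.match("https://")                  # rejected: nothing after the scheme
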
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    query = update.callback_query
    if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
    user = query.from_user; summary_type = query.data; query_id = query.id
    try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id} ({user.username or 'N/A'})")
    except Exception as e: logger.warning(f"Error answering callback {query_id}: {e}")  # Log warning, don't stop

    url = context.user_data.get('url_to_summarize')
    message_id_to_edit = query.message.message_id  # The message with the buttons
    # ...
        except Exception as e: logger.error(f"Failed edit 'URL not found' msg: {e}")
        return  # Do not proceed further

    # Check necessary configurations before scheduling
    global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
    if not TELEGRAM_TOKEN:
        # ...
        try: await query.edit_message_text(text="❌ AI configuration error: No summarization models are available. Cannot proceed.", reply_markup=None)
        except Exception: pass
        return
    elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) is unavailable, will rely on fallback (OpenRouter).")
    elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) is unavailable, relying on primary (Gemini).")

    # Schedule the background task
    logger.info(f"Scheduling task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit} (URL: {url})")
    # asyncio.ensure_future schedules the coroutine on the running loop without awaiting it here
    asyncio.ensure_future(
        process_summary_task(
            user_id=user.id,
            chat_id=query.message.chat_id,
            # ...
            url=url,
            summary_type=summary_type,
            bot_token=TELEGRAM_TOKEN  # Pass token explicitly
        )
        # Consider adding .set_name(...) if using Python 3.8+ for better debugging
        # .set_name(f"SummaryTask-{user.id}-{message_id_to_edit}")
    )

    # Clear context AFTER scheduling the task
    context.user_data.pop('url_to_summarize', None)
    context.user_data.pop('original_message_id', None)
    logger.debug(f"Cleared URL context for user {user.id} after scheduling task.")

    # Do not edit the message here - let the task handle its own status updates by editing this message

async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    # ... (Keep existing implementation) ...
    ignore_errors = (AttributeError, BadRequest, TimedOut, NetworkError, RetryAfter)
    if isinstance(context.error, ignore_errors):
        ignore_messages = ["message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user"]
        err_str = str(context.error).lower()
        # ...
            logger.warning(f"Ignoring known/handled/transient error in error_handler: {context.error}")
            return
    logger.error("Exception while handling an update:", exc_info=context.error)

# --- Application Setup ---
# ...
    # ... (Keep existing implementation) ...
    logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
    if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
    custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 )
    application = ( Application.builder() .token(TELEGRAM_TOKEN) .request(custom_request) .build() )
    # Add Handlers
    application.add_handler(CommandHandler("start", start))
    application.add_handler(CommandHandler("help", help_command))
    url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
    application.add_handler(MessageHandler(url_filter, handle_potential_url))
    application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
    application.add_error_handler(error_handler)
    logger.info("Telegram application handlers configured."); return application

# --- ASGI Lifespan & Routes ---
# (lifespan, telegram_webhook remain the same)
@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
    global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN
    # ...
    if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.")
    try:
        ptb_app = await setup_bot_config()
        await ptb_app.initialize()  # This sets the _initialized flag
        bot_info = await ptb_app.bot.get_me()
        logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})")

        # Webhook setup logic (remains the same)
        current_webhook_info = await ptb_app.bot.get_webhook_info()
        webhook_delete_success = True  # Assume success unless proven otherwise
        if current_webhook_info and current_webhook_info.url:
            logger.info(f"Found existing webhook: {current_webhook_info.url}. Deleting...")
            try:
                if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted.")
                else: logger.warning("Failed delete webhook (API returned False)."); webhook_delete_success = False
            except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1); webhook_delete_success = False

        # Determine Webhook URL
        space_host = os.environ.get("SPACE_HOST")
        webhook_path = "/webhook"  # Standard path
        full_webhook_url = None
        if space_host:
            protocol = "https"
            host = space_host.split('://')[-1]
            full_webhook_url = f"{protocol}://{host.rstrip('/')}{webhook_path}"
            logger.info(f"Using SPACE_HOST to determine webhook URL: {full_webhook_url}")
        else:
            logger.critical("Could not construct webhook URL (SPACE_HOST env var missing or invalid).")
            raise RuntimeError("Webhook URL undetermined.")

        # Set Webhook
        if full_webhook_url and webhook_delete_success:
            logger.info(f"Attempting to set webhook: {full_webhook_url}")
            set_webhook_args = {
                "url": full_webhook_url,
                "allowed_updates": Update.ALL_TYPES,
                "drop_pending_updates": True
            }
            if WEBHOOK_SECRET:
                # ...
            await asyncio.sleep(1.0)  # Short delay before setting

            try:
                webhook_set = await ptb_app.bot.set_webhook(**set_webhook_args)
                if not webhook_set:
                    raise RuntimeError("set_webhook API call returned False.")
                await asyncio.sleep(1.5)  # Longer delay after setting before verification
                webhook_info = await ptb_app.bot.get_webhook_info()
                if webhook_info and webhook_info.url == full_webhook_url:
                    logger.info(f"Webhook set and verified successfully: URL='{webhook_info.url}', Secret={'YES' if WEBHOOK_SECRET else 'NO'}")
                    if webhook_info.last_error_message:
                        logger.warning(f"Webhook status has last error: {webhook_info.last_error_message}")
                else:
                    logger.error(f"Webhook verification failed! Expected '{full_webhook_url}', Got info: {webhook_info}")
                    raise RuntimeError("Webhook verification failed after setting.")

                await ptb_app.start()  # Start PTB processing updates AFTER successful webhook setup
                logger.info("PTB Application started (webhook mode).")
            except Exception as e:
                logger.error(f"FATAL: Failed to set or verify webhook: {e}", exc_info=True)
                raise RuntimeError(f"Failed to set/verify webhook: {e}") from e
        elif not webhook_delete_success:
            logger.error("FATAL: Failed to delete previous webhook. Cannot set new one.")
            raise RuntimeError("Failed to delete previous webhook.")
        # else: # Already handled by raising error if full_webhook_url is None

        logger.info("ASGI Lifespan: Startup complete.")
        yield  # Application runs here
    # ...
        if ptb_app:
            try:
                if ptb_app.running: await ptb_app.stop()
                # Check if initialized before shutting down
                if ptb_app._initialized: await ptb_app.shutdown()
            except Exception as shutdown_err:
                logger.error(f"Error during shutdown after startup failure: {shutdown_err}")
            raise  # Reraise the original startup error
    # ...
                if ptb_app.running:
                    logger.info("Stopping PTB application...")
                    await ptb_app.stop()
                # Check if initialized before shutting down
                if ptb_app._initialized:
                    logger.info("Shutting down PTB application...")
                    await ptb_app.shutdown()
                    logger.info("PTB Application shut down successfully.")
                else:
                    logger.info("PTB Application was not fully initialized, skipping shutdown.")
            except Exception as e:
                logger.error(f"Error during PTB shutdown: {e}", exc_info=True)
        else:
            logger.info("PTB application object not created.")
        logger.info("ASGI Lifespan: Shutdown complete.")

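# The construction of the ASGI application object itself is elided in this fragment.
# A minimal sketch of how the lifespan and the two route handlers below are typically
# wired together is shown here; the health route path and the helper name are
# assumptions, while "/webhook" matches webhook_path set in lifespan.
def _build_asgi_app_sketch() -> Starlette:
    from starlette.applications import Starlette
    from starlette.routing import Route
    return Starlette(
        routes=[
            Route("/", health_check, methods=["GET"]),               # health probe (path assumed)
            Route("/webhook", telegram_webhook, methods=["POST"]),   # matches webhook_path in lifespan
        ],
        lifespan=lifespan,
    )
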
async def health_check(request: Request) -> PlainTextResponse:
    global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled, _crawl4ai_primary_web_enabled, _urltotext_fallback_enabled, SUPADATA_API_KEY
    bot_status = "Not Initialized"
    bot_username = "N/A"
    # *** FIXED Attribute Name ***
    if ptb_app and ptb_app.bot and ptb_app._initialized:  # Check if initialized using _initialized
        try:
            # Quick check if webhook seems ok, more reliable than get_me() sometimes
            wh_info = await ptb_app.bot.get_webhook_info()
            # Check running status as well
            if ptb_app.running and wh_info and wh_info.url:
                bot_info = await ptb_app.bot.get_me()
                bot_username = f"@{bot_info.username}"
                bot_status = f"Running (Webhook OK, {bot_username})"
            elif ptb_app.running:
                bot_status = f"Running (Webhook Status: {wh_info.url if wh_info else 'N/A'}, Last Error: {wh_info.last_error_message if wh_info else 'N/A'})"
            else: bot_status = "Initialized/Not running"
        except Exception as e:
            logger.error(f"Error checking bot status in health check: {e}", exc_info=True)
            bot_status = f"Error checking status: {e}"
    elif ptb_app:
        bot_status = "Initializing..."  # Or Initialized but not running yet

    health_info = [
        f"=== Telegram Summary Bot Status ===",
        # ...

    if not ptb_app:
        logger.error("Webhook received but PTB application not initialized.")
        return PlainTextResponse('Bot not initialized', status_code=503)  # Service Unavailable
    if not ptb_app._initialized:  # Check _initialized flag
        logger.error("Webhook received but PTB application not running (not initialized).")
        return PlainTextResponse('Bot not initialized', status_code=503)
    if not ptb_app.running:
        logger.warning("Webhook received but PTB application not running.")
        return PlainTextResponse('Bot not running', status_code=503)  # Service Unavailable
    # ...
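
# Telegram echoes the secret registered via set_webhook's secret_token parameter in
# the X-Telegram-Bot-Api-Secret-Token header of every webhook request. The body of
# telegram_webhook is largely elided above; a minimal sketch of the check it would
# perform when WEBHOOK_SECRET is configured is shown here (helper name and the 403
# wording are illustrative).
async def _verify_webhook_secret_sketch(request: Request) -> Optional[PlainTextResponse]:
    if WEBHOOK_SECRET:
        received = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
        if received != WEBHOOK_SECRET:
            logger.warning("Webhook request rejected: secret token mismatch.")
            return PlainTextResponse("Forbidden", status_code=403)
    return None  # None means the request may proceed to update processing
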
    # Make sure necessary env vars are loaded for local dev if not set system-wide
    # Example using python-dotenv if you add it to requirements-dev.txt
    try:
        from dotenv import load_dotenv
        load_dotenv()
        logger.info("Loaded environment variables from .env file for local development.")
    except ImportError:
        logger.info("python-dotenv not installed, using system environment variables.")

    # Re-check required tokens after potential .env load
    if not get_secret('TELEGRAM_TOKEN'): logger.critical("Local Dev: TELEGRAM_TOKEN not found.")