fmab777 committed on
Commit 3e7da99 · verified · 1 Parent(s): d6e3c43

Update main.py

Files changed (1)
  1. main.py +250 -257
main.py CHANGED
@@ -53,6 +53,7 @@ except ImportError:
53
  # --- Google Gemini ---
54
  try:
55
  import google.generativeai as genai
 
56
  from google.generativeai.types import HarmCategory, HarmBlockThreshold, GenerateContentResponse
57
  _gemini_available = True
58
  except ImportError:
@@ -112,8 +113,8 @@ GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # Primary Summarizer
112
  # Models (User can still configure via env vars)
113
  OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model
114
  APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
115
- GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-1.5-flash-latest") # Use the 1.5 flash model directly
116
- # Using gemini-1.5-flash-latest is generally recommended over gemini-2.0-flash-001
117
 
118
  # --- Configuration Checks ---
119
  if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
@@ -159,7 +160,10 @@ if _gemini_primary_enabled:
159
 
160
  # --- Constants ---
161
  MAX_SUMMARY_CHUNK_SIZE = 4000 # Max characters per Telegram message (allow buffer)
162
- MAX_INPUT_TOKEN_APPROX = 1000000 # Gemini 1.5 Flash context window (approx chars) - adjust if needed
163
 
164
  # --- Retry Decorator ---
165
  # (Remains the same)
@@ -181,11 +185,16 @@ async def retry_bot_operation(func, *args, **kwargs):
181
  if any(err in str(e).lower() for err in ignore_errors):
182
  logger.warning(f"Ignoring non-critical BadRequest: {e}")
183
  return None
 
184
  logger.error(f"Potentially critical BadRequest: {e}")
185
  raise
186
  except TelegramError as e:
187
- logger.warning(f"TelegramError (will retry if applicable): {e}")
188
- raise
189
  except Exception as e:
190
  logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
191
  raise
@@ -205,7 +214,7 @@ def extract_youtube_id(url):
205
  # --- Content Fetching Functions ---
206
 
207
  # - YouTube Transcript Fetching (get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript) -
208
- # (These functions remain exactly the same as in your provided code)
209
  async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
210
  # ... (Keep existing implementation) ...
211
  if not video_id: logger.error("[Supadata] No video_id provided"); return None
@@ -295,20 +304,30 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
295
  transcript_text = None
296
  logger.info("[Primary YT] Attempting youtube-transcript-api...")
297
  try:
298
- transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
299
  if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
300
- if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text
301
  else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None
302
  except Exception as e:
303
  logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
304
- if isinstance(e, (NoTranscriptFound, TranscriptsDisabled)): logger.warning(f"[Primary YT] Known issue: {type(e).__name__}")
305
  transcript_text = None
306
 
307
  if transcript_text is None:
308
  logger.info("[Fallback YT 1] Trying Supadata API...")
309
  if SUPADATA_API_KEY:
310
  transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
311
- if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
312
  else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
313
  else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
314
 
@@ -316,7 +335,7 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
316
  logger.info("[Fallback YT 2] Trying Apify REST API (SyncItems)...")
317
  if APIFY_API_TOKEN:
318
  transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
319
- if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify SyncItems REST for {video_url}"); return transcript_text
320
  else: logger.warning(f"[Fallback YT 2] Apify SyncItems REST failed or no content for {video_url}.")
321
  else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
322
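The transcript path above cascades through three sources, the youtube-transcript-api library, then Supadata, then Apify, stopping at the first non-empty result. A generic sketch of that cascade, with a hypothetical helper name not taken from this commit:

    from typing import Awaitable, Callable, Optional

    async def first_non_empty(fetchers: list[Callable[[], Awaitable[Optional[str]]]]) -> Optional[str]:
        # Try each async fetcher in order and return the first non-empty transcript.
        for fetch in fetchers:
            try:
                text = await fetch()
            except Exception:
                text = None
            if text and text.strip():
                return text.strip()
        return None

    # Hypothetical usage, mirroring the fallback order used in get_youtube_transcript:
    # transcript = await first_non_empty([
    #     lambda: get_transcript_via_supadata(video_id, SUPADATA_API_KEY),
    #     lambda: get_transcript_via_apify(video_url, APIFY_API_TOKEN),
    # ])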
 
@@ -340,35 +359,21 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
340
 
341
  logger.info(f"[Crawl4AI Primary] Attempting to crawl URL: {url}")
342
  # Define a writable cache directory (use /tmp in container environments)
343
- # Create the directory path beforehand to avoid potential race conditions or permission issues within the library
344
- cache_dir_path = "/tmp/.crawl4ai" # CHANGED: Use /tmp
345
- try:
346
- os.makedirs(cache_dir_path, exist_ok=True)
347
- logger.info(f"[Crawl4AI Primary] Ensured cache directory exists: {cache_dir_path}")
348
- except OSError as e:
349
- logger.error(f"[Crawl4AI Primary] Failed to create cache directory {cache_dir_path}: {e}. Crawl may fail.")
350
- # Don't return here, let the crawler try anyway, it might handle it internally or use default
351
- except Exception as e:
352
- logger.error(f"[Crawl4AI Primary] Unexpected error creating cache directory {cache_dir_path}: {e}")
353
-
354
 
355
  try:
356
- # Use AsyncWebCrawler context manager with explicit cache_dir
357
- # NOTE: Pass cache_dir here if the library supports it via __init__ or a config object
358
- # Checking crawl4ai docs/source, AsyncWebCrawler doesn't directly take cache_dir in __init__.
359
- # It seems to rely on environment variables or default home resolution.
360
- # The PermissionError happens in RobotsParser -> get_home_folder -> os.makedirs.
361
- # WORKAROUND: We might need to adjust the environment or hope setting HOME=/app in Dockerfile is enough
362
- # *if* the library correctly uses HOME. Let's test *without* explicit cache_dir first,
363
- # relying on HOME=/app and the prior os.makedirs call. If it still fails, we need a different approach.
364
-
365
- # UPDATE: The traceback shows it uses utils.get_home_folder(). Let's stick with HOME=/app for now
366
- # and see if the permission error was transient or specific to the '.models' subdir.
367
- # If it persists, we might need to fork/modify crawl4ai or find another way to configure its paths.
368
-
369
- # Let's *try* passing cache_dir anyway, maybe it's an undocumented/newer feature
370
- async with AsyncWebCrawler(cache_dir=cache_dir_path) as crawler: # TRY passing cache_dir
371
- logger.info(f"[Crawl4AI Primary] Initialized with explicit cache_dir: {cache_dir_path}")
372
  # Use arun for a single URL crawl
373
  result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90) # 90 sec timeout
374
 
@@ -379,14 +384,20 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
379
  return content
380
  else:
381
  logger.warning(f"[Crawl4AI Primary] Crawl successful for {url}, but extracted markdown content is empty.")
382
- return None
383
- elif result and result.text: # Fallback to raw text if markdown is somehow empty
384
  content = result.text.strip()
385
  if content:
386
- logger.info(f"[Crawl4AI Primary] Success crawling {url} (using .text). Length: {len(content)}")
387
  return content
388
  else:
389
- logger.warning(f"[Crawl4AI Primary] Crawl successful for {url}, but extracted text content is empty.")
390
  return None
391
  else:
392
  logger.warning(f"[Crawl4AI Primary] Crawl failed or returned no result/content for {url}.")
@@ -395,14 +406,12 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
395
  logger.error(f"[Crawl4AI Primary] Timeout occurred while crawling {url}")
396
  return None
397
  except PermissionError as e: # Catch the specific error
398
- logger.error(f"[Crawl4AI Primary] Permission denied during crawl for {url}. Likely filesystem issue in container. Error: {e}", exc_info=True)
 
399
  return None # Fail gracefully for this method
400
  except Exception as e:
401
- # Log type error if cache_dir isn't accepted
402
- if "unexpected keyword argument 'cache_dir'" in str(e):
403
- logger.error(f"[Crawl4AI Primary] AsyncWebCrawler does not accept 'cache_dir'. Remove this argument. Error: {e}")
404
- else:
405
- logger.error(f"[Crawl4AI Primary] Unexpected error during crawl for {url}: {e}", exc_info=True)
406
  return None
407
 
408
 
@@ -445,11 +454,16 @@ async def get_website_content_bs4(url: str) -> Optional[str]:
445
  for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
446
  element.extract()
447
  # Try to find main content areas more broadly
448
- selectors = ['main', 'article', '[role="main"]', '#content', '.content', '#main-content', '.main-content', '#body', '.body', '#article-body', '.article-body']
449
  target_element = None
450
  for selector in selectors:
451
- target_element = soup.select_one(selector)
452
- if target_element: break
453
 
454
  if not target_element: target_element = soup.body # Fallback to body
455
  if not target_element: logger.warning(f"[BS4 Fallback] Could not find body/main for parsing {url}"); return None
@@ -557,7 +571,7 @@ Key Bullet Points Summary:"""
557
  generation_config = genai.types.GenerationConfig(
558
  # candidate_count=1, # Default is 1
559
  # stop_sequences=["\n"],
560
- max_output_tokens=2048, # Increased max tokens for potentially longer summaries from large inputs
561
  temperature=0.7, # Adjust creativity vs factualness
562
  # top_p=1.0, # Default
563
  # top_k=None # Default
@@ -569,44 +583,68 @@ Key Bullet Points Summary:"""
569
  response: GenerateContentResponse = await model.generate_content_async( # Use async version
570
  prompt,
571
  generation_config=generation_config,
572
- safety_settings=safety_settings
 
573
  )
574
- logger.debug(f"[Gemini Primary] Received response. Finish reason: {response.candidates[0].finish_reason if response.candidates else 'N/A'}")
575
 
576
  # Check for safety blocks or other issues in response
577
  if not response.candidates:
578
- block_reason = response.prompt_feedback.block_reason if hasattr(response, 'prompt_feedback') else 'Unknown'
 
 
579
  error_msg = f"Error: Gemini response blocked or empty. Reason: {block_reason}"
580
  logger.error(f"[Gemini Primary] {error_msg}")
581
  return None, error_msg
582
 
583
- # Check finish reason (e.g., MAX_TOKENS, SAFETY)
584
- finish_reason = response.candidates[0].finish_reason
585
- if finish_reason != genai.types.FinishReason.STOP and finish_reason != genai.types.FinishReason.MAX_TOKENS:
586
  # Log safety ratings if available
587
  safety_ratings_str = "N/A"
588
- if hasattr(response.candidates[0], 'safety_ratings'):
589
  safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in response.candidates[0].safety_ratings])
590
- error_msg = f"Error: Gemini generation finished unexpectedly. Reason: {finish_reason.name}. Safety: {safety_ratings_str}"
 
591
  logger.error(f"[Gemini Primary] {error_msg}")
592
- # Return partial text if available and finish reason is MAX_TOKENS? Maybe not, could be truncated badly.
593
- # If SAFETY, definitely return error.
594
- if finish_reason == genai.types.FinishReason.SAFETY:
595
- return None, error_msg # Return specific error for safety blocks
596
- # For other reasons, maybe return partial, but safer to return error for now
597
- # return response.text if hasattr(response, 'text') else None, error_msg # Optional: return partial text for RECITATION/OTHER
598
- return None, f"Error: Gemini generation finished unexpectedly ({finish_reason.name})."
599
600
 
601
- # Extract text
602
- summary_text = response.text
603
  if not summary_text or not summary_text.strip():
604
  logger.warning("[Gemini Primary] Gemini returned an empty summary.")
605
  return None, "Error: AI generated an empty summary."
606
 
607
- logger.info(f"[Gemini Primary] Summary generated successfully (len: {len(summary_text)}).")
608
  return summary_text.strip(), None
609
610
  except Exception as e:
611
  logger.error(f"[Gemini Primary] Error during API call to {GEMINI_MODEL}: {e}", exc_info=True)
612
  # Check for specific Google API errors if needed
@@ -617,14 +655,13 @@ Key Bullet Points Summary:"""
617
 
618
  async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
619
  """ Calls the OpenRouter API to generate a summary. """
 
620
  global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
621
  if not _openrouter_fallback_enabled:
622
  logger.error("[OpenRouter Fallback] Called but is disabled.");
623
  return None, "Error: Fallback AI service (OpenRouter) not configured/available."
624
 
625
- # OpenRouter models might have smaller context windows, truncate more aggressively if needed
626
- # Example: 32k tokens ~ 120k chars. Deepseek is large though. Check model specifics if issues arise.
627
- max_input_len_openrouter = 100000 # Adjust based on OPENROUTER_MODEL limits if known
628
  if len(text) > max_input_len_openrouter:
629
  logger.warning(f"[OpenRouter Fallback] Input text length ({len(text)}) exceeds approx limit ({max_input_len_openrouter}) for {OPENROUTER_MODEL}. Truncating.")
630
  text = text[:max_input_len_openrouter]
@@ -682,11 +719,13 @@ Key Bullet Points Summary:"""
682
  try:
683
  data = response.json()
684
  if data.get("choices") and len(data["choices"]) > 0:
685
- message = data["choices"][0].get("message")
686
  if message and message.get("content"):
687
  summary_text = message["content"].strip()
688
  if summary_text:
689
- finish_reason = data["choices"][0].get("finish_reason", "N/A")
690
  logger.info(f"[OpenRouter Fallback] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason}")
691
  # Check for length finish reason?
692
  if finish_reason == 'length':
@@ -735,6 +774,7 @@ Key Bullet Points Summary:"""
735
 
736
  async def generate_summary(text: str, summary_type: str) -> str:
737
  """ Generates a summary using the primary AI (Gemini) and falling back to OpenRouter. """
 
738
  global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
739
  logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")
740
  final_summary: Optional[str] = None
@@ -791,6 +831,7 @@ async def process_summary_task(
791
  bot_token: str
792
  ) -> None:
793
  """Handles fetching content, generating summary, and sending results."""
 
794
  task_id = f"{user_id}-{message_id_to_edit or 'new'}"
795
  logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
796
  background_request: Optional[BaseRequest] = None
@@ -798,8 +839,8 @@ async def process_summary_task(
798
  content: Optional[str] = None
799
  user_feedback_message: Optional[str] = None
800
  success = False
801
- status_message_id = message_id_to_edit
802
- message_to_delete_later_id: Optional[int] = None
803
 
804
  try:
805
  # Initialize background bot
@@ -807,12 +848,15 @@ async def process_summary_task(
807
  bot = Bot(token=bot_token, request=background_request)
808
  except Exception as e:
809
  logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True)
 
810
  return
811
 
812
  try:
813
  # Send/Edit "Processing..." message
 
814
  processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n{html.escape(url)}\n\nThis might take a moment..."
815
- if status_message_id:
 
816
  try:
817
  await retry_bot_operation(
818
  bot.edit_message_text,
@@ -820,32 +864,15 @@ async def process_summary_task(
820
  message_id=status_message_id,
821
  text=processing_message_text,
822
  parse_mode=ParseMode.HTML, # Use HTML for escaped URL
823
- reply_markup=None,
824
  link_preview_options={'is_disabled': True} # Disable preview here too
825
  )
826
- logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
827
- except Exception as e:
828
- logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new.")
829
- status_message_id = None # Flag to send a new message
830
-
831
- if not status_message_id: # If editing failed or wasn't applicable
832
- try:
833
- status_message = await retry_bot_operation(
834
- bot.send_message,
835
- chat_id=chat_id,
836
- text=processing_message_text,
837
- parse_mode=ParseMode.HTML, # Use HTML for escaped URL
838
- link_preview_options={'is_disabled': True}
839
- )
840
- if status_message:
841
- message_to_delete_later_id = status_message.message_id
842
- logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}")
843
- else:
844
- raise RuntimeError("Failed to send status message after retries.")
845
  except Exception as e:
846
- logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}")
847
- user_feedback_message = "Sorry, there was an issue starting the process."
848
- # Attempt to send final feedback later if possible
849
 
850
  # Indicate activity
851
  try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
@@ -870,6 +897,7 @@ async def process_summary_task(
870
  content = await get_website_content_via_crawl4ai(url)
871
 
872
  if not content:
 
873
  logger.warning(f"[Task {task_id}] Crawl4AI failed for {url}. Attempting BeautifulSoup (Fallback 1)...")
874
  try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
875
  except Exception: pass
@@ -883,8 +911,6 @@ async def process_summary_task(
883
  except Exception: pass
884
  content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
885
  if not content:
886
- # Check if the specific error was insufficient credits
887
- # Note: get_website_content_via_api already logs the specific error
888
  logger.error(f"[Task {task_id}] API fallback (urltotext) also failed for {url}.")
889
  user_feedback_message = "Sorry, I couldn't fetch content from that website using any method (Crawl4AI/BS4 failed, API failed or ran out of credits)." # Updated message
890
  else:
@@ -908,68 +934,50 @@ async def process_summary_task(
908
  if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
909
  user_feedback_message = final_summary # Pass AI error message to user
910
  logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
 
911
  else:
912
- # Success - Send the summary
913
  summary_parts = []
914
  current_part = ""
915
  # Split respecting newlines, ensure no part exceeds MAX_SUMMARY_CHUNK_SIZE
916
  lines = final_summary.splitlines(keepends=True)
917
  for line in lines:
918
- # If adding the next line exceeds the limit, finalize the current part
919
  if len(current_part) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
920
  if current_part.strip(): # Don't add empty parts
921
  summary_parts.append(current_part.strip())
922
- current_part = line # Start new part with the current line
923
- # If a single line itself is too long, truncate it (edge case)
924
  if len(current_part) > MAX_SUMMARY_CHUNK_SIZE:
925
  logger.warning(f"[Task {task_id}] Truncating overly long line in summary.")
926
  current_part = current_part[:MAX_SUMMARY_CHUNK_SIZE]
927
  else:
928
  current_part += line
929
-
930
- # Add the last part if it has content
931
- if current_part.strip():
932
- summary_parts.append(current_part.strip())
933
-
934
- # If somehow splitting resulted in nothing (e.g., empty summary initially?)
935
  if not summary_parts:
936
  summary_parts.append("Summary generated, but it appears to be empty.")
937
  logger.warning(f"[Task {task_id}] Summary was non-empty initially but splitting resulted in zero parts.")
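The splitting logic above keeps every Telegram message under MAX_SUMMARY_CHUNK_SIZE while breaking on line boundaries and hard-truncating a single over-long line. The same idea as a self-contained sketch (illustrative, not the committed code):

    def split_for_telegram(text: str, limit: int = 4000) -> list[str]:
        # Break on line boundaries so no part exceeds `limit` characters.
        parts: list[str] = []
        current = ""
        for line in text.splitlines(keepends=True):
            if len(current) + len(line) > limit:
                if current.strip():
                    parts.append(current.strip())
                # Start a new part; truncate a single line that is itself too long.
                current = line[:limit] if len(line) > limit else line
            else:
                current += line
        if current.strip():
            parts.append(current.strip())
        return parts or ["Summary generated, but it appears to be empty."]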
938
 
939
-
940
  logger.info(f"[Task {task_id}] Summary generated (orig len: {len(final_summary)}). Sending in {len(summary_parts)} part(s).")
941
 
942
- # Determine the target message ID for the *first* part
943
- # Prefer editing the "Processing..." message if we sent a new one
944
- edit_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
945
- message_sent = False
946
-
947
- if edit_target_id:
948
  try:
949
- # Try editing the status message first
950
  await retry_bot_operation(
951
  bot.edit_message_text,
952
  chat_id=chat_id,
953
- message_id=edit_target_id,
954
  text=summary_parts[0],
955
  parse_mode=None, # Send as plain text initially, safer
956
  link_preview_options={'is_disabled': True}
957
  )
958
- logger.debug(f"[Task {task_id}] Edited message {edit_target_id} with first summary part.")
959
- # Prevent this message from being deleted later if it was the 'Processing...' one
960
- if message_to_delete_later_id == edit_target_id: message_to_delete_later_id = None
961
- # If it was the *original* button message that we are editing, keep status_message_id
962
- # so we know *not* to delete it in finally block if it's the only message left.
963
- # However, it's clearer to just prevent deletion if edited.
964
- if status_message_id == edit_target_id: status_message_id = None # Mark as handled
965
-
966
- message_sent = True
967
  except Exception as edit_err:
968
- logger.warning(f"[Task {task_id}] Failed to edit message {edit_target_id} with summary: {edit_err}. Sending new message instead.")
969
- # If edit fails, fall through to send a new message
970
 
971
- if not message_sent:
972
- # Send the first part as a new message
973
  sent_msg = await retry_bot_operation(
974
  bot.send_message,
975
  chat_id=chat_id,
@@ -977,15 +985,13 @@ async def process_summary_task(
977
  parse_mode=None,
978
  link_preview_options={'is_disabled': True}
979
  )
980
- if sent_msg:
981
- logger.debug(f"[Task {task_id}] Sent first summary part as new message {sent_msg.message_id}.")
982
- else: # Should be caught by retry, but log defensively
983
  logger.error(f"[Task {task_id}] Failed to send first summary part even as new message.")
984
- user_feedback_message = "Sorry, failed to send the summary." # Set error
985
-
986
 
987
  # Send remaining parts (if any and first part succeeded)
988
- if not user_feedback_message and len(summary_parts) > 1:
989
  for i, part in enumerate(summary_parts[1:], start=2):
990
  await asyncio.sleep(0.5) # Small delay between parts
991
  try:
@@ -1000,40 +1006,40 @@ async def process_summary_task(
1000
  except Exception as part_err:
1001
  logger.error(f"[Task {task_id}] Failed to send summary part {i}: {part_err}")
1002
  user_feedback_message = f"Sorry, failed to send part {i} of the summary."
1003
- # Should we stop sending further parts? Yes.
1004
  break # Stop sending remaining parts
1005
 
1006
- # Determine overall success based on whether feedback message is set
1007
  if not user_feedback_message:
1008
  success = True
1009
- # user_feedback_message = None # Clear feedback message ONLY on full success
1010
 
1011
- # --- Handle Cases Where No Content Was Fetched or Summary Failed ---
1012
- if user_feedback_message: # Check if any error occurred
 
  logger.warning(f"[Task {task_id}] Sending failure/error feedback to user: {user_feedback_message}")
1014
  try:
1015
- # Try editing the status message first
1016
- feedback_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
1017
- message_sent = False
1018
- if feedback_target_id:
1019
  try:
1020
  await retry_bot_operation(
1021
  bot.edit_message_text,
1022
  chat_id=chat_id,
1023
- message_id=feedback_target_id,
1024
  text=user_feedback_message,
1025
  link_preview_options={'is_disabled': True},
1026
  reply_markup=None # Remove buttons
1027
  )
1028
- logger.debug(f"[Task {task_id}] Edited message {feedback_target_id} with failure feedback.")
1029
- # Prevent deletion if edited
1030
- if message_to_delete_later_id == feedback_target_id: message_to_delete_later_id = None
1031
- if status_message_id == feedback_target_id: status_message_id = None
1032
- message_sent = True
1033
  except Exception as edit_err:
1034
- logger.warning(f"[Task {task_id}] Failed to edit message {feedback_target_id} with failure feedback: {edit_err}. Sending new message instead.")
1035
 
1036
- if not message_sent:
 
1037
  await retry_bot_operation(
1038
  bot.send_message,
1039
  chat_id=chat_id,
@@ -1046,36 +1052,28 @@ async def process_summary_task(
1046
 
1047
  except Exception as e:
1048
  # Catch-all for unexpected errors during the main processing logic
1049
- logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True)
 
1050
  user_feedback_message = "Oops! Something went wrong while processing your request. Please try again later."
1051
  if bot: # Ensure bot exists before trying to send
1052
  try:
1053
- # Attempt to send a final error message
1054
- await retry_bot_operation(
1055
- bot.send_message,
1056
- chat_id=chat_id,
1057
- text=user_feedback_message
1058
- )
1059
  except Exception as final_err:
1060
  logger.error(f"[Task {task_id}] Failed to send the final unexpected error feedback: {final_err}")
1061
 
1062
  finally:
1063
  # --- Cleanup ---
1064
- # Delete the temporary "Processing..." message if it exists and wasn't edited/handled
1065
- if message_to_delete_later_id and bot:
1066
- try:
1067
- await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=message_to_delete_later_id)
1068
- logger.debug(f"[Task {task_id}] Deleted temporary status message {message_to_delete_later_id}")
1069
- except Exception as del_e:
1070
- logger.warning(f"[Task {task_id}] Failed to delete temporary status message {message_to_delete_later_id}: {del_e}")
1071
-
1072
- # Explicitly DO NOT delete the original message with buttons (status_message_id)
1073
- # if it was successfully edited with the final result or error message.
1074
- # The logic above sets status_message_id = None if it was edited.
1075
- # If status_message_id still holds the ID here, it means editing failed and we sent a *new* message.
1076
- # In that failure case, maybe we *should* delete the original button message? Or leave it?
1077
- # Let's leave it for now to avoid deleting user context if things went very wrong.
1078
- # Deleting message_to_delete_later_id covers the main cleanup case.
1079
 
1080
  # Close the background bot's HTTP client
1081
  if background_request and hasattr(background_request, '_client') and background_request._client:
@@ -1090,26 +1088,23 @@ async def process_summary_task(
1090
 
1091
  # --- Telegram Handlers ---
1092
  # (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler)
1093
- # These remain largely the same.
1094
 
1095
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1096
- # ... (Keep existing implementation) ...
1097
  user = update.effective_user; mention = user.mention_html()
1098
  if not user or not update.message: return
1099
  logger.info(f"User {user.id} ({user.username or 'N/A'}) used /start.")
1100
  await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" )
1101
 
1102
  async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1103
- # ... (Keep existing implementation) ...
1104
  user = update.effective_user
1105
  if not user or not update.message: return
1106
  logger.info(f"User {user.id} ({user.username or 'N/A'}) used /help.")
1107
- # Updated help text slightly
1108
  help_text = ( "🔍 **How to use:**\n\n"
1109
  "1. Send me any YouTube video link or website URL.\n"
1110
  "2. I'll ask how you want it summarised (paragraph or points).\n"
1111
  "3. Click the button for your choice.\n"
1112
- "4. Wait for the summary!\n\n"
1113
  "⚙️ **Behind the scenes:**\n"
1114
  f"• **Websites:** I try `Crawl4AI` (smart crawl), then `BeautifulSoup` (basic scrape), and `urltotext.com` API (if configured & credits available).\n"
1115
  "• **YouTube:** I use `youtube-transcript-api` first, then fall back to `Supadata` and `Apify` APIs if needed.\n"
@@ -1120,15 +1115,14 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1120
  await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
1121
 
1122
  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1123
- # ... (Keep existing implementation) ...
1124
  if not update.message or not update.message.text: return
1125
  url = update.message.text.strip(); user = update.effective_user
1126
  if not user: return
1127
  # Basic URL validation
1128
- if not (url.startswith('http://') or url.startswith('https://')) or '.' not in url[8:]:
 
1129
  logger.debug(f"Ignoring non-URL from {user.id}: {url}")
1130
- # Optionally reply to the user that it doesn't look like a valid URL
1131
- await update.message.reply_text("Hmm, that doesn't look like a valid web URL. Please make sure it starts with `http://` or `https://`.", parse_mode=ParseMode.MARKDOWN)
1132
  return
1133
  logger.info(f"User {user.id} ({user.username or 'N/A'}) sent potential URL: {url}")
1134
  # Store URL and original message ID in user_data
@@ -1140,12 +1134,11 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1140
  await update.message.reply_html( f"Okay, I see this link:\n<code>{escaped_url}</code>\n\nHow would you like it summarised?", reply_markup=reply_markup, disable_web_page_preview=True )
1141
 
1142
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1143
- # ... (Keep existing implementation, check AI config) ...
1144
  query = update.callback_query
1145
  if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
1146
  user = query.from_user; summary_type = query.data; query_id = query.id
1147
  try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id} ({user.username or 'N/A'})")
1148
- except Exception as e: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) # Log but continue
1149
 
1150
  url = context.user_data.get('url_to_summarize')
1151
  message_id_to_edit = query.message.message_id # The message with the buttons
@@ -1161,10 +1154,6 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1161
  except Exception as e: logger.error(f"Failed edit 'URL not found' msg: {e}")
1162
  return # Do not proceed further
1163
 
1164
- # Clear context *only after* successfully scheduling the task below
1165
- # context.user_data.pop('url_to_summarize', None) # Moved clearing
1166
- # context.user_data.pop('original_message_id', None) # Moved clearing
1167
-
1168
  # Check necessary configurations before scheduling
1169
  global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
1170
  if not TELEGRAM_TOKEN:
@@ -1177,13 +1166,13 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1177
  try: await query.edit_message_text(text="❌ AI configuration error: No summarization models are available. Cannot proceed.", reply_markup=None)
1178
  except Exception: pass
1179
  return
1180
- # Log warnings if one model is missing, but proceed if at least one is available
1181
  elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) is unavailable, will rely on fallback (OpenRouter).")
1182
  elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) is unavailable, relying on primary (Gemini).")
1183
 
1184
  # Schedule the background task
1185
  logger.info(f"Scheduling task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit} (URL: {url})")
1186
- asyncio.create_task(
 
1187
  process_summary_task(
1188
  user_id=user.id,
1189
  chat_id=query.message.chat_id,
@@ -1191,28 +1180,21 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
1191
  url=url,
1192
  summary_type=summary_type,
1193
  bot_token=TELEGRAM_TOKEN # Pass token explicitly
1194
- ),
1195
- name=f"SummaryTask-{user.id}-{message_id_to_edit}"
 
1196
  )
1197
 
1198
- # Clear context AFTER scheduling the task to prevent race conditions if user clicks fast
1199
  context.user_data.pop('url_to_summarize', None)
1200
  context.user_data.pop('original_message_id', None)
1201
  logger.debug(f"Cleared URL context for user {user.id} after scheduling task.")
1202
 
1203
- # Optionally edit the button message *immediately* to give feedback before the task edits it again
1204
- # This prevents the user clicking again while the task starts up.
1205
- # try:
1206
- # await query.edit_message_text(text=f"Okay, starting '{summary_type}' summary...", reply_markup=None)
1207
- # except Exception as e:
1208
- # logger.warning(f"Could not edit button message immediately after scheduling: {e}")
1209
- # This initial edit will be quickly overwritten by the task's "Processing..." message.
1210
-
1211
 
1212
  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
1213
  # ... (Keep existing implementation) ...
1214
- # Consider adding specific TelegramError types if needed
1215
- ignore_errors = (AttributeError, BadRequest, TimedOut, NetworkError, RetryAfter) # Add common transient errors
1216
  if isinstance(context.error, ignore_errors):
1217
  ignore_messages = ["message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user"]
1218
  err_str = str(context.error).lower()
@@ -1220,10 +1202,6 @@ async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
1220
  logger.warning(f"Ignoring known/handled/transient error in error_handler: {context.error}")
1221
  return
1222
  logger.error("Exception while handling an update:", exc_info=context.error)
1223
- # Consider notifying the user about unexpected errors if appropriate and possible
1224
- # if isinstance(update, Update) and update.effective_chat:
1225
- # try: await context.bot.send_message(chat_id=update.effective_chat.id, text="An unexpected error occurred.")
1226
- # except Exception: pass
1227
 
1228
 
1229
  # --- Application Setup ---
@@ -1231,22 +1209,19 @@ async def setup_bot_config() -> Application:
1231
  # ... (Keep existing implementation) ...
1232
  logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
1233
  if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
1234
- # Configure HTTPX client with slightly longer timeouts for PTB internal requests if needed
1235
  custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 )
1236
  application = ( Application.builder() .token(TELEGRAM_TOKEN) .request(custom_request) .build() )
1237
  # Add Handlers
1238
  application.add_handler(CommandHandler("start", start))
1239
  application.add_handler(CommandHandler("help", help_command))
1240
- # Use a slightly broader filter to catch URLs even without explicit entity type from Telegram
1241
  url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
1242
  application.add_handler(MessageHandler(url_filter, handle_potential_url))
1243
  application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
1244
- # Error Handler
1245
  application.add_error_handler(error_handler)
1246
  logger.info("Telegram application handlers configured."); return application
1247
 
1248
  # --- ASGI Lifespan & Routes ---
1249
- # (lifespan, health_check, telegram_webhook remain the same)
1250
  @contextlib.asynccontextmanager
1251
  async def lifespan(app: Starlette):
1252
  global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN
@@ -1254,33 +1229,39 @@ async def lifespan(app: Starlette):
1254
  if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.")
1255
  try:
1256
  ptb_app = await setup_bot_config()
1257
- await ptb_app.initialize()
1258
  bot_info = await ptb_app.bot.get_me()
1259
  logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})")
1260
 
1261
  # Webhook setup logic (remains the same)
1262
  current_webhook_info = await ptb_app.bot.get_webhook_info()
 
1263
  if current_webhook_info and current_webhook_info.url:
1264
  logger.info(f"Found existing webhook: {current_webhook_info.url}. Deleting...")
1265
  try:
1266
  if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted.")
1267
- else: logger.warning("Failed delete webhook (API returned False).")
1268
- except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1)
1269
 
 
1270
  space_host = os.environ.get("SPACE_HOST")
1271
  webhook_path = "/webhook" # Standard path
1272
  full_webhook_url = None
1273
  if space_host:
1274
- # Ensure correct protocol and clean up host string
1275
- protocol = "https"
1276
- host = space_host.split('://')[-1] # Get host part if protocol included
1277
- full_webhook_url = f"{protocol}://{host.rstrip('/')}{webhook_path}"
1278
 
1279
- if full_webhook_url:
 
1280
  logger.info(f"Attempting to set webhook: {full_webhook_url}")
1281
  set_webhook_args = {
1282
  "url": full_webhook_url,
1283
- "allowed_updates": Update.ALL_TYPES, # Or specify needed types
1284
  "drop_pending_updates": True
1285
  }
1286
  if WEBHOOK_SECRET:
@@ -1290,30 +1271,28 @@ async def lifespan(app: Starlette):
1290
  await asyncio.sleep(1.0) # Short delay before setting
1291
 
1292
  try:
1293
- await ptb_app.bot.set_webhook(**set_webhook_args)
1294
- await asyncio.sleep(1.0) # Short delay after setting
 
 
1295
  webhook_info = await ptb_app.bot.get_webhook_info()
1296
  if webhook_info and webhook_info.url == full_webhook_url:
1297
- logger.info(f"Webhook set successfully: URL='{webhook_info.url}', Secret={'YES' if WEBHOOK_SECRET else 'NO'}")
1298
- # Log other info if useful: e.g., webhook_info.last_error_message
1299
  if webhook_info.last_error_message:
1300
  logger.warning(f"Webhook status has last error: {webhook_info.last_error_message}")
1301
  else:
1302
- logger.error(f"Webhook URL mismatch or failed to set! Expected '{full_webhook_url}', Got '{webhook_info.url if webhook_info else 'N/A'}'")
1303
- raise RuntimeError("Webhook URL mismatch or failed confirmation.")
1304
 
1305
- await ptb_app.start() # Start PTB processing updates
1306
  logger.info("PTB Application started (webhook mode).")
1307
  except Exception as e:
1308
- logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True)
1309
- raise RuntimeError(f"Failed to set webhook: {e}") from e
1310
- else:
1311
- # Attempt to get URL from request headers if available (might work in some environments)
1312
- # This is less reliable than SPACE_HOST
1313
- logger.warning("SPACE_HOST environment variable not found. Webhook URL cannot be determined reliably for setup.")
1314
- # You might decide to raise an error or try to run in polling mode if webhook fails
1315
- raise RuntimeError("Webhook URL undetermined (SPACE_HOST missing).")
1316
-
1317
 
1318
  logger.info("ASGI Lifespan: Startup complete.");
1319
  yield # Application runs here
@@ -1324,7 +1303,8 @@ async def lifespan(app: Starlette):
1324
  if ptb_app:
1325
  try:
1326
  if ptb_app.running: await ptb_app.stop()
1327
- await ptb_app.shutdown()
 
1328
  except Exception as shutdown_err:
1329
  logger.error(f"Error during shutdown after startup failure: {shutdown_err}")
1330
  raise # Reraise the original startup error
@@ -1337,36 +1317,42 @@ async def lifespan(app: Starlette):
1337
  if ptb_app.running:
1338
  logger.info("Stopping PTB application...")
1339
  await ptb_app.stop()
1340
- logger.info("Shutting down PTB application...")
1341
- await ptb_app.shutdown()
1342
- logger.info("PTB Application shut down successfully.")
1343
  except Exception as e:
1344
  logger.error(f"Error during PTB shutdown: {e}", exc_info=True)
1345
  else:
1346
- logger.info("PTB application was not initialized or startup failed.")
1347
  logger.info("ASGI Lifespan: Shutdown complete.")
1348
 
1349
 
1350
  async def health_check(request: Request) -> PlainTextResponse:
1351
- # ... (Keep existing implementation, updated with model names) ...
1352
  global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled, _crawl4ai_primary_web_enabled, _urltotext_fallback_enabled, SUPADATA_API_KEY
1353
  bot_status = "Not Initialized"
1354
  bot_username = "N/A"
1355
- if ptb_app and ptb_app.bot and ptb_app.initialized: # Check if initialized
 
1356
  try:
1357
  # Quick check if webhook seems ok, more reliable than get_me() sometimes
1358
  wh_info = await ptb_app.bot.get_webhook_info()
 
1359
  if ptb_app.running and wh_info and wh_info.url:
1360
  bot_info = await ptb_app.bot.get_me()
1361
  bot_username = f"@{bot_info.username}"
1362
  bot_status = f"Running (Webhook OK, {bot_username})"
1363
  elif ptb_app.running:
1364
- bot_status = "Running (Webhook check failed or not set)"
1365
  else: bot_status = "Initialized/Not running"
1366
- except Exception as e: bot_status = f"Error checking status: {e}"
 
 
1367
  elif ptb_app:
1368
- bot_status = "Initializing..."
1369
-
1370
 
1371
  health_info = [
1372
  f"=== Telegram Summary Bot Status ===",
@@ -1390,6 +1376,9 @@ async def telegram_webhook(request: Request) -> Response:
1390
  if not ptb_app:
1391
  logger.error("Webhook received but PTB application not initialized.")
1392
  return PlainTextResponse('Bot not initialized', status_code=503) # Service Unavailable
1393
  if not ptb_app.running:
1394
  logger.warning("Webhook received but PTB application not running.")
1395
  return PlainTextResponse('Bot not running', status_code=503) # Service Unavailable
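The body of telegram_webhook is not part of this hunk. For orientation, a handler of this shape for python-telegram-bot v20 behind Starlette typically verifies the secret header set via set_webhook(secret_token=...) and hands the update to the application queue; the sketch below reuses the globals defined above but is illustrative, not the committed implementation:

    from starlette.responses import Response

    async def telegram_webhook_sketch(request: Request) -> Response:
        # Reject calls that do not carry the secret configured with set_webhook(secret_token=...).
        if WEBHOOK_SECRET:
            header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
            if header != WEBHOOK_SECRET:
                return PlainTextResponse("Forbidden", status_code=403)
        update = Update.de_json(await request.json(), ptb_app.bot)
        await ptb_app.update_queue.put(update)  # Hand the update to PTB for processing
        return Response(status_code=200)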
@@ -1443,9 +1432,13 @@ if __name__ == '__main__':
1443
 
1444
  # Make sure necessary env vars are loaded for local dev if not set system-wide
1445
  # Example using python-dotenv if you add it to requirements-dev.txt
1446
- # from dotenv import load_dotenv
1447
- # load_dotenv()
1448
- # logger.info("Loaded environment variables from .env file for local development.")
1449
 
1450
  # Re-check required tokens after potential .env load
1451
  if not get_secret('TELEGRAM_TOKEN'): logger.critical("Local Dev: TELEGRAM_TOKEN not found.")
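For local development, the commented-out python-dotenv lines removed above would, if re-enabled, look roughly like this (requires python-dotenv, e.g. via requirements-dev.txt):

    from dotenv import load_dotenv

    load_dotenv()  # Reads a local .env file into os.environ
    logger.info("Loaded environment variables from .env file for local development.")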
 
53
  # --- Google Gemini ---
54
  try:
55
  import google.generativeai as genai
56
+ # Import specific types needed, check library for exact names if errors occur
57
  from google.generativeai.types import HarmCategory, HarmBlockThreshold, GenerateContentResponse
58
  _gemini_available = True
59
  except ImportError:
 
113
  # Models (User can still configure via env vars)
114
  OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model
115
  APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
116
+ # *** Reverted Model Name as requested ***
117
+ GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001") # Primary Model
118
 
119
  # --- Configuration Checks ---
120
  if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
 
160
 
161
  # --- Constants ---
162
  MAX_SUMMARY_CHUNK_SIZE = 4000 # Max characters per Telegram message (allow buffer)
163
+ # Adjust based on gemini-2.0-flash context window if known, 1M was for 1.5 Flash
164
+ # Let's use a more conservative estimate for 2.0 Flash, e.g., 128k tokens ~ 500k chars? Check Gemini docs.
165
+ # Sticking with a large number for now, but be aware this might need adjustment.
166
+ MAX_INPUT_TOKEN_APPROX = 500000
167
 
168
  # --- Retry Decorator ---
169
  # (Remains the same)
 
185
  if any(err in str(e).lower() for err in ignore_errors):
186
  logger.warning(f"Ignoring non-critical BadRequest: {e}")
187
  return None
188
+ # Only raise if it's not an ignored error
189
  logger.error(f"Potentially critical BadRequest: {e}")
190
  raise
191
  except TelegramError as e:
192
+ # Log specific transient errors that might trigger retry
193
+ if isinstance(e, (TimedOut, NetworkError, RetryAfter)):
194
+ logger.warning(f"Telegram transient error (will retry): {e}")
195
+ else:
196
+ logger.error(f"Unhandled TelegramError: {e}") # Log other Telegram errors
197
+ raise # Reraise to allow tenacity to handle retry
198
  except Exception as e:
199
  logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
200
  raise
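The bare raise above hands transient failures back to an outer retry policy. The decorator itself is not shown in this hunk; assuming tenacity, which the comments reference, a policy limited to the transient error types listed above could look like this sketch (the helper name is illustrative):

    from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
    from telegram.error import NetworkError, RetryAfter, TimedOut

    @retry(
        stop=stop_after_attempt(4),
        wait=wait_exponential(multiplier=0.5, max=10),
        retry=retry_if_exception_type((TimedOut, NetworkError, RetryAfter)),
        reraise=True,
    )
    async def _send_with_retry(bot, chat_id: int, text: str):
        # Only the transient errors named in `retry=` trigger another attempt;
        # anything else, such as a critical BadRequest, propagates immediately.
        return await bot.send_message(chat_id=chat_id, text=text)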
 
214
  # --- Content Fetching Functions ---
215
 
216
  # - YouTube Transcript Fetching (get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript) -
217
+ # (No changes needed in these functions)
218
  async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
219
  # ... (Keep existing implementation) ...
220
  if not video_id: logger.error("[Supadata] No video_id provided"); return None
 
304
  transcript_text = None
305
  logger.info("[Primary YT] Attempting youtube-transcript-api...")
306
  try:
307
+ # Run the blocking call in a separate thread
308
+ transcript_list = await asyncio.to_thread(
309
+ YouTubeTranscriptApi.list_transcripts(video_id).find_generated_transcript(['en', 'en-GB', 'en-US']).fetch
310
+ )
311
+ # transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] ) # Old way
312
  if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
313
+ if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text.strip()
314
  else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None
315
+ except TranscriptsDisabled:
316
+ logger.warning(f"[Primary YT] Transcripts are disabled for video {video_id}")
317
+ transcript_text = None
318
+ except NoTranscriptFound:
319
+ logger.warning(f"[Primary YT] No English transcript found for video {video_id}")
320
+ transcript_text = None
321
  except Exception as e:
322
  logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
323
+ # if isinstance(e, (NoTranscriptFound, TranscriptsDisabled)): logger.warning(f"[Primary YT] Known issue: {type(e).__name__}") # Handled above
324
  transcript_text = None
325
 
326
  if transcript_text is None:
327
  logger.info("[Fallback YT 1] Trying Supadata API...")
328
  if SUPADATA_API_KEY:
329
  transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
330
+ if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text # Already stripped
331
  else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
332
  else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
333
 
 
335
  logger.info("[Fallback YT 2] Trying Apify REST API (SyncItems)...")
336
  if APIFY_API_TOKEN:
337
  transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
338
+ if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify SyncItems REST for {video_url}"); return transcript_text # Already stripped
339
  else: logger.warning(f"[Fallback YT 2] Apify SyncItems REST failed or no content for {video_url}.")
340
  else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
341
 
 
359
 
360
  logger.info(f"[Crawl4AI Primary] Attempting to crawl URL: {url}")
361
  # Define a writable cache directory (use /tmp in container environments)
362
+ # No longer creating dir here, relying on HOME=/tmp from Dockerfile
363
+ # cache_dir_path = "/tmp/.crawl4ai"
364
+ # try:
365
+ # os.makedirs(cache_dir_path, exist_ok=True)
366
+ # logger.info(f"[Crawl4AI Primary] Ensured cache directory exists: {cache_dir_path}")
367
+ # except OSError as e:
368
+ # logger.error(f"[Crawl4AI Primary] Failed to create cache directory {cache_dir_path}: {e}. Crawl may fail.")
369
+ # except Exception as e:
370
+ # logger.error(f"[Crawl4AI Primary] Unexpected error creating cache directory {cache_dir_path}: {e}")
 
 
371
 
372
  try:
373
+ # Use AsyncWebCrawler context manager.
374
+ # Removed explicit cache_dir, relying on HOME=/tmp from Dockerfile
375
+ async with AsyncWebCrawler() as crawler:
376
+ logger.info(f"[Crawl4AI Primary] Initialized crawler (HOME should be /tmp).")
377
  # Use arun for a single URL crawl
378
  result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90) # 90 sec timeout
379
 
 
384
  return content
385
  else:
386
  logger.warning(f"[Crawl4AI Primary] Crawl successful for {url}, but extracted markdown content is empty.")
387
+ # Try result.text as a fallback within crawl4ai success
388
+ if result.text:
389
+ content = result.text.strip()
390
+ if content:
391
+ logger.info(f"[Crawl4AI Primary] Using .text fallback after empty markdown. Length: {len(content)}")
392
+ return content
393
+ return None # Return None if both markdown and text are empty
394
+ elif result and result.text: # Fallback to raw text if markdown is somehow missing entirely
395
  content = result.text.strip()
396
  if content:
397
+ logger.info(f"[Crawl4AI Primary] Success crawling {url} (using .text, markdown missing). Length: {len(content)}")
398
  return content
399
  else:
400
+ logger.warning(f"[Crawl4AI Primary] Crawl successful for {url}, but extracted text content is empty (markdown missing).")
401
  return None
402
  else:
403
  logger.warning(f"[Crawl4AI Primary] Crawl failed or returned no result/content for {url}.")
 
406
  logger.error(f"[Crawl4AI Primary] Timeout occurred while crawling {url}")
407
  return None
408
  except PermissionError as e: # Catch the specific error
409
+ # Log more detail if possible
410
+ logger.error(f"[Crawl4AI Primary] Permission denied during crawl for {url}. Target path: '{e.filename}'. Likely filesystem issue or HOME=/tmp not effective. Error: {e}", exc_info=True)
411
  return None # Fail gracefully for this method
412
  except Exception as e:
413
+ logger.error(f"[Crawl4AI Primary] Unexpected error during crawl for {url}: {e}", exc_info=True)
414
+ # Log specific crawl4ai errors if they become apparent
415
  return None
416
 
417
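The crawl above relies on HOME=/tmp from the Dockerfile so that crawl4ai resolves its home and cache folders to a writable path. An equivalent setup from Python, done before the library is imported, might look like the following; which variables crawl4ai actually consults depends on the library version, so treat this as an assumption:

    import os

    # Point home/cache resolution at a writable location *before* importing crawl4ai.
    os.environ.setdefault("HOME", "/tmp")
    os.environ.setdefault("XDG_CACHE_HOME", "/tmp/.cache")
    os.makedirs("/tmp/.cache", exist_ok=True)

    from crawl4ai import AsyncWebCrawler  # imported after the environment is prepared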
 
 
454
  for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
455
  element.extract()
456
  # Try to find main content areas more broadly
457
+ selectors = ['main', 'article', '[role="main"]', '#content', '.content', '#main-content', '.main-content', '#body', '.body', '#article-body', '.article-body', '.post-content', '.entry-content'] # Added more common selectors
458
  target_element = None
459
  for selector in selectors:
460
+ try:
461
+ target_element = soup.select_one(selector)
462
+ if target_element: break
463
+ except Exception as sel_e: # Catch potential invalid CSS selectors from list
464
+ logger.warning(f"[BS4 Fallback] Invalid selector '{selector}': {sel_e}")
465
+ continue
466
+
467
 
468
  if not target_element: target_element = soup.body # Fallback to body
469
  if not target_element: logger.warning(f"[BS4 Fallback] Could not find body/main for parsing {url}"); return None
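Condensed, the selector-first extraction used by the BS4 fallback boils down to the following pattern (a sketch, not the committed function):

    from typing import Optional
    from bs4 import BeautifulSoup

    def extract_main_text(html_doc: str) -> Optional[str]:
        soup = BeautifulSoup(html_doc, "html.parser")
        # Drop non-content elements first.
        for el in soup(["script", "style", "header", "footer", "nav", "aside", "form", "iframe", "noscript"]):
            el.extract()
        target = None
        for selector in ("main", "article", '[role="main"]', "#content", ".content", ".post-content", ".entry-content"):
            try:
                target = soup.select_one(selector)
            except Exception:
                continue  # Skip selectors the parser rejects
            if target:
                break
        target = target or soup.body
        if target is None:
            return None
        text = " ".join(target.get_text(separator=" ", strip=True).split())
        return text or None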
 
571
  generation_config = genai.types.GenerationConfig(
572
  # candidate_count=1, # Default is 1
573
  # stop_sequences=["\n"],
574
+ max_output_tokens=2048, # Increased max tokens
575
  temperature=0.7, # Adjust creativity vs factualness
576
  # top_p=1.0, # Default
577
  # top_k=None # Default
 
583
  response: GenerateContentResponse = await model.generate_content_async( # Use async version
584
  prompt,
585
  generation_config=generation_config,
586
+ safety_settings=safety_settings,
587
+ # request_options={"timeout": 100} # Optional: Add timeout for the API call itself
588
  )
 
589
 
590
  # Check for safety blocks or other issues in response
591
  if not response.candidates:
592
+ block_reason = "Unknown"
593
+ if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
594
+ block_reason = response.prompt_feedback.block_reason or "Reason not specified"
595
  error_msg = f"Error: Gemini response blocked or empty. Reason: {block_reason}"
596
  logger.error(f"[Gemini Primary] {error_msg}")
597
  return None, error_msg
598
 
599
+ # *** FIXED FinishReason Check ***
600
+ # Access finish_reason from the first candidate
601
+ finish_reason_val = response.candidates[0].finish_reason
602
+ logger.debug(f"[Gemini Primary] Received response. Finish reason value: {finish_reason_val}")
603
+
604
+ # Convert the finish reason (might be enum/int/string) to uppercase string for reliable comparison
605
+ finish_reason_str = str(finish_reason_val).upper()
606
+
607
+ # Check if it's NOT a successful or expected non-error finish
608
+ # Expected "good" finishes: STOP, MAX_TOKENS
609
+ # Problematic finishes: SAFETY, RECITATION, OTHER, etc.
610
+ if finish_reason_str not in ["STOP", "MAX_TOKENS"]:
611
  # Log safety ratings if available
612
  safety_ratings_str = "N/A"
613
+ if hasattr(response.candidates[0], 'safety_ratings') and response.candidates[0].safety_ratings:
614
  safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in response.candidates[0].safety_ratings])
615
+
616
+ error_msg = f"Error: Gemini generation finished unexpectedly. Reason: {finish_reason_str}. Safety: {safety_ratings_str}"
617
  logger.error(f"[Gemini Primary] {error_msg}")
618
 
619
+ # Decide how to handle specific non-STOP reasons
620
+ if finish_reason_str == "SAFETY":
621
+ return None, error_msg # Return specific error for safety blocks
622
+ # For RECITATION, OTHER, etc., it's safer to return an error than potentially broken output
623
+ return None, f"Error: Gemini generation finished unexpectedly ({finish_reason_str})."
624
+
625
+ # Extract text (use response.text shortcut if available and reliable)
626
+ summary_text = ""
627
+ try:
628
+ # response.text might combine parts automatically, safer if available
629
+ summary_text = response.text
630
+ except ValueError as e:
631
+ # Handle cases where .text might raise error (e.g., if blocked, but caught above ideally)
632
+ logger.warning(f"[Gemini Primary] Error accessing response.text: {e}. Trying parts manually.")
633
+ # Fallback to joining parts if .text fails (less common now)
634
+ if response.candidates and response.candidates[0].content and response.candidates[0].content.parts:
635
+ summary_text = "".join(part.text for part in response.candidates[0].content.parts)
636
637
  if not summary_text or not summary_text.strip():
638
  logger.warning("[Gemini Primary] Gemini returned an empty summary.")
639
  return None, "Error: AI generated an empty summary."
640
 
641
+ logger.info(f"[Gemini Primary] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason_str}")
642
  return summary_text.strip(), None
643
 
644
+ except AttributeError as e:
645
+ # Catch potential errors if response structure is different than expected
646
+ logger.error(f"[Gemini Primary] Attribute error accessing Gemini response: {e}. Response structure might have changed.", exc_info=True)
647
+ return None, f"Error: Failed to parse Gemini response. Details: {e}"
648
  except Exception as e:
649
  logger.error(f"[Gemini Primary] Error during API call to {GEMINI_MODEL}: {e}", exc_info=True)
650
  # Check for specific Google API errors if needed
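The finish-reason normalisation and the response.text fallback above can be factored into one defensive helper. A sketch under the same assumptions the committed code makes about the response object (candidates, finish_reason, content.parts); the substring check accepts both a plain "STOP" and an enum-style "FinishReason.STOP" string:

    from typing import Optional, Tuple

    def extract_gemini_text(response) -> Tuple[Optional[str], Optional[str]]:
        # Returns (text, error), mirroring the checks used above.
        if not getattr(response, "candidates", None):
            return None, "Error: Gemini response blocked or empty."
        candidate = response.candidates[0]
        finish_reason = str(getattr(candidate, "finish_reason", "")).upper()
        if not any(reason in finish_reason for reason in ("STOP", "MAX_TOKENS")):
            return None, f"Error: Gemini generation finished unexpectedly ({finish_reason or 'UNKNOWN'})."
        try:
            text = response.text  # Convenience accessor; may raise ValueError if blocked
        except ValueError:
            parts = getattr(getattr(candidate, "content", None), "parts", None) or []
            text = "".join(getattr(part, "text", "") for part in parts)
        text = (text or "").strip()
        if not text:
            return None, "Error: AI generated an empty summary."
        return text, None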
 
  async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
  """ Calls the OpenRouter API to generate a summary. """
+ # ... (Keep existing implementation - no changes needed here based on logs) ...
  global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
  if not _openrouter_fallback_enabled:
  logger.error("[OpenRouter Fallback] Called but is disabled.");
  return None, "Error: Fallback AI service (OpenRouter) not configured/available."

+ max_input_len_openrouter = 100000 # Adjust based on OPENROUTER_MODEL limits if known (Deepseek is large)
  if len(text) > max_input_len_openrouter:
  logger.warning(f"[OpenRouter Fallback] Input text length ({len(text)}) exceeds approx limit ({max_input_len_openrouter}) for {OPENROUTER_MODEL}. Truncating.")
  text = text[:max_input_len_openrouter]
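+ # Sizing note: 100,000 characters is a character-based proxy, not a token count. At the
+ # rough ~4 characters/token rule of thumb that is about 25k tokens, comfortably below the
+ # large context windows DeepSeek chat models advertise; the true ratio depends on the
+ # tokenizer, so treat this cap as a conservative assumption rather than a hard model limit.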
 
  try:
  data = response.json()
  if data.get("choices") and len(data["choices"]) > 0:
+ choice = data["choices"][0]
+ message = choice.get("message")
+ finish_reason = choice.get("finish_reason", "N/A") # Get finish reason
+
  if message and message.get("content"):
  summary_text = message["content"].strip()
  if summary_text:
  logger.info(f"[OpenRouter Fallback] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason}")
  # Check for length finish reason?
  if finish_reason == 'length':

  async def generate_summary(text: str, summary_type: str) -> str:
  """ Generates a summary using the primary AI (Gemini), falling back to OpenRouter. """
+ # ... (Keep existing implementation - no changes needed here based on logs) ...
  global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
  logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")
  final_summary: Optional[str] = None

  bot_token: str
  ) -> None:
  """Handles fetching content, generating summary, and sending results."""
+ # ... (Keep existing implementation - no changes needed here based on logs) ...
  task_id = f"{user_id}-{message_id_to_edit or 'new'}"
  logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
  background_request: Optional[BaseRequest] = None
  content: Optional[str] = None
  user_feedback_message: Optional[str] = None
  success = False
+ status_message_id = message_id_to_edit # Keep track of the original button message ID
+ message_to_delete_later_id: Optional[int] = None # For temporary "Processing..." messages

  try:
  # Initialize background bot
  bot = Bot(token=bot_token, request=background_request)
  except Exception as e:
  logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True)
+ # Cannot send feedback without a bot instance
  return

  try:
  # Send/Edit "Processing..." message
+ # We *always* edit the original button message first
  processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n{html.escape(url)}\n\nThis might take a moment..."
+ edited_original_msg = False
+ if status_message_id: # If we have the ID of the message with buttons
  try:
  await retry_bot_operation(
  bot.edit_message_text,
  message_id=status_message_id,
  text=processing_message_text,
  parse_mode=ParseMode.HTML, # Use HTML for escaped URL
+ reply_markup=None, # Remove buttons
  link_preview_options={'is_disabled': True} # Disable preview here too
  )
+ logger.debug(f"[Task {task_id}] Edited original button message {status_message_id} to 'Processing'")
+ edited_original_msg = True
  except Exception as e:
+ logger.warning(f"[Task {task_id}] Could not edit original button message {status_message_id}: {e}. Will proceed without initial status update on that message.")
+ # Don't send a new message here; the final result/error will handle it.
+ # Keep status_message_id so we know the original message still exists.

  # Indicate activity
  try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')

  content = await get_website_content_via_crawl4ai(url)
  if not content:
+ # Log the failure reason from Crawl4AI if available (already logged in the function)
  logger.warning(f"[Task {task_id}] Crawl4AI failed for {url}. Attempting BeautifulSoup (Fallback 1)...")
  try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
  except Exception: pass

  except Exception: pass
  content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
  if not content:
  logger.error(f"[Task {task_id}] API fallback (urltotext) also failed for {url}.")
  user_feedback_message = "Sorry, I couldn't fetch content from that website using any method (Crawl4AI/BS4 failed, API failed or ran out of credits)." # Updated message
  else:

  if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
  user_feedback_message = final_summary # Pass AI error message to user
  logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
+ success = False # Explicitly mark as failure
  else:
+ # Success - Split and Send the summary
  summary_parts = []
  current_part = ""
  # Split respecting newlines, ensure no part exceeds MAX_SUMMARY_CHUNK_SIZE
  lines = final_summary.splitlines(keepends=True)
  for line in lines:
  if len(current_part) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
  if current_part.strip(): # Don't add empty parts
  summary_parts.append(current_part.strip())
+ current_part = line
  if len(current_part) > MAX_SUMMARY_CHUNK_SIZE:
  logger.warning(f"[Task {task_id}] Truncating overly long line in summary.")
  current_part = current_part[:MAX_SUMMARY_CHUNK_SIZE]
  else:
  current_part += line
+ if current_part.strip(): summary_parts.append(current_part.strip())
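+ # Rationale: each part stays at or under MAX_SUMMARY_CHUNK_SIZE, which leaves headroom
+ # below Telegram's hard 4096-character message limit; a single line longer than the chunk
+ # size is truncated above rather than split mid-line.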
 
 
 
 
 
  if not summary_parts:
  summary_parts.append("Summary generated, but it appears to be empty.")
  logger.warning(f"[Task {task_id}] Summary was non-empty initially but splitting resulted in zero parts.")

  logger.info(f"[Task {task_id}] Summary generated (orig len: {len(final_summary)}). Sending in {len(summary_parts)} part(s).")

+ # Try to edit the original button message with the *first* part of the summary
+ edited_final_result = False
+ if status_message_id: # If we have the original button message ID
  try:
  await retry_bot_operation(
  bot.edit_message_text,
  chat_id=chat_id,
+ message_id=status_message_id,
  text=summary_parts[0],
  parse_mode=None, # Send as plain text initially, safer
  link_preview_options={'is_disabled': True}
  )
+ logger.debug(f"[Task {task_id}] Edited original button message {status_message_id} with first summary part.")
+ edited_final_result = True
  except Exception as edit_err:
+ logger.warning(f"[Task {task_id}] Failed to edit original button message {status_message_id} with summary part 1: {edit_err}. Sending as new message.")
+ # Original message remains as "Processing..." or its initial state if first edit failed

+ # If editing failed, or if there was no original message_id (shouldn't happen), send first part as new message
+ if not edited_final_result:
  sent_msg = await retry_bot_operation(
  bot.send_message,
  chat_id=chat_id,
  parse_mode=None,
  link_preview_options={'is_disabled': True}
  )
+ if not sent_msg:
  logger.error(f"[Task {task_id}] Failed to send first summary part even as new message.")
+ user_feedback_message = "Sorry, failed to send the summary."
+ success = False

  # Send remaining parts (if any and first part succeeded)
+ if success and len(summary_parts) > 1: # Check success flag before sending more parts
  for i, part in enumerate(summary_parts[1:], start=2):
  await asyncio.sleep(0.5) # Small delay between parts
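+ # Telegram's Bots FAQ advises staying around one message per second per chat to avoid
+ # flood limits, so 0.5s plus network latency is normally enough spacing for a few parts.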
  try:

  except Exception as part_err:
  logger.error(f"[Task {task_id}] Failed to send summary part {i}: {part_err}")
  user_feedback_message = f"Sorry, failed to send part {i} of the summary."
+ success = False # Mark as failure
  break # Stop sending remaining parts

+ # Determine overall success based on whether feedback message is set *during this block*
  if not user_feedback_message:
  success = True

+ # --- Handle Failure Cases (Content Fetching Failed or Summary Failed) ---
+ if not success: # Check overall success flag
+ if not user_feedback_message: # If success is false but no specific message, set a generic one
+ user_feedback_message = "Sorry, an unknown error occurred during processing."
+ logger.error(f"[Task {task_id}] Task failed but no specific user_feedback_message was set.")
+
  logger.warning(f"[Task {task_id}] Sending failure/error feedback to user: {user_feedback_message}")
  try:
+ # Try editing the original button message with the error
+ edited_final_error = False
+ if status_message_id:
  try:
  await retry_bot_operation(
  bot.edit_message_text,
  chat_id=chat_id,
+ message_id=status_message_id,
  text=user_feedback_message,
  link_preview_options={'is_disabled': True},
  reply_markup=None # Remove buttons
  )
+ logger.debug(f"[Task {task_id}] Edited original button message {status_message_id} with failure feedback.")
+ edited_final_error = True
  except Exception as edit_err:
+ logger.warning(f"[Task {task_id}] Failed to edit original button message {status_message_id} with failure feedback: {edit_err}. Sending as new message.")

+ # If editing failed or wasn't applicable, send error as a new message
+ if not edited_final_error:
  await retry_bot_operation(
  bot.send_message,
  chat_id=chat_id,

  except Exception as e:
  # Catch-all for unexpected errors during the main processing logic
+ logger.error(f"[Task {task_id}] Unexpected error during processing task: {e}", exc_info=True)
+ success = False
  user_feedback_message = "Oops! Something went wrong while processing your request. Please try again later."
  if bot: # Ensure bot exists before trying to send
  try:
+ # Attempt to send a final error message (maybe edit original if possible?)
+ edited_final_crash_error = False
+ if status_message_id:
+ try:
+ await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=user_feedback_message, reply_markup=None, link_preview_options={'is_disabled': True} )
+ edited_final_crash_error = True
+ except Exception: pass # Ignore error editing here, just try sending new
+ if not edited_final_crash_error:
+ await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message )
  except Exception as final_err:
  logger.error(f"[Task {task_id}] Failed to send the final unexpected error feedback: {final_err}")

  finally:
  # --- Cleanup ---
+ # No automatic deletion needed now. The original message (status_message_id) is either
+ # edited with the result/error, or left as is if editing failed. Temporary messages
+ # were not introduced in this version of the logic.

  # Close the background bot's HTTP client
  if background_request and hasattr(background_request, '_client') and background_request._client:

  # --- Telegram Handlers ---
  # (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler)
+ # No changes needed in these handlers based on logs.

  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
  user = update.effective_user; mention = user.mention_html()
  if not user or not update.message: return
  logger.info(f"User {user.id} ({user.username or 'N/A'}) used /start.")
  await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" )

  async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
  user = update.effective_user
  if not user or not update.message: return
  logger.info(f"User {user.id} ({user.username or 'N/A'}) used /help.")
  help_text = ( "🔍 **How to use:**\n\n"
  "1. Send me any YouTube video link or website URL.\n"
  "2. I'll ask how you want it summarised (paragraph or points).\n"
  "3. Click the button for your choice.\n"
+ "4. Wait while I process it!\n\n" # Slightly rephrased
  "⚙️ **Behind the scenes:**\n"
  f"• **Websites:** I try `Crawl4AI` (smart crawl), then `BeautifulSoup` (basic scrape), and `urltotext.com` API (if configured & credits available).\n"
  "• **YouTube:** I use `youtube-transcript-api` first, then fall back to `Supadata` and `Apify` APIs if needed.\n"

  await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)

  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
  if not update.message or not update.message.text: return
  url = update.message.text.strip(); user = update.effective_user
  if not user: return
  # Basic URL validation
+ url_pattern = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE) # Slightly improved regex
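+ # Illustration: the pattern accepts e.g. "https://example.com/page?x=1" or
+ # "http://sub.domain.co.uk", but rejects bare domains like "example.com" (no scheme)
+ # and a scheme with nothing after it such as "http://".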
+ if not url_pattern.match(url):
  logger.debug(f"Ignoring non-URL from {user.id}: {url}")
+ await update.message.reply_text("Hmm, that doesn't look like a valid web URL. Please make sure it starts with `http://` or `https://` and includes a domain.", parse_mode=ParseMode.MARKDOWN)
  return
  logger.info(f"User {user.id} ({user.username or 'N/A'}) sent potential URL: {url}")
  # Store URL and original message ID in user_data

  await update.message.reply_html( f"Okay, I see this link:\n<code>{escaped_url}</code>\n\nHow would you like it summarised?", reply_markup=reply_markup, disable_web_page_preview=True )

  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
  query = update.callback_query
  if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
  user = query.from_user; summary_type = query.data; query_id = query.id
  try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id} ({user.username or 'N/A'})")
+ except Exception as e: logger.warning(f"Error answering callback {query_id}: {e}") # Log warning, don't stop

  url = context.user_data.get('url_to_summarize')
  message_id_to_edit = query.message.message_id # The message with the buttons

  except Exception as e: logger.error(f"Failed edit 'URL not found' msg: {e}")
  return # Do not proceed further

  # Check necessary configurations before scheduling
  global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
  if not TELEGRAM_TOKEN:

  try: await query.edit_message_text(text="❌ AI configuration error: No summarization models are available. Cannot proceed.", reply_markup=None)
  except Exception: pass
  return
  elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) is unavailable, will rely on fallback (OpenRouter).")
  elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) is unavailable, relying on primary (Gemini).")

  # Schedule the background task
  logger.info(f"Scheduling task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit} (URL: {url})")
+ # Use asyncio.ensure_future to schedule the coroutine (asyncio.create_task would also work here)
+ asyncio.ensure_future(
  process_summary_task(
  user_id=user.id,
  chat_id=query.message.chat_id,
  url=url,
  summary_type=summary_type,
  bot_token=TELEGRAM_TOKEN # Pass token explicitly
+ )
+ # Consider adding .set_name(...) if using Python 3.8+ for better debugging
+ # .set_name(f"SummaryTask-{user.id}-{message_id_to_edit}")
  )

+ # Clear context AFTER scheduling the task
  context.user_data.pop('url_to_summarize', None)
  context.user_data.pop('original_message_id', None)
  logger.debug(f"Cleared URL context for user {user.id} after scheduling task.")

+ # Do not edit the message here - let the task handle its own status updates by editing this message

  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
  # ... (Keep existing implementation) ...
+ ignore_errors = (AttributeError, BadRequest, TimedOut, NetworkError, RetryAfter)
  if isinstance(context.error, ignore_errors):
  ignore_messages = ["message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user"]
  err_str = str(context.error).lower()
  logger.warning(f"Ignoring known/handled/transient error in error_handler: {context.error}")
  return
  logger.error("Exception while handling an update:", exc_info=context.error)

  # --- Application Setup ---
  # ... (Keep existing implementation) ...
  logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
  if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
  custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 )
  application = ( Application.builder() .token(TELEGRAM_TOKEN) .request(custom_request) .build() )
  # Add Handlers
  application.add_handler(CommandHandler("start", start))
  application.add_handler(CommandHandler("help", help_command))
  url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
  application.add_handler(MessageHandler(url_filter, handle_potential_url))
  application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
  application.add_error_handler(error_handler)
  logger.info("Telegram application handlers configured."); return application

  # --- ASGI Lifespan & Routes ---
+ # (lifespan, telegram_webhook remain the same)
  @contextlib.asynccontextmanager
  async def lifespan(app: Starlette):
  global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN
  if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.")
  try:
  ptb_app = await setup_bot_config()
+ await ptb_app.initialize() # This sets the _initialized flag
  bot_info = await ptb_app.bot.get_me()
  logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})")

  # Webhook setup logic (remains the same)
  current_webhook_info = await ptb_app.bot.get_webhook_info()
+ webhook_delete_success = True # Assume success unless proven otherwise
  if current_webhook_info and current_webhook_info.url:
  logger.info(f"Found existing webhook: {current_webhook_info.url}. Deleting...")
  try:
  if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted.")
+ else: logger.warning("Failed to delete webhook (API returned False)."); webhook_delete_success = False
+ except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1); webhook_delete_success = False

+ # Determine Webhook URL
  space_host = os.environ.get("SPACE_HOST")
  webhook_path = "/webhook" # Standard path
  full_webhook_url = None
  if space_host:
+ protocol = "https"
+ host = space_host.split('://')[-1]
+ full_webhook_url = f"{protocol}://{host.rstrip('/')}{webhook_path}"
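+ # e.g. SPACE_HOST="your-space.hf.space" (illustrative value) yields
+ # "https://your-space.hf.space/webhook"; a SPACE_HOST that already carries a scheme is
+ # normalised by the split('://') above.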
+ logger.info(f"Using SPACE_HOST to determine webhook URL: {full_webhook_url}")
+ else:
+ logger.critical("Could not construct webhook URL (SPACE_HOST env var missing or invalid).")
+ raise RuntimeError("Webhook URL undetermined.")

+ # Set Webhook
+ if full_webhook_url and webhook_delete_success:
  logger.info(f"Attempting to set webhook: {full_webhook_url}")
  set_webhook_args = {
  "url": full_webhook_url,
+ "allowed_updates": Update.ALL_TYPES,
  "drop_pending_updates": True
  }
  if WEBHOOK_SECRET:

  await asyncio.sleep(1.0) # Short delay before setting
  try:
+ webhook_set = await ptb_app.bot.set_webhook(**set_webhook_args)
+ if not webhook_set:
+ raise RuntimeError("set_webhook API call returned False.")
+ await asyncio.sleep(1.5) # Longer delay after setting before verification
  webhook_info = await ptb_app.bot.get_webhook_info()
  if webhook_info and webhook_info.url == full_webhook_url:
+ logger.info(f"Webhook set and verified successfully: URL='{webhook_info.url}', Secret={'YES' if WEBHOOK_SECRET else 'NO'}")
  if webhook_info.last_error_message:
  logger.warning(f"Webhook status has last error: {webhook_info.last_error_message}")
  else:
+ logger.error(f"Webhook verification failed! Expected '{full_webhook_url}', Got info: {webhook_info}")
+ raise RuntimeError("Webhook verification failed after setting.")

+ await ptb_app.start() # Start PTB processing updates AFTER successful webhook setup
  logger.info("PTB Application started (webhook mode).")
  except Exception as e:
+ logger.error(f"FATAL: Failed to set or verify webhook: {e}", exc_info=True)
+ raise RuntimeError(f"Failed to set/verify webhook: {e}") from e
+ elif not webhook_delete_success:
+ logger.error("FATAL: Failed to delete previous webhook. Cannot set new one.")
+ raise RuntimeError("Failed to delete previous webhook.")
+ # else: # Already handled by raising error if full_webhook_url is None

  logger.info("ASGI Lifespan: Startup complete.");
  yield # Application runs here

  if ptb_app:
  try:
  if ptb_app.running: await ptb_app.stop()
+ # Check if initialized before shutting down
+ if ptb_app._initialized: await ptb_app.shutdown()
  except Exception as shutdown_err:
  logger.error(f"Error during shutdown after startup failure: {shutdown_err}")
  raise # Reraise the original startup error

  if ptb_app.running:
  logger.info("Stopping PTB application...")
  await ptb_app.stop()
+ # Check if initialized before shutting down
+ if ptb_app._initialized:
+ logger.info("Shutting down PTB application...")
+ await ptb_app.shutdown()
+ logger.info("PTB Application shut down successfully.")
+ else:
+ logger.info("PTB Application was not fully initialized, skipping shutdown.")
  except Exception as e:
  logger.error(f"Error during PTB shutdown: {e}", exc_info=True)
  else:
+ logger.info("PTB application object not created.")
  logger.info("ASGI Lifespan: Shutdown complete.")

  async def health_check(request: Request) -> PlainTextResponse:
  global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled, _crawl4ai_primary_web_enabled, _urltotext_fallback_enabled, SUPADATA_API_KEY
  bot_status = "Not Initialized"
  bot_username = "N/A"
+ # *** FIXED Attribute Name ***
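+ # Caveat: _initialized is a private python-telegram-bot attribute (leading underscore),
+ # so this check may break across library versions; it is relied on here on the assumption
+ # that no equivalent public flag is exposed.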
+ if ptb_app and ptb_app.bot and ptb_app._initialized: # Check if initialized using _initialized
  try:
  # Quick check if webhook seems ok, more reliable than get_me() sometimes
  wh_info = await ptb_app.bot.get_webhook_info()
+ # Check running status as well
  if ptb_app.running and wh_info and wh_info.url:
  bot_info = await ptb_app.bot.get_me()
  bot_username = f"@{bot_info.username}"
  bot_status = f"Running (Webhook OK, {bot_username})"
  elif ptb_app.running:
+ bot_status = f"Running (Webhook Status: {wh_info.url if wh_info else 'N/A'}, Last Error: {wh_info.last_error_message if wh_info else 'N/A'})"
  else: bot_status = "Initialized/Not running"
+ except Exception as e:
+ logger.error(f"Error checking bot status in health check: {e}", exc_info=True)
+ bot_status = f"Error checking status: {e}"
  elif ptb_app:
+ bot_status = "Initializing..." # Or Initialized but not running yet

  health_info = [
  f"=== Telegram Summary Bot Status ===",

  if not ptb_app:
  logger.error("Webhook received but PTB application not initialized.")
  return PlainTextResponse('Bot not initialized', status_code=503) # Service Unavailable
+ if not ptb_app._initialized: # Check _initialized flag
+ logger.error("Webhook received but PTB application not running (not initialized).")
+ return PlainTextResponse('Bot not initialized', status_code=503)
  if not ptb_app.running:
  logger.warning("Webhook received but PTB application not running.")
  return PlainTextResponse('Bot not running', status_code=503) # Service Unavailable

  # Make sure necessary env vars are loaded for local dev if not set system-wide
  # Example using python-dotenv if you add it to requirements-dev.txt
+ try:
+ from dotenv import load_dotenv
+ load_dotenv()
+ logger.info("Loaded environment variables from .env file for local development.")
+ except ImportError:
+ logger.info("python-dotenv not installed; using system environment variables.")
+

  # Re-check required tokens after potential .env load
  if not get_secret('TELEGRAM_TOKEN'): logger.critical("Local Dev: TELEGRAM_TOKEN not found.")