diff --git "a/TransateKRtoEN.py" "b/TransateKRtoEN.py" new file mode 100644--- /dev/null +++ "b/TransateKRtoEN.py" @@ -0,0 +1,11749 @@ +# TransateKRtoEN.py +# -*- coding: utf-8 -*- +import json +import logging +import shutil +import threading +import queue +import uuid +import inspect +import os, sys, io, zipfile, time, re, mimetypes, subprocess, tiktoken +import builtins +import ebooklib +from ebooklib import epub +from bs4 import BeautifulSoup +try: + from bs4 import XMLParsedAsHTMLWarning + import warnings + # Suppress the warning since we handle both HTML and XHTML content + warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning) +except ImportError: + # Older versions of BeautifulSoup might not have this warning + pass +from collections import Counter +from unified_api_client import UnifiedClient, UnifiedClientError +import hashlib +import tempfile +import unicodedata +from difflib import SequenceMatcher +import unicodedata +import re +import time +from history_manager import HistoryManager +from chapter_splitter import ChapterSplitter +from image_translator import ImageTranslator +from typing import Dict, List, Tuple +from txt_processor import TextFileProcessor +from ai_hunter_enhanced import ImprovedAIHunterDetection +import csv +from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed + +# Module-level functions for ProcessPoolExecutor compatibility +def _check_sentence_batch_for_terms(args): + """Check a batch of sentences for term matches - used by ProcessPoolExecutor""" + batch_sentences, terms = args + filtered = [] + + # Use pre-compiled term list for fast checking + for sentence in batch_sentences: + # Quick check using any() - stops at first match + if any(term in sentence for term in terms): + filtered.append(sentence) + + return filtered + +def _process_sentence_batch_for_extraction(args): + """Process sentences to extract terms - used by ProcessPoolExecutor""" + batch_sentences, batch_idx, combined_pattern, exclude_check_data = args + from collections import Counter + import re + + local_word_freq = Counter() + local_important = [] + local_seen = set() + + # Rebuild the exclusion check function from data + honorifics_to_exclude, title_patterns_str, common_words, chinese_nums = exclude_check_data + title_patterns = [re.compile(p) for p in title_patterns_str] + + def should_exclude_term(term): + term_lower = term.lower() + + # Check if it's a common word + if term in common_words or term_lower in common_words: + return True + + # Check if it contains honorifics + for honorific in honorifics_to_exclude: + if honorific in term or (honorific.startswith('-') and term.endswith(honorific[1:])): + return True + + # Check if it matches title patterns + for pattern in title_patterns: + if pattern.search(term): + return True + + # Check if it's a number + if term in chinese_nums or term.isdigit(): + return True + + return False + + for sentence in batch_sentences: + sentence = sentence.strip() + if len(sentence) < 10 or len(sentence) > 500: + continue + + # Find all potential terms in this sentence + matches = re.findall(combined_pattern, sentence) + + if matches: + # Filter out excluded terms + filtered_matches = [] + for match in matches: + if not should_exclude_term(match): + local_word_freq[match] += 1 + filtered_matches.append(match) + + # Keep sentences with valid potential terms + if filtered_matches: + sentence_key = ' '.join(sorted(filtered_matches)) + if sentence_key not in local_seen: + local_important.append(sentence) + 
local_seen.add(sentence_key) + + return local_word_freq, local_important, local_seen, batch_idx +from tqdm import tqdm + +def is_traditional_translation_api(model: str) -> bool: + """Check if the model is a traditional translation API""" + return model in ['deepl', 'google-translate', 'google-translate-free'] or model.startswith('deepl/') or model.startswith('google-translate/') + +def get_chapter_terminology(is_text_file, chapter_data=None): + """Get appropriate terminology (Chapter/Section) based on source type""" + if is_text_file: + return "Section" + if chapter_data: + if chapter_data.get('filename', '').endswith('.txt') or chapter_data.get('is_chunk', False): + return "Section" + return "Chapter" +# ===================================================== +# CONFIGURATION AND ENVIRONMENT MANAGEMENT +# ===================================================== +class TranslationConfig: + """Centralized configuration management""" + def __init__(self): + self.MODEL = os.getenv("MODEL", "gemini-1.5-flash") + self.input_path = os.getenv("input_path", "default.epub") + self.PROFILE_NAME = os.getenv("PROFILE_NAME", "korean").lower() + self.CONTEXTUAL = os.getenv("CONTEXTUAL", "1") == "1" + self.DELAY = float(os.getenv("SEND_INTERVAL_SECONDS", "1")) + self.SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "").strip() + self.REMOVE_AI_ARTIFACTS = os.getenv("REMOVE_AI_ARTIFACTS", "0") == "1" + self.TEMP = float(os.getenv("TRANSLATION_TEMPERATURE", "0.3")) + self.HIST_LIMIT = int(os.getenv("TRANSLATION_HISTORY_LIMIT", "20")) + self.MAX_OUTPUT_TOKENS = int(os.getenv("MAX_OUTPUT_TOKENS", "8192")) + self.EMERGENCY_RESTORE = os.getenv("EMERGENCY_PARAGRAPH_RESTORE", "1") == "1" + self.BATCH_TRANSLATION = os.getenv("BATCH_TRANSLATION", "0") == "1" + self.BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10")) + self.ENABLE_IMAGE_TRANSLATION = os.getenv("ENABLE_IMAGE_TRANSLATION", "1") == "1" + self.TRANSLATE_BOOK_TITLE = os.getenv("TRANSLATE_BOOK_TITLE", "1") == "1" + self.DISABLE_ZERO_DETECTION = os.getenv("DISABLE_ZERO_DETECTION", "0") == "1" + self.ENABLE_AUTO_GLOSSARY = os.getenv("ENABLE_AUTO_GLOSSARY", "0") == "1" + self.COMPREHENSIVE_EXTRACTION = os.getenv("COMPREHENSIVE_EXTRACTION", "0") == "1" + self.MANUAL_GLOSSARY = os.getenv("MANUAL_GLOSSARY") + self.RETRY_TRUNCATED = os.getenv("RETRY_TRUNCATED", "0") == "1" + self.RETRY_DUPLICATE_BODIES = os.getenv("RETRY_DUPLICATE_BODIES", "1") == "1" + self.RETRY_TIMEOUT = os.getenv("RETRY_TIMEOUT", "0") == "1" + self.CHUNK_TIMEOUT = int(os.getenv("CHUNK_TIMEOUT", "900")) + self.MAX_RETRY_TOKENS = int(os.getenv("MAX_RETRY_TOKENS", "16384")) + self.DUPLICATE_LOOKBACK_CHAPTERS = int(os.getenv("DUPLICATE_LOOKBACK_CHAPTERS", "3")) + self.USE_ROLLING_SUMMARY = os.getenv("USE_ROLLING_SUMMARY", "0") == "1" + self.ROLLING_SUMMARY_EXCHANGES = int(os.getenv("ROLLING_SUMMARY_EXCHANGES", "5")) + self.ROLLING_SUMMARY_MODE = os.getenv("ROLLING_SUMMARY_MODE", "replace") + # New: maximum number of rolling summary entries to retain when in append mode (0 = unlimited) + self.ROLLING_SUMMARY_MAX_ENTRIES = int(os.getenv("ROLLING_SUMMARY_MAX_ENTRIES", "10")) + self.DUPLICATE_DETECTION_MODE = os.getenv("DUPLICATE_DETECTION_MODE", "basic") + self.AI_HUNTER_THRESHOLD = int(os.getenv("AI_HUNTER_THRESHOLD", "75")) + self.TRANSLATION_HISTORY_ROLLING = os.getenv("TRANSLATION_HISTORY_ROLLING", "0") == "1" + self.API_KEY = (os.getenv("API_KEY") or + os.getenv("OPENAI_API_KEY") or + os.getenv("OPENAI_OR_Gemini_API_KEY") or + os.getenv("GEMINI_API_KEY")) + # NEW: Simple chapter number offset + 
self.CHAPTER_NUMBER_OFFSET = int(os.getenv("CHAPTER_NUMBER_OFFSET", "0")) + self.ENABLE_WATERMARK_REMOVAL = os.getenv("ENABLE_WATERMARK_REMOVAL", "1") == "1" + self.SAVE_CLEANED_IMAGES = os.getenv("SAVE_CLEANED_IMAGES", "1") == "1" + self.WATERMARK_PATTERN_THRESHOLD = int(os.getenv("WATERMARK_PATTERN_THRESHOLD", "10")) + self.WATERMARK_CLAHE_LIMIT = float(os.getenv("WATERMARK_CLAHE_LIMIT", "3.0")) + self.COMPRESSION_FACTOR = float(os.getenv("COMPRESSION_FACTOR", "1.0")) + + # Multi API key support + self.use_multi_api_keys = os.environ.get('USE_MULTI_API_KEYS', '0') == '1' + self.multi_api_keys = [] + + if self.use_multi_api_keys: + multi_keys_json = os.environ.get('MULTI_API_KEYS', '[]') + try: + self.multi_api_keys = json.loads(multi_keys_json) + print(f"Loaded {len(self.multi_api_keys)} API keys for multi-key mode") + except Exception as e: + print(f"Failed to load multi API keys: {e}") + self.use_multi_api_keys = False + + +# ===================================================== +# UNIFIED PATTERNS AND CONSTANTS +# ===================================================== +class PatternManager: + """Centralized pattern management""" + + CHAPTER_PATTERNS = [ + # English patterns + (r'chapter[\s_-]*(\d+)', re.IGNORECASE, 'english_chapter'), + (r'\bch\.?\s*(\d+)\b', re.IGNORECASE, 'english_ch'), + (r'part[\s_-]*(\d+)', re.IGNORECASE, 'english_part'), + (r'episode[\s_-]*(\d+)', re.IGNORECASE, 'english_episode'), + # Chinese patterns + (r'第\s*(\d+)\s*[章节話话回]', 0, 'chinese_chapter'), + (r'第\s*([一二三四五六七八九十百千万]+)\s*[章节話话回]', 0, 'chinese_chapter_cn'), + (r'(\d+)[章节話话回]', 0, 'chinese_short'), + # Japanese patterns + (r'第\s*(\d+)\s*話', 0, 'japanese_wa'), + (r'第\s*(\d+)\s*章', 0, 'japanese_chapter'), + (r'その\s*(\d+)', 0, 'japanese_sono'), + (r'(\d+)話目', 0, 'japanese_wame'), + # Korean patterns + (r'제\s*(\d+)\s*[장화권부편]', 0, 'korean_chapter'), + (r'(\d+)\s*[장화권부편]', 0, 'korean_short'), + (r'에피소드\s*(\d+)', 0, 'korean_episode'), + # Generic numeric patterns + (r'^\s*(\d+)\s*[-–—.\:]', re.MULTILINE, 'generic_numbered'), + (r'_(\d+)\.x?html?$', re.IGNORECASE, 'filename_number'), + (r'/(\d+)\.x?html?$', re.IGNORECASE, 'path_number'), + (r'(\d+)', 0, 'any_number'), + ] + + FILENAME_EXTRACT_PATTERNS = [ + # IMPORTANT: More specific patterns MUST come first + r'^\d{3}(\d)_(\d{2})_\.x?html?$', # Captures both parts for decimal: group1.group2 + r'^\d{4}_(\d+)\.x?html?$', # "0000_1.xhtml" - extracts 1, not 0000 + r'^\d+_(\d+)[_\.]', # Any digits followed by underscore then capture next digits + r'^(\d+)[_\.]', # Standard: "0249_" or "0249." + r'response_(\d+)_', # Standard pattern: response_001_ + r'response_(\d+)\.', # Pattern: response_001. 
+ r'(\d{3,5})[_\.]', # 3-5 digit pattern with padding + r'[Cc]hapter[_\s]*(\d+)', # Chapter word pattern + r'[Cc]h[_\s]*(\d+)', # Ch abbreviation + r'No(\d+)Chapter', # No prefix with Chapter - matches "No00013Chapter.xhtml" + r'No(\d+)Section', # No prefix with Section - matches "No00013Section.xhtml" + r'No(\d+)(?=\.|_|$)', # No prefix followed by end, dot, or underscore (not followed by text) + r'第(\d+)[章话回]', # Chinese chapter markers + r'_(\d+)(?:_|\.|$)', # Number between underscores or at end + r'^(\d+)(?:_|\.|$)', # Starting with number + r'(\d+)', # Any number (fallback) + ] + + CJK_HONORIFICS = { + 'korean': [ + # Modern honorifics + '님', '씨', '선배', '후배', '동기', '형', '누나', '언니', '오빠', '동생', + '선생님', '교수님', '박사님', '사장님', '회장님', '부장님', '과장님', '대리님', + '팀장님', '실장님', '이사님', '전무님', '상무님', '부사장님', '고문님', + + # Classical/formal honorifics + '공', '옹', '군', '양', '낭', '랑', '생', '자', '부', '모', '시', '제', '족하', + + # Royal/noble address forms + '마마', '마노라', '대감', '영감', '나리', '도령', '낭자', '아씨', '규수', + '각하', '전하', '폐하', '저하', '합하', '대비', '대왕', '왕자', '공주', + + # Buddhist/religious + '스님', '사부님', '조사님', '큰스님', '화상', '대덕', '대사', '법사', + '선사', '율사', '보살님', '거사님', '신부님', '목사님', '장로님', '집사님', + + # Confucian/scholarly + '부자', '선생', '대인', '어른', '어르신', '존자', '현자', '군자', '대부', + '학사', '진사', '문하생', '제자', + + # Kinship honorifics + '어르신', '할아버님', '할머님', '아버님', '어머님', '형님', '누님', + '아주버님', '아주머님', '삼촌', '이모님', '고모님', '외삼촌', '장인어른', + '장모님', '시아버님', '시어머님', '처남', '처형', '매형', '손님', + + # Verb-based honorific endings and speech levels + '습니다', 'ㅂ니다', '습니까', 'ㅂ니까', '시다', '세요', '셔요', '십시오', '시오', + '이에요', '예요', '이예요', '에요', '어요', '아요', '여요', '해요', '이세요', '으세요', + '으시', '시', '으십니다', '십니다', '으십니까', '십니까', '으셨', '셨', + '드립니다', '드려요', '드릴게요', '드리겠습니다', '올립니다', '올려요', + '사옵니다', '사뢰', '여쭙니다', '여쭤요', '아뢰', '뵙니다', '뵈요', '모십니다', + '시지요', '시죠', '시네요', '시는군요', '시는구나', '으실', '실', + '드시다', '잡수시다', '주무시다', '계시다', '가시다', '오시다', + + # Common verb endings with 있다/없다/하다 + '있어요', '있습니다', '있으세요', '있으십니까', '없어요', '없습니다', '없으세요', + '해요', '합니다', '하세요', '하십시오', '하시죠', '하시네요', '했어요', '했습니다', + '되세요', '되셨어요', '되십니다', '됩니다', '되요', '돼요', + '이야', '이네', '이구나', '이군', '이네요', '인가요', '인가', '일까요', '일까', + '거예요', '거에요', '겁니다', '건가요', '게요', '을게요', '을까요', '었어요', '었습니다', + '겠습니다', '겠어요', '겠네요', '을겁니다', '을거예요', '을거에요', + + # Common endings + '요', '죠', '네요', '는데요', '거든요', '니까', '으니까', '는걸요', '군요', '구나', + '는구나', '는군요', '더라고요', '더군요', '던데요', '나요', '가요', '까요', + '라고요', '다고요', '냐고요', '자고요', '란다', '단다', '냔다', '잔다', + + # Formal archaic endings + '나이다', '사옵나이다', '옵니다', '오', '소서', '으오', '으옵소서', '사이다', + '으시옵니다', '시옵니다', '으시옵니까', '시옵니까', '나이까', '리이까', '리이다', + '옵소서', '으소서', '소이다', '로소이다', '이옵니다', '이올시다', '하옵니다' + ], + 'japanese': [ + # Modern honorifics + 'さん', 'ちゃん', '君', 'くん', '様', 'さま', '先生', 'せんせい', '殿', 'どの', '先輩', 'せんぱい', + # Classical/historical + '氏', 'し', '朝臣', 'あそん', '宿禰', 'すくね', '連', 'むらじ', '臣', 'おみ', '君', 'きみ', + '真人', 'まひと', '道師', 'みちのし', '稲置', 'いなぎ', '直', 'あたい', '造', 'みやつこ', + # Court titles + '卿', 'きょう', '大夫', 'たいふ', '郎', 'ろう', '史', 'し', '主典', 'さかん', + # Buddhist titles + '和尚', 'おしょう', '禅師', 'ぜんじ', '上人', 'しょうにん', '聖人', 'しょうにん', + '法師', 'ほうし', '阿闍梨', 'あじゃり', '大和尚', 'だいおしょう', + # Shinto titles + '大宮司', 'だいぐうじ', '宮司', 'ぐうじ', '禰宜', 'ねぎ', '祝', 'はふり', + # Samurai era + '守', 'かみ', '介', 'すけ', '掾', 'じょう', '目', 'さかん', '丞', 'じょう', + # Keigo (honorific language) verb forms + 'です', 'ます', 'ございます', 'いらっしゃる', 'いらっしゃいます', 'おっしゃる', 'おっしゃいます', + 'なさる', 'なさいます', 'くださる', 'くださいます', 'いただく', 'いただきます', + 'おります', 'でございます', 'ございません', 'いたします', 'いたしました', + '申す', '申します', 
'申し上げる', '申し上げます', '存じる', '存じます', '存じ上げる', + '伺う', '伺います', '参る', '参ります', 'お目にかかる', 'お目にかかります', + '拝見', '拝見します', '拝聴', '拝聴します', '承る', '承ります', + # Respectful prefixes/suffixes + 'お', 'ご', '御', 'み', '美', '貴', '尊' + ], + 'chinese': [ + # Modern forms + '先生', '小姐', '夫人', '公子', '大人', '老师', '师父', '师傅', '同志', '同学', + # Ancient/classical forms + '子', '丈', '翁', '公', '侯', '伯', '叔', '仲', '季', '父', '甫', '卿', '君', '生', + # Imperial court + '陛下', '殿下', '千岁', '万岁', '圣上', '皇上', '天子', '至尊', '御前', '爷', + # Nobility/officials + '阁下', '大人', '老爷', '相公', '官人', '郎君', '娘子', '夫子', '足下', + # Religious titles + '上人', '法师', '禅师', '大师', '高僧', '圣僧', '神僧', '活佛', '仁波切', + '真人', '天师', '道长', '道友', '仙长', '上仙', '祖师', '掌教', + # Scholarly/Confucian + '夫子', '圣人', '贤人', '君子', '大儒', '鸿儒', '宗师', '泰斗', '巨擘', + # Martial arts + '侠士', '大侠', '少侠', '女侠', '英雄', '豪杰', '壮士', '义士', + # Family/kinship + '令尊', '令堂', '令郎', '令爱', '贤弟', '贤侄', '愚兄', '小弟', '家父', '家母', + # Humble forms + '在下', '小人', '鄙人', '不才', '愚', '某', '仆', '妾', '奴', '婢', + # Polite verbal markers + '请', '请问', '敢问', '恭请', '敬请', '烦请', '有请', '请教', '赐教', + '惠顾', '惠赐', '惠存', '笑纳', '雅正', '指正', '斧正', '垂询', + '拜', '拜见', '拜访', '拜读', '拜托', '拜谢', '敬上', '谨上', '顿首' + ], + 'english': [ + # Modern Korean romanizations (Revised Romanization of Korean - 2000) + '-nim', '-ssi', '-seonbae', '-hubae', '-donggi', '-hyeong', '-nuna', + '-eonni', '-oppa', '-dongsaeng', '-seonsaengnim', '-gyosunim', + '-baksanim', '-sajangnim', '-hoejangnim', '-bujangnim', '-gwajangnim', + '-daerim', '-timjangnim', '-siljangnim', '-isanim', '-jeonmunim', + '-sangmunim', '-busajangnim', '-gomunnim', + + # Classical/formal Korean romanizations + '-gong', '-ong', '-gun', '-yang', '-nang', '-rang', '-saeng', '-ja', + '-bu', '-mo', '-si', '-je', '-jokha', + + # Royal/noble Korean romanizations + '-mama', '-manora', '-daegam', '-yeonggam', '-nari', '-doryeong', + '-nangja', '-assi', '-gyusu', '-gakha', '-jeonha', '-pyeha', '-jeoha', + '-hapka', '-daebi', '-daewang', '-wangja', '-gongju', + + # Buddhist/religious Korean romanizations + '-seunim', '-sabunim', '-josanim', '-keunseunim', '-hwasang', + '-daedeok', '-daesa', '-beopsa', '-seonsa', '-yulsa', '-bosalnim', + '-geosanim', '-sinbunim', '-moksanim', '-jangnonim', '-jipsanim', + + # Confucian/scholarly Korean romanizations + '-buja', '-seonsaeng', '-daein', '-eoreun', '-eoreusin', '-jonja', + '-hyeonja', '-gunja', '-daebu', '-haksa', '-jinsa', '-munhasaeng', '-jeja', + + # Kinship Korean romanizations + '-harabeonim', '-halmeonim', '-abeonim', '-eomeonim', '-hyeongnim', + '-nunim', '-ajubeonim', '-ajumeonim', '-samchon', '-imonim', '-gomonim', + '-oesamchon', '-jangineoreun', '-jangmonim', '-siabeonim', '-sieomeonim', + '-cheonam', '-cheohyeong', '-maehyeong', '-sonnim', + + # Korean verb endings romanized (Revised Romanization) + '-seumnida', '-mnida', '-seumnikka', '-mnikka', '-sida', '-seyo', + '-syeoyo', '-sipsio', '-sio', '-ieyo', '-yeyo', '-iyeyo', '-eyo', + '-eoyo', '-ayo', '-yeoyo', '-haeyo', '-iseyo', '-euseyo', + '-eusi', '-si', '-eusimnida', '-simnida', '-eusimnikka', '-simnikka', + '-eusyeot', '-syeot', '-deurimnida', '-deuryeoyo', '-deurilgeyo', + '-deurigesseumnida', '-ollimnida', '-ollyeoyo', '-saomnida', '-saroe', + '-yeojjumnida', '-yeojjwoyo', '-aroe', '-boemnida', '-boeyo', '-mosimnida', + '-sijiyo', '-sijyo', '-sineyo', '-sineungunyo', '-sineunguna', '-eusil', '-sil', + '-deusida', '-japsusida', '-jumusida', '-gyesida', '-gasida', '-osida', + + # Common Korean verb endings romanized + '-isseoyo', '-isseumnida', '-isseuseyo', '-isseusimnikka', + 
'-eopseoyo', '-eopseumnida', '-eopseuseyo', '-hamnida', '-haseyo', + '-hasipsio', '-hasijyo', '-hasineyo', '-haesseoyo', '-haesseumnida', + '-doeseyo', '-doesyeosseoyo', '-doesimnida', '-doemnida', '-doeyo', '-dwaeyo', + '-iya', '-ine', '-iguna', '-igun', '-ineyo', '-ingayo', '-inga', + '-ilkkayo', '-ilkka', '-geoyeyo', '-geoeyo', '-geomnida', '-geongayo', + '-geyo', '-eulgeyo', '-eulkkayo', '-eosseoyo', '-eosseumnida', + '-gesseumnida', '-gesseoyo', '-genneyo', '-eulgeommida', '-eulgeoyeyo', '-eulgeoeyo', + + # Common Korean endings romanized + '-yo', '-jyo', '-neyo', '-neundeyo', '-geodeunyo', '-nikka', + '-eunikka', '-neungeolyo', '-gunyo', '-guna', '-neunguna', '-neungunyo', + '-deoragoyo', '-deogunyo', '-deondeyo', '-nayo', '-gayo', '-kkayo', + '-ragoyo', '-dagoyo', '-nyagoyo', '-jagoyo', '-randa', '-danda', + '-nyanda', '-janda', + + # Formal archaic Korean romanized + '-naida', '-saomnaida', '-omnida', '-o', '-soseo', '-euo', + '-euopsoseo', '-saida', '-eusiomnida', '-siomnida', '-eusiomnikka', + '-siomnikka', '-naikka', '-riikka', '-riida', '-opsoseo', '-eusoseo', + '-soida', '-rosoida', '-iomnida', '-iolsida', '-haomnida', + + # Japanese keigo romanized (keeping existing) + '-san', '-chan', '-kun', '-sama', '-sensei', '-senpai', '-dono', + '-shi', '-tan', '-chin', '-desu', '-masu', '-gozaimasu', + '-irassharu', '-irasshaimasu', '-ossharu', '-osshaimasu', + '-nasaru', '-nasaimasu', '-kudasaru', '-kudasaimasu', '-itadaku', + '-itadakimasu', '-orimasu', '-degozaimasu', '-gozaimasen', + '-itashimasu', '-itashimashita', '-mousu', '-moushimasu', + '-moushiageru', '-moushiagemasu', '-zonjiru', '-zonjimasu', + '-ukagau', '-ukagaimasu', '-mairu', '-mairimasu', '-haiken', + '-haikenshimasu', + + # Chinese romanizations (keeping existing) + '-xiong', '-di', '-ge', '-gege', '-didi', '-jie', '-jiejie', + '-meimei', '-shixiong', '-shidi', '-shijie', '-shimei', '-gongzi', + '-guniang', '-xiaojie', '-daren', '-qianbei', '-daoyou', '-zhanglao', + '-shibo', '-shishu', '-shifu', '-laoshi', '-xiansheng', '-daxia', + '-shaoxia', '-nvxia', '-jushi', '-shanren', '-dazhang', '-zhenren', + + # Ancient Chinese romanizations + '-zi', '-gong', '-hou', '-bo', '-jun', '-qing', '-weng', '-fu', + '-sheng', '-lang', '-langjun', '-niangzi', '-furen', '-gege', + '-jiejie', '-yeye', '-nainai', + + # Chinese politeness markers romanized + '-qing', '-jing', '-gong', '-hui', '-ci', '-bai', '-gan', '-chui', + 'qingwen', 'ganwen', 'gongjing', 'jingjing', 'baijian', 'baifang', + 'baituo' + ] + } + + TITLE_PATTERNS = { + 'korean': [ + # Modern titles + r'\b(왕|여왕|왕자|공주|황제|황후|대왕|대공|공작|백작|자작|남작|기사|장군|대장|원수|제독|함장|대신|재상|총리|대통령|시장|지사|검사|판사|변호사|의사|박사|교수|신부|목사|스님|도사)\b', + r'\b(폐하|전하|각하|예하|님|대감|영감|나리|도련님|아가씨|부인|선생)\b', + # Historical/classical titles + r'\b(대왕|태왕|왕비|왕후|세자|세자빈|대군|군|옹주|공주|부마|원자|원손)\b', + r'\b(영의정|좌의정|우의정|판서|참판|참의|정승|판사|사또|현령|군수|목사|부사)\b', + r'\b(대제학|제학|대사간|사간|대사헌|사헌|도승지|승지|한림|사관|내시|환관)\b', + r'\b(병조판서|이조판서|호조판서|예조판서|형조판서|공조판서)\b', + r'\b(도원수|부원수|병마절도사|수군절도사|첨절제사|만호|천호|백호)\b', + r'\b(정일품|종일품|정이품|종이품|정삼품|종삼품|정사품|종사품|정오품|종오품)\b', + # Korean honorific verb endings patterns + r'(습니다|ㅂ니다|습니까|ㅂ니까|세요|셔요|십시오|시오)$', + r'(이에요|예요|이예요|에요|어요|아요|여요|해요)$', + r'(으시|시)(었|겠|ㄹ|을|는|던)*(습니다|ㅂ니다|어요|아요|세요)', + r'(드립니다|드려요|드릴게요|드리겠습니다|올립니다|올려요)$', + r'(사옵니다|여쭙니다|여쭤요|뵙니다|뵈요|모십니다)$', + r'(나이다|사옵나이다|옵니다|으오|으옵소서|사이다)$' + ], + 'japanese': [ + # Modern titles + r'\b(王|女王|王子|姫|皇帝|皇后|天皇|皇太子|大王|大公|公爵|伯爵|子爵|男爵|騎士|将軍|大将|元帥|提督|艦長|大臣|宰相|総理|大統領|市長|知事|検事|裁判官|弁護士|医者|博士|教授|神父|牧師|僧侶|道士)\b', + r'\b(陛下|殿下|閣下|猊下|様|大人|殿|卿|君|氏)\b', + # Historical titles + 
r'\b(天皇|皇后|皇太子|親王|内親王|王|女王|太政大臣|左大臣|右大臣|内大臣|大納言|中納言|参議)\b', + r'\b(関白|摂政|征夷大将軍|管領|執権|守護|地頭|代官|奉行|与力|同心)\b', + r'\b(太政官|神祇官|式部省|治部省|民部省|兵部省|刑部省|大蔵省|宮内省)\b', + r'\b(大僧正|僧正|大僧都|僧都|律師|大法師|法師|大禅師|禅師)\b', + r'\b(正一位|従一位|正二位|従二位|正三位|従三位|正四位|従四位|正五位|従五位)\b', + r'\b(大和守|山城守|摂津守|河内守|和泉守|伊賀守|伊勢守|尾張守|三河守|遠江守)\b', + # Japanese keigo (honorific language) patterns + r'(です|ます|ございます)$', + r'(いらっしゃ|おっしゃ|なさ|くださ)(います|いました|る|った)$', + r'(いただ|お|ご|御)(き|きます|きました|く|ける|けます)', + r'(申し上げ|申し|存じ上げ|存じ|伺い|参り)(ます|ました|る)$', + r'(拝見|拝聴|承り|承)(します|しました|いたします|いたしました)$', + r'お[^あ-ん]+[になる|になります|くださる|くださいます]' + ], + 'chinese': [ + # Modern titles + r'\b(王|女王|王子|公主|皇帝|皇后|大王|大公|公爵|伯爵|子爵|男爵|骑士|将军|大将|元帅|提督|舰长|大臣|宰相|总理|大总统|市长|知事|检察官|法官|律师|医生|博士|教授|神父|牧师|和尚|道士)\b', + r'\b(陛下|殿下|阁下|大人|老爷|夫人|小姐|公子|少爷|姑娘|先生)\b', + # Imperial titles + r'\b(天子|圣上|皇上|万岁|万岁爷|太上皇|皇太后|太后|皇后|贵妃|妃|嫔|贵人|常在|答应)\b', + r'\b(太子|皇子|皇孙|亲王|郡王|贝勒|贝子|公主|格格|郡主|县主|郡君|县君)\b', + # Ancient official titles + r'\b(丞相|相国|太师|太傅|太保|太尉|司徒|司空|大司马|大司农|大司寇)\b', + r'\b(尚书|侍郎|郎中|员外郎|主事|知府|知州|知县|同知|通判|推官|巡抚|总督)\b', + r'\b(御史大夫|御史中丞|监察御史|给事中|都察院|翰林院|国子监|钦天监)\b', + r'\b(大学士|学士|侍读|侍讲|编修|检讨|庶吉士|举人|进士|状元|榜眼|探花)\b', + # Military ranks + r'\b(大元帅|元帅|大将军|将军|都督|都指挥使|指挥使|千户|百户|总兵|副将|参将|游击|都司|守备)\b', + r'\b(提督|总兵官|副总兵|参将|游击将军|都司|守备|千总|把总|外委)\b', + # Religious titles + r'\b(国师|帝师|法王|活佛|堪布|仁波切|大和尚|方丈|住持|首座|维那|知客)\b', + r'\b(天师|真人|道长|掌教|监院|高功|都讲|总理|提点|知观)\b', + # Nobility ranks + r'\b(公|侯|伯|子|男|开国公|郡公|国公|郡侯|县侯|郡伯|县伯|县子|县男)\b', + r'\b(一品|二品|三品|四品|五品|六品|七品|八品|九品|正一品|从一品|正二品|从二品)\b', + # Chinese politeness markers + r'(请|敢|恭|敬|烦|有)(问|请|赐|教|告|示)', + r'(拜|惠|赐|垂|雅|笑)(见|访|读|托|谢|顾|赐|存|纳|正|询)', + r'(敬|谨|顿)(上|呈|启|白|首)' + ], + 'english': [ + # Western titles + r'\b(King|Queen|Prince|Princess|Emperor|Empress|Duke|Duchess|Marquis|Marquess|Earl|Count|Countess|Viscount|Viscountess|Baron|Baroness|Knight|Lord|Lady|Sir|Dame|General|Admiral|Captain|Major|Colonel|Commander|Lieutenant|Sergeant|Minister|Chancellor|President|Mayor|Governor|Judge|Doctor|Professor|Father|Reverend|Master|Mistress)\b', + r'\b(His|Her|Your|Their)\s+(Majesty|Highness|Grace|Excellency|Honor|Worship|Lordship|Ladyship)\b', + # Romanized historical titles + r'\b(Tianzi|Huangdi|Huanghou|Taizi|Qinwang|Junwang|Beile|Beizi|Gongzhu|Gege)\b', + r'\b(Chengxiang|Zaixiang|Taishi|Taifu|Taibao|Taiwei|Situ|Sikong|Dasima)\b', + r'\b(Shogun|Daimyo|Samurai|Ronin|Ninja|Tenno|Mikado|Kampaku|Sessho)\b', + r'\b(Taewang|Wangbi|Wanghu|Seja|Daegun|Gun|Ongju|Gongju|Buma)\b' + ] + } + + # Expanded Chinese numbers including classical forms + CHINESE_NUMS = { + # Basic numbers + '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, + '六': 6, '七': 7, '八': 8, '九': 9, '十': 10, + '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, + '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20, + '二十一': 21, '二十二': 22, '二十三': 23, '二十四': 24, '二十五': 25, + '三十': 30, '四十': 40, '五十': 50, '六十': 60, + '七十': 70, '八十': 80, '九十': 90, '百': 100, + # Classical/formal numbers + '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, + '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10, + '佰': 100, '仟': 1000, '萬': 10000, '万': 10000, + # Ordinal indicators + '第一': 1, '第二': 2, '第三': 3, '第四': 4, '第五': 5, + '首': 1, '次': 2, '初': 1, '末': -1, + } + + # Common words - keeping the same for filtering + COMMON_WORDS = { + '이', '그', '저', '우리', '너희', '자기', '당신', '여기', '거기', '저기', + '오늘', '내일', '어제', '지금', '아까', '나중', '먼저', '다음', '마지막', + '모든', '어떤', '무슨', '이런', '그런', '저런', '같은', '다른', '새로운', + '하다', '있다', '없다', '되다', '하는', '있는', '없는', '되는', + '것', '수', '때', '년', '월', '일', '시', '분', '초', + '은', '는', '이', '가', '을', '를', '에', '의', '와', '과', '도', '만', + '에서', '으로', '로', 
'까지', '부터', '에게', '한테', '께', '께서', + 'この', 'その', 'あの', 'どの', 'これ', 'それ', 'あれ', 'どれ', + 'わたし', 'あなた', 'かれ', 'かのじょ', 'わたしたち', 'あなたたち', + 'きょう', 'あした', 'きのう', 'いま', 'あとで', 'まえ', 'つぎ', + 'の', 'は', 'が', 'を', 'に', 'で', 'と', 'も', 'や', 'から', 'まで', + '这', '那', '哪', '这个', '那个', '哪个', '这里', '那里', '哪里', + '我', '你', '他', '她', '它', '我们', '你们', '他们', '她们', + '今天', '明天', '昨天', '现在', '刚才', '以后', '以前', '后来', + '的', '了', '在', '是', '有', '和', '与', '或', '但', '因为', '所以', + '一', '二', '三', '四', '五', '六', '七', '八', '九', '十', + '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', + } +# ===================================================== +# CHUNK CONTEXT MANAGER (unchanged - already optimal) +# ===================================================== +class ChunkContextManager: + """Manage context within a chapter separate from history""" + def __init__(self): + self.current_chunks = [] + self.chapter_num = None + self.chapter_title = None + + def start_chapter(self, chapter_num, chapter_title): + """Start a new chapter context""" + self.current_chunks = [] + self.chapter_num = chapter_num + self.chapter_title = chapter_title + + def add_chunk(self, user_content, assistant_content, chunk_idx, total_chunks): + """Add a chunk to the current chapter context""" + self.current_chunks.append({ + "user": user_content, + "assistant": assistant_content, + "chunk_idx": chunk_idx, + "total_chunks": total_chunks + }) + + def get_context_messages(self, limit=3): + """Get last N chunks as messages for API context""" + context = [] + for chunk in self.current_chunks[-limit:]: + context.extend([ + {"role": "user", "content": chunk["user"]}, + {"role": "assistant", "content": chunk["assistant"]} + ]) + return context + + def get_summary_for_history(self): + """Create a summary representation for the history""" + if not self.current_chunks: + return None, None + + total_chunks = len(self.current_chunks) + + user_summary = f"[Chapter {self.chapter_num}: {self.chapter_title}]\n" + user_summary += f"[{total_chunks} chunks processed]\n" + if self.current_chunks: + first_chunk = self.current_chunks[0]['user'] + if len(first_chunk) > 500: + user_summary += first_chunk[:500] + "..." + else: + user_summary += first_chunk + + assistant_summary = f"[Chapter {self.chapter_num} Translation Complete]\n" + assistant_summary += f"[Translated in {total_chunks} chunks]\n" + if self.current_chunks: + samples = [] + first_trans = self.current_chunks[0]['assistant'] + samples.append(f"Beginning: {first_trans[:200]}..." if len(first_trans) > 200 else f"Beginning: {first_trans}") + + if total_chunks > 2: + mid_idx = total_chunks // 2 + mid_trans = self.current_chunks[mid_idx]['assistant'] + samples.append(f"Middle: {mid_trans[:200]}..." if len(mid_trans) > 200 else f"Middle: {mid_trans}") + + if total_chunks > 1: + last_trans = self.current_chunks[-1]['assistant'] + samples.append(f"End: {last_trans[:200]}..." 
if len(last_trans) > 200 else f"End: {last_trans}") + + assistant_summary += "\n".join(samples) + + return user_summary, assistant_summary + + def clear(self): + """Clear the current chapter context""" + self.current_chunks = [] + self.chapter_num = None + self.chapter_title = None + +# ===================================================== +# UNIFIED UTILITIES +# ===================================================== +class FileUtilities: + """Utilities for file and path operations""" + + @staticmethod + def extract_actual_chapter_number(chapter, patterns=None, config=None): + """Extract actual chapter number from filename using improved logic""" + + # IMPORTANT: Check if this is a pre-split TEXT FILE chunk first + if (chapter.get('is_chunk', False) and + 'num' in chapter and + isinstance(chapter['num'], float) and + chapter.get('filename', '').endswith('.txt')): + # For text file chunks only, preserve the decimal number + return chapter['num'] # This will be 1.1, 1.2, etc. + + # Get filename for extraction + filename = chapter.get('original_basename') or chapter.get('filename', '') + + # Use our improved extraction function + # Note: We don't have opf_spine_position here, so pass None + actual_num, method = extract_chapter_number_from_filename(filename, opf_spine_position=None) + + # If extraction succeeded, return the result + if actual_num is not None: + #print(f"[DEBUG] Extracted {actual_num} from '{filename}' using method: {method}") + return actual_num + + # Fallback to original complex logic for edge cases + actual_num = None + + if patterns is None: + patterns = PatternManager.FILENAME_EXTRACT_PATTERNS + + # Try to extract from original basename first + if chapter.get('original_basename'): + basename = chapter['original_basename'] + + # Check if decimal chapters are enabled for EPUBs + enable_decimal = os.getenv('ENABLE_DECIMAL_CHAPTERS', '0') == '1' + + # For EPUBs, only check decimal patterns if the toggle is enabled + if enable_decimal: + # Check for standard decimal chapter numbers (e.g., Chapter_1.1, 1.2.html) + decimal_match = re.search(r'(\d+)\.(\d+)', basename) + if decimal_match: + actual_num = float(f"{decimal_match.group(1)}.{decimal_match.group(2)}") + return actual_num + + # Check for the XXXX_YY pattern where it represents X.YY decimal chapters + decimal_prefix_match = re.match(r'^(\d{4})_(\d{1,2})(?:_|\.)?(?:x?html?)?$', basename) + if decimal_prefix_match: + first_part = decimal_prefix_match.group(1) + second_part = decimal_prefix_match.group(2) + + if len(second_part) == 2 and int(second_part) > 9: + chapter_num = int(first_part[-1]) + decimal_part = second_part + actual_num = float(f"{chapter_num}.{decimal_part}") + return actual_num + + # Standard XXXX_Y format handling (existing logic) + prefix_suffix_match = re.match(r'^(\d+)_(\d+)', basename) + if prefix_suffix_match: + second_part = prefix_suffix_match.group(2) + + if not enable_decimal: + actual_num = int(second_part) + return actual_num + else: + if len(second_part) == 1 or (len(second_part) == 2 and int(second_part) <= 9): + actual_num = int(second_part) + return actual_num + + # Check other patterns if no match yet + for pattern in patterns: + if pattern in [r'^(\d+)[_\.]', r'(\d{3,5})[_\.]', r'^(\d+)_']: + continue + match = re.search(pattern, basename, re.IGNORECASE) + if match: + actual_num = int(match.group(1)) + break + + # Final fallback to chapter num + if actual_num is None: + actual_num = chapter.get("num", 0) + print(f"[DEBUG] No pattern matched, using chapter num: {actual_num}") + + return 
actual_num + + @staticmethod + def create_chapter_filename(chapter, actual_num=None): + """Create consistent chapter filename""" + # Check if we should use header as output name + use_header_output = os.getenv("USE_HEADER_AS_OUTPUT", "0") == "1" + + # Check if this is for a text file + is_text_file = chapter.get('filename', '').endswith('.txt') or chapter.get('is_chunk', False) + + # Respect toggle: retain source extension and remove 'response_' prefix + retain = should_retain_source_extension() + + # Helper to compute full original extension chain (e.g., '.html.xhtml') + def _full_ext_from_original(ch): + fn = ch.get('original_filename') + if not fn: + return '.html' + bn = os.path.basename(fn) + root, ext = os.path.splitext(bn) + if not ext: + return '.html' + full_ext = '' + while ext: + full_ext = ext + full_ext + root, ext = os.path.splitext(root) + return full_ext or '.html' + + if use_header_output and chapter.get('title'): + safe_title = make_safe_filename(chapter['title'], actual_num or chapter.get('num', 0)) + if safe_title and safe_title != f"chapter_{actual_num or chapter.get('num', 0):03d}": + if is_text_file: + return f"{safe_title}.txt" if retain else f"response_{safe_title}.txt" + else: + # If retaining, use full original ext chain; else default .html + if retain: + return f"{safe_title}{_full_ext_from_original(chapter)}" + return f"response_{safe_title}.html" + + # Check if decimal chapters are enabled + enable_decimal = os.getenv('ENABLE_DECIMAL_CHAPTERS', '0') == '1' + + # For EPUBs with decimal detection enabled + if enable_decimal and 'original_basename' in chapter and chapter['original_basename']: + basename = chapter['original_basename'] + + # Check for standard decimal pattern (e.g., Chapter_1.1) + decimal_match = re.search(r'(\d+)\.(\d+)', basename) + if decimal_match: + # Create a modified basename that preserves the decimal + base = os.path.splitext(basename)[0] + # Replace dots with underscores for filesystem compatibility + base = base.replace('.', '_') + # Use .txt extension for text files + if is_text_file: + return f"{base}.txt" if retain else f"response_{base}.txt" + else: + if retain: + return f"{base}{_full_ext_from_original(chapter)}" + return f"response_{base}.html" + + # Check for the special XXXX_YY decimal pattern + decimal_prefix_match = re.match(r'^(\d{4})_(\d{1,2})(?:_|\.)?(?:x?html?)?$', basename) + if decimal_prefix_match: + first_part = decimal_prefix_match.group(1) + second_part = decimal_prefix_match.group(2) + + # If this matches our decimal pattern (e.g., 0002_33 -> 2.33) + if len(second_part) == 2 and int(second_part) > 9: + chapter_num = int(first_part[-1]) + decimal_part = second_part + # Create filename reflecting the decimal interpretation + if is_text_file: + return f"{chapter_num:04d}_{decimal_part}.txt" if retain else f"response_{chapter_num:04d}_{decimal_part}.txt" + else: + return f"{chapter_num:04d}_{decimal_part}{_full_ext_from_original(chapter)}" if retain else f"response_{chapter_num:04d}_{decimal_part}.html" + + # Standard EPUB handling - use original basename + if 'original_basename' in chapter and chapter['original_basename']: + base = os.path.splitext(chapter['original_basename'])[0] + # Use .txt extension for text files + if is_text_file: + return f"{base}.txt" if retain else f"response_{base}.txt" + else: + if retain: + # Preserve the full original extension chain + return f"{base}{_full_ext_from_original(chapter)}" + return f"response_{base}.html" + else: + # Text file handling (no original basename) + if actual_num is 
None: + actual_num = chapter.get('actual_chapter_num', chapter.get('num', 0)) + + # Handle decimal chapter numbers from text file splitting + if isinstance(actual_num, float): + major = int(actual_num) + minor = int(round((actual_num - major) * 10)) + if is_text_file: + return f"{major:04d}_{minor}.txt" if retain else f"response_{major:04d}_{minor}.txt" + else: + return f"{major:04d}_{minor}.html" if retain else f"response_{major:04d}_{minor}.html" + else: + if is_text_file: + return f"{actual_num:04d}.txt" if retain else f"response_{actual_num:04d}.txt" + else: + return f"{actual_num:04d}.html" if retain else f"response_{actual_num:04d}.html" + +# ===================================================== +# UNIFIED PROGRESS MANAGER +# ===================================================== +class ProgressManager: + """Unified progress management""" + + def __init__(self, payloads_dir): + self.payloads_dir = payloads_dir + self.PROGRESS_FILE = os.path.join(payloads_dir, "translation_progress.json") + self.prog = self._init_or_load() + + def _init_or_load(self): + """Initialize or load progress tracking with improved structure""" + if os.path.exists(self.PROGRESS_FILE): + try: + with open(self.PROGRESS_FILE, "r", encoding="utf-8") as pf: + prog = json.load(pf) + except json.JSONDecodeError as e: + print(f"⚠️ Warning: Progress file is corrupted: {e}") + print("🔧 Attempting to fix JSON syntax...") + + try: + with open(self.PROGRESS_FILE, "r", encoding="utf-8") as pf: + content = pf.read() + + content = re.sub(r',\s*\]', ']', content) + content = re.sub(r',\s*\}', '}', content) + + prog = json.loads(content) + + with open(self.PROGRESS_FILE, "w", encoding="utf-8") as pf: + json.dump(prog, pf, ensure_ascii=False, indent=2) + print("✅ Successfully fixed and saved progress file") + + except Exception as fix_error: + print(f"❌ Could not fix progress file: {fix_error}") + print("🔄 Creating backup and starting fresh...") + + backup_name = f"translation_progress_backup_{int(time.time())}.json" + backup_path = os.path.join(self.payloads_dir, backup_name) + try: + shutil.copy(self.PROGRESS_FILE, backup_path) + print(f"📁 Backup saved to: {backup_name}") + except: + pass + + prog = { + "chapters": {}, + "chapter_chunks": {}, + "version": "2.0" + } + + if "chapters" not in prog: + prog["chapters"] = {} + + for idx in prog.get("completed", []): + prog["chapters"][str(idx)] = { + "status": "completed", + "timestamp": None + } + + if "chapter_chunks" not in prog: + prog["chapter_chunks"] = {} + + else: + prog = { + "chapters": {}, + "chapter_chunks": {}, + "image_chunks": {}, + "version": "2.1" + } + + return prog + + def save(self): + """Save progress to file""" + try: + self.prog["completed_list"] = [] + for chapter_key, chapter_info in self.prog.get("chapters", {}).items(): + if chapter_info.get("status") == "completed" and chapter_info.get("output_file"): + self.prog["completed_list"].append({ + "num": chapter_info.get("chapter_num", 0), + "idx": chapter_info.get("chapter_idx", 0), + "title": f"Chapter {chapter_info.get('chapter_num', 0)}", + "file": chapter_info.get("output_file", ""), + "key": chapter_key + }) + + if self.prog.get("completed_list"): + self.prog["completed_list"].sort(key=lambda x: x["num"]) + + temp_file = self.PROGRESS_FILE + '.tmp' + with open(temp_file, "w", encoding="utf-8") as pf: + json.dump(self.prog, pf, ensure_ascii=False, indent=2) + + if os.path.exists(self.PROGRESS_FILE): + os.remove(self.PROGRESS_FILE) + os.rename(temp_file, self.PROGRESS_FILE) + except Exception as e: + 
print(f"⚠️ Warning: Failed to save progress: {e}") + temp_file = self.PROGRESS_FILE + '.tmp' + if os.path.exists(temp_file): + try: + os.remove(temp_file) + except: + pass + + def update(self, idx, actual_num, content_hash, output_file, status="in_progress", ai_features=None, raw_num=None): + """Update progress for a chapter""" + # CHANGE THIS LINE - Use actual_num instead of idx + chapter_key = str(actual_num) # WAS: chapter_key = str(idx) + + chapter_info = { + "actual_num": actual_num, + "content_hash": content_hash, + "output_file": output_file, + "status": status, + "last_updated": time.time() + } + + # Add raw number tracking + if raw_num is not None: + chapter_info["raw_chapter_num"] = raw_num + + # Check if zero detection was disabled + if hasattr(builtins, '_DISABLE_ZERO_DETECTION') and builtins._DISABLE_ZERO_DETECTION: + chapter_info["zero_adjusted"] = False + else: + chapter_info["zero_adjusted"] = (raw_num != actual_num) if raw_num is not None else False + + # FIXED: Store AI features if provided + if ai_features is not None: + chapter_info["ai_features"] = ai_features + + # Preserve existing AI features if not overwriting + elif chapter_key in self.prog["chapters"] and "ai_features" in self.prog["chapters"][chapter_key]: + chapter_info["ai_features"] = self.prog["chapters"][chapter_key]["ai_features"] + + self.prog["chapters"][chapter_key] = chapter_info + + def check_chapter_status(self, chapter_idx, actual_num, content_hash, output_dir, chapter_obj=None): + """Check if a chapter needs translation""" + + chapter_key = str(actual_num) + + # Check if we have tracking for this chapter + if chapter_key in self.prog["chapters"]: + chapter_info = self.prog["chapters"][chapter_key] + status = chapter_info.get("status") + + # Failed statuses ALWAYS trigger retranslation + if status in ["qa_failed", "failed", "error", "file_missing"]: + return True, None, None + + # Completed - check file exists + if status in ["completed", "completed_empty", "completed_image_only"]: + output_file = chapter_info.get("output_file") + if output_file: + output_path = os.path.join(output_dir, output_file) + if os.path.exists(output_path): + return False, f"Chapter {actual_num} already translated: {output_file}", output_file + + # File missing - retranslate + del self.prog["chapters"][chapter_key] + if chapter_key in self.prog.get("chapter_chunks", {}): + del self.prog["chapter_chunks"][chapter_key] + self.save() + return True, None, None + + # Any other status - retranslate + return True, None, None + + # BEFORE auto-discovery, check if ANY entry exists for this chapter's file + if chapter_obj: + from TransateKRtoEN import FileUtilities + output_filename = FileUtilities.create_chapter_filename(chapter_obj, actual_num) + + # Check if ANY entry has this output file + for key, info in self.prog["chapters"].items(): + if info.get("output_file") == output_filename: + # Entry exists somewhere else - don't auto-discover + return True, None, None + + # NOW check if file exists for auto-discovery + output_path = os.path.join(output_dir, output_filename) + if os.path.exists(output_path): + print(f"📁 Found existing file for chapter {actual_num}: {output_filename}") + + self.prog["chapters"][chapter_key] = { + "actual_num": actual_num, + "content_hash": content_hash, + "output_file": output_filename, + "status": "completed", + "last_updated": os.path.getmtime(output_path), + "auto_discovered": True + } + + self.save() + return False, f"Chapter {actual_num} already exists: {output_filename}", output_filename + + # No 
entry and no file - needs translation + return True, None, None + + def cleanup_missing_files(self, output_dir): + """Remove missing files and duplicates - NO RESTORATION BULLSHIT""" + cleaned_count = 0 + + # Remove entries for missing files + for chapter_key, chapter_info in list(self.prog["chapters"].items()): + output_file = chapter_info.get("output_file") + + if output_file: + output_path = os.path.join(output_dir, output_file) + if not os.path.exists(output_path): + print(f"🗑️ Removing entry for missing file: {output_file}") + + # Delete the entry + del self.prog["chapters"][chapter_key] + + # Remove chunk data + if chapter_key in self.prog.get("chapter_chunks", {}): + del self.prog["chapter_chunks"][chapter_key] + + cleaned_count += 1 + + if cleaned_count > 0: + print(f"🔄 Removed {cleaned_count} entries - will retranslate") + + def migrate_to_content_hash(self, chapters): + """Change keys to match actual_num values for proper mapping and sort by chapter number""" + + new_chapters = {} + migrated_count = 0 + + for old_key, chapter_info in self.prog["chapters"].items(): + actual_num = chapter_info.get("actual_num") + + if actual_num is not None: + new_key = str(actual_num) + + # If key needs to change + if old_key != new_key: + print(f" Migrating: key '{old_key}' → '{new_key}' (actual_num: {actual_num})") + migrated_count += 1 + + # Check for collision + if new_key in new_chapters: + print(f" ⚠️ Warning: Key '{new_key}' already exists, keeping newer entry") + if chapter_info.get("last_updated", 0) > new_chapters[new_key].get("last_updated", 0): + new_chapters[new_key] = chapter_info + else: + new_chapters[new_key] = chapter_info + else: + # Key already matches actual_num + new_chapters[old_key] = chapter_info + else: + # No actual_num, keep as-is + print(f" ⚠️ Warning: No actual_num for key '{old_key}', keeping as-is") + new_chapters[old_key] = chapter_info + + # Sort chapters by actual_num field, then by key as fallback + def sort_key(item): + key, chapter_info = item + actual_num = chapter_info.get("actual_num") + if actual_num is not None: + return actual_num + else: + # Fallback to key if no actual_num + try: + return int(key) + except ValueError: + # For non-numeric keys, sort them at the end + return float('inf') + + sorted_chapters = dict(sorted(new_chapters.items(), key=sort_key)) + + if migrated_count > 0: + # Also migrate and sort chapter_chunks if they exist + if "chapter_chunks" in self.prog: + new_chunks = {} + for old_key, chunk_data in self.prog["chapter_chunks"].items(): + if old_key in self.prog["chapters"] and "actual_num" in self.prog["chapters"][old_key]: + new_key = str(self.prog["chapters"][old_key]["actual_num"]) + new_chunks[new_key] = chunk_data + else: + new_chunks[old_key] = chunk_data + + # Sort chapter_chunks using the same sorting logic + sorted_chunks = dict(sorted(new_chunks.items(), key=sort_key)) + self.prog["chapter_chunks"] = sorted_chunks + + self.prog["chapters"] = sorted_chapters + self.save() + print(f"✅ Migrated {migrated_count} entries to use actual_num as key and sorted by chapter number") + else: + # Even if no migration occurred, still apply sorting + self.prog["chapters"] = sorted_chapters + if "chapter_chunks" in self.prog: + sorted_chunks = dict(sorted(self.prog["chapter_chunks"].items(), key=sort_key)) + self.prog["chapter_chunks"] = sorted_chunks + self.save() + print("✅ Sorted chapters by chapter number") + + def get_stats(self, output_dir): + """Get statistics about translation progress""" + stats = { + "total_tracked": 
len(self.prog["chapters"]), + "completed": 0, + "missing_files": 0, + "in_progress": 0 + } + + for chapter_info in self.prog["chapters"].values(): + status = chapter_info.get("status") + output_file = chapter_info.get("output_file") + + if status == "completed" and output_file: + output_path = os.path.join(output_dir, output_file) + if os.path.exists(output_path): + stats["completed"] += 1 + else: + stats["missing_files"] += 1 + elif status == "in_progress": + stats["in_progress"] += 1 + elif status == "file_missing": + stats["missing_files"] += 1 + + return stats + +# ===================================================== +# UNIFIED CONTENT PROCESSOR +# ===================================================== +class ContentProcessor: + """Unified content processing""" + + @staticmethod + def clean_ai_artifacts(text, remove_artifacts=True): + """Remove AI response artifacts from text - but ONLY when enabled""" + if not remove_artifacts: + return text + + # First, remove thinking tags if they exist + text = ContentProcessor._remove_thinking_tags(text) + + # After removing thinking tags, re-analyze the text structure + # to catch AI artifacts that may now be at the beginning + lines = text.split('\n') + + # Clean up empty lines at the beginning + while lines and not lines[0].strip(): + lines.pop(0) + + if not lines: + return text + + # Check the first non-empty line for AI artifacts + first_line = lines[0].strip() + + ai_patterns = [ + r'^(?:Sure|Okay|Understood|Of course|Got it|Alright|Certainly|Here\'s|Here is)', + r'^(?:I\'ll|I will|Let me) (?:translate|help|assist)', + r'^(?:System|Assistant|AI|User|Human|Model)\s*:', + r'^\[PART\s+\d+/\d+\]', + r'^(?:Translation note|Note|Here\'s the translation|I\'ve translated)', + r'^```(?:html|xml|text)?\s*$', # Enhanced code block detection + r'^', remaining_text, re.IGNORECASE) or + len(remaining_text.strip()) > 50): # Reduced from 100 to 50 + + print(f"✂️ Removed AI artifact: {first_line[:50]}...") + return remaining_text.lstrip() + + if first_line.lower() in ['html', 'text', 'content', 'translation', 'output']: + remaining_lines = lines[1:] + remaining_text = '\n'.join(remaining_lines) + if remaining_text.strip(): + print(f"✂️ Removed single word artifact: {first_line}") + return remaining_text.lstrip() + + return '\n'.join(lines) + + @staticmethod + def _remove_thinking_tags(text): + """Remove thinking tags that some AI models produce""" + if not text: + return text + + # Common thinking tag patterns used by various AI models + thinking_patterns = [ + # XML-style thinking tags + (r'.*?', 'thinking'), + (r'.*?', 'think'), + (r'.*?', 'thoughts'), + (r'.*?', 'reasoning'), + (r'.*?', 'analysis'), + (r'.*?', 'reflection'), + # OpenAI o1-style reasoning blocks - fix the regex escaping + (r'<\|thinking\|>.*?', 'o1-thinking'), + # Claude-style thinking blocks + (r'\[thinking\].*?\[/thinking\]', 'claude-thinking'), + # Generic bracketed thinking patterns + (r'\[THINKING\].*?\[/THINKING\]', 'bracketed-thinking'), + (r'\[ANALYSIS\].*?\[/ANALYSIS\]', 'bracketed-analysis'), + ] + + original_text = text + removed_count = 0 + + for pattern, tag_type in thinking_patterns: + # Use DOTALL flag to match across newlines + matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE) + if matches: + text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE) + removed_count += len(matches) + + # Also remove standalone code block markers that might be artifacts + # But preserve all actual content - only remove the ``` markers themselves + code_block_removed = 0 + 
code_block_patterns = [ + (r'^```\w*\s*\n', '\n'), # Opening code blocks - replace with newline + (r'\n```\s*$', ''), # Closing code blocks at end - remove entirely + (r'^```\w*\s*$', ''), # Standalone ``` on its own line - remove entirely + ] + + for pattern, replacement in code_block_patterns: + matches = re.findall(pattern, text, re.MULTILINE) + if matches: + text = re.sub(pattern, replacement, text, flags=re.MULTILINE) + code_block_removed += len(matches) + + # Clean up any extra whitespace or empty lines left after removing thinking tags + total_removed = removed_count + code_block_removed + if total_removed > 0: + # Remove multiple consecutive newlines + text = re.sub(r'\n\s*\n\s*\n', '\n\n', text) + # Remove leading/trailing whitespace + text = text.strip() + if removed_count > 0 and code_block_removed > 0: + print(f"🧠 Removed {removed_count} thinking tag(s) and {code_block_removed} code block marker(s)") + elif removed_count > 0: + print(f"🧠 Removed {removed_count} thinking tag(s)") + elif code_block_removed > 0: + print(f"📝 Removed {code_block_removed} code block marker(s)") + + return text + + @staticmethod + def clean_memory_artifacts(text): + """Remove any memory/summary artifacts that leaked into the translation""" + text = re.sub(r'\[MEMORY\].*?\[END MEMORY\]', '', text, flags=re.DOTALL) + + lines = text.split('\n') + cleaned_lines = [] + skip_next = False + + for line in lines: + if any(marker in line for marker in ['[MEMORY]', '[END MEMORY]', 'Previous context summary:', + 'memory summary', 'context summary', '[Context]']): + skip_next = True + continue + + if skip_next and line.strip() == '': + skip_next = False + continue + + skip_next = False + cleaned_lines.append(line) + + return '\n'.join(cleaned_lines) + + @staticmethod + def emergency_restore_paragraphs(text, original_html=None, verbose=True): + """Emergency restoration when AI returns wall of text without proper paragraph tags""" + def log(message): + if verbose: + print(message) + + if text.count('

') >= 3: + return text + + if original_html: + original_para_count = original_html.count('

') + current_para_count = text.count('

') + + if current_para_count < original_para_count / 2: + log(f"⚠️ Paragraph mismatch! Original: {original_para_count}, Current: {current_para_count}") + log("🔧 Attempting emergency paragraph restoration...") + + if '

' not in text and len(text) > 300: + log("❌ No paragraph tags found - applying emergency restoration") + + if '\n\n' in text: + parts = text.split('\n\n') + paragraphs = ['

' + part.strip() + '

' for part in parts if part.strip()] + return '\n'.join(paragraphs) + + dialogue_pattern = r'(?<=[.!?])\s+(?=[""\u201c\u201d])' + if re.search(dialogue_pattern, text): + parts = re.split(dialogue_pattern, text) + paragraphs = [] + for part in parts: + part = part.strip() + if part: + if not part.startswith('

'): + part = '

' + part + if not part.endswith('

'): + part = part + '

' + paragraphs.append(part) + return '\n'.join(paragraphs) + + sentence_boundary = r'(?<=[.!?])\s+(?=[A-Z\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af])' + sentences = re.split(sentence_boundary, text) + + if len(sentences) > 1: + paragraphs = [] + current_para = [] + + for sentence in sentences: + sentence = sentence.strip() + if not sentence: + continue + + current_para.append(sentence) + + should_break = ( + len(current_para) >= 3 or + sentence.rstrip().endswith(('"', '"', '"')) or + '* * *' in sentence or + '***' in sentence or + '---' in sentence + ) + + if should_break: + para_text = ' '.join(current_para) + if not para_text.startswith('

'): + para_text = '

' + para_text + if not para_text.endswith('

'): + para_text = para_text + '

' + paragraphs.append(para_text) + current_para = [] + + if current_para: + para_text = ' '.join(current_para) + if not para_text.startswith('

'): + para_text = '

' + para_text + if not para_text.endswith('

'): + para_text = para_text + '

' + paragraphs.append(para_text) + + result = '\n'.join(paragraphs) + log(f"✅ Restored {len(paragraphs)} paragraphs from wall of text") + return result + + words = text.split() + if len(words) > 100: + paragraphs = [] + words_per_para = max(100, len(words) // 10) + + for i in range(0, len(words), words_per_para): + chunk = ' '.join(words[i:i + words_per_para]) + if chunk.strip(): + paragraphs.append('

' + chunk.strip() + '

') + + return '\n'.join(paragraphs) + + elif '

' in text and text.count('

') < 3 and len(text) > 1000: + log("⚠️ Very few paragraphs for long text - checking if more breaks needed") + + soup = BeautifulSoup(text, 'html.parser') + existing_paras = soup.find_all('p') + + new_paragraphs = [] + for para in existing_paras: + para_text = para.get_text() + if len(para_text) > 500: + sentences = re.split(r'(?<=[.!?])\s+', para_text) + if len(sentences) > 5: + chunks = [] + current = [] + for sent in sentences: + current.append(sent) + if len(current) >= 3: + chunks.append('

' + ' '.join(current) + '

') + current = [] + if current: + chunks.append('

' + ' '.join(current) + '

') + new_paragraphs.extend(chunks) + else: + new_paragraphs.append(str(para)) + else: + new_paragraphs.append(str(para)) + + return '\n'.join(new_paragraphs) + + return text + + @staticmethod + def get_content_hash(html_content): + """Create a stable hash of content""" + try: + soup = BeautifulSoup(html_content, 'html.parser') + + for tag in soup(['script', 'style', 'meta', 'link']): + tag.decompose() + + text_content = soup.get_text(separator=' ', strip=True) + text_content = ' '.join(text_content.split()) + + return hashlib.md5(text_content.encode('utf-8')).hexdigest() + + except Exception as e: + print(f"[WARNING] Failed to create hash: {e}") + return hashlib.md5(html_content.encode('utf-8')).hexdigest() + + @staticmethod + def is_meaningful_text_content(html_content): + """Check if chapter has meaningful text beyond just structure""" + try: + # Check if this is plain text from enhanced extraction (html2text output) + # html2text output characteristics: + # - Often starts with # for headers + # - Contains markdown-style formatting + # - Doesn't have HTML tags + content_stripped = html_content.strip() + + # Quick check for plain text/markdown content + is_plain_text = False + if content_stripped and ( + not content_stripped.startswith('<') or # Doesn't start with HTML tag + content_stripped.startswith('#') or # Markdown header + '\n\n' in content_stripped[:500] or # Markdown paragraphs + not '

' in content_stripped[:500] and not '

' in content_stripped[:500] # No common HTML tags + ): + # This looks like plain text or markdown from html2text + is_plain_text = True + + if is_plain_text: + # For plain text, just check the length + text_length = len(content_stripped) + # Be more lenient with plain text since it's already extracted + return text_length > 50 # Much lower threshold for plain text + + # Original HTML parsing logic + soup = BeautifulSoup(html_content, 'html.parser') + + soup_copy = BeautifulSoup(str(soup), 'html.parser') + + for img in soup_copy.find_all('img'): + img.decompose() + + text_elements = soup_copy.find_all(['p', 'div', 'span']) + text_content = ' '.join(elem.get_text(strip=True) for elem in text_elements) + + headers = soup_copy.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) + header_text = ' '.join(h.get_text(strip=True) for h in headers) + + if headers and len(text_content.strip()) > 1: + return True + + if len(text_content.strip()) > 200: + return True + + if len(header_text.strip()) > 100: + return True + + return False + + except Exception as e: + print(f"Warning: Error checking text content: {e}") + return True + +# ===================================================== +# UNIFIED CHAPTER EXTRACTOR +# ===================================================== +class ChapterExtractor: + """Unified chapter extraction with three modes: Smart, Comprehensive, and Full""" + + def __init__(self, progress_callback=None): + self.pattern_manager = PatternManager() + self.progress_callback = progress_callback # Add progress callback + self.parser = self._get_best_parser() # Determine best parser on init + + def _get_best_parser(self): + """Determine the best parser available, preferring lxml for CJK text""" + try: + import lxml + return 'lxml' + except ImportError: + return 'html.parser' + + def _sort_by_opf_spine(self, chapters, opf_path): + """Sort chapters according to OPF spine order""" + try: + import xml.etree.ElementTree as ET + + # Read OPF file + with open(opf_path, 'r', encoding='utf-8') as f: + opf_content = f.read() + + # Parse OPF + root = ET.fromstring(opf_content) + + # Find namespaces + ns = {'opf': 'http://www.idpf.org/2007/opf'} + if root.tag.startswith('{'): + default_ns = root.tag[1:root.tag.index('}')] + ns = {'opf': default_ns} + + # Build manifest map (id -> href) + manifest = {} + for item in root.findall('.//opf:manifest/opf:item', ns): + item_id = item.get('id') + href = item.get('href') + if item_id and href: + manifest[item_id] = href + + # Get spine order + spine_order = [] + spine = root.find('.//opf:spine', ns) + if spine is not None: + for itemref in spine.findall('opf:itemref', ns): + idref = itemref.get('idref') + if idref and idref in manifest: + href = manifest[idref] + spine_order.append(href) + + if not spine_order: + print("⚠️ No spine order found in OPF, keeping original order") + return chapters + + # Create a mapping of filenames to spine position + spine_map = {} + for idx, href in enumerate(spine_order): + # Try different matching strategies + basename = os.path.basename(href) + spine_map[basename] = idx + spine_map[href] = idx + # Also store without extension for flexible matching + name_no_ext = os.path.splitext(basename)[0] + spine_map[name_no_ext] = idx + + print(f"📋 OPF spine contains {len(spine_order)} items") + + # Sort chapters based on spine order + def get_spine_position(chapter): + # Try to match chapter to spine + filename = chapter.get('filename', '') + basename = chapter.get('original_basename', '') + + # Try exact filename match + if filename in 
spine_map: + return spine_map[filename] + + # Try basename match + if basename in spine_map: + return spine_map[basename] + + # Try basename of filename + if filename: + fname_base = os.path.basename(filename) + if fname_base in spine_map: + return spine_map[fname_base] + + # Try without extension + if basename: + if basename + '.html' in spine_map: + return spine_map[basename + '.html'] + if basename + '.xhtml' in spine_map: + return spine_map[basename + '.xhtml'] + + # Fallback: offset by 1,000,000 so unmatched chapters sort after every spine item + return 1000000 + chapter.get('num', 0) + + # Sort chapters + sorted_chapters = sorted(chapters, key=get_spine_position) + + # Renumber chapters based on new order + for idx, chapter in enumerate(sorted_chapters, 1): + chapter['spine_order'] = idx + # Optionally update chapter numbers to match spine order + # chapter['num'] = idx # Uncomment if you want to renumber + + # Log reordering info + reordered_count = 0 + for idx, chapter in enumerate(sorted_chapters): + original_idx = chapters.index(chapter) + if original_idx != idx: + reordered_count += 1 + + if reordered_count > 0: + print(f"🔄 Reordered {reordered_count} chapters to match OPF spine") + else: + print("✅ Chapter order already matches OPF spine") + + return sorted_chapters + + except Exception as e: + print(f"⚠️ Could not sort by OPF spine: {e}") + import traceback + traceback.print_exc() + return chapters + + + def protect_angle_brackets_with_korean(self, text: str) -> str: + """Protect CJK text in angle brackets from HTML parsing""" + if text is None: + return "" + + import re + # Extended pattern to include Korean, Chinese, and Japanese characters + cjk_pattern = r'[가-힣ㄱ-ㅎㅏ-ㅣ一-龿ぁ-ゟァ-ヿ]' + bracket_pattern = rf'<([^<>]*{cjk_pattern}[^<>]*)>' + + def replace_brackets(match): + content = match.group(1) + # Escape as HTML entities so the parser treats CJK pseudo-tags as text, not markup + return f'&lt;{content}&gt;' + + return re.sub(bracket_pattern, replace_brackets, text) + + def ensure_all_opf_chapters_extracted(zf, chapters, out): + """Ensure ALL chapters from OPF spine are extracted, not just what ChapterExtractor found""" + + # Parse OPF to get ALL chapters in spine + opf_chapters = [] + + try: + # Find content.opf + opf_content = None + for name in zf.namelist(): + if name.endswith('content.opf'): + opf_content = zf.read(name) + break + + if not opf_content: + return chapters # No OPF, return original + + import xml.etree.ElementTree as ET + root = ET.fromstring(opf_content) + + # Handle namespaces + ns = {'opf': 'http://www.idpf.org/2007/opf'} + if root.tag.startswith('{'): + default_ns = root.tag[1:root.tag.index('}')] + ns = {'opf': default_ns} + + # Get manifest + manifest = {} + for item in root.findall('.//opf:manifest/opf:item', ns): + item_id = item.get('id') + href = item.get('href') + media_type = item.get('media-type', '') + + if item_id and href and ('html' in media_type.lower() or href.endswith(('.html', '.xhtml', '.htm'))): + manifest[item_id] = href + + # Get spine order + spine = root.find('.//opf:spine', ns) + if spine is not None: + for itemref in spine.findall('opf:itemref', ns): + idref = itemref.get('idref') + if idref and idref in manifest: + href = manifest[idref] + filename = os.path.basename(href) + + # Skip nav, toc, cover + if any(skip in filename.lower() for skip in ['nav', 'toc', 'cover']): + continue + + opf_chapters.append(href) + + print(f"📚 OPF spine contains {len(opf_chapters)} chapters") + + # Check which OPF chapters are missing from extraction + extracted_files = set() + for c in chapters: + if 'filename' in c: + extracted_files.add(c['filename']) + if 
'original_basename' in c: + extracted_files.add(c['original_basename']) + + missing_chapters = [] + for opf_chapter in opf_chapters: + basename = os.path.basename(opf_chapter) + if basename not in extracted_files and opf_chapter not in extracted_files: + missing_chapters.append(opf_chapter) + + if missing_chapters: + print(f"⚠️ {len(missing_chapters)} chapters in OPF but not extracted!") + print(f" Missing: {missing_chapters[:5]}{'...' if len(missing_chapters) > 5 else ''}") + + # Extract the missing chapters + for href in missing_chapters: + try: + # Read the chapter content + content = zf.read(href).decode('utf-8') + + # Extract chapter number + import re + basename = os.path.basename(href) + matches = re.findall(r'(\d+)', basename) + if matches: + chapter_num = int(matches[-1]) + else: + chapter_num = len(chapters) + 1 + + # Create chapter entry + from bs4 import BeautifulSoup + parser = 'lxml' if 'lxml' in sys.modules else 'html.parser' + soup = BeautifulSoup(content, parser) + + # Get title + title = "Chapter " + str(chapter_num) + title_tag = soup.find('title') + if title_tag: + title = title_tag.get_text().strip() or title + else: + for tag in ['h1', 'h2', 'h3']: + header = soup.find(tag) + if header: + title = header.get_text().strip() or title + break + + # Save the chapter file + output_filename = f"chapter_{chapter_num:04d}_{basename}" + output_path = os.path.join(out, output_filename) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(content) + + # Add to chapters list + new_chapter = { + 'num': chapter_num, + 'title': title, + 'body': content, + 'filename': href, + 'original_basename': basename, + 'file_size': len(content), + 'has_images': bool(soup.find_all('img')), + 'detection_method': 'opf_recovery', + 'content_hash': None # Will be calculated later + } + + chapters.append(new_chapter) + print(f" ✅ Recovered chapter {chapter_num}: {basename}") + + except Exception as e: + print(f" ❌ Failed to extract {href}: {e}") + + # Re-sort chapters by number + chapters.sort(key=lambda x: x['num']) + print(f"✅ Total chapters after OPF recovery: {len(chapters)}") + + except Exception as e: + print(f"⚠️ Error checking OPF chapters: {e}") + import traceback + traceback.print_exc() + + return chapters + + def extract_chapters(self, zf, output_dir): + """Extract chapters and all resources from EPUB using ThreadPoolExecutor""" + import time + + # Check stop at the very beginning + if is_stop_requested(): + print("❌ Extraction stopped by user") + return [] + + print("🚀 Starting EPUB extraction with ThreadPoolExecutor...") + print(f"📄 Using parser: {self.parser} {'(optimized for CJK)' if self.parser == 'lxml' else '(standard)'}") + + # Initial progress + if self.progress_callback: + self.progress_callback("Starting EPUB extraction...") + + # First, extract and save content.opf for reference + for name in zf.namelist(): + if name.endswith('.opf'): + try: + opf_content = zf.read(name).decode('utf-8', errors='ignore') + opf_output_path = os.path.join(output_dir, 'content.opf') + with open(opf_output_path, 'w', encoding='utf-8') as f: + f.write(opf_content) + print(f"📋 Saved OPF file: {name} → content.opf") + break + except Exception as e: + print(f"⚠️ Could not save OPF file: {e}") + + # Get extraction mode from environment + extraction_mode = os.getenv("EXTRACTION_MODE", "smart").lower() + print(f"✅ Using {extraction_mode.capitalize()} extraction mode") + + # Get number of workers from environment or use default + max_workers = int(os.getenv("EXTRACTION_WORKERS", "2")) + print(f"🔧 
Using {max_workers} workers for parallel processing") + + extracted_resources = self._extract_all_resources(zf, output_dir) + + # Check stop after resource extraction + if is_stop_requested(): + print("❌ Extraction stopped by user") + return [] + + metadata_path = os.path.join(output_dir, 'metadata.json') + if os.path.exists(metadata_path): + print("📋 Loading existing metadata...") + with open(metadata_path, 'r', encoding='utf-8') as f: + metadata = json.load(f) + else: + print("📋 Extracting fresh metadata...") + metadata = self._extract_epub_metadata(zf) + print(f"📋 Extracted metadata: {list(metadata.keys())}") + + chapters, detected_language = self._extract_chapters_universal(zf, extraction_mode) + + # Sort chapters according to OPF spine order if available + opf_path = os.path.join(output_dir, 'content.opf') + if os.path.exists(opf_path) and chapters: + print("📋 Sorting chapters according to OPF spine order...") + chapters = self._sort_by_opf_spine(chapters, opf_path) + print(f"✅ Chapters sorted according to OPF reading order") + + # Check stop after chapter extraction + if is_stop_requested(): + print("❌ Extraction stopped by user") + return [] + + if not chapters: + print("❌ No chapters could be extracted!") + return [] + + chapters_info_path = os.path.join(output_dir, 'chapters_info.json') + chapters_info = [] + chapters_info_lock = threading.Lock() + + def process_chapter(chapter): + """Process a single chapter""" + # Check stop in worker + if is_stop_requested(): + return None + + info = { + 'num': chapter['num'], + 'title': chapter['title'], + 'original_filename': chapter.get('filename', ''), + 'has_images': chapter.get('has_images', False), + 'image_count': chapter.get('image_count', 0), + 'text_length': chapter.get('file_size', len(chapter.get('body', ''))), + 'detection_method': chapter.get('detection_method', 'unknown'), + 'content_hash': chapter.get('content_hash', '') + } + + if chapter.get('has_images'): + try: + soup = BeautifulSoup(chapter.get('body', ''), self.parser) + images = soup.find_all('img') + info['images'] = [img.get('src', '') for img in images] + except: + info['images'] = [] + + return info + + # Process chapters in parallel + print(f"🔄 Processing {len(chapters)} chapters in parallel...") + + if self.progress_callback: + self.progress_callback(f"Processing {len(chapters)} chapters...") + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all tasks + future_to_chapter = { + executor.submit(process_chapter, chapter): chapter + for chapter in chapters + } + + # Process completed tasks + completed = 0 + for future in as_completed(future_to_chapter): + if is_stop_requested(): + print("❌ Extraction stopped by user") + # Cancel remaining futures + for f in future_to_chapter: + f.cancel() + return [] + + try: + result = future.result() + if result: + with chapters_info_lock: + chapters_info.append(result) + completed += 1 + + # Yield to GUI periodically (can be disabled for max speed) + if completed % 5 == 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1": + time.sleep(0.001) + + # Progress updates + if completed % 10 == 0 or completed == len(chapters): + progress_msg = f"Processed {completed}/{len(chapters)} chapters" + print(f" 📊 {progress_msg}") + if self.progress_callback: + self.progress_callback(progress_msg) + except Exception as e: + chapter = future_to_chapter[future] + print(f" ❌ Error processing chapter {chapter['num']}: {e}") + + # Sort chapters_info by chapter number to maintain order + chapters_info.sort(key=lambda x: x['num']) + 
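+ # For orientation, each entry written to chapters_info.json below is shaped
+ # roughly like this (a hedged sketch; the values are illustrative placeholders,
+ # only the keys come from process_chapter above):
+ # {
+ #     "num": 12,
+ #     "title": "Chapter 12",
+ #     "original_filename": "OEBPS/chapter012.xhtml",
+ #     "has_images": false,
+ #     "image_count": 0,
+ #     "text_length": 5431,
+ #     "detection_method": "filename_chapter_number",
+ #     "content_hash": "9a0364b9e99bb480dd25e1f0284c8555",
+ #     "images": ["images/illust01.png"]  # present only when has_images is true
+ # }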
+ print(f"✅ Successfully processed {len(chapters_info)} chapters") + + with open(chapters_info_path, 'w', encoding='utf-8') as f: + json.dump(chapters_info, f, ensure_ascii=False, indent=2) + + print(f"💾 Saved detailed chapter info to: chapters_info.json") + + metadata.update({ + 'chapter_count': len(chapters), + 'detected_language': detected_language, + 'extracted_resources': extracted_resources, + 'extraction_mode': extraction_mode, + 'extraction_summary': { + 'total_chapters': len(chapters), + 'chapter_range': f"{chapters[0]['num']}-{chapters[-1]['num']}", + 'resources_extracted': sum(len(files) for files in extracted_resources.values()) + } + }) + + metadata['chapter_titles'] = { + str(c['num']): c['title'] for c in chapters + } + + with open(metadata_path, 'w', encoding='utf-8') as f: + json.dump(metadata, f, ensure_ascii=False, indent=2) + + print(f"💾 Saved comprehensive metadata to: {metadata_path}") + + self._create_extraction_report(output_dir, metadata, chapters, extracted_resources) + self._log_extraction_summary(chapters, extracted_resources, detected_language) + + print(f"🔍 VERIFICATION: {extraction_mode.capitalize()} chapter extraction completed successfully") + print(f"⚡ Used {max_workers} workers for parallel processing") + + return chapters + + def _extract_all_resources(self, zf, output_dir): + """Extract all resources with parallel processing""" + import time + + extracted_resources = { + 'css': [], + 'fonts': [], + 'images': [], + 'epub_structure': [], + 'other': [] + } + + # Check if already extracted + extraction_marker = os.path.join(output_dir, '.resources_extracted') + if os.path.exists(extraction_marker): + print("📦 Resources already extracted, skipping...") + return self._count_existing_resources(output_dir, extracted_resources) + + self._cleanup_old_resources(output_dir) + + # Create directories + for resource_type in ['css', 'fonts', 'images']: + os.makedirs(os.path.join(output_dir, resource_type), exist_ok=True) + + print(f"📦 Extracting resources in parallel...") + + # Get list of files to process + file_list = [f for f in zf.namelist() if not f.endswith('/') and os.path.basename(f)] + + # Thread-safe lock for extracted_resources + resource_lock = threading.Lock() + + def extract_single_resource(file_path): + if is_stop_requested(): + return None + + try: + file_data = zf.read(file_path) + resource_info = self._categorize_resource(file_path, os.path.basename(file_path)) + + if resource_info: + resource_type, target_dir, safe_filename = resource_info + target_path = os.path.join(output_dir, target_dir, safe_filename) if target_dir else os.path.join(output_dir, safe_filename) + + with open(target_path, 'wb') as f: + f.write(file_data) + + # Thread-safe update + with resource_lock: + extracted_resources[resource_type].append(safe_filename) + + return (resource_type, safe_filename) + except Exception as e: + print(f"[WARNING] Failed to extract {file_path}: {e}") + return None + + # Process files in parallel + total_resources = len(file_list) + extracted_count = 0 + + # Use same worker count as chapter processing + resource_workers = int(os.getenv("EXTRACTION_WORKERS", "2")) + + with ThreadPoolExecutor(max_workers=resource_workers) as executor: + futures = {executor.submit(extract_single_resource, file_path): file_path + for file_path in file_list} + + for future in as_completed(futures): + if is_stop_requested(): + executor.shutdown(wait=False) + break + + extracted_count += 1 + + # Progress update every 20 files + if extracted_count % 20 == 0 and 
self.progress_callback: + self.progress_callback(f"Extracting resources: {extracted_count}/{total_resources}") + + # Yield to GUI periodically (can be disabled for max speed) + if extracted_count % 10 == 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1": + time.sleep(0.001) + + result = future.result() + if result: + resource_type, filename = result + # Only print for important resources + if extracted_count < 10 or resource_type in ['css', 'fonts']: + print(f" 📄 Extracted {resource_type}: {filename}") + + # Mark as complete + with open(extraction_marker, 'w') as f: + f.write(f"Resources extracted at {time.time()}") + + self._validate_critical_files(output_dir, extracted_resources) + return extracted_resources + + def _extract_chapters_universal(self, zf, extraction_mode="smart"): + """Universal chapter extraction with four modes: smart, comprehensive, full, enhanced + + All modes now properly merge Section/Chapter pairs + Enhanced mode uses html2text for superior text processing + Now with parallel processing for improved performance + """ + # Check stop at the beginning + if is_stop_requested(): + print("❌ Chapter extraction stopped by user") + return [], 'unknown' + + # Import time for yielding + import time + + # Initialize enhanced extractor if using enhanced mode + enhanced_extractor = None + enhanced_filtering = extraction_mode # Default fallback + preserve_structure = True + + # Independent control: translate cover.html when requested + translate_cover_html = os.getenv("TRANSLATE_COVER_HTML", "0") == "1" + + if extraction_mode == "enhanced": + print("🚀 Initializing Enhanced extraction mode with html2text...") + + # Get enhanced mode configuration from environment + enhanced_filtering = os.getenv("ENHANCED_FILTERING", "smart") + # Avoid 'full' with html2text to prevent XML declaration artifacts; use 'comprehensive' instead + if str(enhanced_filtering).lower() == 'full': + enhanced_filtering = 'comprehensive' + preserve_structure = os.getenv("ENHANCED_PRESERVE_STRUCTURE", "1") == "1" + + print(f" • Enhanced filtering level: {enhanced_filtering}") + print(f" • Preserve structure: {preserve_structure}") + + # Try to initialize enhanced extractor + try: + # Import our enhanced extractor (assume it's in the same directory or importable) + from enhanced_text_extractor import EnhancedTextExtractor + enhanced_extractor = EnhancedTextExtractor( + filtering_mode=enhanced_filtering, + preserve_structure=preserve_structure + ) + print("✅ Enhanced text extractor initialized successfully") + + except ImportError as e: + print(f"❌ Enhanced text extractor module not found: {e}") + print(f"❌ Cannot use enhanced extraction mode. Please install enhanced_text_extractor or select a different extraction mode.") + raise e + except Exception as e: + print(f"❌ Enhanced extractor initialization failed: {e}") + print(f"❌ Cannot use enhanced extraction mode. 
Please select a different extraction mode.") + raise e + + chapters = [] + sample_texts = [] + + # First phase: Collect HTML files + html_files = [] + file_list = zf.namelist() + total_files = len(file_list) + + # Update progress for file collection + if self.progress_callback and total_files > 100: + self.progress_callback(f"Scanning {total_files} files in EPUB...") + + for idx, name in enumerate(file_list): + # Check stop while collecting files + if is_stop_requested(): + print("❌ Chapter extraction stopped by user") + return [], 'unknown' + + # Yield to GUI every 50 files (can be disabled for max speed) + if idx % 50 == 0 and idx > 0: + if os.getenv("ENABLE_GUI_YIELD", "1") == "1": + time.sleep(0.001) # Brief yield to GUI + if self.progress_callback and total_files > 100: + self.progress_callback(f"Scanning files: {idx}/{total_files}") + + if name.lower().endswith(('.xhtml', '.html', '.htm')): + # Skip cover files by default unless override is enabled + basename = os.path.basename(name).lower() + if basename in ['cover.html', 'cover.xhtml', 'cover.htm'] and not translate_cover_html: + print(f"[SKIP] Cover file excluded from all modes: {name}") + continue + + # Apply filtering based on the actual extraction mode (or enhanced_filtering for enhanced mode) + current_filtering = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode + + if current_filtering == "smart": + # Smart mode: aggressive filtering + lower_name = name.lower() + if any(skip in lower_name for skip in [ + 'nav', 'toc', 'contents', 'title', 'index', + 'copyright', 'acknowledgment', 'dedication' + ]): + continue + elif current_filtering == "comprehensive": + # Comprehensive mode: moderate filtering + skip_keywords = ['nav.', 'toc.', 'contents.', 'copyright.'] + basename = os.path.basename(name.lower()) + should_skip = False + for skip in skip_keywords: + if basename == skip + 'xhtml' or basename == skip + 'html' or basename == skip + 'htm': + should_skip = True + break + if should_skip: + print(f"[SKIP] Navigation/TOC file: {name}") + continue + # else: full mode - no filtering at all (except cover which is filtered above) + + html_files.append(name) + + # Update mode description to include enhanced mode + mode_description = { + "smart": "potential content files", + "comprehensive": "HTML files", + "full": "ALL HTML/XHTML files (no filtering)", + "enhanced": f"files (enhanced with {enhanced_filtering} filtering)" + } + print(f"📚 Found {len(html_files)} {mode_description.get(extraction_mode, 'files')} in EPUB") + + # Sort files to ensure proper order + html_files.sort() + + # Check if merging is disabled via environment variable + disable_merging = os.getenv("DISABLE_CHAPTER_MERGING", "0") == "1" + + processed_files = set() + merge_candidates = {} # Store potential merges without reading files yet + + if disable_merging: + print("📌 Chapter merging is DISABLED - processing all files independently") + else: + print("📌 Chapter merging is ENABLED") + + # Only do merging logic if not disabled + file_groups = {} + + # Group files by their base number to detect Section/Chapter pairs + for file_path in html_files: + filename = os.path.basename(file_path) + + # Try different patterns to extract base number + base_num = None + + # Pattern 1: "No00014" from "No00014Section.xhtml" + match = re.match(r'(No\d+)', filename) + if match: + base_num = match.group(1) + else: + # Pattern 2: "0014" from "0014_section.html" or "0014_chapter.html" + match = re.match(r'^(\d+)[_\-]', filename) + if match: + base_num = 
match.group(1) + else: + # Pattern 3: Just numbers at the start + match = re.match(r'^(\d+)', filename) + if match: + base_num = match.group(1) + + if base_num: + if base_num not in file_groups: + file_groups[base_num] = [] + file_groups[base_num].append(file_path) + + # Identify merge candidates WITHOUT reading files yet + for base_num, group_files in sorted(file_groups.items()): + if len(group_files) == 2: + # Check if we have a Section/Chapter pair based on filenames only + section_file = None + chapter_file = None + + for file_path in group_files: + basename = os.path.basename(file_path) + # More strict detection - must have 'section' or 'chapter' in the filename + if 'section' in basename.lower() and 'chapter' not in basename.lower(): + section_file = file_path + elif 'chapter' in basename.lower() and 'section' not in basename.lower(): + chapter_file = file_path + + if section_file and chapter_file: + # Store as potential merge candidate + merge_candidates[chapter_file] = section_file + processed_files.add(section_file) + print(f"[DEBUG] Potential merge candidate: {base_num}") + print(f" Section: {os.path.basename(section_file)}") + print(f" Chapter: {os.path.basename(chapter_file)}") + + # Filter out section files that were marked for merging + files_to_process = [] + for file_path in html_files: + if not disable_merging and file_path in processed_files: + print(f"[DEBUG] Skipping section file: {file_path}") + continue + files_to_process.append(file_path) + + print(f"📚 Processing {len(files_to_process)} files after merge analysis") + + # Thread-safe collections + sample_texts_lock = threading.Lock() + file_size_groups_lock = threading.Lock() + h1_count_lock = threading.Lock() + h2_count_lock = threading.Lock() + + # Initialize counters + file_size_groups = {} + h1_count = 0 + h2_count = 0 + processed_count = 0 + processed_count_lock = threading.Lock() + + # Progress tracking + total_files = len(files_to_process) + + # Function to process a single HTML file + def process_single_html_file(file_path, file_index): + nonlocal h1_count, h2_count, processed_count + + # Check stop + if is_stop_requested(): + return None + + # Update progress + with processed_count_lock: + processed_count += 1 + current_count = processed_count + if self.progress_callback and current_count % 5 == 0: + progress_msg = f"Processing chapters: {current_count}/{total_files} ({current_count*100//total_files}%)" + self.progress_callback(progress_msg) + + try: + # Read file data + file_data = zf.read(file_path) + + # Decode the file data + html_content = None + detected_encoding = None + for encoding in ['utf-8', 'utf-16', 'gb18030', 'shift_jis', 'euc-kr', 'gbk', 'big5']: + try: + html_content = file_data.decode(encoding) + detected_encoding = encoding + break + except UnicodeDecodeError: + continue + + if not html_content: + print(f"[WARNING] Could not decode {file_path}") + return None + + # Check if this file needs merging + if not disable_merging and file_path in merge_candidates: + section_file = merge_candidates[file_path] + print(f"[DEBUG] Processing merge for: {file_path}") + + try: + # Read section file + section_data = zf.read(section_file) + section_html = None + for encoding in ['utf-8', 'utf-16', 'gb18030', 'shift_jis', 'euc-kr', 'gbk', 'big5']: + try: + section_html = section_data.decode(encoding) + break + except UnicodeDecodeError: + continue + + if section_html: + # Quick check if section is small enough to merge + section_soup = BeautifulSoup(section_html, self.parser) + section_text = 
section_soup.get_text(strip=True) + + if len(section_text) < 200: # Merge if section is small + # Extract body content + chapter_soup = BeautifulSoup(html_content, self.parser) + + if section_soup.body: + section_body_content = ''.join(str(child) for child in section_soup.body.children) + else: + section_body_content = section_html + + if chapter_soup.body: + chapter_body_content = ''.join(str(child) for child in chapter_soup.body.children) + else: + chapter_body_content = html_content + + # Merge content + html_content = section_body_content + "\n
\n" + chapter_body_content + print(f" → MERGED: Section ({len(section_text)} chars) + Chapter") + else: + print(f" → NOT MERGED: Section too large ({len(section_text)} chars)") + # Remove from processed files so it gets processed separately + processed_files.discard(section_file) + + except Exception as e: + print(f"[WARNING] Failed to merge {file_path}: {e}") + + # === ENHANCED EXTRACTION POINT === + # Initialize variables that will be set by extraction + content_html = None + content_text = None + chapter_title = None + enhanced_extraction_used = False + + # Determine whether to use enhanced extractor based on toggle and provider + use_enhanced = enhanced_extractor and extraction_mode == "enhanced" + force_bs_traditional = False + try: + force_bs = os.getenv('FORCE_BS_FOR_TRADITIONAL', '0') == '1' + model_env = os.getenv('MODEL', '') + if force_bs and is_traditional_translation_api(model_env): + use_enhanced = False + force_bs_traditional = True + except Exception: + pass + + # Use enhanced extractor if available and allowed + if use_enhanced: + print(f"🚀 Using enhanced extraction for: {os.path.basename(file_path)}") + # Get clean text from html2text + clean_content, _, chapter_title = enhanced_extractor.extract_chapter_content( + html_content, enhanced_filtering + ) + enhanced_extraction_used = True + print(f"✅ Enhanced extraction complete: {len(clean_content)} chars") + + # For enhanced mode, store the markdown/plain text + # This will be sent to the translation API as-is + content_html = clean_content # This is MARKDOWN/PLAIN TEXT from html2text + content_text = clean_content # Same clean text for analysis + + # BeautifulSoup method (only for non-enhanced modes) + if not enhanced_extraction_used: + if extraction_mode == "enhanced" and not force_bs_traditional: + # Enhanced mode failed - skip this file + print(f"❌ Skipping {file_path} - enhanced extraction required but not available") + return None + # Parse the (possibly merged) content + protected_html = self.protect_angle_brackets_with_korean(html_content) + + # Use lxml parser which handles both HTML and XHTML well + soup = BeautifulSoup(protected_html, self.parser) + + # Get effective mode for filtering + effective_filtering = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode + + # In full mode, keep the entire HTML structure + if effective_filtering == "full": + content_html = html_content # Keep EVERYTHING + content_text = soup.get_text(strip=True) + else: + # Smart and comprehensive modes extract body content + if soup.body: + content_html = str(soup.body) + content_text = soup.body.get_text(strip=True) + else: + content_html = html_content + content_text = soup.get_text(strip=True) + + # Extract title (with ignore settings support) + chapter_title = None + + # Check ignore settings for batch translation + batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' + ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active + ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active + + # Extract from title tag if not ignored + if not ignore_title_tag and soup.title and soup.title.string: + chapter_title = soup.title.string.strip() + + # Extract from header tags if not ignored and no title found + if not chapter_title and not ignore_header_tags: + for header_tag in ['h1', 'h2', 'h3']: + header = soup.find(header_tag) + if header: + chapter_title = header.get_text(strip=True) + break + + # Fallback to filename if nothing found + if not 
chapter_title: + chapter_title = os.path.splitext(os.path.basename(file_path))[0] + + # Get the effective extraction mode for processing logic + effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode + + # Skip truly empty files in smart mode + # BUT: Never skip anything when merging is disabled (to ensure section files are processed) + if effective_mode == "smart" and not disable_merging and len(content_text.strip()) < 10: + print(f"[SKIP] Nearly empty file: {file_path} ({len(content_text)} chars)") + return None + + # Get actual chapter number based on original position + actual_chapter_num = files_to_process.index(file_path) + 1 + + # Mode-specific logic + if effective_mode == "comprehensive" or effective_mode == "full": + # For comprehensive/full mode, use sequential numbering + chapter_num = actual_chapter_num + + if not chapter_title: + chapter_title = os.path.splitext(os.path.basename(file_path))[0] + + detection_method = f"{extraction_mode}_sequential" if extraction_mode == "enhanced" else f"{effective_mode}_sequential" + + elif effective_mode == "smart": + # For smart mode, when merging is disabled, use sequential numbering + if disable_merging: + chapter_num = actual_chapter_num + + if not chapter_title: + chapter_title = os.path.splitext(os.path.basename(file_path))[0] + + detection_method = f"{extraction_mode}_sequential_no_merge" if extraction_mode == "enhanced" else "sequential_no_merge" + else: + # When merging is enabled, try to extract chapter info + protected_html = self.protect_angle_brackets_with_korean(html_content) + soup = BeautifulSoup(protected_html, self.parser) + + # Count headers (thread-safe) + h1_tags = soup.find_all('h1') + h2_tags = soup.find_all('h2') + if h1_tags: + with h1_count_lock: + h1_count += 1 + if h2_tags: + with h2_count_lock: + h2_count += 1 + + # Try to extract chapter number and title + chapter_num, extracted_title, detection_method = self._extract_chapter_info( + soup, file_path, content_text, html_content + ) + + # Use extracted title if we don't have one + if extracted_title and not chapter_title: + chapter_title = extracted_title + + # For hash-based filenames, chapter_num might be None + if chapter_num is None: + chapter_num = actual_chapter_num # Use actual chapter count + detection_method = f"{extraction_mode}_sequential_fallback" if extraction_mode == "enhanced" else "sequential_fallback" + print(f"[DEBUG] No chapter number found in {file_path}, assigning: {chapter_num}") + + # Filter content_html for ignore settings (before processing) + batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' + ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active + ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active + + if (ignore_title_tag or ignore_header_tags) and content_html and not enhanced_extraction_used: + # Parse the content HTML to remove ignored tags + content_soup = BeautifulSoup(content_html, self.parser) + + # Remove title tags if ignored + if ignore_title_tag: + for title_tag in content_soup.find_all('title'): + title_tag.decompose() + + # Remove header tags if ignored + if ignore_header_tags: + for header_tag in content_soup.find_all(['h1', 'h2', 'h3']): + header_tag.decompose() + + # Update content_html with filtered version + content_html = str(content_soup) + + # Process images and metadata (same for all modes) + protected_html = self.protect_angle_brackets_with_korean(html_content) + soup = 
BeautifulSoup(protected_html, self.parser) + images = soup.find_all('img') + has_images = len(images) > 0 + is_image_only_chapter = has_images and len(content_text.strip()) < 500 + + if is_image_only_chapter: + print(f"[DEBUG] Image-only chapter detected: {file_path} ({len(images)} images, {len(content_text)} chars)") + + content_hash = ContentProcessor.get_content_hash(content_html) + + # Collect file size groups for smart mode (thread-safe) + if effective_mode == "smart": + file_size = len(content_text) + with file_size_groups_lock: + if file_size not in file_size_groups: + file_size_groups[file_size] = [] + file_size_groups[file_size].append(file_path) + + # Collect sample texts (thread-safe) + with sample_texts_lock: + if len(sample_texts) < 5: + sample_texts.append(content_text[:1000]) + + # Ensure chapter_num is always an integer + if isinstance(chapter_num, float): + chapter_num = int(chapter_num) + + # Create chapter info + chapter_info = { + "num": chapter_num, # Now guaranteed to have a value + "title": chapter_title or f"Chapter {chapter_num}", + "body": content_html, + "filename": file_path, + "original_filename": os.path.basename(file_path), + "original_basename": os.path.splitext(os.path.basename(file_path))[0], + "content_hash": content_hash, + "detection_method": detection_method if detection_method else "pending", + "file_size": len(content_text), + "has_images": has_images, + "image_count": len(images), + "is_empty": len(content_text.strip()) == 0, + "is_image_only": is_image_only_chapter, + "extraction_mode": extraction_mode, + "file_index": file_index # Store original file index for sorting + } + + # Add enhanced extraction info if used + if enhanced_extraction_used: + chapter_info["enhanced_extraction"] = True + chapter_info["enhanced_filtering"] = enhanced_filtering + chapter_info["preserve_structure"] = preserve_structure + + # Add merge info if applicable + if not disable_merging and file_path in merge_candidates: + chapter_info["was_merged"] = True + chapter_info["merged_with"] = merge_candidates[file_path] + + if effective_mode == "smart": + chapter_info["language_sample"] = content_text[:500] + # Debug for section files + if 'section' in chapter_info['original_basename'].lower(): + print(f"[DEBUG] Added section file to candidates: {chapter_info['original_basename']} (size: {chapter_info['file_size']})") + + return chapter_info + + except Exception as e: + print(f"[ERROR] Failed to process {file_path}: {e}") + import traceback + traceback.print_exc() + return None + + # Process files in parallel or sequentially based on file count + print(f"🚀 Processing {len(files_to_process)} HTML files...") + + # Initial progress + if self.progress_callback: + self.progress_callback(f"Processing {len(files_to_process)} chapters...") + + candidate_chapters = [] # For smart mode + chapters_direct = [] # For other modes + + # Decide whether to use parallel processing + use_parallel = len(files_to_process) > 10 + + if use_parallel: + # Get worker count from environment variable + max_workers = int(os.getenv("EXTRACTION_WORKERS", "2")) + print(f"📦 Using parallel processing with {max_workers} workers...") + + # Process files in parallel + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all files for processing + future_to_file = { + executor.submit(process_single_html_file, file_path, idx): (file_path, idx) + for idx, file_path in enumerate(files_to_process) + } + + # Collect results as they complete + for future in as_completed(future_to_file): + if 
is_stop_requested(): + print("❌ Chapter processing stopped by user") + executor.shutdown(wait=False) + return [], 'unknown' + + try: + chapter_info = future.result() + if chapter_info: + effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode + + # For smart mode when merging is enabled, collect candidates + # Otherwise, add directly to chapters + if effective_mode == "smart" and not disable_merging: + candidate_chapters.append(chapter_info) + else: + chapters_direct.append(chapter_info) + except Exception as e: + file_path, idx = future_to_file[future] + print(f"[ERROR] Thread error processing {file_path}: {e}") + else: + print("📦 Using sequential processing (small file count)...") + + # Process files sequentially for small EPUBs + for idx, file_path in enumerate(files_to_process): + if is_stop_requested(): + print("❌ Chapter processing stopped by user") + return [], 'unknown' + + chapter_info = process_single_html_file(file_path, idx) + if chapter_info: + effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode + + # For smart mode when merging is enabled, collect candidates + # Otherwise, add directly to chapters + if effective_mode == "smart" and not disable_merging: + candidate_chapters.append(chapter_info) + else: + chapters_direct.append(chapter_info) + + # Final progress update + if self.progress_callback: + self.progress_callback(f"Chapter processing complete: {len(candidate_chapters) + len(chapters_direct)} chapters") + + # Sort direct chapters by file index to maintain order + chapters_direct.sort(key=lambda x: x["file_index"]) + + # Post-process smart mode candidates (only when merging is enabled) + effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode + if effective_mode == "smart" and candidate_chapters and not disable_merging: + # Check stop before post-processing + if is_stop_requested(): + print("❌ Chapter post-processing stopped by user") + return chapters, 'unknown' + + print(f"\n[SMART MODE] Processing {len(candidate_chapters)} candidate files...") + + # Sort candidates by file index to maintain order + candidate_chapters.sort(key=lambda x: x["file_index"]) + + # Debug: Show what files we have + section_files = [c for c in candidate_chapters if 'section' in c['original_basename'].lower()] + chapter_files = [c for c in candidate_chapters if 'chapter' in c['original_basename'].lower() and 'section' not in c['original_basename'].lower()] + other_files = [c for c in candidate_chapters if c not in section_files and c not in chapter_files] + + print(f" 📊 File breakdown:") + print(f" • Section files: {len(section_files)}") + print(f" • Chapter files: {len(chapter_files)}") + print(f" • Other files: {len(other_files)}") + + # Original smart mode logic when merging is enabled + # First, separate files with detected chapter numbers from those without + numbered_chapters = [] + unnumbered_chapters = [] + + for idx, chapter in enumerate(candidate_chapters): + # Yield periodically during categorization (can be disabled for max speed) + if idx % 10 == 0 and idx > 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1": + time.sleep(0.001) + + if chapter["num"] is not None: + numbered_chapters.append(chapter) + else: + unnumbered_chapters.append(chapter) + + print(f" • Files with chapter numbers: {len(numbered_chapters)}") + print(f" • Files without chapter numbers: {len(unnumbered_chapters)}") + + # Check if we have hash-based filenames (no numbered chapters found) + if not 
numbered_chapters and unnumbered_chapters: + print(" ⚠️ No chapter numbers found - likely hash-based filenames") + print(" → Using file order as chapter sequence") + + # Sort by file index to maintain order + unnumbered_chapters.sort(key=lambda x: x["file_index"]) + + # Assign sequential numbers + for i, chapter in enumerate(unnumbered_chapters, 1): + chapter["num"] = i + chapter["detection_method"] = f"{extraction_mode}_hash_filename_sequential" if extraction_mode == "enhanced" else "hash_filename_sequential" + if not chapter["title"] or chapter["title"] == chapter["original_basename"]: + chapter["title"] = f"Chapter {i}" + + chapters = unnumbered_chapters + else: + # We have some numbered chapters + chapters = numbered_chapters + + # For unnumbered files, check if they might be duplicates or appendices + if unnumbered_chapters: + print(f" → Analyzing {len(unnumbered_chapters)} unnumbered files...") + + # Get the max chapter number + max_num = max(c["num"] for c in numbered_chapters) + + # Check each unnumbered file + for chapter in unnumbered_chapters: + # Check stop in post-processing loop + if is_stop_requested(): + print("❌ Chapter post-processing stopped by user") + return chapters, 'unknown' + + # Check if it's very small (might be a separator or note) + if chapter["file_size"] < 200: + print(f" [SKIP] Very small file: {chapter['filename']} ({chapter['file_size']} chars)") + continue + + # Check if it has similar size to existing chapters (might be duplicate) + size = chapter["file_size"] + similar_chapters = [c for c in numbered_chapters + if abs(c["file_size"] - size) < 50] + + if similar_chapters: + # Might be a duplicate, skip it + print(f" [SKIP] Possible duplicate: {chapter['filename']} (similar size to {len(similar_chapters)} chapters)") + continue + + # Otherwise, add as appendix + max_num += 1 + chapter["num"] = max_num + chapter["detection_method"] = f"{extraction_mode}_appendix_sequential" if extraction_mode == "enhanced" else "appendix_sequential" + if not chapter["title"] or chapter["title"] == chapter["original_basename"]: + chapter["title"] = f"Appendix {max_num}" + chapters.append(chapter) + print(f" [ADD] Added as chapter {max_num}: {chapter['filename']}") + else: + # For other modes or smart mode with merging disabled + chapters = chapters_direct + + # Sort chapters by number + chapters.sort(key=lambda x: x["num"]) + + # Ensure chapter numbers are integers + # When merging is disabled, all chapters should have integer numbers anyway + for chapter in chapters: + if isinstance(chapter["num"], float): + chapter["num"] = int(chapter["num"]) + + # Final validation + if chapters: + print(f"\n✅ Final chapter count: {len(chapters)}") + print(f" • Chapter range: {chapters[0]['num']} - {chapters[-1]['num']}") + + # Enhanced mode summary + if extraction_mode == "enhanced": + enhanced_count = sum(1 for c in chapters if c.get('enhanced_extraction', False)) + print(f" 🚀 Enhanced extraction used: {enhanced_count}/{len(chapters)} chapters") + + # Check for gaps + chapter_nums = [c["num"] for c in chapters] + expected_nums = list(range(min(chapter_nums), max(chapter_nums) + 1)) + missing = set(expected_nums) - set(chapter_nums) + if missing: + print(f" ⚠️ Missing chapter numbers: {sorted(missing)}") + + # Language detection + combined_sample = ' '.join(sample_texts) if effective_mode == "smart" else '' + detected_language = self._detect_content_language(combined_sample) if combined_sample else 'unknown' + + if chapters: + self._print_extraction_summary(chapters, detected_language, 
extraction_mode, + h1_count if effective_mode == "smart" else 0, + h2_count if effective_mode == "smart" else 0, + file_size_groups if effective_mode == "smart" else {}) + + return chapters, detected_language + + def _extract_chapter_info(self, soup, file_path, content_text, html_content): + """Extract chapter number and title from various sources with parallel pattern matching""" + chapter_num = None + chapter_title = None + detection_method = None + + # SPECIAL HANDLING: When we have Section/Chapter pairs, differentiate them + filename = os.path.basename(file_path) + + # Handle different naming patterns for Section/Chapter files + if ('section' in filename.lower() or '_section' in filename.lower()) and 'chapter' not in filename.lower(): + # For Section files, add 0.1 to the base number + # Try different patterns + match = re.search(r'No(\d+)', filename) + if not match: + match = re.search(r'^(\d+)[_\-]', filename) + if not match: + match = re.search(r'^(\d+)', filename) + + if match: + base_num = int(match.group(1)) + chapter_num = base_num + 0.1 # Section gets .1 + detection_method = "filename_section_special" + + elif ('chapter' in filename.lower() or '_chapter' in filename.lower()) and 'section' not in filename.lower(): + # For Chapter files, use the base number + # Try different patterns + match = re.search(r'No(\d+)', filename) + if not match: + match = re.search(r'^(\d+)[_\-]', filename) + if not match: + match = re.search(r'^(\d+)', filename) + + if match: + chapter_num = int(match.group(1)) + detection_method = "filename_chapter_special" + + # If not handled by special logic, continue with normal extraction + if not chapter_num: + # Try filename first - use parallel pattern matching for better performance + chapter_patterns = [(pattern, flags, method) for pattern, flags, method in self.pattern_manager.CHAPTER_PATTERNS + if method.endswith('_number')] + + if len(chapter_patterns) > 3: # Only parallelize if we have enough patterns + # Parallel pattern matching for filename + with ThreadPoolExecutor(max_workers=min(4, len(chapter_patterns))) as executor: + def try_pattern(pattern_info): + pattern, flags, method = pattern_info + match = re.search(pattern, file_path, flags) + if match: + try: + num_str = match.group(1) + if num_str.isdigit(): + return int(num_str), f"filename_{method}" + elif method == 'chinese_chapter_cn': + converted = self._convert_chinese_number(num_str) + if converted: + return converted, f"filename_{method}" + except (ValueError, IndexError): + pass + return None, None + + # Submit all patterns + futures = [executor.submit(try_pattern, pattern_info) for pattern_info in chapter_patterns] + + # Check results as they complete + for future in as_completed(futures): + try: + num, method = future.result() + if num: + chapter_num = num + detection_method = method + # Cancel remaining futures + for f in futures: + f.cancel() + break + except Exception: + continue + else: + # Sequential processing for small pattern sets + for pattern, flags, method in chapter_patterns: + match = re.search(pattern, file_path, flags) + if match: + try: + num_str = match.group(1) + if num_str.isdigit(): + chapter_num = int(num_str) + detection_method = f"filename_{method}" + break + elif method == 'chinese_chapter_cn': + converted = self._convert_chinese_number(num_str) + if converted: + chapter_num = converted + detection_method = f"filename_{method}" + break + except (ValueError, IndexError): + continue + + # Try content if not found in filename + if not chapter_num: + # Check ignore 
settings for batch translation + batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' + ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active + ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active + + # Prepare all text sources to check in parallel + text_sources = [] + + # Add title tag if not ignored + if not ignore_title_tag and soup.title and soup.title.string: + title_text = soup.title.string.strip() + text_sources.append(("title", title_text, True)) # True means this can be chapter_title + + # Add headers if not ignored + if not ignore_header_tags: + for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + headers = soup.find_all(header_tag) + for header in headers[:3]: # Limit to first 3 of each type + header_text = header.get_text(strip=True) + if header_text: + text_sources.append((f"header_{header_tag}", header_text, True)) + + # Add first paragraphs + first_elements = soup.find_all(['p', 'div'])[:5] + for elem in first_elements: + elem_text = elem.get_text(strip=True) + if elem_text: + text_sources.append(("content", elem_text, False)) # False means don't use as chapter_title + + # Process text sources in parallel if we have many + if len(text_sources) > 5: + with ThreadPoolExecutor(max_workers=min(6, len(text_sources))) as executor: + def extract_from_source(source_info): + source_type, text, can_be_title = source_info + num, method = self._extract_from_text(text, source_type) + return num, method, text if (num and can_be_title) else None + + # Submit all text sources + future_to_source = {executor.submit(extract_from_source, source): source + for source in text_sources} + + # Process results as they complete + for future in as_completed(future_to_source): + try: + num, method, title = future.result() + if num: + chapter_num = num + detection_method = method + if title and not chapter_title: + chapter_title = title + # Cancel remaining futures + for f in future_to_source: + f.cancel() + break + except Exception: + continue + else: + # Sequential processing for small text sets + for source_type, text, can_be_title in text_sources: + num, method = self._extract_from_text(text, source_type) + if num: + chapter_num = num + detection_method = method + if can_be_title and not chapter_title: + chapter_title = text + break + + # Final fallback to filename patterns + if not chapter_num: + filename_base = os.path.basename(file_path) + # Parallel pattern matching for filename extraction + if len(self.pattern_manager.FILENAME_EXTRACT_PATTERNS) > 3: + with ThreadPoolExecutor(max_workers=min(4, len(self.pattern_manager.FILENAME_EXTRACT_PATTERNS))) as executor: + def try_filename_pattern(pattern): + match = re.search(pattern, filename_base, re.IGNORECASE) + if match: + try: + return int(match.group(1)) + except (ValueError, IndexError): + pass + return None + + futures = [executor.submit(try_filename_pattern, pattern) + for pattern in self.pattern_manager.FILENAME_EXTRACT_PATTERNS] + + for future in as_completed(futures): + try: + num = future.result() + if num: + chapter_num = num + detection_method = "filename_number" + for f in futures: + f.cancel() + break + except Exception: + continue + else: + # Sequential for small pattern sets + for pattern in self.pattern_manager.FILENAME_EXTRACT_PATTERNS: + match = re.search(pattern, filename_base, re.IGNORECASE) + if match: + chapter_num = int(match.group(1)) + detection_method = "filename_number" + break + + # Extract title if not already found (with ignore 
settings support) + if not chapter_title: + # Check ignore settings for batch translation + batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' + ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active + ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active + + # Try title tag if not ignored + if not ignore_title_tag and soup.title and soup.title.string: + chapter_title = soup.title.string.strip() + + # Try header tags if not ignored and no title found + if not chapter_title and not ignore_header_tags: + for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + header = soup.find(header_tag) + if header: + chapter_title = header.get_text(strip=True) + break + + # Final fallback + if not chapter_title: + chapter_title = f"Chapter {chapter_num}" if chapter_num else None + + chapter_title = re.sub(r'\s+', ' ', chapter_title).strip() if chapter_title else None + + return chapter_num, chapter_title, detection_method + + + def _extract_from_text(self, text, source_type): + """Extract chapter number from text using patterns with parallel matching for large pattern sets""" + # Get patterns that don't end with '_number' + text_patterns = [(pattern, flags, method) for pattern, flags, method in self.pattern_manager.CHAPTER_PATTERNS + if not method.endswith('_number')] + + # Only use parallel processing if we have many patterns + if len(text_patterns) > 5: + with ThreadPoolExecutor(max_workers=min(4, len(text_patterns))) as executor: + def try_text_pattern(pattern_info): + pattern, flags, method = pattern_info + match = re.search(pattern, text, flags) + if match: + try: + num_str = match.group(1) + if num_str.isdigit(): + return int(num_str), f"{source_type}_{method}" + elif method == 'chinese_chapter_cn': + converted = self._convert_chinese_number(num_str) + if converted: + return converted, f"{source_type}_{method}" + except (ValueError, IndexError): + pass + return None, None + + # Submit all patterns + futures = [executor.submit(try_text_pattern, pattern_info) for pattern_info in text_patterns] + + # Check results as they complete + for future in as_completed(futures): + try: + num, method = future.result() + if num: + # Cancel remaining futures + for f in futures: + f.cancel() + return num, method + except Exception: + continue + else: + # Sequential processing for small pattern sets + for pattern, flags, method in text_patterns: + match = re.search(pattern, text, flags) + if match: + try: + num_str = match.group(1) + if num_str.isdigit(): + return int(num_str), f"{source_type}_{method}" + elif method == 'chinese_chapter_cn': + converted = self._convert_chinese_number(num_str) + if converted: + return converted, f"{source_type}_{method}" + except (ValueError, IndexError): + continue + + return None, None + + def _convert_chinese_number(self, cn_num): + """Convert Chinese number to integer""" + if cn_num in self.pattern_manager.CHINESE_NUMS: + return self.pattern_manager.CHINESE_NUMS[cn_num] + + if '十' in cn_num: + parts = cn_num.split('十') + if len(parts) == 2: + tens = self.pattern_manager.CHINESE_NUMS.get(parts[0], 1) if parts[0] else 1 + ones = self.pattern_manager.CHINESE_NUMS.get(parts[1], 0) if parts[1] else 0 + return tens * 10 + ones + + return None + + def _detect_content_language(self, text_sample): + """Detect the primary language of content with parallel processing for large texts""" + + # For very short texts, use sequential processing + if len(text_sample) < 1000: + scripts = { + 'korean': 0, + 
'japanese_hiragana': 0, + 'japanese_katakana': 0, + 'chinese': 0, + 'latin': 0 + } + + for char in text_sample: + code = ord(char) + if 0xAC00 <= code <= 0xD7AF: + scripts['korean'] += 1 + elif 0x3040 <= code <= 0x309F: + scripts['japanese_hiragana'] += 1 + elif 0x30A0 <= code <= 0x30FF: + scripts['japanese_katakana'] += 1 + elif 0x4E00 <= code <= 0x9FFF: + scripts['chinese'] += 1 + elif 0x0020 <= code <= 0x007F: + scripts['latin'] += 1 + else: + # For longer texts, use parallel processing + # Split text into chunks for parallel processing + chunk_size = max(500, len(text_sample) // (os.cpu_count() or 4)) + chunks = [text_sample[i:i + chunk_size] for i in range(0, len(text_sample), chunk_size)] + + # Thread-safe accumulator + scripts_lock = threading.Lock() + scripts = { + 'korean': 0, + 'japanese_hiragana': 0, + 'japanese_katakana': 0, + 'chinese': 0, + 'latin': 0 + } + + def process_chunk(text_chunk): + """Process a chunk of text and return script counts""" + local_scripts = { + 'korean': 0, + 'japanese_hiragana': 0, + 'japanese_katakana': 0, + 'chinese': 0, + 'latin': 0 + } + + for char in text_chunk: + code = ord(char) + if 0xAC00 <= code <= 0xD7AF: + local_scripts['korean'] += 1 + elif 0x3040 <= code <= 0x309F: + local_scripts['japanese_hiragana'] += 1 + elif 0x30A0 <= code <= 0x30FF: + local_scripts['japanese_katakana'] += 1 + elif 0x4E00 <= code <= 0x9FFF: + local_scripts['chinese'] += 1 + elif 0x0020 <= code <= 0x007F: + local_scripts['latin'] += 1 + + return local_scripts + + # Process chunks in parallel + with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 4, len(chunks))) as executor: + # Submit all chunks + futures = [executor.submit(process_chunk, chunk) for chunk in chunks] + + # Collect results + for future in as_completed(futures): + try: + chunk_scripts = future.result() + # Thread-safe accumulation + with scripts_lock: + for script, count in chunk_scripts.items(): + scripts[script] += count + except Exception as e: + print(f"[WARNING] Error processing chunk in language detection: {e}") + + # Language determination logic (same as original) + total_cjk = scripts['korean'] + scripts['japanese_hiragana'] + scripts['japanese_katakana'] + scripts['chinese'] + + if scripts['korean'] > total_cjk * 0.3: + return 'korean' + elif scripts['japanese_hiragana'] + scripts['japanese_katakana'] > total_cjk * 0.2: + return 'japanese' + elif scripts['chinese'] > total_cjk * 0.3: + return 'chinese' + elif scripts['latin'] > len(text_sample) * 0.7: + return 'english' + else: + return 'unknown' + + def _print_extraction_summary(self, chapters, detected_language, extraction_mode, h1_count, h2_count, file_size_groups): + """Print extraction summary""" + print(f"\n📊 Chapter Extraction Summary ({extraction_mode.capitalize()} Mode):") + print(f" • Total chapters extracted: {len(chapters)}") + + # Format chapter range handling both int and float + first_num = chapters[0]['num'] + last_num = chapters[-1]['num'] + + print(f" • Chapter range: {first_num} to {last_num}") + print(f" • Detected language: {detected_language}") + + if extraction_mode == "smart": + print(f" • Primary header type: {'
<h2>' if h2_count > h1_count else '<h1>
'}") + + image_only_count = sum(1 for c in chapters if c.get('is_image_only', False)) + text_only_count = sum(1 for c in chapters if not c.get('has_images', False) and c.get('file_size', 0) >= 500) + mixed_count = sum(1 for c in chapters if c.get('has_images', False) and c.get('file_size', 0) >= 500) + empty_count = sum(1 for c in chapters if c.get('file_size', 0) < 50) + + print(f" • Text-only chapters: {text_only_count}") + print(f" • Image-only chapters: {image_only_count}") + print(f" • Mixed content chapters: {mixed_count}") + print(f" • Empty/minimal content: {empty_count}") + + # Check for merged chapters + merged_count = sum(1 for c in chapters if c.get('was_merged', False)) + if merged_count > 0: + print(f" • Merged chapters: {merged_count}") + + # Check for missing chapters (only for integer sequences) + expected_chapters = set(range(chapters[0]['num'], chapters[-1]['num'] + 1)) + actual_chapters = set(c['num'] for c in chapters) + missing = expected_chapters - actual_chapters + if missing: + print(f" ⚠️ Missing chapter numbers: {sorted(missing)}") + + if extraction_mode == "smart": + method_stats = Counter(c['detection_method'] for c in chapters) + print(f" 📈 Detection methods used:") + for method, count in method_stats.most_common(): + print(f" • {method}: {count} chapters") + + large_groups = [size for size, files in file_size_groups.items() if len(files) > 1] + if large_groups: + print(f" ⚠️ Found {len(large_groups)} file size groups with potential duplicates") + else: + print(f" • Empty/placeholder: {empty_count}") + + if extraction_mode == "full": + print(f" 🔍 Full extraction preserved all HTML structure and tags") + + def _extract_epub_metadata(self, zf): + """Extract comprehensive metadata from EPUB file including all custom fields""" + meta = {} + # Use lxml for XML if available + xml_parser = 'lxml-xml' if self.parser == 'lxml' else 'xml' + try: + for name in zf.namelist(): + if name.lower().endswith('.opf'): + opf_content = zf.read(name) + soup = BeautifulSoup(opf_content, xml_parser) + + # Extract ALL Dublin Core elements (expanded list) + dc_elements = ['title', 'creator', 'subject', 'description', + 'publisher', 'contributor', 'date', 'type', + 'format', 'identifier', 'source', 'language', + 'relation', 'coverage', 'rights'] + + for element in dc_elements: + tag = soup.find(element) + if tag and tag.get_text(strip=True): + meta[element] = tag.get_text(strip=True) + + # Extract ALL meta tags (not just series) + meta_tags = soup.find_all('meta') + for meta_tag in meta_tags: + # Try different attribute names for the metadata name + name = meta_tag.get('name') or meta_tag.get('property', '') + content = meta_tag.get('content', '') + + if name and content: + # Store original name for debugging + original_name = name + + # Clean up common prefixes + if name.startswith('calibre:'): + name = name[8:] # Remove 'calibre:' prefix + elif name.startswith('dc:'): + name = name[3:] # Remove 'dc:' prefix + elif name.startswith('opf:'): + name = name[4:] # Remove 'opf:' prefix + + # Normalize the field name - replace hyphens with underscores + name = name.replace('-', '_') + + # Don't overwrite if already exists (prefer direct tags over meta tags) + if name not in meta: + meta[name] = content + + # Debug output for custom fields + if original_name != name: + print(f" • Found custom field: {original_name} → {name}") + + # Special handling for series information (maintain compatibility) + if 'series' not in meta: + series_tags = soup.find_all('meta', attrs={'name': lambda x: x and 
'series' in x.lower()}) + for series_tag in series_tags: + series_name = series_tag.get('content', '') + if series_name: + meta['series'] = series_name + break + + # Extract refines metadata (used by some EPUB creators) + refines_metas = soup.find_all('meta', attrs={'refines': True}) + for refine in refines_metas: + property_name = refine.get('property', '') + content = refine.get_text(strip=True) or refine.get('content', '') + + if property_name and content: + # Clean property name + if ':' in property_name: + property_name = property_name.split(':')[-1] + property_name = property_name.replace('-', '_') + + if property_name not in meta: + meta[property_name] = content + + # Log extraction summary + print(f"📋 Extracted {len(meta)} metadata fields") + + # Show standard vs custom fields + standard_keys = {'title', 'creator', 'language', 'subject', 'description', + 'publisher', 'date', 'identifier', 'source', 'rights', + 'contributor', 'type', 'format', 'relation', 'coverage'} + custom_keys = set(meta.keys()) - standard_keys + + if custom_keys: + print(f"📋 Standard fields: {len(standard_keys & set(meta.keys()))}") + print(f"📋 Custom fields found: {sorted(custom_keys)}") + + # Show sample values for custom fields (truncated) + for key in sorted(custom_keys)[:5]: # Show first 5 custom fields + value = str(meta[key]) + if len(value) > 50: + value = value[:47] + "..." + print(f" • {key}: {value}") + + if len(custom_keys) > 5: + print(f" • ... and {len(custom_keys) - 5} more custom fields") + + break + + except Exception as e: + print(f"[WARNING] Failed to extract metadata: {e}") + import traceback + traceback.print_exc() + + return meta + + def _categorize_resource(self, file_path, file_name): + """Categorize a file and return (resource_type, target_dir, safe_filename)""" + file_path_lower = file_path.lower() + file_name_lower = file_name.lower() + + if file_path_lower.endswith('.css'): + return 'css', 'css', sanitize_resource_filename(file_name) + elif file_path_lower.endswith(('.ttf', '.otf', '.woff', '.woff2', '.eot')): + return 'fonts', 'fonts', sanitize_resource_filename(file_name) + elif file_path_lower.endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg', '.bmp', '.webp')): + return 'images', 'images', sanitize_resource_filename(file_name) + elif (file_path_lower.endswith(('.opf', '.ncx')) or + file_name_lower == 'container.xml' or + 'container.xml' in file_path_lower): + if 'container.xml' in file_path_lower: + safe_filename = 'container.xml' + else: + safe_filename = file_name + return 'epub_structure', None, safe_filename + elif file_path_lower.endswith(('.js', '.xml', '.txt')): + return 'other', None, sanitize_resource_filename(file_name) + + return None + + def _cleanup_old_resources(self, output_dir): + """Clean up old resource directories and EPUB structure files""" + print("🧹 Cleaning up any existing resource directories...") + + cleanup_success = True + + for resource_type in ['css', 'fonts', 'images']: + resource_dir = os.path.join(output_dir, resource_type) + if os.path.exists(resource_dir): + try: + shutil.rmtree(resource_dir) + print(f" 🗑️ Removed old {resource_type} directory") + except PermissionError as e: + print(f" ⚠️ Cannot remove {resource_type} directory (permission denied) - will merge with existing files") + cleanup_success = False + except Exception as e: + print(f" ⚠️ Error removing {resource_type} directory: {e} - will merge with existing files") + cleanup_success = False + + epub_structure_files = ['container.xml', 'content.opf', 'toc.ncx'] + for epub_file in 
epub_structure_files: + input_path = os.path.join(output_dir, epub_file) + if os.path.exists(input_path): + try: + os.remove(input_path) + print(f" 🗑️ Removed old {epub_file}") + except PermissionError: + print(f" ⚠️ Cannot remove {epub_file} (permission denied) - will use existing file") + except Exception as e: + print(f" ⚠️ Error removing {epub_file}: {e}") + + try: + for file in os.listdir(output_dir): + if file.lower().endswith(('.opf', '.ncx')): + file_path = os.path.join(output_dir, file) + try: + os.remove(file_path) + print(f" 🗑️ Removed old EPUB file: {file}") + except PermissionError: + print(f" ⚠️ Cannot remove {file} (permission denied)") + except Exception as e: + print(f" ⚠️ Error removing {file}: {e}") + except Exception as e: + print(f"⚠️ Error scanning for EPUB files: {e}") + + if not cleanup_success: + print("⚠️ Some cleanup operations failed due to file permissions") + print(" The program will continue and merge with existing files") + + return cleanup_success + + def _count_existing_resources(self, output_dir, extracted_resources): + """Count existing resources when skipping extraction""" + for resource_type in ['css', 'fonts', 'images', 'epub_structure']: + if resource_type == 'epub_structure': + epub_files = [] + for file in ['container.xml', 'content.opf', 'toc.ncx']: + if os.path.exists(os.path.join(output_dir, file)): + epub_files.append(file) + try: + for file in os.listdir(output_dir): + if file.lower().endswith(('.opf', '.ncx')) and file not in epub_files: + epub_files.append(file) + except: + pass + extracted_resources[resource_type] = epub_files + else: + resource_dir = os.path.join(output_dir, resource_type) + if os.path.exists(resource_dir): + try: + files = [f for f in os.listdir(resource_dir) if os.path.isfile(os.path.join(resource_dir, f))] + extracted_resources[resource_type] = files + except: + extracted_resources[resource_type] = [] + + total_existing = sum(len(files) for files in extracted_resources.values()) + print(f"✅ Found {total_existing} existing resource files") + return extracted_resources + + def _validate_critical_files(self, output_dir, extracted_resources): + """Validate that critical EPUB files were extracted""" + total_extracted = sum(len(files) for files in extracted_resources.values()) + print(f"✅ Extracted {total_extracted} resource files:") + + for resource_type, files in extracted_resources.items(): + if files: + if resource_type == 'epub_structure': + print(f" • EPUB Structure: {len(files)} files") + for file in files: + print(f" - {file}") + else: + print(f" • {resource_type.title()}: {len(files)} files") + + critical_files = ['container.xml'] + missing_critical = [f for f in critical_files if not os.path.exists(os.path.join(output_dir, f))] + + if missing_critical: + print(f"⚠️ WARNING: Missing critical EPUB files: {missing_critical}") + print(" This may prevent proper EPUB reconstruction!") + else: + print("✅ All critical EPUB structure files extracted successfully") + + opf_files = [f for f in extracted_resources['epub_structure'] if f.lower().endswith('.opf')] + if not opf_files: + print("⚠️ WARNING: No OPF file found! 
This will prevent EPUB reconstruction.") + else: + print(f"✅ Found OPF file(s): {opf_files}") + + def _create_extraction_report(self, output_dir, metadata, chapters, extracted_resources): + """Create comprehensive extraction report with HTML file tracking""" + report_path = os.path.join(output_dir, 'extraction_report.txt') + with open(report_path, 'w', encoding='utf-8') as f: + f.write("EPUB Extraction Report\n") + f.write("=" * 50 + "\n\n") + + f.write(f"EXTRACTION MODE: {metadata.get('extraction_mode', 'unknown').upper()}\n\n") + + f.write("METADATA:\n") + for key, value in metadata.items(): + if key not in ['chapter_titles', 'extracted_resources', 'extraction_mode']: + f.write(f" {key}: {value}\n") + + f.write(f"\nCHAPTERS ({len(chapters)}):\n") + + text_chapters = [] + image_only_chapters = [] + mixed_chapters = [] + + for chapter in chapters: + if chapter.get('has_images') and chapter.get('file_size', 0) < 500: + image_only_chapters.append(chapter) + elif chapter.get('has_images') and chapter.get('file_size', 0) >= 500: + mixed_chapters.append(chapter) + else: + text_chapters.append(chapter) + + if text_chapters: + f.write(f"\n TEXT CHAPTERS ({len(text_chapters)}):\n") + for c in text_chapters: + f.write(f" {c['num']:3d}. {c['title']} ({c['detection_method']})\n") + if c.get('original_html_file'): + f.write(f" → {c['original_html_file']}\n") + + if image_only_chapters: + f.write(f"\n IMAGE-ONLY CHAPTERS ({len(image_only_chapters)}):\n") + for c in image_only_chapters: + f.write(f" {c['num']:3d}. {c['title']} (images: {c.get('image_count', 0)})\n") + if c.get('original_html_file'): + f.write(f" → {c['original_html_file']}\n") + if 'body' in c: + try: + soup = BeautifulSoup(c['body'], 'html.parser') + images = soup.find_all('img') + for img in images[:3]: + src = img.get('src', 'unknown') + f.write(f" • Image: {src}\n") + if len(images) > 3: + f.write(f" • ... and {len(images) - 3} more images\n") + except: + pass + + if mixed_chapters: + f.write(f"\n MIXED CONTENT CHAPTERS ({len(mixed_chapters)}):\n") + for c in mixed_chapters: + f.write(f" {c['num']:3d}. {c['title']} (text: {c.get('file_size', 0)} chars, images: {c.get('image_count', 0)})\n") + if c.get('original_html_file'): + f.write(f" → {c['original_html_file']}\n") + + f.write(f"\nRESOURCES EXTRACTED:\n") + for resource_type, files in extracted_resources.items(): + if files: + if resource_type == 'epub_structure': + f.write(f" EPUB Structure: {len(files)} files\n") + for file in files: + f.write(f" - {file}\n") + else: + f.write(f" {resource_type.title()}: {len(files)} files\n") + for file in files[:5]: + f.write(f" - {file}\n") + if len(files) > 5: + f.write(f" ... 
and {len(files) - 5} more\n") + + f.write(f"\nHTML FILES WRITTEN:\n") + html_files_written = metadata.get('html_files_written', 0) + f.write(f" Total: {html_files_written} files\n") + f.write(f" Location: Main directory and 'originals' subdirectory\n") + + f.write(f"\nPOTENTIAL ISSUES:\n") + issues = [] + + if image_only_chapters: + issues.append(f" • {len(image_only_chapters)} chapters contain only images (may need OCR)") + + missing_html = sum(1 for c in chapters if not c.get('original_html_file')) + if missing_html > 0: + issues.append(f" • {missing_html} chapters failed to write HTML files") + + if not extracted_resources.get('epub_structure'): + issues.append(" • No EPUB structure files found (may affect reconstruction)") + + if not issues: + f.write(" None detected - extraction appears successful!\n") + else: + for issue in issues: + f.write(issue + "\n") + + print(f"📄 Saved extraction report to: {report_path}") + + def _log_extraction_summary(self, chapters, extracted_resources, detected_language, html_files_written=0): + """Log final extraction summary with HTML file information""" + extraction_mode = chapters[0].get('extraction_mode', 'unknown') if chapters else 'unknown' + + print(f"\n✅ {extraction_mode.capitalize()} extraction complete!") + print(f" 📚 Chapters: {len(chapters)}") + print(f" 📄 HTML files written: {html_files_written}") + print(f" 🎨 Resources: {sum(len(files) for files in extracted_resources.values())}") + print(f" 🌍 Language: {detected_language}") + + image_only_count = sum(1 for c in chapters if c.get('has_images') and c.get('file_size', 0) < 500) + if image_only_count > 0: + print(f" 📸 Image-only chapters: {image_only_count}") + + epub_files = extracted_resources.get('epub_structure', []) + if epub_files: + print(f" 📋 EPUB Structure: {len(epub_files)} files ({', '.join(epub_files)})") + else: + print(f" ⚠️ No EPUB structure files extracted!") + + print(f"\n🔍 Pre-flight check readiness:") + print(f" ✅ HTML files: {'READY' if html_files_written > 0 else 'NOT READY'}") + print(f" ✅ Metadata: READY") + print(f" ✅ Resources: READY") + +# ===================================================== +# UNIFIED TRANSLATION PROCESSOR +# ===================================================== + +class TranslationProcessor: + """Handles the translation of individual chapters""" + + def __init__(self, config, client, out_dir, log_callback=None, stop_callback=None, uses_zero_based=False, is_text_file=False): + self.config = config + self.client = client + self.out_dir = out_dir + self.log_callback = log_callback + self.stop_callback = stop_callback + self.chapter_splitter = ChapterSplitter(model_name=config.MODEL) + self.uses_zero_based = uses_zero_based + self.is_text_file = is_text_file + + # Check and log multi-key status + if hasattr(self.client, 'use_multi_keys') and self.client.use_multi_keys: + stats = self.client.get_stats() + self._log(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys") + self._log(f" Active keys: {stats.get('active_keys', 0)}") + + def _log(self, message): + """Log a message""" + if self.log_callback: + self.log_callback(message) + else: + print(message) + + def report_key_status(self): + """Report multi-key status if available""" + if hasattr(self.client, 'get_stats'): + stats = self.client.get_stats() + if stats.get('multi_key_mode', False): + self._log(f"\n📊 API Key Status:") + self._log(f" Active Keys: {stats.get('active_keys', 0)}/{stats.get('total_keys', 0)}") + self._log(f" Success Rate: {stats.get('success_rate', 0):.1%}") + self._log(f" 
Total Requests: {stats.get('total_requests', 0)}\n") + + def check_stop(self): + """Check if translation should stop""" + if self.stop_callback and self.stop_callback(): + print("❌ Translation stopped by user request.") + return True + + def check_duplicate_content(self, result, idx, prog, out, actual_num=None): + """Check if translated content is duplicate - with mode selection""" + + # Get detection mode from config + detection_mode = getattr(self.config, 'DUPLICATE_DETECTION_MODE', 'basic') + print(f" 🔍 DEBUG: Detection mode = '{detection_mode}'") + print(f" 🔍 DEBUG: Lookback chapters = {self.config.DUPLICATE_LOOKBACK_CHAPTERS}") + + # Extract content_hash if available from progress + content_hash = None + if detection_mode == 'ai-hunter': + # Try to get content_hash from the current chapter info + # Use actual_num if provided, otherwise fallback to idx+1 + if actual_num is not None: + chapter_key = str(actual_num) + else: + chapter_key = str(idx + 1) + if chapter_key in prog.get("chapters", {}): + chapter_info = prog["chapters"][chapter_key] + content_hash = chapter_info.get("content_hash") + print(f" 🔍 DEBUG: Found content_hash for chapter {idx}: {content_hash}") + + if detection_mode == 'ai-hunter': + print(" 🤖 DEBUG: Routing to AI Hunter detection...") + # Check if AI Hunter method is available (injected by the wrapper) + if hasattr(self, '_check_duplicate_ai_hunter'): + return self._check_duplicate_ai_hunter(result, idx, prog, out, content_hash) + else: + print(" ⚠️ AI Hunter method not available, falling back to basic detection") + return self._check_duplicate_basic(result, idx, prog, out) + elif detection_mode == 'cascading': + print(" 🔄 DEBUG: Routing to Cascading detection...") + return self._check_duplicate_cascading(result, idx, prog, out) + else: + print(" 📋 DEBUG: Routing to Basic detection...") + return self._check_duplicate_basic(result, idx, prog, out) + + def _check_duplicate_basic(self, result, idx, prog, out): + """Original basic duplicate detection""" + try: + result_clean = re.sub(r'<[^>]+>', '', result).strip().lower() + result_sample = result_clean[:1000] + + lookback_chapters = self.config.DUPLICATE_LOOKBACK_CHAPTERS + + for prev_idx in range(max(0, idx - lookback_chapters), idx): + prev_key = str(prev_idx) + if prev_key in prog["chapters"] and prog["chapters"][prev_key].get("output_file"): + prev_file = prog["chapters"][prev_key]["output_file"] + prev_path = os.path.join(out, prev_file) + + if os.path.exists(prev_path): + try: + with open(prev_path, 'r', encoding='utf-8') as f: + prev_content = f.read() + prev_clean = re.sub(r'<[^>]+>', '', prev_content).strip().lower() + prev_sample = prev_clean[:1000] + + # Use SequenceMatcher for similarity comparison + similarity = SequenceMatcher(None, result_sample, prev_sample).ratio() + + if similarity >= 0.85: # 85% threshold + print(f" 🚀 Basic detection: Duplicate found ({int(similarity*100)}%)") + return True, int(similarity * 100) + + except Exception as e: + print(f" Warning: Failed to read {prev_path}: {e}") + continue + + return False, 0 + + except Exception as e: + print(f" Warning: Failed to check duplicate content: {e}") + return False, 0 + + + def _check_duplicate_cascading(self, result, idx, prog, out): + """Cascading detection - basic first, then AI Hunter for borderline cases""" + # Step 1: Basic + is_duplicate_basic, similarity_basic = self._check_duplicate_basic(result, idx, prog, out) + + if is_duplicate_basic: + return True, similarity_basic + + # Step 2: If basic detection finds moderate similarity, 
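+
+# --- Illustrative sketch (not part of the original module; names are hypothetical) ---
+# The basic detector above strips tags, lowercases, keeps the first 1,000
+# characters, and flags a duplicate when SequenceMatcher reports >= 85%.
+# The same idea in isolation:
+import re
+from difflib import SequenceMatcher
+
+def looks_like_duplicate(html_a, html_b, threshold=0.85):
+    strip = lambda h: re.sub(r'<[^>]+>', '', h).strip().lower()[:1000]
+    return SequenceMatcher(None, strip(html_a), strip(html_b)).ratio() >= threshold
+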
use AI Hunter + if similarity_basic >= 60: # Configurable threshold + print(f" 🤖 Moderate similarity ({similarity_basic}%) - running AI Hunter analysis...") + if hasattr(self, '_check_duplicate_ai_hunter'): + is_duplicate_ai, similarity_ai = self._check_duplicate_ai_hunter(result, idx, prog, out) + if is_duplicate_ai: + return True, similarity_ai + else: + print(" ⚠️ AI Hunter method not available for cascading analysis") + + return False, max(similarity_basic, 0) + + def _extract_text_features(self, text): + """Extract multiple features from text for AI Hunter analysis""" + features = { + 'semantic': {}, + 'structural': {}, + 'characters': [], + 'patterns': {} + } + + # Semantic fingerprint + lines = text.split('\n') + + # Character extraction (names that appear 3+ times) + words = re.findall(r'\b[A-Z][a-z]+\b', text) + word_freq = Counter(words) + features['characters'] = [name for name, count in word_freq.items() if count >= 3] + + # Dialogue patterns + dialogue_patterns = re.findall(r'"([^"]+)"', text) + features['semantic']['dialogue_count'] = len(dialogue_patterns) + features['semantic']['dialogue_lengths'] = [len(d) for d in dialogue_patterns[:10]] + + # Speaker patterns + speaker_patterns = re.findall(r'(\w+)\s+(?:said|asked|replied|shouted|whispered)', text.lower()) + features['semantic']['speakers'] = list(set(speaker_patterns[:20])) + + # Number extraction + numbers = re.findall(r'\b\d+\b', text) + features['patterns']['numbers'] = numbers[:20] + + # Structural signature + para_lengths = [] + dialogue_count = 0 + for para in text.split('\n\n'): + if para.strip(): + para_lengths.append(len(para)) + if '"' in para: + dialogue_count += 1 + + features['structural']['para_count'] = len(para_lengths) + features['structural']['avg_para_length'] = sum(para_lengths) / max(1, len(para_lengths)) + features['structural']['dialogue_ratio'] = dialogue_count / max(1, len(para_lengths)) + + # Create structural pattern string + pattern = [] + for para in text.split('\n\n')[:20]: # First 20 paragraphs + if para.strip(): + if '"' in para: + pattern.append('D') # Dialogue + elif len(para) > 300: + pattern.append('L') # Long + elif len(para) < 100: + pattern.append('S') # Short + else: + pattern.append('M') # Medium + features['structural']['pattern'] = ''.join(pattern) + + return features + + def _calculate_exact_similarity(self, text1, text2): + """Calculate exact text similarity""" + return SequenceMatcher(None, text1.lower(), text2.lower()).ratio() + + def _calculate_smart_similarity(self, text1, text2): + """Smart similarity with length-aware sampling""" + # Check length ratio first + len_ratio = len(text1) / max(1, len(text2)) + if len_ratio < 0.7 or len_ratio > 1.3: + return 0.0 + + # Smart sampling for large texts + if len(text1) > 10000: + sample_size = 3000 + samples1 = [ + text1[:sample_size], + text1[len(text1)//2 - sample_size//2:len(text1)//2 + sample_size//2], + text1[-sample_size:] + ] + samples2 = [ + text2[:sample_size], + text2[len(text2)//2 - sample_size//2:len(text2)//2 + sample_size//2], + text2[-sample_size:] + ] + similarities = [SequenceMatcher(None, s1.lower(), s2.lower()).ratio() + for s1, s2 in zip(samples1, samples2)] + return sum(similarities) / len(similarities) + else: + # Use first 2000 chars for smaller texts + return SequenceMatcher(None, text1[:2000].lower(), text2[:2000].lower()).ratio() + + def _calculate_semantic_similarity(self, sem1, sem2): + """Calculate semantic fingerprint similarity""" + score = 0.0 + max_score = 0.0 + + # Compare dialogue counts + if 
'dialogue_count' in sem1 and 'dialogue_count' in sem2: + max_score += 1.0 + ratio = min(sem1['dialogue_count'], sem2['dialogue_count']) / max(1, max(sem1['dialogue_count'], sem2['dialogue_count'])) + score += ratio * 0.3 + + # Compare speakers + if 'speakers' in sem1 and 'speakers' in sem2: + max_score += 1.0 + if sem1['speakers'] and sem2['speakers']: + overlap = len(set(sem1['speakers']) & set(sem2['speakers'])) + total = len(set(sem1['speakers']) | set(sem2['speakers'])) + score += (overlap / max(1, total)) * 0.4 + + # Compare dialogue lengths pattern + if 'dialogue_lengths' in sem1 and 'dialogue_lengths' in sem2: + max_score += 1.0 + if sem1['dialogue_lengths'] and sem2['dialogue_lengths']: + # Compare dialogue length patterns + len1 = sem1['dialogue_lengths'][:10] + len2 = sem2['dialogue_lengths'][:10] + if len1 and len2: + avg1 = sum(len1) / len(len1) + avg2 = sum(len2) / len(len2) + ratio = min(avg1, avg2) / max(1, max(avg1, avg2)) + score += ratio * 0.3 + + return score / max(1, max_score) + + def _calculate_structural_similarity(self, struct1, struct2): + """Calculate structural signature similarity""" + score = 0.0 + + # Compare paragraph patterns + if 'pattern' in struct1 and 'pattern' in struct2: + pattern_sim = SequenceMatcher(None, struct1['pattern'], struct2['pattern']).ratio() + score += pattern_sim * 0.4 + + # Compare paragraph statistics + if all(k in struct1 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']) and \ + all(k in struct2 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']): + + # Paragraph count ratio + para_ratio = min(struct1['para_count'], struct2['para_count']) / max(1, max(struct1['para_count'], struct2['para_count'])) + score += para_ratio * 0.2 + + # Average length ratio + avg_ratio = min(struct1['avg_para_length'], struct2['avg_para_length']) / max(1, max(struct1['avg_para_length'], struct2['avg_para_length'])) + score += avg_ratio * 0.2 + + # Dialogue ratio similarity + dialogue_diff = abs(struct1['dialogue_ratio'] - struct2['dialogue_ratio']) + score += (1 - dialogue_diff) * 0.2 + + return score + + def _calculate_character_similarity(self, chars1, chars2): + """Calculate character name similarity""" + if not chars1 or not chars2: + return 0.0 + + # Find overlapping characters + set1 = set(chars1) + set2 = set(chars2) + overlap = len(set1 & set2) + total = len(set1 | set2) + + return overlap / max(1, total) + + def _calculate_pattern_similarity(self, pat1, pat2): + """Calculate pattern-based similarity""" + score = 0.0 + + # Compare numbers (they rarely change in translations) + if 'numbers' in pat1 and 'numbers' in pat2: + nums1 = set(pat1['numbers']) + nums2 = set(pat2['numbers']) + if nums1 and nums2: + overlap = len(nums1 & nums2) + total = len(nums1 | nums2) + score = overlap / max(1, total) + + return score + + def generate_rolling_summary(self, history_manager, chapter_num, base_system_content=None, source_text=None): + """Generate rolling summary after a chapter for context continuity. + Uses a dedicated summary system prompt (with glossary) distinct from translation. + Writes the summary to rolling_summary.txt and returns the summary string. 
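+
+# --- Illustrative sketch (not part of the original module; the weights are hypothetical) ---
+# Each _calculate_*_similarity helper above returns a score in [0, 1]. An
+# AI Hunter-style verdict typically blends them into a single number, e.g.:
+def blend_similarity_scores(scores):
+    weights = {'exact': 0.3, 'semantic': 0.25, 'structural': 0.2,
+               'characters': 0.15, 'patterns': 0.1}
+    return sum(weights[k] * scores.get(k, 0.0) for k in weights)
+
+# blend_similarity_scores({'exact': 0.9, 'semantic': 0.8}) -> 0.47
+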
+ """ + if not self.config.USE_ROLLING_SUMMARY: + return None + + + current_history = history_manager.load_history() + messages_to_include = self.config.ROLLING_SUMMARY_EXCHANGES * 2 + + # Prefer directly provided source text (e.g., just-translated chapter) when available + assistant_responses = [] + if source_text and isinstance(source_text, str) and source_text.strip(): + assistant_responses = [source_text] + else: + if len(current_history) >= 2: + recent_messages = current_history[-messages_to_include:] if messages_to_include > 0 else current_history + for h in recent_messages: + if h.get("role") == "assistant": + assistant_responses.append(h["content"]) + + # If still empty, skip quietly + if not assistant_responses: + return None + + # Build a dedicated summary system prompt (do NOT reuse main translation system prompt) + # Append glossary to keep terminology consistent + summary_system_template = os.getenv("ROLLING_SUMMARY_SYSTEM_PROMPT", "You create concise summaries for continuity.").strip() + try: + glossary_path = find_glossary_file(self.out_dir) + except Exception: + glossary_path = None + system_prompt = build_system_prompt(summary_system_template, glossary_path) + # Add explicit instruction for clarity + system_prompt += "\n\n[Instruction: Generate a concise rolling summary of the previous chapter. Use glossary terms consistently. Do not include warnings or explanations.]" + + user_prompt_template = os.getenv( + "ROLLING_SUMMARY_USER_PROMPT", + "Summarize the key events, characters, tone, and important details from these translations. " + "Focus on: character names/relationships, plot developments, and any special terminology used.\n\n" + "{translations}" + ) + + translations_text = "\n---\n".join(assistant_responses) + user_prompt = user_prompt_template.replace("{translations}", translations_text) + + summary_msgs = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": f"[Rolling Summary of Chapter {chapter_num}]\n" + user_prompt} + ] + + + try: + summary_resp, _ = send_with_interrupt( + summary_msgs, self.client, self.config.TEMP, + min(2000, self.config.MAX_OUTPUT_TOKENS), + self.check_stop, + context='summary' + ) + + # Save the summary to the output folder + summary_file = os.path.join(self.out_dir, "rolling_summary.txt") + header = f"=== Rolling Summary of Chapter {chapter_num} ===\n(This is a summary of the previous chapter for context)\n" + + mode = "a" if self.config.ROLLING_SUMMARY_MODE == "append" else "w" + with open(summary_file, mode, encoding="utf-8") as sf: + if mode == "a": + sf.write("\n\n") + sf.write(header) + sf.write(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}]\n") + sf.write(summary_resp.strip()) + + # If in append mode, trim to retain only the last N entries if configured + try: + if self.config.ROLLING_SUMMARY_MODE == "append": + max_entries = int(getattr(self.config, "ROLLING_SUMMARY_MAX_ENTRIES", 0) or 0) + if max_entries > 0: + with open(summary_file, 'r', encoding='utf-8') as rf: + content = rf.read() + # Find the start of each summary block by header line + headers = [m.start() for m in re.finditer(r"(?m)^===\s*Rolling Summary.*$", content)] + if len(headers) > max_entries: + # Keep only the last max_entries blocks + keep_starts = headers[-max_entries:] + blocks = [] + for i, s in enumerate(keep_starts): + e = keep_starts[i + 1] if i + 1 < len(keep_starts) else len(content) + block = content[s:e].strip() + if block: + blocks.append(block) + trimmed_content = ("\n\n".join(blocks) + "\n") if blocks else "" + with 
open(summary_file, 'w', encoding='utf-8') as wf: + wf.write(trimmed_content) + # Optional log showing retained count + try: + self._log(f"📚 Total summaries in memory: {len(blocks)} (trimmed to last {max_entries})") + except Exception: + pass + except Exception as _trim_err: + try: + self._log(f"⚠️ Failed to trim rolling summaries: {_trim_err}") + except Exception: + pass + + # Log to GUI if available, otherwise console + try: + self._log(f"📝 Generated rolling summary for Chapter {chapter_num} ({'append' if mode=='a' else 'replace'} mode)") + self._log(f" ➜ Saved to: {summary_file} ({len(summary_resp.strip())} chars)") + except Exception: + print(f"📝 Generated rolling summary for Chapter {chapter_num} ({'append' if mode=='a' else 'replace'} mode)") + print(f" ➜ Saved to: {summary_file} ({len(summary_resp.strip())} chars)") + return summary_resp.strip() + + except Exception as e: + try: + self._log(f"⚠️ Failed to generate rolling summary: {e}") + except Exception: + print(f"⚠️ Failed to generate rolling summary: {e}") + return None + + def translate_with_retry(self, msgs, chunk_html, c, chunk_idx, total_chunks): + """Handle translation with retry logic""" + + # CRITICAL FIX: Reset client state for each chunk + if hasattr(self.client, 'reset_cleanup_state'): + self.client.reset_cleanup_state() + + # Also ensure we're not in cleanup mode from previous operations + if hasattr(self.client, '_in_cleanup'): + self.client._in_cleanup = False + if hasattr(self.client, '_cancelled'): + self.client._cancelled = False + + + retry_count = 0 + + # Get retry attempts from AI Hunter config if available + ai_config = {} + try: + # Try to get AI Hunter config from environment variable first + ai_hunter_config_str = os.getenv('AI_HUNTER_CONFIG') + if ai_hunter_config_str: + ai_config = json.loads(ai_hunter_config_str) + else: + # Fallback to config attribute + ai_config = getattr(self.config, 'ai_hunter_config', {}) + except (json.JSONDecodeError, AttributeError): + ai_config = {} + + if isinstance(ai_config, dict): + max_retries = ai_config.get('retry_attempts', 3) + max_duplicate_retries = ai_config.get('retry_attempts', 6) # Use same setting for duplicate retries + else: + max_retries = 3 + max_duplicate_retries = 6 + + duplicate_retry_count = 0 + timeout_retry_count = 0 + max_timeout_retries = 2 + history_purged = False + + original_max_tokens = self.config.MAX_OUTPUT_TOKENS + original_temp = self.config.TEMP + original_user_prompt = msgs[-1]["content"] + + chunk_timeout = None + if self.config.RETRY_TIMEOUT: + chunk_timeout = self.config.CHUNK_TIMEOUT + + result = None + finish_reason = None + + while True: + if self.check_stop(): + return None, None + + try: + current_max_tokens = self.config.MAX_OUTPUT_TOKENS + current_temp = self.config.TEMP + + total_tokens = sum(self.chapter_splitter.count_tokens(m["content"]) for m in msgs) + # Determine file reference + if c.get('is_chunk', False): + file_ref = f"Section_{c['num']}" + else: + # Check if this is a text file - need to access from self + is_text_source = self.is_text_file or c.get('filename', '').endswith('.txt') + terminology = "Section" if is_text_source else "Chapter" + file_ref = c.get('original_basename', f'{terminology}_{c["num"]}') + + print(f"[DEBUG] Chunk {chunk_idx}/{total_chunks} tokens = {total_tokens:,} / {self.get_token_budget_str()} [File: {file_ref}]") + + self.client.context = 'translation' + + # Generate filename for chunks + if chunk_idx and total_chunks > 1: + # This is a chunk - use chunk naming format + fname = 
f"response_{c['num']:03d}_chunk_{chunk_idx}.html" + else: + # Not a chunk - use regular naming + fname = FileUtilities.create_chapter_filename(c, c.get('actual_chapter_num', c['num'])) + + # Set output filename BEFORE the API call + if hasattr(self.client, 'set_output_filename'): + self.client.set_output_filename(fname) + + # Track the filename so truncation logs know which file this is + if hasattr(self.client, '_current_output_file'): + self.client._current_output_file = fname + + # Generate unique request ID for this chunk + #request_id = f"{c['num']:03d}_chunk{chunk_idx}_{uuid.uuid4().hex[:8]}" + + result, finish_reason = send_with_interrupt( + msgs, self.client, current_temp, current_max_tokens, + self.check_stop, chunk_timeout + ) + # Enhanced mode workflow: + # 1. Original HTML -> html2text -> Markdown/plain text (during extraction) + # 2. Markdown sent to translation API (better for translation quality) + # 3. Translated markdown -> HTML conversion (here) + if result and c.get("enhanced_extraction", False): + print(f"🔄 Converting translated markdown back to HTML...") + result = convert_enhanced_text_to_html(result, c) + retry_needed = False + retry_reason = "" + is_duplicate_retry = False + + # ENHANCED: Force re-read environment variable for latest setting + retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1" + + # Debug logging to verify the toggle state + #print(f" DEBUG: finish_reason='{finish_reason}', RETRY_TRUNCATED={retry_truncated_enabled}, config.RETRY_TRUNCATED={self.config.RETRY_TRUNCATED}") + #print(f" DEBUG: Current tokens={self.config.MAX_OUTPUT_TOKENS}, Min retry tokens={self.config.MAX_RETRY_TOKENS}, retry_count={retry_count}") + + if finish_reason == "length" and (retry_truncated_enabled or self.config.RETRY_TRUNCATED): + if retry_count < max_retries: + # For truncated responses, ensure we never go below the minimum retry tokens + proposed_limit = self.config.MAX_OUTPUT_TOKENS * 2 + + # Always enforce minimum - never retry with tokens below the constraint + new_token_limit = max(proposed_limit, self.config.MAX_RETRY_TOKENS) + + if new_token_limit != self.config.MAX_OUTPUT_TOKENS: + retry_needed = True + retry_reason = "truncated output" + old_limit = self.config.MAX_OUTPUT_TOKENS + self.config.MAX_OUTPUT_TOKENS = new_token_limit + retry_count += 1 + + if old_limit < self.config.MAX_RETRY_TOKENS: + print(f" 🔄 TRUNCATION RETRY: Boosting tokens {old_limit} → {new_token_limit} (enforcing minimum: {self.config.MAX_RETRY_TOKENS})") + else: + print(f" 🔄 TRUNCATION RETRY: Doubling tokens {old_limit} → {new_token_limit} (above minimum: {self.config.MAX_RETRY_TOKENS})") + else: + print(f" ⚠️ TRUNCATION DETECTED: Token adjustment not needed - already at maximum {self.config.MAX_OUTPUT_TOKENS}") + else: + print(f" ⚠️ TRUNCATION DETECTED: Max retries ({max_retries}) reached - accepting truncated response") + elif finish_reason == "length" and not (retry_truncated_enabled or self.config.RETRY_TRUNCATED): + print(f" ⏭️ TRUNCATION DETECTED: Auto-retry is DISABLED - accepting truncated response") + elif finish_reason == "length": + print(f" ⚠️ TRUNCATION DETECTED: Unexpected condition - check logic") + + if not retry_needed: + # Force re-read the environment variable to ensure we have current setting + duplicate_enabled = os.getenv("RETRY_DUPLICATE_BODIES", "0") == "1" + + if duplicate_enabled and duplicate_retry_count < max_duplicate_retries: + idx = c.get('__index', 0) + prog = c.get('__progress', {}) + print(f" 🔍 Checking for duplicate content...") + # Get actual 
chapter number for duplicate detection + actual_num = c.get('actual_chapter_num', c.get('num', idx + 1)) + is_duplicate, similarity = self.check_duplicate_content(result, idx, prog, self.out_dir, actual_num) + + if is_duplicate: + retry_needed = True + is_duplicate_retry = True + retry_reason = f"duplicate content (similarity: {similarity}%)" + duplicate_retry_count += 1 + + # Check if temperature change is disabled + disable_temp_change = ai_config.get('disable_temperature_change', False) if isinstance(ai_config, dict) else False + + if duplicate_retry_count >= 3 and not history_purged: + print(f" 🧹 Clearing history after 3 attempts...") + if 'history_manager' in c: + c['history_manager'].save_history([]) + history_purged = True + if not disable_temp_change: + self.config.TEMP = original_temp + else: + print(f" 🌡️ Temperature change disabled - keeping current temp: {self.config.TEMP}") + + elif duplicate_retry_count == 1: + if disable_temp_change: + print(f" 🔄 First duplicate retry - temperature change disabled") + else: + print(f" 🔄 First duplicate retry - same temperature") + + elif history_purged: + if not disable_temp_change: + attempts_since_purge = duplicate_retry_count - 3 + self.config.TEMP = min(original_temp + (0.1 * attempts_since_purge), 1.0) + print(f" 🌡️ Post-purge temp: {self.config.TEMP}") + else: + print(f" 🌡️ Temperature change disabled - keeping temp: {self.config.TEMP}") + + else: + if not disable_temp_change: + self.config.TEMP = min(original_temp + (0.1 * (duplicate_retry_count - 1)), 1.0) + print(f" 🌡️ Gradual temp increase: {self.config.TEMP}") + else: + print(f" 🌡️ Temperature change disabled - keeping temp: {self.config.TEMP}") + + if duplicate_retry_count == 1: + user_prompt = f"[RETRY] Chapter {c['num']}: Ensure unique translation.\n{chunk_html}" + elif duplicate_retry_count <= 3: + user_prompt = f"[ATTEMPT {duplicate_retry_count}] Translate uniquely:\n{chunk_html}" + else: + user_prompt = f"Chapter {c['num']}:\n{chunk_html}" + + msgs[-1] = {"role": "user", "content": user_prompt} + elif not duplicate_enabled: + print(f" ⏭️ Duplicate detection is DISABLED - skipping check") + + if retry_needed: + if is_duplicate_retry: + print(f" 🔄 Duplicate retry {duplicate_retry_count}/{max_duplicate_retries}") + else: + print(f" 🔄 Retry {retry_count}/{max_retries}: {retry_reason}") + + time.sleep(2) + continue + + break + + except UnifiedClientError as e: + error_msg = str(e) + + if "stopped by user" in error_msg: + print("❌ Translation stopped by user during API call") + return None, None + + if "took" in error_msg and "timeout:" in error_msg: + if timeout_retry_count < max_timeout_retries: + timeout_retry_count += 1 + print(f" ⏱️ Chunk took too long, retry {timeout_retry_count}/{max_timeout_retries}") + print(f" 🔄 Retrying") + time.sleep(2) + continue + else: + print(f" ❌ Max timeout retries reached") + raise UnifiedClientError("Translation failed after timeout retries") + + elif "timed out" in error_msg and "timeout:" not in error_msg: + print(f"⚠️ {error_msg}, retrying...") + time.sleep(5) + continue + + elif getattr(e, "error_type", None) == "rate_limit" or getattr(e, "http_status", None) == 429: + # Rate limit errors - clean handling without traceback + print("⚠️ Rate limited, sleeping 60s…") + for i in range(60): + if self.check_stop(): + print("❌ Translation stopped during rate limit wait") + return None, None + time.sleep(1) + continue + + else: + # For unexpected errors, show the error message but suppress traceback in most cases + if getattr(e, "error_type", None) 
in ["api_error", "validation", "prohibited_content"]: + print(f"❌ API Error: {error_msg}") + raise UnifiedClientError(f"API Error: {error_msg}") + else: + raise + + except Exception as e: + print(f"❌ Unexpected error during API call: {e}") + raise + + self.config.MAX_OUTPUT_TOKENS = original_max_tokens + self.config.TEMP = original_temp + + if retry_count > 0 or duplicate_retry_count > 0 or timeout_retry_count > 0: + if duplicate_retry_count > 0: + print(f" 🔄 Restored original temperature: {self.config.TEMP} (after {duplicate_retry_count} duplicate retries)") + elif timeout_retry_count > 0: + print(f" 🔄 Restored original settings after {timeout_retry_count} timeout retries") + elif retry_count > 0: + print(f" 🔄 Restored original settings after {retry_count} retries") + + if duplicate_retry_count >= max_duplicate_retries: + print(f" ⚠️ WARNING: Duplicate content issue persists after {max_duplicate_retries} attempts") + + return result, finish_reason + + def get_token_budget_str(self): + """Get token budget as string""" + _tok_env = os.getenv("MAX_INPUT_TOKENS", "1000000").strip() + max_tokens_limit, budget_str = parse_token_limit(_tok_env) + return budget_str + +# ===================================================== +# BATCH TRANSLATION PROCESSOR +# ===================================================== +class BatchTranslationProcessor: + """Handles batch/parallel translation processing""" + + def __init__(self, config, client, base_msg, out_dir, progress_lock, + save_progress_fn, update_progress_fn, check_stop_fn, + image_translator=None, is_text_file=False): + self.config = config + self.client = client + self.base_msg = base_msg + self.out_dir = out_dir + self.progress_lock = progress_lock + self.save_progress_fn = save_progress_fn + self.update_progress_fn = update_progress_fn + self.check_stop_fn = check_stop_fn + self.image_translator = image_translator + self.chapters_completed = 0 + self.chunks_completed = 0 + self.is_text_file = is_text_file + + # Optionally log multi-key status + if hasattr(self.client, 'use_multi_keys') and self.client.use_multi_keys: + stats = self.client.get_stats() + print(f"🔑 Batch processor using multi-key mode: {stats.get('total_keys', 0)} keys") + + def process_single_chapter(self, chapter_data): + """Process a single chapter (runs in thread)""" + # APPLY INTERRUPTIBLE THREADING DELAY FIRST + thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) + if thread_delay > 0: + # Check if we need to wait (same logic as unified_api_client) + if hasattr(self.client, '_thread_submission_lock') and hasattr(self.client, '_last_thread_submission_time'): + with self.client._thread_submission_lock: + current_time = time.time() + time_since_last = current_time - self.client._last_thread_submission_time + + if time_since_last < thread_delay: + sleep_time = thread_delay - time_since_last + thread_name = threading.current_thread().name + + # PRINT BEFORE THE DELAY STARTS + idx, chapter = chapter_data # Extract chapter info for better logging + print(f"🧵 [{thread_name}] Applying thread delay: {sleep_time:.1f}s for Chapter {idx+1}") + + # Interruptible sleep - check stop flag every 0.1 seconds + elapsed = 0 + check_interval = 0.1 + while elapsed < sleep_time: + if self.check_stop_fn(): + print(f"🛑 Threading delay interrupted by stop flag") + raise Exception("Translation stopped by user during threading delay") + + sleep_chunk = min(check_interval, sleep_time - elapsed) + time.sleep(sleep_chunk) + elapsed += sleep_chunk + + 
self.client._last_thread_submission_time = time.time() + if not hasattr(self.client, '_thread_submission_count'): + self.client._thread_submission_count = 0 + self.client._thread_submission_count += 1 + + idx, chapter = chapter_data + chap_num = chapter["num"] + + # Use the pre-calculated actual_chapter_num from the main loop + actual_num = chapter.get('actual_chapter_num') + + # Fallback if not set (common in batch mode where first pass might be skipped) + if actual_num is None: + # Try to extract it using the same logic as non-batch mode + raw_num = FileUtilities.extract_actual_chapter_number(chapter, patterns=None, config=self.config) + + # Apply offset if configured + offset = self.config.CHAPTER_NUMBER_OFFSET if hasattr(self.config, 'CHAPTER_NUMBER_OFFSET') else 0 + raw_num += offset + + # Check if zero detection is disabled + if hasattr(self.config, 'DISABLE_ZERO_DETECTION') and self.config.DISABLE_ZERO_DETECTION: + actual_num = raw_num + elif hasattr(self.config, '_uses_zero_based') and self.config._uses_zero_based: + # This is a 0-based novel, adjust the number + actual_num = raw_num + 1 + else: + # Default to raw number (1-based or unknown) + actual_num = raw_num + + print(f" 📖 Extracted actual chapter number: {actual_num} (from raw: {raw_num})") + + try: + # Check if this is from a text file + ai_features = None + is_text_source = self.is_text_file or chapter.get('filename', '').endswith('.txt') or chapter.get('is_chunk', False) + terminology = "Section" if is_text_source else "Chapter" + print(f"🔄 Starting #{idx+1} (Internal: {terminology} {chap_num}, Actual: {terminology} {actual_num}) (thread: {threading.current_thread().name}) [File: {chapter.get('original_basename', f'{terminology}_{chap_num}')}]") + + content_hash = chapter.get("content_hash") or ContentProcessor.get_content_hash(chapter["body"]) + with self.progress_lock: + self.update_progress_fn(idx, actual_num, content_hash, None, status="in_progress") + self.save_progress_fn() + + chapter_body = chapter["body"] + if chapter.get('has_images') and self.image_translator and self.config.ENABLE_IMAGE_TRANSLATION: + print(f"🖼️ Processing images for Chapter {actual_num}...") + self.image_translator.set_current_chapter(actual_num) + chapter_body, image_translations = process_chapter_images( + chapter_body, + actual_num, + self.image_translator, + self.check_stop_fn + ) + if image_translations: + # Create a copy of the processed body + from bs4 import BeautifulSoup + c = chapter + soup_for_text = BeautifulSoup(c["body"], 'html.parser') + + # Remove all translated content + for trans_div in soup_for_text.find_all('div', class_='translated-text-only'): + trans_div.decompose() + + # Use this cleaned version for text translation + text_to_translate = str(soup_for_text) + final_body_with_images = c["body"] + else: + # 'c' is only bound in the branch above, so refer to chapter directly here + text_to_translate = chapter["body"] + image_translations = {} + print(f"✅ Processed {len(image_translations)} images for Chapter {actual_num}") + + # (text_to_translate is prepared above, but the request below still sends chapter_body) + chapter_msgs = self.base_msg + [{"role": "user", "content": chapter_body}] + + # Generate filename before API call + fname = FileUtilities.create_chapter_filename(chapter, actual_num) + self.client.set_output_filename(fname) + + if hasattr(self.client, '_current_output_file'): + self.client._current_output_file = fname + + print(f"📤 Sending Chapter {actual_num} to API...") + result, finish_reason = send_with_interrupt( + chapter_msgs, self.client, self.config.TEMP, + self.config.MAX_OUTPUT_TOKENS, self.check_stop_fn + )
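+
+# --- Illustrative sketch (not part of the original module) ---
+# The image branch above strips already-translated
+# <div class="translated-text-only"> blocks before the text pass (so they are
+# not translated twice) and merges them back after saving. The split step alone:
+from bs4 import BeautifulSoup
+
+def split_out_image_translations(html):
+    soup = BeautifulSoup(html, 'html.parser')
+    divs = [d.extract() for d in soup.find_all('div', class_='translated-text-only')]
+    return str(soup), divs  # (text-only HTML, detached translation blocks)
+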
print(f"📥 Received Chapter {actual_num} response, finish_reason: {finish_reason}") + + # Enhanced mode workflow (same as non-batch): + # 1. Original HTML -> html2text -> Markdown/plain text (during extraction) + # 2. Markdown sent to translation API (better for translation quality) + # 3. Translated markdown -> HTML conversion (here) + if result and chapter.get("enhanced_extraction", False): + print(f"🔄 Converting translated markdown back to HTML...") + result = convert_enhanced_text_to_html(result, chapter) + + if finish_reason in ["length", "max_tokens"]: + print(f"⚠️ Chapter {actual_num} response was TRUNCATED!") + + if self.config.REMOVE_AI_ARTIFACTS: + result = ContentProcessor.clean_ai_artifacts(result, True) + + result = ContentProcessor.clean_memory_artifacts(result) + + cleaned = re.sub(r"^```(?:html)?\s*\n?", "", result, count=1, flags=re.MULTILINE) + cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE) + cleaned = ContentProcessor.clean_ai_artifacts(cleaned, remove_artifacts=self.config.REMOVE_AI_ARTIFACTS) + + fname = FileUtilities.create_chapter_filename(chapter, actual_num) + + if self.is_text_file: + # For text files, save as plain text + fname_txt = fname.replace('.html', '.txt') if fname.endswith('.html') else fname + + # Extract text from HTML + from bs4 import BeautifulSoup + soup = BeautifulSoup(cleaned, 'html.parser') + text_content = soup.get_text(strip=True) + + # Merge image translations back with text translation + if 'final_body_with_images' in locals() and image_translations: + # Parse both versions + soup_with_images = BeautifulSoup(final_body_with_images, 'html.parser') + soup_with_text = BeautifulSoup(cleaned, 'html.parser') + + # Get the translated text content (without images) + body_content = soup_with_text.body + + # Add image translations to the translated content + for trans_div in soup_with_images.find_all('div', class_='translated-text-only'): + body_content.insert(0, trans_div) + + final_html = str(soup_with_text) + cleaned = final_html + + with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: + f.write(cleaned) + + # Update with .txt filename + with self.progress_lock: + self.update_progress_fn(idx, actual_num, content_hash, fname_txt, status="completed", ai_features=ai_features) + self.save_progress_fn() + else: + # Original code for EPUB files + with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: + f.write(cleaned) + + print(f"💾 Saved Chapter {actual_num}: {fname} ({len(cleaned)} chars)") + + # ai_features was initialized to None at the top of the try block; batch mode leaves it unset + + # Extract and save AI features for future duplicate detection + if (self.config.RETRY_DUPLICATE_BODIES and + hasattr(self.config, 'DUPLICATE_DETECTION_MODE') and + self.config.DUPLICATE_DETECTION_MODE in ['ai-hunter', 'cascading']): + try: + # Extract features from the translated content + cleaned_text = re.sub(r'<[^>]+>', '', cleaned).strip() + # Note: self.translator doesn't exist, so we can't extract features here + # The features will need to be extracted during regular processing + print(f" ⚠️ AI features extraction not available in batch mode") + except Exception as e: + print(f" ⚠️ Failed to extract AI features: {e}") + + with self.progress_lock: + # Check for QA failures with comprehensive detection + if is_qa_failed_response(cleaned): + chapter_status = "qa_failed" + failure_reason = get_failure_reason(cleaned) + print(f"⚠️ Batch: Chapter {actual_num} marked as qa_failed: {failure_reason}") + # Update progress to qa_failed status + self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features) + self.save_progress_fn() + # DO NOT increment chapters_completed for qa_failed + # Return False to indicate failure + return False, actual_num + else: + chapter_status = "completed" + # Update progress to completed status + self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features) + self.save_progress_fn() + # Only increment chapters_completed for successful chapters + self.chapters_completed += 1 + self.chunks_completed += 1 + + print(f"✅ Chapter {actual_num} completed successfully") + return True, actual_num + + except Exception as e: + print(f"❌ Chapter {actual_num} failed: {e}") + with self.progress_lock: + # content_hash may not be bound yet if the failure happened before hashing + self.update_progress_fn(idx, actual_num, chapter.get("content_hash"), None, status="failed") + self.save_progress_fn() + return False, actual_num
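+
+# --- Illustrative sketch (not part of the original module; is_qa_failed_response
+# is defined elsewhere in this file) ---
+# process_single_chapter routes every result to exactly one status: "qa_failed"
+# chapters are recorded but never counted as completed, so they can be retried:
+def route_batch_result(cleaned_html, qa_check):
+    """Return (status, counts_toward_completion) for a translated chapter."""
+    if qa_check(cleaned_html):
+        return "qa_failed", False  # recorded for retry, not counted as done
+    return "completed", True
+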
# ===================================================== + # GLOSSARY MANAGER - TRUE CSV FORMAT WITH FUZZY MATCHING + # ===================================================== + + class GlossaryManager: + """Unified glossary management with true CSV format, fuzzy matching, and parallel processing""" + + # Class-level shared lock for API submission timing + _api_submission_lock = threading.Lock() + _last_api_submission_time = 0 + + def __init__(self): + self.pattern_manager = PatternManager() + self._results_lock = threading.Lock() # Thread lock for collecting results + self._file_write_lock = threading.Lock() # Thread lock for file operations + + def _atomic_write_file(self, filepath, content, encoding='utf-8'): + """Atomically write to a file to prevent corruption from concurrent writes""" + + # Create temp file in same directory to ensure same filesystem + dir_path = os.path.dirname(filepath) + + with self._file_write_lock: + try: + # Write to temporary file first + with tempfile.NamedTemporaryFile(mode='w', encoding=encoding, + dir=dir_path, delete=False) as tmp_file: + tmp_file.write(content) + tmp_path = tmp_file.name + + # os.replace overwrites the target atomically on both Windows and POSIX + # (plain os.rename raises on Windows when the target exists) + os.replace(tmp_path, filepath) + + return True + + except Exception as e: + print(f"⚠️ Atomic write failed: {e}") + # Cleanup temp file if it exists + if 'tmp_path' in locals() and os.path.exists(tmp_path): + try: + os.remove(tmp_path) + except Exception: + pass + + # Fallback to direct write with lock + try: + with open(filepath, 'w', encoding=encoding) as f: + f.write(content) + return True + except Exception as e2: + print(f"⚠️ Fallback write also failed: {e2}") + return False + + def save_glossary(self, output_dir, chapters, instructions, language="korean"): + """Targeted glossary generator with true CSV format output and parallel processing""" + print("📑 Targeted Glossary Generator v6.0 (CSV Format + Parallel)") + + # Ensure output directory exists + try: + os.makedirs(output_dir, exist_ok=True) + except Exception as _e: + print(f"⚠️ Could not ensure output directory exists: {output_dir} ({_e})") + # Check stop flag at start + if is_stop_requested(): + print("📑 ❌ Glossary generation stopped by user") + return {} + + # Check if glossary already exists; if so, we'll MERGE it later (do not return early) + glossary_path = os.path.join(output_dir, "glossary.csv") + existing_glossary_content = None + if
os.path.exists(glossary_path): + print(f"📑 Existing glossary detected (will merge): {glossary_path}") + try: + with open(glossary_path, 'r', encoding='utf-8') as f: + existing_glossary_content = f.read() + except Exception as e: + print(f"⚠️ Could not read existing glossary: {e}") + + # Rest of the method continues as before... + print("📑 Extracting names and terms with configurable options") + + # Check stop flag before processing + if is_stop_requested(): + print("📑 ❌ Glossary generation stopped by user") + return {} + + # Check for manual glossary first (CSV only) + manual_glossary_path = os.getenv("MANUAL_GLOSSARY") + existing_glossary = None + if manual_glossary_path and os.path.exists(manual_glossary_path): + print(f"📑 Manual glossary detected: {os.path.basename(manual_glossary_path)}") + try: + with open(manual_glossary_path, 'r', encoding='utf-8') as f: + content = f.read() + # Treat as CSV text and stage it for merge; also copy to output for visibility + target_path = os.path.join(output_dir, "glossary.csv") + with open(target_path, 'w', encoding='utf-8') as f: + f.write(content) + print(f"📑 ✅ Manual CSV glossary copied to: {target_path}") + existing_glossary = content + except Exception as e: + print(f"⚠️ Could not copy manual glossary: {e}") + print(f"📑 Proceeding with automatic generation...") + + # Check for existing glossary from manual extraction + glossary_folder_path = os.path.join(output_dir, "Glossary") + # existing_glossary may already be set by MANUAL_GLOSSARY above + + if os.path.exists(glossary_folder_path): + for file in os.listdir(glossary_folder_path): + if file.endswith("_glossary.json"): + existing_path = os.path.join(glossary_folder_path, file) + try: + with open(existing_path, 'r', encoding='utf-8') as f: + existing_content = f.read() + existing_glossary = existing_content + print(f"📑 Found existing glossary from manual extraction: {file}") + break + except Exception as e: + print(f"⚠️ Could not load existing glossary: {e}") + + # Get configuration from environment variables + min_frequency = int(os.getenv("GLOSSARY_MIN_FREQUENCY", "2")) + max_names = int(os.getenv("GLOSSARY_MAX_NAMES", "50")) + max_titles = int(os.getenv("GLOSSARY_MAX_TITLES", "30")) + batch_size = int(os.getenv("GLOSSARY_BATCH_SIZE", "50")) + strip_honorifics = os.getenv("GLOSSARY_STRIP_HONORIFICS", "1") == "1" + fuzzy_threshold = float(os.getenv("GLOSSARY_FUZZY_THRESHOLD", "0.90")) + max_text_size = int(os.getenv("GLOSSARY_MAX_TEXT_SIZE", "50000")) + + print(f"📑 Settings: Min frequency: {min_frequency}, Max names: {max_names}, Max titles: {max_titles}") + print(f"📑 Strip honorifics: {'✅ Yes' if strip_honorifics else '❌ No'}") + print(f"📑 Fuzzy matching threshold: {fuzzy_threshold}") + + # Get custom prompt from environment + custom_prompt = os.getenv("AUTO_GLOSSARY_PROMPT", "").strip() + + def clean_html(html_text): + """Remove HTML tags to get clean text""" + soup = BeautifulSoup(html_text, 'html.parser') + return soup.get_text() + + # Check stop before processing chapters + if is_stop_requested(): + print("📑 ❌ Glossary generation stopped by user") + return {} + + # Get chapter split threshold and filter mode + chapter_split_threshold = int(os.getenv("GLOSSARY_CHAPTER_SPLIT_THRESHOLD", "100000")) + filter_mode = os.getenv("GLOSSARY_FILTER_MODE", "all") # all, only_with_honorifics, only_without_honorifics + + # Check if parallel extraction is enabled for automatic glossary + extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) + batch_translation = os.getenv("BATCH_TRANSLATION", 
"0") == "1" + api_batch_size = int(os.getenv("BATCH_SIZE", "5")) + + # Log the settings + print(f"📑 Filter mode: {filter_mode}") + if extraction_workers > 1: + print(f"📑 Parallel extraction enabled: {extraction_workers} workers") + if batch_translation: + print(f"📑 Batch API calls enabled: {api_batch_size} chunks per batch") + + all_text = ' '.join(clean_html(chapter["body"]) for chapter in chapters) + print(f"📑 Processing {len(all_text):,} characters of text") + + # Apply smart filtering FIRST to check actual size needed + use_smart_filter = os.getenv("GLOSSARY_USE_SMART_FILTER", "1") == "1" + effective_text_size = len(all_text) + + filtered_text_cache = None + if use_smart_filter and custom_prompt: # Only apply for AI extraction + print(f"📑 Smart filtering enabled - checking effective text size after filtering...") + # Perform filtering ONCE and reuse for chunking + filtered_sample, _ = self._filter_text_for_glossary(all_text, min_frequency) + filtered_text_cache = filtered_sample + effective_text_size = len(filtered_sample) + print(f"📑 Effective text size after filtering: {effective_text_size:,} chars (from {len(all_text):,})") + + # Check if we need to split into chunks based on EFFECTIVE size after filtering + if chapter_split_threshold > 0 and effective_text_size > chapter_split_threshold: + print(f"📑 Effective text exceeds {chapter_split_threshold:,} chars, will process in chunks...") + + # If using smart filter, we need to split the FILTERED text, not raw text + if use_smart_filter and custom_prompt: + # Split the filtered text into chunks (reuse cached filtered text) + filtered_text = filtered_text_cache if filtered_text_cache is not None else self._filter_text_for_glossary(all_text, min_frequency)[0] + chunks_to_process = [] + + # Split filtered text into chunks of appropriate size + chunk_size = chapter_split_threshold + for i in range(0, len(filtered_text), chunk_size): + chunk_text = filtered_text[i:i + chunk_size] + chunks_to_process.append((len(chunks_to_process) + 1, chunk_text)) + + print(f"📑 Split filtered text into {len(chunks_to_process)} chunks") + all_glossary_entries = [] + else: + # Original logic for unfiltered text + all_glossary_entries = [] + chunk_size = 0 + chunk_chapters = [] + chunks_to_process = [] + + for idx, chapter in enumerate(chapters): + if is_stop_requested(): + print("📑 ❌ Glossary generation stopped by user") + return all_glossary_entries + + chapter_text = clean_html(chapter["body"]) + chunk_size += len(chapter_text) + chunk_chapters.append(chapter) + + # Process chunk when it reaches threshold or last chapter + if chunk_size >= chapter_split_threshold or idx == len(chapters) - 1: + chunk_text = ' '.join(clean_html(ch["body"]) for ch in chunk_chapters) + chunks_to_process.append((len(chunks_to_process) + 1, chunk_text)) + + # Reset for next chunk + chunk_size = 0 + chunk_chapters = [] + + print(f"📑 Split into {len(chunks_to_process)} chunks for processing") + + # Batch toggle decides concurrency: ON => parallel API calls; OFF => strict sequential + if batch_translation and custom_prompt and len(chunks_to_process) > 1: + print(f"📑 Processing chunks in batch mode with {api_batch_size} chunks per batch...") + # Set fast mode for batch processing + os.environ["GLOSSARY_SKIP_ALL_VALIDATION"] = "1" + + # Use batch API calls for AI extraction + all_csv_lines = self._process_chunks_batch_api( + chunks_to_process, custom_prompt, language, + min_frequency, max_names, max_titles, + output_dir, strip_honorifics, fuzzy_threshold, + filter_mode, api_batch_size, 
extraction_workers + ) + + # Reset validation mode + os.environ["GLOSSARY_SKIP_ALL_VALIDATION"] = "0" + + print(f"📑 All chunks completed. Aggregated raw lines: {len(all_csv_lines)}") + + # Process all collected entries at once (even if empty) + # Add header so downstream steps can work uniformly + all_csv_lines.insert(0, "type,raw_name,translated_name") + + # Merge with any on-disk glossary first (to avoid overwriting user edits) + on_disk_path = os.path.join(output_dir, "glossary.csv") + if os.path.exists(on_disk_path): + try: + with open(on_disk_path, 'r', encoding='utf-8') as f: + on_disk_content = f.read() + all_csv_lines = self._merge_csv_entries(all_csv_lines, on_disk_content, strip_honorifics, language) + print("📑 Merged with existing on-disk glossary") + except Exception as e: + print(f"⚠️ Failed to merge with existing on-disk glossary: {e}") + + # Apply filter mode if needed + if filter_mode == "only_with_honorifics": + filtered = [all_csv_lines[0]] # Keep header + for line in all_csv_lines[1:]: + parts = line.split(',', 2) + if len(parts) >= 3 and parts[0] == "character": + filtered.append(line) + all_csv_lines = filtered + print(f"📑 Filter applied: {len(all_csv_lines)-1} character entries with honorifics kept") + + # Apply fuzzy deduplication (deferred until after all chunks) + try: + print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...") + all_csv_lines = self._deduplicate_glossary_with_fuzzy(all_csv_lines, fuzzy_threshold) + except Exception as e: + print(f"⚠️ Deduplication error: {e} — continuing without dedup") + + # Sort by type and name + print(f"📑 Sorting glossary by type and name...") + header = all_csv_lines[0] + entries = all_csv_lines[1:] + if entries: + entries.sort(key=lambda x: (0 if x.startswith('character,') else 1, x.split(',')[1].lower() if ',' in x else x.lower())) + all_csv_lines = [header] + entries + + # Save + # Check format preference + use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' + + if not use_legacy_format: + # Convert to token-efficient format + all_csv_lines = self._convert_to_token_efficient_format(all_csv_lines) + + # Final sanitize to prevent stray headers + all_csv_lines = self._sanitize_final_glossary_lines(all_csv_lines, use_legacy_format) + + # Save + csv_content = '\n'.join(all_csv_lines) + glossary_path = os.path.join(output_dir, "glossary.csv") + self._atomic_write_file(glossary_path, csv_content) + + # Verify file exists; fallback direct write if needed + if not os.path.exists(glossary_path): + try: + with open(glossary_path, 'w', encoding='utf-8') as f: + f.write(csv_content) + print("📑 Fallback write succeeded for glossary.csv") + except Exception as e: + print(f"❌ Failed to write glossary.csv: {e}") + + print(f"\n📑 ✅ GLOSSARY SAVED!") + print(f"📑 ✅ AI GLOSSARY SAVED!") + c_count, t_count, total = self._count_glossary_entries(all_csv_lines, use_legacy_format) + print(f"📑 Character entries: {c_count}") + print(f"📑 Term entries: {t_count}") + print(f"📑 Total entries: {total}") + + return self._parse_csv_to_dict(csv_content) + else: + # Strict sequential processing (one API call at a time) + _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") + _prev_filtered = os.getenv("_CHUNK_ALREADY_FILTERED") + _prev_force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER") + os.environ["GLOSSARY_DEFER_SAVE"] = "1" + # Tell the extractor each chunk is already filtered to avoid re-running smart filter per chunk + os.environ["_CHUNK_ALREADY_FILTERED"] = "1" + os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = "1" + try: + for
chunk_idx, chunk_text in chunks_to_process: + if is_stop_requested(): + break + + print(f"📑 Processing chunk {chunk_idx}/{len(chunks_to_process)} ({len(chunk_text):,} chars)...") + + if custom_prompt: + chunk_glossary = self._extract_with_custom_prompt( + custom_prompt, chunk_text, language, + min_frequency, max_names, max_titles, + None, output_dir, # Don't pass existing glossary to chunks + strip_honorifics, fuzzy_threshold, filter_mode + ) + else: + chunk_glossary = self._extract_with_patterns( + chunk_text, language, min_frequency, + max_names, max_titles, batch_size, + None, output_dir, # Don't pass existing glossary to chunks + strip_honorifics, fuzzy_threshold, filter_mode + ) + + # Normalize to CSV lines and aggregate + chunk_lines = [] + if isinstance(chunk_glossary, list): + for line in chunk_glossary: + if line and not line.startswith('type,'): + all_glossary_entries.append(line) + chunk_lines.append(line) + else: + for raw_name, translated_name in chunk_glossary.items(): + entry_type = "character" if self._has_honorific(raw_name) else "term" + line = f"{entry_type},{raw_name},{translated_name}" + all_glossary_entries.append(line) + chunk_lines.append(line) + + # Incremental update + try: + self._incremental_update_glossary(output_dir, chunk_lines, strip_honorifics, language, filter_mode) + print(f"📑 Incremental write: +{len(chunk_lines)} entries") + except Exception as e2: + print(f"⚠️ Incremental write failed: {e2}") + finally: + if _prev_defer is None: + if "GLOSSARY_DEFER_SAVE" in os.environ: + del os.environ["GLOSSARY_DEFER_SAVE"] + else: + os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer + if _prev_filtered is None: + os.environ.pop("_CHUNK_ALREADY_FILTERED", None) + else: + os.environ["_CHUNK_ALREADY_FILTERED"] = _prev_filtered + if _prev_force_disable is None: + os.environ.pop("GLOSSARY_FORCE_DISABLE_SMART_FILTER", None) + else: + os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = _prev_force_disable + + # Build CSV from aggregated entries + csv_lines = ["type,raw_name,translated_name"] + all_glossary_entries + + # Merge with any provided existing glossary AND on-disk glossary to avoid overwriting + on_disk_path = os.path.join(output_dir, "glossary.csv") + merge_sources = [] + if existing_glossary: + merge_sources.append(existing_glossary) + if os.path.exists(on_disk_path): + try: + with open(on_disk_path, 'r', encoding='utf-8') as f: + merge_sources.append(f.read()) + print("📑 Found existing on-disk glossary to merge") + except Exception as e: + print(f"⚠️ Failed to read on-disk glossary for merging: {e}") + # Also merge the main on-disk glossary if it was present at start + if existing_glossary_content: + csv_lines = self._merge_csv_entries(csv_lines, existing_glossary_content, strip_honorifics, language) + for src in merge_sources: + csv_lines = self._merge_csv_entries(csv_lines, src, strip_honorifics, language) + + # Apply filter mode to final results + csv_lines = self._filter_csv_by_mode(csv_lines, filter_mode) + + # Apply fuzzy deduplication (deferred until after all chunks) + print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...") + original_count = len(csv_lines) - 1 + csv_lines = self._deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold) + deduped_count = len(csv_lines) - 1 + if original_count > deduped_count: + print(f"📑 Removed {original_count - deduped_count} duplicate entries") + + # Sort by type and name + print(f"📑 Sorting glossary by type and name...") + header = csv_lines[0] + entries = csv_lines[1:] + entries.sort(key=lambda 
x: (0 if x.startswith('character,') else 1, x.split(',')[1].lower() if ',' in x else x.lower())) + csv_lines = [header] + entries + + # Token-efficient format if enabled + use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' + if not use_legacy_format: + csv_lines = self._convert_to_token_efficient_format(csv_lines) + + # Final sanitize to prevent stray headers and section titles at end + csv_lines = self._sanitize_final_glossary_lines(csv_lines, use_legacy_format) + + try: + # Save + csv_content = '\n'.join(csv_lines) + glossary_path = os.path.join(output_dir, "glossary.csv") + self._atomic_write_file(glossary_path, csv_content) + + # Verify file exists; fallback direct write if needed + if not os.path.exists(glossary_path): + try: + with open(glossary_path, 'w', encoding='utf-8') as f: + f.write(csv_content) + print("📑 Fallback write succeeded for glossary.csv") + except Exception as e: + print(f"❌ Failed to write glossary.csv: {e}") + finally: + print(f"\n📑 ✅ CHUNKED GLOSSARY SAVED!") + print(f"📑 ✅ AI GLOSSARY SAVED!") + print(f"📑 File: {glossary_path}") + c_count, t_count, total = self._count_glossary_entries(csv_lines, use_legacy_format) + print(f"📑 Character entries: {c_count}") + print(f"📑 Term entries: {t_count}") + print(f"📑 Total entries: {total}") + + return self._parse_csv_to_dict(csv_content) + + # Original single-text processing + if custom_prompt: + return self._extract_with_custom_prompt(custom_prompt, all_text, language, + min_frequency, max_names, max_titles, + existing_glossary, output_dir, + strip_honorifics, fuzzy_threshold, filter_mode) + else: + return self._extract_with_patterns(all_text, language, min_frequency, + max_names, max_titles, batch_size, + existing_glossary, output_dir, + strip_honorifics, fuzzy_threshold, filter_mode) + + def _convert_to_token_efficient_format(self, csv_lines): + """Convert CSV lines to token-efficient format with sections and asterisks""" + if len(csv_lines) <= 1: + return csv_lines + + header = csv_lines[0] + entries = csv_lines[1:] + + # Group by type (only from valid CSV lines) + import re as _re + grouped = {} + for line in entries: + if not line.strip(): + continue + # Only accept proper CSV rows: at least 3 fields and a sane type token + parts_full = [p.strip() for p in line.split(',')] + if len(parts_full) < 3: + continue + entry_type = parts_full[0].lower() + if not _re.match(r'^[a-z_]+$', entry_type): + continue + if entry_type not in grouped: + grouped[entry_type] = [] + grouped[entry_type].append(line) + + # Rebuild with token-efficient format + result = [] + result.append("Glossary: Characters, Terms, and Important Elements\n") + + # Process in order: character first, then term, then others + type_order = ['character', 'term'] + [t for t in grouped.keys() if t not in ['character', 'term']] + + for entry_type in type_order: + if entry_type not in grouped: +
continue + + entries = grouped[entry_type] + + # Add section header + section_name = entry_type.upper() + 'S' if not entry_type.upper().endswith('S') else entry_type.upper() + result.append(f"=== {section_name} ===") + + # Add entries in new format + for line in entries: + parts = [p.strip() for p in line.split(',')] + if len(parts) >= 3: + raw_name = parts[1] + translated_name = parts[2] + + # Format: * TranslatedName (RawName) + entry_line = f"* {translated_name} ({raw_name})" + + # Add gender if present and not Unknown + if len(parts) > 3 and parts[3] and parts[3] != 'Unknown': + entry_line += f" [{parts[3]}]" + + # Add any additional fields as description + if len(parts) > 4: + description = ', '.join(parts[4:]) + if description.strip(): + entry_line += f": {description}" + + result.append(entry_line) + + result.append("") # Blank line between sections + + return result + + def _count_glossary_entries(self, lines, use_legacy_format=False): + """Return (char_count, term_count, total_count) for either format.""" + if not lines: + return 0, 0, 0 + if use_legacy_format: + data = lines[1:] if lines and lines[0].lower().startswith('type,raw_name') else lines + char_count = sum(1 for ln in data if ln.startswith('character,')) + term_count = sum(1 for ln in data if ln.startswith('term,')) + total = sum(1 for ln in data if ln and ',' in ln) + return char_count, term_count, total + # token-efficient + current = None + char_count = term_count = total = 0 + for ln in lines: + s = ln.strip() + if s.startswith('=== ') and 'CHARACTER' in s.upper(): + current = 'character' + continue + if s.startswith('=== ') and 'TERM' in s.upper(): + current = 'term' + continue + if s.startswith('* '): + total += 1 + if current == 'character': + char_count += 1 + elif current == 'term': + term_count += 1 + return char_count, term_count, total + + def _sanitize_final_glossary_lines(self, lines, use_legacy_format=False): + """Remove stray CSV headers and normalize header placement before saving. + - In legacy CSV mode, ensure exactly one header at the very top. + - In token-efficient mode, remove any CSV header lines entirely. + """ + header_norm = "type,raw_name,translated_name" + if not lines: + return lines + + if use_legacy_format: + sanitized = [] + header_seen = False + for ln in lines: + txt = ln.strip() + if txt.lower().startswith("type,raw_name"): + if not header_seen: + sanitized.append(header_norm) + header_seen = True + # skip duplicates + else: + sanitized.append(ln) + # ensure header at top + if sanitized and not sanitized[0].strip().lower().startswith("type,raw_name"): + sanitized.insert(0, header_norm) + return sanitized + else: + # remove any CSV header lines anywhere and duplicate top headers/sections + cleaned = [] + glossary_header_seen = False + for i, ln in enumerate(lines): + txt = ln.strip() + low = txt.lower() + # Drop CSV headers + if low.startswith("type,raw_name"): + continue + # Keep only the first main glossary header + if low.startswith("glossary:"): + if glossary_header_seen: + continue + glossary_header_seen = True + cleaned.append(ln) + continue + # Remove bogus section like '=== GLOSSARY: ... 
===' + if low.startswith("=== glossary:"): + continue + cleaned.append(ln) + return cleaned + + def _process_chunks_batch_api(self, chunks_to_process, custom_prompt, language, + min_frequency, max_names, max_titles, + output_dir, strip_honorifics, fuzzy_threshold, + filter_mode, api_batch_size, extraction_workers): + """Process chunks using batch API calls for AI extraction with thread delay""" + + print(f"📑 Using batch API mode with {api_batch_size} chunks per batch") + + # Ensure we defer saving and heavy merging when processing chunks + _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") + os.environ["GLOSSARY_DEFER_SAVE"] = "1" + + # Get thread submission delay + thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) + if thread_delay > 0: + print(f"📑 Thread submission delay: {thread_delay}s between parallel calls") + + # CHANGE: Collect raw CSV lines instead of dictionary + all_csv_lines = [] # Collect all entries as CSV lines + total_chunks = len(chunks_to_process) + completed_chunks = 0 + + # Ensure per-chunk smart filtering is disabled globally during batch processing + _prev_filtered = os.getenv("_CHUNK_ALREADY_FILTERED") + _prev_force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER") + os.environ["_CHUNK_ALREADY_FILTERED"] = "1" + os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = "1" + + # Process in API batches + for batch_start in range(0, len(chunks_to_process), api_batch_size): + if is_stop_requested(): + break + + batch_end = min(batch_start + api_batch_size, len(chunks_to_process)) + batch_chunks = chunks_to_process[batch_start:batch_end] + + print(f"📑 Processing API batch {batch_start//api_batch_size + 1}: chunks {batch_start+1}-{batch_end}") + + # Use ThreadPoolExecutor for parallel API calls within batch + # Batch mode: issue multiple API calls in parallel within each batch (one worker per chunk) + with ThreadPoolExecutor(max_workers=len(batch_chunks)) as executor: + futures = {} + last_submission_time = 0 + + for chunk_idx, chunk_text in batch_chunks: + if is_stop_requested(): + break + + # Apply thread submission delay + if thread_delay > 0 and last_submission_time > 0: + time_since_last = time.time() - last_submission_time + if time_since_last < thread_delay: + sleep_time = thread_delay - time_since_last + print(f"🧵 Thread delay: {sleep_time:.1f}s for chunk {chunk_idx}") + time.sleep(sleep_time) + + future = executor.submit( + self._extract_with_custom_prompt, + custom_prompt, chunk_text, language, + min_frequency, max_names, max_titles, + None, output_dir, strip_honorifics, + fuzzy_threshold, filter_mode + ) + futures[future] = chunk_idx + last_submission_time = time.time() + + # Collect results + for future in as_completed(futures): + if is_stop_requested(): + break + + try: + chunk_glossary = future.result() + print(f"📑 DEBUG: Chunk {futures[future]} returned type={type(chunk_glossary)}, len={len(chunk_glossary)}") + + # Normalize to CSV lines (without header) + chunk_lines = [] + if isinstance(chunk_glossary, dict): + for raw_name, translated_name in chunk_glossary.items(): + entry_type = "character" if self._has_honorific(raw_name) else "term" + chunk_lines.append(f"{entry_type},{raw_name},{translated_name}") + elif isinstance(chunk_glossary, list): + for line in chunk_glossary: + if line and not line.startswith('type,'): + chunk_lines.append(line) + + # Aggregate for end-of-run + all_csv_lines.extend(chunk_lines) + + # Incremental update of glossary.csv in token-efficient format + try: + self._incremental_update_glossary(output_dir, 
chunk_lines, strip_honorifics, language, filter_mode) + print(f"📑 Incremental write: +{len(chunk_lines)} entries") + except Exception as e2: + print(f"⚠️ Incremental write failed: {e2}") + + completed_chunks += 1 + + # Print progress for GUI + progress_percent = (completed_chunks / total_chunks) * 100 + print(f"📑 Progress: {completed_chunks}/{total_chunks} chunks ({progress_percent:.0f}%)") + print(f"📑 Chunk {futures[future]} completed and aggregated") + + except Exception as e: + print(f"⚠️ API call for chunk {futures[future]} failed: {e}") + completed_chunks += 1 + progress_percent = (completed_chunks / total_chunks) * 100 + print(f"📑 Progress: {completed_chunks}/{total_chunks} chunks ({progress_percent:.0f}%)") + + # Add delay between API batches + if batch_end < len(chunks_to_process): + api_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) + print(f"⏱️ Waiting {api_delay}s before next API batch...") + time.sleep(api_delay) + + # CHANGE: Return CSV lines instead of dictionary + + # Restore per-chunk filter disabling envs + if _prev_filtered is None: + os.environ.pop("_CHUNK_ALREADY_FILTERED", None) + else: + os.environ["_CHUNK_ALREADY_FILTERED"] = _prev_filtered + if _prev_force_disable is None: + os.environ.pop("GLOSSARY_FORCE_DISABLE_SMART_FILTER", None) + else: + os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = _prev_force_disable + + # Restore previous defer setting + if _prev_defer is None: + # Default back to not deferring if it wasn't set + if "GLOSSARY_DEFER_SAVE" in os.environ: + del os.environ["GLOSSARY_DEFER_SAVE"] + else: + os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer + + return all_csv_lines + + def _incremental_update_glossary(self, output_dir, chunk_lines, strip_honorifics, language, filter_mode): + """Incrementally update glossary.csv (token-efficient) using an on-disk CSV aggregator. + This keeps glossary.csv present and growing after each chunk while preserving + token-efficient format for the visible file. 
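+ Flow: each chunk's lines are merged into glossary.incremental.csv (a plain CSV aggregator), the filter mode is applied, and the merged result is re-rendered as the token-efficient glossary.csv.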
+ """ + if not chunk_lines: + return + # Paths + agg_path = os.path.join(output_dir, "glossary.incremental.csv") + vis_path = os.path.join(output_dir, "glossary.csv") + # Ensure output dir + os.makedirs(output_dir, exist_ok=True) + # Compose CSV with header for merging + new_csv_lines = ["type,raw_name,translated_name"] + chunk_lines + # Load existing aggregator content, if any + existing_csv = None + if os.path.exists(agg_path): + try: + with open(agg_path, 'r', encoding='utf-8') as f: + existing_csv = f.read() + except Exception as e: + print(f"⚠️ Incremental: cannot read aggregator: {e}") + # Merge (exact merge, no fuzzy to keep this fast) + merged_csv_lines = self._merge_csv_entries(new_csv_lines, existing_csv or "", strip_honorifics, language) + # Optional filter mode + merged_csv_lines = self._filter_csv_by_mode(merged_csv_lines, filter_mode) + # Save aggregator (CSV) + self._atomic_write_file(agg_path, "\n".join(merged_csv_lines)) + # Convert to token-efficient format for visible glossary.csv + token_lines = self._convert_to_token_efficient_format(merged_csv_lines) + token_lines = self._sanitize_final_glossary_lines(token_lines, use_legacy_format=False) + self._atomic_write_file(vis_path, "\n".join(token_lines)) + if not os.path.exists(vis_path): + with open(vis_path, 'w', encoding='utf-8') as f: + f.write("\n".join(token_lines)) + + def _process_single_chunk(self, chunk_idx, chunk_text, custom_prompt, language, + min_frequency, max_names, max_titles, batch_size, + output_dir, strip_honorifics, fuzzy_threshold, filter_mode, + already_filtered=False): + """Process a single chunk - wrapper for parallel execution""" + print(f"📑 Worker processing chunk {chunk_idx} ({len(chunk_text):,} chars)...") + + if custom_prompt: + # Pass flag to indicate if text is already filtered + os.environ["_CHUNK_ALREADY_FILTERED"] = "1" if already_filtered else "0" + _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") + os.environ["GLOSSARY_DEFER_SAVE"] = "1" + try: + result = self._extract_with_custom_prompt( + custom_prompt, chunk_text, language, + min_frequency, max_names, max_titles, + None, output_dir, + strip_honorifics, fuzzy_threshold, filter_mode + ) + finally: + os.environ["_CHUNK_ALREADY_FILTERED"] = "0" # Reset + if _prev_defer is None: + if "GLOSSARY_DEFER_SAVE" in os.environ: + del os.environ["GLOSSARY_DEFER_SAVE"] + else: + os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer + return result + else: + return self._extract_with_patterns( + chunk_text, language, min_frequency, + max_names, max_titles, batch_size, + None, output_dir, + strip_honorifics, fuzzy_threshold, filter_mode + ) + + def _apply_final_filter(self, entries, filter_mode): + """Apply final filtering based on mode to ensure only requested types are included""" + if filter_mode == "only_with_honorifics": + # Filter to keep only entries that look like they have honorifics + filtered = {} + for key, value in entries.items(): + # Check if the key contains known honorific patterns + if self._has_honorific(key): + filtered[key] = value + print(f"📑 Final filter: Kept {len(filtered)} entries with honorifics (from {len(entries)} total)") + return filtered + elif filter_mode == "only_without_honorifics": + # Filter to keep only entries without honorifics + filtered = {} + for key, value in entries.items(): + if not self._has_honorific(key): + filtered[key] = value + print(f"📑 Final filter: Kept {len(filtered)} entries without honorifics (from {len(entries)} total)") + return filtered + else: + return entries + + def _looks_like_name(self, text): + 
"""Check if text looks like a character name""" + if not text: + return False + + # Check for various name patterns + # Korean names (2-4 hangul characters) + if all(0xAC00 <= ord(char) <= 0xD7AF for char in text) and 2 <= len(text) <= 4: + return True + + # Japanese names (mix of kanji/kana, 2-6 chars) + has_kanji = any(0x4E00 <= ord(char) <= 0x9FFF for char in text) + has_kana = any((0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF) for char in text) + if (has_kanji or has_kana) and 2 <= len(text) <= 6: + return True + + # Chinese names (2-4 Chinese characters) + if all(0x4E00 <= ord(char) <= 0x9FFF for char in text) and 2 <= len(text) <= 4: + return True + + # English names (starts with capital, mostly letters) + if text[0].isupper() and sum(1 for c in text if c.isalpha()) >= len(text) * 0.8: + return True + + return False + + def _has_honorific(self, term): + """Check if a term contains an honorific using PatternManager's comprehensive list""" + if not term: + return False + + term_lower = term.lower() + + # Check all language honorifics from PatternManager + for language, honorifics_list in self.pattern_manager.CJK_HONORIFICS.items(): + for honorific in honorifics_list: + # For romanized/English honorifics with spaces or dashes + if honorific.startswith(' ') or honorific.startswith('-'): + if term_lower.endswith(honorific.lower()): + return True + # For CJK honorifics (no separator) + else: + if honorific in term: + return True + + return False + + def _strip_all_honorifics(self, term, language='korean'): + """Strip all honorifics from a term using PatternManager's lists""" + if not term: + return term + + result = term + + # Get honorifics for the specific language and English romanizations + honorifics_to_strip = [] + if language in self.pattern_manager.CJK_HONORIFICS: + honorifics_to_strip.extend(self.pattern_manager.CJK_HONORIFICS[language]) + honorifics_to_strip.extend(self.pattern_manager.CJK_HONORIFICS.get('english', [])) + + # Sort by length (longest first) to avoid partial matches + honorifics_to_strip.sort(key=len, reverse=True) + + # Strip honorifics + for honorific in honorifics_to_strip: + if honorific.startswith(' ') or honorific.startswith('-'): + # For romanized honorifics with separators + if result.lower().endswith(honorific.lower()): + result = result[:-len(honorific)] + else: + # For CJK honorifics (no separator) + if result.endswith(honorific): + result = result[:-len(honorific)] + + return result.strip() + + def _convert_to_csv_format(self, data): + """Convert various glossary formats to CSV string format with enforced 3 columns""" + csv_lines = ["type,raw_name,translated_name"] + + if isinstance(data, str): + # Already CSV string + if data.strip().startswith('type,raw_name'): + return data + # Try to parse as JSON + try: + data = json.loads(data) + except: + return data + + if isinstance(data, list): + for item in data: + if isinstance(item, dict): + if 'type' in item and 'raw_name' in item: + # Already in correct format + line = f"{item['type']},{item['raw_name']},{item.get('translated_name', item['raw_name'])}" + csv_lines.append(line) + else: + # Old format - default to 'term' type + entry_type = 'term' + raw_name = item.get('original_name', '') + translated_name = item.get('name', raw_name) + if raw_name and translated_name: + csv_lines.append(f"{entry_type},{raw_name},{translated_name}") + + elif isinstance(data, dict): + if 'entries' in data: + # Has metadata wrapper, extract entries + for original, translated in data['entries'].items(): + 
csv_lines.append(f"term,{original},{translated}") + else: + # Plain dictionary - default to 'term' type + for original, translated in data.items(): + csv_lines.append(f"term,{original},{translated}") + + return '\n'.join(csv_lines) + + def _parse_csv_to_dict(self, csv_content): + """Parse CSV content to dictionary for backward compatibility""" + result = {} + lines = csv_content.strip().split('\n') + + for line in lines[1:]: # Skip header + if not line.strip(): + continue + parts = [p.strip() for p in line.split(',')] + if len(parts) >= 3: + result[parts[1]] = parts[2] # raw_name -> translated_name + + return result + + def _fuzzy_match(self, term1, term2, threshold=0.90): + """Check if two terms match using fuzzy matching""" + ratio = SequenceMatcher(None, term1.lower(), term2.lower()).ratio() + return ratio >= threshold + + def _fuzzy_match_rapidfuzz(self, term_lower, text_lower, threshold, term_len): + """Use rapidfuzz library for MUCH faster fuzzy matching""" + from rapidfuzz import fuzz + + print(f"📑 Using RapidFuzz (C++ speed)...") + start_time = time.time() + + matches_count = 0 + threshold_percent = threshold * 100 # rapidfuzz uses 0-100 scale + + # Can use smaller step because rapidfuzz is so fast + step = 1 # Check every position - rapidfuzz can handle it + + # Process text + for i in range(0, len(text_lower) - term_len + 1, step): + # Check stop flag every 10000 positions + if i > 0 and i % 10000 == 0: + if is_stop_requested(): + print(f"📑 RapidFuzz stopped at position {i}") + return matches_count + + window = text_lower[i:i + term_len] + + # rapidfuzz is fast enough we can check every position + if fuzz.ratio(term_lower, window) >= threshold_percent: + matches_count += 1 + + elapsed = time.time() - start_time + print(f"📑 RapidFuzz found {matches_count} matches in {elapsed:.2f}s") + return matches_count + + def _batch_compute_frequencies(self, terms, all_text, fuzzy_threshold=0.90, min_frequency=2): + """Compute frequencies for all terms at once - MUCH faster than individual checking""" + print(f"📑 Computing frequencies for {len(terms)} terms in batch mode...") + start_time = time.time() + + # Result dictionary + term_frequencies = {} + + # First pass: exact matching (very fast) + print(f"📑 Phase 1: Exact matching...") + text_lower = all_text.lower() + for term in terms: + if is_stop_requested(): + return term_frequencies + term_lower = term.lower() + count = text_lower.count(term_lower) + term_frequencies[term] = count + + exact_time = time.time() - start_time + high_freq_terms = sum(1 for count in term_frequencies.values() if count >= min_frequency) + print(f"📑 Exact matching complete: {high_freq_terms}/{len(terms)} terms meet threshold ({exact_time:.1f}s)") + + # If fuzzy matching is disabled, we're done + if fuzzy_threshold >= 1.0: + return term_frequencies + + # Second pass: fuzzy matching ONLY for low-frequency terms + low_freq_terms = [term for term, count in term_frequencies.items() if count < min_frequency] + + if low_freq_terms: + print(f"📑 Phase 2: Fuzzy matching for {len(low_freq_terms)} low-frequency terms...") + + # Try to use RapidFuzz batch processing + try: + from rapidfuzz import process, fuzz + + # For very large texts, sample it for fuzzy matching + if len(text_lower) > 500000: + print(f"📑 Text too large ({len(text_lower):,} chars), sampling for fuzzy matching...") + # Sample every Nth character to reduce size + sample_rate = max(1, len(text_lower) // 100000) + sampled_text = text_lower[::sample_rate] + else: + sampled_text = text_lower + + # Create chunks of 
text for fuzzy matching + chunk_size = 1000 # Process text in chunks + text_chunks = [sampled_text[i:i+chunk_size] for i in range(0, len(sampled_text), chunk_size//2)] # Overlapping chunks + + print(f"📑 Processing {len(text_chunks)} text chunks...") + threshold_percent = fuzzy_threshold * 100 + + # Process in batches to avoid memory issues + batch_size = 100 # Process 100 terms at a time + for batch_start in range(0, len(low_freq_terms), batch_size): + if is_stop_requested(): + break + + batch_end = min(batch_start + batch_size, len(low_freq_terms)) + batch_terms = low_freq_terms[batch_start:batch_end] + + for term in batch_terms: + if is_stop_requested(): + break + + # Quick fuzzy search in chunks + fuzzy_count = 0 + for chunk in text_chunks[:50]: # Limit to first 50 chunks for speed + if fuzz.partial_ratio(term.lower(), chunk) >= threshold_percent: + fuzzy_count += 1 + + if fuzzy_count > 0: + # Scale up based on sampling + if len(text_lower) > 500000: + fuzzy_count *= (len(text_lower) // len(sampled_text)) + term_frequencies[term] += fuzzy_count + + if (batch_end % 500 == 0) or (batch_end == len(low_freq_terms)): + elapsed = time.time() - start_time + print(f"📑 Processed {batch_end}/{len(low_freq_terms)} terms ({elapsed:.1f}s)") + + except ImportError: + print("📑 RapidFuzz not available, skipping fuzzy matching") + + total_time = time.time() - start_time + final_high_freq = sum(1 for count in term_frequencies.values() if count >= min_frequency) + print(f"📑 Batch frequency computation complete: {final_high_freq}/{len(terms)} terms accepted ({total_time:.1f}s)") + + return term_frequencies + + def _find_fuzzy_matches(self, term, text, threshold=0.90): + """Find fuzzy matches of a term in text using efficient method with parallel processing""" + start_time = time.time() + + term_lower = term.lower() + text_lower = text.lower() + term_len = len(term) + + # Only log for debugging if explicitly enabled + debug_search = os.getenv("GLOSSARY_DEBUG_SEARCH", "0") == "1" + if debug_search and len(text) > 100000: + print(f"📑 Searching for '{term}' in {len(text):,} chars (threshold: {threshold})") + + # Strategy 1: Use exact matching first for efficiency + exact_start = time.time() + matches_count = text_lower.count(term_lower) + exact_time = time.time() - exact_start + + if matches_count > 0: + if debug_search and len(text) > 100000: + print(f"📑 Found {matches_count} exact matches in {exact_time:.3f}s") + return matches_count + + # Strategy 2: Try rapidfuzz if available (much faster) + if matches_count == 0 and threshold < 1.0: + try: + from rapidfuzz import fuzz + return self._fuzzy_match_rapidfuzz(term_lower, text_lower, threshold, term_len) + except ImportError: + pass # Fall back to parallel/sequential + + # Strategy 3: Fall back to parallel/sequential if rapidfuzz not available + # Check if parallel processing is enabled + extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) + + if extraction_workers > 1 and len(text) > 50000: # Use parallel for large texts + return self._parallel_fuzzy_search(term_lower, text_lower, threshold, term_len, extraction_workers) + else: + return
self._sequential_fuzzy_search(term_lower, text_lower, threshold, term_len) + + def _parallel_fuzzy_search(self, term_lower, text_lower, threshold, term_len, num_workers): + """Parallel fuzzy search using ThreadPoolExecutor""" + print(f"📑 Starting parallel fuzzy search with {num_workers} workers...") + + text_len = len(text_lower) + matches_count = 0 + + # Split text into overlapping chunks for parallel processing + chunk_size = max(text_len // num_workers, term_len * 100) + chunks = [] + + for i in range(0, text_len, chunk_size): + # Add overlap to avoid missing matches at boundaries + end = min(i + chunk_size + term_len - 1, text_len) + chunks.append((i, text_lower[i:end])) + + print(f"📑 Split into {len(chunks)} chunks of ~{chunk_size:,} chars each") + + # Process chunks in parallel + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + + for chunk_idx, (start_pos, chunk_text) in enumerate(chunks): + if is_stop_requested(): + return matches_count + + future = executor.submit( + self._fuzzy_search_chunk, + term_lower, chunk_text, threshold, term_len, chunk_idx, len(chunks) + ) + futures.append(future) + + # Collect results + for future in as_completed(futures): + if is_stop_requested(): + executor.shutdown(wait=False) + return matches_count + + try: + chunk_matches = future.result() + matches_count += chunk_matches + except Exception as e: + print(f"📑 ⚠️ Chunk processing error: {e}") + + print(f"📑 Parallel fuzzy search found {matches_count} matches") + return matches_count + + def _fuzzy_search_chunk(self, term_lower, chunk_text, threshold, term_len, chunk_idx, total_chunks): + """Process a single chunk for fuzzy matches""" + chunk_matches = 0 + + # Use a more efficient step size - no need to check every position + step = max(1, term_len // 3) # Check every third of term length + + for i in range(0, len(chunk_text) - term_len + 1, step): + # Check stop flag periodically + if i > 0 and i % 1000 == 0: + if is_stop_requested(): + return chunk_matches + + window = chunk_text[i:i + term_len] + + # Use SequenceMatcher for fuzzy matching + if SequenceMatcher(None, term_lower, window).ratio() >= threshold: + chunk_matches += 1 + + # Log progress for this chunk + if total_chunks > 1: + print(f"📑 Chunk {chunk_idx + 1}/{total_chunks} completed: {chunk_matches} matches") + + return chunk_matches + + def _sequential_fuzzy_search(self, term_lower, text_lower, threshold, term_len): + """Sequential fuzzy search (fallback for small texts or single worker)""" + print(f"📑 Starting sequential fuzzy search...") + fuzzy_start = time.time() + + matches_count = 0 + + # More efficient step size + step = max(1, term_len // 3) + total_windows = (len(text_lower) - term_len + 1) // step + + print(f"📑 Checking ~{total_windows:,} windows with step size {step}") + + windows_checked = 0 + for i in range(0, len(text_lower) - term_len + 1, step): + # Check stop flag frequently + if i > 0 and i % (step * 100) == 0: + if is_stop_requested(): + return matches_count + + # Progress log for very long operations + if windows_checked % 1000 == 0 and windows_checked > 0: + elapsed = time.time() - fuzzy_start + rate = windows_checked / elapsed if elapsed > 0 else 0 + eta = (total_windows - windows_checked) / rate if rate > 0 else 0 + print(f"📑 Progress: {windows_checked}/{total_windows} windows, {rate:.0f} w/s, ETA: {eta:.1f}s") + + window = text_lower[i:i + term_len] + if SequenceMatcher(None, term_lower, window).ratio() >= threshold: + matches_count += 1 + + windows_checked += 1 +
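+ # e.g. a 10-character term scanned over 100,000 characters gives step max(1, 10 // 3) = 3 and ~33,330 windows to score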
+ fuzzy_time = time.time() - fuzzy_start + print(f"📑 Sequential fuzzy search completed in {fuzzy_time:.2f}s, found {matches_count} matches") + + return matches_count + + def _strip_honorific(self, term, language_hint='unknown'): + """Strip honorific from a term if present""" + if not term: + return term + + # Get honorifics for the detected language + honorifics_to_check = [] + if language_hint in self.pattern_manager.CJK_HONORIFICS: + honorifics_to_check.extend(self.pattern_manager.CJK_HONORIFICS[language_hint]) + honorifics_to_check.extend(self.pattern_manager.CJK_HONORIFICS.get('english', [])) + + # Check and remove honorifics + for honorific in honorifics_to_check: + if honorific.startswith('-') or honorific.startswith(' '): + # English-style suffix + if term.endswith(honorific): + return term[:-len(honorific)].strip() + else: + # CJK-style suffix (no separator) + if term.endswith(honorific): + return term[:-len(honorific)] + + return term + + def _translate_chunk_traditional(self, chunk_text, chunk_index, total_chunks, chapter_title=""): + """Simplified translation for traditional APIs (DeepL, Google Translate)""" + + print(f"📝 Using traditional translation API for chunk {chunk_index}/{total_chunks}") + + # Traditional APIs don't use complex prompts, just need the text + messages = [] + + # Add minimal system context for language detection + profile = self.active_profile + if profile == 'korean': + lang_hint = "Translating from Korean to English" + elif profile == 'japanese': + lang_hint = "Translating from Japanese to English" + elif profile == 'chinese': + lang_hint = "Translating from Chinese to English" + else: + lang_hint = "Translating to English" + + messages.append({ + "role": "system", + "content": lang_hint + }) + + # For traditional APIs, we need to handle glossary differently + # Apply glossary terms as preprocessing if available + processed_text = chunk_text + + if hasattr(self, 'glossary_manager') and self.glossary_manager and self.glossary_manager.entries: + # Pre-process: Mark glossary terms with placeholders + glossary_placeholders = {} + placeholder_index = 0 + + for entry in self.glossary_manager.entries: + source = entry.get('source', '') + target = entry.get('target', '') + + if source and target and source in processed_text: + # Create unique placeholder + placeholder = f"[[GLOSS_{placeholder_index}]]" + glossary_placeholders[placeholder] = target + processed_text = processed_text.replace(source, placeholder) + placeholder_index += 1 + + print(f"📚 Applied {len(glossary_placeholders)} glossary placeholders") + + # Add the text to translate + messages.append({ + "role": "user", + "content": processed_text + }) + + # Send to API + try: + response = self.client.send(messages) + + if response and response.content: + translated_text = response.content + + # Post-process: Replace placeholders with glossary terms + if 'glossary_placeholders' in locals(): + for placeholder, target in glossary_placeholders.items(): + translated_text = translated_text.replace(placeholder, target) + print(f"✅ Restored {len(glossary_placeholders)} glossary terms") + + # Log detected language if available + if hasattr(response, 'usage') and response.usage: + detected_lang = response.usage.get('detected_source_lang') + if detected_lang: + print(f"🔍 Detected source language:
{detected_lang}") + + return translated_text + else: + print("❌ No translation received from traditional API") + return None + + except Exception as e: + print(f"❌ Traditional API translation error: {e}") + return None + + def _filter_text_for_glossary(self, text, min_frequency=2): + """Filter text to extract only meaningful content for glossary extraction""" + import re + from collections import Counter + from concurrent.futures import ThreadPoolExecutor, as_completed + import time + + filter_start_time = time.time() + print(f"📑 Starting smart text filtering...") + print(f"📑 Input text size: {len(text):,} characters") + + # Clean HTML if present + print(f"📑 Step 1/7: Cleaning HTML tags...") + from bs4 import BeautifulSoup + soup = BeautifulSoup(text, 'html.parser') + clean_text = soup.get_text() + print(f"📑 Clean text size: {len(clean_text):,} characters") + + # Detect primary language for better filtering + print(f"📑 Step 2/7: Detecting primary language...") + def detect_primary_language(text_sample): + sample = text_sample[:1000] + korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF) + japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF)) + chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF) + + if korean_chars > 50: + return 'korean' + elif japanese_kana > 20: + return 'japanese' + elif chinese_chars > 50 and japanese_kana < 10: + return 'chinese' + else: + return 'english' + + primary_lang = detect_primary_language(clean_text) + print(f"📑 Detected primary language: {primary_lang}") + + # Split into sentences for better context + print(f"📑 Step 3/7: Splitting text into sentences...") + sentences = re.split(r'[.!?。!?]+', clean_text) + print(f"📑 Found {len(sentences):,} sentences") + + # Extract potential terms (words/phrases that appear multiple times) + print(f"📑 Step 4/7: Setting up extraction patterns and exclusion rules...") + word_freq = Counter() + + # Pattern for detecting potential names/terms based on capitalization or special characters + # Korean names: 2-4 hangul characters WITHOUT honorifics + korean_pattern = r'[가-힣]{2,4}' + # Japanese names: kanji/hiragana/katakana combinations + japanese_pattern = r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]{2,6}' + # Chinese names: 2-4 Chinese characters + chinese_pattern = r'[\u4e00-\u9fff]{2,4}' + # English proper nouns: Capitalized words + english_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b' + + # Combine patterns + combined_pattern = f'({korean_pattern}|{japanese_pattern}|{chinese_pattern}|{english_pattern})' + print(f"📑 Using combined regex pattern for {primary_lang} text") + + # Get honorifics and title patterns for the detected language + honorifics_to_exclude = set() + if primary_lang in self.pattern_manager.CJK_HONORIFICS: + honorifics_to_exclude.update(self.pattern_manager.CJK_HONORIFICS[primary_lang]) + # Also add English romanizations + honorifics_to_exclude.update(self.pattern_manager.CJK_HONORIFICS.get('english', [])) + + # Compile title patterns for the language + title_patterns = [] + if primary_lang in self.pattern_manager.TITLE_PATTERNS: + for pattern in self.pattern_manager.TITLE_PATTERNS[primary_lang]: + title_patterns.append(re.compile(pattern)) + + # Function to check if a term should be excluded + def should_exclude_term(term): + term_lower = term.lower() + + # Check if it's a common word + if term in self.pattern_manager.COMMON_WORDS or term_lower in self.pattern_manager.COMMON_WORDS: + return True + + # Check 
if it contains honorifics + for honorific in honorifics_to_exclude: + if honorific in term or (honorific.startswith('-') and term.endswith(honorific[1:])): + return True + + # Check if it matches title patterns + for pattern in title_patterns: + if pattern.search(term): + return True + + # Check if it's a number (including Chinese numbers) + if term in self.pattern_manager.CHINESE_NUMS: + return True + + # Check if it's just digits + if term.isdigit(): + return True + + return False + + # Extract potential terms from each sentence + print(f"📑 Step 5/7: Extracting and filtering terms from sentences...") + + # Check if we should use parallel processing + extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) + # Auto-detect optimal workers if not set + if extraction_workers == 1 and len(sentences) > 1000: + # Use more cores for better parallelization + cpu_count = os.cpu_count() or 4 + extraction_workers = min(cpu_count, 12) # Use up to 12 cores + print(f"📑 Auto-detected {cpu_count} CPU cores, using {extraction_workers} workers") + + use_parallel = extraction_workers > 1 and len(sentences) > 100 + + if use_parallel: + print(f"📑 Using parallel processing with {extraction_workers} workers") + print(f"📑 Estimated speedup: {extraction_workers}x faster") + + important_sentences = [] + seen_contexts = set() + processed_count = 0 + total_sentences = len(sentences) + last_progress_time = time.time() + + def process_sentence_batch(batch_sentences, batch_idx): + """Process a batch of sentences""" + local_word_freq = Counter() + local_important = [] + local_seen = set() + + for sentence in batch_sentences: + sentence = sentence.strip() + if len(sentence) < 10 or len(sentence) > 500: + continue + + # Find all potential terms in this sentence + matches = re.findall(combined_pattern, sentence) + + if matches: + # Filter out excluded terms + filtered_matches = [] + for match in matches: + if not should_exclude_term(match): + local_word_freq[match] += 1 + filtered_matches.append(match) + + # Keep sentences with valid potential terms + if filtered_matches: + sentence_key = ' '.join(sorted(filtered_matches)) + if sentence_key not in local_seen: + local_important.append(sentence) + local_seen.add(sentence_key) + + return local_word_freq, local_important, local_seen, batch_idx + + if use_parallel: + # Force SMALL batches for real parallelization + # We want MANY small batches, not few large ones! 
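+ # Worked example (assuming 8 workers and 30,000 sentences): the size table below picks 300; + # min_batches = 8 * 3 = 24 gives max_batch_size = max(50, 30000 // 24) = 1250, so 300 stands, + # yielding 100 batches of 300 sentences, roughly 12 batches per worker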
+ + # Calculate based on total sentences + total_sentences = len(sentences) + + if total_sentences < 1000: + # Small dataset: 50-100 sentences per batch + optimal_batch_size = 100 + elif total_sentences < 10000: + # Medium dataset: 200 sentences per batch + optimal_batch_size = 200 + elif total_sentences < 50000: + # Large dataset: 300 sentences per batch + optimal_batch_size = 300 + else: + # Very large dataset: 400 sentences per batch max + optimal_batch_size = 400 + + # Ensure we have enough batches for all workers + min_batches = extraction_workers * 3 # At least 3 batches per worker + max_batch_size = max(50, total_sentences // min_batches) + optimal_batch_size = min(optimal_batch_size, max_batch_size) + + print(f"📑 Total sentences: {total_sentences:,}") + print(f"📑 Target batch size: {optimal_batch_size} sentences") + + # Calculate expected number of batches + expected_batches = (total_sentences + optimal_batch_size - 1) // optimal_batch_size + print(f"📑 Expected batches: {expected_batches} (for {extraction_workers} workers)") + print(f"📑 Batches per worker: ~{expected_batches // extraction_workers} batches") + + batches = [sentences[i:i + optimal_batch_size] for i in range(0, len(sentences), optimal_batch_size)] + print(f"📑 Processing {len(batches)} batches of ~{optimal_batch_size} sentences each") + print(f"📑 Expected speedup: {min(extraction_workers, len(batches))}x (using {extraction_workers} workers)") + + # Decide between ThreadPoolExecutor and ProcessPoolExecutor + import multiprocessing + in_subprocess = multiprocessing.current_process().name != 'MainProcess' + + # Use ProcessPoolExecutor for better parallelism on larger datasets + use_process_pool = (not in_subprocess and len(sentences) > 5000) + + if use_process_pool: + print(f"📑 Using ProcessPoolExecutor for maximum performance (true parallelism)") + executor_class = ProcessPoolExecutor + else: + print(f"📑 Using ThreadPoolExecutor for sentence processing") + executor_class = ThreadPoolExecutor + + with executor_class(max_workers=extraction_workers) as executor: + futures = [] + + # Prepare data for ProcessPoolExecutor if needed + if use_process_pool: + # Serialize exclusion check data for process pool + exclude_check_data = ( + list(honorifics_to_exclude), + [p.pattern for p in title_patterns], # Convert regex to strings + self.pattern_manager.COMMON_WORDS, + self.pattern_manager.CHINESE_NUMS + ) + + for idx, batch in enumerate(batches): + if use_process_pool: + # Use module-level function for ProcessPoolExecutor + future = executor.submit(_process_sentence_batch_for_extraction, + (batch, idx, combined_pattern, exclude_check_data)) + else: + # Use local function for ThreadPoolExecutor + future = executor.submit(process_sentence_batch, batch, idx) + + futures.append(future) + # Yield to GUI when submitting futures + if idx % 10 == 0: + time.sleep(0.001) + + # Collect results with progress + completed_batches = 0 + batch_start_time = time.time() + for future in as_completed(futures): + # Get result without timeout - as_completed already handles waiting + local_word_freq, local_important, local_seen, batch_idx = future.result() + + # Merge results + word_freq.update(local_word_freq) + for sentence in local_important: + sentence_key = ' '.join(sorted(re.findall(combined_pattern, sentence))) + if sentence_key not in seen_contexts: + important_sentences.append(sentence) + seen_contexts.add(sentence_key) + + processed_count += len(batches[batch_idx]) + completed_batches += 1 + + # Show progress every 10 batches or at key 
milestones + if completed_batches % 10 == 0 or completed_batches == len(batches): + progress = (processed_count / total_sentences) * 100 + elapsed = time.time() - batch_start_time + rate = (processed_count / elapsed) if elapsed > 0 else 0 + print(f"📑 Progress: {processed_count:,}/{total_sentences:,} sentences ({progress:.1f}%) | Batch {completed_batches}/{len(batches)} | {rate:.0f} sent/sec") + + # Yield to GUI after each batch completes + time.sleep(0.001) + else: + # Sequential processing with progress + for idx, sentence in enumerate(sentences): + sentence = sentence.strip() + if len(sentence) < 10 or len(sentence) > 500: + continue + + # Find all potential terms in this sentence + matches = re.findall(combined_pattern, sentence) + + if matches: + # Filter out excluded terms + filtered_matches = [] + for match in matches: + if not should_exclude_term(match): + word_freq[match] += 1 + filtered_matches.append(match) + + # Keep sentences with valid potential terms + if filtered_matches: + sentence_key = ' '.join(sorted(filtered_matches)) + if sentence_key not in seen_contexts: + important_sentences.append(sentence) + seen_contexts.add(sentence_key) + + # Show progress every 1000 sentences or 2 seconds + if idx % 1000 == 0 or (time.time() - last_progress_time > 2): + progress = ((idx + 1) / total_sentences) * 100 + print(f"📑 Processing sentences: {idx + 1:,}/{total_sentences:,} ({progress:.1f}%)") + last_progress_time = time.time() + # Yield to GUI thread every 1000 sentences + time.sleep(0.001) # Tiny sleep to let GUI update + + print(f"📑 Found {len(important_sentences):,} sentences with potential glossary terms") + + # Step 6/7: Deduplicate and normalize terms + print(f"📑 Step 6/7: Normalizing and deduplicating {len(word_freq):,} unique terms...") + + # Since should_exclude_term already filters honorifics, we just need to deduplicate + # based on normalized forms (lowercase, etc.) + # Track the best original form per normalized key so the lookup key and the stored key match; + # e.g. 'Aria': 7 and 'ARIA': 2 both normalize to 'aria', and ('Aria', 7) wins + best_by_normalized = {} + term_count = 0 + + for term, count in word_freq.items(): + # Normalize term for deduplication (but keep original form) + normalized = term.lower().strip() + + # Keep the version with highest count + current = best_by_normalized.get(normalized) + if current is None or count > current[1]: + best_by_normalized[normalized] = (term, count) + + term_count += 1 + # Yield to GUI every 1000 terms + if term_count % 1000 == 0: + time.sleep(0.001) + + combined_freq = Counter(dict(best_by_normalized.values())) + + print(f"📑 Deduplicated to {len(combined_freq):,} unique terms") + + # Filter to keep only terms that appear at least min_frequency times + frequent_terms = {term: count for term, count in combined_freq.items() if count >= min_frequency} + + # Build filtered text focusing on sentences containing frequent terms + print(f"📑 Step 7/7: Building filtered text from relevant sentences...") + + # OPTIMIZATION: Skip sentences that already passed filtering in step 5 + # These sentences already contain glossary terms, no need to check again! + # We just need to limit the sample size + + filtered_sentences = important_sentences # Already filtered!
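+ # The only rescans below are the optional top-term narrowing and the final sampling; + # e.g. with 1,000 surviving sentences and GLOSSARY_MAX_SENTENCES=200, step = 1000 // 200 = 5, keeping every 5th sentence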
+ print(f"📑 Using {len(filtered_sentences):,} pre-filtered sentences (already contain glossary terms)") + + # For extremely large datasets, we can optionally do additional filtering + if len(filtered_sentences) > 10000 and len(frequent_terms) > 1000: + print(f"📑 Large dataset detected - applying frequency-based filtering...") + print(f"📑 Filtering {len(filtered_sentences):,} sentences for top frequent terms...") + + # Sort terms by frequency to prioritize high-frequency ones + sorted_terms = sorted(frequent_terms.items(), key=lambda x: x[1], reverse=True) + top_terms = dict(sorted_terms[:1000]) # Focus on top 1000 most frequent terms + + print(f"📑 Using top {len(top_terms):,} most frequent terms for final filtering") + + # Use parallel processing only if really needed + if use_parallel and len(filtered_sentences) > 5000: + import multiprocessing + in_subprocess = multiprocessing.current_process().name != 'MainProcess' + + # Create a simple set of terms for fast lookup (no variations needed) + term_set = set(top_terms.keys()) + + print(f"📑 Using parallel filtering with {extraction_workers} workers...") + + # Optimize batch size + check_batch_size = 500 # Larger batches since we're doing simpler checks + check_batches = [filtered_sentences[i:i + check_batch_size] + for i in range(0, len(filtered_sentences), check_batch_size)] + + print(f"📑 Processing {len(check_batches)} batches of ~{check_batch_size} sentences") + + # Simple function to check if sentence contains any top term + def check_batch_simple(batch): + result = [] + for sentence in batch: + # Simple substring check - much faster than regex + for term in term_set: + if term in sentence: + result.append(sentence) + break + return result + + new_filtered = [] + with ThreadPoolExecutor(max_workers=extraction_workers) as executor: + futures = [executor.submit(check_batch_simple, batch) for batch in check_batches] + + for future in as_completed(futures): + new_filtered.extend(future.result()) + + filtered_sentences = new_filtered + print(f"📑 Filtered to {len(filtered_sentences):,} sentences containing top terms") + else: + # For smaller datasets, simple sequential filtering + print(f"📑 Using sequential filtering...") + new_filtered = [] + for i, sentence in enumerate(filtered_sentences): + for term in top_terms: + if term in sentence: + new_filtered.append(sentence) + break + if i % 1000 == 0: + print(f"📑 Progress: {i:,}/{len(filtered_sentences):,} sentences") + time.sleep(0.001) + + filtered_sentences = new_filtered + print(f"📑 Filtered to {len(filtered_sentences):,} sentences containing top terms") + + print(f"📑 Selected {len(filtered_sentences):,} sentences containing frequent terms") + + # Limit the number of sentences to reduce token usage + max_sentences = int(os.getenv("GLOSSARY_MAX_SENTENCES", "200")) + if len(filtered_sentences) > max_sentences: + print(f"📑 Limiting to {max_sentences} representative sentences (from {len(filtered_sentences):,})") + # Take a representative sample + step = len(filtered_sentences) // max_sentences + filtered_sentences = filtered_sentences[::step][:max_sentences] + + filtered_text = ' '.join(filtered_sentences) + + # Calculate and display filtering statistics + filter_end_time = time.time() + filter_duration = filter_end_time - filter_start_time + + original_length = len(clean_text) + filtered_length = len(filtered_text) + reduction_percent = ((original_length - filtered_length) / original_length * 100) if original_length > 0 else 0 + + print(f"\n📑 === FILTERING COMPLETE ===") + print(f"📑 Duration: 
{filter_duration:.1f} seconds")
+        print(f"📑 Text reduction: {original_length:,} → {filtered_length:,} chars ({reduction_percent:.1f}% reduction)")
+        print(f"📑 Terms found: {len(frequent_terms):,} unique terms (min frequency: {min_frequency})")
+        print(f"📑 Final output: {len(filtered_sentences)} sentences, {filtered_length:,} characters")
+        print(f"📑 Performance: {(original_length / filter_duration / 1000):.1f}K chars/second")
+        print(f"📑 ========================\n")
+        
+        return filtered_text, frequent_terms
+    
+    def _extract_with_custom_prompt(self, custom_prompt, all_text, language,
+                                    min_frequency, max_names, max_titles,
+                                    existing_glossary, output_dir,
+                                    strip_honorifics=True, fuzzy_threshold=0.90, filter_mode='all'):
+        """Extract glossary using custom AI prompt with proper filtering"""
+        print("📑 Using custom automatic glossary prompt")
+        extraction_start = time.time()
+        
+        # Check stop flag
+        if is_stop_requested():
+            print("📑 ❌ Glossary extraction stopped by user")
+            return {}
+        
+        # Note: Filter mode can be controlled via the configurable prompt environment variable
+        # No hardcoded filter instructions are added here
+        
+        try:
+            MODEL = os.getenv("MODEL", "gemini-2.0-flash")
+            API_KEY = (os.getenv("API_KEY") or
+                       os.getenv("OPENAI_API_KEY") or
+                       os.getenv("OPENAI_OR_Gemini_API_KEY") or
+                       os.getenv("GEMINI_API_KEY"))
+            
+            if is_traditional_translation_api(MODEL):
+                # Traditional translation APIs cannot run prompt-based extraction,
+                # so fall back to pattern-based extraction instead
+                print(f"📑 {MODEL} is a traditional translation API - using pattern-based extraction")
+                return self._extract_with_patterns(all_text, language, min_frequency,
+                                                   max_names, max_titles, 50,
+                                                   existing_glossary, output_dir,
+                                                   strip_honorifics, fuzzy_threshold, filter_mode)
+            elif not API_KEY:
+                print(f"📑 No API key found, falling back to pattern-based extraction")
+                return self._extract_with_patterns(all_text, language, min_frequency,
+                                                   max_names, max_titles, 50,
+                                                   existing_glossary, output_dir,
+                                                   strip_honorifics, fuzzy_threshold, filter_mode)
+            else:
+                print(f"📑 Using AI-assisted extraction with custom prompt")
+                
+                from unified_api_client import UnifiedClient, UnifiedClientError
+                client = UnifiedClient(model=MODEL, api_key=API_KEY, output_dir=output_dir)
+                if hasattr(client, 'reset_cleanup_state'):
+                    client.reset_cleanup_state()
+                
+                # Apply thread submission delay using the client's method
+                thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5"))
+                if thread_delay > 0:
+                    client._apply_thread_submission_delay()
+                
+                # Check if cancelled during delay
+                if hasattr(client, '_cancelled') and client._cancelled:
+                    print("📑 ❌ Glossary extraction stopped during delay")
+                    return {}
+                
+                # Check if text is already filtered (from chunking)
+                already_filtered = os.getenv("_CHUNK_ALREADY_FILTERED", "0") == "1"
+                
+                if already_filtered:
+                    print("📑 Text already filtered during chunking, skipping re-filtering")
+                    text_sample = all_text  # Use as-is since it's already filtered
+                    detected_terms = {}
+                else:
+                    # Apply smart filtering to reduce noise and focus on meaningful content
+                    force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER", "0") == "1"
+                    use_smart_filter = (os.getenv("GLOSSARY_USE_SMART_FILTER", "1") == "1") and not force_disable
+                    
+                    if use_smart_filter:
+                        print("📑 Applying smart text filtering to reduce noise...")
+                        text_sample, detected_terms = self._filter_text_for_glossary(all_text, min_frequency)
+                    else:
+                        print("📑 Smart filter disabled - using raw text sample")
+                        # Fallback to simple truncation
+                        max_text_size = int(os.getenv("GLOSSARY_MAX_TEXT_SIZE", "50000"))
+                        text_sample = all_text[:max_text_size] if len(all_text) > max_text_size and max_text_size > 0 else all_text
+                        detected_terms = {}
+                
+                # Replace placeholders in prompt
+                prompt = custom_prompt.replace('{language}',
language) + prompt = prompt.replace('{min_frequency}', str(min_frequency)) + prompt = prompt.replace('{max_names}', str(max_names)) + prompt = prompt.replace('{max_titles}', str(max_titles)) + + # Get the format instructions from environment variable + format_instructions = os.getenv("GLOSSARY_FORMAT_INSTRUCTIONS", "") + + # If no format instructions are provided, use a default + if not format_instructions: + format_instructions = """ +Return the results in EXACT CSV format with this header: +type,raw_name,translated_name + +For example: +character,김상현,Kim Sang-hyu +character,갈편제,Gale Hardest +character,디히릿 아데,Dihirit Ade + +Only include entries that actually appear in the text. +Do not use quotes around values unless they contain commas. + +Text to analyze: +{text_sample}""" + + # Replace placeholders in format instructions + format_instructions = format_instructions.replace('{text_sample}', text_sample) + + # Combine the user's prompt with format instructions + enhanced_prompt = f"{prompt}\n\n{format_instructions}" + + messages = [ + {"role": "system", "content": "You are a glossary extraction assistant. Return ONLY CSV format with exactly 3 columns: type,raw_name,translated_name. The 'type' column should classify entries (e.g., character, term, location, etc.)."}, + {"role": "user", "content": enhanced_prompt} + ] + + # Check stop before API call + if is_stop_requested(): + print("📑 ❌ Glossary extraction stopped before API call") + return {} + + try: + temperature = float(os.getenv("TEMPERATURE", "0.3")) + max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "4096")) + + # Use send_with_interrupt for interruptible API call + chunk_timeout = int(os.getenv("CHUNK_TIMEOUT", "900")) # 15 minute default for glossary + print(f"📑 Sending AI extraction request (timeout: {chunk_timeout}s, interruptible)...") + + # Before API call + api_start = time.time() + print(f"📑 Preparing API request (text size: {len(text_sample):,} chars)...") + print(f"📑 ⏳ Processing {len(text_sample):,} characters... 
Please wait, this may take 5-10 minutes")
+                    
+                    response = send_with_interrupt(
+                        messages=messages,
+                        client=client,
+                        temperature=temperature,
+                        max_tokens=max_tokens,
+                        stop_check_fn=is_stop_requested,
+                        chunk_timeout=chunk_timeout
+                    )
+                    api_time = time.time() - api_start
+                    print(f"📑 API call completed in {api_time:.1f}s")
+                    
+                    # Get the actual text from the response
+                    if hasattr(response, 'content'):
+                        response_text = response.content
+                    else:
+                        response_text = str(response)
+                    
+                    # Before processing response
+                    process_start = time.time()
+                    print(f"📑 Processing AI response...")
+                    
+                    # Process response and build CSV
+                    csv_lines = self._process_ai_response(response_text, all_text, min_frequency,
+                                                          strip_honorifics, fuzzy_threshold,
+                                                          language, filter_mode)
+                    
+                    print(f"📑 AI extracted {len(csv_lines) - 1} valid terms (header excluded)")
+                    
+                    process_time = time.time() - process_start
+                    print(f"📑 Response processing took {process_time:.1f}s")
+                    
+                    # If we're running per-chunk, defer all heavy work and saving
+                    if os.getenv("GLOSSARY_DEFER_SAVE", "0") == "1":
+                        return csv_lines
+                    
+                    # Check stop before merging
+                    if is_stop_requested():
+                        print("📑 ❌ Glossary generation stopped before merging")
+                        return {}
+                    
+                    # Merge with existing glossary if present
+                    if existing_glossary:
+                        csv_lines = self._merge_csv_entries(csv_lines, existing_glossary, strip_honorifics, language)
+                    
+                    # Fuzzy matching deduplication
+                    skip_frequency_check = os.getenv("GLOSSARY_SKIP_FREQUENCY_CHECK", "0") == "1"
+                    if not skip_frequency_check:  # Only dedupe if we're checking frequencies
+                        # Time the deduplication
+                        dedup_start = time.time()
+                        original_count = len(csv_lines) - 1  # Exclude header
+                        
+                        csv_lines = self._deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold)
+                        
+                        dedup_time = time.time() - dedup_start
+                        final_count = len(csv_lines) - 1  # Exclude header
+                        removed_count = original_count - final_count
+                        
+                        print(f"📑 Deduplication completed in {dedup_time:.1f}s")
+                        print(f"📑   - Original entries: {original_count}")
+                        print(f"📑   - Duplicates removed: {removed_count}")
+                        print(f"📑   - Final entries: {final_count}")
+                        
+                        # Store for summary statistics
+                        self._dedup_time = getattr(self, '_dedup_time', 0) + dedup_time
+                    else:
+                        print(f"📑 Skipping deduplication (frequency check disabled)")
+                    
+                    # Apply filter mode to final results
+                    csv_lines = self._filter_csv_by_mode(csv_lines, filter_mode)
+                    
+                    # Check if we should use token-efficient format
+                    use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1'
+                    
+                    if not use_legacy_format:
+                        # Convert to token-efficient format
+                        csv_lines = self._convert_to_token_efficient_format(csv_lines)
+                    
+                    # Final sanitize to prevent stray headers
+                    csv_lines = self._sanitize_final_glossary_lines(csv_lines, use_legacy_format)
+                    
+                    # Create final CSV content
+                    csv_content = '\n'.join(csv_lines)
+                    
+                    # Save glossary as CSV with proper extension
+                    glossary_path = os.path.join(output_dir, "glossary.csv")
+                    self._atomic_write_file(glossary_path, csv_content)
+                    
+                    print(f"\n📑 ✅ AI-ASSISTED GLOSSARY SAVED!")
+                    print(f"📑 File: {glossary_path}")
+                    c_count, t_count, total = self._count_glossary_entries(csv_lines, use_legacy_format)
+                    print(f"📑 Character entries: {c_count}")
+                    print(f"📑 Term entries: {t_count}")
+                    print(f"📑 Total entries: {total}")
+                    total_time = time.time() - extraction_start
+                    print(f"📑 Total extraction time: {total_time:.1f}s")
+                    return self._parse_csv_to_dict(csv_content)
+                    
+                except UnifiedClientError as e:
+                    if "stopped by user" in str(e).lower():
+                        print(f"📑 ❌ AI extraction interrupted by user")
+                        return {}
+                    else:
+                        print(f"⚠️ AI extraction failed: {e}")
+                        print("📑 Falling back to pattern-based extraction")
+                        return self._extract_with_patterns(all_text, language, min_frequency,
+                                                           max_names, max_titles, 50,
+                                                           existing_glossary, output_dir,
+                                                           strip_honorifics, fuzzy_threshold, filter_mode)
+                except Exception as e:
+                    print(f"⚠️ AI extraction failed: {e}")
+                    import traceback
+                    traceback.print_exc()
+                    print("📑 Falling back to pattern-based extraction")
+                    return self._extract_with_patterns(all_text, language, min_frequency,
+                                                       max_names, max_titles, 50,
+                                                       existing_glossary, output_dir,
+                                                       strip_honorifics, fuzzy_threshold, filter_mode)
+        
+        except Exception as e:
+            print(f"⚠️ Custom prompt processing failed: {e}")
+            import traceback
+            traceback.print_exc()
+            return self._extract_with_patterns(all_text, language, min_frequency,
+                                               max_names, max_titles, 50,
+                                               existing_glossary, output_dir,
+                                               strip_honorifics, fuzzy_threshold, filter_mode)
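
`send_with_interrupt` is used for every long API call in this hunk but is defined elsewhere in the module. Under the assumption that it runs the blocking send in a worker thread and polls the stop flag while waiting, the general pattern looks roughly like this (a sketch only; the real helper's name, signature, and details may differ):

```python
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout

def send_with_polling(send_fn, stop_check_fn, chunk_timeout, poll_interval=0.5):
    """Run a blocking send in a worker thread and poll a stop flag while waiting."""
    deadline = time.time() + chunk_timeout
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(send_fn)
        while True:
            try:
                return future.result(timeout=poll_interval)
            except FutureTimeout:
                # cancel() cannot abort a call that is already running;
                # it only prevents a queued call from starting
                if stop_check_fn():
                    future.cancel()
                    raise RuntimeError("stopped by user")
                if time.time() > deadline:
                    future.cancel()
                    raise TimeoutError(f"no response within {chunk_timeout}s")
```
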
+    def _filter_csv_by_mode(self, csv_lines, filter_mode):
+        """Filter CSV lines based on the filter mode"""
+        if filter_mode == "all":
+            return csv_lines
+        
+        filtered = [csv_lines[0]]  # Keep header
+        
+        for line in csv_lines[1:]:
+            if not line.strip():
+                continue
+            
+            parts = [p.strip() for p in line.split(',')]
+            if len(parts) < 3:
+                continue
+            
+            entry_type = parts[0].lower()
+            raw_name = parts[1]
+            
+            if filter_mode == "only_with_honorifics":
+                # Only keep character entries with honorifics
+                if entry_type == "character" and self._has_honorific(raw_name):
+                    filtered.append(line)
+            elif filter_mode == "only_without_honorifics":
+                # Keep terms and characters without honorifics
+                if entry_type == "term" or (entry_type == "character" and not self._has_honorific(raw_name)):
+                    filtered.append(line)
+        
+        print(f"📑 Filter '{filter_mode}': {len(filtered)-1} entries kept from {len(csv_lines)-1}")
+        return filtered
+    
+    def _process_ai_response(self, response_text, all_text, min_frequency,
+                             strip_honorifics, fuzzy_threshold, language, filter_mode):
+        """Process AI response and return CSV lines"""
+        
+        # Clean response text
+        response_text = response_text.strip()
+        
+        # Remove string representation artifacts if they wrap the entire response
+        if response_text.startswith('("') and response_text.endswith('")'):
+            response_text = response_text[2:-2]
+        elif response_text.startswith('"') and response_text.endswith('"'):
+            response_text = response_text[1:-1]
+        elif response_text.startswith('(') and response_text.endswith(')'):
+            response_text = response_text[1:-1]
+        
+        # Unescape the string
+        response_text = response_text.replace('\\n', '\n')
+        response_text = response_text.replace('\\r', '')
+        response_text = response_text.replace('\\t', '\t')
+        response_text = response_text.replace('\\"', '"')
+        response_text = response_text.replace("\\'", "'")
+        response_text = response_text.replace('\\\\', '\\')
+        
+        # Clean up markdown code blocks if present
+        if '```' in response_text:
+            parts = response_text.split('```')
+            for part in parts:
+                if 'csv' in part[:10].lower():
+                    response_text = part[part.find('\n')+1:]
+                    break
+                elif part.strip() and ('type,raw_name' in part or 'character,' in part or 'term,' in part):
+                    response_text = part
+                    break
+        
+        # Normalize line endings
+        response_text = response_text.replace('\r\n', '\n').replace('\r', '\n')
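
Model replies often arrive wrapped in markdown fences or quoted string literals, which is why `_process_ai_response` spends its first lines unwrapping before it parses a single CSV row. The fence-stripping step in isolation (illustrative helper, not the module's code):

```python
def strip_markdown_fences(text: str) -> str:
    """Return the contents of a ```csv fenced block if present, else the text unchanged."""
    if '```' not in text:
        return text.strip()
    for part in text.split('```'):
        if part[:10].lower().startswith('csv'):
            return part[part.find('\n') + 1:].strip()  # drop the "csv" language-tag line
        if 'type,raw_name' in part:
            return part.strip()
    return text.strip()

assert strip_markdown_fences("```csv\ntype,raw_name,translated_name\n```") == "type,raw_name,translated_name"
```
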
+        lines = [line.strip() for line in response_text.strip().split('\n') if line.strip()]
+        
+        csv_lines = []
+        
+        # Check if we should skip frequency check
+        skip_frequency_check = os.getenv("GLOSSARY_SKIP_FREQUENCY_CHECK", "0") == "1"
+        
+        # Option to completely skip ALL validation for maximum speed
+        skip_all_validation = os.getenv("GLOSSARY_SKIP_ALL_VALIDATION", "0") == "1"
+        
+        if skip_all_validation:
+            print("📑 ⚡ FAST MODE: Skipping all frequency validation (accepting all AI results)")
+            
+            # Always use the enforced 3-column header
+            csv_lines.append("type,raw_name,translated_name")
+            
+            # Process the AI response
+            for line in lines:
+                # Skip header lines
+                if 'type' in line.lower() and 'raw_name' in line.lower():
+                    continue
+                
+                # Parse CSV line
+                parts = [p.strip().strip('"') for p in line.split(',')]
+                
+                if len(parts) >= 3:
+                    # Has all 3 columns
+                    entry_type = parts[0]
+                    raw_name = parts[1]
+                    translated_name = parts[2]
+                    if raw_name and translated_name:
+                        csv_lines.append(f"{entry_type},{raw_name},{translated_name}")
+                elif len(parts) == 2:
+                    # Missing type, default to 'term'
+                    raw_name = parts[0]
+                    translated_name = parts[1]
+                    if raw_name and translated_name:
+                        csv_lines.append(f"term,{raw_name},{translated_name}")
+            
+            print(f"📑 Fast mode: Accepted {len(csv_lines) - 1} entries without validation")
+            return csv_lines
+        
+        # For "only_with_honorifics" mode, ALWAYS skip frequency check
+        if filter_mode == "only_with_honorifics":
+            skip_frequency_check = True
+            print("📑 Filter mode 'only_with_honorifics': Bypassing frequency checks")
+        
+        print(f"📑 Processing {len(lines)} lines from AI response...")
+        print(f"📑 Text corpus size: {len(all_text):,} chars")
+        print(f"📑 Frequency checking: {'DISABLED' if skip_frequency_check else f'ENABLED (min: {min_frequency})'}")
+        print(f"📑 Fuzzy threshold: {fuzzy_threshold}")
+        
+        # Collect all terms first for batch processing
+        all_terms_to_check = []
+        term_info_map = {}  # Map term to its full info
+        
+        if not skip_frequency_check:
+            # First pass: collect all terms that need frequency checking
+            for line in lines:
+                if 'type' in line.lower() and 'raw_name' in line.lower():
+                    continue  # Skip header
+                
+                parts = [p.strip().strip('"') for p in line.split(',')]
+                if len(parts) >= 3:
+                    entry_type = parts[0].lower()
+                    raw_name = parts[1]
+                    translated_name = parts[2]
+                elif len(parts) == 2:
+                    entry_type = 'term'
+                    raw_name = parts[0]
+                    translated_name = parts[1]
+                else:
+                    continue
+                
+                if raw_name and translated_name:
+                    # Store for batch processing
+                    original_raw = raw_name
+                    if strip_honorifics:
+                        raw_name = self._strip_honorific(raw_name, language)
+                    
+                    all_terms_to_check.append(raw_name)
+                    term_info_map[raw_name] = {
+                        'entry_type': entry_type,
+                        'original_raw': original_raw,
+                        'translated_name': translated_name,
+                        'line': line
+                    }
+            
+            # Batch compute all frequencies at once
+            if all_terms_to_check:
+                print(f"📑 Computing frequencies for {len(all_terms_to_check)} terms...")
+                term_frequencies = self._batch_compute_frequencies(
+                    all_terms_to_check, all_text, fuzzy_threshold, min_frequency
+                )
+            else:
+                term_frequencies = {}
+        
+        # Now process the results using pre-computed frequencies
+        entries_accepted = 0
+        # Process based on mode
+        if filter_mode == "only_with_honorifics" or skip_frequency_check:
+            # For these modes, accept all entries
+            csv_lines.append("type,raw_name,translated_name")  # Header
+            for line in lines:
+                if 'type' in line.lower() and 'raw_name' in line.lower():
+                    continue  # Skip header
+                parts = [p.strip().strip('"') for p in line.split(',')]
+                if len(parts) >= 3:
+                    entry_type = parts[0].lower()
+                    raw_name = parts[1]
+                    translated_name = parts[2]
+                elif len(parts) == 2:
+                    entry_type = 'term'
+                    raw_name = parts[0]
+                    translated_name = parts[1]
+                else:
+                    continue
+                
+                if raw_name and translated_name:
+                    csv_line = f"{entry_type},{raw_name},{translated_name}"
+                    csv_lines.append(csv_line)
+                    entries_accepted += 1
+            
+            print(f"📑 Accepted {entries_accepted} entries (frequency check disabled)")
+            
+        else:
+            # Use pre-computed frequencies
+            csv_lines.append("type,raw_name,translated_name")  # Header
+            
+            for term, info in term_info_map.items():
+                count = term_frequencies.get(term, 0)
+                
+                # Also check original form if it was stripped
+                if info['original_raw'] != term:
+                    count += term_frequencies.get(info['original_raw'], 0)
+                
+                if count >= min_frequency:
+                    csv_line = f"{info['entry_type']},{term},{info['translated_name']}"
+                    csv_lines.append(csv_line)
+                    entries_accepted += 1
+                    
+                    # Log first few examples
+                    if entries_accepted <= 5:
+                        print(f"📑 ✓ Example: {term} -> {info['translated_name']} (freq: {count})")
+            
+            print(f"📑 Frequency filtering complete: {entries_accepted}/{len(term_info_map)} terms accepted")
+        
+        # Ensure we have at least the header
+        if len(csv_lines) == 0:
+            csv_lines.append("type,raw_name,translated_name")
+        
+        # Print final summary
+        print(f"📑 Processing complete: {entries_accepted} terms accepted")
+        
+        return csv_lines
+    
+    def _deduplicate_glossary_with_fuzzy(self, csv_lines, fuzzy_threshold):
+        """Apply fuzzy matching to remove duplicate entries from the glossary with stop flag checks"""
+        from difflib import SequenceMatcher
+        
+        print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...")
+        
+        # Check stop flag at start
+        if is_stop_requested():
+            print(f"📑 ❌ Deduplication stopped by user")
+            return csv_lines
+        
+        header_line = csv_lines[0]  # Keep header
+        entry_lines = csv_lines[1:]  # Data lines
+        
+        deduplicated = [header_line]
+        seen_entries = {}  # Use dict for O(1) lookups instead of list
+        seen_names_lower = set()  # Quick exact match check
+        removed_count = 0
+        total_entries = len(entry_lines)
+        
+        # Pre-process all entries for faster comparison
+        print(f"📑 Processing {total_entries} entries for deduplication...")
+        
+        for idx, line in enumerate(entry_lines):
+            # Check stop flag every 100 entries
+            if idx > 0 and idx % 100 == 0:
+                if is_stop_requested():
+                    print(f"📑 ❌ Deduplication stopped at entry {idx}/{total_entries}")
+                    return deduplicated
+            
+            # Show progress for large glossaries
+            if total_entries > 500 and idx % 200 == 0:
+                progress = (idx / total_entries) * 100
+                print(f"📑 Deduplication progress: {progress:.1f}% ({idx}/{total_entries})")
+            
+            if not line.strip():
+                continue
+            
+            parts = [p.strip() for p in line.split(',')]
+            if len(parts) < 3:
+                continue
+            
+            entry_type = parts[0]
+            raw_name = parts[1]
+            translated_name = parts[2]
+            raw_name_lower = raw_name.lower()
+            
+            # Fast exact duplicate check first
+            if raw_name_lower in seen_names_lower:
+                removed_count += 1
+                continue
+            
+            # For fuzzy matching, only check if threshold is less than 1.0
+            is_duplicate = False
+            if fuzzy_threshold < 1.0:
+                # Use a more efficient approach: only check similar length strings
+                name_len = len(raw_name)
+                min_len = int(name_len * 0.7)
+                max_len = int(name_len * 1.3)
+                
+                # Only compare with entries of similar length
+                candidates = []
+                for seen_name, (seen_type, seen_trans) in seen_entries.items():
+                    if min_len <= len(seen_name) <= max_len:
+                        candidates.append(seen_name)
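
Two cheap prefilters guard the expensive `SequenceMatcher` call: a length window collects candidates (above), and a character-overlap check discards most of them before the ratio is computed (below). The full cascade, condensed into an illustrative standalone helper:

```python
from difflib import SequenceMatcher

def is_fuzzy_duplicate(name, seen_names, threshold=0.90):
    """Length window, then character overlap, then SequenceMatcher ratio."""
    lo, hi = int(len(name) * 0.7), int(len(name) * 1.3)
    name_lower = name.lower()
    for candidate in seen_names:
        if not lo <= len(candidate) <= hi:
            continue  # length prefilter
        overlap = len(set(name_lower) & set(candidate.lower()))
        if overlap < len(name_lower) * 0.5:
            continue  # character-overlap prefilter
        if SequenceMatcher(None, name_lower, candidate.lower()).ratio() >= threshold:
            return True
    return False

assert is_fuzzy_duplicate("Kim Sanghyun", ["Kim Sang-hyun"]) is True
assert is_fuzzy_duplicate("Kim Sanghyun", ["Dihirit Ade"]) is False
```
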
+                # Check fuzzy similarity with candidates
+                for seen_name in candidates:
+                    # Quick character overlap check before expensive SequenceMatcher
+                    char_overlap = len(set(raw_name_lower) & set(seen_name.lower()))
+                    if char_overlap < len(raw_name_lower) * 0.5:
+                        continue  # Too different, skip
+                    
+                    raw_similarity = SequenceMatcher(None, raw_name_lower, seen_name.lower()).ratio()
+                    
+                    if raw_similarity >= fuzzy_threshold:
+                        if removed_count < 10:  # Only log first few
+                            print(f"📑 Removing duplicate: '{raw_name}' ~= '{seen_name}' (similarity: {raw_similarity:.2%})")
+                        removed_count += 1
+                        is_duplicate = True
+                        break
+            
+            if not is_duplicate:
+                seen_entries[raw_name] = (entry_type, translated_name)
+                seen_names_lower.add(raw_name_lower)
+                deduplicated.append(line)
+        
+        print(f"📑 ✅ Removed {removed_count} duplicates from glossary")
+        print(f"📑 Final glossary size: {len(deduplicated) - 1} unique entries")
+        
+        return deduplicated
+    
+    def _merge_csv_entries(self, new_csv_lines, existing_glossary, strip_honorifics, language):
+        """Merge CSV entries with existing glossary with stop flag checks"""
+        
+        # Check stop flag at start
+        if is_stop_requested():
+            print(f"📑 ❌ Glossary merge stopped by user")
+            return new_csv_lines
+        
+        # Parse existing glossary
+        existing_lines = []
+        existing_names = set()
+        
+        if isinstance(existing_glossary, str):
+            # Already CSV format
+            lines = existing_glossary.strip().split('\n')
+            total_lines = len(lines)
+            
+            for idx, line in enumerate(lines):
+                # Check stop flag every 50 lines
+                if idx > 0 and idx % 50 == 0:
+                    if is_stop_requested():
+                        print(f"📑 ❌ Merge stopped while processing existing glossary at line {idx}/{total_lines}")
+                        return new_csv_lines
+                
+                if total_lines > 200:
+                    progress = (idx / total_lines) * 100
+                    print(f"📑 Processing existing glossary: {progress:.1f}%")
+                
+                if 'type,raw_name' in line.lower():
+                    continue  # Skip header
+                
+                line_stripped = line.strip()
+                # Skip token-efficient lines and section/bullet markers
+                if not line_stripped or line_stripped.startswith('===') or line_stripped.startswith('*') or line_stripped.lower().startswith('glossary:'):
+                    continue
+                
+                parts = [p.strip() for p in line.split(',')]
+                # Require at least 3 fields (type, raw_name, translated_name)
+                if len(parts) < 3:
+                    continue
+                
+                entry_type = parts[0].strip().lower()
+                # Only accept reasonable type tokens (letters/underscores only)
+                import re as _re
+                if not _re.match(r'^[a-z_]+$', entry_type):
+                    continue
+                
+                raw_name = parts[1]
+                if strip_honorifics:
+                    raw_name = self._strip_honorific(raw_name, language)
+                    parts[1] = raw_name
+                if raw_name not in existing_names:
+                    existing_lines.append(','.join(parts))
+                    existing_names.add(raw_name)
+        
+        # Check stop flag before processing new names
+        if is_stop_requested():
+            print(f"📑 ❌ Merge stopped before processing new entries")
+            return new_csv_lines
+        
+        # Get new names
+        new_names = set()
+        final_lines = []
+        
+        for idx, line in enumerate(new_csv_lines):
+            # Check stop flag every 50 lines
+            if idx > 0 and idx % 50 == 0:
+                if is_stop_requested():
+                    print(f"📑 ❌ Merge stopped while processing new entries at line {idx}")
+                    return final_lines if final_lines else new_csv_lines
+            
+            if 'type,raw_name' in line.lower():
+                final_lines.append(line)  # Keep header
+                continue
+            parts = [p.strip() for p in line.split(',')]
+            if len(parts) >= 2:
+                new_names.add(parts[1])
+            final_lines.append(line)
+        
+        # Check stop flag before adding existing entries
+        if is_stop_requested():
+            print(f"📑 ❌ Merge stopped before combining entries")
+            return 
final_lines + + # Add non-duplicate existing entries + added_count = 0 + for idx, line in enumerate(existing_lines): + # Check stop flag every 50 additions + if idx > 0 and idx % 50 == 0: + if is_stop_requested(): + print(f"📑 ❌ Merge stopped while adding existing entries ({added_count} added)") + return final_lines + + parts = [p.strip() for p in line.split(',')] + if len(parts) >= 2 and parts[1] not in new_names: + final_lines.append(line) + added_count += 1 + + print(f"📑 Merged {added_count} entries from existing glossary") + return final_lines + + def _extract_with_patterns(self, all_text, language, min_frequency, + max_names, max_titles, batch_size, + existing_glossary, output_dir, + strip_honorifics=True, fuzzy_threshold=0.90, filter_mode='all'): + """Extract glossary using pattern matching with true CSV format output and stop flag checks""" + print("📑 Using pattern-based extraction") + + # Check stop flag at start + if is_stop_requested(): + print("📑 ❌ Pattern-based extraction stopped by user") + return {} + + def is_valid_name(name, language_hint='unknown'): + """Strict validation for proper names only""" + if not name or len(name.strip()) < 1: + return False + + name = name.strip() + + if name.lower() in self.pattern_manager.COMMON_WORDS or name in self.pattern_manager.COMMON_WORDS: + return False + + if language_hint == 'korean': + if not (2 <= len(name) <= 4): + return False + if not all(0xAC00 <= ord(char) <= 0xD7AF for char in name): + return False + if len(set(name)) == 1: + return False + + elif language_hint == 'japanese': + if not (2 <= len(name) <= 6): + return False + has_kanji = any(0x4E00 <= ord(char) <= 0x9FFF for char in name) + has_kana = any((0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF) for char in name) + if not (has_kanji or has_kana): + return False + + elif language_hint == 'chinese': + if not (2 <= len(name) <= 4): + return False + if not all(0x4E00 <= ord(char) <= 0x9FFF for char in name): + return False + + elif language_hint == 'english': + if not name[0].isupper(): + return False + if sum(1 for c in name if c.isalpha()) < len(name) * 0.8: + return False + if not (2 <= len(name) <= 20): + return False + + return True + + def detect_language_hint(text_sample): + """Quick language detection for validation purposes""" + sample = text_sample[:1000] + + korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF) + japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF)) + chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF) + latin_chars = sum(1 for char in sample if 0x0041 <= ord(char) <= 0x007A) + + if korean_chars > 50: + return 'korean' + elif japanese_kana > 20: + return 'japanese' + elif chinese_chars > 50 and japanese_kana < 10: + return 'chinese' + elif latin_chars > 100: + return 'english' + else: + return 'unknown' + + language_hint = detect_language_hint(all_text) + print(f"📑 Detected primary language: {language_hint}") + + # Check stop flag after language detection + if is_stop_requested(): + print("📑 ❌ Extraction stopped after language detection") + return {} + + honorifics_to_use = [] + if language_hint in self.pattern_manager.CJK_HONORIFICS: + honorifics_to_use.extend(self.pattern_manager.CJK_HONORIFICS[language_hint]) + honorifics_to_use.extend(self.pattern_manager.CJK_HONORIFICS.get('english', [])) + + print(f"📑 Using {len(honorifics_to_use)} honorifics for {language_hint}") + + names_with_honorifics = {} + standalone_names = {} + + # 
Check if parallel processing is enabled + extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) + + # PARALLEL HONORIFIC PROCESSING + if extraction_workers > 1 and len(honorifics_to_use) > 3: + print(f"📑 Scanning for names with honorifics (parallel with {extraction_workers} workers)...") + + # Create a wrapper function that can be called in parallel + def process_honorific(args): + """Process a single honorific in a worker thread""" + honorific, idx, total = args + + # Check stop flag + if is_stop_requested(): + return None, None + + print(f"📑 Worker processing honorific {idx}/{total}: '{honorific}'") + + # Local dictionaries for this worker + local_names_with = {} + local_standalone = {} + + # Call the extraction method + self._extract_names_for_honorific( + honorific, all_text, language_hint, + min_frequency, local_names_with, + local_standalone, is_valid_name, fuzzy_threshold + ) + + return local_names_with, local_standalone + + # Prepare arguments for parallel processing + honorific_args = [ + (honorific, idx + 1, len(honorifics_to_use)) + for idx, honorific in enumerate(honorifics_to_use) + ] + + # Process honorifics in parallel + with ThreadPoolExecutor(max_workers=min(extraction_workers, len(honorifics_to_use))) as executor: + futures = [] + + for args in honorific_args: + if is_stop_requested(): + executor.shutdown(wait=False) + return {} + + future = executor.submit(process_honorific, args) + futures.append(future) + + # Collect results as they complete + completed = 0 + for future in as_completed(futures): + if is_stop_requested(): + executor.shutdown(wait=False) + return {} + + try: + result = future.result() + if result and result[0] is not None: + local_names_with, local_standalone = result + + # Merge results (thread-safe since we're in main thread) + for name, count in local_names_with.items(): + if name not in names_with_honorifics: + names_with_honorifics[name] = count + else: + names_with_honorifics[name] = max(names_with_honorifics[name], count) + + for name, count in local_standalone.items(): + if name not in standalone_names: + standalone_names[name] = count + else: + standalone_names[name] = max(standalone_names[name], count) + + completed += 1 + if completed % 5 == 0 or completed == len(honorifics_to_use): + print(f"📑 Honorific processing: {completed}/{len(honorifics_to_use)} completed") + + except Exception as e: + print(f"⚠️ Failed to process honorific: {e}") + completed += 1 + + print(f"📑 Parallel honorific processing completed: found {len(names_with_honorifics)} names") + + else: + # SEQUENTIAL PROCESSING (fallback) + print("📑 Scanning for names with honorifics...") + + # Extract names with honorifics + total_honorifics = len(honorifics_to_use) + for idx, honorific in enumerate(honorifics_to_use): + # Check stop flag before each honorific + if is_stop_requested(): + print(f"📑 ❌ Extraction stopped at honorific {idx}/{total_honorifics}") + return {} + + print(f"📑 Processing honorific {idx + 1}/{total_honorifics}: '{honorific}'") + + self._extract_names_for_honorific(honorific, all_text, language_hint, + min_frequency, names_with_honorifics, + standalone_names, is_valid_name, fuzzy_threshold) + + # Check stop flag before processing terms + if is_stop_requested(): + print("📑 ❌ Extraction stopped before processing terms") + return {} + + # Apply filter mode + filtered_names = {} + if filter_mode == 'only_with_honorifics': + # Only keep names that have honorifics (no standalone names) + filtered_names = names_with_honorifics.copy() + print(f"📑 Filter: Keeping 
only names with honorifics ({len(filtered_names)} names)") + elif filter_mode == 'only_without_honorifics': + # Keep standalone names that were NOT found with honorifics + for name, count in standalone_names.items(): + # Check if this name also appears with honorifics + appears_with_honorific = False + for honorific_name in names_with_honorifics.keys(): + if self._strip_honorific(honorific_name, language_hint) == name: + appears_with_honorific = True + break + + # Only add if it doesn't appear with honorifics + if not appears_with_honorific: + filtered_names[name] = count + + print(f"📑 Filter: Keeping only names without honorifics ({len(filtered_names)} names)") + else: # 'all' mode + # Keep all names (both with and without honorifics) + filtered_names = names_with_honorifics.copy() + # Also add standalone names + for name, count in standalone_names.items(): + if name not in filtered_names and not any( + self._strip_honorific(n, language_hint) == name for n in filtered_names.keys() + ): + filtered_names[name] = count + print(f"📑 Filter: Keeping all names ({len(filtered_names)} names)") + + # Process extracted terms + final_terms = {} + + term_count = 0 + total_terms = len(filtered_names) + for term, count in filtered_names.items(): + term_count += 1 + + # Check stop flag every 20 terms + if term_count % 20 == 0: + if is_stop_requested(): + print(f"📑 ❌ Term processing stopped at {term_count}/{total_terms}") + return {} + + if strip_honorifics: + clean_term = self._strip_honorific(term, language_hint) + if clean_term in final_terms: + final_terms[clean_term] = final_terms[clean_term] + count + else: + final_terms[clean_term] = count + else: + final_terms[term] = count + + # Check stop flag before finding titles + if is_stop_requested(): + print("📑 ❌ Extraction stopped before finding titles") + return {} + + # Find titles (but respect filter mode) + print("📑 Scanning for titles...") + found_titles = {} + + # Extract titles for all modes EXCEPT "only_with_honorifics" + # (titles are included in "only_without_honorifics" since titles typically don't have honorifics) + if filter_mode != 'only_with_honorifics': + title_patterns_to_use = [] + if language_hint in self.pattern_manager.TITLE_PATTERNS: + title_patterns_to_use.extend(self.pattern_manager.TITLE_PATTERNS[language_hint]) + title_patterns_to_use.extend(self.pattern_manager.TITLE_PATTERNS.get('english', [])) + + total_patterns = len(title_patterns_to_use) + for pattern_idx, pattern in enumerate(title_patterns_to_use): + # Check stop flag before each pattern + if is_stop_requested(): + print(f"📑 ❌ Title extraction stopped at pattern {pattern_idx}/{total_patterns}") + return {} + + print(f"📑 Processing title pattern {pattern_idx + 1}/{total_patterns}") + + matches = list(re.finditer(pattern, all_text, re.IGNORECASE if 'english' in pattern else 0)) + + for match_idx, match in enumerate(matches): + # Check stop flag every 50 matches + if match_idx > 0 and match_idx % 50 == 0: + if is_stop_requested(): + print(f"📑 ❌ Title extraction stopped at match {match_idx}") + return {} + + title = match.group(0) + + # Skip if this title is already in names + if title in filtered_names or title in names_with_honorifics: + continue + + count = self._find_fuzzy_matches(title, all_text, fuzzy_threshold) + + # Check if stopped during fuzzy matching + if is_stop_requested(): + print(f"📑 ❌ Title extraction stopped during fuzzy matching") + return {} + + if count >= min_frequency: + if re.match(r'[A-Za-z]', title): + title = title.title() + + if strip_honorifics: 
+                        title = self._strip_honorific(title, language_hint)
+                    
+                    if title not in found_titles:
+                        found_titles[title] = count
+            
+            if filter_mode == 'only_without_honorifics':
+                print(f"📑 Found {len(found_titles)} titles (included in 'without honorifics' mode)")
+            else:
+                print(f"📑 Found {len(found_titles)} unique titles")
+        else:
+            print(f"📑 Skipping title extraction (filter mode: only_with_honorifics)")
+        
+        # Check stop flag before sorting and translation
+        if is_stop_requested():
+            print("📑 ❌ Extraction stopped before sorting terms")
+            return {}
+        
+        # Combine and sort
+        sorted_names = sorted(final_terms.items(), key=lambda x: x[1], reverse=True)[:max_names]
+        sorted_titles = sorted(found_titles.items(), key=lambda x: x[1], reverse=True)[:max_titles]
+        
+        all_terms = []
+        for name, count in sorted_names:
+            all_terms.append(name)
+        for title, count in sorted_titles:
+            all_terms.append(title)
+        
+        print(f"📑 Total terms to translate: {len(all_terms)}")
+        
+        # Check stop flag before translation
+        if is_stop_requested():
+            print("📑 ❌ Extraction stopped before translation")
+            return {}
+        
+        # Translate terms
+        if os.getenv("DISABLE_GLOSSARY_TRANSLATION", "0") == "1":
+            print("📑 Translation disabled - keeping original terms")
+            translations = {term: term for term in all_terms}
+        else:
+            print(f"📑 Translating {len(all_terms)} terms...")
+            translations = self._translate_terms_batch(all_terms, language_hint, batch_size, output_dir)
+        
+        # Check if translation was stopped
+        if is_stop_requested():
+            print("📑 ❌ Extraction stopped after translation")
+            return translations  # Return partial results
+        
+        # Build CSV lines
+        csv_lines = ["type,raw_name,translated_name"]
+        
+        for name, _ in sorted_names:
+            if name in translations:
+                csv_lines.append(f"character,{name},{translations[name]}")
+        
+        for title, _ in sorted_titles:
+            if title in translations:
+                csv_lines.append(f"term,{title},{translations[title]}")
+        
+        # Check stop flag before merging
+        if is_stop_requested():
+            print("📑 ❌ Extraction stopped before merging with existing glossary")
+            # Still save what we have (as CSV, matching the normal save path)
+            csv_content = '\n'.join(csv_lines)
+            glossary_path = os.path.join(output_dir, "glossary.csv")
+            self._atomic_write_file(glossary_path, csv_content)
+            return self._parse_csv_to_dict(csv_content)
+        
+        # Merge with existing glossary
+        if existing_glossary:
+            csv_lines = self._merge_csv_entries(csv_lines, existing_glossary, strip_honorifics, language_hint)
+        
+        # Check stop flag before deduplication
+        if is_stop_requested():
+            print("📑 ❌ Extraction stopped before deduplication")
+            csv_content = '\n'.join(csv_lines)
+            glossary_path = os.path.join(output_dir, "glossary.csv")
+            self._atomic_write_file(glossary_path, csv_content)
+            return self._parse_csv_to_dict(csv_content)
+        
+        # Fuzzy matching deduplication
+        csv_lines = self._deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold)
+        
+        # Create CSV content
+        csv_content = '\n'.join(csv_lines)
+        # Save glossary as CSV
+        glossary_path = os.path.join(output_dir, "glossary.csv")
+        self._atomic_write_file(glossary_path, csv_content)
+        
+        print(f"\n📑 ✅ TARGETED GLOSSARY SAVED!")
+        print(f"📑 File: {glossary_path}")
+        print(f"📑 Total entries: {len(csv_lines) - 1}")  # Exclude header
+        
+        return self._parse_csv_to_dict(csv_content)
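
`_translate_terms_batch` below sends each batch as a numbered list and `_parse_translation_response` (further down) maps numbered reply lines back onto the originals by index. The round-trip contract, reduced to a sketch (`parse_numbered_translations` is illustrative; the real parser also tolerates separators like '—' and '='):

```python
import re

def parse_numbered_translations(response_text, originals):
    """Map '1. 김상현 -> Kim Sang-hyun' style reply lines onto the original
    terms by their 1-based index."""
    translations = {}
    for line in response_text.strip().split('\n'):
        m = re.match(r'^(\d+)\.?\s*(.+)', line.strip())
        if not m:
            continue
        idx = int(m.group(1)) - 1
        if not 0 <= idx < len(originals):
            continue
        content = m.group(2).strip()
        for sep in ('->', '→', ':'):
            if sep in content:
                content = content.split(sep, 1)[1].strip()
                break
        if content and content != originals[idx]:
            translations[originals[idx]] = content
    return translations

assert parse_numbered_translations("1. 김상현 -> Kim Sang-hyun", ["김상현"]) == {"김상현": "Kim Sang-hyun"}
```
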
+    def _translate_terms_batch(self, term_list, profile_name, batch_size=50, output_dir=None):
+        """Use fully configurable prompts for translation with interrupt support"""
+        if not term_list or os.getenv("DISABLE_GLOSSARY_TRANSLATION", "0") == "1":
+            print(f"📑 Glossary translation disabled or no terms to translate")
+            return {term: term for term in term_list}
+        
+        # Check stop flag
+        if is_stop_requested():
+            print("📑 ❌ Glossary translation stopped by user")
+            return {term: term for term in term_list}
+        
+        try:
+            MODEL = os.getenv("MODEL", "gemini-1.5-flash")
+            API_KEY = (os.getenv("API_KEY") or
+                       os.getenv("OPENAI_API_KEY") or
+                       os.getenv("OPENAI_OR_Gemini_API_KEY") or
+                       os.getenv("GEMINI_API_KEY"))
+            
+            if is_traditional_translation_api(MODEL):
+                # Traditional translation APIs are handled by a separate code path;
+                # return the terms unchanged so callers always receive a dict
+                return {term: term for term in term_list}
+            
+            if not API_KEY:
+                print(f"📑 No API key found, skipping translation")
+                return {term: term for term in term_list}
+            
+            print(f"📑 Translating {len(term_list)} {profile_name} terms to English using batch size {batch_size}...")
+            
+            from unified_api_client import UnifiedClient, UnifiedClientError
+            client = UnifiedClient(model=MODEL, api_key=API_KEY, output_dir=output_dir)
+            if hasattr(client, 'reset_cleanup_state'):
+                client.reset_cleanup_state()
+            
+            # Get custom translation prompt from environment
+            translation_prompt_template = os.getenv("GLOSSARY_TRANSLATION_PROMPT", "")
+            
+            if not translation_prompt_template:
+                translation_prompt_template = """You are translating {language} character names and important terms to English.
+                For character names, provide English transliterations or keep as romanized.
+                Keep honorifics/suffixes only if they are integral to the name.
+                Respond with the same numbered format.
+
+                Terms to translate:
+                {terms_list}
+
+                Provide translations in the same numbered format."""
+            
+            all_translations = {}
+            chunk_timeout = int(os.getenv("CHUNK_TIMEOUT", "300"))  # 5 minute default
+            
+            for i in range(0, len(term_list), batch_size):
+                # Check stop flag before each batch
+                if is_stop_requested():
+                    print(f"📑 ❌ Translation stopped at batch {(i // batch_size) + 1}")
+                    # Return partial translations
+                    for term in term_list:
+                        if term not in all_translations:
+                            all_translations[term] = term
+                    return all_translations
+                
+                batch = term_list[i:i + batch_size]
+                batch_num = (i // batch_size) + 1
+                total_batches = (len(term_list) + batch_size - 1) // batch_size
+                
+                print(f"📑 Processing batch {batch_num}/{total_batches} ({len(batch)} terms)...")
+                
+                # Format terms list
+                terms_text = ""
+                for idx, term in enumerate(batch, 1):
+                    terms_text += f"{idx}. {term}\n"
+            
+            # Replace placeholders in prompt
+            prompt = translation_prompt_template.replace('{language}', profile_name)
+            prompt = prompt.replace('{terms_list}', terms_text.strip())
+            prompt = prompt.replace('{batch_size}', str(len(batch)))
+            
+            messages = [
+                {"role": "user", "content": prompt}
+            ]
+            
+            try:
+                temperature = float(os.getenv("TEMPERATURE", "0.3"))
+                max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "4096"))
+                
+                # Use send_with_interrupt for interruptible API call
+                print(f"📑 Sending translation request for batch {batch_num} (interruptible)...")
+                
+                response = send_with_interrupt(
+                    messages=messages,
+                    client=client,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                    stop_check_fn=is_stop_requested,
+                    chunk_timeout=chunk_timeout
+                )
+                
+                # Handle response properly
+                if hasattr(response, 'content'):
+                    response_text = response.content
+                else:
+                    response_text = str(response)
+                
+                batch_translations = self._parse_translation_response(response_text, batch)
+                all_translations.update(batch_translations)
+                
+                print(f"📑 Batch {batch_num} completed: {len(batch_translations)} translations")
+                
+                # Small delay between batches to avoid rate limiting (configurable)
+                if i + batch_size < len(term_list):
+                    # Check stop before sleep
+                    if is_stop_requested():
+                        print(f"📑 ❌ Translation stopped after batch {batch_num}")
+                        # Fill in missing translations
+                        for term in term_list:
+                            if term not in all_translations:
+                                all_translations[term] = term
+                        return all_translations
+                    # Use configurable batch delay (default: 0.001s)
+                    batch_delay = float(os.getenv("GLOSSARY_BATCH_DELAY", "0.001"))
+                    if batch_delay > 0:
+                        time.sleep(batch_delay)
+                
+            except UnifiedClientError as e:
+                if "stopped by user" in str(e).lower():
+                    print(f"📑 ❌ Translation interrupted by user at batch {batch_num}")
+                    # Fill in remaining terms with originals
+                    for term in term_list:
+                        if term not in all_translations:
+                            all_translations[term] = term
+                    return all_translations
+                else:
+                    print(f"⚠️ Translation failed for batch {batch_num}: {e}")
+                    for term in batch:
+                        all_translations[term] = term
+            except Exception as e:
+                print(f"⚠️ Translation failed for batch {batch_num}: {e}")
+                for term in batch:
+                    all_translations[term] = term
+        
+        # Ensure all terms have translations
+        for term in term_list:
+            if term not in all_translations:
+                all_translations[term] = term
+        
+        translated_count = sum(1 for term, translation in all_translations.items()
+                               if translation != term and translation.strip())
+        
+        print(f"📑 Successfully translated {translated_count}/{len(term_list)} terms")
+        return all_translations
+        
+        except Exception as e:
+            print(f"⚠️ Glossary translation failed: {e}")
+            return {term: term for term in term_list}
+    
+    
+    def _extract_names_for_honorific(self, honorific, all_text, language_hint,
+                                     min_frequency, names_with_honorifics,
+                                     standalone_names, is_valid_name, fuzzy_threshold=0.90):
+        """Extract names for a specific honorific with fuzzy matching and stop flag checks"""
+        
+        # Check stop flag at start
+        if is_stop_requested():
+            print(f"📑 ❌ Name extraction for '{honorific}' stopped by user")
+            return
+        
+        if language_hint == 'korean' and not honorific.startswith('-'):
+            pattern = r'([\uac00-\ud7af]{2,4})(?=' + re.escape(honorific) + r'(?:\s|[,.\!?]|$))'
+            
+            matches = list(re.finditer(pattern, all_text))
+            total_matches = len(matches)
+            
+            for idx, match in enumerate(matches):
+                # Check stop flag every 50 matches
+                if idx > 0 and idx % 50 == 0:
+                    if is_stop_requested():
+                        print(f"📑 ❌ Korean name extraction stopped at 
{idx}/{total_matches}") + return + + # Show progress for large sets + if total_matches > 500: + progress = (idx / total_matches) * 100 + print(f"📑 Processing Korean names: {progress:.1f}% ({idx}/{total_matches})") + + potential_name = match.group(1) + + if is_valid_name(potential_name, 'korean'): + full_form = potential_name + honorific + + # Use fuzzy matching for counting with stop check + count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold) + + # Check if stopped during fuzzy matching + if is_stop_requested(): + print(f"📑 ❌ Name extraction stopped during fuzzy matching") + return + + if count >= min_frequency: + context_patterns = [ + full_form + r'[은는이가]', + full_form + r'[을를]', + full_form + r'[에게한테]', + r'["]' + full_form, + full_form + r'[,]', + ] + + context_count = 0 + for ctx_pattern in context_patterns: + context_count += len(re.findall(ctx_pattern, all_text)) + + if context_count > 0: + names_with_honorifics[full_form] = count + standalone_names[potential_name] = count + + elif language_hint == 'japanese' and not honorific.startswith('-'): + pattern = r'([\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]{2,5})(?=' + re.escape(honorific) + r'(?:\s|[、。!?]|$))' + + matches = list(re.finditer(pattern, all_text)) + total_matches = len(matches) + + for idx, match in enumerate(matches): + # Check stop flag every 50 matches + if idx > 0 and idx % 50 == 0: + if is_stop_requested(): + print(f"📑 ❌ Japanese name extraction stopped at {idx}/{total_matches}") + return + + if total_matches > 500: + progress = (idx / total_matches) * 100 + print(f"📑 Processing Japanese names: {progress:.1f}% ({idx}/{total_matches})") + + potential_name = match.group(1) + + if is_valid_name(potential_name, 'japanese'): + full_form = potential_name + honorific + count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold) + + if is_stop_requested(): + print(f"📑 ❌ Name extraction stopped during fuzzy matching") + return + + if count >= min_frequency: + names_with_honorifics[full_form] = count + standalone_names[potential_name] = count + + elif language_hint == 'chinese' and not honorific.startswith('-'): + pattern = r'([\u4e00-\u9fff]{2,4})(?=' + re.escape(honorific) + r'(?:\s|[,。!?]|$))' + + matches = list(re.finditer(pattern, all_text)) + total_matches = len(matches) + + for idx, match in enumerate(matches): + # Check stop flag every 50 matches + if idx > 0 and idx % 50 == 0: + if is_stop_requested(): + print(f"📑 ❌ Chinese name extraction stopped at {idx}/{total_matches}") + return + + if total_matches > 500: + progress = (idx / total_matches) * 100 + print(f"📑 Processing Chinese names: {progress:.1f}% ({idx}/{total_matches})") + + potential_name = match.group(1) + + if is_valid_name(potential_name, 'chinese'): + full_form = potential_name + honorific + count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold) + + if is_stop_requested(): + print(f"📑 ❌ Name extraction stopped during fuzzy matching") + return + + if count >= min_frequency: + names_with_honorifics[full_form] = count + standalone_names[potential_name] = count + + elif honorific.startswith('-') or honorific.startswith(' '): + is_space_separated = honorific.startswith(' ') + + if is_space_separated: + pattern_english = r'\b([A-Z][a-zA-Z]+)' + re.escape(honorific) + r'(?=\s|[,.\!?]|$)' + else: + pattern_english = r'\b([A-Z][a-zA-Z]+)' + re.escape(honorific) + r'\b' + + matches = list(re.finditer(pattern_english, all_text)) + total_matches = len(matches) + + for idx, match in enumerate(matches): + # Check stop flag every 
50 matches + if idx > 0 and idx % 50 == 0: + if is_stop_requested(): + print(f"📑 ❌ English name extraction stopped at {idx}/{total_matches}") + return + + if total_matches > 500: + progress = (idx / total_matches) * 100 + print(f"📑 Processing English names: {progress:.1f}% ({idx}/{total_matches})") + + potential_name = match.group(1) + + if is_valid_name(potential_name, 'english'): + full_form = potential_name + honorific + count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold) + + if is_stop_requested(): + print(f"📑 ❌ Name extraction stopped during fuzzy matching") + return + + if count >= min_frequency: + names_with_honorifics[full_form] = count + standalone_names[potential_name] = count + + def _parse_translation_response(self, response, original_terms): + """Parse translation response - handles numbered format""" + translations = {} + + # Handle UnifiedResponse object + if hasattr(response, 'content'): + response_text = response.content + else: + response_text = str(response) + + lines = response_text.strip().split('\n') + + for line in lines: + line = line.strip() + if not line or not line[0].isdigit(): + continue + + try: + number_match = re.match(r'^(\d+)\.?\s*(.+)', line) + if number_match: + num = int(number_match.group(1)) - 1 + content = number_match.group(2).strip() + + if 0 <= num < len(original_terms): + original_term = original_terms[num] + + for separator in ['->', '→', ':', '-', '—', '=']: + if separator in content: + parts = content.split(separator, 1) + if len(parts) == 2: + translation = parts[1].strip() + translation = translation.strip('"\'()[]') + if translation and translation != original_term: + translations[original_term] = translation + break + else: + if content != original_term: + translations[original_term] = content + + except (ValueError, IndexError): + continue + + return translations + +# ===================================================== +# UNIFIED UTILITIES +# ===================================================== +def sanitize_resource_filename(filename): + """Sanitize resource filenames for filesystem compatibility""" + filename = unicodedata.normalize('NFC', filename) + + replacements = { + '/': '_', '\\': '_', ':': '_', '*': '_', + '?': '_', '"': '_', '<': '_', '>': '_', + '|': '_', '\0': '', '\n': '_', '\r': '_' + } + + for old, new in replacements.items(): + filename = filename.replace(old, new) + + filename = ''.join(char for char in filename if ord(char) >= 32) + + name, ext = os.path.splitext(filename) + + if not name: + name = 'resource' + + return name + ext + +def should_retain_source_extension(): + """Read GUI toggle for retaining original extension and no 'response_' prefix. + This is stored in config or env by the GUI; we read env as bridge. 
+ """ + return os.getenv('RETAIN_SOURCE_EXTENSION', os.getenv('retain_source_extension', '0')) in ('1', 'true', 'True') + +def make_safe_filename(title, actual_num): + """Create a safe filename that works across different filesystems""" + if not title: + return f"chapter_{actual_num:03d}" + + title = unicodedata.normalize('NFC', str(title)) + + dangerous_chars = { + '/': '_', '\\': '_', ':': '_', '*': '_', '?': '_', + '"': '_', '<': '_', '>': '_', '|': '_', '\0': '', + '\n': ' ', '\r': ' ', '\t': ' ' + } + + for old, new in dangerous_chars.items(): + title = title.replace(old, new) + + title = ''.join(char for char in title if ord(char) >= 32) + title = re.sub(r'\s+', '_', title) + title = title.strip('_.• \t') + + if not title or title == '_' * len(title): + title = f"chapter_{actual_num:03d}" + + return title + +def get_content_hash(html_content): + """Create a stable hash of content""" + return ContentProcessor.get_content_hash(html_content) + +def clean_ai_artifacts(text, remove_artifacts=True): + """Remove AI response artifacts from text""" + return ContentProcessor.clean_ai_artifacts(text, remove_artifacts) + +def find_glossary_file(output_dir): + """Return path to glossary file preferring CSV over JSON, or None if not found""" + candidates = [ + os.path.join(output_dir, "glossary.csv"), + os.path.join(output_dir, "glossary.json"), + ] + for p in candidates: + if os.path.exists(p): + return p + return None + +def clean_memory_artifacts(text): + """Remove any memory/summary artifacts""" + return ContentProcessor.clean_memory_artifacts(text) + +def emergency_restore_paragraphs(text, original_html=None, verbose=True): + """Emergency restoration when AI returns wall of text""" + return ContentProcessor.emergency_restore_paragraphs(text, original_html, verbose) + +def is_meaningful_text_content(html_content): + """Check if chapter has meaningful text beyond just structure""" + return ContentProcessor.is_meaningful_text_content(html_content) + +# ===================================================== +# GLOBAL SETTINGS AND FLAGS +# ===================================================== +logging.basicConfig(level=logging.DEBUG) + +try: + if hasattr(sys.stdout, 'reconfigure'): + sys.stdout.reconfigure(encoding='utf-8', errors='ignore') +except AttributeError: + if sys.stdout is None: + devnull = open(os.devnull, "wb") + sys.stdout = io.TextIOWrapper(devnull, encoding='utf-8', errors='ignore') + elif hasattr(sys.stdout, 'buffer'): + try: + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore') + except: + pass + +_stop_requested = False + +def set_stop_flag(value): + """Set the global stop flag""" + global _stop_requested + _stop_requested = value + +def is_stop_requested(): + """Check if stop was requested""" + global _stop_requested + return _stop_requested + +def set_output_redirect(log_callback=None): + """Redirect print statements to a callback function for GUI integration""" + if log_callback: + class CallbackWriter: + def __init__(self, callback): + self.callback = callback + + def write(self, text): + if text.strip(): + self.callback(text.strip()) + + def flush(self): + pass + + sys.stdout = CallbackWriter(log_callback) + +# ===================================================== +# EPUB AND FILE PROCESSING +# ===================================================== +def extract_chapter_number_from_filename(filename, opf_spine_position=None, opf_spine_data=None): + """Extract chapter number from filename, prioritizing OPF spine order""" + + # Priority 1: Use OPF 
spine position if available + if opf_spine_position is not None: + # Handle special non-chapter files (always chapter 0) + filename_lower = filename.lower() + name_without_ext = os.path.splitext(filename)[0].lower() + + # Check for special keywords OR no numbers present + special_keywords = ['title', 'toc', 'cover', 'index', 'copyright', 'preface', 'nav'] + has_special_keyword = any(name in filename_lower for name in special_keywords) + has_no_numbers = not re.search(r'\d', name_without_ext) + + if has_special_keyword or has_no_numbers: + return 0, 'opf_special_file' + + # Use spine position for regular chapters (0, 1, 2, 3...) + return opf_spine_position, 'opf_spine_order' + + # Priority 2: Check if this looks like a special file (even without OPF) + name_without_ext = os.path.splitext(filename)[0].lower() + special_keywords = ['title', 'toc', 'cover', 'index', 'copyright', 'preface'] + has_special_keyword = any(name in name_without_ext for name in special_keywords) + has_no_numbers = not re.search(r'\d', name_without_ext) + + if has_special_keyword or has_no_numbers: + return 0, 'special_file' + + # Priority 3: Try to extract sequential numbers (000, 001, 002...) + name_without_ext = os.path.splitext(filename)[0] + + # Look for simple sequential patterns first + # Priority 3: Try to extract sequential numbers and decimals + sequential_patterns = [ + (r'^(\d+)\.(\d+)$', 'decimal_number'), # 1.5, 2.3 (NEW!) + (r'^(\d{3,4})$', 'sequential_number'), # 000, 001, 0001 + (r'^(\d+)$', 'direct_number'), # 0, 1, 2 + ] + + for pattern, method in sequential_patterns: + match = re.search(pattern, name_without_ext) + if match: + if method == 'decimal_number': + # Return as float for decimal chapters + return float(f"{match.group(1)}.{match.group(2)}"), method + else: + return int(match.group(1)), method + + # Priority 4: Fall back to existing filename parsing patterns + fallback_patterns = [ + (r'^response_(\d+)[_\.]', 'response_prefix'), + (r'[Cc]hapter[_\s]*(\d+)', 'chapter_word'), + (r'[Cc]h[_\s]*(\d+)', 'ch_abbreviation'), + (r'No(\d+)', 'no_prefix'), + (r'第(\d+)[章话回]', 'chinese_chapter'), + (r'-h-(\d+)', 'h_suffix'), # For your -h-16 pattern + (r'_(\d+)', 'underscore_suffix'), + (r'-(\d+)', 'dash_suffix'), + (r'(\d+)', 'trailing_number'), + ] + + for pattern, method in fallback_patterns: + match = re.search(pattern, name_without_ext, re.IGNORECASE) + if match: + return int(match.group(1)), method + + return None, None + +def process_chapter_images(chapter_html: str, actual_num: int, image_translator: ImageTranslator, + check_stop_fn=None) -> Tuple[str, Dict[str, str]]: + """Process and translate images in a chapter""" + from bs4 import BeautifulSoup + images = image_translator.extract_images_from_chapter(chapter_html) + + if not images: + return chapter_html, {} + + print(f"🖼️ Found {len(images)} images in chapter {actual_num}") + + soup = BeautifulSoup(chapter_html, 'html.parser') + + image_translations = {} + translated_count = 0 + + max_images_per_chapter = int(os.getenv('MAX_IMAGES_PER_CHAPTER', '10')) + if len(images) > max_images_per_chapter: + print(f" ⚠️ Chapter has {len(images)} images - processing first {max_images_per_chapter} only") + images = images[:max_images_per_chapter] + + for idx, img_info in enumerate(images, 1): + if check_stop_fn and check_stop_fn(): + print("❌ Image translation stopped by user") + break + + img_src = img_info['src'] + + if img_src.startswith('../'): + img_path = os.path.join(image_translator.output_dir, img_src[3:]) + elif img_src.startswith('./'): + 
img_path = os.path.join(image_translator.output_dir, img_src[2:])
+        elif img_src.startswith('/'):
+            img_path = os.path.join(image_translator.output_dir, img_src[1:])
+        else:
+            possible_paths = [
+                os.path.join(image_translator.images_dir, os.path.basename(img_src)),
+                os.path.join(image_translator.output_dir, img_src),
+                os.path.join(image_translator.output_dir, 'images', os.path.basename(img_src)),
+                os.path.join(image_translator.output_dir, os.path.basename(img_src)),
+                os.path.join(image_translator.output_dir, os.path.dirname(img_src), os.path.basename(img_src))
+            ]
+            
+            img_path = None
+            for path in possible_paths:
+                if os.path.exists(path):
+                    img_path = path
+                    print(f"   ✅ Found image at: {path}")
+                    break
+            
+            if not img_path:
+                print(f"   ❌ Image not found in any location for: {img_src}")
+                print(f"   Tried: {possible_paths}")
+                continue
+        
+        img_path = os.path.normpath(img_path)
+        
+        if not os.path.exists(img_path):
+            print(f"   ⚠️ Image not found: {img_path}")
+            print(f"   📁 Images directory: {image_translator.images_dir}")
+            print(f"   📁 Output directory: {image_translator.output_dir}")
+            print(f"   📁 Working directory: {os.getcwd()}")
+            
+            if os.path.exists(image_translator.images_dir):
+                files = os.listdir(image_translator.images_dir)
+                print(f"   📁 Files in images dir: {files[:5]}...")
+            continue
+        
+        print(f"   🔍 Processing image {idx}/{len(images)}: {os.path.basename(img_path)}")
+        
+        context = ""
+        if img_info.get('alt'):
+            context += f", Alt text: {img_info['alt']}"
+        
+        if translated_count > 0:
+            delay = float(os.getenv('IMAGE_API_DELAY', '1.0'))
+            time.sleep(delay)
+        
+        translation_result = image_translator.translate_image(img_path, context, check_stop_fn)
+        
+        print(f"\n🔍 DEBUG: Image {idx}/{len(images)}")
+        print(f"   Translation result: {'Success' if translation_result and '[Image Translation Error:' not in translation_result else 'Failed'}")
+        if translation_result and "[Image Translation Error:" in translation_result:
+            print(f"   Error message: {translation_result}")
+        
+        if translation_result:
+            img_tag = None
+            for img in soup.find_all('img'):
+                if img.get('src') == img_src:
+                    img_tag = img
+                    break
+            
+            if img_tag:
+                hide_label = os.getenv("HIDE_IMAGE_TRANSLATION_LABEL", "0") == "1"
+                
+                print(f"   🔍 DEBUG: Integration Phase")
+                print(f"   🏷️ Hide label mode: {hide_label}")
+                print(f"   📍 Found img tag: {img_tag.get('src')}")
+                
+                # Store the translation result in the dictionary FIRST
+                image_translations[img_path] = translation_result
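
The fallback chain above tries several conventional locations before giving up on an image. Condensed into an illustrative helper (not part of the module), it is essentially first-match-wins over a candidate list:

```python
import os

def resolve_image_path(src, images_dir, output_dir):
    """Return the first existing candidate location for an image, else None."""
    base = os.path.basename(src)
    candidates = [
        os.path.join(images_dir, base),            # canonical images dir
        os.path.join(output_dir, src),             # path as written in the HTML
        os.path.join(output_dir, 'images', base),  # conventional images/ subdir
        os.path.join(output_dir, base),            # flat layout
    ]
    for path in candidates:
        if os.path.exists(path):
            return os.path.normpath(path)
    return None
```
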
<div' in translation_result:
+                trans_soup = BeautifulSoup(translation_result, 'html.parser')
+
+                # Try to get the full container first
+                full_container = trans_soup.find('div', class_=['translated-text-only', 'image-with-translation'])
+
+                if full_container:
+                    # Clone the container to avoid issues
+                    new_container = BeautifulSoup(str(full_container), 'html.parser').find('div')
+                    img_tag.replace_with(new_container)
+                    print(f"      ✅ Replaced image with full translation container")
+                else:
+                    # Fallback: manually build the structure
+                    trans_div = trans_soup.find('div', class_='image-translation')
+                    if trans_div:
+                        container = soup.new_tag('div', **{'class': 'translated-text-only' if hide_label else 'image-with-translation'})
+                        img_tag.replace_with(container)
+
+                        if not hide_label:
+                            new_img = soup.new_tag('img', src=img_src)
+                            if img_info.get('alt'):
+                                new_img['alt'] = img_info.get('alt')
+                            container.append(new_img)
+
+                        # Clone the translation div content
+                        new_trans_div = soup.new_tag('div', **{'class': 'image-translation'})
+                        # Copy all children from trans_div to new_trans_div
+                        for child in trans_div.children:
+                            if hasattr(child, 'name'):
+                                new_trans_div.append(BeautifulSoup(str(child), 'html.parser'))
+                            else:
+                                new_trans_div.append(str(child))
+
+                        container.append(new_trans_div)
+                        print(f"      ✅ Built container with translation div")
+                    else:
+                        print(f"      ⚠️ No translation div found in result")
+                        continue
+            else:
+                # Plain text translation - build structure manually
+                container = soup.new_tag('div', **{'class': 'translated-text-only' if hide_label else 'image-with-translation'})
+                img_tag.replace_with(container)
+
+                if not hide_label:
+                    new_img = soup.new_tag('img', src=img_src)
+                    if img_info.get('alt'):
+                        new_img['alt'] = img_info.get('alt')
+                    container.append(new_img)
+
+                # Create translation div with content
+                translation_div = soup.new_tag('div', **{'class': 'image-translation'})
+                if not hide_label:
+                    label_p = soup.new_tag('p')
+                    label_em = soup.new_tag('em')
+                    #label_em.string = "[Image text translation:]"
+                    label_p.append(label_em)
+                    translation_div.append(label_p)
+
+                trans_p = soup.new_tag('p')
+                trans_p.string = translation_result
+                translation_div.append(trans_p)
+                container.append(translation_div)
+                print(f"      ✅ Created plain text translation structure")
+
+            translated_count += 1
+
+            # Save to translated_images folder
+            trans_filename = f"ch{actual_num:03d}_img{idx:02d}_translation.html"
+            trans_filepath = os.path.join(image_translator.translated_images_dir, trans_filename)
+
+            # Extract just the translation content for saving
+            save_soup = BeautifulSoup(translation_result, 'html.parser')
+            save_div = save_soup.find('div', class_='image-translation')
+            if not save_div:
+                # Create a simple div for plain text
+                save_div = f'<div class="image-translation"><p>{translation_result}</p></div>'
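+            # Each translated image is also written out as a standalone HTML file in
+            # translated_images_dir (e.g. ch003_img02_translation.html) for review.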
+            with open(trans_filepath, 'w', encoding='utf-8') as f:
+                f.write(f"""<html>
+<head>
+<title>Chapter {actual_num} - Image {idx} Translation</title>
+</head>
+<body>
+<h2>Chapter {actual_num} - Image {idx}</h2>
+<hr/>
+<p>Original: {os.path.basename(img_path)}</p>
+<hr/>
+{save_div}
+</body>
+</html>""")
+
+            print(f"      ✅ Saved translation to: {trans_filename}")
+        else:
+            print(f"      ⚠️ Could not find image tag in HTML for: {img_src}")
+
+    if translated_count > 0:
+        print(f"   🖼️ Successfully translated {translated_count} images")
+
+        # Debug output
+        final_html = str(soup)
+        trans_count = final_html.count('image-translation
') + print(f" 📊 Final HTML has {trans_count} translation divs") + print(f" 📊 image_translations dict has {len(image_translations)} entries") + + prog = image_translator.load_progress() + if "image_chunks" in prog: + completed_images = [] + for img_key, img_data in prog["image_chunks"].items(): + if len(img_data["completed"]) == img_data["total"]: + completed_images.append(img_key) + + for img_key in completed_images: + del prog["image_chunks"][img_key] + + if completed_images: + image_translator.save_progress(prog) + print(f" 🧹 Cleaned up progress for {len(completed_images)} completed images") + + image_translator.save_translation_log(actual_num, image_translations) + + return str(soup), image_translations + else: + print(f" ℹ️ No images were successfully translated") + + return chapter_html, {} + +def detect_novel_numbering(chapters): + """Detect if the novel uses 0-based or 1-based chapter numbering with improved accuracy""" + print("[DEBUG] Detecting novel numbering system...") + + if not chapters: + return False + + if isinstance(chapters[0], str): + print("[DEBUG] Text file detected, skipping numbering detection") + return False + + patterns = PatternManager.FILENAME_EXTRACT_PATTERNS + + # Special check for prefix_suffix pattern like "0000_1.xhtml" + prefix_suffix_pattern = r'^(\d+)_(\d+)[_\.]' + + # Track chapter numbers from different sources + filename_numbers = [] + content_numbers = [] + has_prefix_suffix = False + prefix_suffix_numbers = [] + + for idx, chapter in enumerate(chapters): + extracted_num = None + + # Check filename patterns + if 'original_basename' in chapter and chapter['original_basename']: + filename = chapter['original_basename'] + elif 'filename' in chapter: + filename = os.path.basename(chapter['filename']) + else: + continue + + # First check for prefix_suffix pattern + prefix_match = re.search(prefix_suffix_pattern, filename, re.IGNORECASE) + if prefix_match: + has_prefix_suffix = True + # Use the SECOND number (after underscore) + suffix_num = int(prefix_match.group(2)) + prefix_suffix_numbers.append(suffix_num) + extracted_num = suffix_num + print(f"[DEBUG] Prefix_suffix pattern matched: {filename} -> Chapter {suffix_num}") + else: + # Try other patterns + for pattern in patterns: + match = re.search(pattern, filename) + if match: + extracted_num = int(match.group(1)) + #print(f"[DEBUG] Pattern '{pattern}' matched: {filename} -> Chapter {extracted_num}") + break + + if extracted_num is not None: + filename_numbers.append(extracted_num) + + # Also check chapter content for chapter declarations + if 'body' in chapter: + # Look for "Chapter N" in the first 1000 characters + content_preview = chapter['body'][:1000] + content_match = re.search(r'Chapter\s+(\d+)', content_preview, re.IGNORECASE) + if content_match: + content_num = int(content_match.group(1)) + content_numbers.append(content_num) + print(f"[DEBUG] Found 'Chapter {content_num}' in content") + + # Decision logic with improved heuristics + + # 1. If using prefix_suffix pattern, trust those numbers exclusively + if has_prefix_suffix and prefix_suffix_numbers: + min_suffix = min(prefix_suffix_numbers) + if min_suffix >= 1: + print(f"[DEBUG] ✅ 1-based novel detected (prefix_suffix pattern starts at {min_suffix})") + return False + else: + print(f"[DEBUG] ✅ 0-based novel detected (prefix_suffix pattern starts at {min_suffix})") + return True + + # 2. 
If we have content numbers, prefer those over filename numbers + if content_numbers: + min_content = min(content_numbers) + # Check if we have a good sequence starting from 0 or 1 + if 0 in content_numbers and 1 in content_numbers: + print(f"[DEBUG] ✅ 0-based novel detected (found both Chapter 0 and Chapter 1 in content)") + return True + elif min_content == 1: + print(f"[DEBUG] ✅ 1-based novel detected (content chapters start at 1)") + return False + + # 3. Fall back to filename numbers + if filename_numbers: + min_filename = min(filename_numbers) + max_filename = max(filename_numbers) + + # Check for a proper sequence + # If we have 0,1,2,3... it's likely 0-based + # If we have 1,2,3,4... it's likely 1-based + + # Count how many chapters we have in sequence starting from 0 + zero_sequence_count = 0 + for i in range(len(chapters)): + if i in filename_numbers: + zero_sequence_count += 1 + else: + break + + # Count how many chapters we have in sequence starting from 1 + one_sequence_count = 0 + for i in range(1, len(chapters) + 1): + if i in filename_numbers: + one_sequence_count += 1 + else: + break + + print(f"[DEBUG] Zero-based sequence length: {zero_sequence_count}") + print(f"[DEBUG] One-based sequence length: {one_sequence_count}") + + # If we have a better sequence starting from 1, it's 1-based + if one_sequence_count > zero_sequence_count and min_filename >= 1: + print(f"[DEBUG] ✅ 1-based novel detected (better sequence match starting from 1)") + return False + + # If we have any 0 in filenames and it's part of a sequence + if 0 in filename_numbers and zero_sequence_count >= 3: + print(f"[DEBUG] ✅ 0-based novel detected (found 0 in sequence)") + return True + + # 4. Default to 1-based if uncertain + print(f"[DEBUG] ✅ Defaulting to 1-based novel (insufficient evidence for 0-based)") + return False + +def validate_chapter_continuity(chapters): + """Validate chapter continuity and warn about issues""" + if not chapters: + print("No chapters to translate") + return + + issues = [] + + # Get all chapter numbers + chapter_nums = [c['num'] for c in chapters] + actual_nums = [c.get('actual_chapter_num', c['num']) for c in chapters] + + # Check for duplicates + duplicates = [num for num in chapter_nums if chapter_nums.count(num) > 1] + if duplicates: + issues.append(f"Duplicate chapter numbers found: {set(duplicates)}") + + # Check for gaps in sequence + min_num = min(chapter_nums) + max_num = max(chapter_nums) + expected = set(range(min_num, max_num + 1)) + actual = set(chapter_nums) + missing = expected - actual + + if missing: + issues.append(f"Missing chapter numbers: {sorted(missing)}") + # Show gaps more clearly + gaps = [] + sorted_missing = sorted(missing) + if sorted_missing: + start = sorted_missing[0] + end = sorted_missing[0] + for num in sorted_missing[1:]: + if num == end + 1: + end = num + else: + gaps.append(f"{start}-{end}" if start != end else str(start)) + start = end = num + gaps.append(f"{start}-{end}" if start != end else str(start)) + issues.append(f"Gap ranges: {', '.join(gaps)}") + + # Check for duplicate titles + title_map = {} + for c in chapters: + title_lower = c['title'].lower().strip() + if title_lower in title_map: + title_map[title_lower].append(c['num']) + else: + title_map[title_lower] = [c['num']] + + for title, nums in title_map.items(): + if len(nums) > 1: + issues.append(f"Duplicate title '{title}' in chapters: {nums}") + + # Print summary + print("\n" + "="*60) + print("📚 CHAPTER VALIDATION SUMMARY") + print("="*60) + print(f"Total chapters: 
{len(chapters)}") + print(f"Chapter range: {min_num} to {max_num}") + print(f"Expected count: {max_num - min_num + 1}") + print(f"Actual count: {len(chapters)}") + + if len(chapters) != (max_num - min_num + 1): + print(f"⚠️ Chapter count mismatch - missing {(max_num - min_num + 1) - len(chapters)} chapters") + + if issues: + print("\n⚠️ Issues found:") + for issue in issues: + print(f" - {issue}") + else: + print("✅ No continuity issues detected") + + print("="*60 + "\n") + +def validate_epub_structure(output_dir): + """Validate that all necessary EPUB structure files are present""" + print("🔍 Validating EPUB structure...") + + required_files = { + 'container.xml': 'META-INF container file (critical)', + '*.opf': 'OPF package file (critical)', + '*.ncx': 'Navigation file (recommended)' + } + + found_files = {} + missing_files = [] + + container_path = os.path.join(output_dir, 'container.xml') + if os.path.exists(container_path): + found_files['container.xml'] = 'Found' + print(" ✅ container.xml - Found") + else: + missing_files.append('container.xml') + print(" ❌ container.xml - Missing (CRITICAL)") + + opf_files = [] + ncx_files = [] + + for file in os.listdir(output_dir): + if file.lower().endswith('.opf'): + opf_files.append(file) + elif file.lower().endswith('.ncx'): + ncx_files.append(file) + + if opf_files: + found_files['opf'] = opf_files + print(f" ✅ OPF file(s) - Found: {', '.join(opf_files)}") + else: + missing_files.append('*.opf') + print(" ❌ OPF file - Missing (CRITICAL)") + + if ncx_files: + found_files['ncx'] = ncx_files + print(f" ✅ NCX file(s) - Found: {', '.join(ncx_files)}") + else: + missing_files.append('*.ncx') + print(" ⚠️ NCX file - Missing (navigation may not work)") + + html_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.html') and f.startswith('response_')] + if html_files: + print(f" ✅ Translated chapters - Found: {len(html_files)} files") + else: + print(" ⚠️ No translated chapter files found") + + critical_missing = [f for f in missing_files if f in ['container.xml', '*.opf']] + + if not critical_missing: + print("✅ EPUB structure validation PASSED") + print(" All critical files present for EPUB reconstruction") + return True + else: + print("❌ EPUB structure validation FAILED") + print(f" Missing critical files: {', '.join(critical_missing)}") + print(" EPUB reconstruction may fail without these files") + return False + +def check_epub_readiness(output_dir): + """Check if the output directory is ready for EPUB compilation""" + print("📋 Checking EPUB compilation readiness...") + + issues = [] + + if not validate_epub_structure(output_dir): + issues.append("Missing critical EPUB structure files") + + html_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.html') and f.startswith('response_')] + if not html_files: + issues.append("No translated chapter files found") + else: + print(f" ✅ Found {len(html_files)} translated chapters") + + metadata_path = os.path.join(output_dir, 'metadata.json') + if os.path.exists(metadata_path): + print(" ✅ Metadata file present") + try: + with open(metadata_path, 'r', encoding='utf-8') as f: + metadata = json.load(f) + if 'title' not in metadata: + issues.append("Metadata missing title") + except Exception as e: + issues.append(f"Metadata file corrupted: {e}") + else: + issues.append("Missing metadata.json file") + + resource_dirs = ['css', 'fonts', 'images'] + found_resources = 0 + for res_dir in resource_dirs: + res_path = os.path.join(output_dir, res_dir) + if os.path.exists(res_path): + 
files = [f for f in os.listdir(res_path) if os.path.isfile(os.path.join(res_path, f))] + if files: + found_resources += len(files) + print(f" ✅ Found {len(files)} {res_dir} files") + + if found_resources > 0: + print(f" ✅ Total resources: {found_resources} files") + else: + print(" ⚠️ No resource files found (this may be normal)") + + if not issues: + print("🎉 EPUB compilation readiness: READY") + print(" All necessary files present for EPUB creation") + return True + else: + print("⚠️ EPUB compilation readiness: ISSUES FOUND") + for issue in issues: + print(f" • {issue}") + return False + +def cleanup_previous_extraction(output_dir): + """Clean up any files from previous extraction runs (preserves CSS files)""" + # Remove 'css' from cleanup_items to preserve CSS files + cleanup_items = [ + 'images', # Removed 'css' from this list + '.resources_extracted' + ] + + epub_structure_files = [ + 'container.xml', 'content.opf', 'toc.ncx' + ] + + cleaned_count = 0 + + # Clean up directories (except CSS) + for item in cleanup_items: + if item.startswith('.'): + continue + item_path = os.path.join(output_dir, item) + try: + if os.path.isdir(item_path): + shutil.rmtree(item_path) + print(f"🧹 Removed directory: {item}") + cleaned_count += 1 + except Exception as e: + print(f"⚠️ Could not remove directory {item}: {e}") + + # Clean up EPUB structure files + for epub_file in epub_structure_files: + file_path = os.path.join(output_dir, epub_file) + try: + if os.path.isfile(file_path): + os.remove(file_path) + print(f"🧹 Removed EPUB file: {epub_file}") + cleaned_count += 1 + except Exception as e: + print(f"⚠️ Could not remove {epub_file}: {e}") + + # Clean up any loose .opf and .ncx files + try: + for file in os.listdir(output_dir): + if file.lower().endswith(('.opf', '.ncx')): + file_path = os.path.join(output_dir, file) + if os.path.isfile(file_path): + os.remove(file_path) + print(f"🧹 Removed EPUB file: {file}") + cleaned_count += 1 + except Exception as e: + print(f"⚠️ Error scanning for EPUB files: {e}") + + # Remove extraction marker + marker_path = os.path.join(output_dir, '.resources_extracted') + try: + if os.path.isfile(marker_path): + os.remove(marker_path) + print(f"🧹 Removed extraction marker") + cleaned_count += 1 + except Exception as e: + print(f"⚠️ Could not remove extraction marker: {e}") + + # Check if CSS files exist and inform user they're being preserved + css_path = os.path.join(output_dir, 'css') + if os.path.exists(css_path): + try: + css_files = [f for f in os.listdir(css_path) if os.path.isfile(os.path.join(css_path, f))] + if css_files: + print(f"📚 Preserving {len(css_files)} CSS files") + except Exception: + pass + + if cleaned_count > 0: + print(f"🧹 Cleaned up {cleaned_count} items from previous runs (CSS files preserved)") + + return cleaned_count + +# ===================================================== +# API AND TRANSLATION UTILITIES +# ===================================================== +def send_with_interrupt(messages, client, temperature, max_tokens, stop_check_fn, chunk_timeout=None, request_id=None, context=None): + """Send API request with interrupt capability and optional timeout retry. + Optional context parameter is passed through to the client to improve payload labeling. 
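+    Usage sketch (illustrative; assumes a configured UnifiedClient and stop flag):
+        content, finish_reason = send_with_interrupt(
+            messages, client, temperature=0.3, max_tokens=8192,
+            stop_check_fn=lambda: False, chunk_timeout=900)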
+ """ + # Import UnifiedClientError at function level to avoid scoping issues + from unified_api_client import UnifiedClientError + + # The client.send() call will handle multi-key rotation automatically + + # Generate request_id if not provided + #if request_id is None: + # request_id = str(uuid.uuid4())[:8] + + result_queue = queue.Queue() + + def api_call(): + try: + start_time = time.time() + + # Check if client.send accepts request_id parameter + send_params = { + 'messages': messages, + 'temperature': temperature, + 'max_tokens': max_tokens + } + # Add context if supported + sig = inspect.signature(client.send) + if 'context' in sig.parameters and context is not None: + send_params['context'] = context + + # Add request_id if the client supports it + sig = inspect.signature(client.send) + #if 'request_id' in sig.parameters: + # send_params['request_id'] = request_id + + result = client.send(**send_params) + elapsed = time.time() - start_time + result_queue.put((result, elapsed)) + except Exception as e: + result_queue.put(e) + + api_thread = threading.Thread(target=api_call) + api_thread.daemon = True + api_thread.start() + + timeout = chunk_timeout if chunk_timeout is not None else 86400 + check_interval = 0.5 + elapsed = 0 + + while elapsed < timeout: + try: + result = result_queue.get(timeout=check_interval) + if isinstance(result, Exception): + # For expected errors like rate limits, preserve the error type without extra traceback + if hasattr(result, 'error_type') and result.error_type == "rate_limit": + raise result + elif "429" in str(result) or "rate limit" in str(result).lower(): + # Convert generic exceptions to UnifiedClientError for rate limits + raise UnifiedClientError(str(result), error_type="rate_limit") + else: + raise result + if isinstance(result, tuple): + api_result, api_time = result + if chunk_timeout and api_time > chunk_timeout: + # Set cleanup flag when chunk timeout occurs + if hasattr(client, '_in_cleanup'): + client._in_cleanup = True + if hasattr(client, 'cancel_current_operation'): + client.cancel_current_operation() + raise UnifiedClientError(f"API call took {api_time:.1f}s (timeout: {chunk_timeout}s)") + return api_result + return result + except queue.Empty: + if stop_check_fn(): + # Set cleanup flag when user stops + if hasattr(client, '_in_cleanup'): + client._in_cleanup = True + if hasattr(client, 'cancel_current_operation'): + client.cancel_current_operation() + raise UnifiedClientError("Translation stopped by user") + elapsed += check_interval + + # Set cleanup flag when timeout occurs + if hasattr(client, '_in_cleanup'): + client._in_cleanup = True + if hasattr(client, 'cancel_current_operation'): + client.cancel_current_operation() + raise UnifiedClientError(f"API call timed out after {timeout} seconds") + +def handle_api_error(processor, error, chunk_info=""): + """Handle API errors with multi-key support""" + error_str = str(error) + + # Check for rate limit + if "429" in error_str or "rate limit" in error_str.lower(): + if processor.config.use_multi_api_keys: + print(f"⚠️ Rate limit hit {chunk_info}, client should rotate to next key") + stats = processor.client.get_stats() + print(f"📊 API Stats - Active keys: {stats.get('active_keys', 0)}/{stats.get('total_keys', 0)}") + + if stats.get('active_keys', 0) == 0: + print("⏳ All API keys are cooling down - will wait and retry") + print(f"🔄 Multi-key error handling: Rate limit processed, preparing for key rotation...") + time.sleep(0.1) # Brief pause after rate limit detection for stability + 
return True # Always retry + else: + print(f"⚠️ Rate limit hit {chunk_info}, waiting before retry...") + time.sleep(60) + print(f"🔄 Single-key error handling: Rate limit wait completed, ready for retry...") + time.sleep(0.1) # Brief pause after rate limit wait for stability + return True # Always retry + + # Other errors + print(f"❌ API Error {chunk_info}: {error_str}") + return False + +def parse_token_limit(env_value): + """Parse token limit from environment variable""" + if not env_value or env_value.strip() == "": + return None, "unlimited" + + env_value = env_value.strip() + if env_value.lower() == "unlimited": + return None, "unlimited" + + if env_value.isdigit() and int(env_value) > 0: + limit = int(env_value) + return limit, str(limit) + + return 1000000, "1000000 (default)" + +def build_system_prompt(user_prompt, glossary_path=None): + """Build the system prompt with glossary - TRUE BRUTE FORCE VERSION""" + append_glossary = os.getenv("APPEND_GLOSSARY", "1") == "1" + actual_glossary_path = glossary_path + + + system = user_prompt if user_prompt else "" + + if append_glossary and actual_glossary_path and os.path.exists(actual_glossary_path): + try: + print(f"[DEBUG] ✅ Loading glossary from: {os.path.abspath(actual_glossary_path)}") + + # Try to load as JSON first + try: + with open(actual_glossary_path, "r", encoding="utf-8") as gf: + glossary_data = json.load(gf) + glossary_text = json.dumps(glossary_data, ensure_ascii=False, indent=2) + print(f"[DEBUG] Loaded as JSON") + except json.JSONDecodeError: + # If JSON fails, just read as raw text + #print(f"[DEBUG] JSON parse failed, reading as raw text") + with open(actual_glossary_path, "r", encoding="utf-8") as gf: + glossary_text = gf.read() + + if system: + system += "\n\n" + + custom_prompt = os.getenv("APPEND_GLOSSARY_PROMPT", "Character/Term Glossary (use these translations consistently):").strip() + if not custom_prompt: + custom_prompt = "Character/Term Glossary (use these translations consistently):" + + system += f"{custom_prompt}\n{glossary_text}" + + print(f"[DEBUG] ✅ Entire glossary appended!") + print(f"[DEBUG] Glossary text length: {len(glossary_text)} characters") + + except Exception as e: + print(f"[ERROR] Could not load glossary: {e}") + import traceback + print(f"[ERROR] Full traceback: {traceback.format_exc()}") + else: + if not append_glossary: + #print(f"[DEBUG] ❌ Glossary append disabled") + pass + elif not actual_glossary_path: + print(f"[DEBUG] ❌ No glossary path provided") + elif not os.path.exists(actual_glossary_path): + print(f"[DEBUG] ❌ Glossary file does not exist: {actual_glossary_path}") + + print(f"🎯 Final system prompt length: {len(system)} characters") + + return system + +def translate_title(title, client, system_prompt, user_prompt, temperature=0.3): + """Translate the book title using the configured settings""" + if not title or not title.strip(): + return title + + print(f"📚 Processing book title: {title}") + + try: + if os.getenv("TRANSLATE_BOOK_TITLE", "1") == "0": + print(f"📚 Book title translation disabled - keeping original") + return title + + # Check if we're using a translation service (not AI) + client_type = getattr(client, 'client_type', '') + is_translation_service = client_type in ['deepl', 'google_translate'] + + if is_translation_service: + # For translation services, send only the text without AI prompts + print(f"📚 Using translation service ({client_type}) - sending text directly") + messages = [ + {"role": "user", "content": title} + ] + max_tokens = 
int(os.getenv("MAX_OUTPUT_TOKENS", "8192")) + translated_title, _ = client.send(messages, temperature=temperature, max_tokens=max_tokens) + else: + # For AI services, use prompts as before + book_title_prompt = os.getenv("BOOK_TITLE_PROMPT", + "Translate this book title to English while retaining any acronyms:") + + # Get the system prompt for book titles, with fallback to default + book_title_system_prompt = os.getenv("BOOK_TITLE_SYSTEM_PROMPT", + "You are a translator. Respond with only the translated text, nothing else. Do not add any explanation or additional content.") + + messages = [ + {"role": "system", "content": book_title_system_prompt}, + {"role": "user", "content": f"{book_title_prompt}\n\n{title}"} + ] + max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192")) + translated_title, _ = client.send(messages, temperature=temperature, max_tokens=max_tokens) + + print(f"[DEBUG] Raw API response: '{translated_title}'") + print(f"[DEBUG] Response length: {len(translated_title)} (original: {len(title)})") + newline = '\n' + print(f"[DEBUG] Has newlines: {repr(translated_title) if newline in translated_title else 'No'}") + + translated_title = translated_title.strip() + + if ((translated_title.startswith('"') and translated_title.endswith('"')) or + (translated_title.startswith("'") and translated_title.endswith("'"))): + translated_title = translated_title[1:-1].strip() + + if '\n' in translated_title: + print(f"⚠️ API returned multi-line content, keeping original title") + return title + + # Check for JSON-like structured content, but allow simple brackets like [END] + if (any(char in translated_title for char in ['{', '}']) or + '"role":' in translated_title or + '"content":' in translated_title or + ('[[' in translated_title and ']]' in translated_title)): # Only flag double brackets + print(f"⚠️ API returned structured content, keeping original title") + return title + + if any(tag in translated_title.lower() for tag in ['
<html', '<div', '<p>', '<br', '<span']):
+            print(f"⚠️ API returned HTML markup, keeping original title")
+            return title
+
+        return translated_title
+
+    except Exception as e:
+        print(f"⚠️ Title translation failed ({e}) - keeping original title")
+        return title
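+# (illustrative) usage sketch, matching the call made from main():
+#   new_title = translate_title(metadata["title"], client, None, None, config.TEMP)
+#   translate_title returns the original string whenever the response looks unusable.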
+def is_qa_failed_response(content):
+    """
+    Heuristic check for API responses that should be marked qa_failed.
+    Counts generic error indicators in the response text; get_failure_reason()
+    below reports the specific category for logging.
+    """
+    if not content:
+        return True
+
+    content_str = str(content).strip()
+    content_lower = content_str.lower()
+
+    # Generic indicators mirrored from the categories in get_failure_reason()
+    error_indicators = [
+        "translation failed", "api error", "content_filter", "timeout",
+        "rate limit", "quota exceeded", "i cannot", "unable to process"
+    ]
+    error_count = sum(1 for indicator in error_indicators if indicator in content_lower)
+
+    # Multiple error indicators almost always mean a failed response
+    if error_count >= 2:
+        return True
+
+    # Single strong error indicator in very short response
+    if len(content_str) < 50 and error_count >= 1:
+        return True
+
+    return False
+
+
+# Additional helper function for debugging
+def get_failure_reason(content):
+    """
+    Returns the specific reason why content was marked as qa_failed
+    Useful for debugging and logging
+    """
+    if not content:
+        return "Empty content"
+
+    content_str = str(content).strip()
+    content_lower = content_str.lower()
+
+    # Check each category and return the first match
+    failure_categories = {
+        "Explicit Failure Marker": [
+            "[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]",
+            "[IMAGE TRANSLATION FAILED]",
+            "API response unavailable",
+            "[]"
+        ],
+        "HTTP Error": [
+            "authentication_error", "rate_limit_error", "api_error"
+        ],
+        "Content Filter": [
+            "content_filter", "safety filter", "blocked by safety"
+        ],
+        "Timeout": [
+            "timeout", "timed out", "apitimeouterror"
+        ],
+        "Rate Limit": [
+            "rate limit exceeded", "quota exceeded", "too many requests"
+        ],
+        "Refusal Pattern": [
+            "i cannot", "i can't", "unable to process"
+        ],
+        "Empty Response": [
+            '"text": ""', "choices: [ { text: ''"
+        ]
+    }
+
+    for category, markers in failure_categories.items():
+        for marker in markers:
+            if marker in content_str or marker in content_lower:
+                return f"{category}: {marker}"
+
+    if len(content_str) < 50:
+        return f"Short response with error indicators: {content_str[:30]}..."
+
+    return "Unknown failure pattern"
+
+def convert_enhanced_text_to_html(plain_text, chapter_info=None):
+    """Convert markdown/plain text back to HTML after translation (for enhanced mode)
+
+    This function handles the conversion of translated markdown back to HTML.
+    The input is the TRANSLATED text that was originally extracted using html2text.
+    """
+    import re
+
+    preserve_structure = chapter_info.get('preserve_structure', False) if chapter_info else False
+
+    # First, try to use markdown2 for proper markdown conversion
+    try:
+        import markdown2
+
+        # Check if the text contains markdown patterns
+        has_markdown = any([
+            '##' in plain_text,  # Headers
+            '**' in plain_text,  # Bold
+            '*' in plain_text and not '**' in plain_text,  # Italic
+            '[' in plain_text and '](' in plain_text,  # Links
+            '```' in plain_text,  # Code blocks
+            '> ' in plain_text,  # Blockquotes
+            '- ' in plain_text or '* ' in plain_text or '1. ' in plain_text  # Lists
+        ])
+
+        if has_markdown or preserve_structure:
+            # Use markdown2 for proper conversion
+            html = markdown2.markdown(plain_text, extras=[
+                'cuddled-lists',  # Lists without blank lines
+                'fenced-code-blocks',  # Code blocks with ```
+                'break-on-newline',  # Treat single newlines as <br>
+                'smarty-pants',  # Smart quotes and dashes
+                'tables',  # Markdown tables
+            ])
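+            # (illustrative) a rough round-trip with these extras:
+            #   markdown2.markdown("# Title\n\n**bold**", extras=['break-on-newline'])
+            #   -> '<h1>Title</h1>\n\n<p><strong>bold</strong></p>\n'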
+
+            # Post-process to ensure proper paragraph structure
+            if '<p>' not in html:
+                # If markdown2 didn't create paragraphs, wrap content
+                lines = html.split('\n')
+                processed_lines = []
+                for line in lines:
+                    line = line.strip()
+                    if line and not line.startswith('<') and not line.endswith('>'):
+                        processed_lines.append(f'<p>{line}</p>')
+                    elif line:
+                        processed_lines.append(line)
+                html = '\n'.join(processed_lines)
+
+            return html
+
+    except ImportError:
+        print("⚠️ markdown2 not available, using fallback HTML conversion")
+
+    # Fallback: Manual markdown-to-HTML conversion
+    lines = plain_text.strip().split('\n')
+    html_parts = []
+    in_code_block = False
+    code_block_content = []
+
+    for line in lines:
+        # Handle code blocks
+        if line.strip().startswith('```'):
+            if in_code_block:
+                # End code block
+                html_parts.append('<pre><code>' + '\n'.join(code_block_content) + '</code></pre>')
+                code_block_content = []
+                in_code_block = False
+            else:
+                # Start code block
+                in_code_block = True
+            continue
+
+        if in_code_block:
+            code_block_content.append(line)
+            continue
+
+        line = line.strip()
+        if not line:
+            # Preserve empty lines as paragraph breaks
+            if html_parts and not html_parts[-1].endswith('</p>'):
+                # Only add break if not already after a closing tag
+                html_parts.append('<br>')
+            continue
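+        # (illustrative) the header branch below maps '#' depth to the tag level,
+        # e.g. "## Chapter Two" -> "<h2>Chapter Two</h2>", capped at <h6>.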
+        # Check for markdown headers
+        if line.startswith('#'):
+            match = re.match(r'^(#+)\s*(.+)$', line)
+            if match:
+                level = min(len(match.group(1)), 6)
+                header_text = match.group(2).strip()
+                html_parts.append(f'<h{level}>{header_text}</h{level}>')
+                continue
+
+        # Check for blockquotes
+        if line.startswith('> '):
+            quote_text = line[2:].strip()
+            html_parts.append(f'<blockquote>{quote_text}</blockquote>')
+            continue
+
+        # Check for lists
+        if re.match(r'^[*\-+]\s+', line):
+            list_text = re.sub(r'^[*\-+]\s+', '', line)
+            html_parts.append(f'<li>{list_text}</li>')
+            continue
+
+        if re.match(r'^\d+\.\s+', line):
+            list_text = re.sub(r'^\d+\.\s+', '', line)
+            html_parts.append(f'<li>{list_text}</li>')
+            continue
+
+        # Convert inline markdown
+        # Bold
+        line = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', line)
+        line = re.sub(r'__(.+?)__', r'<strong>\1</strong>', line)
+
+        # Italic
+        line = re.sub(r'\*(.+?)\*', r'<em>\1</em>', line)
+        line = re.sub(r'_(.+?)_', r'<em>\1</em>', line)
+
+        # Links
+        line = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', line)
+
+        # Code inline
+        line = re.sub(r'`([^`]+)`', r'<code>\1</code>', line)
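+        # (illustrative) after these passes, "**bold** [here](http://x)" becomes
+        # '<strong>bold</strong> <a href="http://x">here</a>'.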
+        # Regular paragraph
+        html_parts.append(f'<p>{line}</p>')
+
+    # Post-process lists to wrap in ul/ol tags
+    final_html = []
+    in_list = False
+    list_type = None
+
+    for part in html_parts:
+        if part.startswith('<li>'):
+            if not in_list:
+                # Determine list type based on context (simplified)
+                list_type = 'ul'  # Default to unordered
+                final_html.append(f'<{list_type}>')
+                in_list = True
+            final_html.append(part)
+        else:
+            if in_list:
+                final_html.append(f'</{list_type}>')
+                in_list = False
+            final_html.append(part)
+
+    # Close any open list
+    if in_list:
+        final_html.append(f'</{list_type}>')
+
+    return '\n'.join(final_html)
+# =====================================================
+# MAIN TRANSLATION FUNCTION
+# =====================================================
+def main(log_callback=None, stop_callback=None):
+    """Main translation function with enhanced duplicate detection and progress tracking"""
+
+    config = TranslationConfig()
+    builtins._DISABLE_ZERO_DETECTION = config.DISABLE_ZERO_DETECTION
+
+    if config.DISABLE_ZERO_DETECTION:
+        print("=" * 60)
+        print("⚠️ 0-BASED DETECTION DISABLED BY USER")
+        print("⚠️ All chapter numbers will be used exactly as found")
+        print("=" * 60)
+
+    args = None
+    chapters_completed = 0
+    chunks_completed = 0
+
+    input_path = config.input_path
+    if not input_path and len(sys.argv) > 1:
+        input_path = sys.argv[1]
+
+    is_text_file = input_path.lower().endswith('.txt')
+
+    if is_text_file:
+        os.environ["IS_TEXT_FILE_TRANSLATION"] = "1"
+
+    import json as _json
+    _original_load = _json.load
+
+    def debug_json_load(fp, *args, **kwargs):
+        result = _original_load(fp, *args, **kwargs)
+        if isinstance(result, list) and len(result) > 0:
+            if isinstance(result[0], dict) and 'original_name' in result[0]:
+                print(f"[DEBUG] Loaded glossary list with {len(result)} items from {fp.name if hasattr(fp, 'name') else 'unknown'}")
+        return result
+
+    _json.load = debug_json_load
+
+    if log_callback:
+        set_output_redirect(log_callback)
+
+    def check_stop():
+        if stop_callback and stop_callback():
+            print("❌ Translation stopped by user request.")
+            return True
+        return is_stop_requested()
+
+    if config.EMERGENCY_RESTORE:
+        print("✅ Emergency paragraph restoration is ENABLED")
+    else:
+        print("⚠️ Emergency paragraph restoration is DISABLED")
+
+    print(f"[DEBUG] REMOVE_AI_ARTIFACTS environment variable: {os.getenv('REMOVE_AI_ARTIFACTS', 'NOT SET')}")
+    print(f"[DEBUG] REMOVE_AI_ARTIFACTS parsed value: {config.REMOVE_AI_ARTIFACTS}")
+    if config.REMOVE_AI_ARTIFACTS:
+        print("⚠️ AI artifact removal is ENABLED - will clean AI response artifacts")
+    else:
+        print("✅ AI artifact removal is DISABLED - preserving all content as-is")
+
+    if '--epub' in sys.argv or (len(sys.argv) > 1 and sys.argv[1].endswith(('.epub', '.txt'))):
+        import argparse
+        parser = argparse.ArgumentParser()
+        parser.add_argument('epub', help='Input EPUB or text file')
+        args = parser.parse_args()
+        input_path = args.epub
+
+        is_text_file = input_path.lower().endswith('.txt')
+
+    if is_text_file:
+        file_base = os.path.splitext(os.path.basename(input_path))[0]
+    else:
+        epub_base = os.path.splitext(os.path.basename(input_path))[0]
+        file_base = epub_base
+
+    out = file_base
+    os.makedirs(out, exist_ok=True)
+    print(f"[DEBUG] Created output folder → {out}")
+
+    cleanup_previous_extraction(out)
+
+    os.environ["EPUB_OUTPUT_DIR"] = out
+    payloads_dir = out
+
+    # clear history if CONTEXTUAL is disabled
+    if not config.CONTEXTUAL:
+        history_file = os.path.join(payloads_dir, "translation_history.json")
+        if os.path.exists(history_file):
+            os.remove(history_file)
+            print("[DEBUG] CONTEXTUAL disabled - cleared translation history")
+
+    history_manager = HistoryManager(payloads_dir)
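+    # History and progress helpers keep their state files under payloads_dir
+    # (the output folder), alongside translation_history.json handled above.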
chapter_splitter = ChapterSplitter(model_name=config.MODEL) + chunk_context_manager = ChunkContextManager() + progress_manager = ProgressManager(payloads_dir) + + # Create ChapterExtractor with progress callback if available + chapter_progress_callback = None + if log_callback: + # Create a wrapper that formats progress messages for the log + def chapter_progress_callback(msg): + log_callback(f"📊 {msg}") + + chapter_extractor = ChapterExtractor(progress_callback=chapter_progress_callback) + glossary_manager = GlossaryManager() + + history_file = os.path.join(payloads_dir, "translation_history.json") + if os.path.exists(history_file): + os.remove(history_file) + print(f"[DEBUG] Purged translation history → {history_file}") + + print("🔍 Checking for deleted output files...") + progress_manager.cleanup_missing_files(out) + progress_manager.save() + + if check_stop(): + return + + if not config.API_KEY: + print("❌ Error: Set API_KEY, OPENAI_API_KEY, or OPENAI_OR_Gemini_API_KEY in your environment.") + return + + #print(f"[DEBUG] Found API key: {config.API_KEY[:10]}...") + print(f"[DEBUG] Using model = {config.MODEL}") + print(f"[DEBUG] Max output tokens = {config.MAX_OUTPUT_TOKENS}") + + client = UnifiedClient(model=config.MODEL, api_key=config.API_KEY, output_dir=out) + if hasattr(client, 'use_multi_keys') and client.use_multi_keys: + stats = client.get_stats() + print(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys loaded") + print(f" Active keys: {stats.get('active_keys', 0)}") + else: + print(f"🔑 Single-key mode: Using {config.MODEL}") + # Reset cleanup state when starting new translation + if hasattr(client, 'reset_cleanup_state'): + client.reset_cleanup_state() + + if is_text_file: + print("📄 Processing text file...") + try: + txt_processor = TextFileProcessor(input_path, out) + chapters = txt_processor.extract_chapters() + txt_processor.save_original_structure() + + metadata = { + "title": os.path.splitext(os.path.basename(input_path))[0], + "type": "text", + "chapter_count": len(chapters) + } + except ImportError as e: + print(f"❌ Error: Text file processor not available: {e}") + if log_callback: + log_callback(f"❌ Error: Text file processor not available: {e}") + return + except Exception as e: + print(f"❌ Error processing text file: {e}") + if log_callback: + log_callback(f"❌ Error processing text file: {e}") + return + else: + # Check if we should use async extraction (for GUI mode) + use_async_extraction = os.getenv("USE_ASYNC_CHAPTER_EXTRACTION", "0") == "1" + + if use_async_extraction and log_callback: + print("🚀 Using async chapter extraction (subprocess mode)...") + from chapter_extraction_manager import ChapterExtractionManager + + # Create manager with log callback + extraction_manager = ChapterExtractionManager(log_callback=log_callback) + + # Get extraction mode + extraction_mode = os.getenv("EXTRACTION_MODE", "smart").lower() + + # Define completion callback + extraction_result = {"completed": False, "result": None} + + def on_extraction_complete(result): + extraction_result["completed"] = True + extraction_result["result"] = result + + # Safety check for None result + if result is None: + log_callback("❌ Chapter extraction failed: No result returned") + return + + if result.get("success"): + log_callback(f"✅ Chapter extraction completed: {result.get('chapters', 0)} chapters") + else: + log_callback(f"❌ Chapter extraction failed: {result.get('error', 'Unknown error')}") + + # Start async extraction + extraction_manager.extract_chapters_async( + input_path, + 
out, + extraction_mode=extraction_mode, + progress_callback=lambda msg: log_callback(f"📊 {msg}"), + completion_callback=on_extraction_complete + ) + + # Wait for completion (with timeout) + timeout = 300 # 5 minutes timeout + start_time = time.time() + + while not extraction_result["completed"]: + if check_stop(): + extraction_manager.stop_extraction() + return + + if time.time() - start_time > timeout: + log_callback("⚠️ Chapter extraction timeout") + extraction_manager.stop_extraction() + return + + time.sleep(0.1) # Check every 100ms + + # Check if extraction was successful + if not extraction_result["result"] or not extraction_result["result"].get("success"): + log_callback("❌ Chapter extraction failed") + return + + # Load the extracted data + metadata_path = os.path.join(out, "metadata.json") + if os.path.exists(metadata_path): + with open(metadata_path, 'r', encoding='utf-8') as f: + metadata = json.load(f) + else: + metadata = extraction_result["result"].get("metadata", {}) + + # The async extraction should have saved chapters directly, similar to the sync version + # We need to reconstruct the chapters list with body content + + # Check if the extraction actually created a chapters.json file with full content + chapters_full_path = os.path.join(out, "chapters_full.json") + chapters_info_path = os.path.join(out, "chapters_info.json") + + chapters = [] + + # First try to load full chapters if saved + if os.path.exists(chapters_full_path): + log_callback("Loading full chapters data...") + with open(chapters_full_path, 'r', encoding='utf-8') as f: + chapters = json.load(f) + log_callback(f"✅ Loaded {len(chapters)} chapters with content") + + elif os.path.exists(chapters_info_path): + # Fall back to loading from individual files + log_callback("Loading chapter info and searching for content files...") + with open(chapters_info_path, 'r', encoding='utf-8') as f: + chapters_info = json.load(f) + + # List all files in the output directory + all_files = os.listdir(out) + log_callback(f"Found {len(all_files)} files in output directory") + + # Try to match chapter files + for info in chapters_info: + chapter_num = info['num'] + found = False + + # Try different naming patterns + patterns = [ + f"chapter_{chapter_num:04d}_", # With leading zeros + f"chapter_{chapter_num}_", # Without leading zeros + f"ch{chapter_num:04d}_", # Shortened with zeros + f"ch{chapter_num}_", # Shortened without zeros + f"{chapter_num:04d}_", # Just number with zeros + f"{chapter_num}_" # Just number + ] + + for pattern in patterns: + # Find files matching this pattern (any extension) + matching_files = [f for f in all_files if f.startswith(pattern)] + + if matching_files: + # Prefer HTML/XHTML files + html_files = [f for f in matching_files if f.endswith(('.html', '.xhtml', '.htm'))] + if html_files: + chapter_file = html_files[0] + else: + chapter_file = matching_files[0] + + chapter_path = os.path.join(out, chapter_file) + + try: + with open(chapter_path, 'r', encoding='utf-8') as f: + content = f.read() + + chapters.append({ + "num": chapter_num, + "title": info.get("title", f"Chapter {chapter_num}"), + "body": content, + "filename": info.get("original_filename", ""), + "has_images": info.get("has_images", False), + "file_size": len(content), + "content_hash": info.get("content_hash", "") + }) + found = True + break + except Exception as e: + log_callback(f"⚠️ Error reading {chapter_file}: {e}") + + if not found: + log_callback(f"⚠️ No file found for Chapter {chapter_num}") + # Log available files for debugging 
+ if len(all_files) < 50: + similar_files = [f for f in all_files if str(chapter_num) in f] + if similar_files: + log_callback(f" Similar files: {similar_files[:3]}") + + if not chapters: + log_callback("❌ No chapters could be loaded!") + log_callback(f"❌ Output directory: {out}") + log_callback(f"❌ Files in directory: {len(os.listdir(out))} files") + # Show first few files for debugging + sample_files = os.listdir(out)[:10] + log_callback(f"❌ Sample files: {sample_files}") + return + + # Sort chapters by OPF spine order if available + opf_path = os.path.join(out, 'content.opf') + if os.path.exists(opf_path) and chapters: + log_callback("📋 Sorting chapters according to OPF spine order...") + # Use the existing chapter_extractor instance to sort + chapters = chapter_extractor._sort_by_opf_spine(chapters, opf_path) + log_callback("✅ Chapters sorted according to OPF reading order") + else: + print("🚀 Using comprehensive chapter extraction with resource handling...") + with zipfile.ZipFile(input_path, 'r') as zf: + metadata = chapter_extractor._extract_epub_metadata(zf) + chapters = chapter_extractor.extract_chapters(zf, out) + + print(f"\n📚 Extraction Summary:") + print(f" Total chapters extracted: {len(chapters)}") + if chapters: + nums = [c.get('num', 0) for c in chapters] + print(f" Chapter range: {min(nums)} to {max(nums)}") + + # Check for gaps in the sequence + expected_count = max(nums) - min(nums) + 1 + if len(chapters) < expected_count: + print(f"\n⚠️ Potential missing chapters detected:") + print(f" Expected {expected_count} chapters (from {min(nums)} to {max(nums)})") + print(f" Actually found: {len(chapters)} chapters") + print(f" Potentially missing: {expected_count - len(chapters)} chapters") + + validate_chapter_continuity(chapters) + + print("\n" + "="*50) + validate_epub_structure(out) + print("="*50 + "\n") + + progress_manager.migrate_to_content_hash(chapters) + progress_manager.save() + + if check_stop(): + return + + metadata_path = os.path.join(out, "metadata.json") + if os.path.exists(metadata_path): + with open(metadata_path, 'r', encoding='utf-8') as mf: + metadata = json.load(mf) + + metadata["chapter_count"] = len(chapters) + metadata["chapter_titles"] = {str(c["num"]): c["title"] for c in chapters} + + print(f"[DEBUG] Initializing client with model = {config.MODEL}") + client = UnifiedClient(api_key=config.API_KEY, model=config.MODEL, output_dir=out) + if hasattr(client, 'use_multi_keys') and client.use_multi_keys: + stats = client.get_stats() + print(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys loaded") + print(f" Active keys: {stats.get('active_keys', 0)}") + else: + print(f"🔑 Single-key mode: Using {config.MODEL}") + + # Reset cleanup state when starting new translation + if hasattr(client, 'reset_cleanup_state'): + client.reset_cleanup_state() + + if "title" in metadata and config.TRANSLATE_BOOK_TITLE and not metadata.get("title_translated", False): + original_title = metadata["title"] + print(f"📚 Original title: {original_title}") + + if not check_stop(): + translated_title = translate_title( + original_title, + client, + None, + None, + config.TEMP + ) + + metadata["original_title"] = original_title + metadata["title"] = translated_title + metadata["title_translated"] = True + + print(f"📚 Translated title: {translated_title}") + else: + print("❌ Title translation skipped due to stop request") + + # Translate other metadata fields if configured + translate_metadata_fields_str = os.getenv('TRANSLATE_METADATA_FIELDS', '{}') + 
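+    # (illustrative) TRANSLATE_METADATA_FIELDS is a JSON object mapping metadata
+    # field names to booleans, e.g. TRANSLATE_METADATA_FIELDS='{"creator": true, "description": true}'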
metadata_translation_mode = os.getenv('METADATA_TRANSLATION_MODE', 'together') + + try: + translate_metadata_fields = json.loads(translate_metadata_fields_str) + + if translate_metadata_fields and any(translate_metadata_fields.values()): + # Filter out fields that should be translated (excluding already translated fields) + fields_to_translate = {} + skipped_fields = [] + + for field_name, should_translate in translate_metadata_fields.items(): + if should_translate and field_name != 'title' and field_name in metadata: + # Check if already translated + if metadata.get(f"{field_name}_translated", False): + skipped_fields.append(field_name) + print(f"✓ Skipping {field_name} - already translated") + else: + fields_to_translate[field_name] = should_translate + + if fields_to_translate: + print("\n" + "="*50) + print("📋 METADATA TRANSLATION PHASE") + print("="*50) + print(f"🌐 Translating {len(fields_to_translate)} metadata fields...") + + # Get ALL configuration from environment - NO DEFAULTS + system_prompt = os.getenv('BOOK_TITLE_SYSTEM_PROMPT', '') + if not system_prompt: + print("❌ No system prompt configured, skipping metadata translation") + else: + # Get field-specific prompts + field_prompts_str = os.getenv('METADATA_FIELD_PROMPTS', '{}') + try: + field_prompts = json.loads(field_prompts_str) + except: + field_prompts = {} + + if not field_prompts and not field_prompts.get('_default'): + print("❌ No field prompts configured, skipping metadata translation") + else: + # Get language configuration + lang_behavior = os.getenv('LANG_PROMPT_BEHAVIOR', 'auto') + forced_source_lang = os.getenv('FORCED_SOURCE_LANG', 'Korean') + output_language = os.getenv('OUTPUT_LANGUAGE', 'English') + + # Determine source language + source_lang = metadata.get('language', '').lower() + if lang_behavior == 'never': + lang_str = "" + elif lang_behavior == 'always': + lang_str = forced_source_lang + else: # auto + if 'zh' in source_lang or 'chinese' in source_lang: + lang_str = 'Chinese' + elif 'ja' in source_lang or 'japanese' in source_lang: + lang_str = 'Japanese' + elif 'ko' in source_lang or 'korean' in source_lang: + lang_str = 'Korean' + else: + lang_str = '' + + # Check if batch translation is enabled for parallel processing + batch_translate_enabled = os.getenv('BATCH_TRANSLATION', '0') == '1' + batch_size = int(os.getenv('BATCH_SIZE', '50')) # Default batch size + + if batch_translate_enabled and len(fields_to_translate) > 1: + print(f"⚡ Using parallel metadata translation mode ({len(fields_to_translate)} fields, batch size: {batch_size})...") + + # Import ThreadPoolExecutor for parallel processing + from concurrent.futures import ThreadPoolExecutor, as_completed + import threading + + # Thread-safe results storage + translation_results = {} + results_lock = threading.Lock() + + def translate_metadata_field(field_name, original_value): + """Translate a single metadata field""" + try: + print(f"\n📋 Translating {field_name}: {original_value[:100]}..." 
+ if len(str(original_value)) > 100 else f"\n📋 Translating {field_name}: {original_value}") + + # Get field-specific prompt + prompt_template = field_prompts.get(field_name, field_prompts.get('_default', '')) + + if not prompt_template: + print(f"⚠️ No prompt configured for field '{field_name}', skipping") + return None + + # Replace variables in prompt + field_prompt = prompt_template.replace('{source_lang}', lang_str) + field_prompt = field_prompt.replace('{output_lang}', output_language) + field_prompt = field_prompt.replace('English', output_language) + field_prompt = field_prompt.replace('{field_value}', str(original_value)) + + # Check if we're using a translation service (not AI) + client_type = getattr(client, 'client_type', '') + is_translation_service = client_type in ['deepl', 'google_translate'] + + if is_translation_service: + # For translation services, send only the field value without AI prompts + print(f"🌐 Using translation service ({client_type}) - sending field directly") + messages = [ + {"role": "user", "content": str(original_value)} + ] + else: + # For AI services, use prompts as before + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": f"{field_prompt}\n\n{original_value}"} + ] + + # Add delay for rate limiting + if config.DELAY > 0: + time.sleep(config.DELAY) + + # Make API call + content, finish_reason = client.send( + messages, + temperature=config.TEMP, + max_tokens=config.MAX_OUTPUT_TOKENS + ) + translated_value = content.strip() + + # Store result thread-safely + with results_lock: + translation_results[field_name] = { + 'original': original_value, + 'translated': translated_value, + 'success': True + } + + print(f"✅ Translated {field_name}: {translated_value}") + return translated_value + + except Exception as e: + print(f"❌ Failed to translate {field_name}: {e}") + with results_lock: + translation_results[field_name] = { + 'original': original_value, + 'translated': None, + 'success': False, + 'error': str(e) + } + return None + + # Execute parallel translations with limited workers + max_workers = min(len(fields_to_translate), batch_size) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all translation tasks + futures = {} + for field_name in fields_to_translate: + if field_name in metadata and not check_stop(): + original_value = metadata[field_name] + future = executor.submit(translate_metadata_field, field_name, original_value) + futures[future] = field_name + + # Wait for completion + for future in as_completed(futures): + if check_stop(): + print("❌ Metadata translation stopped by user") + break + + # Apply results to metadata + for field_name, result in translation_results.items(): + if result['success'] and result['translated']: + metadata[f"original_{field_name}"] = result['original'] + metadata[field_name] = result['translated'] + metadata[f"{field_name}_translated"] = True + + else: + # Sequential translation mode (individual translation) + mode_desc = "sequential" if not batch_translate_enabled else "sequential (single field)" + print(f"📝 Using {mode_desc} translation mode...") + + for field_name in fields_to_translate: + if not check_stop() and field_name in metadata: + original_value = metadata[field_name] + print(f"\n📋 Translating {field_name}: {original_value[:100]}..." 
+ if len(str(original_value)) > 100 else f"\n📋 Translating {field_name}: {original_value}") + + # Get field-specific prompt + prompt_template = field_prompts.get(field_name, field_prompts.get('_default', '')) + + if not prompt_template: + print(f"⚠️ No prompt configured for field '{field_name}', skipping") + continue + + # Replace variables in prompt + field_prompt = prompt_template.replace('{source_lang}', lang_str) + field_prompt = field_prompt.replace('{output_lang}', output_language) + field_prompt = field_prompt.replace('English', output_language) + field_prompt = field_prompt.replace('{field_value}', str(original_value)) + + # Check if we're using a translation service (not AI) + client_type = getattr(client, 'client_type', '') + is_translation_service = client_type in ['deepl', 'google_translate'] + + if is_translation_service: + # For translation services, send only the field value without AI prompts + print(f"🌐 Using translation service ({client_type}) - sending field directly") + messages = [ + {"role": "user", "content": str(original_value)} + ] + else: + # For AI services, use prompts as before + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": f"{field_prompt}\n\n{original_value}"} + ] + + try: + # Add delay using the config instance from main() + if config.DELAY > 0: # ✅ FIXED - use config.DELAY instead of config.SEND_INTERVAL + time.sleep(config.DELAY) + + # Use the same client instance from main() + # ✅ FIXED - Properly unpack tuple response and provide max_tokens + content, finish_reason = client.send( + messages, + temperature=config.TEMP, + max_tokens=config.MAX_OUTPUT_TOKENS # ✅ FIXED - provide max_tokens to avoid NoneType error + ) + translated_value = content.strip() # ✅ FIXED - use content from unpacked tuple + + metadata[f"original_{field_name}"] = original_value + metadata[field_name] = translated_value + metadata[f"{field_name}_translated"] = True + + print(f"✅ Translated {field_name}: {translated_value}") + + except Exception as e: + print(f"❌ Failed to translate {field_name}: {e}") + + else: + if check_stop(): + print("❌ Metadata translation stopped by user") + break + else: + print("📋 No additional metadata fields to translate") + + except Exception as e: + print(f"⚠️ Error processing metadata translation settings: {e}") + import traceback + traceback.print_exc() + + with open(metadata_path, 'w', encoding='utf-8') as mf: + json.dump(metadata, mf, ensure_ascii=False, indent=2) + print(f"💾 Saved metadata with {'translated' if metadata.get('title_translated', False) else 'original'} title") + + print("\n" + "="*50) + print("📑 GLOSSARY GENERATION PHASE") + print("="*50) + + print(f"📑 DEBUG: ENABLE_AUTO_GLOSSARY = '{os.getenv('ENABLE_AUTO_GLOSSARY', 'NOT SET')}'") + print(f"📑 DEBUG: MANUAL_GLOSSARY = '{config.MANUAL_GLOSSARY}'") + print(f"📑 DEBUG: Manual glossary exists? {os.path.isfile(config.MANUAL_GLOSSARY) if config.MANUAL_GLOSSARY else False}") + + # Check if glossary.csv already exists in the source folder + existing_glossary_csv = os.path.join(out, "glossary.csv") + existing_glossary_json = os.path.join(out, "glossary.json") + print(f"📑 DEBUG: Existing glossary.csv? {os.path.exists(existing_glossary_csv)}") + print(f"📑 DEBUG: Existing glossary.json? 
{os.path.exists(existing_glossary_json)}") + + if config.MANUAL_GLOSSARY and os.path.isfile(config.MANUAL_GLOSSARY): + ext = os.path.splitext(config.MANUAL_GLOSSARY)[1].lower() + target_name = "glossary.csv" if ext == ".csv" else "glossary.json" + target_path = os.path.join(out, target_name) + if os.path.abspath(config.MANUAL_GLOSSARY) != os.path.abspath(target_path): + shutil.copy(config.MANUAL_GLOSSARY, target_path) + print("📑 Using manual glossary from:", config.MANUAL_GLOSSARY) + else: + print("📑 Using existing glossary:", config.MANUAL_GLOSSARY) + elif os.path.exists(existing_glossary_csv) or os.path.exists(existing_glossary_json): + print("📑 Existing glossary file detected in source folder - skipping automatic generation") + if os.path.exists(existing_glossary_csv): + print(f"📑 Using existing glossary.csv: {existing_glossary_csv}") + elif os.path.exists(existing_glossary_json): + print(f"📑 Using existing glossary.json: {existing_glossary_json}") + elif os.getenv("ENABLE_AUTO_GLOSSARY", "0") == "1": + model = os.getenv("MODEL", "gpt-4") + if is_traditional_translation_api(model): + print("📑 Automatic glossary generation disabled") + print(f" {model} does not support glossary extraction") + print(" Traditional translation APIs cannot identify character names/terms") + else: + print("📑 Starting automatic glossary generation...") + try: + # Use the new process-safe glossary worker + from glossary_process_worker import generate_glossary_in_process + import concurrent.futures + import multiprocessing + + instructions = "" + + # Get extraction workers setting + extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) + if extraction_workers == 1: + # Auto-detect for better performance + extraction_workers = min(os.cpu_count() or 4, 4) + print(f"📑 Using {extraction_workers} CPU cores for glossary generation") + + # Collect environment variables to pass to subprocess + env_vars = {} + important_vars = [ + 'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES', + 'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS', + 'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'AUTO_GLOSSARY_PROMPT', + 'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV', 'GLOSSARY_PARALLEL_ENABLED', + 'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK', 'GLOSSARY_SKIP_ALL_VALIDATION', + 'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY', 'MAX_OUTPUT_TOKENS', + 'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY' + ] + + for var in important_vars: + if var in os.environ: + env_vars[var] = os.environ[var] + + # Create a Queue for real-time log streaming + manager = multiprocessing.Manager() + log_queue = manager.Queue() + + # Use ProcessPoolExecutor for true parallelism (completely bypasses GIL) + print("📑 Starting glossary generation in separate process...") + with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor: + # Submit to separate process WITH log queue + future = executor.submit( + generate_glossary_in_process, + out, + chapters, + instructions, + env_vars, + log_queue # Pass the queue for real-time logs + ) + + # Poll for completion and stream logs in real-time + poll_count = 0 + while not future.done(): + poll_count += 1 + + # Check for logs from subprocess and print them immediately + try: + while not log_queue.empty(): + log_line = log_queue.get_nowait() + print(log_line) # Print to GUI + except: + pass + + # Super short sleep to yield to GUI + time.sleep(0.001) + + # Check for stop every 100 polls + if poll_count % 100 == 0: 
+                            if check_stop():
+                                print("📑 ❌ Glossary generation cancelled")
+                                executor.shutdown(wait=False, cancel_futures=True)
+                                return
+
+                    # Get any remaining logs from the queue
+                    try:
+                        while not log_queue.empty():
+                            log_line = log_queue.get_nowait()
+                            print(log_line)
+                    except queue.Empty:
+                        pass
+
+                    # Get the result (the polling loop above only exits once the future is done)
+                    try:
+                        result = future.result(timeout=0.1)
+                        if isinstance(result, dict):
+                            if result.get('success'):
+                                print("📑 ✅ Glossary generation completed successfully")
+                            else:
+                                print(f"📑 ❌ Glossary generation failed: {result.get('error')}")
+                                if result.get('traceback'):
+                                    print(f"📑 Error details:\n{result.get('traceback')}")
+                    except Exception as e:
+                        print(f"📑 ❌ Error retrieving glossary result: {e}")
+
+                print("✅ Automatic glossary generation COMPLETED")
+
+                # Handle deferred glossary appending
+                if os.getenv('DEFER_GLOSSARY_APPEND') == '1':
+                    print("📑 Processing deferred glossary append to system prompt...")
+
+                    glossary_path = find_glossary_file(out)
+                    if glossary_path and os.path.exists(glossary_path):
+                        try:
+                            glossary_block = None
+                            if glossary_path.lower().endswith('.csv'):
+                                with open(glossary_path, 'r', encoding='utf-8') as f:
+                                    glossary_block = f.read()
+                            else:
+                                with open(glossary_path, 'r', encoding='utf-8') as f:
+                                    glossary_data = json.load(f)
+
+                                formatted_entries = {}
+                                if isinstance(glossary_data, dict) and 'entries' in glossary_data:
+                                    formatted_entries = glossary_data['entries']
+                                elif isinstance(glossary_data, dict):
+                                    formatted_entries = {k: v for k, v in glossary_data.items() if k != "metadata"}
+
+                                if formatted_entries:
+                                    glossary_block = json.dumps(formatted_entries, ensure_ascii=False, indent=2)
+                                else:
+                                    glossary_block = None
+
+                            if glossary_block:
+                                glossary_prompt = os.getenv('GLOSSARY_APPEND_PROMPT',
+                                    "Character/Term Glossary (use these translations consistently):")
+
+                                current_prompt = config.PROMPT
+                                if current_prompt:
+                                    current_prompt += "\n\n"
+                                current_prompt += f"{glossary_prompt}\n{glossary_block}"
+
+                                config.PROMPT = current_prompt
+
+                                print(f"✅ Added auto-generated glossary to system prompt ({os.path.basename(glossary_path)})")
+
+                                if 'DEFER_GLOSSARY_APPEND' in os.environ:
+                                    del os.environ['DEFER_GLOSSARY_APPEND']
+                                if 'GLOSSARY_APPEND_PROMPT' in os.environ:
+                                    del os.environ['GLOSSARY_APPEND_PROMPT']
+                            else:
+                                print("⚠️ Auto-generated glossary has no entries - skipping append")
+                                if 'DEFER_GLOSSARY_APPEND' in os.environ:
+                                    del os.environ['DEFER_GLOSSARY_APPEND']
+                                if 'GLOSSARY_APPEND_PROMPT' in os.environ:
+                                    del os.environ['GLOSSARY_APPEND_PROMPT']
+                        except Exception as e:
+                            print(f"⚠️ Failed to append auto-generated glossary: {e}")
+                    else:
+                        print("⚠️ No glossary file found after automatic generation")
+
+            except Exception as e:
+                print(f"❌ Glossary generation failed: {e}")
+    else:
+        print("📑 Automatic glossary generation disabled")
+        # Don't create an empty glossary - let any existing manual glossary remain
+
+    glossary_file = find_glossary_file(out)
+    if glossary_file and os.path.exists(glossary_file):
+        try:
+            if glossary_file.lower().endswith('.csv'):
+                # Quick CSV stats
+                with open(glossary_file, 'r', encoding='utf-8') as f:
+                    lines = [ln.strip() for ln in f.readlines() if ln.strip()]
+                entry_count = max(0, len(lines) - 1) if lines and ',' in lines[0] else len(lines)
+                print(f"📑 Glossary ready (CSV) with {entry_count} entries")
+                print("📑 Sample glossary lines:")
+                for ln in lines[1:4]:
+                    print(f" • {ln}")
+            else:
+                with open(glossary_file, 'r', encoding='utf-8') as f:
+                    glossary_data = json.load(f)
+
+                if isinstance(glossary_data, dict):
+                    if 'entries' in glossary_data and isinstance(glossary_data['entries'], dict):
+                        entry_count = len(glossary_data['entries'])
+                        sample_items = list(glossary_data['entries'].items())[:3]
+                    else:
+                        entry_count = len(glossary_data)
+                        sample_items = list(glossary_data.items())[:3]
+
+                    print(f"📑 Glossary ready with {entry_count} entries")
+                    print("📑 Sample glossary entries:")
+                    for key, value in sample_items:
+                        print(f" • {key} → {value}")
+
+                elif isinstance(glossary_data, list):
+                    print(f"📑 Glossary ready with {len(glossary_data)} entries")
+                    print("📑 Sample glossary entries:")
+                    for entry in glossary_data[:3]:
+                        if isinstance(entry, dict):
+                            original = entry.get('original_name', '?')
+                            translated = entry.get('name', original)
+                            print(f" • {original} → {translated}")
+                else:
+                    print(f"⚠️ Unexpected glossary format: {type(glossary_data)}")
+
+        except Exception as e:
+            print(f"⚠️ Failed to inspect glossary file: {e}")
+    else:
+        print("📑 No glossary file found")
+
+    print("="*50)
+    print("🚀 STARTING MAIN TRANSLATION PHASE")
+    print("="*50 + "\n")
+
+    glossary_path = find_glossary_file(out)
+    if glossary_path and os.path.exists(glossary_path) and glossary_path.lower().endswith('.json'):
+        try:
+            with open(glossary_path, 'r', encoding='utf-8') as f:
+                g_data = json.load(f)
+
+            print(f"[DEBUG] Glossary type before translation: {type(g_data)}")
+            if isinstance(g_data, list):
+                print("[DEBUG] Glossary is a list")
+        except Exception as e:
+            print(f"[DEBUG] Error checking glossary: {e}")
+
+    system = build_system_prompt(config.SYSTEM_PROMPT, glossary_path)
+    base_msg = [{"role": "system", "content": system}]
+    # Preserve the original system prompt to avoid in-place mutations
+    original_system_prompt = system
+    last_summary_block_text = None  # Will hold the last rolling summary text for the NEXT chapter only
+
+    image_translator = None
+
+    if config.ENABLE_IMAGE_TRANSLATION:
+        print(f"🖼️ Image translation enabled for model: {config.MODEL}")
+        print("🖼️ Image translation will use your custom system prompt and glossary")
+        image_translator = ImageTranslator(
+            client,
+            out,
+            config.PROFILE_NAME,
+            system,
+            config.TEMP,
+            log_callback,
+            progress_manager,
+            history_manager,
+            chunk_context_manager
+        )
+
+        known_vision_models = [
+            'gemini-1.5-pro', 'gemini-1.5-flash', 'gemini-2.0-flash', 'gemini-2.5-flash', 'gemini-2.5-pro',
+            'gpt-4-turbo', 'gpt-4o', 'gpt-4.1-mini', 'gpt-4.1-nano', 'o4-mini'
+        ]
+
+        if config.MODEL.lower() not in known_vision_models:
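+            # Heuristic check: the configured model name is lowercased and matched
+            # against the static list above; an unknown model only triggers the
+            # warning below - image translation is still attempted.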
+            print(f"⚠️ Note: {config.MODEL} may not have vision capabilities. Image translation will be attempted anyway.")
+    else:
+        print("ℹ️ Image translation disabled by user")
+
+    total_chapters = len(chapters)
+
+    # Only detect numbering if the toggle is not disabled
+    if config.DISABLE_ZERO_DETECTION:
+        print("📊 0-based detection disabled by user setting")
+        uses_zero_based = False
+        # Important: set a flag that can be checked throughout the codebase
+        config._force_disable_zero_detection = True
+    else:
+        if chapters:
+            uses_zero_based = detect_novel_numbering(chapters)
+            print(f"📊 Novel numbering detected: {'0-based' if uses_zero_based else '1-based'}")
+        else:
+            uses_zero_based = False
+        config._force_disable_zero_detection = False
+
+    # Store this for later use
+    config._uses_zero_based = uses_zero_based
+
+    rng = os.getenv("CHAPTER_RANGE", "")
+    start = None
+    end = None
+    if rng and re.match(r"^\d+\s*-\s*\d+$", rng):
+        start, end = map(int, rng.split("-", 1))
+
+        # Only report how the range will be interpreted when one was actually given
+        if config.DISABLE_ZERO_DETECTION:
+            print(f"📊 0-based detection disabled - using range as specified: {start}-{end}")
+        elif uses_zero_based:
+            print("📊 0-based novel detected")
+            print(f"📊 User range {start}-{end} will be used as-is (chapters are already adjusted)")
+        else:
+            print("📊 1-based novel detected")
+            print(f"📊 Using range as specified: {start}-{end}")
+
+    print("📊 Calculating total chunks needed...")
+    total_chunks_needed = 0
+    chunks_per_chapter = {}
+    chapters_to_process = 0
+
+    # First pass: set actual chapter numbers and count the chunks each chapter needs
+    for idx, c in enumerate(chapters):
+        chap_num = c["num"]
+        content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
+
+        # Extract the raw chapter number from the file
+        raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)
+        #print(f"[DEBUG] Extracted raw_num={raw_num} from {c.get('original_basename', 'unknown')}")
+
+        # Apply the offset
+        offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0
+        raw_num += offset
+
+        # When the toggle is disabled, use raw numbers without any 0-based adjustment
+        if config.DISABLE_ZERO_DETECTION:
+            c['actual_chapter_num'] = raw_num
+            # Store raw number for consistency
+            c['raw_chapter_num'] = raw_num
+            c['zero_adjusted'] = False
+        else:
+            # Store raw number
+            c['raw_chapter_num'] = raw_num
+            # Apply adjustment only if this is a 0-based novel
+            if uses_zero_based:
+                c['actual_chapter_num'] = raw_num + 1
+                c['zero_adjusted'] = True
+            else:
+                c['actual_chapter_num'] = raw_num
+                c['zero_adjusted'] = False
+
+        # Now we can safely use actual_num
+        actual_num = c['actual_chapter_num']
+
+        if start is not None:
+            if not (start <= c['actual_chapter_num'] <= end):
+                #print(f"[SKIP] Chapter {c['actual_chapter_num']} outside range {start}-{end}")
+                continue
+
+        needs_translation, skip_reason, _ = progress_manager.check_chapter_status(
+            idx, actual_num, content_hash, out
+        )
+
+        if not needs_translation:
+            chunks_per_chapter[idx] = 0
+            continue
+
+        chapters_to_process += 1
+
+        # Calculate based on the OUTPUT limit only
+        max_output_tokens = config.MAX_OUTPUT_TOKENS
+        safety_margin_output = 500
+
+        # Korean to English typically compresses to 0.7-0.9x
+        compression_factor = config.COMPRESSION_FACTOR
+        available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
+
+        # Ensure minimum
+        available_tokens = max(available_tokens, 1000)
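+        # Worked example of this budget, assuming the defaults
+        # (MAX_OUTPUT_TOKENS=8192, compression factor 0.9):
+        #     int((8192 - 500) / 0.9) = 8546 source tokens per chunk,
+        # i.e. chunks are sized so the translation still fits in one
+        # 8192-token completion after the expected 0.7-0.9x compression.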
+        #print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")
+
+        # For mixed content chapters, calculate on the body as-is; don't modify
+        # c["body"] during chunk calculation - the count may be slightly off for
+        # image-bearing chapters, but that's OK
+        chunks = chapter_splitter.split_chapter(c["body"], available_tokens)
+
+        chapter_key_str = content_hash
+        old_key_str = str(idx)
+
+        if chapter_key_str not in progress_manager.prog.get("chapter_chunks", {}) and old_key_str in progress_manager.prog.get("chapter_chunks", {}):
+            progress_manager.prog["chapter_chunks"][chapter_key_str] = progress_manager.prog["chapter_chunks"][old_key_str]
+            del progress_manager.prog["chapter_chunks"][old_key_str]
+            #print(f"[PROGRESS] Migrated chunks for chapter {actual_num} to new tracking system")
+
+        # Always count actual chunks - ignore "completed" tracking
+        chunks_per_chapter[idx] = len(chunks)
+        total_chunks_needed += chunks_per_chapter[idx]
+
+    terminology = "Sections" if is_text_file else "Chapters"
+    print(f"📊 Total chunks to translate: {total_chunks_needed}")
+    print(f"📚 {terminology} to process: {chapters_to_process}")
+
+    multi_chunk_chapters = [(idx, count) for idx, count in chunks_per_chapter.items() if count > 1]
+    if multi_chunk_chapters:
+        print(f"📄 {terminology} requiring multiple chunks:")
+        for idx, chunk_count in multi_chunk_chapters:
+            chap = chapters[idx]
+            section_term = "Section" if is_text_file else "Chapter"
+            print(f" • {section_term} {idx+1} ({chap['title'][:30]}...): {chunk_count} chunks")
+
+    translation_start_time = time.time()
+    chunks_completed = 0
+    chapters_completed = 0
+
+    current_chunk_number = 0
+
+    if config.BATCH_TRANSLATION:
+        print("\n📦 PARALLEL TRANSLATION MODE ENABLED")
+        print(f"📦 Processing chapters with up to {config.BATCH_SIZE} concurrent API calls")
+
+        import concurrent.futures
+        from threading import Lock
+
+        progress_lock = Lock()
+
+        chapters_to_translate = []
+
+        # FIX: first pass to set actual chapter numbers for ALL chapters.
+        # This ensures batch mode has the same chapter numbering as non-batch mode
+        print("📊 Setting chapter numbers...")
+        for idx, c in enumerate(chapters):
+            raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)
+
+            # Apply offset if configured
+            offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0
+            raw_num += offset
+
+            if config.DISABLE_ZERO_DETECTION:
+                # Use raw numbers without adjustment
+                c['actual_chapter_num'] = raw_num
+                c['raw_chapter_num'] = raw_num
+                c['zero_adjusted'] = False
+            else:
+                # Store raw number
+                c['raw_chapter_num'] = raw_num
+                # Apply 0-based adjustment if detected
+                if uses_zero_based:
+                    c['actual_chapter_num'] = raw_num + 1
+                    c['zero_adjusted'] = True
+                else:
+                    c['actual_chapter_num'] = raw_num
+                    c['zero_adjusted'] = False
+
+        for idx, c in enumerate(chapters):
+            chap_num = c["num"]
+            content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
+
+            # Check if this is a pre-split text chunk with a decimal number
+            if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)):
+                actual_num = c['num']  # Preserve the decimal for text files only
+            else:
+                actual_num = c.get('actual_chapter_num', c['num'])  # Now this will exist!
+
+            # Skip chapters outside the range
+            if start is not None and not (start <= actual_num <= end):
+                continue
+
+            # Check if the chapter needs translation
+            needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status(
+                idx, actual_num, content_hash, out, c  # Pass the chapter object
+            )
+            # Add an explicit file check for supposedly completed chapters
+            if not needs_translation and existing_file:
+                file_path = os.path.join(out, existing_file)
+                if not os.path.exists(file_path):
+                    print(f"⚠️ Output file missing for chapter {actual_num}: {existing_file}")
+                    needs_translation = True
+                    skip_reason = None
+                    # Update status to file_missing
+                    progress_manager.update(idx, actual_num, content_hash, None, status="file_missing")
+                    progress_manager.save()
+
+            if not needs_translation:
+                # Modify skip_reason to use the appropriate terminology
+                is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
+                terminology = "Section" if is_text_source else "Chapter"
+
+                # Replace "Chapter" with the appropriate terminology in skip_reason
+                skip_reason_modified = skip_reason.replace("Chapter", terminology)
+                print(f"[SKIP] {skip_reason_modified}")
+                chapters_completed += 1
+                continue
+
+            # Check for empty or image-only chapters
+            has_images = c.get('has_images', False)
+            has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"])
+            text_size = c.get('file_size', 0)
+
+            is_empty_chapter = (not has_images and text_size < 10)
+            is_image_only_chapter = (has_images and not has_meaningful_text)
+
+            # Handle empty chapters
+            if is_empty_chapter:
+                print(f"📄 Empty chapter {chap_num} - will process individually")
+
+                # create_chapter_filename accepts the raw num (float or int) directly
+                fname = FileUtilities.create_chapter_filename(c, c['num'])
+                with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
+                    f.write(c["body"])
+                progress_manager.update(idx, actual_num, content_hash, fname, status="completed_empty")
+                progress_manager.save()
+                chapters_completed += 1
+                continue
+
+            # Add to chapters to translate
+            chapters_to_translate.append((idx, c))
+
+        print(f"📊 Found {len(chapters_to_translate)} chapters to translate in parallel")
+
+        # Continue with the rest of the existing batch processing code...
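+        # Concurrency note: BatchTranslationProcessor receives the shared
+        # progress_lock along with progress_manager's save/update callbacks,
+        # so progress writes coming from up to BATCH_SIZE worker threads can
+        # be serialized through a single lock rather than racing on the
+        # progress file.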
+ batch_processor = BatchTranslationProcessor( + config, client, base_msg, out, progress_lock, + progress_manager.save, + lambda idx, actual_num, content_hash, output_file=None, status="completed", **kwargs: progress_manager.update(idx, actual_num, content_hash, output_file, status, **kwargs), + check_stop, + image_translator, + is_text_file=is_text_file + ) + + total_to_process = len(chapters_to_translate) + processed = 0 + + # Apply conservative batching setting + batch_multiplier = 3 if os.getenv('CONSERVATIVE_BATCHING', '0') == '1' else 1 + batch_group_size = config.BATCH_SIZE * batch_multiplier + + if batch_multiplier > 1: + print(f"📦 Using conservative batching: {batch_group_size} chapters per group, {config.BATCH_SIZE} parallel") + else: + print(f"📦 Using direct batching (default): {batch_group_size} chapters per group, {config.BATCH_SIZE} parallel") + + with concurrent.futures.ThreadPoolExecutor(max_workers=config.BATCH_SIZE) as executor: + for batch_start in range(0, total_to_process, batch_group_size): + if check_stop(): + print("❌ Translation stopped during parallel processing") + executor.shutdown(wait=False) + return + + batch_end = min(batch_start + batch_group_size, total_to_process) + current_batch = chapters_to_translate[batch_start:batch_end] + + batch_number = (batch_start // batch_group_size) + 1 + print(f"\n📦 Submitting batch {batch_number}: {len(current_batch)} chapters") + + future_to_chapter = { + executor.submit(batch_processor.process_single_chapter, chapter_data): chapter_data + for chapter_data in current_batch + } + + active_count = 0 + completed_in_batch = 0 + failed_in_batch = 0 + + for future in concurrent.futures.as_completed(future_to_chapter): + if check_stop(): + print("❌ Translation stopped") + executor.shutdown(wait=False) + return + + chapter_data = future_to_chapter[future] + idx, chapter = chapter_data + + try: + success, chap_num = future.result() + if success: + completed_in_batch += 1 + print(f"✅ Chapter {chap_num} done ({completed_in_batch + failed_in_batch}/{len(current_batch)} in batch)") + else: + failed_in_batch += 1 + print(f"❌ Chapter {chap_num} failed ({completed_in_batch + failed_in_batch}/{len(current_batch)} in batch)") + except Exception as e: + failed_in_batch += 1 + print(f"❌ Chapter thread error: {e}") + + processed += 1 + + progress_percent = (processed / total_to_process) * 100 + print(f"📊 Overall Progress: {processed}/{total_to_process} ({progress_percent:.1f}%)") + + print(f"\n📦 Batch Summary:") + print(f" ✅ Successful: {completed_in_batch}") + print(f" ❌ Failed: {failed_in_batch}") + + if batch_end < total_to_process: + print(f"⏳ Waiting {config.DELAY}s before next batch...") + time.sleep(config.DELAY) + + chapters_completed = batch_processor.chapters_completed + chunks_completed = batch_processor.chunks_completed + + print(f"\n🎉 Parallel translation complete!") + print(f" Total chapters processed: {processed}") + + # Count qa_failed chapters correctly + qa_failed_count = 0 + actual_successful = 0 + + for idx, c in enumerate(chapters): + # Get the chapter's actual number + if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): + actual_num = c['num'] + else: + actual_num = c.get('actual_chapter_num', c['num']) + + # Check if this chapter was processed and has qa_failed status + content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) + + # Check if this chapter exists in progress + chapter_info = progress_manager.prog["chapters"].get(content_hash, {}) + status = 
chapter_info.get("status") + + if status == "qa_failed": + qa_failed_count += 1 + elif status == "completed": + actual_successful += 1 + + # Correct the displayed counts + print(f" Successful: {actual_successful}") + if qa_failed_count > 0: + print(f"\n⚠️ {qa_failed_count} chapters failed due to content policy violations:") + qa_failed_chapters = [] + for idx, c in enumerate(chapters): + if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): + actual_num = c['num'] + else: + actual_num = c.get('actual_chapter_num', c['num']) + + content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) + chapter_info = progress_manager.prog["chapters"].get(content_hash, {}) + if chapter_info.get("status") == "qa_failed": + qa_failed_chapters.append(actual_num) + + print(f" Failed chapters: {', '.join(map(str, sorted(qa_failed_chapters)))}") + + # Stop translation completely after batch mode + print("\n📌 Batch translation completed.") + + elif not config.BATCH_TRANSLATION: + translation_processor = TranslationProcessor(config, client, out, log_callback, check_stop, uses_zero_based, is_text_file) + + if config.DUPLICATE_DETECTION_MODE == 'ai-hunter': + # Build the main config from environment variables and config object + main_config = { + 'duplicate_lookback_chapters': config.DUPLICATE_LOOKBACK_CHAPTERS, + 'duplicate_detection_mode': config.DUPLICATE_DETECTION_MODE, + } + + # Check if AI Hunter config was passed via environment variable + ai_hunter_config_str = os.getenv('AI_HUNTER_CONFIG') + if ai_hunter_config_str: + try: + ai_hunter_config = json.loads(ai_hunter_config_str) + main_config['ai_hunter_config'] = ai_hunter_config + print("🤖 AI Hunter: Loaded configuration from environment") + except json.JSONDecodeError: + print("⚠️ AI Hunter: Failed to parse AI_HUNTER_CONFIG from environment") + + # If no AI Hunter config in environment, try to load from file as fallback + if 'ai_hunter_config' not in main_config: + # Try multiple locations for config.json + config_paths = [ + os.path.join(os.getcwd(), 'config.json'), + os.path.join(out, '..', 'config.json'), + ] + + if getattr(sys, 'frozen', False): + config_paths.append(os.path.join(os.path.dirname(sys.executable), 'config.json')) + else: + script_dir = os.path.dirname(os.path.abspath(__file__)) + config_paths.extend([ + os.path.join(script_dir, 'config.json'), + os.path.join(os.path.dirname(script_dir), 'config.json') + ]) + + for config_path in config_paths: + if os.path.exists(config_path): + try: + with open(config_path, 'r', encoding='utf-8') as f: + file_config = json.load(f) + if 'ai_hunter_config' in file_config: + main_config['ai_hunter_config'] = file_config['ai_hunter_config'] + print(f"🤖 AI Hunter: Loaded configuration from {config_path}") + break + except Exception as e: + print(f"⚠️ Failed to load config from {config_path}: {e}") + + # Always create and inject the improved AI Hunter when ai-hunter mode is selected + ai_hunter = ImprovedAIHunterDetection(main_config) + + # The TranslationProcessor class has a method that checks for duplicates + # We need to replace it with our enhanced AI Hunter + + # Create a wrapper to match the expected signature + def enhanced_duplicate_check(self, result, idx, prog, out, actual_num=None): + # If actual_num is not provided, try to get it from progress + if actual_num is None: + # Look for the chapter being processed + for ch_key, ch_info in prog.get("chapters", {}).items(): + if ch_info.get("chapter_idx") == idx: + actual_num = ch_info.get("actual_num", 
idx + 1) + break + + # Fallback to idx+1 if not found + if actual_num is None: + actual_num = idx + 1 + + return ai_hunter.detect_duplicate_ai_hunter_enhanced(result, idx, prog, out, actual_num) + + # Bind the enhanced method to the processor instance + translation_processor.check_duplicate_content = enhanced_duplicate_check.__get__(translation_processor, TranslationProcessor) + + print("🤖 AI Hunter: Using enhanced detection with configurable thresholds") + + # First pass: set actual chapter numbers respecting the config + for idx, c in enumerate(chapters): + raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config) + #print(f"[DEBUG] Extracted raw_num={raw_num} from {c.get('original_basename', 'unknown')}") + + + # Apply offset if configured + offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0 + raw_num += offset + + if config.DISABLE_ZERO_DETECTION: + # Use raw numbers without adjustment + c['actual_chapter_num'] = raw_num + c['raw_chapter_num'] = raw_num + c['zero_adjusted'] = False + else: + # Store raw number + c['raw_chapter_num'] = raw_num + # Apply 0-based adjustment if detected + if uses_zero_based: + c['actual_chapter_num'] = raw_num + 1 + c['zero_adjusted'] = True + else: + c['actual_chapter_num'] = raw_num + c['zero_adjusted'] = False + + # Second pass: process chapters + for idx, c in enumerate(chapters): + chap_num = c["num"] + + # Check if this is a pre-split text chunk with decimal number + if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): + actual_num = c['num'] # Preserve the decimal for text files only + else: + actual_num = c.get('actual_chapter_num', c['num']) + content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) + + if start is not None and not (start <= actual_num <= end): + #print(f"[SKIP] Chapter {actual_num} (file: {c.get('original_basename', 'unknown')}) outside range {start}-{end}") + continue + + needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status( + idx, actual_num, content_hash, out, c # Pass the chapter object + ) + # Add explicit file check for supposedly completed chapters + if not needs_translation and existing_file: + file_path = os.path.join(out, existing_file) + if not os.path.exists(file_path): + print(f"⚠️ Output file missing for chapter {actual_num}: {existing_file}") + needs_translation = True + skip_reason = None + # Update status to file_missing + progress_manager.update(idx, actual_num, content_hash, None, status="file_missing") + progress_manager.save() + if not needs_translation: + # Modify skip_reason to use appropriate terminology + is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) + terminology = "Section" if is_text_source else "Chapter" + + # Replace "Chapter" with appropriate terminology in skip_reason + skip_reason_modified = skip_reason.replace("Chapter", terminology) + print(f"[SKIP] {skip_reason_modified}") + continue + + chapter_position = f"{chapters_completed + 1}/{chapters_to_process}" + + # Determine if this is a text file + is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) + terminology = "Section" if is_text_source else "Chapter" + + # Determine file reference based on type + if c.get('is_chunk', False): + file_ref = f"Section_{c['num']}" + else: + file_ref = c.get('original_basename', f'{terminology}_{actual_num}') + + print(f"\n🔄 Processing #{idx+1}/{total_chapters} 
(Actual: {terminology} {actual_num}) ({chapter_position} to translate): {c['title']} [File: {file_ref}]") + + chunk_context_manager.start_chapter(chap_num, c['title']) + + has_images = c.get('has_images', False) + has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"]) + text_size = c.get('file_size', 0) + + is_empty_chapter = (not has_images and text_size < 10) + is_image_only_chapter = (has_images and not has_meaningful_text) + is_mixed_content = (has_images and has_meaningful_text) + is_text_only = (not has_images and has_meaningful_text) + + if is_empty_chapter: + print(f"📄 Empty chapter {actual_num} detected") + + # Create filename for empty chapter + if isinstance(c['num'], float): + fname = FileUtilities.create_chapter_filename(c, c['num']) + else: + fname = FileUtilities.create_chapter_filename(c, actual_num) + + # Save original content + with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: + f.write(c["body"]) + + # Update progress tracking + progress_manager.update(idx, actual_num, content_hash, fname, status="completed_empty") + progress_manager.save() + chapters_completed += 1 + + # CRITICAL: Skip translation! + continue + + elif is_image_only_chapter: + print(f"📸 Image-only chapter: {c.get('image_count', 0)} images") + + translated_html = c["body"] + image_translations = {} + + # Step 1: Process images if image translation is enabled + if image_translator and config.ENABLE_IMAGE_TRANSLATION: + print(f"🖼️ Translating {c.get('image_count', 0)} images...") + image_translator.set_current_chapter(chap_num) + + translated_html, image_translations = process_chapter_images( + c["body"], + actual_num, + image_translator, + check_stop + ) + + if image_translations: + print(f"✅ Translated {len(image_translations)} images") + + # Step 2: Check for headers/titles that need translation + from bs4 import BeautifulSoup + soup = BeautifulSoup(c["body"], 'html.parser') + + # Look for headers + headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']) + + # If we have headers, we should translate them even in "image-only" chapters + if headers and any(h.get_text(strip=True) for h in headers): + print(f"📝 Found headers to translate in image-only chapter") + + # Create a minimal HTML with just the headers for translation + headers_html = "" + for header in headers: + if header.get_text(strip=True): + headers_html += str(header) + "\n" + + if headers_html: + print(f"📤 Translating chapter headers...") + + # Send just the headers for translation + header_msgs = base_msg + [{"role": "user", "content": headers_html}] + + # Use the standard filename + fname = FileUtilities.create_chapter_filename(c, actual_num) + client.set_output_filename(fname) + + # Simple API call for headers + header_result, _ = client.send( + header_msgs, + temperature=config.TEMP, + max_tokens=config.MAX_OUTPUT_TOKENS + ) + + if header_result: + # Clean the result + header_result = re.sub(r"^```(?:html)?\s*\n?", "", header_result, count=1, flags=re.MULTILINE) + header_result = re.sub(r"\n?```\s*$", "", header_result, count=1, flags=re.MULTILINE) + + # Parse both the translated headers and the original body + soup_headers = BeautifulSoup(header_result, 'html.parser') + soup_body = BeautifulSoup(translated_html, 'html.parser') + + # Replace headers in the body with translated versions + translated_headers = soup_headers.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']) + original_headers = soup_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']) + + # Match and replace 
headers
+                            for orig, trans in zip(original_headers, translated_headers):
+                                if trans and trans.get_text(strip=True):
+                                    orig.string = trans.get_text(strip=True)
+
+                            translated_html = str(soup_body)
+                            print("✅ Headers translated successfully")
+                            status = "completed"
+                        else:
+                            print("⚠️ Failed to translate headers")
+                            status = "completed_image_only"
+                    else:
+                        status = "completed_image_only"
+                else:
+                    print("ℹ️ No headers found to translate")
+                    status = "completed_image_only"
+
+                # Step 3: Save with the correct filename
+                fname = FileUtilities.create_chapter_filename(c, actual_num)
+
+                with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
+                    f.write(translated_html)
+
+                print(f"[Chapter {idx+1}/{total_chapters}] ✅ Saved image-only chapter")
+                progress_manager.update(idx, actual_num, content_hash, fname, status=status)
+                progress_manager.save()
+                chapters_completed += 1
+                continue
+
+            else:
+                # Set default text to translate
+                text_to_translate = c["body"]
+                image_translations = {}
+                if is_mixed_content and image_translator and config.ENABLE_IMAGE_TRANSLATION:
+                    print(f"🖼️ Processing {c.get('image_count', 0)} images first...")
+
+                    print("[DEBUG] Content before image processing (first 200 chars):")
+                    print(c["body"][:200])
+                    print(f"[DEBUG] Has h1 tags: {'<h1' in c['body']}")
+                    print(f"[DEBUG] Has h2 tags: {'<h2' in c['body']}")
+
+                    image_translator.set_current_chapter(chap_num)
+
+                    # Store the original body before processing
+                    original_body = c["body"]
+
+                    # Calculate original chapter tokens before modification
+                    original_chapter_tokens = chapter_splitter.count_tokens(original_body)
+
+                    # Process images and get the body with translations
+                    body_with_images, image_translations = process_chapter_images(
+                        c["body"],
+                        actual_num,
+                        image_translator,
+                        check_stop
+                    )
+
+                    if image_translations:
+                        print(f"✅ Translated {len(image_translations)} images")
+
+                        # Store the body with images for later merging
+                        c["body_with_images"] = c["body"]
+
+                        # For chapters with only images and a title, we still need to translate the title.
+                        # Extract clean text for translation from the ORIGINAL body
+                        from bs4 import BeautifulSoup
+                        soup_clean = BeautifulSoup(original_body, 'html.parser')
+
+                        # Remove images from the original to get pure text
+                        for img in soup_clean.find_all('img'):
+                            img.decompose()
+
+                        # Set clean text for translation - use prettify() or str() on the full document
+                        c["body"] = str(soup_clean) if soup_clean.body else original_body
+
+                        # If there's no meaningful text content after removing images,
+                        # the text translation will just translate the title, which is correct
+                        print(f" 📝 Clean text for translation: {len(c['body'])} chars")
+
+                        # Update text_size to reflect the actual text to translate
+                        text_size = len(c["body"])
+
+                        # Recalculate the actual token count for the clean text
+                        actual_text_tokens = chapter_splitter.count_tokens(c["body"])
+                        print(f" 📊 Actual text tokens: {actual_text_tokens} (was counting {original_chapter_tokens} with images)")
+                    else:
+                        print("ℹ️ No translatable text found in images")
+                        # Keep the original body if no image translations
+                        c["body"] = original_body
+
+                print(f"📖 Translating text content ({text_size} characters)")
+                progress_manager.update(idx, actual_num, content_hash, output_file=None, status="in_progress")
+                progress_manager.save()
+
+                # Apply ignore filtering to the content before chunk splitting
+                batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
+                ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
+                ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
+
+                if (ignore_title_tag or ignore_header_tags) and c["body"]:
+                    from bs4 import BeautifulSoup
+                    content_soup = BeautifulSoup(c["body"], 'html.parser')
+
+                    # Remove title tags if ignored
+                    if ignore_title_tag:
+                        for title_tag in content_soup.find_all('title'):
+                            title_tag.decompose()
+
+                    # Remove header tags if ignored
+                    if ignore_header_tags:
+                        for header_tag in content_soup.find_all(['h1', 'h2', 'h3']):
+                            header_tag.decompose()
+
+                    c["body"] = str(content_soup)  # Update the chapter body
+
+                # Check if this chapter is already a chunk from text file splitting
+                if c.get('is_chunk', False):
+                    # This is already a pre-split chunk, but still check whether it needs further splitting.
+                    # Calculate based on the OUTPUT limit only
+                    max_output_tokens = config.MAX_OUTPUT_TOKENS
+                    safety_margin_output = 500
+
+                    # CJK to English typically compresses to 0.7-0.9x
+                    compression_factor = config.COMPRESSION_FACTOR
+                    available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
+
+                    # Ensure minimum
+                    available_tokens = max(available_tokens, 1000)
+
+                    print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")
+
+                    chapter_tokens =
chapter_splitter.count_tokens(c["body"]) + + if chapter_tokens > available_tokens: + # Even pre-split chunks might need further splitting + chunks = chapter_splitter.split_chapter(c["body"], available_tokens) + print(f"📄 Section {c['num']} (pre-split from text file) needs further splitting into {len(chunks)} chunks") + else: + chunks = [(c["body"], 1, 1)] + print(f"📄 Section {c['num']} (pre-split from text file)") + else: + # Normal splitting logic for non-text files + # Calculate based on OUTPUT limit only + max_output_tokens = config.MAX_OUTPUT_TOKENS + safety_margin_output = 500 + + # CJK to English typically compresses to 0.7-0.9x + compression_factor = config.COMPRESSION_FACTOR + available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor) + + # Ensure minimum + available_tokens = max(available_tokens, 1000) + + print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})") + + chunks = chapter_splitter.split_chapter(c["body"], available_tokens) + + # Use consistent terminology + is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) + terminology = "Section" if is_text_source else "Chapter" + print(f"📄 {terminology} will be processed in {len(chunks)} chunk(s)") + + # Recalculate tokens on the actual text to be translated + actual_chapter_tokens = chapter_splitter.count_tokens(c["body"]) + + if len(chunks) > 1: + is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) + terminology = "Section" if is_text_source else "Chapter" + print(f" ℹ️ {terminology} size: {actual_chapter_tokens:,} tokens (limit: {available_tokens:,} tokens per chunk)") + else: + is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) + terminology = "Section" if is_text_source else "Chapter" + print(f" ℹ️ {terminology} size: {actual_chapter_tokens:,} tokens (within limit of {available_tokens:,} tokens)") + + chapter_key_str = str(idx) + if chapter_key_str not in progress_manager.prog["chapter_chunks"]: + progress_manager.prog["chapter_chunks"][chapter_key_str] = { + "total": len(chunks), + "completed": [], + "chunks": {} + } + + progress_manager.prog["chapter_chunks"][chapter_key_str]["total"] = len(chunks) + + translated_chunks = [] + + for chunk_html, chunk_idx, total_chunks in chunks: + chapter_key_str = content_hash + old_key_str = str(idx) + + if chapter_key_str not in progress_manager.prog.get("chapter_chunks", {}) and old_key_str in progress_manager.prog.get("chapter_chunks", {}): + progress_manager.prog["chapter_chunks"][chapter_key_str] = progress_manager.prog["chapter_chunks"][old_key_str] + del progress_manager.prog["chapter_chunks"][old_key_str] + #print(f"[PROGRESS] Migrated chunks for chapter {chap_num} to new tracking system") + + if chapter_key_str not in progress_manager.prog["chapter_chunks"]: + progress_manager.prog["chapter_chunks"][chapter_key_str] = { + "total": len(chunks), + "completed": [], + "chunks": {} + } + + progress_manager.prog["chapter_chunks"][chapter_key_str]["total"] = len(chunks) + + # Get chapter status to check for qa_failed + chapter_info = progress_manager.prog["chapters"].get(chapter_key_str, {}) + chapter_status = chapter_info.get("status") + + if chapter_status == "qa_failed": + # Force retranslation of qa_failed chapters + print(f" [RETRY] Chunk {chunk_idx}/{total_chunks} - retranslating due to QA failure") + + if config.CONTEXTUAL and 
history_manager.will_reset_on_next_append(config.HIST_LIMIT): + print(f" 📌 History will reset after this chunk (current: {len(history_manager.load_history())//2}/{config.HIST_LIMIT} exchanges)") + + if check_stop(): + print(f"❌ Translation stopped during chapter {actual_num}, chunk {chunk_idx}") + return + + current_chunk_number += 1 + + progress_percent = (current_chunk_number / total_chunks_needed) * 100 if total_chunks_needed > 0 else 0 + + if chunks_completed > 0: + elapsed_time = time.time() - translation_start_time + avg_time_per_chunk = elapsed_time / chunks_completed + remaining_chunks = total_chunks_needed - current_chunk_number + 1 + eta_seconds = remaining_chunks * avg_time_per_chunk + + eta_hours = int(eta_seconds // 3600) + eta_minutes = int((eta_seconds % 3600) // 60) + eta_str = f"{eta_hours}h {eta_minutes}m" if eta_hours > 0 else f"{eta_minutes}m" + else: + eta_str = "calculating..." + + if total_chunks > 1: + print(f" 🔄 Translating chunk {chunk_idx}/{total_chunks} for #{idx+1} (Overall: {current_chunk_number}/{total_chunks_needed} - {progress_percent:.1f}% - ETA: {eta_str})") + print(f" ⏳ Chunk size: {len(chunk_html):,} characters (~{chapter_splitter.count_tokens(chunk_html):,} tokens)") + else: + # Determine terminology and file reference + is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) + terminology = "Section" if is_text_source else "Chapter" + + # Consistent file reference + if c.get('is_chunk', False): + file_ref = f"Section_{c['num']}" + else: + file_ref = c.get('original_basename', f'{terminology}_{actual_num}') + + print(f" 📄 Translating {terminology.lower()} content (Overall: {current_chunk_number}/{total_chunks_needed} - {progress_percent:.1f}% - ETA: {eta_str}) [File: {file_ref}]") + print(f" 📊 {terminology} {actual_num} size: {len(chunk_html):,} characters (~{chapter_splitter.count_tokens(chunk_html):,} tokens)") + + print(f" ℹ️ This may take 30-60 seconds. 
Stop will take effect after completion.") + + if log_callback: + if hasattr(log_callback, '__self__') and hasattr(log_callback.__self__, 'append_chunk_progress'): + if total_chunks == 1: + # Determine terminology based on source type + is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) + terminology = "Section" if is_text_source else "Chapter" + + log_callback.__self__.append_chunk_progress( + 1, 1, "text", + f"{terminology} {actual_num}", + overall_current=current_chunk_number, + overall_total=total_chunks_needed, + extra_info=f"{len(chunk_html):,} chars" + ) + else: + log_callback.__self__.append_chunk_progress( + chunk_idx, + total_chunks, + "text", + f"{terminology} {actual_num}", + overall_current=current_chunk_number, + overall_total=total_chunks_needed + ) + else: + # Determine terminology based on source type + is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) + terminology = "Section" if is_text_source else "Chapter" + terminology_lower = "section" if is_text_source else "chapter" + + if total_chunks == 1: + log_callback(f"📄 Processing {terminology} {actual_num} ({chapters_completed + 1}/{chapters_to_process}) - {progress_percent:.1f}% complete") + else: + log_callback(f"📄 processing chunk {chunk_idx}/{total_chunks} for {terminology_lower} {actual_num} - {progress_percent:.1f}% complete") + + # Get custom chunk prompt template from environment + chunk_prompt_template = os.getenv("TRANSLATION_CHUNK_PROMPT", "[PART {chunk_idx}/{total_chunks}]\n{chunk_html}") + + if total_chunks > 1: + user_prompt = chunk_prompt_template.format( + chunk_idx=chunk_idx, + total_chunks=total_chunks, + chunk_html=chunk_html + ) + else: + user_prompt = chunk_html + + if config.CONTEXTUAL: + history = history_manager.load_history() + trimmed = history[-config.HIST_LIMIT*2:] + chunk_context = chunk_context_manager.get_context_messages(limit=2) + else: + history = [] # Set empty history when not contextual + trimmed = [] + chunk_context = [] + + # Build the current system prompt from the original each time, and append the last summary block if present + current_system_content = original_system_prompt + if config.USE_ROLLING_SUMMARY and last_summary_block_text: + current_system_content = ( + current_system_content + + "\n\n[Rolling Summary of Previous Chapter]\n" + + "(For AI: Use as context only; do not include in output)\n" + + last_summary_block_text + + "\n[End of Rolling Summary]" + ) + current_base = [{"role": "system", "content": current_system_content}] + # If we have a prepared rolling summary from previous chapter, include it as a separate message (do NOT mutate system prompt) + summary_msgs_list = [] + if config.USE_ROLLING_SUMMARY and last_summary_block_text: + summary_msgs_list = [{ + "role": os.getenv("SUMMARY_ROLE", "user"), + "content": ( + "CONTEXT ONLY - DO NOT INCLUDE IN TRANSLATION:\n" + "[MEMORY] Previous context summary:\n\n" + f"{last_summary_block_text}\n\n" + "[END MEMORY]\n" + "END OF CONTEXT - BEGIN ACTUAL CONTENT TO TRANSLATE:" + ) + }] + msgs = current_base + summary_msgs_list + chunk_context + trimmed + [{"role": "user", "content": user_prompt}] + + c['__index'] = idx + c['__progress'] = progress_manager.prog + c['history_manager'] = history_manager + + result, finish_reason = translation_processor.translate_with_retry( + msgs, chunk_html, c, chunk_idx, total_chunks + ) + + if result is None: + progress_manager.update(idx, actual_num, content_hash, output_file=None, status="failed") 
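+                        # translate_with_retry returns None when a chunk could not be
+                        # translated (e.g. retries exhausted or a stop request); record
+                        # the failure so a later run can retry it, save, and move on to
+                        # the next chunk instead of aborting the whole job.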
+ progress_manager.save() + continue + + if config.REMOVE_AI_ARTIFACTS: + result = ContentProcessor.clean_ai_artifacts(result, True) + + if config.EMERGENCY_RESTORE: + result = ContentProcessor.emergency_restore_paragraphs(result, chunk_html) + + if config.REMOVE_AI_ARTIFACTS: + lines = result.split('\n') + + json_line_count = 0 + for i, line in enumerate(lines[:5]): + if line.strip() and any(pattern in line for pattern in [ + '"role":', '"content":', '"messages":', + '{"role"', '{"content"', '[{', '}]' + ]): + json_line_count = i + 1 + else: + break + + if json_line_count > 0 and json_line_count < len(lines): + remaining = '\n'.join(lines[json_line_count:]) + if remaining.strip() and len(remaining) > 100: + result = remaining + print(f"✂️ Removed {json_line_count} lines of JSON artifacts") + + result = re.sub(r'\[PART \d+/\d+\]\s*', '', result, flags=re.IGNORECASE) + + translated_chunks.append((result, chunk_idx, total_chunks)) + + chunk_context_manager.add_chunk(user_prompt, result, chunk_idx, total_chunks) + + progress_manager.prog["chapter_chunks"][chapter_key_str]["completed"].append(chunk_idx) + progress_manager.prog["chapter_chunks"][chapter_key_str]["chunks"][str(chunk_idx)] = result + progress_manager.save() + + chunks_completed += 1 + + will_reset = history_manager.will_reset_on_next_append( + config.HIST_LIMIT if config.CONTEXTUAL else 0, + config.TRANSLATION_HISTORY_ROLLING + ) + + + history = history_manager.append_to_history( + user_prompt, + result, + config.HIST_LIMIT if config.CONTEXTUAL else 0, + reset_on_limit=True, + rolling_window=config.TRANSLATION_HISTORY_ROLLING + ) + + if chunk_idx < total_chunks: + # Handle float delays while checking for stop + full_seconds = int(config.DELAY) + fractional_second = config.DELAY - full_seconds + + # Check stop signal every second for full seconds + for i in range(full_seconds): + if check_stop(): + print("❌ Translation stopped during delay") + return + time.sleep(1) + + # Handle the fractional part if any + if fractional_second > 0: + if check_stop(): + print("❌ Translation stopped during delay") + return + time.sleep(fractional_second) + + if check_stop(): + print(f"❌ Translation stopped before saving chapter {actual_num}") + return + + if len(translated_chunks) > 1: + print(f" 📎 Merging {len(translated_chunks)} chunks...") + translated_chunks.sort(key=lambda x: x[1]) + merged_result = chapter_splitter.merge_translated_chunks(translated_chunks) + else: + merged_result = translated_chunks[0][0] if translated_chunks else "" + + if config.CONTEXTUAL and len(translated_chunks) > 1: + user_summary, assistant_summary = chunk_context_manager.get_summary_for_history() + + if user_summary and assistant_summary: + history_manager.append_to_history( + user_summary, + assistant_summary, + config.HIST_LIMIT, + reset_on_limit=False, + rolling_window=config.TRANSLATION_HISTORY_ROLLING + ) + print(f" 📝 Added chapter summary to history") + + chunk_context_manager.clear() + + # For text file chunks, ensure we pass the decimal number + if is_text_file and c.get('is_chunk', False) and isinstance(c.get('num'), float): + fname = FileUtilities.create_chapter_filename(c, c['num']) # Use the decimal num directly + else: + fname = FileUtilities.create_chapter_filename(c, actual_num) + + client.set_output_filename(fname) + cleaned = re.sub(r"^```(?:html)?\s*\n?", "", merged_result, count=1, flags=re.MULTILINE) + cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE) + + cleaned = ContentProcessor.clean_ai_artifacts(cleaned, 
remove_artifacts=config.REMOVE_AI_ARTIFACTS)
+
+                if is_mixed_content and image_translations:
+                    print(f"🔀 Merging {len(image_translations)} image translations with text...")
+                    from bs4 import BeautifulSoup
+                    # Parse the translated text (which has the translated title/header)
+                    soup_translated = BeautifulSoup(cleaned, 'html.parser')
+
+                    # For each image translation, insert it into the document.
+                    # Minimal reconstruction: skip empty results and append each
+                    # translation block to the document; the original may have
+                    # anchored each block at its matching img tag instead.
+                    for img_path, translation_html in image_translations.items():
+                        if translation_html and '<' in translation_html:
+                            soup_translated.append(BeautifulSoup(translation_html, 'html.parser'))
+
+                    cleaned = str(soup_translated)
+
+                # Save the finished chapter and record progress (mirrors the
+                # empty- and image-only branches above)
+                with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
+                    f.write(cleaned)
+                progress_manager.update(idx, actual_num, content_hash, fname, status="completed")
+                progress_manager.save()
+                chapters_completed += 1
+
+    if is_text_file:
+        # Combine the translated sections into one plain-text file
+        try:
+            combined_path = os.path.join(out, "translated_combined.txt")  # output name assumed
+            with open(combined_path, 'w', encoding='utf-8') as combined:
+                current_main_chapter = None
+                # translated_sections (name assumed) holds (chapter_data, content,
+                # chunk_idx, total_chunks) tuples for the sections written above;
+                # the grouping below is a best-effort sketch of the original logic
+                for i, (chapter_data, content, chunk_idx, total_chunks) in enumerate(translated_sections):
+                    if chapter_data.get('is_chunk', False):
+                        # First chunk of a chapter: separator plus the chapter title
+                        if chunk_idx == 1:
+                            current_main_chapter = chapter_data['num']
+                            if i > 0:
+                                combined.write(f"\n\n{'='*50}\n\n")
+
+                            # Write the original chapter title (without Part X/Y suffix)
+                            original_title = chapter_data['title']
+                            # Remove the (Part X/Y) suffix if present
+                            if ' (Part ' in original_title:
+                                original_title = original_title.split(' (Part ')[0]
+
+                            combined.write(f"{original_title}\n\n")
+
+                        # Add the chunk content
+                        combined.write(content)
+
+                        # Add spacing between chunks of the same chapter
+                        if chunk_idx < total_chunks:
+                            combined.write("\n\n")
+                    else:
+                        # This is a standalone chapter
+                        current_main_chapter = chapter_data['num']
+
+                        # Add separator if not first chapter
+                        if i > 0:
+                            combined.write(f"\n\n{'='*50}\n\n")
+
+                        # Write the chapter title
+                        combined.write(f"{chapter_data['title']}\n\n")
+
+                        # Add the content
+                        combined.write(content)
+
+            print(f" • Combined file with preserved sections: {combined_path}")
+
+            total_time = time.time() - translation_start_time
+            hours = int(total_time // 3600)
+            minutes = int((total_time % 3600) // 60)
+            seconds = int(total_time % 60)
+
+            print(f"\n⏱️ Total translation time: {hours}h {minutes}m {seconds}s")
+            print(f"📊 Chapters completed: {chapters_completed}")
+            print("✅ Text file translation complete!")
+
+            if log_callback:
+                log_callback(f"✅ Text file translation complete! Created {combined_path}")
+
+        except Exception as e:
+            print(f"❌ Error creating combined text file: {e}")
+            if log_callback:
+                log_callback(f"❌ Error creating combined text file: {e}")
+    else:
+        print("🔍 Checking for translated chapters...")
+        # Respect the retain-extension toggle: if enabled, don't look for the response_ prefix
+        if should_retain_source_extension():
+            response_files = [f for f in os.listdir(out) if f.endswith('.html') and not f.startswith('chapter_')]
+        else:
+            response_files = [f for f in os.listdir(out) if f.startswith('response_') and f.endswith('.html')]
+        chapter_files = [f for f in os.listdir(out) if f.startswith('chapter_') and f.endswith('.html')]
+
+        if not response_files and chapter_files:
+            if should_retain_source_extension():
+                print(f"⚠️ No translated files found, but {len(chapter_files)} original chapters exist")
+                print("ℹ️ Retain-source-extension mode is ON: skipping placeholder creation and using original files for EPUB compilation.")
+            else:
+                print(f"⚠️ No translated files found, but {len(chapter_files)} original chapters exist")
+                print("📝 Creating placeholder response files for EPUB compilation...")
+
+                for chapter_file in chapter_files:
+                    response_file = chapter_file.replace('chapter_', 'response_', 1)
+                    src = os.path.join(out, chapter_file)
+                    dst = os.path.join(out, response_file)
+
+                    try:
+                        with open(src, 'r', encoding='utf-8') as f:
+                            content = f.read()
+
+                        soup = BeautifulSoup(content, 'html.parser')
+                        notice = soup.new_tag('p')
+                        notice.string = "[Note: This chapter could not be translated - showing original content]"
+                        notice['style'] = "color: red; font-style: italic;"
+
+                        if soup.body:
+                            soup.body.insert(0, notice)
+
+                        with open(dst, 'w', encoding='utf-8') as f:
+                            f.write(str(soup))
+
+                    except Exception as e:
+                        print(f"⚠️ Error processing {chapter_file}: {e}")
+                        try:
+                            shutil.copy2(src, dst)
+                        except Exception:
+                            # Fall back silently if even a plain copy fails
+                            pass
+
+                print(f"✅ Created {len(chapter_files)} placeholder response files")
+                print("⚠️ Note: The EPUB will contain untranslated content")
+
+        print("📘 Building final EPUB…")
+        try:
+            from epub_converter import fallback_compile_epub
+            fallback_compile_epub(out, log_callback=log_callback)
+            print("✅ All done: your final EPUB is in", out)
+
+            total_time = time.time() - translation_start_time
+            hours = int(total_time // 3600)
+            minutes = int((total_time % 3600) // 60)
+            seconds = int(total_time % 60)
+
+            print("\n📊 Translation Statistics:")
+            print(f" • Total chunks processed: {chunks_completed}")
+            print(f" • Total time: {hours}h {minutes}m {seconds}s")
+            if chunks_completed > 0:
+                avg_time = total_time / chunks_completed
+                print(f" • Average time per chunk: {avg_time:.1f} seconds")
+
+            stats = progress_manager.get_stats(out)
+            print("\n📊 Progress Tracking Summary:")
+            print(f" • Total chapters tracked: {stats['total_tracked']}")
+            print(f" • Successfully completed: {stats['completed']}")
+            print(f" • Missing files: {stats['missing_files']}")
+            print(f" • In progress: {stats['in_progress']}")
+
+        except Exception as e:
+            print("❌ EPUB build failed:", e)
+
+    print("TRANSLATION_COMPLETE_SIGNAL")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file