# Chapter_Extractor.py - Module-level chapter extraction functions
import os
import re
import sys
import json
import threading
import time
import shutil
import hashlib
import warnings
# Lazy import for PatternManager to speed up ProcessPoolExecutor worker startup on Windows
# The heavy TransateKRtoEN import is deferred until actually needed
_PatternManager = None
_PM = None
def _get_pattern_manager():
"""Lazy initialization of PatternManager to avoid slow imports in worker processes"""
global _PatternManager, _PM
if _PatternManager is None:
from TransateKRtoEN import PatternManager as PM_Class
_PatternManager = PM_Class
_PM = PM_Class()
return _PM
# For backward compatibility - property-like access
class _LazyPM:
def __getattr__(self, name):
return getattr(_get_pattern_manager(), name)
PM = _LazyPM()
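# Illustrative usage of the lazy proxy (a sketch; CHAPTER_PATTERNS and
# CHINESE_NUMS are attributes this module reads from PatternManager later on):
#
#     patterns = PM.CHAPTER_PATTERNS   # first access triggers the deferred import
#     nums = PM.CHINESE_NUMS           # later accesses reuse the cached _PM instance
#
# Worker processes that never touch PM skip the TransateKRtoEN import entirely,
# which is the point of deferring it.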
from bs4 import BeautifulSoup
try:
from bs4 import XMLParsedAsHTMLWarning
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
except ImportError:
pass
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from collections import Counter
# Stop request function (can be overridden)
def is_stop_requested():
"""Check if stop has been requested - default implementation"""
return False
# Progress bar for terminal output
class ProgressBar:
"""Simple in-place progress bar for terminal output"""
_last_line_length = 0
@classmethod
def update(cls, current, total, prefix="Progress", bar_length=30):
if total == 0:
return
percent = min(100, int(100 * current / total))
filled = int(bar_length * current / total)
bar = '█' * filled + '░' * (bar_length - filled)
line = f"\r{prefix}: [{bar}] {current}/{total} ({percent}%)"
if len(line) < cls._last_line_length:
line += ' ' * (cls._last_line_length - len(line))
cls._last_line_length = len(line)
print(line, end='', flush=True)
@classmethod
def finish(cls):
print()
cls._last_line_length = 0
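# Minimal usage sketch for ProgressBar (the work loop is hypothetical):
#
#     total = 120
#     for i in range(1, total + 1):
#         process_item(i)  # hypothetical unit of work
#         ProgressBar.update(i, total, prefix="📦 Extracting")
#     ProgressBar.finish()
#
# update() redraws the same terminal line via '\r', padding with spaces when the
# new line is shorter than the previous one; finish() prints the trailing
# newline and resets the padding tracker.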
# Helper for resource filename sanitization
def sanitize_resource_filename(filename):
"""Sanitize resource filenames to be filesystem-safe"""
import unicodedata
# Normalize unicode - use NFC to preserve Korean/CJK characters
# NFKD decomposes Korean Hangul into jamo components, corrupting them
filename = unicodedata.normalize('NFC', filename)
# Remove or replace problematic characters
filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
return filename
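# Illustrative behavior (example filenames are hypothetical):
#
#     sanitize_resource_filename('커버: 이미지?.png')  # -> '커버_ 이미지_.png'
#     sanitize_resource_filename('fonts/Nanum.ttf')    # -> 'fonts_Nanum.ttf'
#
# NFC keeps each Hangul syllable as a single code point; NFKD would have
# decomposed syllables into jamo before the unsafe-character replacement ran.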
def _get_best_parser():
"""Determine the best parser available, preferring lxml for CJK text"""
try:
import lxml
return 'lxml'
except ImportError:
return 'html.parser'
def _sort_by_opf_spine(chapters, opf_path):
"""Sort chapters according to OPF spine order"""
try:
import xml.etree.ElementTree as ET
# Read OPF file
with open(opf_path, 'r', encoding='utf-8') as f:
opf_content = f.read()
# Parse OPF
root = ET.fromstring(opf_content)
# Find namespaces
ns = {'opf': 'http://www.idpf.org/2007/opf'}
if root.tag.startswith('{'):
default_ns = root.tag[1:root.tag.index('}')]
ns = {'opf': default_ns}
# Build manifest map (id -> href)
manifest = {}
for item in root.findall('.//opf:manifest/opf:item', ns):
item_id = item.get('id')
href = item.get('href')
if item_id and href:
manifest[item_id] = href
# Get spine order
spine_order = []
spine = root.find('.//opf:spine', ns)
if spine is not None:
for itemref in spine.findall('opf:itemref', ns):
idref = itemref.get('idref')
if idref and idref in manifest:
href = manifest[idref]
spine_order.append(href)
if not spine_order:
print("⚠️ No spine order found in OPF, keeping original order")
return chapters
# Create a mapping of filenames to spine position
spine_map = {}
for idx, href in enumerate(spine_order):
# Try different matching strategies
basename = os.path.basename(href)
spine_map[basename] = idx
spine_map[href] = idx
# Also store without extension for flexible matching
name_no_ext = os.path.splitext(basename)[0]
spine_map[name_no_ext] = idx
print(f"📋 OPF spine contains {len(spine_order)} items")
# Sort chapters based on spine order
def get_spine_position(chapter):
# Try to match chapter to spine
filename = chapter.get('filename', '')
basename = chapter.get('original_basename', '')
# Try exact filename match
if filename in spine_map:
return spine_map[filename]
# Try basename match
if basename in spine_map:
return spine_map[basename]
# Try basename of filename
if filename:
fname_base = os.path.basename(filename)
if fname_base in spine_map:
return spine_map[fname_base]
# Try without extension
if basename:
if basename + '.html' in spine_map:
return spine_map[basename + '.html']
if basename + '.xhtml' in spine_map:
return spine_map[basename + '.xhtml']
            # Fallback: add a large offset so unmatched chapters sort after every spine item
            return 1000000 + chapter.get('num', 0)
# Sort chapters
sorted_chapters = sorted(chapters, key=get_spine_position)
# Renumber chapters based on new order
for idx, chapter in enumerate(sorted_chapters, 1):
chapter['spine_order'] = idx
# Optionally update chapter numbers to match spine order
# chapter['num'] = idx # Uncomment if you want to renumber
        # Log reordering info (track positions by object identity to avoid
        # false matches between equal dicts and the O(n^2) cost of list.index)
        original_positions = {id(ch): i for i, ch in enumerate(chapters)}
        reordered_count = sum(1 for idx, chapter in enumerate(sorted_chapters)
                              if original_positions[id(chapter)] != idx)
if reordered_count > 0:
print(f"🔄 Reordered {reordered_count} chapters to match OPF spine")
else:
print(f"✅ Chapter order already matches OPF spine")
return sorted_chapters
except Exception as e:
print(f"⚠️ Could not sort by OPF spine: {e}")
import traceback
traceback.print_exc()
return chapters
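# For reference: a hypothetical spine href 'Text/chapter01.xhtml' at spine
# index 0 yields three spine_map keys, one per matching strategy above, so a
# chapter can match by full href, basename, or extensionless basename:
#
#     {'chapter01.xhtml': 0, 'Text/chapter01.xhtml': 0, 'chapter01': 0}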
def protect_angle_brackets_with_korean(text: str) -> str:
"""Protect CJK text in angle brackets from HTML parsing"""
if text is None:
return ""
import re
# Extended pattern to include Korean, Chinese, and Japanese characters
cjk_pattern = r'[가-힣ㄱ-ㅎㅏ-ㅣ一-龿ぁ-ゟァ-ヿ]'
bracket_pattern = rf'<([^<>]*{cjk_pattern}[^<>]*)>'
def replace_brackets(match):
content = match.group(1)
return f'&#60;{content}&#62;'
return re.sub(bracket_pattern, replace_brackets, text)
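# Illustrative behavior (the input string is hypothetical): only brackets whose
# content contains CJK characters are escaped, so genuine HTML tags survive.
#
#     protect_angle_brackets_with_korean('그는 <정령왕>이라 했다. <b>강조</b>')
#     # -> '그는 &#60;정령왕&#62;이라 했다. <b>강조</b>'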
def ensure_all_opf_chapters_extracted(zf, chapters, out):
"""Ensure ALL chapters from OPF spine are extracted, not just what ChapterExtractor found"""
# Parse OPF to get ALL chapters in spine
opf_chapters = []
try:
# Find content.opf
opf_content = None
for name in zf.namelist():
if name.endswith('content.opf'):
opf_content = zf.read(name)
break
if not opf_content:
return chapters # No OPF, return original
import xml.etree.ElementTree as ET
root = ET.fromstring(opf_content)
# Handle namespaces
ns = {'opf': 'http://www.idpf.org/2007/opf'}
if root.tag.startswith('{'):
default_ns = root.tag[1:root.tag.index('}')]
ns = {'opf': default_ns}
# Get manifest
manifest = {}
for item in root.findall('.//opf:manifest/opf:item', ns):
item_id = item.get('id')
href = item.get('href')
media_type = item.get('media-type', '')
if item_id and href and ('html' in media_type.lower() or href.endswith(('.html', '.xhtml', '.htm'))):
manifest[item_id] = href
# Get spine order
spine = root.find('.//opf:spine', ns)
        if spine is not None:  # Element truthiness is False for childless elements
for itemref in spine.findall('opf:itemref', ns):
idref = itemref.get('idref')
if idref and idref in manifest:
href = manifest[idref]
filename = os.path.basename(href)
# Skip nav, toc, cover - BUT only if filename has NO numbers
# Files with numbers like 'nav01', 'toc05' are real chapters
import re
has_numbers = bool(re.search(r'\d', filename))
if not has_numbers and any(skip in filename.lower() for skip in ['nav', 'toc', 'cover']):
continue
opf_chapters.append(href)
print(f"📚 OPF spine contains {len(opf_chapters)} chapters")
# Check which OPF chapters are missing from extraction
extracted_files = set()
for c in chapters:
if 'filename' in c:
extracted_files.add(c['filename'])
if 'original_basename' in c:
extracted_files.add(c['original_basename'])
missing_chapters = []
for opf_chapter in opf_chapters:
basename = os.path.basename(opf_chapter)
if basename not in extracted_files and opf_chapter not in extracted_files:
missing_chapters.append(opf_chapter)
if missing_chapters:
print(f"⚠️ {len(missing_chapters)} chapters in OPF but not extracted!")
print(f" Missing: {missing_chapters[:5]}{'...' if len(missing_chapters) > 5 else ''}")
# Extract the missing chapters
for href in missing_chapters:
try:
# Read the chapter content
content = zf.read(href).decode('utf-8')
# Extract chapter number
import re
basename = os.path.basename(href)
matches = re.findall(r'(\d+)', basename)
if matches:
chapter_num = int(matches[-1])
else:
chapter_num = len(chapters) + 1
                    # Create chapter entry
                    parser = _get_best_parser()  # prefer lxml when actually available
                    soup = BeautifulSoup(content, parser)
# Get title
title = "Chapter " + str(chapter_num)
title_tag = soup.find('title')
if title_tag:
title = title_tag.get_text().strip() or title
else:
for tag in ['h1', 'h2', 'h3']:
header = soup.find(tag)
if header:
title = header.get_text().strip() or title
break
# Save the chapter file
output_filename = f"chapter_{chapter_num:04d}_{basename}"
output_path = os.path.join(out, output_filename)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(content)
# Add to chapters list
new_chapter = {
'num': chapter_num,
'title': title,
'body': content,
'filename': href,
'original_basename': basename,
'file_size': len(content),
'has_images': bool(soup.find_all('img')),
'detection_method': 'opf_recovery',
'content_hash': None # Will be calculated later
}
chapters.append(new_chapter)
print(f" ✅ Recovered chapter {chapter_num}: {basename}")
except Exception as e:
print(f" ❌ Failed to extract {href}: {e}")
# Re-sort chapters by number
chapters.sort(key=lambda x: x['num'])
print(f"✅ Total chapters after OPF recovery: {len(chapters)}")
except Exception as e:
print(f"⚠️ Error checking OPF chapters: {e}")
import traceback
traceback.print_exc()
return chapters
def extract_chapters(zf, output_dir, parser=None, progress_callback=None, pattern_manager=None):
"""Extract chapters and all resources from EPUB using ThreadPoolExecutor
Args:
zf: ZipFile object of the EPUB
output_dir: Output directory for extracted files
parser: BeautifulSoup parser to use ('lxml' or 'html.parser')
progress_callback: Optional callback for progress updates
pattern_manager: Optional PatternManager instance for chapter detection
"""
import time
# Initialize defaults if not provided
if parser is None:
parser = _get_best_parser()
# pattern_manager is no longer used - kept for API compatibility
# Check stop at the very beginning
if is_stop_requested():
print("❌ Extraction stopped by user")
return []
print("🚀 Starting EPUB extraction with ThreadPoolExecutor...")
print(f"📄 Using parser: {parser} {'(optimized for CJK)' if parser == 'lxml' else '(standard)'}")
# Initial progress
if progress_callback:
progress_callback("Starting EPUB extraction...")
# First, extract and save content.opf for reference
for name in zf.namelist():
if name.endswith('.opf'):
try:
opf_content = zf.read(name).decode('utf-8', errors='ignore')
opf_output_path = os.path.join(output_dir, 'content.opf')
with open(opf_output_path, 'w', encoding='utf-8') as f:
f.write(opf_content)
print(f"📋 Saved OPF file: {name} → content.opf")
break
except Exception as e:
print(f"⚠️ Could not save OPF file: {e}")
# Get extraction mode from environment
extraction_mode = os.getenv("EXTRACTION_MODE", "smart").lower()
print(f"✅ Using {extraction_mode.capitalize()} extraction mode")
# Get number of workers from environment or use default
max_workers = int(os.getenv("EXTRACTION_WORKERS", "2"))
print(f"🔧 Using {max_workers} workers for parallel processing")
extracted_resources = _extract_all_resources(zf, output_dir, progress_callback)
# Check stop after resource extraction
if is_stop_requested():
print("❌ Extraction stopped by user")
return []
metadata_path = os.path.join(output_dir, 'metadata.json')
if os.path.exists(metadata_path):
print("📋 Loading existing metadata...")
with open(metadata_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
else:
print("📋 Extracting fresh metadata...")
metadata = _extract_epub_metadata(zf)
print(f"📋 Extracted metadata: {list(metadata.keys())}")
chapters, detected_language = _extract_chapters_universal(zf, extraction_mode, parser, progress_callback, pattern_manager)
# Sort chapters according to OPF spine order if available
opf_path = os.path.join(output_dir, 'content.opf')
if os.path.exists(opf_path) and chapters:
print("📋 Sorting chapters according to OPF spine order...")
chapters = _sort_by_opf_spine(chapters, opf_path)
print(f"✅ Chapters sorted according to OPF reading order")
# Check stop after chapter extraction
if is_stop_requested():
print("❌ Extraction stopped by user")
return []
if not chapters:
print("❌ No chapters could be extracted!")
return []
chapters_info_path = os.path.join(output_dir, 'chapters_info.json')
chapters_info = []
chapters_info_lock = threading.Lock()
def process_chapter(chapter):
"""Process a single chapter"""
# Check stop in worker
if is_stop_requested():
return None
info = {
'num': chapter['num'],
'title': chapter['title'],
'original_filename': chapter.get('filename', ''),
'has_images': chapter.get('has_images', False),
'image_count': chapter.get('image_count', 0),
'text_length': chapter.get('file_size', len(chapter.get('body', ''))),
'detection_method': chapter.get('detection_method', 'unknown'),
'content_hash': chapter.get('content_hash', '')
}
if chapter.get('has_images'):
try:
soup = BeautifulSoup(chapter.get('body', ''), parser)
images = soup.find_all('img')
info['images'] = [img.get('src', '') for img in images]
except:
info['images'] = []
return info
# Process chapters in parallel
print(f"🔄 Processing {len(chapters)} chapters in parallel...")
if progress_callback:
progress_callback(f"Processing {len(chapters)} chapters...")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all tasks
future_to_chapter = {
executor.submit(process_chapter, chapter): chapter
for chapter in chapters
}
# Process completed tasks
completed = 0
for future in as_completed(future_to_chapter):
if is_stop_requested():
print("❌ Extraction stopped by user")
# Cancel remaining futures
for f in future_to_chapter:
f.cancel()
return []
try:
result = future.result()
if result:
with chapters_info_lock:
chapters_info.append(result)
completed += 1
# Yield to GUI periodically (can be disabled for max speed)
if completed % 5 == 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1":
time.sleep(0.001)
# Progress updates
if completed % 10 == 0 or completed == len(chapters):
if progress_callback:
progress_msg = f"Processed {completed}/{len(chapters)} chapters"
progress_callback(progress_msg)
else:
# Show progress bar in terminal
ProgressBar.update(completed, len(chapters), prefix="📊 Processing metadata")
except Exception as e:
chapter = future_to_chapter[future]
print(f" ❌ Error processing chapter {chapter['num']}: {e}")
# Finish progress bar
if not progress_callback:
ProgressBar.finish()
# Sort chapters_info by chapter number to maintain order
chapters_info.sort(key=lambda x: x['num'])
print(f"✅ Successfully processed {len(chapters_info)} chapters")
with open(chapters_info_path, 'w', encoding='utf-8') as f:
json.dump(chapters_info, f, ensure_ascii=False, indent=2)
print(f"💾 Saved detailed chapter info to: chapters_info.json")
metadata.update({
'chapter_count': len(chapters),
'detected_language': detected_language,
'extracted_resources': extracted_resources,
'extraction_mode': extraction_mode,
'extraction_summary': {
'total_chapters': len(chapters),
'chapter_range': f"{chapters[0]['num']}-{chapters[-1]['num']}",
'resources_extracted': sum(len(files) for files in extracted_resources.values())
}
})
metadata['chapter_titles'] = {
str(c['num']): c['title'] for c in chapters
}
with open(metadata_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=2)
print(f"💾 Saved comprehensive metadata to: {metadata_path}")
_create_extraction_report(output_dir, metadata, chapters, extracted_resources)
_log_extraction_summary(chapters, extracted_resources, detected_language)
print(f"🔍 VERIFICATION: {extraction_mode.capitalize()} chapter extraction completed successfully")
print(f"⚡ Used {max_workers} workers for parallel processing")
return chapters
def _extract_all_resources(zf, output_dir, progress_callback=None):
"""Extract all resources with parallel processing"""
import time
extracted_resources = {
'css': [],
'fonts': [],
'images': [],
'epub_structure': [],
'other': []
}
# Check if already extracted
extraction_marker = os.path.join(output_dir, '.resources_extracted')
if os.path.exists(extraction_marker):
print("📦 Resources already extracted, skipping...")
return _count_existing_resources(output_dir, extracted_resources)
_cleanup_old_resources(output_dir)
# Create directories
for resource_type in ['css', 'fonts', 'images']:
os.makedirs(os.path.join(output_dir, resource_type), exist_ok=True)
# Only print if no callback (avoid duplicates in subprocess)
if not progress_callback:
print(f"📦 Extracting resources in parallel...")
# Get list of files to process
file_list = [f for f in zf.namelist() if not f.endswith('/') and os.path.basename(f)]
# Thread-safe lock for extracted_resources
resource_lock = threading.Lock()
def extract_single_resource(file_path):
if is_stop_requested():
return None
try:
file_data = zf.read(file_path)
resource_info = _categorize_resource(file_path, os.path.basename(file_path))
if resource_info:
resource_type, target_dir, safe_filename = resource_info
target_path = os.path.join(output_dir, target_dir, safe_filename) if target_dir else os.path.join(output_dir, safe_filename)
with open(target_path, 'wb') as f:
f.write(file_data)
# Thread-safe update
with resource_lock:
extracted_resources[resource_type].append(safe_filename)
return (resource_type, safe_filename)
except Exception as e:
print(f"[WARNING] Failed to extract {file_path}: {e}")
return None
# Process files in parallel
total_resources = len(file_list)
extracted_count = 0
# Use same worker count as chapter processing
resource_workers = int(os.getenv("EXTRACTION_WORKERS", "2"))
with ThreadPoolExecutor(max_workers=resource_workers) as executor:
futures = {executor.submit(extract_single_resource, file_path): file_path
for file_path in file_list}
for future in as_completed(futures):
if is_stop_requested():
executor.shutdown(wait=False)
break
extracted_count += 1
# Progress update every 20 files
if extracted_count % 20 == 0:
if progress_callback:
progress_callback(f"Extracting resources: {extracted_count}/{total_resources}")
else:
# Print progress bar in terminal
ProgressBar.update(extracted_count, total_resources, prefix="📦 Extracting resources")
# Yield to GUI periodically (can be disabled for max speed)
if extracted_count % 10 == 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1":
time.sleep(0.001)
result = future.result()
if result:
resource_type, filename = result
# Only print for important resources
if extracted_count < 10 or resource_type in ['css', 'fonts']:
print(f" 📄 Extracted {resource_type}: {filename}")
# Show 100% completion
if progress_callback:
progress_callback(f"Extracting resources: {total_resources}/{total_resources}")
else:
ProgressBar.update(total_resources, total_resources, prefix="📦 Extracting resources")
ProgressBar.finish()
# Mark as complete
with open(extraction_marker, 'w') as f:
f.write(f"Resources extracted at {time.time()}")
_validate_critical_files(output_dir, extracted_resources)
return extracted_resources
def _extract_chapters_universal(zf, extraction_mode="smart", parser=None, progress_callback=None, pattern_manager=None):
"""Universal chapter extraction with four modes: smart, comprehensive, full, enhanced
All modes now properly merge Section/Chapter pairs
Enhanced mode uses html2text for superior text processing
Now with parallel processing for improved performance
"""
# Initialize defaults if not provided
if parser is None:
parser = _get_best_parser()
# pattern_manager is no longer used - kept for API compatibility
# Check stop at the beginning
if is_stop_requested():
print("❌ Chapter extraction stopped by user")
return [], 'unknown'
# Import time for yielding
import time
# Initialize enhanced extractor if using enhanced mode
enhanced_extractor = None
enhanced_filtering = extraction_mode # Default fallback
preserve_structure = True
# Check if user wants to translate special files (info.xhtml, message.xhtml, etc.)
# By default, skip them as they're typically metadata/navigation
translate_special = os.getenv('TRANSLATE_SPECIAL_FILES', '0') == '1'
if translate_special:
print("📝 Special files translation is ENABLED (info.xhtml, message.xhtml, etc.)")
else:
print("📝 Special files translation is DISABLED - skipping navigation/metadata files")
if extraction_mode == "enhanced":
print("🚀 Initializing Enhanced extraction mode with html2text...")
# Get enhanced mode configuration from environment
enhanced_filtering = os.getenv("ENHANCED_FILTERING", "smart")
# Avoid 'full' with html2text to prevent XML declaration artifacts; use 'comprehensive' instead
if str(enhanced_filtering).lower() == 'full':
enhanced_filtering = 'comprehensive'
preserve_structure = os.getenv("ENHANCED_PRESERVE_STRUCTURE", "1") == "1"
print(f" • Enhanced filtering level: {enhanced_filtering}")
print(f" • Preserve structure: {preserve_structure}")
# Try to initialize enhanced extractor
try:
# Import our enhanced extractor (assume it's in the same directory or importable)
from enhanced_text_extractor import EnhancedTextExtractor
enhanced_extractor = EnhancedTextExtractor(
filtering_mode=enhanced_filtering,
preserve_structure=preserve_structure
)
print("✅ Enhanced text extractor initialized successfully")
except ImportError as e:
print(f"❌ Enhanced text extractor module not found: {e}")
print(f"❌ Cannot use enhanced extraction mode. Please install enhanced_text_extractor or select a different extraction mode.")
raise e
except Exception as e:
print(f"❌ Enhanced extractor initialization failed: {e}")
print(f"❌ Cannot use enhanced extraction mode. Please select a different extraction mode.")
raise e
chapters = []
sample_texts = []
# First phase: Collect HTML files
html_files = []
file_list = zf.namelist()
total_files = len(file_list)
# Update progress for file collection
if progress_callback and total_files > 100:
progress_callback(f"Scanning {total_files} files in EPUB...")
elif total_files > 100 and not progress_callback:
# Print initial message for progress bar (only if no callback)
print(f"📂 Scanning {total_files} files in EPUB...")
for idx, name in enumerate(file_list):
# Check stop while collecting files
if is_stop_requested():
print("❌ Chapter extraction stopped by user")
return [], 'unknown'
# Yield to GUI every 50 files (can be disabled for max speed)
if idx % 50 == 0 and idx > 0:
if os.getenv("ENABLE_GUI_YIELD", "1") == "1":
time.sleep(0.001) # Brief yield to GUI
if total_files > 100:
if progress_callback:
progress_callback(f"Scanning files: {idx}/{total_files}")
else:
# Print progress bar in terminal
ProgressBar.update(idx, total_files, prefix="📂 Scanning files")
if name.lower().endswith(('.xhtml', '.html', '.htm')):
basename = os.path.basename(name).lower()
# Skip cover files unless special file translation is enabled
if basename in ['cover.html', 'cover.xhtml', 'cover.htm']:
if not translate_special:
print(f"[SKIP] Cover file excluded: {name}")
continue
else:
print(f"[INCLUDE] Cover file included (special files enabled): {name}")
# All filtering is now controlled by TRANSLATE_SPECIAL_FILES toggle and extraction mode
# No hardcoded special file patterns
html_files.append(name)
# Print final 100% progress update before finishing
if total_files > 100:
if progress_callback:
progress_callback(f"Scanning files: {total_files}/{total_files}")
else:
# Show 100% completion
ProgressBar.update(total_files, total_files, prefix="📂 Scanning files")
# Finish progress bar if we were using it
if total_files > 100 and not progress_callback:
ProgressBar.finish()
# Update mode description to include enhanced mode
mode_description = {
"smart": "potential content files",
"comprehensive": "HTML files",
"full": "ALL HTML/XHTML files (no filtering)",
"enhanced": f"files (enhanced with {enhanced_filtering} filtering)"
}
print(f"📚 Found {len(html_files)} {mode_description.get(extraction_mode, 'files')} in EPUB")
# Sort files to ensure proper order
html_files.sort()
# Check if merging is disabled via environment variable
disable_merging = os.getenv("DISABLE_CHAPTER_MERGING", "0") == "1"
processed_files = set()
merge_candidates = {} # Store potential merges without reading files yet
if disable_merging:
print("📌 Chapter merging is DISABLED - processing all files independently")
else:
print("📌 Chapter merging is ENABLED")
# Only do merging logic if not disabled
file_groups = {}
# Group files by their base number to detect Section/Chapter pairs
for file_path in html_files:
filename = os.path.basename(file_path)
# Try different patterns to extract base number
base_num = None
# Pattern 1: "No00014" from "No00014Section.xhtml"
match = re.match(r'(No\d+)', filename)
if match:
base_num = match.group(1)
else:
# Pattern 2: "0014" from "0014_section.html" or "0014_chapter.html"
match = re.match(r'^(\d+)[_\-]', filename)
if match:
base_num = match.group(1)
else:
# Pattern 3: Just numbers at the start
match = re.match(r'^(\d+)', filename)
if match:
base_num = match.group(1)
if base_num:
if base_num not in file_groups:
file_groups[base_num] = []
file_groups[base_num].append(file_path)
# Identify merge candidates WITHOUT reading files yet
for base_num, group_files in sorted(file_groups.items()):
if len(group_files) == 2:
# Check if we have a Section/Chapter pair based on filenames only
section_file = None
chapter_file = None
for file_path in group_files:
basename = os.path.basename(file_path)
# More strict detection - must have 'section' or 'chapter' in the filename
if 'section' in basename.lower() and 'chapter' not in basename.lower():
section_file = file_path
elif 'chapter' in basename.lower() and 'section' not in basename.lower():
chapter_file = file_path
if section_file and chapter_file:
# Store as potential merge candidate
merge_candidates[chapter_file] = section_file
processed_files.add(section_file)
print(f"[DEBUG] Potential merge candidate: {base_num}")
print(f" Section: {os.path.basename(section_file)}")
print(f" Chapter: {os.path.basename(chapter_file)}")
# Filter out section files that were marked for merging
files_to_process = []
for file_path in html_files:
if not disable_merging and file_path in processed_files:
print(f"[DEBUG] Skipping section file: {file_path}")
continue
files_to_process.append(file_path)
print(f"📚 Processing {len(files_to_process)} files after merge analysis")
if progress_callback:
progress_callback(f"Preparing to process {len(files_to_process)} chapters...")
# Initialize collections for aggregating results
file_size_groups = {}
h1_count = 0
h2_count = 0
    skipped_files = []
    _smart_mode_skips = []  # (reason, filename, detail) records for the smart-mode skip summary
# Progress tracking
total_files = len(files_to_process)
# Prepare arguments for parallel processing
zip_file_path = zf.filename
# Process files in parallel or sequentially based on file count
# Only print if no callback (avoid duplicates)
if not progress_callback:
print(f"🚀 Processing {len(files_to_process)} HTML files...")
# Initial progress - no message needed, progress bar will show
candidate_chapters = [] # For smart mode
chapters_direct = [] # For other modes
# Decide whether to use parallel processing
use_parallel = len(files_to_process) > 10
if use_parallel:
# Get worker count from environment variable
max_workers = int(os.getenv("EXTRACTION_WORKERS", "2"))
print(f"📦 Using parallel processing with {max_workers} workers...")
if progress_callback:
progress_callback(f"Starting {max_workers} extraction workers...")
# Use ProcessPoolExecutor for true multi-process parallelism
# Now that all functions are at module level and picklable, we can use processes
with ProcessPoolExecutor(max_workers=max_workers) as executor:
# Submit all files for processing
future_to_file = {
executor.submit(
_process_single_html_file,
file_path=file_path,
file_index=idx,
zip_file_path=zip_file_path,
parser=parser,
merge_candidates=merge_candidates,
disable_merging=disable_merging,
enhanced_extractor=enhanced_extractor,
extraction_mode=extraction_mode,
enhanced_filtering=enhanced_filtering,
preserve_structure=preserve_structure,
protect_angle_brackets_func=protect_angle_brackets_with_korean,
pattern_manager=pattern_manager,
files_to_process=files_to_process,
is_stop_requested=is_stop_requested
): (file_path, idx)
for idx, file_path in enumerate(files_to_process)
}
# Collect results as they complete with progress tracking
processed_count = 0
for future in as_completed(future_to_file):
if is_stop_requested():
print("❌ Chapter processing stopped by user")
executor.shutdown(wait=False)
return [], 'unknown'
try:
# Unpack result from _process_single_html_file
result = future.result()
chapter_info, h1_found, h2_found, file_size, sample_text, skipped_info = result
# Update progress
processed_count += 1
if processed_count % 5 == 0:
if progress_callback:
progress_msg = f"Processing chapters: {processed_count}/{total_files} ({processed_count*100//total_files}%)"
progress_callback(progress_msg)
else:
# Print progress bar in terminal
ProgressBar.update(processed_count, total_files, prefix="📚 Processing chapters")
# Aggregate header counts
if h1_found:
h1_count += 1
if h2_found:
h2_count += 1
# Collect file size groups and sample texts
if chapter_info:
effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
if effective_mode == "smart" and file_size > 0:
if file_size not in file_size_groups:
file_size_groups[file_size] = []
file_path, _ = future_to_file[future]
file_size_groups[file_size].append(file_path)
# Collect sample texts
if sample_text and len(sample_texts) < 5:
sample_texts.append(sample_text)
# For smart mode when merging is enabled, collect candidates
# Otherwise, add directly to chapters
if effective_mode == "smart" and not disable_merging:
candidate_chapters.append(chapter_info)
else:
chapters_direct.append(chapter_info)
# Collect skipped info
if skipped_info:
skipped_files.append(skipped_info)
except Exception as e:
file_path, idx = future_to_file[future]
print(f"[ERROR] Process error processing {file_path}: {e}")
import traceback
traceback.print_exc()
# Show 100% completion
if progress_callback:
progress_callback(f"Processing chapters: {total_files}/{total_files} (100%)")
else:
ProgressBar.update(total_files, total_files, prefix="📚 Processing chapters")
else:
print("📦 Using sequential processing (small file count)...")
# Process files sequentially for small EPUBs
for idx, file_path in enumerate(files_to_process):
if is_stop_requested():
print("❌ Chapter processing stopped by user")
return [], 'unknown'
# Call the module-level function directly
result = _process_single_html_file(
file_path=file_path,
file_index=idx,
zip_file_path=zip_file_path,
parser=parser,
merge_candidates=merge_candidates,
disable_merging=disable_merging,
enhanced_extractor=enhanced_extractor,
extraction_mode=extraction_mode,
enhanced_filtering=enhanced_filtering,
preserve_structure=preserve_structure,
protect_angle_brackets_func=protect_angle_brackets_with_korean,
pattern_manager=pattern_manager,
files_to_process=files_to_process,
is_stop_requested=is_stop_requested
)
# Unpack result
chapter_info, h1_found, h2_found, file_size, sample_text, skipped_info = result
# Update progress
if (idx + 1) % 5 == 0:
if progress_callback:
progress_msg = f"Processing chapters: {idx+1}/{total_files} ({(idx+1)*100//total_files}%)"
progress_callback(progress_msg)
else:
# Print progress bar in terminal
ProgressBar.update(idx+1, total_files, prefix="📚 Processing chapters")
# Aggregate header counts
if h1_found:
h1_count += 1
if h2_found:
h2_count += 1
# Collect file size groups and sample texts
if chapter_info:
effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
if effective_mode == "smart" and file_size > 0:
if file_size not in file_size_groups:
file_size_groups[file_size] = []
file_size_groups[file_size].append(file_path)
# Collect sample texts
if sample_text and len(sample_texts) < 5:
sample_texts.append(sample_text)
# For smart mode when merging is enabled, collect candidates
# Otherwise, add directly to chapters
if effective_mode == "smart" and not disable_merging:
candidate_chapters.append(chapter_info)
else:
chapters_direct.append(chapter_info)
# Collect skipped info
if skipped_info:
skipped_files.append(skipped_info)
# Show 100% completion for sequential mode
if progress_callback:
progress_callback(f"Processing chapters: {total_files}/{total_files} (100%)")
else:
ProgressBar.update(total_files, total_files, prefix="📚 Processing chapters")
# Final progress update and cleanup progress bar
if not progress_callback:
ProgressBar.finish()
else:
progress_callback(f"Chapter processing complete: {len(candidate_chapters) + len(chapters_direct)} chapters")
# Print skip summary if any files were skipped
if skipped_files:
print(f"\n📊 Skipped {len(skipped_files)} files during processing:")
empty_count = sum(1 for _, reason, _ in skipped_files if reason == 'empty')
if empty_count > 0:
print(f" • {empty_count} nearly empty files")
# Show first 3 examples if debug enabled
if os.getenv('DEBUG_SKIP_MESSAGES', '0') == '1' and skipped_files:
print(" Examples:")
for path, reason, size in skipped_files[:3]:
print(f" - {os.path.basename(path)} ({size} chars)")
# Sort direct chapters by file index to maintain order
chapters_direct.sort(key=lambda x: x["file_index"])
# Post-process smart mode candidates (only when merging is enabled)
effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
if effective_mode == "smart" and candidate_chapters and not disable_merging:
# Check stop before post-processing
if is_stop_requested():
print("❌ Chapter post-processing stopped by user")
return chapters, 'unknown'
print(f"\n[SMART MODE] Processing {len(candidate_chapters)} candidate files...")
# Sort candidates by file index to maintain order
candidate_chapters.sort(key=lambda x: x["file_index"])
# Debug: Show what files we have
section_files = [c for c in candidate_chapters if 'section' in c['original_basename'].lower()]
chapter_files = [c for c in candidate_chapters if 'chapter' in c['original_basename'].lower() and 'section' not in c['original_basename'].lower()]
other_files = [c for c in candidate_chapters if c not in section_files and c not in chapter_files]
print(f" 📊 File breakdown:")
print(f" • Section files: {len(section_files)}")
print(f" • Chapter files: {len(chapter_files)}")
print(f" • Other files: {len(other_files)}")
# Original smart mode logic when merging is enabled
# First, separate files with detected chapter numbers from those without
numbered_chapters = []
unnumbered_chapters = []
for idx, chapter in enumerate(candidate_chapters):
# Yield periodically during categorization (can be disabled for max speed)
if idx % 10 == 0 and idx > 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1":
time.sleep(0.001)
if chapter["num"] is not None:
numbered_chapters.append(chapter)
else:
unnumbered_chapters.append(chapter)
print(f" • Files with chapter numbers: {len(numbered_chapters)}")
print(f" • Files without chapter numbers: {len(unnumbered_chapters)}")
# Check if we have hash-based filenames (no numbered chapters found)
if not numbered_chapters and unnumbered_chapters:
print(" ⚠️ No chapter numbers found - likely hash-based filenames")
print(" → Using file order as chapter sequence")
# Sort by file index to maintain order
unnumbered_chapters.sort(key=lambda x: x["file_index"])
# Assign sequential numbers
for i, chapter in enumerate(unnumbered_chapters, 1):
chapter["num"] = i
chapter["detection_method"] = f"{extraction_mode}_hash_filename_sequential" if extraction_mode == "enhanced" else "hash_filename_sequential"
if not chapter["title"] or chapter["title"] == chapter["original_basename"]:
chapter["title"] = f"Chapter {i}"
chapters = unnumbered_chapters
else:
# We have some numbered chapters
chapters = numbered_chapters
# For unnumbered files, check if they might be duplicates or appendices
if unnumbered_chapters:
print(f" → Analyzing {len(unnumbered_chapters)} unnumbered files...")
# Get the max chapter number
max_num = max(c["num"] for c in numbered_chapters)
# Check each unnumbered file
for chapter in unnumbered_chapters:
# Check stop in post-processing loop
if is_stop_requested():
print("❌ Chapter post-processing stopped by user")
return chapters, 'unknown'
# Check if it's very small (might be a separator or note)
if chapter["file_size"] < 200:
                        # Collect for the skip summary instead of printing per file
                        # (_smart_mode_skips is initialized near the top of this function)
                        _smart_mode_skips.append(('small', chapter['filename'], chapter['file_size']))
continue
# Check if it has similar size to existing chapters (might be duplicate)
size = chapter["file_size"]
similar_chapters = [c for c in numbered_chapters
if abs(c["file_size"] - size) < 50]
if similar_chapters:
# Might be a duplicate, skip it (collect for summary)
_smart_mode_skips.append(('duplicate', chapter['filename'], len(similar_chapters)))
continue
# Otherwise, add as appendix
max_num += 1
chapter["num"] = max_num
chapter["detection_method"] = f"{extraction_mode}_appendix_sequential" if extraction_mode == "enhanced" else "appendix_sequential"
if not chapter["title"] or chapter["title"] == chapter["original_basename"]:
chapter["title"] = f"Appendix {max_num}"
chapters.append(chapter)
print(f" [ADD] Added as chapter {max_num}: {chapter['filename']}")
else:
# For other modes or smart mode with merging disabled
chapters = chapters_direct
# Print smart mode skip summary if any
    if _smart_mode_skips:
print(f"\n📊 Smart mode filtering summary:")
small_count = sum(1 for reason, _, _ in _smart_mode_skips if reason == 'small')
dup_count = sum(1 for reason, _, _ in _smart_mode_skips if reason == 'duplicate')
if small_count > 0:
print(f" • Skipped {small_count} very small files")
if dup_count > 0:
print(f" • Skipped {dup_count} possible duplicates")
# Show examples if debug enabled
if os.getenv('DEBUG_SKIP_MESSAGES', '0') == '1':
print(" Examples:")
for reason, filename, detail in _smart_mode_skips[:3]:
if reason == 'small':
print(f" - {filename} ({detail} chars)")
else:
print(f" - {filename} (similar to {detail} chapters)")
# Clear the list
_smart_mode_skips = []
# Sort chapters by number
chapters.sort(key=lambda x: x["num"])
# Ensure chapter numbers are integers
# When merging is disabled, all chapters should have integer numbers anyway
for chapter in chapters:
if isinstance(chapter["num"], float):
chapter["num"] = int(chapter["num"])
# Final validation
if chapters:
print(f"\n✅ Final chapter count: {len(chapters)}")
print(f" • Chapter range: {chapters[0]['num']} - {chapters[-1]['num']}")
# Enhanced mode summary
if extraction_mode == "enhanced":
enhanced_count = sum(1 for c in chapters if c.get('enhanced_extraction', False))
total_chars = sum(len(c.get('body', '')) for c in chapters if c.get('enhanced_extraction', False))
avg_chars = total_chars // enhanced_count if enhanced_count > 0 else 0
print(f" 🚀 Enhanced extraction: {enhanced_count}/{len(chapters)} chapters, {total_chars:,} total chars (avg: {avg_chars:,})")
# Check for gaps
chapter_nums = [c["num"] for c in chapters]
expected_nums = list(range(min(chapter_nums), max(chapter_nums) + 1))
missing = set(expected_nums) - set(chapter_nums)
if missing:
print(f" ⚠️ Missing chapter numbers: {sorted(missing)}")
# Language detection
combined_sample = ' '.join(sample_texts) if effective_mode == "smart" else ''
detected_language = _detect_content_language(combined_sample) if combined_sample else 'unknown'
if chapters:
_print_extraction_summary(chapters, detected_language, extraction_mode,
h1_count if effective_mode == "smart" else 0,
h2_count if effective_mode == "smart" else 0,
file_size_groups if effective_mode == "smart" else {})
return chapters, detected_language
def _extract_chapter_info(soup, file_path, content_text, html_content, pattern_manager):
"""Extract chapter number and title from various sources with parallel pattern matching"""
chapter_num = None
chapter_title = None
detection_method = None
# SPECIAL HANDLING: When we have Section/Chapter pairs, differentiate them
filename = os.path.basename(file_path)
# Handle different naming patterns for Section/Chapter files
if ('section' in filename.lower() or '_section' in filename.lower()) and 'chapter' not in filename.lower():
# For Section files, add 0.1 to the base number
# Try different patterns
match = re.search(r'No(\d+)', filename)
if not match:
match = re.search(r'^(\d+)[_\-]', filename)
if not match:
match = re.search(r'^(\d+)', filename)
if match:
base_num = int(match.group(1))
chapter_num = base_num + 0.1 # Section gets .1
detection_method = "filename_section_special"
elif ('chapter' in filename.lower() or '_chapter' in filename.lower()) and 'section' not in filename.lower():
# For Chapter files, use the base number
# Try different patterns
match = re.search(r'No(\d+)', filename)
if not match:
match = re.search(r'^(\d+)[_\-]', filename)
if not match:
match = re.search(r'^(\d+)', filename)
if match:
chapter_num = int(match.group(1))
detection_method = "filename_chapter_special"
# If not handled by special logic, continue with normal extraction
    if chapter_num is None:  # explicit None check so a legitimate chapter 0 is kept
# Try filename first - use parallel pattern matching for better performance
chapter_patterns = [(pattern, flags, method) for pattern, flags, method in PM.CHAPTER_PATTERNS
if method.endswith('_number')]
if len(chapter_patterns) > 3: # Only parallelize if we have enough patterns
# Parallel pattern matching for filename
with ThreadPoolExecutor(max_workers=min(4, len(chapter_patterns))) as executor:
def try_pattern(pattern_info):
pattern, flags, method = pattern_info
match = re.search(pattern, file_path, flags)
if match:
try:
num_str = match.group(1)
if num_str.isdigit():
return int(num_str), f"filename_{method}"
elif method == 'chinese_chapter_cn':
                                converted = _convert_chinese_number(num_str, None)  # pattern_manager arg is unused
if converted:
return converted, f"filename_{method}"
except (ValueError, IndexError):
pass
return None, None
# Submit all patterns
futures = [executor.submit(try_pattern, pattern_info) for pattern_info in chapter_patterns]
# Check results as they complete
for future in as_completed(futures):
try:
num, method = future.result()
if num:
chapter_num = num
detection_method = method
# Cancel remaining futures
for f in futures:
f.cancel()
break
except Exception:
continue
else:
# Sequential processing for small pattern sets
for pattern, flags, method in chapter_patterns:
match = re.search(pattern, file_path, flags)
if match:
try:
num_str = match.group(1)
if num_str.isdigit():
chapter_num = int(num_str)
detection_method = f"filename_{method}"
break
elif method == 'chinese_chapter_cn':
                            converted = _convert_chinese_number(num_str, None)  # pattern_manager arg is unused
if converted:
chapter_num = converted
detection_method = f"filename_{method}"
break
except (ValueError, IndexError):
continue
# Try content if not found in filename
    if chapter_num is None:
# Check ignore settings for batch translation
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
use_title_tag = os.getenv('USE_TITLE', '0') == '1' or not batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
# Prepare all text sources to check in parallel
text_sources = []
# Add title tag if using titles
if use_title_tag and soup.title and soup.title.string:
title_text = soup.title.string.strip()
text_sources.append(("title", title_text, True)) # True means this can be chapter_title
# Add headers if not ignored
if not ignore_header_tags:
for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
headers = soup.find_all(header_tag)
for header in headers[:3]: # Limit to first 3 of each type
header_text = header.get_text(strip=True)
if header_text:
text_sources.append((f"header_{header_tag}", header_text, True))
# Add first paragraphs
first_elements = soup.find_all(['p', 'div'])[:5]
for elem in first_elements:
elem_text = elem.get_text(strip=True)
if elem_text:
text_sources.append(("content", elem_text, False)) # False means don't use as chapter_title
# Process text sources in parallel if we have many
if len(text_sources) > 5:
with ThreadPoolExecutor(max_workers=min(6, len(text_sources))) as executor:
def extract_from_source(source_info):
source_type, text, can_be_title = source_info
num, method = _extract_from_text(text, source_type, pattern_manager)
return num, method, text if (num and can_be_title) else None
# Submit all text sources
future_to_source = {executor.submit(extract_from_source, source): source
for source in text_sources}
# Process results as they complete
for future in as_completed(future_to_source):
try:
num, method, title = future.result()
if num:
chapter_num = num
detection_method = method
if title and not chapter_title:
chapter_title = title
# Cancel remaining futures
for f in future_to_source:
f.cancel()
break
except Exception:
continue
else:
# Sequential processing for small text sets
for source_type, text, can_be_title in text_sources:
num, method = _extract_from_text(text, source_type, pattern_manager)
if num:
chapter_num = num
detection_method = method
if can_be_title and not chapter_title:
chapter_title = text
break
# Final fallback to filename patterns
    if chapter_num is None:
filename_base = os.path.basename(file_path)
# Parallel pattern matching for filename extraction
if len(PM.FILENAME_EXTRACT_PATTERNS) > 3:
with ThreadPoolExecutor(max_workers=min(4, len(PM.FILENAME_EXTRACT_PATTERNS))) as executor:
def try_filename_pattern(pattern):
match = re.search(pattern, filename_base, re.IGNORECASE)
if match:
try:
return int(match.group(1))
except (ValueError, IndexError):
pass
return None
futures = [executor.submit(try_filename_pattern, pattern)
for pattern in PM.FILENAME_EXTRACT_PATTERNS]
for future in as_completed(futures):
try:
num = future.result()
if num:
chapter_num = num
detection_method = "filename_number"
for f in futures:
f.cancel()
break
except Exception:
continue
else:
# Sequential for small pattern sets
for pattern in PM.FILENAME_EXTRACT_PATTERNS:
match = re.search(pattern, filename_base, re.IGNORECASE)
if match:
chapter_num = int(match.group(1))
detection_method = "filename_number"
break
# Extract title if not already found (with ignore settings support)
if not chapter_title:
# Check settings for batch translation
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
use_title_tag = os.getenv('USE_TITLE', '0') == '1' or not batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
# Try title tag if using titles
if use_title_tag and soup.title and soup.title.string:
chapter_title = soup.title.string.strip()
# Try header tags if not ignored and no title found
if not chapter_title and not ignore_header_tags:
for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
header = soup.find(header_tag)
if header:
chapter_title = header.get_text(strip=True)
break
# Final fallback
if not chapter_title:
chapter_title = f"Chapter {chapter_num}" if chapter_num else None
chapter_title = re.sub(r'\s+', ' ', chapter_title).strip() if chapter_title else None
return chapter_num, chapter_title, detection_method
def _extract_from_text(text, source_type, pattern_manager):
"""Extract chapter number from text using patterns with parallel matching for large pattern sets"""
# Get patterns that don't end with '_number'
text_patterns = [(pattern, flags, method) for pattern, flags, method in PM.CHAPTER_PATTERNS
if not method.endswith('_number')]
# Only use parallel processing if we have many patterns
if len(text_patterns) > 5:
with ThreadPoolExecutor(max_workers=min(4, len(text_patterns))) as executor:
def try_text_pattern(pattern_info):
pattern, flags, method = pattern_info
match = re.search(pattern, text, flags)
if match:
try:
num_str = match.group(1)
if num_str.isdigit():
return int(num_str), f"{source_type}_{method}"
elif method == 'chinese_chapter_cn':
                            converted = _convert_chinese_number(num_str, None)  # pattern_manager arg is unused
if converted:
return converted, f"{source_type}_{method}"
except (ValueError, IndexError):
pass
return None, None
# Submit all patterns
futures = [executor.submit(try_text_pattern, pattern_info) for pattern_info in text_patterns]
# Check results as they complete
for future in as_completed(futures):
try:
num, method = future.result()
if num:
# Cancel remaining futures
for f in futures:
f.cancel()
return num, method
except Exception:
continue
else:
# Sequential processing for small pattern sets
for pattern, flags, method in text_patterns:
match = re.search(pattern, text, flags)
if match:
try:
num_str = match.group(1)
if num_str.isdigit():
return int(num_str), f"{source_type}_{method}"
elif method == 'chinese_chapter_cn':
                        converted = _convert_chinese_number(num_str, None)  # pattern_manager arg is unused
if converted:
return converted, f"{source_type}_{method}"
except (ValueError, IndexError):
continue
return None, None
def _convert_chinese_number(cn_num, pattern_manager):
"""Convert Chinese number to integer"""
if cn_num in PM.CHINESE_NUMS:
return PM.CHINESE_NUMS[cn_num]
if '十' in cn_num:
parts = cn_num.split('十')
if len(parts) == 2:
tens = PM.CHINESE_NUMS.get(parts[0], 1) if parts[0] else 1
ones = PM.CHINESE_NUMS.get(parts[1], 0) if parts[1] else 0
return tens * 10 + ones
return None
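# Illustrative conversions (assuming PM.CHINESE_NUMS maps 一..九 and 十; the
# second argument is unused and kept only for API compatibility):
#
#     _convert_chinese_number('七', None)      # -> 7   (direct table lookup)
#     _convert_chinese_number('十', None)      # -> 10  (empty tens -> 1, empty ones -> 0)
#     _convert_chinese_number('二十三', None)   # -> 23  (2 * 10 + 3)
#     _convert_chinese_number('百', None)      # -> None (hundreds are not handled)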
def _detect_content_language(text_sample):
"""Detect the primary language of content with parallel processing for large texts"""
# For very short texts, use sequential processing
if len(text_sample) < 1000:
scripts = {
'korean': 0,
'japanese_hiragana': 0,
'japanese_katakana': 0,
'chinese': 0,
'latin': 0
}
for char in text_sample:
code = ord(char)
if 0xAC00 <= code <= 0xD7AF:
scripts['korean'] += 1
elif 0x3040 <= code <= 0x309F:
scripts['japanese_hiragana'] += 1
elif 0x30A0 <= code <= 0x30FF:
scripts['japanese_katakana'] += 1
elif 0x4E00 <= code <= 0x9FFF:
scripts['chinese'] += 1
elif 0x0020 <= code <= 0x007F:
scripts['latin'] += 1
else:
# For longer texts, use parallel processing
# Split text into chunks for parallel processing
chunk_size = max(500, len(text_sample) // (os.cpu_count() or 4))
chunks = [text_sample[i:i + chunk_size] for i in range(0, len(text_sample), chunk_size)]
# Thread-safe accumulator
scripts_lock = threading.Lock()
scripts = {
'korean': 0,
'japanese_hiragana': 0,
'japanese_katakana': 0,
'chinese': 0,
'latin': 0
}
def process_chunk(text_chunk):
"""Process a chunk of text and return script counts"""
local_scripts = {
'korean': 0,
'japanese_hiragana': 0,
'japanese_katakana': 0,
'chinese': 0,
'latin': 0
}
for char in text_chunk:
code = ord(char)
if 0xAC00 <= code <= 0xD7AF:
local_scripts['korean'] += 1
elif 0x3040 <= code <= 0x309F:
local_scripts['japanese_hiragana'] += 1
elif 0x30A0 <= code <= 0x30FF:
local_scripts['japanese_katakana'] += 1
elif 0x4E00 <= code <= 0x9FFF:
local_scripts['chinese'] += 1
elif 0x0020 <= code <= 0x007F:
local_scripts['latin'] += 1
return local_scripts
# Process chunks in parallel
with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 4, len(chunks))) as executor:
# Submit all chunks
futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
# Collect results
for future in as_completed(futures):
try:
chunk_scripts = future.result()
# Thread-safe accumulation
with scripts_lock:
for script, count in chunk_scripts.items():
scripts[script] += count
except Exception as e:
print(f"[WARNING] Error processing chunk in language detection: {e}")
# Language determination logic (same as original)
total_cjk = scripts['korean'] + scripts['japanese_hiragana'] + scripts['japanese_katakana'] + scripts['chinese']
if scripts['korean'] > total_cjk * 0.3:
return 'korean'
elif scripts['japanese_hiragana'] + scripts['japanese_katakana'] > total_cjk * 0.2:
return 'japanese'
elif scripts['chinese'] > total_cjk * 0.3:
return 'chinese'
elif scripts['latin'] > len(text_sample) * 0.7:
return 'english'
else:
return 'unknown'
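# Worked example of the thresholds above (counts are hypothetical): a 1,000-char
# sample with 400 Hangul, 550 Han, and 50 ASCII characters gives total_cjk = 950;
# since 400 > 0.3 * 950 = 285, 'korean' is returned before the 'chinese' branch
# is reached. The Korean/Japanese checks deliberately run first because CJK
# novels routinely embed Han characters.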
# Global flag to track if language has been printed
_language_printed = False
def _print_extraction_summary(chapters, detected_language, extraction_mode, h1_count, h2_count, file_size_groups):
"""Print extraction summary"""
global _language_printed
print(f"\n📊 Chapter Extraction Summary ({extraction_mode.capitalize()} Mode):")
print(f" • Total chapters extracted: {len(chapters)}")
# Format chapter range handling both int and float
first_num = chapters[0]['num']
last_num = chapters[-1]['num']
print(f" • Chapter range: {first_num} to {last_num}")
# Only print detected language once per session
if not _language_printed and detected_language and detected_language != 'unknown':
print(f" 🌐 Detected language: {detected_language}")
_language_printed = True
if extraction_mode == "smart":
print(f" • Primary header type: {'<h2>' if h2_count > h1_count else '<h1>'}")
image_only_count = sum(1 for c in chapters if c.get('is_image_only', False))
text_only_count = sum(1 for c in chapters if not c.get('has_images', False) and c.get('file_size', 0) >= 500)
mixed_count = sum(1 for c in chapters if c.get('has_images', False) and c.get('file_size', 0) >= 500)
empty_count = sum(1 for c in chapters if c.get('file_size', 0) < 50)
print(f" • Text-only chapters: {text_only_count}")
print(f" • Image-only chapters: {image_only_count}")
print(f" • Mixed content chapters: {mixed_count}")
print(f" • Empty/minimal content: {empty_count}")
# Check for merged chapters
merged_count = sum(1 for c in chapters if c.get('was_merged', False))
if merged_count > 0:
print(f" • Merged chapters: {merged_count}")
# Check for missing chapters (only for integer sequences)
expected_chapters = set(range(chapters[0]['num'], chapters[-1]['num'] + 1))
actual_chapters = set(c['num'] for c in chapters)
missing = expected_chapters - actual_chapters
if missing:
print(f" ⚠️ Missing chapter numbers: {sorted(missing)}")
if extraction_mode == "smart":
method_stats = Counter(c['detection_method'] for c in chapters)
print(f" 📈 Detection methods used:")
for method, count in method_stats.most_common():
print(f" • {method}: {count} chapters")
large_groups = [size for size, files in file_size_groups.items() if len(files) > 1]
if large_groups:
print(f" ⚠️ Found {len(large_groups)} file size groups with potential duplicates")
    else:
        empty_count = sum(1 for c in chapters if c.get('file_size', 0) < 50)
        print(f" • Empty/placeholder: {empty_count}")
if extraction_mode == "full":
print(f" 🔍 Full extraction preserved all HTML structure and tags")
def _extract_epub_metadata(zf):
"""Extract comprehensive metadata from EPUB file including all custom fields"""
meta = {}
# Use lxml for XML if available
try:
import lxml
xml_parser = 'lxml-xml'
except ImportError:
xml_parser = 'xml'
try:
for name in zf.namelist():
if name.lower().endswith('.opf'):
opf_content = zf.read(name)
soup = BeautifulSoup(opf_content, xml_parser)
# Extract ALL Dublin Core elements (expanded list)
dc_elements = ['title', 'creator', 'subject', 'description',
'publisher', 'contributor', 'date', 'type',
'format', 'identifier', 'source', 'language',
'relation', 'coverage', 'rights']
for element in dc_elements:
tag = soup.find(element)
if tag and tag.get_text(strip=True):
meta[element] = tag.get_text(strip=True)
# Extract ALL meta tags (not just series)
meta_tags = soup.find_all('meta')
for meta_tag in meta_tags:
# Try different attribute names for the metadata name
name = meta_tag.get('name') or meta_tag.get('property', '')
content = meta_tag.get('content', '')
if name and content:
# Store original name for debugging
original_name = name
# Clean up common prefixes
if name.startswith('calibre:'):
name = name[8:] # Remove 'calibre:' prefix
elif name.startswith('dc:'):
name = name[3:] # Remove 'dc:' prefix
elif name.startswith('opf:'):
name = name[4:] # Remove 'opf:' prefix
# Normalize the field name - replace hyphens with underscores
name = name.replace('-', '_')
# Don't overwrite if already exists (prefer direct tags over meta tags)
if name not in meta:
meta[name] = content
# Debug output for custom fields
if original_name != name:
print(f" • Found custom field: {original_name}{name}")
# Special handling for series information (maintain compatibility)
if 'series' not in meta:
series_tags = soup.find_all('meta', attrs={'name': lambda x: x and 'series' in x.lower()})
for series_tag in series_tags:
series_name = series_tag.get('content', '')
if series_name:
meta['series'] = series_name
break
# Extract refines metadata (used by some EPUB creators)
refines_metas = soup.find_all('meta', attrs={'refines': True})
for refine in refines_metas:
property_name = refine.get('property', '')
content = refine.get_text(strip=True) or refine.get('content', '')
if property_name and content:
# Clean property name
if ':' in property_name:
property_name = property_name.split(':')[-1]
property_name = property_name.replace('-', '_')
if property_name not in meta:
meta[property_name] = content
# Log extraction summary
print(f"📋 Extracted {len(meta)} metadata fields")
# Show standard vs custom fields
standard_keys = {'title', 'creator', 'language', 'subject', 'description',
'publisher', 'date', 'identifier', 'source', 'rights',
'contributor', 'type', 'format', 'relation', 'coverage'}
custom_keys = set(meta.keys()) - standard_keys
if custom_keys:
print(f"📋 Standard fields: {len(standard_keys & set(meta.keys()))}")
print(f"📋 Custom fields found: {sorted(custom_keys)}")
# Show sample values for custom fields (truncated)
for key in sorted(custom_keys)[:5]: # Show first 5 custom fields
value = str(meta[key])
if len(value) > 50:
value = value[:47] + "..."
print(f" • {key}: {value}")
if len(custom_keys) > 5:
print(f" • ... and {len(custom_keys) - 5} more custom fields")
break
except Exception as e:
print(f"[WARNING] Failed to extract metadata: {e}")
import traceback
traceback.print_exc()
return meta
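# Usage sketch for _extract_epub_metadata ('my_book.epub' is a hypothetical
# path; the function only needs an open ZipFile, not a filesystem path):
#
#     import zipfile
#     with zipfile.ZipFile('my_book.epub', 'r') as zf:
#         meta = _extract_epub_metadata(zf)
#     print(meta.get('title'), meta.get('creator'))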
def _categorize_resource(file_path, file_name):
"""Categorize a file and return (resource_type, target_dir, safe_filename), or None if the file type is unrecognized"""
file_path_lower = file_path.lower()
file_name_lower = file_name.lower()
if file_path_lower.endswith('.css'):
return 'css', 'css', sanitize_resource_filename(file_name)
elif file_path_lower.endswith(('.ttf', '.otf', '.woff', '.woff2', '.eot')):
return 'fonts', 'fonts', sanitize_resource_filename(file_name)
elif file_path_lower.endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg', '.bmp', '.webp')):
return 'images', 'images', sanitize_resource_filename(file_name)
elif (file_path_lower.endswith(('.opf', '.ncx')) or
file_name_lower == 'container.xml' or
'container.xml' in file_path_lower):
if 'container.xml' in file_path_lower:
safe_filename = 'container.xml'
else:
safe_filename = file_name
return 'epub_structure', None, safe_filename
elif file_path_lower.endswith(('.js', '.xml', '.txt')):
return 'other', None, sanitize_resource_filename(file_name)
return None # Unrecognized file type - caller should skip it
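# Contract examples (hypothetical inputs; the tuple is
# (resource_type, target_dir, safe_filename), or None when unrecognized):
#
#     _categorize_resource('OEBPS/styles/main.css', 'main.css')
#     # -> ('css', 'css', 'main.css')
#     _categorize_resource('META-INF/container.xml', 'container.xml')
#     # -> ('epub_structure', None, 'container.xml')
#     _categorize_resource('OEBPS/video/clip.mp4', 'clip.mp4')
#     # -> None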
def _cleanup_old_resources(output_dir):
"""Clean up old resource directories and EPUB structure files"""
print("🧹 Cleaning up any existing resource directories...")
cleanup_success = True
for resource_type in ['css', 'fonts', 'images']:
resource_dir = os.path.join(output_dir, resource_type)
if os.path.exists(resource_dir):
try:
shutil.rmtree(resource_dir)
print(f" 🗑️ Removed old {resource_type} directory")
except PermissionError:
print(f" ⚠️ Cannot remove {resource_type} directory (permission denied) - will merge with existing files")
cleanup_success = False
except Exception as e:
print(f" ⚠️ Error removing {resource_type} directory: {e} - will merge with existing files")
cleanup_success = False
epub_structure_files = ['container.xml', 'content.opf', 'toc.ncx']
for epub_file in epub_structure_files:
input_path = os.path.join(output_dir, epub_file)
if os.path.exists(input_path):
try:
os.remove(input_path)
print(f" 🗑️ Removed old {epub_file}")
except PermissionError:
print(f" ⚠️ Cannot remove {epub_file} (permission denied) - will use existing file")
except Exception as e:
print(f" ⚠️ Error removing {epub_file}: {e}")
try:
for file in os.listdir(output_dir):
if file.lower().endswith(('.opf', '.ncx')):
file_path = os.path.join(output_dir, file)
try:
os.remove(file_path)
print(f" 🗑️ Removed old EPUB file: {file}")
except PermissionError:
print(f" ⚠️ Cannot remove {file} (permission denied)")
except Exception as e:
print(f" ⚠️ Error removing {file}: {e}")
except Exception as e:
print(f"⚠️ Error scanning for EPUB files: {e}")
if not cleanup_success:
print("⚠️ Some cleanup operations failed due to file permissions")
print(" The program will continue and merge with existing files")
return cleanup_success
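# Minimal usage sketch ('/path/to/output' is a hypothetical directory; a False
# return means some removals failed and new files will merge with leftovers):
#
#     if not _cleanup_old_resources('/path/to/output'):
#         print("Continuing anyway - merging with existing files")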
def _count_existing_resources(output_dir, extracted_resources):
"""Count existing resources when skipping extraction"""
for resource_type in ['css', 'fonts', 'images', 'epub_structure']:
if resource_type == 'epub_structure':
epub_files = []
for file in ['container.xml', 'content.opf', 'toc.ncx']:
if os.path.exists(os.path.join(output_dir, file)):
epub_files.append(file)
try:
for file in os.listdir(output_dir):
if file.lower().endswith(('.opf', '.ncx')) and file not in epub_files:
epub_files.append(file)
except OSError:
pass
extracted_resources[resource_type] = epub_files
else:
resource_dir = os.path.join(output_dir, resource_type)
if os.path.exists(resource_dir):
try:
files = [f for f in os.listdir(resource_dir) if os.path.isfile(os.path.join(resource_dir, f))]
extracted_resources[resource_type] = files
except OSError:
extracted_resources[resource_type] = []
total_existing = sum(len(files) for files in extracted_resources.values())
print(f"✅ Found {total_existing} existing resource files")
return extracted_resources
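# Call-shape sketch (hypothetical directory; the dict is mutated in place and
# also returned, so either calling style works):
#
#     resources = {'css': [], 'fonts': [], 'images': [], 'epub_structure': []}
#     resources = _count_existing_resources('/path/to/output', resources)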
def _validate_critical_files(output_dir, extracted_resources):
"""Validate that critical EPUB files were extracted"""
total_extracted = sum(len(files) for files in extracted_resources.values())
print(f"✅ Extracted {total_extracted} resource files:")
for resource_type, files in extracted_resources.items():
if files:
if resource_type == 'epub_structure':
print(f" • EPUB Structure: {len(files)} files")
for file in files:
print(f" - {file}")
else:
print(f" • {resource_type.title()}: {len(files)} files")
critical_files = ['container.xml']
missing_critical = [f for f in critical_files if not os.path.exists(os.path.join(output_dir, f))]
if missing_critical:
print(f"⚠️ WARNING: Missing critical EPUB files: {missing_critical}")
print(" This may prevent proper EPUB reconstruction!")
else:
print("✅ All critical EPUB structure files extracted successfully")
opf_files = [f for f in extracted_resources.get('epub_structure', []) if f.lower().endswith('.opf')]
if not opf_files:
print("⚠️ WARNING: No OPF file found! This will prevent EPUB reconstruction.")
else:
print(f"✅ Found OPF file(s): {opf_files}")
def _create_extraction_report(output_dir, metadata, chapters, extracted_resources):
"""Create comprehensive extraction report with HTML file tracking"""
report_path = os.path.join(output_dir, 'extraction_report.txt')
with open(report_path, 'w', encoding='utf-8') as f:
f.write("EPUB Extraction Report\n")
f.write("=" * 50 + "\n\n")
f.write(f"EXTRACTION MODE: {metadata.get('extraction_mode', 'unknown').upper()}\n\n")
f.write("METADATA:\n")
for key, value in metadata.items():
if key not in ['chapter_titles', 'extracted_resources', 'extraction_mode']:
f.write(f" {key}: {value}\n")
f.write(f"\nCHAPTERS ({len(chapters)}):\n")
text_chapters = []
image_only_chapters = []
mixed_chapters = []
for chapter in chapters:
if chapter.get('has_images') and chapter.get('file_size', 0) < 500:
image_only_chapters.append(chapter)
elif chapter.get('has_images') and chapter.get('file_size', 0) >= 500:
mixed_chapters.append(chapter)
else:
text_chapters.append(chapter)
if text_chapters:
f.write(f"\n TEXT CHAPTERS ({len(text_chapters)}):\n")
for c in text_chapters:
f.write(f" {c['num']:3d}. {c['title']} ({c['detection_method']})\n")
if c.get('original_html_file'):
f.write(f" → {c['original_html_file']}\n")
if image_only_chapters:
f.write(f"\n IMAGE-ONLY CHAPTERS ({len(image_only_chapters)}):\n")
for c in image_only_chapters:
f.write(f" {c['num']:3d}. {c['title']} (images: {c.get('image_count', 0)})\n")
if c.get('original_html_file'):
f.write(f" → {c['original_html_file']}\n")
if 'body' in c:
try:
soup = BeautifulSoup(c['body'], 'html.parser')
images = soup.find_all('img')
for img in images[:3]:
src = img.get('src', 'unknown')
f.write(f" • Image: {src}\n")
if len(images) > 3:
f.write(f" • ... and {len(images) - 3} more images\n")
except Exception:
pass
if mixed_chapters:
f.write(f"\n MIXED CONTENT CHAPTERS ({len(mixed_chapters)}):\n")
for c in mixed_chapters:
f.write(f" {c['num']:3d}. {c['title']} (text: {c.get('file_size', 0)} chars, images: {c.get('image_count', 0)})\n")
if c.get('original_html_file'):
f.write(f" → {c['original_html_file']}\n")
f.write(f"\nRESOURCES EXTRACTED:\n")
for resource_type, files in extracted_resources.items():
if files:
if resource_type == 'epub_structure':
f.write(f" EPUB Structure: {len(files)} files\n")
for file in files:
f.write(f" - {file}\n")
else:
f.write(f" {resource_type.title()}: {len(files)} files\n")
for file in files[:5]:
f.write(f" - {file}\n")
if len(files) > 5:
f.write(f" ... and {len(files) - 5} more\n")
f.write(f"\nHTML FILES WRITTEN:\n")
html_files_written = metadata.get('html_files_written', 0)
f.write(f" Total: {html_files_written} files\n")
f.write(f" Location: Main directory and 'originals' subdirectory\n")
f.write(f"\nPOTENTIAL ISSUES:\n")
issues = []
if image_only_chapters:
issues.append(f" • {len(image_only_chapters)} chapters contain only images (may need OCR)")
missing_html = sum(1 for c in chapters if not c.get('original_html_file'))
if missing_html > 0:
issues.append(f" • {missing_html} chapters failed to write HTML files")
if not extracted_resources.get('epub_structure'):
issues.append(" • No EPUB structure files found (may affect reconstruction)")
if not issues:
f.write(" None detected - extraction appears successful!\n")
else:
for issue in issues:
f.write(issue + "\n")
print(f"📄 Saved extraction report to: {report_path}")
def _log_extraction_summary(chapters, extracted_resources, detected_language, html_files_written=0):
"""Log final extraction summary with HTML file information"""
extraction_mode = chapters[0].get('extraction_mode', 'unknown') if chapters else 'unknown'
print(f"\n✅ {extraction_mode.capitalize()} extraction complete!")
print(f" 📚 Chapters: {len(chapters)}")
print(f" 📄 HTML files written: {html_files_written}")
print(f" 🎨 Resources: {sum(len(files) for files in extracted_resources.values())}")
print(f" 🌍 Language: {detected_language}")
image_only_count = sum(1 for c in chapters if c.get('has_images') and c.get('file_size', 0) < 500)
if image_only_count > 0:
print(f" 📸 Image-only chapters: {image_only_count}")
epub_files = extracted_resources.get('epub_structure', [])
if epub_files:
print(f" 📋 EPUB Structure: {len(epub_files)} files ({', '.join(epub_files)})")
else:
print(f" ⚠️ No EPUB structure files extracted!")
print(f"\n🔍 Pre-flight check readiness:")
print(f" ✅ HTML files: {'READY' if html_files_written > 0 else 'NOT READY'}")
print(f" ✅ Metadata: READY")
print(f" ✅ Resources: READY")
def _process_single_html_file(
file_path,
file_index,
zip_file_path,
parser,
merge_candidates,
disable_merging,
enhanced_extractor,
extraction_mode,
enhanced_filtering,
preserve_structure,
protect_angle_brackets_func,
pattern_manager,
files_to_process,
is_stop_requested
):
"""Process a single HTML file from an EPUB - standalone function for multiprocessing.
This function is at module level to be picklable for ProcessPoolExecutor.
All needed data must be passed as parameters.
Returns:
tuple: (chapter_info, h1_found, h2_found, file_size, sample_text, skipped_info)
- chapter_info: dict with chapter data, or None if skipped/error
- h1_found: bool indicating if h1 tags were found
- h2_found: bool indicating if h2 tags were found
- file_size: int size of content text
- sample_text: str text sample for language detection
- skipped_info: tuple (file_path, reason, detail) if skipped, else None
"""
from bs4 import BeautifulSoup
import os
import zipfile
# Check stop
if is_stop_requested():
return None, False, False, 0, '', None
try:
# Open our own ZipFile instance for thread safety
with zipfile.ZipFile(zip_file_path, 'r') as zf:
# Read file data
file_data = zf.read(file_path)
# Decode the file data
html_content = None
detected_encoding = None
for encoding in ['utf-8', 'utf-16', 'gb18030', 'shift_jis', 'euc-kr', 'gbk', 'big5']:
try:
html_content = file_data.decode(encoding)
detected_encoding = encoding
break
except UnicodeDecodeError:
continue
if html_content is None:
print(f"[WARNING] Could not decode {file_path} with any supported encoding")
return None, False, False, 0, '', None
# Check if this file needs merging
if not disable_merging and file_path in merge_candidates:
section_file = merge_candidates[file_path]
print(f"[DEBUG] Processing merge for: {file_path}")
try:
# Reuse the already-open archive handle; re-opening the ZIP here would
# shadow zf with a second, soon-closed instance for no benefit
section_data = zf.read(section_file)
section_html = None
for encoding in ['utf-8', 'utf-16', 'gb18030', 'shift_jis', 'euc-kr', 'gbk', 'big5']:
try:
section_html = section_data.decode(encoding)
break
except UnicodeDecodeError:
continue
if section_html:
# Quick check if section is small enough to merge
section_soup = BeautifulSoup(section_html, parser)
section_text = section_soup.get_text(strip=True)
if len(section_text) < 200: # Merge if section is small
# Extract body content
chapter_soup = BeautifulSoup(html_content, parser)
if section_soup.body:
section_body_content = ''.join(str(child) for child in section_soup.body.children)
else:
section_body_content = section_html
if chapter_soup.body:
chapter_body_content = ''.join(str(child) for child in chapter_soup.body.children)
else:
chapter_body_content = html_content
# Merge content
html_content = section_body_content + "\n<hr/>\n" + chapter_body_content
print(f" → MERGED: Section ({len(section_text)} chars) + Chapter")
else:
print(f" → NOT MERGED: Section too large ({len(section_text)} chars)")
except Exception as e:
print(f"[WARNING] Failed to merge {file_path}: {e}")
# === ENHANCED EXTRACTION POINT ===
content_html = None
content_text = None
chapter_title = None
enhanced_extraction_used = False
# Determine whether to use enhanced extractor
use_enhanced = enhanced_extractor and extraction_mode == "enhanced"
force_bs_traditional = False
try:
force_bs = os.getenv('FORCE_BS_FOR_TRADITIONAL', '0') == '1'
model_env = os.getenv('MODEL', '')
# Check for traditional translation API (inline to avoid circular imports)
is_traditional_api = model_env in ['deepl', 'google-translate', 'google-translate-free'] or model_env.startswith('deepl/') or model_env.startswith('google-translate/')
if force_bs and is_traditional_api:
use_enhanced = False
force_bs_traditional = True
except Exception:
pass
# Use enhanced extractor if available and allowed
if use_enhanced:
clean_content, _, chapter_title = enhanced_extractor.extract_chapter_content(
html_content, enhanced_filtering
)
enhanced_extraction_used = True
content_html = clean_content
content_text = clean_content
# BeautifulSoup method (only for non-enhanced modes)
if not enhanced_extraction_used:
if extraction_mode == "enhanced" and not force_bs_traditional:
print(f"❌ Skipping {file_path} - enhanced extraction required but not available")
return None, False, False, 0, '', None
# Parse the (possibly merged) content
protected_html = protect_angle_brackets_func(html_content)
soup = BeautifulSoup(protected_html, parser)
# Get effective mode for filtering
effective_filtering = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
# In full mode, keep the entire HTML structure
if effective_filtering == "full":
content_html = html_content
content_text = soup.get_text(strip=True)
else:
# Smart and comprehensive modes extract body content
if soup.body:
content_html = str(soup.body)
content_text = soup.body.get_text(strip=True)
else:
content_html = html_content
content_text = soup.get_text(strip=True)
# Extract title (with ignore settings support)
chapter_title = None
# Check settings for batch translation
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
use_title_tag = os.getenv('USE_TITLE', '0') == '1' or not batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
# Extract from title tag if using titles
if use_title_tag and soup.title and soup.title.string:
chapter_title = soup.title.string.strip()
# Extract from header tags if not ignored and no title found
if not chapter_title and not ignore_header_tags:
for header_tag in ['h1', 'h2', 'h3']:
header = soup.find(header_tag)
if header:
chapter_title = header.get_text(strip=True)
break
# Fallback to filename if nothing found
if not chapter_title:
chapter_title = os.path.splitext(os.path.basename(file_path))[0]
# Get the effective extraction mode for processing logic
effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
# Skip truly empty files in smart mode
if effective_mode == "smart" and not disable_merging and len(content_text.strip()) < 10:
skipped_info = (file_path, 'empty', len(content_text))
return None, False, False, 0, '', skipped_info
# Get actual chapter number based on original position
actual_chapter_num = files_to_process.index(file_path) + 1
# Mode-specific logic
detection_method = None
h1_found = False
h2_found = False
if effective_mode == "comprehensive" or effective_mode == "full":
# For comprehensive/full mode, use sequential numbering
chapter_num = actual_chapter_num
if not chapter_title:
chapter_title = os.path.splitext(os.path.basename(file_path))[0]
detection_method = f"{extraction_mode}_sequential" if extraction_mode == "enhanced" else f"{effective_mode}_sequential"
elif effective_mode == "smart":
# For smart mode, when merging is disabled, use sequential numbering
if disable_merging:
chapter_num = actual_chapter_num
if not chapter_title:
chapter_title = os.path.splitext(os.path.basename(file_path))[0]
detection_method = f"{extraction_mode}_sequential_no_merge" if extraction_mode == "enhanced" else "sequential_no_merge"
else:
# When merging is enabled, try to extract chapter info
protected_html = protect_angle_brackets_func(html_content)
soup = BeautifulSoup(protected_html, parser)
# Count headers
h1_tags = soup.find_all('h1')
h2_tags = soup.find_all('h2')
h1_found = len(h1_tags) > 0
h2_found = len(h2_tags) > 0
# Extract chapter number and title
chapter_num, extracted_title, detection_method = _extract_chapter_info(
soup, file_path, content_text, html_content, pattern_manager
)
# Use extracted title if we don't have one
if extracted_title and not chapter_title:
chapter_title = extracted_title
# For hash-based filenames, chapter_num might be None
if chapter_num is None:
chapter_num = actual_chapter_num
detection_method = f"{extraction_mode}_sequential_fallback" if extraction_mode == "enhanced" else "sequential_fallback"
print(f"[DEBUG] No chapter number found in {file_path}, assigning: {chapter_num}")
# Filter content_html for title/header settings (before processing)
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
use_title_tag = os.getenv('USE_TITLE', '0') == '1' or not batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
remove_duplicate_h1_p = os.getenv('REMOVE_DUPLICATE_H1_P', '0') == '1'
if (not use_title_tag or ignore_header_tags or remove_duplicate_h1_p) and content_html and not enhanced_extraction_used:
# Parse the content HTML to remove unused tags
content_soup = BeautifulSoup(content_html, parser)
# Remove title tags if not using titles
if not use_title_tag:
for title_tag in content_soup.find_all('title'):
title_tag.decompose()
# Remove header tags if ignored
if ignore_header_tags:
for header_tag in content_soup.find_all(['h1', 'h2', 'h3']):
header_tag.decompose()
# Remove duplicate H1+P pairs (where P immediately follows H1 with same text)
if remove_duplicate_h1_p:
for h1_tag in content_soup.find_all('h1'):
# Skip split marker H1 tags
h1_id = h1_tag.get('id', '')
if h1_id and h1_id.startswith('split-'):
continue
h1_text = h1_tag.get_text(strip=True)
if 'SPLIT MARKER' in h1_text:
continue
# Get the next sibling (skipping whitespace/text nodes)
next_sibling = h1_tag.find_next_sibling()
if next_sibling and next_sibling.name == 'p':
# Compare text content (stripped)
p_text = next_sibling.get_text(strip=True)
if h1_text == p_text:
# Remove the duplicate paragraph
next_sibling.decompose()
# Update content_html with filtered version
content_html = str(content_soup)
# Process images and metadata
protected_html = protect_angle_brackets_func(html_content)
soup = BeautifulSoup(protected_html, parser)
images = soup.find_all('img')
has_images = len(images) > 0
is_image_only_chapter = has_images and len(content_text.strip()) < 500
if is_image_only_chapter:
print(f"[DEBUG] Image-only chapter detected: {file_path} ({len(images)} images, {len(content_text)} chars)")
# Calculate a stable hash of the filtered content (hashlib is a stdlib module
# already imported at module level, so no local import is required)
content_hash = hashlib.sha256(content_html.encode('utf-8', errors='ignore')).hexdigest()
file_size = len(content_text)
sample_text = content_text[:500] if effective_mode == "smart" else ''
# Ensure chapter_num is always an integer
if isinstance(chapter_num, float):
chapter_num = int(chapter_num)
# Create chapter info
chapter_info = {
"num": chapter_num,
"title": chapter_title or f"Chapter {chapter_num}",
"body": content_html,
"filename": file_path,
# IMPORTANT: For PDFs, we must preserve the original filename including extension
# so that chapter_splitter.py can detect it as PDF content.
# But we also want to preserve the basename for display/logging.
"source_file": os.path.basename(zip_file_path) if zip_file_path else file_path,
"original_filename": os.path.basename(file_path),
"original_basename": os.path.splitext(os.path.basename(file_path))[0],
"content_hash": content_hash,
"detection_method": detection_method if detection_method else "pending",
"file_size": file_size,
"has_images": has_images,
"image_count": len(images),
"is_empty": len(content_text.strip()) == 0,
"is_image_only": is_image_only_chapter,
"extraction_mode": extraction_mode,
"file_index": file_index
}
# Add enhanced extraction info if used
if enhanced_extraction_used:
chapter_info["enhanced_extraction"] = True
chapter_info["enhanced_filtering"] = enhanced_filtering
chapter_info["preserve_structure"] = preserve_structure
# Store original HTML for image restoration
chapter_info["original_html"] = html_content
# Add merge info if applicable
if not disable_merging and file_path in merge_candidates:
chapter_info["was_merged"] = True
chapter_info["merged_with"] = merge_candidates[file_path]
if effective_mode == "smart":
chapter_info["language_sample"] = content_text[:500]
# Debug for section files
if 'section' in chapter_info['original_basename'].lower():
print(f"[DEBUG] Added section file to candidates: {chapter_info['original_basename']} (size: {chapter_info['file_size']})")
return chapter_info, h1_found, h2_found, file_size, sample_text, None
except Exception as e:
print(f"[ERROR] Failed to process {file_path}: {e}")
import traceback
traceback.print_exc()
return None, False, False, 0, '', None
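# Parallel driver sketch (an inert helper, not the module's real caller, which
# wires in its own merge candidates, PatternManager, and stop flag elsewhere).
# Assumptions: merging disabled, "smart" mode, no enhanced extractor, and the
# builtin str as an identity placeholder for protect_angle_brackets_func (all
# submitted arguments must be picklable for ProcessPoolExecutor).
def _demo_parallel_extract(zip_file_path, files_to_process, max_workers=4):
    """Fan _process_single_html_file over a process pool and collect chapters."""
    chapters = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(
                _process_single_html_file,
                file_path, idx, zip_file_path, _get_best_parser(),
                {},       # merge_candidates: none in this sketch
                True,     # disable_merging
                None,     # enhanced_extractor: BeautifulSoup path only
                "smart",  # extraction_mode
                "smart",  # enhanced_filtering (unused without an extractor)
                False,    # preserve_structure
                str,      # protect_angle_brackets_func: identity placeholder
                None,     # pattern_manager: unused when merging is disabled
                files_to_process,
                is_stop_requested,
            ): file_path
            for idx, file_path in enumerate(files_to_process)
        }
        for future in as_completed(futures):
            chapter_info, *_ = future.result()
            if chapter_info:
                chapters.append(chapter_info)
    chapters.sort(key=lambda c: c['num'])
    return chapters
# On Windows, call this only under an `if __name__ == "__main__":` guard,
# since ProcessPoolExecutor re-imports this module in each spawned worker.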