import json
import logging
import re
import unicodedata
from collections import defaultdict

import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from pytesseract import Output
from tqdm import tqdm
|
|
|
|
|
| try:
|
| from pix2text import Pix2Text
|
|
|
| PIX2TEXT_AVAILABLE = True
|
| print("Pix2Text imported successfully for advanced math extraction")
|
| except ImportError:
|
| PIX2TEXT_AVAILABLE = False
|
| print("Pix2Text not available. Install with: pip install pix2text")
|
| print(" Falling back to traditional OCR for math expressions")
|
|
|
|
|
| logging.basicConfig(level=logging.INFO)
|
| logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
| def classify_character(char):
|
| """
|
| Classify a single character as English, Bangla, Math, or Other.
|
| Enhanced for better math detection.
|
| """
|
| if not char or char.isspace():
|
| return "space"
|
|
|
|
|
| if "\u0980" <= char <= "\u09ff":
|
| return "bangla"
|
|
|
|
|
| math_chars = set(
|
| "=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇∀∃∈∉⊂⊃⊆⊇∪∩∧∨¬"
|
| "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"
|
| "±≈≠≡⇒⇔∘∗⊕⊗⊙⊥∥∦∝∞"
|
| )
|
|
|
|
|
| math_ranges = [
|
| ("\u2200", "\u22ff"),
|
| ("\u2190", "\u21ff"),
|
| ("\u0370", "\u03ff"),
|
| ("\u2070", "\u209f"),
|
| ("\u27c0", "\u27ef"),
|
| ("\u2980", "\u29ff"),
|
| ]
|
|
|
| if char in math_chars:
|
| return "math"
|
|
|
| for start, end in math_ranges:
|
| if start <= char <= end:
|
| return "math"
|
|
|
|
|
| if char.isdigit():
|
| return "number"
|
|
|
|
|
| if char.isascii() and char.isalpha():
|
| return "english"
|
|
|
|
|
| if char in ".,;:!?()[]{}\"'-_/\\^":
|
| return "punctuation"
|
|
|
| return "other"
|
|
|
|
|
| def classify_text_region(text):
|
| """
|
| Enhanced text region classification with better math detection.
|
| """
|
| if not text.strip():
|
| return "empty"
|
|
|
| char_counts = defaultdict(int)
|
| for char in text:
|
| char_type = classify_character(char)
|
| char_counts[char_type] += 1
|
|
|
|
|
| significant_chars = {k: v for k, v in char_counts.items() if k not in ["space"]}
|
|
|
| if not significant_chars:
|
| return "empty"
|
|
|
| total_significant = sum(significant_chars.values())
|
| percentages = {k: v / total_significant for k, v in significant_chars.items()}
|
|
|
|
|
| math_indicators = percentages.get("math", 0) + percentages.get("number", 0) * 0.5
|
|
|
| if percentages.get("bangla", 0) > 0.5:
|
| return "bangla"
|
| elif math_indicators > 0.3 or has_math_patterns(text):
|
| return "math"
|
| elif percentages.get("english", 0) > 0.5:
|
| return "english"
|
| else:
|
| return "mixed"
|
|
|
|
|
| def has_math_patterns(text):
|
| """
|
| Detect mathematical patterns in text using regex and heuristics.
|
| """
|
| import re
|
|
|
|
|
| math_patterns = [
|
| r"\d+[\+\-\*/=]\d+",
|
| r"[xy]\^?\d+",
|
| r"\\[a-zA-Z]+",
|
| r"\$.*?\$",
|
| r"[a-zA-Z]\([a-zA-Z,\d\s]+\)",
|
| r"\b(sin|cos|tan|log|ln|exp|sqrt|int|sum|lim)\b",
|
| r"[≤≥≠≈∫∑∂∞]",
|
| ]
|
|
|
| for pattern in math_patterns:
|
| if re.search(pattern, text, re.IGNORECASE):
|
| return True
|
|
|
| return False
|
|
|
|
|
|
|
|
|
|
|
| def initialize_pix2text():
|
| """Initialize Pix2Text model for mathematical expression extraction."""
|
| if not PIX2TEXT_AVAILABLE:
|
| return None
|
|
|
| try:
|
|
|
|
|
| logger.info("Initializing Pix2Text...")
|
|
|
|
|
| try:
|
| p2t = Pix2Text.from_config()
|
| logger.info("✅ Pix2Text initialized with default config")
|
| return p2t
|
| except Exception as e1:
|
| logger.warning(f"Default Pix2Text init failed: {e1}")
|
|
|
|
|
| try:
|
| p2t = Pix2Text()
|
| logger.info("✅ Pix2Text initialized with basic constructor")
|
| return p2t
|
| except Exception as e2:
|
| logger.warning(f"Basic Pix2Text init failed: {e2}")
|
|
|
|
|
| try:
|
| config = {"device": "cpu"}
|
| p2t = Pix2Text.from_config(config)
|
| logger.info("✅ Pix2Text initialized with CPU config")
|
| return p2t
|
| except Exception as e3:
|
| logger.error(f"All Pix2Text initialization methods failed: {e3}")
|
|
|
| return None
|
|
|
| except Exception as e:
|
| logger.error(f"❌ Failed to initialize Pix2Text: {e}")
|
| return None
|
|
|
|
|
|
|
|
|
|
|
| def preprocess_image_advanced(pil_image):
|
| """Enhanced image preprocessing with multiple techniques."""
|
| img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
|
| gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
|
|
|
| gray = cv2.fastNlMeansDenoising(gray, h=15)
|
|
|
|
|
| binary = cv2.adaptiveThreshold(
|
| gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 5
|
| )
|
|
|
|
|
| enhanced = cv2.convertScaleAbs(binary, alpha=1.2, beta=10)
|
|
|
|
|
| height, width = enhanced.shape
|
| scaled = cv2.resize(
|
| enhanced, (width * 2, height * 2), interpolation=cv2.INTER_CUBIC
|
| )
|
|
|
| return scaled
|
|
|
|
|
| def preprocess_for_pix2text(pil_image, region):
|
| """
|
| Special preprocessing for Pix2Text mathematical expression extraction.
|
| """
|
|
|
| img = np.array(pil_image)
|
|
|
|
|
| x, y, w, h = region["left"], region["top"], region["width"], region["height"]
|
|
|
|
|
| if w <= 0 or h <= 0:
|
| logger.warning(f"Invalid region dimensions: w={w}, h={h}. Skipping Pix2Text.")
|
| return None
|
|
|
|
|
| padding = 10
|
| x_start = max(0, x - padding)
|
| y_start = max(0, y - padding)
|
| x_end = min(img.shape[1], x + w + padding)
|
| y_end = min(img.shape[0], y + h + padding)
|
|
|
|
|
| if x_end <= x_start or y_end <= y_start:
|
| logger.warning(
|
| f"Invalid crop bounds: x({x_start}:{x_end}), y({y_start}:{y_end}). Skipping Pix2Text."
|
| )
|
| return None
|
|
|
| cropped = img[y_start:y_end, x_start:x_end]
|
|
|
|
|
| if cropped.size == 0:
|
| logger.warning("Cropped image is empty. Skipping Pix2Text.")
|
| return None
|
|
|
|
|
| try:
|
| cropped_pil = Image.fromarray(cropped)
|
| except Exception as e:
|
| logger.error(f"Failed to create PIL image from cropped array: {e}")
|
| return None
|
|
|
|
|
| min_size = 32
|
| if cropped_pil.width <= 0 or cropped_pil.height <= 0:
|
| logger.warning(
|
| f"Invalid PIL image dimensions: {cropped_pil.width}x{cropped_pil.height}"
|
| )
|
| return None
|
|
|
| if cropped_pil.width < min_size or cropped_pil.height < min_size:
|
|
|
| try:
|
| ratio = max(min_size / cropped_pil.width, min_size / cropped_pil.height)
|
| new_width = int(cropped_pil.width * ratio)
|
| new_height = int(cropped_pil.height * ratio)
|
|
|
|
|
| if new_width <= 0 or new_height <= 0:
|
| logger.warning(f"Invalid resized dimensions: {new_width}x{new_height}")
|
| return None
|
|
|
| cropped_pil = cropped_pil.resize((new_width, new_height), Image.LANCZOS)
|
| except Exception as e:
|
| logger.error(f"Failed to resize image: {e}")
|
| return None
|
|
|
| return cropped_pil
|
|
|
|
|
|
|
|
|
|
|
| def detect_text_regions(image):
|
| """Detect text regions and classify them by line and character type."""
|
| data = pytesseract.image_to_data(image, output_type=Output.DICT, lang="eng+ben")
|
|
|
| text_regions = []
|
| for i in range(len(data["text"])):
|
| text = data["text"][i].strip()
|
| if text and int(data["conf"][i]) > 25:
|
|
|
| width = int(data["width"][i])
|
| height = int(data["height"][i])
|
| left = int(data["left"][i])
|
| top = int(data["top"][i])
|
|
|
|
|
| if width <= 0 or height <= 0:
|
| logger.debug(
|
| f"Skipping region with invalid dimensions: {width}x{height}"
|
| )
|
| continue
|
|
|
|
|
| if width < 3 or height < 3:
|
| logger.debug(f"Skipping tiny region: {width}x{height}")
|
| continue
|
|
|
| region = {
|
| "text": text,
|
| "left": left,
|
| "top": top,
|
| "width": width,
|
| "height": height,
|
| "confidence": int(data["conf"][i]),
|
| "type": classify_text_region(text),
|
| }
|
| text_regions.append(region)
|
|
|
| logger.info(f"Detected {len(text_regions)} valid text regions")
|
| return text_regions
|
|
|
|
|
| def group_regions_by_line(regions, line_tolerance=15):
|
| """Group text regions into lines with better tolerance for math expressions."""
|
| if not regions:
|
| return []
|
|
|
| regions_sorted = sorted(regions, key=lambda x: x["top"])
|
|
|
| lines = []
|
| current_line = [regions_sorted[0]]
|
| current_top = regions_sorted[0]["top"]
|
|
|
| for region in regions_sorted[1:]:
|
|
|
|
|
| current_height = max(1, current_line[0]["height"])
|
| region_height = max(1, region["height"])
|
| height_avg = (current_height + region_height) / 2
|
| tolerance = max(line_tolerance, height_avg * 0.3)
|
|
|
| if abs(region["top"] - current_top) <= tolerance:
|
| current_line.append(region)
|
| else:
|
| current_line.sort(key=lambda x: x["left"])
|
| lines.append(current_line)
|
| current_line = [region]
|
| current_top = region["top"]
|
|
|
| if current_line:
|
| current_line.sort(key=lambda x: x["left"])
|
| lines.append(current_line)
|
|
|
| return lines
|
|
|
|
|
|
|
|
|
|
|
| def extract_english_region(image, region):
|
| """Extract English text from a specific region with optimized settings."""
|
| x, y, w, h = region["left"], region["top"], region["width"], region["height"]
|
|
|
| roi = image[y : y + h, x : x + w]
|
| if roi.size == 0:
|
| return region["text"]
|
|
|
| config = r"--oem 3 --psm 8 -l eng"
|
| try:
|
| result = pytesseract.image_to_string(roi, config=config).strip()
|
| return result if result else region["text"]
|
| except Exception:
|
| return region["text"]
|
|
|
|
|
| def extract_bangla_region(image, region):
|
| """Extract Bangla text from a specific region with optimized settings."""
|
| x, y, w, h = region["left"], region["top"], region["width"], region["height"]
|
|
|
| roi = image[y : y + h, x : x + w]
|
| if roi.size == 0:
|
| return region["text"]
|
|
|
| config = r"--oem 3 --psm 8 -l ben"
|
| try:
|
| result = pytesseract.image_to_string(roi, config=config).strip()
|
| return result if result else region["text"]
|
| except Exception:
|
| return region["text"]
|
|
|
|
|
| def extract_math_region_pix2text(pil_image, region, p2t_model):
|
| """
|
| Extract mathematical expressions using Pix2Text with fallback to traditional OCR.
|
| """
|
| if not p2t_model:
|
| return extract_math_region_traditional(pil_image, region)
|
|
|
| try:
|
|
|
| math_image = preprocess_for_pix2text(pil_image, region)
|
|
|
|
|
| if math_image is None:
|
| logger.warning(
|
| "Pix2Text preprocessing failed, falling back to traditional OCR"
|
| )
|
| return extract_math_region_traditional(pil_image, region)
|
|
|
|
|
| result = p2t_model(math_image)
|
|
|
|
|
| extracted_text = parse_pix2text_result(result)
|
|
|
| if extracted_text and extracted_text.strip():
|
|
|
| if not is_valid_pix2text_result(extracted_text):
|
| logger.warning(f"Invalid Pix2Text result: {extracted_text[:100]}...")
|
| return extract_math_region_traditional(pil_image, region)
|
|
|
| logger.info(f"✅ Pix2Text extracted: {extracted_text[:50]}...")
|
| return extracted_text.strip()
|
| else:
|
| logger.warning(
|
| "⚠️ Pix2Text returned empty result, falling back to traditional OCR"
|
| )
|
| return extract_math_region_traditional(pil_image, region)
|
|
|
| except Exception as e:
|
| logger.error(f"❌ Pix2Text extraction failed: {e}")
|
| return extract_math_region_traditional(pil_image, region)
|
|
|
|
|
| def parse_pix2text_result(result):
|
| """
|
| Parse Pix2Text result handling various response formats.
|
| """
|
| try:
|
| if isinstance(result, dict):
|
|
|
|
|
| for key in ["text", "formula", "latex", "content", "output"]:
|
| if key in result and result[key]:
|
| return str(result[key])
|
|
|
|
|
|
|
| result_str = str(result)
|
| if len(result_str) > 1000:
|
| return ""
|
| return result_str
|
|
|
| elif isinstance(result, list):
|
|
|
| if not result:
|
| return ""
|
|
|
|
|
| valid_items = []
|
| for item in result:
|
| item_str = str(item).strip()
|
| if item_str and not is_debug_content(item_str):
|
| valid_items.append(item_str)
|
|
|
| return " ".join(valid_items)
|
|
|
| elif isinstance(result, str):
|
| return result
|
| else:
|
| return str(result)
|
|
|
| except Exception as e:
|
| logger.error(f"Error parsing Pix2Text result: {e}")
|
| return ""
|
|
|
|
|
| def is_valid_pix2text_result(text):
|
| """
|
| Check if the Pix2Text result is valid mathematical content.
|
| """
|
| if not text or not text.strip():
|
| return False
|
|
|
| text = text.strip()
|
|
|
|
|
| invalid_patterns = [
|
| "Page(id=",
|
| "elements=[]",
|
| "number=0",
|
| "Error:",
|
| "Exception:",
|
| "Traceback:",
|
| "DEBUG:",
|
| "INFO:",
|
| "WARNING:",
|
| "ERROR:",
|
| ]
|
|
|
| for pattern in invalid_patterns:
|
| if pattern in text:
|
| return False
|
|
|
|
|
| if len(text) < 1:
|
| return False
|
|
|
|
|
|
|
| import re
|
|
|
| if re.search(r"[a-zA-Z0-9=+\-*/(){}[\]^_√∫∑∂πθαβγδλμΩ]", text):
|
| return True
|
|
|
| return False
|
|
|
|
|
| def is_debug_content(text):
|
| """
|
| Check if text appears to be debug/logging content rather than actual content.
|
| """
|
| debug_indicators = [
|
| "Page(",
|
| "id=",
|
| "number=",
|
| "elements=",
|
| "[])",
|
| "DEBUG",
|
| "INFO",
|
| "WARNING",
|
| "ERROR",
|
| "Exception",
|
| "Traceback",
|
| 'File "',
|
| "line ",
|
| " at 0x",
|
| ]
|
|
|
| for indicator in debug_indicators:
|
| if indicator in text:
|
| return True
|
|
|
| return False
|
|
|
|
|
| def extract_math_region_traditional(pil_image, region):
|
| """
|
| Fallback traditional OCR for mathematical expressions.
|
| """
|
|
|
| img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
|
| gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
|
| x, y, w, h = region["left"], region["top"], region["width"], region["height"]
|
| roi = gray[y : y + h, x : x + w]
|
|
|
| if roi.size == 0:
|
| return region["text"]
|
|
|
|
|
| math_chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇()[]{}.,;:^_αβγδλμθΩ±≈≠≡⇒⇔"
|
| config = f"--oem 3 --psm 6 -c tessedit_char_whitelist={math_chars}"
|
|
|
| try:
|
| result = pytesseract.image_to_string(roi, config=config).strip()
|
| return result if result else region["text"]
|
| except Exception:
|
| return region["text"]
|
|
|
|
|
| def extract_mixed_region(pil_image, region, p2t_model):
|
| """Extract mixed content using multiple approaches."""
|
|
|
| img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
|
| gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
|
| eng_result = extract_english_region(gray, region)
|
| bangla_result = extract_bangla_region(gray, region)
|
|
|
|
|
| if has_math_patterns(region["text"]):
|
| math_result = extract_math_region_pix2text(pil_image, region, p2t_model)
|
|
|
| results = [r for r in [eng_result, bangla_result, math_result] if r.strip()]
|
| return max(results, key=len) if results else region["text"]
|
|
|
|
|
| return bangla_result if len(bangla_result) > len(eng_result) else eng_result
|
|
|
|
|
|
|
|
|
|
|
| def analyze_character_by_character(text):
|
| """Analyze text character by character to identify language patterns."""
|
| analysis = {
|
| "characters": [],
|
| "language_segments": [],
|
| "total_chars": len(text),
|
| "language_distribution": defaultdict(int),
|
| }
|
|
|
| for i, char in enumerate(text):
|
| char_type = classify_character(char)
|
| analysis["characters"].append(
|
| {
|
| "char": char,
|
| "position": i,
|
| "type": char_type,
|
| "unicode_name": unicodedata.name(char, "UNKNOWN"),
|
| }
|
| )
|
| analysis["language_distribution"][char_type] += 1
|
|
|
|
|
| current_segment = None
|
| for char_info in analysis["characters"]:
|
| if char_info["type"] in ["space", "punctuation"]:
|
| continue
|
|
|
| if current_segment is None or current_segment["type"] != char_info["type"]:
|
| if current_segment:
|
| analysis["language_segments"].append(current_segment)
|
| current_segment = {
|
| "type": char_info["type"],
|
| "start": char_info["position"],
|
| "end": char_info["position"],
|
| "text": char_info["char"],
|
| }
|
| else:
|
| current_segment["end"] = char_info["position"]
|
| current_segment["text"] += char_info["char"]
|
|
|
| if current_segment:
|
| analysis["language_segments"].append(current_segment)
|
|
|
| return analysis
|
|
|
|
|
|
|
|
|
|
|
| def process_page_advanced(page_image, page_num, p2t_model):
|
| """
|
| Advanced page processing with Pix2Text integration.
|
| """
|
| print(f"Processing page {page_num + 1}...")
|
|
|
|
|
| processed_image = preprocess_image_advanced(page_image)
|
|
|
|
|
| regions = detect_text_regions(processed_image)
|
|
|
|
|
| lines = group_regions_by_line(regions)
|
|
|
| page_results = []
|
|
|
| for line_num, line in enumerate(lines):
|
| line_text_parts = []
|
|
|
| for region in line:
|
|
|
| if region["type"] == "english":
|
| extracted_text = extract_english_region(processed_image, region)
|
| elif region["type"] == "bangla":
|
| extracted_text = extract_bangla_region(processed_image, region)
|
| elif region["type"] == "math":
|
| extracted_text = extract_math_region_pix2text(
|
| page_image, region, p2t_model
|
| )
|
| elif region["type"] == "mixed":
|
| extracted_text = extract_mixed_region(page_image, region, p2t_model)
|
| else:
|
| extracted_text = region["text"]
|
|
|
|
|
| char_analysis = analyze_character_by_character(extracted_text)
|
|
|
| region_result = {
|
| "page": page_num,
|
| "line": line_num,
|
| "text": extracted_text,
|
| "original_text": region["text"],
|
| "position": {
|
| "left": region["left"],
|
| "top": region["top"],
|
| "width": region["width"],
|
| "height": region["height"],
|
| },
|
| "confidence": region["confidence"],
|
| "detected_type": region["type"],
|
| "extraction_method": "pix2text"
|
| if region["type"] == "math" and p2t_model
|
| else "tesseract",
|
| "character_analysis": char_analysis,
|
| }
|
|
|
| page_results.append(region_result)
|
| line_text_parts.append(extracted_text)
|
|
|
|
|
| if line_text_parts:
|
| line_text = " ".join(line_text_parts)
|
| print(f" Line {line_num + 1}: {line_text[:100]}...")
|
|
|
| return page_results
|
|
|
|
|
| def extract_all_text_advanced_pix2text(
|
| pdf_path, output_text_file, output_json_file, output_analysis_file
|
| ):
|
| """
|
| Advanced text extraction with Pix2Text integration.
|
| """
|
| print("[INFO] Initializing Pix2Text for mathematical expression extraction...")
|
| p2t_model = initialize_pix2text()
|
|
|
| if p2t_model:
|
| print("✅ Pix2Text ready for advanced math extraction")
|
| else:
|
| print("⚠️ Using traditional OCR for math expressions")
|
|
|
| print("[INFO] Converting PDF to images...")
|
| pages = convert_from_path(pdf_path, dpi=300)
|
|
|
| all_results = []
|
| combined_text_parts = []
|
|
|
| for page_num, page_image in enumerate(tqdm(pages, desc="Processing pages")):
|
| page_results = process_page_advanced(page_image, page_num, p2t_model)
|
| all_results.extend(page_results)
|
|
|
|
|
| page_text_parts = [result["text"] for result in page_results]
|
| page_text = " ".join(page_text_parts)
|
| combined_text_parts.append(page_text)
|
|
|
|
|
| final_text = "\n\n".join(combined_text_parts)
|
|
|
|
|
| with open(output_text_file, "w", encoding="utf-8") as f:
|
| f.write(final_text)
|
|
|
|
|
| with open(output_json_file, "w", encoding="utf-8") as f:
|
| json.dump(all_results, f, ensure_ascii=False, indent=2)
|
|
|
|
|
| summary_analysis = create_extraction_summary(all_results)
|
| with open(output_analysis_file, "w", encoding="utf-8") as f:
|
| json.dump(summary_analysis, f, ensure_ascii=False, indent=2)
|
|
|
| print("\n[✅] Advanced Pix2Text extraction complete!")
|
| print(f"→ Text file saved to: {output_text_file}")
|
| print(f"→ Detailed JSON saved to: {output_json_file}")
|
| print(f"→ Analysis report saved to: {output_analysis_file}")
|
|
|
|
|
| print("\n📊 Extraction Summary:")
|
| print(f" Total text regions: {len(all_results)}")
|
| print(f" English regions: {summary_analysis['type_distribution']['english']}")
|
| print(f" Bangla regions: {summary_analysis['type_distribution']['bangla']}")
|
| print(f" Math regions: {summary_analysis['type_distribution']['math']}")
|
| print(f" Mixed regions: {summary_analysis['type_distribution']['mixed']}")
|
|
|
|
|
| method_stats = defaultdict(int)
|
| for result in all_results:
|
| method_stats[result.get("extraction_method", "unknown")] += 1
|
|
|
| print("\n🔧 Extraction Methods Used:")
|
| for method, count in method_stats.items():
|
| print(f" {method}: {count} regions")
|
|
|
|
|
| def create_extraction_summary(results):
|
| """Create a comprehensive summary of the extraction results."""
|
| summary = {
|
| "total_regions": len(results),
|
| "total_pages": len(set(r["page"] for r in results)),
|
| "type_distribution": defaultdict(int),
|
| "character_distribution": defaultdict(int),
|
| "confidence_stats": {"min": 100, "max": 0, "avg": 0},
|
| "language_segments_summary": defaultdict(int),
|
| "extraction_methods": defaultdict(int),
|
| }
|
|
|
| total_confidence = 0
|
| for result in results:
|
| summary["type_distribution"][result["detected_type"]] += 1
|
| summary["extraction_methods"][result.get("extraction_method", "unknown")] += 1
|
|
|
| conf = result["confidence"]
|
| total_confidence += conf
|
| summary["confidence_stats"]["min"] = min(
|
| summary["confidence_stats"]["min"], conf
|
| )
|
| summary["confidence_stats"]["max"] = max(
|
| summary["confidence_stats"]["max"], conf
|
| )
|
|
|
|
|
| char_analysis = result["character_analysis"]
|
| for char_type, count in char_analysis["language_distribution"].items():
|
| summary["character_distribution"][char_type] += count
|
|
|
|
|
| for segment in char_analysis["language_segments"]:
|
| summary["language_segments_summary"][segment["type"]] += 1
|
|
|
| if results:
|
| summary["confidence_stats"]["avg"] = total_confidence / len(results)
|
|
|
| return summary
|
|
|
|
|
|
|
|
|
|
|
| if __name__ == "__main__":
|
| pdf_path = r"math102.pdf"
|
| output_text_file = "math102_pix2text.txt"
|
| output_json_file = "math102_pix2text.json"
|
| output_analysis_file = "math102_pix2text_analysis.json"
|
|
|
| extract_all_text_advanced_pix2text(
|
| pdf_path, output_text_file, output_json_file, output_analysis_file
|
| )
|
|
|