import json
import logging
import re
import unicodedata
from collections import defaultdict

import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from pytesseract import Output
from tqdm import tqdm
|
|
|
|
|
| try:
|
| from pix2text import Pix2Text
|
|
|
| PIX2TEXT_AVAILABLE = True
|
| print("Pix2Text imported successfully for advanced math extraction")
|
| except ImportError:
|
| PIX2TEXT_AVAILABLE = False
|
| print("Pix2Text not available. Install with: pip install pix2text")
|
| print(" Falling back to traditional OCR for math expressions")
|
|
|
|
|
| logging.basicConfig(level=logging.INFO)
|
| logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
| def classify_character(char):
|
| """
|
| Classify a single character as English, Bangla, Math, or Other.
|
| Enhanced for better math detection.
|
| """
|
| if not char or char.isspace():
|
| return "space"
|
|
|
|
|
| if "\u0980" <= char <= "\u09ff":
|
| return "bangla"
|
|
|
|
|
| math_chars = set(
|
| "=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇∀∃∈∉⊂⊃⊆⊇∪∩∧∨¬"
|
| "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"
|
| "±≈≠≡⇒⇔∘∗⊕⊗⊙⊥∥∦∝∞"
|
| )
|
|
|
|
|
| math_ranges = [
|
| ("\u2200", "\u22ff"),
|
| ("\u2190", "\u21ff"),
|
| ("\u0370", "\u03ff"),
|
| ("\u2070", "\u209f"),
|
| ("\u27c0", "\u27ef"),
|
| ("\u2980", "\u29ff"),
|
| ]
|
|
|
| if char in math_chars:
|
| return "math"
|
|
|
| for start, end in math_ranges:
|
| if start <= char <= end:
|
| return "math"
|
|
|
|
|
| if char.isdigit():
|
| return "number"
|
|
|
|
|
| if char.isascii() and char.isalpha():
|
| return "english"
|
|
|
|
|
| if char in ".,;:!?()[]{}\"'-_/\\^":
|
| return "punctuation"
|
|
|
| return "other"
|
|
|
|
|
| def classify_text_region(text):
|
| """
|
| Enhanced text region classification with better math detection.
|
| """
|
| if not text.strip():
|
| return "empty"
|
|
|
| char_counts = defaultdict(int)
|
| for char in text:
|
| char_type = classify_character(char)
|
| char_counts[char_type] += 1
|
|
|
|
|
| significant_chars = {k: v for k, v in char_counts.items() if k not in ["space"]}
|
|
|
| if not significant_chars:
|
| return "empty"
|
|
|
| total_significant = sum(significant_chars.values())
|
| percentages = {k: v / total_significant for k, v in significant_chars.items()}
|
|
|
|
|
| math_indicators = percentages.get("math", 0) + percentages.get("number", 0) * 0.5
|
|
|
| if percentages.get("bangla", 0) > 0.5:
|
| return "bangla"
|
| elif math_indicators > 0.3 or has_math_patterns(text):
|
| return "math"
|
| elif percentages.get("english", 0) > 0.5:
|
| return "english"
|
| else:
|
| return "mixed"
|
|
|
|
|
| def has_math_patterns(text):
|
| """
|
| Detect mathematical patterns in text using regex and heuristics.
|
| """
|
| import re
|
|
|
|
|
| math_patterns = [
|
| r"\d+[\+\-\*/=]\d+",
|
| r"[xy]\^?\d+",
|
| r"\\[a-zA-Z]+",
|
| r"\$.*?\$",
|
| r"[a-zA-Z]\([a-zA-Z,\d\s]+\)",
|
| r"\b(sin|cos|tan|log|ln|exp|sqrt|int|sum|lim)\b",
|
| r"[≤≥≠≈∫∑∂∞]",
|
| ]
|
|
|
| for pattern in math_patterns:
|
| if re.search(pattern, text, re.IGNORECASE):
|
| return True
|
|
|
| return False
|
|
|
|
|
|
|
|
|
|
|
| def initialize_pix2text():
|
| """Initialize Pix2Text model for mathematical expression extraction."""
|
| if not PIX2TEXT_AVAILABLE:
|
| return None
|
|
|
| try:
|
|
|
|
|
| logger.info("Initializing Pix2Text...")
|
|
|
|
|
| try:
|
| p2t = Pix2Text.from_config()
|
| logger.info("✅ Pix2Text initialized with default config")
|
| return p2t
|
| except Exception as e1:
|
| logger.warning(f"Default Pix2Text init failed: {e1}")
|
|
|
|
|
| try:
|
| p2t = Pix2Text()
|
| logger.info("✅ Pix2Text initialized with basic constructor")
|
| return p2t
|
| except Exception as e2:
|
| logger.warning(f"Basic Pix2Text init failed: {e2}")
|
|
|
|
|
| try:
|
| config = {"device": "cpu"}
|
| p2t = Pix2Text.from_config(config)
|
| logger.info("✅ Pix2Text initialized with CPU config")
|
| return p2t
|
| except Exception as e3:
|
| logger.error(f"All Pix2Text initialization methods failed: {e3}")
|
|
|
| return None
|
|
|
| except Exception as e:
|
| logger.error(f"❌ Failed to initialize Pix2Text: {e}")
|
| return None
|
|
|
|
|
|
|
|
|
|
|
| def preprocess_image_advanced(pil_image):
|
| """Enhanced image preprocessing with multiple techniques."""
|
| img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
|
| gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
|
|
|
| gray = cv2.fastNlMeansDenoising(gray, h=15)
|
|
|
|
|
| binary = cv2.adaptiveThreshold(
|
| gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 5
|
| )
|
|
|
|
|
| enhanced = cv2.convertScaleAbs(binary, alpha=1.2, beta=10)
|
|
|
|
|
| height, width = enhanced.shape
|
| scaled = cv2.resize(
|
| enhanced, (width * 2, height * 2), interpolation=cv2.INTER_CUBIC
|
| )
|
|
|
| return scaled
|
|
|
|
|
| def preprocess_for_pix2text(pil_image, region):
|
| """
|
| Special preprocessing for Pix2Text mathematical expression extraction.
|
| """
|
|
|
| img = np.array(pil_image)
|
|
|
|
|
| x, y, w, h = region["left"], region["top"], region["width"], region["height"]
|
|
|
|
|
| if w <= 0 or h <= 0:
|
| logger.warning(f"Invalid region dimensions: w={w}, h={h}. Skipping Pix2Text.")
|
| return None
|
|
|
|
|
| padding = 10
|
| x_start = max(0, x - padding)
|
| y_start = max(0, y - padding)
|
| x_end = min(img.shape[1], x + w + padding)
|
| y_end = min(img.shape[0], y + h + padding)
|
|
|
|
|
| if x_end <= x_start or y_end <= y_start:
|
| logger.warning(
|
| f"Invalid crop bounds: x({x_start}:{x_end}), y({y_start}:{y_end}). Skipping Pix2Text."
|
| )
|
| return None
|
|
|
| cropped = img[y_start:y_end, x_start:x_end]
|
|
|
|
|
| if cropped.size == 0:
|
| logger.warning("Cropped image is empty. Skipping Pix2Text.")
|
| return None
|
|
|
|
|
| try:
|
| cropped_pil = Image.fromarray(cropped)
|
| except Exception as e:
|
| logger.error(f"Failed to create PIL image from cropped array: {e}")
|
| return None
|
|
|
|
|
| min_size = 32
|
| if cropped_pil.width <= 0 or cropped_pil.height <= 0:
|
| logger.warning(
|
| f"Invalid PIL image dimensions: {cropped_pil.width}x{cropped_pil.height}"
|
| )
|
| return None
|
|
|
| if cropped_pil.width < min_size or cropped_pil.height < min_size:
|
|
|
| try:
|
| ratio = max(min_size / cropped_pil.width, min_size / cropped_pil.height)
|
| new_width = int(cropped_pil.width * ratio)
|
| new_height = int(cropped_pil.height * ratio)
|
|
|
|
|
| if new_width <= 0 or new_height <= 0:
|
| logger.warning(f"Invalid resized dimensions: {new_width}x{new_height}")
|
| return None
|
|
|
| cropped_pil = cropped_pil.resize((new_width, new_height), Image.LANCZOS)
|
| except Exception as e:
|
| logger.error(f"Failed to resize image: {e}")
|
| return None
|
|
|
| return cropped_pil
|
|
|
|
|
|
|
|
|
|
|
| def detect_text_regions(image):
|
| """Detect text regions and classify them by line and character type."""
|
| data = pytesseract.image_to_data(image, output_type=Output.DICT, lang="eng+ben")
|
|
|
| text_regions = []
|
| for i in range(len(data["text"])):
|
| text = data["text"][i].strip()
|
| if text and int(data["conf"][i]) > 25:
|
|
|
| width = int(data["width"][i])
|
| height = int(data["height"][i])
|
| left = int(data["left"][i])
|
| top = int(data["top"][i])
|
|
|
|
|
| if width <= 0 or height <= 0:
|
| logger.debug(
|
| f"Skipping region with invalid dimensions: {width}x{height}"
|
| )
|
| continue
|
|
|
|
|
| if width < 3 or height < 3:
|
| logger.debug(f"Skipping tiny region: {width}x{height}")
|
| continue
|
|
|
| region = {
|
| "text": text,
|
| "left": left,
|
| "top": top,
|
| "width": width,
|
| "height": height,
|
| "confidence": int(data["conf"][i]),
|
| "type": classify_text_region(text),
|
| }
|
| text_regions.append(region)
|
|
|
| logger.info(f"Detected {len(text_regions)} valid text regions")
|
| return text_regions
|
|
|
|
|
| def group_regions_by_line(regions, line_tolerance=15):
|
| """Group text regions into lines with better tolerance for math expressions."""
|
| if not regions:
|
| return []
|
|
|
| regions_sorted = sorted(regions, key=lambda x: x["top"])
|
|
|
| lines = []
|
| current_line = [regions_sorted[0]]
|
| current_top = regions_sorted[0]["top"]
|
|
|
| for region in regions_sorted[1:]:
|
|
|
|
|
| current_height = max(1, current_line[0]["height"])
|
| region_height = max(1, region["height"])
|
| height_avg = (current_height + region_height) / 2
|
| tolerance = max(line_tolerance, height_avg * 0.3)
|
|
|
| if abs(region["top"] - current_top) <= tolerance:
|
| current_line.append(region)
|
| else:
|
| current_line.sort(key=lambda x: x["left"])
|
| lines.append(current_line)
|
| current_line = [region]
|
| current_top = region["top"]
|
|
|
| if current_line:
|
| current_line.sort(key=lambda x: x["left"])
|
| lines.append(current_line)
|
|
|
| return lines
|
|
|
|
|
|
|
|
|
|
|
| def extract_english_region(image, region):
|
| """Extract English text from a specific region with optimized settings."""
|
| x, y, w, h = region["left"], region["top"], region["width"], region["height"]
|
|
|
| roi = image[y : y + h, x : x + w]
|
| if roi.size == 0:
|
| return region["text"]
|
|
|
| config = r"--oem 3 --psm 8 -l eng"
|
| try:
|
| result = pytesseract.image_to_string(roi, config=config).strip()
|
| return result if result else region["text"]
|
| except Exception:
|
| return region["text"]
|
|
|
|
|
| def extract_bangla_region(image, region):
|
| """Extract Bangla text from a specific region with optimized settings."""
|
| x, y, w, h = region["left"], region["top"], region["width"], region["height"]
|
|
|
| roi = image[y : y + h, x : x + w]
|
| if roi.size == 0:
|
| return region["text"]
|
|
|
| config = r"--oem 3 --psm 8 -l ben"
|
| try:
|
| result = pytesseract.image_to_string(roi, config=config).strip()
|
| return result if result else region["text"]
|
| except Exception:
|
| return region["text"]
|
|
|
|
|
| def extract_math_region_pix2text(pil_image, region, p2t_model):
|
| """
|
| Extract mathematical expressions using Pix2Text with fallback to traditional OCR.
|
| """
|
| if not p2t_model:
|
| return extract_math_region_traditional(pil_image, region)
|
|
|
| try:
|
|
|
| math_image = preprocess_for_pix2text(pil_image, region)
|
|
|
|
|
| if math_image is None:
|
| logger.warning(
|
| "Pix2Text preprocessing failed, falling back to traditional OCR"
|
| )
|
| return extract_math_region_traditional(pil_image, region)
|
|
|
|
|
| result = p2t_model(math_image)
|
|
|
|
|
| extracted_text = parse_pix2text_result(result)
|
|
|
| if extracted_text and extracted_text.strip():
|
|
|
| if not is_valid_pix2text_result(extracted_text):
|
| logger.warning(f"Invalid Pix2Text result: {extracted_text[:100]}...")
|
| return extract_math_region_traditional(pil_image, region)
|
|
|
| logger.info(f"✅ Pix2Text extracted: {extracted_text[:50]}...")
|
| return extracted_text.strip()
|
| else:
|
| logger.warning(
|
| "⚠️ Pix2Text returned empty result, falling back to traditional OCR"
|
| )
|
| return extract_math_region_traditional(pil_image, region)
|
|
|
| except Exception as e:
|
| logger.error(f"❌ Pix2Text extraction failed: {e}")
|
| return extract_math_region_traditional(pil_image, region)
|
|
|
|
|
| def parse_pix2text_result(result):
|
| """
|
| Parse Pix2Text result handling various response formats.
|
| """
|
| try:
|
| if isinstance(result, dict):
|
|
|
|
|
| for key in ["text", "formula", "latex", "content", "output"]:
|
| if key in result and result[key]:
|
| return str(result[key])
|
|
|
|
|
|
|
| result_str = str(result)
|
| if len(result_str) > 1000:
|
| return ""
|
| return result_str
|
|
|
| elif isinstance(result, list):
|
|
|
| if not result:
|
| return ""
|
|
|
|
|
| valid_items = []
|
| for item in result:
|
| item_str = str(item).strip()
|
| if item_str and not is_debug_content(item_str):
|
| valid_items.append(item_str)
|
|
|
| return " ".join(valid_items)
|
|
|
| elif isinstance(result, str):
|
| return result
|
| else:
|
| return str(result)
|
|
|
| except Exception as e:
|
| logger.error(f"Error parsing Pix2Text result: {e}")
|
| return ""
|
|
|
|
|
| def is_valid_pix2text_result(text):
|
| """
|
| Check if the Pix2Text result is valid mathematical content.
|
| """
|
| if not text or not text.strip():
|
| return False
|
|
|
| text = text.strip()
|
|
|
|
|
| invalid_patterns = [
|
| "Page(id=",
|
| "elements=[]",
|
| "number=0",
|
| "Error:",
|
| "Exception:",
|
| "Traceback:",
|
| "DEBUG:",
|
| "INFO:",
|
| "WARNING:",
|
| "ERROR:",
|
| ]
|
|
|
| for pattern in invalid_patterns:
|
| if pattern in text:
|
| return False
|
|
|
|
|
| if len(text) < 1:
|
| return False
|
|
|
|
|
|
|
| import re
|
|
|
| if re.search(r"[a-zA-Z0-9=+\-*/(){}[\]^_√∫∑∂πθαβγδλμΩ]", text):
|
| return True
|
|
|
| return False
|
|
|
|
|
| def is_debug_content(text):
|
| """
|
| Check if text appears to be debug/logging content rather than actual content.
|
| """
|
| debug_indicators = [
|
| "Page(",
|
| "id=",
|
| "number=",
|
| "elements=",
|
| "[])",
|
| "DEBUG",
|
| "INFO",
|
| "WARNING",
|
| "ERROR",
|
| "Exception",
|
| "Traceback",
|
| 'File "',
|
| "line ",
|
| " at 0x",
|
| ]
|
|
|
| for indicator in debug_indicators:
|
| if indicator in text:
|
| return True
|
|
|
| return False
|
|
|
|
|
| def extract_math_region_traditional(pil_image, region):
|
| """
|
| Fallback traditional OCR for mathematical expressions.
|
| """
|
|
|
| img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
|
| gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
|
| x, y, w, h = region["left"], region["top"], region["width"], region["height"]
|
| roi = gray[y : y + h, x : x + w]
|
|
|
| if roi.size == 0:
|
| return region["text"]
|
|
|
|
|
| math_chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇()[]{}.,;:^_αβγδλμθΩ±≈≠≡⇒⇔"
|
| config = f"--oem 3 --psm 6 -c tessedit_char_whitelist={math_chars}"
|
|
|
| try:
|
| result = pytesseract.image_to_string(roi, config=config).strip()
|
| return result if result else region["text"]
|
| except Exception:
|
| return region["text"]
|
|
|
|
|
| def extract_mixed_region(pil_image, region, p2t_model):
|
| """Extract mixed content using multiple approaches."""
|
|
|
| img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
|
| gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
|
| eng_result = extract_english_region(gray, region)
|
| bangla_result = extract_bangla_region(gray, region)
|
|
|
|
|
| if has_math_patterns(region["text"]):
|
| math_result = extract_math_region_pix2text(pil_image, region, p2t_model)
|
|
|
| results = [r for r in [eng_result, bangla_result, math_result] if r.strip()]
|
| return max(results, key=len) if results else region["text"]
|
|
|
|
|
| return bangla_result if len(bangla_result) > len(eng_result) else eng_result
|
|
|
|
|
|
|
|
|
|
|
| def analyze_character_by_character(text):
|
| """Analyze text character by character to identify language patterns."""
|
| analysis = {
|
| "characters": [],
|
| "language_segments": [],
|
| "total_chars": len(text),
|
| "language_distribution": defaultdict(int),
|
| }
|
|
|
| for i, char in enumerate(text):
|
| char_type = classify_character(char)
|
| analysis["characters"].append(
|
| {
|
| "char": char,
|
| "position": i,
|
| "type": char_type,
|
| "unicode_name": unicodedata.name(char, "UNKNOWN"),
|
| }
|
| )
|
| analysis["language_distribution"][char_type] += 1
|
|
|
|
|
| current_segment = None
|
| for char_info in analysis["characters"]:
|
| if char_info["type"] in ["space", "punctuation"]:
|
| continue
|
|
|
| if current_segment is None or current_segment["type"] != char_info["type"]:
|
| if current_segment:
|
| analysis["language_segments"].append(current_segment)
|
| current_segment = {
|
| "type": char_info["type"],
|
| "start": char_info["position"],
|
| "end": char_info["position"],
|
| "text": char_info["char"],
|
| }
|
| else:
|
| current_segment["end"] = char_info["position"]
|
| current_segment["text"] += char_info["char"]
|
|
|
| if current_segment:
|
| analysis["language_segments"].append(current_segment)
|
|
|
| return analysis
|
|
|
|
|
|
|
|
|
|
|
| def process_page_advanced(page_image, page_num, p2t_model):
|
| """
|
| Advanced page processing with Pix2Text integration.
|
| """
|
| print(f"Processing page {page_num + 1}...")
|
|
|
|
|
| processed_image = preprocess_image_advanced(page_image)
|
|
|
|
|
| regions = detect_text_regions(processed_image)
|
|
|
|
|
| lines = group_regions_by_line(regions)
|
|
|
| page_results = []
|
|
|
| for line_num, line in enumerate(lines):
|
| line_text_parts = []
|
|
|
| for region in line:
|
|
|
| if region["type"] == "english":
|
| extracted_text = extract_english_region(processed_image, region)
|
| elif region["type"] == "bangla":
|
| extracted_text = extract_bangla_region(processed_image, region)
|
| elif region["type"] == "math":
|
| extracted_text = extract_math_region_pix2text(
|
| page_image, region, p2t_model
|
| )
|
| elif region["type"] == "mixed":
|
| extracted_text = extract_mixed_region(page_image, region, p2t_model)
|
| else:
|
| extracted_text = region["text"]
|
|
|
|
|
| char_analysis = analyze_character_by_character(extracted_text)
|
|
|
| region_result = {
|
| "page": page_num,
|
| "line": line_num,
|
| "text": extracted_text,
|
| "original_text": region["text"],
|
| "position": {
|
| "left": region["left"],
|
| "top": region["top"],
|
| "width": region["width"],
|
| "height": region["height"],
|
| },
|
| "confidence": region["confidence"],
|
| "detected_type": region["type"],
|
| "extraction_method": "pix2text"
|
| if region["type"] == "math" and p2t_model
|
| else "tesseract",
|
| "character_analysis": char_analysis,
|
| }
|
|
|
| page_results.append(region_result)
|
| line_text_parts.append(extracted_text)
|
|
|
|
|
| if line_text_parts:
|
| line_text = " ".join(line_text_parts)
|
| print(f" Line {line_num + 1}: {line_text[:100]}...")
|
|
|
| return page_results
|
|
|
|
|
| def extract_all_text_advanced_pix2text(
|
| pdf_path, output_text_file, output_json_file, output_analysis_file
|
| ):
|
| """
|
| Advanced text extraction with Pix2Text integration.
|
| """
|
| print("[INFO] Initializing Pix2Text for mathematical expression extraction...")
|
| p2t_model = initialize_pix2text()
|
|
|
| if p2t_model:
|
| print("✅ Pix2Text ready for advanced math extraction")
|
| else:
|
| print("⚠️ Using traditional OCR for math expressions")
|
|
|
| print("[INFO] Converting PDF to images...")
|
| pages = convert_from_path(pdf_path, dpi=300)
|
|
|
| all_results = []
|
| combined_text_parts = []
|
|
|
| for page_num, page_image in enumerate(tqdm(pages, desc="Processing pages")):
|
| page_results = process_page_advanced(page_image, page_num, p2t_model)
|
| all_results.extend(page_results)
|
|
|
|
|
| page_text_parts = [result["text"] for result in page_results]
|
| page_text = " ".join(page_text_parts)
|
| combined_text_parts.append(page_text)
|
|
|
|
|
| final_text = "\n\n".join(combined_text_parts)
|
|
|
|
|
| with open(output_text_file, "w", encoding="utf-8") as f:
|
| f.write(final_text)
|
|
|
|
|
| with open(output_json_file, "w", encoding="utf-8") as f:
|
| json.dump(all_results, f, ensure_ascii=False, indent=2)
|
|
|
|
|
| summary_analysis = create_extraction_summary(all_results)
|
| with open(output_analysis_file, "w", encoding="utf-8") as f:
|
| json.dump(summary_analysis, f, ensure_ascii=False, indent=2)
|
|
|
| print("\n[✅] Advanced Pix2Text extraction complete!")
|
| print(f"→ Text file saved to: {output_text_file}")
|
| print(f"→ Detailed JSON saved to: {output_json_file}")
|
| print(f"→ Analysis report saved to: {output_analysis_file}")
|
|
|
|
|
| print("\n📊 Extraction Summary:")
|
| print(f" Total text regions: {len(all_results)}")
|
| print(f" English regions: {summary_analysis['type_distribution']['english']}")
|
| print(f" Bangla regions: {summary_analysis['type_distribution']['bangla']}")
|
| print(f" Math regions: {summary_analysis['type_distribution']['math']}")
|
| print(f" Mixed regions: {summary_analysis['type_distribution']['mixed']}")
|
|
|
|
|
| method_stats = defaultdict(int)
|
| for result in all_results:
|
| method_stats[result.get("extraction_method", "unknown")] += 1
|
|
|
| print("\n🔧 Extraction Methods Used:")
|
| for method, count in method_stats.items():
|
| print(f" {method}: {count} regions")
|
|
|
|
|
| def create_extraction_summary(results):
|
| """Create a comprehensive summary of the extraction results."""
|
| summary = {
|
| "total_regions": len(results),
|
| "total_pages": len(set(r["page"] for r in results)),
|
| "type_distribution": defaultdict(int),
|
| "character_distribution": defaultdict(int),
|
| "confidence_stats": {"min": 100, "max": 0, "avg": 0},
|
| "language_segments_summary": defaultdict(int),
|
| "extraction_methods": defaultdict(int),
|
| }
|
|
|
| total_confidence = 0
|
| for result in results:
|
| summary["type_distribution"][result["detected_type"]] += 1
|
| summary["extraction_methods"][result.get("extraction_method", "unknown")] += 1
|
|
|
| conf = result["confidence"]
|
| total_confidence += conf
|
| summary["confidence_stats"]["min"] = min(
|
| summary["confidence_stats"]["min"], conf
|
| )
|
| summary["confidence_stats"]["max"] = max(
|
| summary["confidence_stats"]["max"], conf
|
| )
|
|
|
|
|
| char_analysis = result["character_analysis"]
|
| for char_type, count in char_analysis["language_distribution"].items():
|
| summary["character_distribution"][char_type] += count
|
|
|
|
|
| for segment in char_analysis["language_segments"]:
|
| summary["language_segments_summary"][segment["type"]] += 1
|
|
|
| if results:
|
| summary["confidence_stats"]["avg"] = total_confidence / len(results)
|
|
|
| return summary
|
|
|
|
|
|
|
|
|
|
|
| if __name__ == "__main__":
|
| pdf_path = r"math102.pdf"
|
| output_text_file = "math102_pix2text.txt"
|
| output_json_file = "math102_pix2text.json"
|
| output_analysis_file = "math102_pix2text_analysis.json"
|
|
|
| extract_all_text_advanced_pix2text(
|
| pdf_path, output_text_file, output_json_file, output_analysis_file
|
| )
|
|
|