|  | import re | 
					
						
						|  | from typing import List | 
					
						
						|  |  | 
					
						
						|  | from marker.ocr.utils import alphanum_ratio | 
					
						
						|  | from marker.schema.bbox import rescale_bbox, box_intersection_pct | 
					
						
						|  | from marker.schema.page import Page | 
					
						
						|  | from marker.settings import settings | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def should_ocr_page(page: Page, no_text: bool): | 
					
						
						|  | detected_lines_found = detected_line_coverage(page) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | conditions = [ | 
					
						
						|  | no_text , | 
					
						
						|  | (len(page.prelim_text) > 0 and detect_bad_ocr(page.prelim_text)), | 
					
						
						|  | detected_lines_found is False, | 
					
						
						|  | ] | 
					
						
						|  |  | 
					
						
						|  | return any(conditions) or settings.OCR_ALL_PAGES | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3): | 
					
						
						|  | if len(text) == 0: | 
					
						
						|  |  | 
					
						
						|  | return True | 
					
						
						|  |  | 
					
						
						|  | spaces = len(re.findall(r'\s+', text)) | 
					
						
						|  | alpha_chars = len(re.sub(r'\s+', '', text)) | 
					
						
						|  | if spaces / (alpha_chars + spaces) > space_threshold: | 
					
						
						|  | return True | 
					
						
						|  |  | 
					
						
						|  | newlines = len(re.findall(r'\n+', text)) | 
					
						
						|  | non_newlines = len(re.sub(r'\n+', '', text)) | 
					
						
						|  | if newlines / (newlines + non_newlines) > newline_threshold: | 
					
						
						|  | return True | 
					
						
						|  |  | 
					
						
						|  | if alphanum_ratio(text) < alphanum_threshold: | 
					
						
						|  | return True | 
					
						
						|  |  | 
					
						
						|  | invalid_chars = len([c for c in text if c in settings.INVALID_CHARS]) | 
					
						
						|  | if invalid_chars > max(4.0, len(text) * .03): | 
					
						
						|  | return True | 
					
						
						|  |  | 
					
						
						|  | return False | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def no_text_found(pages: List[Page]): | 
					
						
						|  | full_text = "" | 
					
						
						|  | for page in pages: | 
					
						
						|  | full_text += page.prelim_text | 
					
						
						|  | return len(full_text.strip()) == 0 | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.65): | 
					
						
						|  | found_lines = 0 | 
					
						
						|  | for detected_line in page.text_lines.bboxes: | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | detected_bbox = detected_line.bbox | 
					
						
						|  | detected_bbox = rescale_bbox(page.text_lines.image_bbox, page.bbox, detected_bbox) | 
					
						
						|  |  | 
					
						
						|  | total_intersection = 0 | 
					
						
						|  | for block in page.blocks: | 
					
						
						|  | for line in block.lines: | 
					
						
						|  | intersection_pct = box_intersection_pct(detected_bbox, line.bbox) | 
					
						
						|  | total_intersection += intersection_pct | 
					
						
						|  | if total_intersection > intersect_thresh: | 
					
						
						|  | found_lines += 1 | 
					
						
						|  |  | 
					
						
						|  | total_lines = len(page.text_lines.bboxes) | 
					
						
						|  | if total_lines == 0: | 
					
						
						|  | return False | 
					
						
						|  | return found_lines / total_lines > detection_thresh | 
					
						
						|  |  |