Spaces:
Running
Running
| """ | |
| OCR Utilities for document processing | |
| """ | |
| import cv2 | |
| import numpy as np | |
| from PIL import Image | |
| class OCRProcessor: | |
| """Handles OCR processing for images including handwriting detection.""" | |
| def __init__(self): | |
| try: | |
| import pytesseract | |
| self.pytesseract = pytesseract | |
| except ImportError: | |
| self.pytesseract = None | |
| print("Warning: pytesseract not available") | |
| def detect_handwriting(self, image): | |
| """Detect if image contains handwriting.""" | |
| try: | |
| # Convert PIL Image to numpy array | |
| img_array = np.array(image) | |
| # Convert to grayscale | |
| if len(img_array.shape) == 3: | |
| gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) | |
| else: | |
| gray = img_array | |
| # Apply edge detection | |
| edges = cv2.Canny(gray, 50, 150) | |
| # Count edges | |
| edge_pixels = np.sum(edges > 0) | |
| total_pixels = edges.size | |
| edge_ratio = edge_pixels / total_pixels | |
| # Find contours | |
| contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) | |
| # Handwriting typically has more irregular contours | |
| is_handwritten = edge_ratio > 0.05 and len(contours) > 20 | |
| return { | |
| 'is_handwritten': is_handwritten, | |
| 'confidence': edge_ratio * 10, # Normalize to 0-1 range | |
| 'edge_ratio': edge_ratio, | |
| 'contour_count': len(contours) | |
| } | |
| except Exception as e: | |
| return { | |
| 'is_handwritten': False, | |
| 'confidence': 0, | |
| 'edge_ratio': 0, | |
| 'error': str(e) | |
| } | |
| def extract_text(self, image, enhance=True): | |
| """Extract text from image using standard OCR.""" | |
| if not self.pytesseract: | |
| return "OCR not available" | |
| try: | |
| if enhance: | |
| image = self._enhance_image(image) | |
| text = self.pytesseract.image_to_string(image) | |
| return text | |
| except Exception as e: | |
| return f"OCR error: {str(e)}" | |
| def extract_from_handwriting(self, image): | |
| """Extract text from handwritten image.""" | |
| if not self.pytesseract: | |
| return "OCR not available" | |
| try: | |
| # Enhance for handwriting | |
| enhanced = self._enhance_for_handwriting(image) | |
| # Use specific OCR config for handwriting | |
| custom_config = r'--oem 3 --psm 6' | |
| text = self.pytesseract.image_to_string(enhanced, config=custom_config) | |
| return text | |
| except Exception as e: | |
| return f"Handwriting OCR error: {str(e)}" | |
| def extract_text_with_confidence(self, image): | |
| """Extract text with confidence scores.""" | |
| if not self.pytesseract: | |
| return {'text': 'OCR not available', 'confidence': 0, 'word_count': 0} | |
| try: | |
| data = self.pytesseract.image_to_data(image, output_type=self.pytesseract.Output.DICT) | |
| # Filter by confidence | |
| text_parts = [] | |
| confidences = [] | |
| for i, conf in enumerate(data['conf']): | |
| if int(conf) > 30: # Threshold | |
| text_parts.append(data['text'][i]) | |
| confidences.append(int(conf)) | |
| text = ' '.join(text_parts) | |
| avg_confidence = np.mean(confidences) if confidences else 0 | |
| return { | |
| 'text': text, | |
| 'confidence': avg_confidence, | |
| 'word_count': len(text_parts) | |
| } | |
| except Exception as e: | |
| return { | |
| 'text': f"Error: {str(e)}", | |
| 'confidence': 0, | |
| 'word_count': 0 | |
| } | |
| def _enhance_image(self, image): | |
| """Enhance image for better OCR.""" | |
| try: | |
| # Convert PIL to numpy | |
| img_array = np.array(image) | |
| # Convert to grayscale | |
| if len(img_array.shape) == 3: | |
| gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) | |
| else: | |
| gray = img_array | |
| # Apply thresholding | |
| _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) | |
| # Convert back to PIL | |
| return Image.fromarray(binary) | |
| except Exception: | |
| return image | |
| def _enhance_for_handwriting(self, image): | |
| """Enhance image specifically for handwriting recognition.""" | |
| try: | |
| img_array = np.array(image) | |
| if len(img_array.shape) == 3: | |
| gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) | |
| else: | |
| gray = img_array | |
| # Apply adaptive thresholding for handwriting | |
| binary = cv2.adaptiveThreshold( | |
| gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2 | |
| ) | |
| # Denoise | |
| denoised = cv2.fastNlMeansDenoising(binary) | |
| return Image.fromarray(denoised) | |
| except Exception: | |
| return image | |