| import streamlit as st |
| from transformers import TrOCRProcessor, VisionEncoderDecoderModel |
| from PIL import Image |
| import torch |
| import cv2 |
| import numpy as np |
| import tempfile |
|
|
| |
@st.cache_resource
def load_model():
    """Load and cache the TrOCR processor/model pair.

    Cached by Streamlit so the (large) checkpoint download and model
    construction happen once per server process, not on every rerun.

    Returns:
        tuple[TrOCRProcessor, VisionEncoderDecoderModel]: ready-to-use
        processor and encoder-decoder model for handwritten text.
    """
    checkpoint = "microsoft/trocr-base-handwritten"
    return (
        TrOCRProcessor.from_pretrained(checkpoint),
        VisionEncoderDecoderModel.from_pretrained(checkpoint),
    )
|
|
| processor, model = load_model() |
|
|
| |
def detect_lines(image, min_height=20, min_width=100):
    """Segment a page image into per-line crops for OCR.

    Args:
        image: PIL image (or ndarray) of the page. RGB is the expected
            input, but grayscale and RGBA are now handled too — the
            original code crashed in ``cv2.cvtColor`` on non-3-channel
            input.
        min_height: Minimum bounding-box height (px) for a region to be
            kept as a text line.
        min_width: Minimum bounding-box width (px) for a region to be
            kept as a text line.

    Returns:
        list[np.ndarray]: line crops from the original image, ordered
        top-to-bottom with ties broken left-to-right.
    """
    image_np = np.array(image)

    # Normalize to single-channel grayscale regardless of input mode.
    if image_np.ndim == 2:
        gray = image_np
    elif image_np.shape[2] == 4:
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGBA2GRAY)
    else:
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)

    # Otsu chooses the threshold (the 128 is ignored when THRESH_OTSU is
    # set); BINARY_INV makes ink white so contour detection finds text.
    _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Dilate so the characters of one line merge into a connected blob.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    dilated = cv2.dilate(binary, kernel, iterations=1)

    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Sort by (y, x): top-to-bottom reading order, with a deterministic
    # left-to-right tiebreak (the original sorted by y only, leaving
    # same-row boxes in arbitrary contour order).
    bounding_boxes = sorted(
        (cv2.boundingRect(c) for c in contours),
        key=lambda box: (box[1], box[0]),
    )

    # Keep only plausibly line-sized boxes and crop them from the page.
    return [
        image_np[y:y + h, x:x + w]
        for x, y, w, h in bounding_boxes
        if h >= min_height and w >= min_width
    ]
|
|
| |
st.title("Multiline Handwritten OCR with Hugging Face and OpenCV")
uploaded_file = st.file_uploader("Upload an Image (JPG, JPEG, PNG)", type=["jpg", "jpeg", "png"])


if uploaded_file is not None:
    try:
        # Force RGB so both OpenCV segmentation and TrOCR see 3 channels.
        image = Image.open(uploaded_file).convert("RGB")
        # use_container_width replaces the deprecated use_column_width.
        st.image(image, caption="Uploaded Image", use_container_width=True)

        st.write("Processing the image...")

        line_images = detect_lines(image, min_height=30, min_width=100)
        st.write(f"Detected {len(line_images)} lines in the image.")

        if not line_images:
            # Previously an empty result fell through silently to an
            # empty text area; tell the user what happened instead.
            st.warning("No text lines were detected. Try a higher-contrast image.")

        # Transcribe each detected line with TrOCR.
        transcribed_lines = []
        for idx, line_img in enumerate(line_images):
            line_pil = Image.fromarray(line_img)
            pixel_values = processor(images=line_pil, return_tensors="pt").pixel_values
            # Autoregressive decoding; generate() runs without gradients.
            generated_ids = model.generate(pixel_values)
            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            transcribed_lines.append(f"Line {idx + 1}: {generated_text}")

        # join() instead of repeated += (quadratic); the trailing newline
        # matches the original output format exactly.
        extracted_text = "\n".join(transcribed_lines) + "\n" if transcribed_lines else ""

        st.subheader("Extracted Text:")
        st.text_area("Output Text", extracted_text, height=200)

        st.download_button(
            label="Download Text",
            data=extracted_text,
            file_name="extracted_text.txt",
            mime="text/plain",
        )

    except Exception as e:
        # Top-level UI boundary: surface the failure to the user rather
        # than crashing the Streamlit session.
        st.error(f"An error occurred while processing the image: {e}")
else:
    st.info("Please upload an image to start the OCR process.")
|
|