Knightmovies committed on
Commit 3520813 · verified · 1 Parent(s): 5cac9c5

Update app.py

Files changed (1)
  1. app.py +168 -71
app.py CHANGED
--- a/app.py
@@ -10,14 +10,10 @@ from scipy.spatial import distance as dist
  # ==============================================================================
  # App Configuration & Styling
  # ==============================================================================
- st.set_page_config(
-     page_title="Document AI Toolkit",
-     page_icon="🤖",
-     layout="wide"
- )

- # Inject CSS for a centered, fixed-width layout
- st.markdown("""
      <style>
      .main .block-container {
          max-width: 900px;
@@ -27,20 +23,29 @@ st.markdown("""
          padding-bottom: 2rem;
      }
      </style>
- """, unsafe_allow_html=True)

  # ==============================================================================
  # Model Loading (Cached)
  # ==============================================================================
  @st.cache_resource
  def load_model():
-     """Loads the Table Transformer model and processor."""
-     return TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-structure-recognition"), DetrImageProcessor.from_pretrained("microsoft/table-transformer-structure-recognition")

  model, processor = load_model()

  # ==============================================================================
- # Core Image Processing Functions (Unchanged)
  # ==============================================================================
  def order_points(pts):
      xSorted = pts[np.argsort(pts[:, 0]), :]
@@ -51,87 +56,151 @@ def order_points(pts):
      (br, tr) = rightMost[np.argsort(D)[::-1], :]
      return np.array([tl, tr, br, bl], dtype="float32")

- def perspective_transform(image, pts):
-     rect = order_points(pts)
-     (tl, tr, br, bl) = rect
-     widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
-     widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
-     maxWidth = max(int(widthA), int(widthB))
-     heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
-     heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
-     maxHeight = max(int(heightA), int(heightB))
-     dst = np.array([[0, 0], [maxWidth - 1, 0], [maxWidth - 1, maxHeight - 1], [0, maxHeight - 1]], dtype="float32")
-     M = cv2.getPerspectiveTransform(rect, dst)
-     return cv2.warpPerspective(image, M, (maxWidth, maxHeight))

  def find_and_straighten_document(image):
      gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-     _, mask = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)
-     contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-     if not contours: return None
-     page_contour = max(contours, key=cv2.contourArea)
-     if cv2.contourArea(page_contour) < (image.shape[0] * image.shape[1] * 0.1): return None
-     box = cv2.boxPoints(cv2.minAreaRect(page_contour))
-     return perspective_transform(image, box)

  def correct_orientation(image):
-     """Robust orientation correction using a cascade approach."""
      try:
          osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT, timeout=5)
-         rotation = osd['rotate']
-         if rotation > 0:
-             angle_map = {90: cv2.ROTATE_90_COUNTERCLOCKWISE, 180: cv2.ROTATE_180, 270: cv2.ROTATE_90_CLOCKWISE}
              return cv2.rotate(image, angle_map[rotation])
          return image
      except Exception:
          gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-         thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
-         orientations = {0: thresh, 90: cv2.rotate(thresh, cv2.ROTATE_90_CLOCKWISE), 180: cv2.rotate(thresh, cv2.ROTATE_180), 270: cv2.rotate(thresh, cv2.ROTATE_90_COUNTERCLOCKWISE)}
-         best_rotation, max_horizontal_boxes = 0, -1
-         for angle, rotated_img in orientations.items():
              try:
-                 data = pytesseract.image_to_data(rotated_img, output_type=pytesseract.Output.DICT, timeout=5)
-                 horizontal_boxes = sum(1 for i, conf in enumerate(data['conf']) if int(conf) > 10 and data['width'][i] > data['height'][i])
-                 if horizontal_boxes > max_horizontal_boxes:
-                     max_horizontal_boxes, best_rotation = horizontal_boxes, angle
              except Exception:
-                 continue
-         angle_map = {90: cv2.ROTATE_90_CLOCKWISE, 180: cv2.ROTATE_180, 270: cv2.ROTATE_90_COUNTERCLOCKWISE}
-         return cv2.rotate(image, angle_map[best_rotation]) if best_rotation > 0 else image

  def extract_and_draw_table_structure(image_bgr):
-     """Finds and draws table structure using OpenCV."""
      image_pil = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
      inputs = processor(images=image_pil, return_tensors="pt")
-     with torch.no_grad():
          outputs = model(**inputs)
-     target_sizes = torch.tensor([image_pil.size[::-1]])
      results = processor.post_process_object_detection(outputs, threshold=0.6, target_sizes=target_sizes)[0]
-     img_with_boxes = image_bgr.copy()
      colors = {"table row": (0, 255, 0), "table column": (255, 0, 0), "table": (255, 0, 255)}
      for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
-         class_name = model.config.id2label[label.item()]
-         if class_name in colors:
-             xmin, ymin, xmax, ymax = [int(val) for val in box.tolist()]
-             cv2.rectangle(img_with_boxes, (xmin, ymin), (xmax, ymax), colors[class_name], 2)
-     return img_with_boxes

  # ==============================================================================
  # Streamlit UI
  # ==============================================================================

- # --- Session State Management ---
  if "stage" not in st.session_state:
      st.session_state.stage = "upload"
      st.session_state.original_image = None
      st.session_state.processed_image = None
      st.session_state.annotated_image = None

- # --- Sidebar Controls ---
  with st.sidebar:
      st.title("🤖 Document AI Toolkit")
      st.markdown("---")
-
      if st.button("🔄 Start Over", use_container_width=True):
          for key in list(st.session_state.keys()):
              del st.session_state[key]
@@ -139,31 +208,37 @@ with st.sidebar:

      if st.session_state.stage == "upload":
          st.header("Step 1: Upload Image")
-         uploaded_file = st.file_uploader("Upload your document", type=["jpg", "jpeg", "png"], label_visibility="collapsed")
          if uploaded_file:
              file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
              st.session_state.original_image = cv2.imdecode(file_bytes, 1)
              st.session_state.stage = "processing"
              st.rerun()
      elif st.session_state.stage == "processing":
          st.header("Step 2: Pre-process")
          if st.button("▶️ Start Pre-processing", use_container_width=True, type="primary"):
-             with st.spinner("Straightening & correcting orientation..."):
                  original_image = st.session_state.original_image
-                 straightened = find_and_straighten_document(original_image)
-                 image_to_orient = straightened if straightened is not None and straightened.size > 0 else original_image
-                 st.session_state.processed_image = correct_orientation(image_to_orient)
                  st.session_state.stage = "analysis"
                  st.rerun()
      elif st.session_state.stage == "analysis":
          st.header("Step 3: Analyze Table")
          if st.button("📊 Find Table Structure", use_container_width=True, type="primary"):
              with st.spinner("Running Table Transformer model..."):
-                 st.session_state.annotated_image = extract_and_draw_table_structure(st.session_state.processed_image)
                  st.session_state.stage = "done"
                  st.rerun()

- # --- Main Panel Display ---
  st.title("Document Processing Workflow")

  # Step 1: Upload
@@ -172,17 +247,27 @@ with expander1:
      if st.session_state.original_image is None:
          st.info("Please upload a document image using the sidebar to begin.")
      else:
-         st.image(cv2.cvtColor(st.session_state.original_image, cv2.COLOR_BGR2RGB), use_container_width=True)
          st.success("Image uploaded successfully.")

  # Step 2: Pre-process
  if st.session_state.original_image is not None:
-     expander2 = st.expander("Step 2: Pre-process Document", expanded=(st.session_state.stage == "processing" or st.session_state.stage == "analysis"))
      with expander2:
          if st.session_state.processed_image is None:
              st.info("Click 'Start Pre-processing' in the sidebar.")
          else:
-             st.image(cv2.cvtColor(st.session_state.processed_image, cv2.COLOR_BGR2RGB), caption="Straightened & Oriented", use_container_width=True)
              st.success("Pre-processing complete.")

  # Step 3: Analysis
@@ -194,9 +279,21 @@ if st.session_state.processed_image is not None:
          else:
              tab1, tab2 = st.tabs(["✅ Corrected Document", "📊 Table Structure"])
              with tab1:
-                 st.image(cv2.cvtColor(st.session_state.processed_image, cv2.COLOR_BGR2RGB), use_container_width=True)
                  _, buf = cv2.imencode(".jpg", st.session_state.processed_image)
-                 st.download_button("📥 Download Clean Image", data=buf.tobytes(), file_name="corrected.jpg", mime="image/jpeg", use_container_width=True)
              with tab2:
-                 st.image(cv2.cvtColor(st.session_state.annotated_image, cv2.COLOR_BGR2RGB), use_container_width=True)
-                 st.success("Analysis complete.")

+++ b/app.py
  # ==============================================================================
  # App Configuration & Styling
  # ==============================================================================
+ st.set_page_config(page_title="Document AI Toolkit", page_icon="🤖", layout="wide")

+ st.markdown(
+     """
      <style>
      .main .block-container {
          max-width: 900px;

          padding-bottom: 2rem;
      }
      </style>
+     """,
+     unsafe_allow_html=True,
+ )

  # ==============================================================================
  # Model Loading (Cached)
  # ==============================================================================
  @st.cache_resource
  def load_model():
+     model = TableTransformerForObjectDetection.from_pretrained(
+         "microsoft/table-transformer-structure-recognition"
+     )
+     processor = DetrImageProcessor.from_pretrained(
+         "microsoft/table-transformer-structure-recognition"
+     )
+     model.eval()
+     return model, processor
+

  model, processor = load_model()
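Note: the drawing code further down keys on the label strings "table", "table row", and "table column". A quick standalone check of what this checkpoint's config actually exposes (a reference sketch, not part of the commit) would be:

# Standalone sanity check of the structure-recognition label set; assumes only
# the same checkpoint name as used above.
from transformers import TableTransformerForObjectDetection

m = TableTransformerForObjectDetection.from_pretrained(
    "microsoft/table-transformer-structure-recognition"
)
print(m.config.id2label)  # should include "table", "table row", "table column" among others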
  # ==============================================================================
+ # Core Image Processing Functions
  # ==============================================================================
  def order_points(pts):
      xSorted = pts[np.argsort(pts[:, 0]), :]

      (br, tr) = rightMost[np.argsort(D)[::-1], :]
      return np.array([tl, tr, br, bl], dtype="float32")

+
+ def _four_point_warp(image, pts):
+     pts = order_points(pts.astype("float32"))
+     (tl, tr, br, bl) = pts
+     widthA = np.linalg.norm(br - bl)
+     widthB = np.linalg.norm(tr - tl)
+     heightA = np.linalg.norm(tr - br)
+     heightB = np.linalg.norm(tl - bl)
+     maxW, maxH = int(max(widthA, widthB)), int(max(heightA, heightB))
+     dst = np.array([[0, 0], [maxW - 1, 0], [maxW - 1, maxH - 1], [0, maxH - 1]], dtype="float32")
+     M = cv2.getPerspectiveTransform(pts, dst)
+     return cv2.warpPerspective(image, M, (maxW, maxH))
+
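_four_point_warp is the standard four-point perspective rectification. A minimal self-contained sketch of the same idea on hand-picked corners (illustrative values only, independent of app.py):

# Four-point warp on a synthetic white image; corner coordinates are made up.
import cv2
import numpy as np

img = np.full((400, 600, 3), 255, np.uint8)
corners = np.array([[120, 80], [480, 60], [500, 340], [100, 360]], dtype="float32")  # tl, tr, br, bl
w = int(max(np.linalg.norm(corners[2] - corners[3]), np.linalg.norm(corners[1] - corners[0])))
h = int(max(np.linalg.norm(corners[1] - corners[2]), np.linalg.norm(corners[0] - corners[3])))
dst = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]], dtype="float32")
warped = cv2.warpPerspective(img, cv2.getPerspectiveTransform(corners, dst), (w, h))
print(warped.shape)  # (h, w, 3)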
  def find_and_straighten_document(image):
+     """Find 4 page corners; fall back to minAreaRect if needed."""
+     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+     gray = cv2.GaussianBlur(gray, (5, 5), 0)
+     edges = cv2.Canny(gray, 50, 150)
+     edges = cv2.dilate(edges, np.ones((3, 3), np.uint8), iterations=1)
+
+     cnts, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+     cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:10]
+
+     for c in cnts:
+         peri = cv2.arcLength(c, True)
+         approx = cv2.approxPolyDP(c, 0.02 * peri, True)
+         if len(approx) == 4:
+             return _four_point_warp(image, approx.reshape(4, 2))
+
+     if cnts:
+         box = cv2.boxPoints(cv2.minAreaRect(max(cnts, key=cv2.contourArea)))
+         return _four_point_warp(image, box)
+
+     return image
+
+
+ def deskew_slight(image):
+     """Remove small residual tilt so rows/cols are parallel to axes."""
      gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+     thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
+     coords = np.column_stack(np.where(thr == 0))  # use ink pixels
+     if len(coords) < 100:
+         return image
+     angle = cv2.minAreaRect(coords)[-1]
+     angle = -(90 + angle) if angle < -45 else -angle
+     if abs(angle) < 0.3:
+         return image
+     (h, w) = image.shape[:2]
+     M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
+     return cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)
+
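One caveat on deskew_slight: the angle convention of cv2.minAreaRect reportedly changed around OpenCV 4.5 (roughly [-90, 0) before, (0, 90] after), which is what the angle < -45 branch compensates for. A quick local check (sketch only):

# Print the minAreaRect angle convention of the installed OpenCV build.
import cv2

pts = cv2.boxPoints(((100, 100), (200, 50), 3.0))  # corners of a box rotated by 3 degrees
print(cv2.__version__, cv2.minAreaRect(pts)[-1])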
  def correct_orientation(image):
+     """Robust orientation correction using pytesseract + fallback."""
      try:
          osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT, timeout=5)
+         rotation = int(osd.get("rotate", 0))
+         if rotation:
+             # pytesseract's 'rotate' is the CLOCKWISE angle to correct the image.
+             angle_map = {
+                 90: cv2.ROTATE_90_CLOCKWISE,
+                 180: cv2.ROTATE_180,
+                 270: cv2.ROTATE_90_COUNTERCLOCKWISE,
+             }
              return cv2.rotate(image, angle_map[rotation])
          return image
      except Exception:
+         # Fallback: choose the rotation with the most horizontal text boxes
          gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+         thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+         rots = {
+             0: thr,
+             90: cv2.rotate(thr, cv2.ROTATE_90_CLOCKWISE),
+             180: cv2.rotate(thr, cv2.ROTATE_180),
+             270: cv2.rotate(thr, cv2.ROTATE_90_COUNTERCLOCKWISE),
+         }
+         best = 0
+         best_count = -1
+         for ang, img in rots.items():
              try:
+                 data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, timeout=5)
+                 cnt = sum(
+                     1
+                     for i, c in enumerate(data["conf"])
+                     if str(c).isdigit()
+                     and int(c) > 10
+                     and data["width"][i] > data["height"][i]
+                 )
+                 if cnt > best_count:
+                     best, best_count = ang, cnt
              except Exception:
+                 pass
+         if best:
+             angle_map = {
+                 90: cv2.ROTATE_90_CLOCKWISE,
+                 180: cv2.ROTATE_180,
+                 270: cv2.ROTATE_90_COUNTERCLOCKWISE,
+             }
+             return cv2.rotate(image, angle_map[best])
+         return image
+
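The happy path above depends entirely on Tesseract's orientation-and-script detection (OSD). To see what the OSD dict contains for a given scan (sketch only; "sample.jpg" is a placeholder path and a local Tesseract install is assumed):

# Inspect the OSD result that correct_orientation() consumes; "rotate" is the key used above.
import cv2
import pytesseract

img = cv2.imread("sample.jpg")  # placeholder
osd = pytesseract.image_to_osd(img, output_type=pytesseract.Output.DICT)
print(osd)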
  def extract_and_draw_table_structure(image_bgr):
+     """Run TableTransformer and draw table/table row/table column boxes."""
      image_pil = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
      inputs = processor(images=image_pil, return_tensors="pt")
+
+     with torch.inference_mode():
          outputs = model(**inputs)
+
+     h, w = image_bgr.shape[:2]
+     target_sizes = torch.tensor([[h, w]], dtype=torch.float32)
      results = processor.post_process_object_detection(outputs, threshold=0.6, target_sizes=target_sizes)[0]
+
+     img = image_bgr.copy()
      colors = {"table row": (0, 255, 0), "table column": (255, 0, 0), "table": (255, 0, 255)}
      for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+         cls = model.config.id2label[label.item()]
+         if cls in colors:
+             xmin, ymin, xmax, ymax = [int(round(v)) for v in box.tolist()]
+             xmin = max(0, min(xmin, w - 1))
+             xmax = max(0, min(xmax, w - 1))
+             ymin = max(0, min(ymin, h - 1))
+             ymax = max(0, min(ymax, h - 1))
+             cv2.rectangle(img, (xmin, ymin), (xmax, ymax), colors[cls], 2)
+     return img
+
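Beyond drawing rectangles, the same post-processed results dict can be summarised directly. A small follow-on sketch (assumes the results object and model.config.id2label from the function above):

# Count kept detections per class from a post_process_object_detection() result.
from collections import Counter

def summarize_structure(results, id2label):
    names = [id2label[label.item()] for label in results["labels"]]
    return Counter(names)  # e.g. counts of "table row" / "table column" boxes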
  # ==============================================================================
  # Streamlit UI
  # ==============================================================================

+ # Session state
  if "stage" not in st.session_state:
      st.session_state.stage = "upload"
      st.session_state.original_image = None
      st.session_state.processed_image = None
      st.session_state.annotated_image = None

+ # Sidebar
  with st.sidebar:
      st.title("🤖 Document AI Toolkit")
      st.markdown("---")
+
      if st.button("🔄 Start Over", use_container_width=True):
          for key in list(st.session_state.keys()):
              del st.session_state[key]

      if st.session_state.stage == "upload":
          st.header("Step 1: Upload Image")
+         uploaded_file = st.file_uploader(
+             "Upload your document", type=["jpg", "jpeg", "png"], label_visibility="collapsed"
+         )
          if uploaded_file:
              file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
              st.session_state.original_image = cv2.imdecode(file_bytes, 1)
              st.session_state.stage = "processing"
              st.rerun()
+
      elif st.session_state.stage == "processing":
          st.header("Step 2: Pre-process")
          if st.button("▶️ Start Pre-processing", use_container_width=True, type="primary"):
+             with st.spinner("Correcting orientation, straightening & deskewing..."):
                  original_image = st.session_state.original_image
+                 oriented = correct_orientation(original_image)
+                 straightened = find_and_straighten_document(oriented)
+                 st.session_state.processed_image = deskew_slight(straightened)
                  st.session_state.stage = "analysis"
                  st.rerun()
+
      elif st.session_state.stage == "analysis":
          st.header("Step 3: Analyze Table")
          if st.button("📊 Find Table Structure", use_container_width=True, type="primary"):
              with st.spinner("Running Table Transformer model..."):
+                 st.session_state.annotated_image = extract_and_draw_table_structure(
+                     st.session_state.processed_image
+                 )
                  st.session_state.stage = "done"
                  st.rerun()
+ # Main panel
  st.title("Document Processing Workflow")

  # Step 1: Upload

      if st.session_state.original_image is None:
          st.info("Please upload a document image using the sidebar to begin.")
      else:
+         st.image(
+             cv2.cvtColor(st.session_state.original_image, cv2.COLOR_BGR2RGB),
+             use_container_width=True,
+         )
          st.success("Image uploaded successfully.")

  # Step 2: Pre-process
  if st.session_state.original_image is not None:
+     expander2 = st.expander(
+         "Step 2: Pre-process Document",
+         expanded=(st.session_state.stage == "processing" or st.session_state.stage == "analysis"),
+     )
      with expander2:
          if st.session_state.processed_image is None:
              st.info("Click 'Start Pre-processing' in the sidebar.")
          else:
+             st.image(
+                 cv2.cvtColor(st.session_state.processed_image, cv2.COLOR_BGR2RGB),
+                 caption="Oriented • Straightened • Deskewed",
+                 use_container_width=True,
+             )
              st.success("Pre-processing complete.")

  # Step 3: Analysis

          else:
              tab1, tab2 = st.tabs(["✅ Corrected Document", "📊 Table Structure"])
              with tab1:
+                 st.image(
+                     cv2.cvtColor(st.session_state.processed_image, cv2.COLOR_BGR2RGB),
+                     use_container_width=True,
+                 )
                  _, buf = cv2.imencode(".jpg", st.session_state.processed_image)
+                 st.download_button(
+                     "📥 Download Clean Image",
+                     data=buf.tobytes(),
+                     file_name="corrected.jpg",
+                     mime="image/jpeg",
+                     use_container_width=True,
+                 )
              with tab2:
+                 st.image(
+                     cv2.cvtColor(st.session_state.annotated_image, cv2.COLOR_BGR2RGB),
+                     use_container_width=True,
+                 )
+                 st.success("Analysis complete.")
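Both panes start at line 10 of app.py, so the file's import block sits above the hunks and is not shown. A plausible reconstruction from the names used (an assumption, not part of this commit; only the scipy import is confirmed, via the first hunk header) would be:

# Presumed app.py header (lines 1-9, outside the diff); inferred from usage.
import cv2
import numpy as np
import pytesseract
import streamlit as st
import torch
from PIL import Image
from scipy.spatial import distance as dist
from transformers import DetrImageProcessor, TableTransformerForObjectDetection

With those imports in place, the app runs as usual via streamlit run app.py.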