Commit · 6c8fcdd
Parent(s): 8cb1f4a
Add application file
app.py CHANGED
@@ -1,3 +1,27 @@
+"""
+Gradio application for performing OCR on scanned Old Nepali documents.
+
+This script is a Gradio port of a Streamlit application originally built
+to visualize and edit OCR output. It loads a pre‑trained model for
+sequence decoding, accepts an input image (and optional segmentation
+XML in ALTO format), performs OCR on segmented lines, highlights tokens
+with low confidence and offers downloads of both the raw text and per
+token scores.
+
+The heavy lifting functions (model loading, pre‑processing, inference
+and highlighting) are adapted directly from the Streamlit version. The
+UI has been simplified for Gradio: users upload an image and optional
+XML file, choose preprocessing steps and a highlight metric, then run
+OCR. The results are displayed alongside the overlaid segmentation
+boxes and a table of token scores. An editable textbox lets users
+modify the predicted text before downloading it.
+
+To run this app locally, install gradio (`pip install gradio`) and
+execute this script with Python:
+
+    python gradio_app.py
+
+"""
 
 import io
 import os
@@ -22,18 +46,68 @@ from transformers import (
 from matplotlib import cm
 import gradio as gr
 
+# ----------------------------------------------------------------------
+# Configuration
+#
+# These constants control various aspects of the OCR pipeline. You can
+# adjust them to trade off accuracy, performance or output volume.
+
+# The maximum number of tokens to decode for a single line. If your
+# documents typically have longer lines you can increase this value, but
+# beware that very long sequences may cause more memory usage.
 MAX_LEN: int = 128
+
+# How many alternative tokens to keep when computing per–token statistics.
 TOPK: int = 3
+
+# If an XML segmentation file is provided, only process the first
+# MAX_LINES lines. This prevents huge documents from consuming
+# excessive resources.
 MAX_LINES: int = 120
+
+# Images are resized such that the longest side does not exceed this
+# number of pixels before passing them to the OCR model. Increasing
+# this value may improve accuracy at the cost of speed and memory.
 RESIZE_MAX_SIDE: int = 800
+
+# Threshold used when highlighting tokens by relative probability. A
+# ratio of Top2/Top1 greater than this value will cause the token to
+# be highlighted in red.
 REL_PROB_TH: float = 0.70
+
+# A regex used to clean up Unicode control characters before text
+# normalization. Soft hyphens, zero width spaces and similar marks
+# interfere with accurate token matching.
 CLEANUP: re.Pattern = re.compile(r"[\u00AD\u200B\u200C\u200D]")
+
+# Default font path for rendering predictions directly on the image.
 FONT_PATH: str = os.path.join("NotoSansDevanagari-Regular.ttf")
 
 
+# ----------------------------------------------------------------------
+# Model loading
+#
+# Loading the model and associated tokenizer/processor is slow. Use
+# functools.lru_cache to ensure this only happens once per process.
+
 @lru_cache(maxsize=1)
 def load_model():
+    """Load the OCR model, tokenizer and feature extractor.
+
+    Returns
+    -------
+    model : VisionEncoderDecoderModel
+        The loaded model in evaluation mode.
+    tokenizer : PreTrainedTokenizerFast
+        Tokenizer corresponding to the decoder part of the model.
+    feature_extractor : callable
+        Feature extractor converting PIL images into model inputs.
+    device : torch.device
+        The device (CPU or CUDA) used for inference.
+    """
     model_path = "AnjaliSarawgi/model-oct"
+    # In an offline environment the HF token is None; if you wish
+    # to use a private model you can set HF_TOKEN in your environment.
     hf_token = os.environ.get("HF_TOKEN")
     model = VisionEncoderDecoderModel.from_pretrained(model_path, token=hf_token)
     tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path, token=hf_token)
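Note: the comment above relies on functools.lru_cache to make sure the checkpoint is loaded only once per process. A minimal sketch of that caching behaviour, using a stand-in loader instead of the real model-oct download and HF_TOKEN handling:

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def load_model_stub():
        # Stand-in for the expensive VisionEncoderDecoderModel/tokenizer load.
        print("loading (runs only once)")
        return {"model": "weights"}, {"tokenizer": "vocab"}

    a = load_model_stub()   # prints "loading (runs only once)"
    b = load_model_stub()   # served from the cache, no second load
    assert a is b           # the exact same cached tuple is returned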
@@ -43,13 +117,45 @@ def load_model():
     return model, tokenizer, processor.feature_extractor, device
 
 
+# ----------------------------------------------------------------------
+# Utility functions
+#
+
 def clean_text(text: str) -> str:
+    """Normalize and collapse whitespace from a decoded string.
+
+    Parameters
+    ----------
+    text : str
+        The raw decoded string from the model.
+
+    Returns
+    -------
+    str
+        The cleaned string with Unicode normalization and whitespace
+        removed. All whitespace characters are stripped since the
+        predictions are later tokenized at the akshara (syllable) level.
+    """
     text = unicodedata.normalize("NFC", text)
     text = CLEANUP.sub("", text)
     return re.sub(r"\s+", "", text)
 
 
 def prepare_image(image: Image.Image, max_side: int = RESIZE_MAX_SIDE) -> Image.Image:
+    """Resize the image so that its longest side equals max_side.
+
+    Parameters
+    ----------
+    image : PIL.Image
+        Input image.
+    max_side : int, optional
+        Maximum allowed size for the longest side of the image.
+
+    Returns
+    -------
+    PIL.Image
+        The resized image.
+    """
     img = image.convert("RGB")
     w, h = img.size
     if max(w, h) > max_side:
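As the new docstring describes, clean_text NFC-normalizes the prediction, strips soft hyphens and zero-width marks, and then removes all whitespace. A self-contained sketch of the same steps (the regex mirrors the CLEANUP constant above; the sample string is made up):

    import re
    import unicodedata

    CLEANUP = re.compile(r"[\u00AD\u200B\u200C\u200D]")  # soft hyphen, zero-width marks

    def clean_text(text: str) -> str:
        text = unicodedata.normalize("NFC", text)   # canonical composition
        text = CLEANUP.sub("", text)                # drop invisible control marks
        return re.sub(r"\s+", "", text)             # strip all whitespace

    print(clean_text("ने\u200dपाली  राज्य"))   # -> "नेपालीराज्य"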
@@ -58,10 +164,36 @@ def prepare_image(image: Image.Image, max_side: int = RESIZE_MAX_SIDE) -> Image.
 
 
 def get_amp_ctx():
+    """Return the appropriate context manager for automatic mixed precision."""
     return torch.cuda.amp.autocast if torch.cuda.is_available() else contextlib.nullcontext
 
+
+# ----------------------------------------------------------------------
+# XML parsing and segmentation
 #
 def parse_boxes_from_xml(xml_bytes: bytes, level: str = "line", image_size: tuple | None = None):
+    """Parse ALTO or PAGE XML to extract bounding boxes.
+
+    Parameters
+    ----------
+    xml_bytes : bytes
+        Raw XML bytes.
+    level : {"block", "line", "word"}, optional
+        The segmentation level to extract. For OCR we use "line".
+    image_size : tuple or None
+        If provided, image_size=(width, height) allows rescaling
+        coordinates to match the actual image. ALTO files often store
+        absolute page sizes that differ from the image dimensions.
+
+    Returns
+    -------
+    list of dict
+        Each dict represents a bounding box with keys:
+        - "bbox": [x1, y1, x2, y2]
+        - "points": list of (x, y) if polygonal coordinates exist
+        - "id": line identifier (string)
+        - "label": the type of element (e.g. TextLine)
+    """
     def _strip_ns(elem):
         for e in elem.iter():
             if isinstance(e.tag, str) and e.tag.startswith("{"):
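The docstring above describes the dictionaries parse_boxes_from_xml returns. A condensed sketch of the idea for ALTO input only (namespace stripping plus TextLine HPOS/VPOS/WIDTH/HEIGHT attributes; the full function in app.py also handles PAGE XML, polygon points and coordinate rescaling, none of which is shown here):

    import xml.etree.ElementTree as ET

    def parse_alto_lines(xml_bytes: bytes):
        root = ET.fromstring(xml_bytes)
        # Strip namespaces so tag names can be matched directly.
        for e in root.iter():
            if isinstance(e.tag, str) and e.tag.startswith("{"):
                e.tag = e.tag.split("}", 1)[1]
        boxes = []
        for line in root.iter("TextLine"):
            x = int(float(line.get("HPOS", 0)))
            y = int(float(line.get("VPOS", 0)))
            w = int(float(line.get("WIDTH", 0)))
            h = int(float(line.get("HEIGHT", 0)))
            boxes.append({"bbox": [x, y, x + w, y + h],
                          "id": line.get("ID", ""),
                          "label": "TextLine"})
        return boxes

    sample = b'<alto><Layout><TextLine ID="l1" HPOS="10" VPOS="20" WIDTH="300" HEIGHT="40"/></Layout></alto>'
    print(parse_alto_lines(sample))   # [{'bbox': [10, 20, 310, 60], 'id': 'l1', 'label': 'TextLine'}]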
@@ -191,6 +323,7 @@ def parse_boxes_from_xml(xml_bytes: bytes, level: str = "line", image_size: tupl
 
 
 def sort_boxes_reading_order(boxes, y_tol: int = 10):
+    """Sort bounding boxes top‑to‑bottom then left‑to‑right."""
     def key(b):
         x1, y1, x2, y2 = b["bbox"]
         return (round(y1 / max(1, y_tol)), y1, x1)
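The sort key shown above buckets boxes by y (with a tolerance) before ordering by y and x, so boxes on roughly the same baseline are read in order. A small runnable illustration with made-up boxes, assuming the function body ends with return sorted(boxes, key=key), which this hunk does not show:

    def sort_boxes_reading_order(boxes, y_tol: int = 10):
        # Sort by row bucket, then y, then x.
        def key(b):
            x1, y1, x2, y2 = b["bbox"]
            return (round(y1 / max(1, y_tol)), y1, x1)
        return sorted(boxes, key=key)

    boxes = [
        {"id": "b", "bbox": [300, 52, 500, 90]},   # same visual row as "a"
        {"id": "a", "bbox": [10, 48, 250, 88]},
        {"id": "c", "bbox": [10, 140, 500, 180]},  # next row
    ]
    print([b["id"] for b in sort_boxes_reading_order(boxes)])   # ['a', 'b', 'c']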
@@ -198,6 +331,21 @@ def sort_boxes_reading_order(boxes, y_tol: int = 10):
 
 
 def draw_boxes(img: Image.Image, boxes):
+    """Overlay semi‑transparent red polygons or rectangles on an image.
+
+    Parameters
+    ----------
+    img : PIL.Image
+        The base image.
+    boxes : list of dict
+        Segmentation boxes with either 'points' or 'bbox' keys.
+
+    Returns
+    -------
+    PIL.Image
+        An image with red overlays marking each box. Boxes are numbered
+        starting from 1.
+    """
     base = img.convert("RGBA")
     overlay = Image.new("RGBA", base.size, (0, 0, 0, 0))
     draw = ImageDraw.Draw(overlay)
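The draw_boxes docstring promises semi-transparent red overlays. A minimal standalone sketch of the RGBA-overlay-plus-alpha_composite pattern used here (toy image and a single box; the real function additionally numbers each box and supports polygon points):

    from PIL import Image, ImageDraw

    def draw_one_box(img, bbox):
        base = img.convert("RGBA")
        overlay = Image.new("RGBA", base.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(overlay)
        # Translucent fill plus an opaque outline.
        draw.rectangle(bbox, fill=(255, 0, 0, 60), outline=(255, 0, 0, 255), width=2)
        return Image.alpha_composite(base, overlay).convert("RGB")

    page = Image.new("RGB", (200, 100), "white")
    out = draw_one_box(page, [10, 20, 150, 60])
    out.save("overlay_demo.png")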
@@ -218,7 +366,33 @@ def draw_boxes(img: Image.Image, boxes):
     return Image.alpha_composite(base, overlay).convert("RGB")
 
 
+# ----------------------------------------------------------------------
+# OCR inference per line
+#
 def predict_and_score_once(image: Image.Image, line_id: int = 1, topk: int = TOPK):
+    """Run the model on a single cropped line and return predictions and scores.
+
+    This helper wraps the model.generate call to obtain per‑token
+    probabilities and derives a DataFrame summarizing each decoding step.
+
+    Parameters
+    ----------
+    image : PIL.Image
+        Cropped segment to process.
+    line_id : int, optional
+        Identifier used in the output DataFrame.
+    topk : int, optional
+        Number of alternative tokens to keep for each decoding position.
+
+    Returns
+    -------
+    decoded_text : str
+        Cleaned predicted string for the line.
+    df : pandas.DataFrame
+        Table with one row per generated token containing the following
+        columns: line_id, seq_pos, token_id, token, confidence,
+        rel_prob, entropy, gap12, alt_tokens, alt_probs.
+    """
     model, tokenizer, feature_extractor, device = load_model()
     img = prepare_image(image)
     pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(device)
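The docstring above lists per-token columns such as confidence, rel_prob and entropy. A hedged sketch of how such scores can be derived from transformers' generate output (greedy decoding with output_scores=True; the real predict_and_score_once may differ in details such as beam settings and the exact gap12 definition):

    import torch

    def score_tokens(model, tokenizer, pixel_values, max_len=128, topk=3):
        out = model.generate(
            pixel_values,
            max_length=max_len,
            num_beams=1,
            output_scores=True,
            return_dict_in_generate=True,
        )
        rows = []
        for pos, step_logits in enumerate(out.scores):    # one score tensor per generated token
            probs = torch.softmax(step_logits[0], dim=-1)
            top_p, top_i = probs.topk(topk)
            entropy = float(-(probs * probs.clamp_min(1e-12).log()).sum())
            rows.append({
                "seq_pos": pos,
                "token": tokenizer.decode([int(top_i[0])]),
                "confidence": float(top_p[0]),            # probability of the chosen token
                "rel_prob": float(top_p[1] / top_p[0]),   # Top2/Top1, compared against REL_PROB_TH
                "gap12": float(top_p[0] - top_p[1]),
                "entropy": entropy,
                "alt_tokens": "|".join(tokenizer.decode([int(i)]) for i in top_i),
                "alt_probs": "|".join(f"{float(p):.4f}" for p in top_p),
            })
        return rows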
@@ -305,6 +479,13 @@ def predict_and_score_once(image: Image.Image, line_id: int = 1, topk: int = TOP
     return decoded_text, df
 
 
+# ----------------------------------------------------------------------
+# Text splitting into aksharas (syllable units) for highlighting
+#
+# The following regex and helper functions split a Devanagari string into
+# aksharas. This is necessary to map model tokens back to spans of
+# characters when highlighting uncertain predictions.
+
 DEV_CONS = "\u0915-\u0939\u0958-\u095F\u0978-\u097F"  # consonants incl. nukta variants range
 INDEP_VOW = "\u0904-\u0914"  # independent vowels
 NUKTA = "\u093C"  # nukta
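The comment block above explains why predictions are split into aksharas before highlighting. A simplified, self-contained splitter in the same spirit (the character ranges are a reduced version of DEV_CONS/INDEP_VOW above and ignore some signs the app handles):

    import re

    # One consonant cluster (optional nukta, optional virama-joined consonants),
    # an optional dependent vowel sign and optional candrabindu/anusvara/visarga;
    # otherwise an independent vowel or any single character.
    AKSHARA = re.compile(
        r"[\u0915-\u0939\u0958-\u095F]\u093C?"
        r"(?:\u094D[\u0915-\u0939\u0958-\u095F]\u093C?)*"
        r"[\u093E-\u094C]?[\u0901-\u0903]?"
        r"|[\u0904-\u0914][\u0901-\u0903]?"
        r"|."
    )

    def split_aksharas(text):
        spans = [(m.start(), m.end()) for m in AKSHARA.finditer(text)]
        return [text[s:e] for s, e in spans], spans

    aks, spans = split_aksharas("नेपाली")
    print(aks)     # ['ने', 'पा', 'ली']
    print(spans)   # [(0, 2), (2, 4), (4, 6)]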
@@ -352,23 +533,65 @@ def parse_alt_tokens(s: str):
 def highlight_tokens_with_tooltips(
     line_text: str, df_tok: pd.DataFrame, red_threshold: float, metric_column: str
 ) -> str:
+    """Insert HTML spans around tokens whose chosen metric exceeds threshold.
+
+    The metric column can be "rel_prob" (relative probability) or
+    "entropy". Tokens with a value strictly greater than red_threshold
+    will be wrapped in a span with a tooltip listing alternative
+    predictions and their probabilities.
+
+    Parameters
+    ----------
+    line_text : str
+        The cleaned line prediction.
+    df_tok : pandas.DataFrame
+        DataFrame of token statistics for the corresponding line.
+    red_threshold : float
+        Values above this threshold will be highlighted.
+    metric_column : str
+        Column name in df_tok used for thresholding.
+
+    Returns
+    -------
+    str
+        An HTML string with <span> elements inserted.
+    """
     aks, spans = split_aksharas(line_text)
     joined = "".join(aks)
-    used_ranges = []
-    insertions = []
+    used_ranges: list = []
+    insertions: list = []
+    # Define colour classification depending on the metric
+    def color_class(val: float) -> str:
+        if metric_column == "rel_prob":
+            # Use the same thresholds as the original app: >0.7 red, >=0.05 yellow, otherwise green
+            if val >= 0.70:
+                return "token-red"
+            elif val >= 0.05:
+                return "token-yellow"
+            else:
+                return "token-green"
+        else:
+            # For entropy, high values indicate uncertainty. Thresholds here are heuristics.
+            if val >= 2.0:
+                return "token-red"
+            elif val >= 1.0:
+                return "token-yellow"
+            else:
+                return "token-green"
     for _, row in df_tok.iterrows():
-        token = row.get("token", "").strip()
+        token = str(row.get("token", "")).strip()
+        if not token:
+            continue
+        # Extract metric value for classification
         try:
            val = float(row.get(metric_column, 0))
        except Exception:
-
-
-            continue
-        # Try finding the token in the joined akshara sequence
+            val = 0.0
+        # Find the first occurrence of the token in the joined akshara sequence
         start_char_idx = joined.find(token)
         if start_char_idx == -1:
             continue
-        # Locate
+        # Locate corresponding akshara boundaries
         ak_start = ak_end = None
         cum_len = 0
         for i, ak in enumerate(aks):
@@ -381,17 +604,16 @@ def highlight_tokens_with_tooltips(
             cum_len = next_len
         if ak_start is None or ak_end is None:
             continue
-        #
+        # Prevent overlapping spans
         if any(r[0] < ak_end and ak_start < r[1] for r in used_ranges):
             continue
         used_ranges.append((ak_start, ak_end))
-        # Character positions
         char_start = spans[ak_start][0]
         char_end = spans[ak_end - 1][1]
-        #
+        # Prepare tooltip content
         alt_toks = row.get("alt_tokens", "").split("|")
         alt_probs = row.get("alt_probs", "").split("|")
-        tooltip_lines = []
+        tooltip_lines: list = []
         for t, p in zip(alt_toks, alt_probs):
             try:
                 prob = float(p)
@@ -400,12 +622,16 @@
             tooltip_lines.append(f"{_html_escape(t)}: {prob:.3f}")
         tooltip = "\n".join(tooltip_lines)
         token_str = _html_escape(line_text[char_start:char_end])
-
+        cls = color_class(val)
+        html_token = (
+            f"<span class='ocr-token {cls}' data-tooltip='{_html_escape(tooltip)}'>{token_str}</span>"
+        )
         insertions.append((char_start, char_end, html_token))
+    # If nothing was highlighted, return escaped original text
     if not insertions:
         return _html_escape(line_text)
     insertions.sort()
-    out_parts = []
+    out_parts: list = []
     last_idx = 0
     for s, e, html_tok in insertions:
         out_parts.append(_html_escape(line_text[last_idx:s]))
@@ -435,7 +661,40 @@ def run_ocr(
     apply_bin: bool,
     highlight_metric: str,
 ):
-
+    """Run the OCR pipeline on user inputs and return results for Gradio.
+
+    Parameters
+    ----------
+    image : numpy.ndarray or None
+        The uploaded image converted to a NumPy array by Gradio. If
+        None, the function returns empty results.
+    xml_file : tuple or None
+        A tuple representing the uploaded XML file as provided by
+        gr.File. The first element is the file name and the second is
+        bytes. If None, no segmentation is applied and the entire
+        image is processed as a single line.
+    apply_gray : bool
+        Whether to convert the image to grayscale before OCR.
+    apply_bin : bool
+        Whether to apply binarization (Otsu threshold) before OCR. If
+        selected, grayscale conversion is applied first automatically.
+    highlight_metric : str
+        Which metric to use for highlighting ("Relative Probability" or
+        "Entropy").
+
+    Returns
+    -------
+    overlay_img : PIL.Image or None
+        Image with segmentation boxes drawn. None if no input image.
+    predictions_html : str
+        HTML formatted predicted text with highlighted tokens.
+    df_scores : pandas.DataFrame or None
+        DataFrame of per‑token statistics. None if no input image.
+    txt_file_path : str or None
+        Path to a temporary .txt file containing the plain predicted text.
+    csv_file_path : str or None
+        Path to a temporary CSV file containing the extended token scores.
+    """
     if image is None:
         return None, "", None, None, None
     # Convert the numpy array to a PIL image
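The updated return values are file paths rather than raw bytes (compare the removed txt_bytes/csv_bytes lines later in this diff), because gr.File outputs serve files from disk. A hedged sketch of writing those temporary files (the helper name and sample DataFrame are illustrative):

    import tempfile
    import pandas as pd

    def write_downloads(plain_text: str, df_all: pd.DataFrame):
        # Plain-text prediction for the "Download OCR Prediction (.txt)" output.
        txt_fd = tempfile.NamedTemporaryFile(mode="w", suffix=".txt",
                                             delete=False, encoding="utf-8")
        txt_fd.write(plain_text)
        txt_fd.close()
        # Full per-token table for the "Download Token Scores (.csv)" output.
        csv_fd = tempfile.NamedTemporaryFile(mode="w", suffix=".csv",
                                             delete=False, encoding="utf-8")
        df_all.to_csv(csv_fd, index=False)
        csv_fd.close()
        return txt_fd.name, csv_fd.name

    txt_path, csv_path = write_downloads("नमस्ते", pd.DataFrame({"token": ["नम", "स्ते"]}))
    print(txt_path, csv_path)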
@@ -548,6 +807,32 @@ def run_ocr(
         if group.iloc[-1]["token"].strip() == "":
             to_drop.append(group.index[-1])
     df_all = df_all.drop(index=to_drop)
+    # Inject style definitions for token colouring and tooltips
+    style_block = """
+    <style>
+    .ocr-token { position: relative; cursor: pointer; padding: 0 2px; border-radius: 2px; }
+    .ocr-token.token-red { background-color: rgba(255, 107, 107, 0.7); }
+    .ocr-token.token-yellow { background-color: rgba(255, 217, 59, 0.7); }
+    .ocr-token.token-green { background-color: rgba(107, 207, 99, 0.7); }
+    .ocr-token:hover::after {
+        content: attr(data-tooltip);
+        position: absolute;
+        bottom: 120%;
+        left: 0;
+        white-space: pre-line;
+        background: #333;
+        color: #fff;
+        padding: 6px 10px;
+        border-radius: 6px;
+        font-size: 12px;
+        z-index: 999;
+        max-width: 220px;
+        box-shadow: 0 4px 12px rgba(0,0,0,0.15);
+    }
+    </style>
+    """
+    if predicted_html:
+        predicted_html = style_block + predicted_html
     # Prepare plain text by stripping HTML tags and replacing <br>
     plain_text = re.sub(r"<[^>]*>", "", predicted_html.replace("<br>", "\n"))
     # Write temporary files
@@ -569,10 +854,7 @@ def run_ocr(
         csv_fd.close()
     except Exception:
         csv_path = None
-
-    txt_bytes = plain_text.encode("utf-8")
-    csv_bytes = df_all.to_csv(index=False).encode("utf-8")
-    return overlay_img, predicted_html, df_all, txt_bytes, csv_bytes
+    return overlay_img, predicted_html, df_all, txt_path, csv_path
 
 
 # ----------------------------------------------------------------------
@@ -581,7 +863,7 @@ def run_ocr(
 def create_gradio_interface():
     """Create and return the Gradio Blocks interface."""
     with gr.Blocks(title="Old Nepali HTR") as demo:
-        gr.Markdown("""# Old Nepali HTR \n\nUpload a scanned image and (optionally) a segmentation XML file. Choose preprocessing\nsteps and a highlight metric, then click **Run OCR** to extract the text.\nUncertain tokens are highlighted with tooltips showing alternative predictions.\nYou can edit the plain text below and download it or the full token scores.""")
+        gr.Markdown("""# Old Nepali HTR (Gradio)\n\nUpload a scanned image and (optionally) a segmentation XML file. Choose preprocessing\nsteps and a highlight metric, then click **Run OCR** to extract the text.\nUncertain tokens are highlighted with tooltips showing alternative predictions.\nYou can edit the plain text below and download it or the full token scores.""")
         with gr.Row():
             image_input = gr.Image(type="numpy", label="Upload Image")
             # When used as an input, gr.File returns either a file path or bytes
@@ -607,11 +889,8 @@ def create_gradio_interface():
             predictions_output = gr.HTML(label="Predictions (HTML)")
             df_output = gr.DataFrame(label="Token Scores", interactive=False)
             # Separate file outputs for the OCR prediction, token scores and edited text.
-
-
-            ocr_txt_output = gr.File(label="Download OCR Prediction (.txt)", type="binary")
-            ocr_csv_output = gr.File(label="Download Token Scores (.csv)", type="binary")
-
+            ocr_txt_output = gr.File(label="Download OCR Prediction (.txt)")
+            ocr_csv_output = gr.File(label="Download Token Scores (.csv)")
             edited_txt_output = gr.File(label="Download edited text (.txt)")
 
             # Editable text area
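Dropping type="binary" from the gr.File outputs matches run_ocr now returning file paths: a gr.File output renders whatever path the callback returns as a downloadable file. A hedged wiring sketch (the stand-in callback and component subset are illustrative; the app's actual click wiring is outside this hunk):

    import tempfile
    import gradio as gr

    def fake_run_ocr(image):
        # Stand-in for run_ocr: writes a real temp file and returns its path for gr.File.
        txt = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8")
        txt.write("predicted text goes here")
        txt.close()
        return "<b>predicted text goes here</b>", txt.name

    with gr.Blocks(title="Old Nepali HTR") as demo:
        image_input = gr.Image(type="numpy", label="Upload Image")
        run_button = gr.Button("Run OCR")
        predictions_output = gr.HTML(label="Predictions (HTML)")
        ocr_txt_output = gr.File(label="Download OCR Prediction (.txt)")
        run_button.click(fn=fake_run_ocr, inputs=[image_input],
                         outputs=[predictions_output, ocr_txt_output])

    # demo.launch()  # uncomment to try it locally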
@@ -662,7 +941,5 @@ def create_gradio_interface():
         )
     return demo
 
-
-
 iface = create_gradio_interface()
 iface.launch()