AnjaliSarawgi committed
Commit 8cb1f4a · 1 Parent(s): 68afe16

Add application file

Files changed (1)
  1. app.py +11 -242
app.py CHANGED
@@ -1,27 +1,3 @@
- """
- Gradio application for performing OCR on scanned Old Nepali documents.
-
- This script is a Gradio port of a Streamlit application originally built
- to visualize and edit OCR output. It loads a pre‑trained model for
- sequence decoding, accepts an input image (and optional segmentation
- XML in ALTO format), performs OCR on segmented lines, highlights tokens
- with low confidence and offers downloads of both the raw text and per
- token scores.
-
- The heavy lifting functions (model loading, pre‑processing, inference
- and highlighting) are adapted directly from the Streamlit version. The
- UI has been simplified for Gradio: users upload an image and optional
- XML file, choose preprocessing steps and a highlight metric, then run
- OCR. The results are displayed alongside the overlaid segmentation
- boxes and a table of token scores. An editable textbox lets users
- modify the predicted text before downloading it.
-
- To run this app locally, install gradio (`pip install gradio`) and
- execute this script with Python:
-
- python gradio_app.py
-
- """
 
  import io
  import os
@@ -46,68 +22,18 @@ from transformers import (
  from matplotlib import cm
  import gradio as gr
 
- # ----------------------------------------------------------------------
- # Configuration
- #
- # These constants control various aspects of the OCR pipeline. You can
- # adjust them to trade off accuracy, performance or output volume.
-
- # The maximum number of tokens to decode for a single line. If your
- # documents typically have longer lines you can increase this value, but
- # beware that very long sequences may cause more memory usage.
  MAX_LEN: int = 128
-
- # How many alternative tokens to keep when computing per–token statistics.
  TOPK: int = 3
-
- # If an XML segmentation file is provided, only process the first
- # MAX_LINES lines. This prevents huge documents from consuming
- # excessive resources.
  MAX_LINES: int = 120
-
- # Images are resized such that the longest side does not exceed this
- # number of pixels before passing them to the OCR model. Increasing
- # this value may improve accuracy at the cost of speed and memory.
  RESIZE_MAX_SIDE: int = 800
-
- # Threshold used when highlighting tokens by relative probability. A
- # ratio of Top2/Top1 greater than this value will cause the token to
- # be highlighted in red.
  REL_PROB_TH: float = 0.70
-
- # A regex used to clean up Unicode control characters before text
- # normalization. Soft hyphens, zero width spaces and similar marks
- # interfere with accurate token matching.
  CLEANUP: re.Pattern = re.compile(r"[\u00AD\u200B\u200C\u200D]")
-
- # Default font path for rendering predictions directly on the image.
  FONT_PATH: str = os.path.join("NotoSansDevanagari-Regular.ttf")
 
 
- # ----------------------------------------------------------------------
- # Model loading
- #
- # Loading the model and associated tokenizer/processor is slow. Use
- # functools.lru_cache to ensure this only happens once per process.
-
  @lru_cache(maxsize=1)
  def load_model():
- """Load the OCR model, tokenizer and feature extractor.
-
- Returns
- -------
- model : VisionEncoderDecoderModel
- The loaded model in evaluation mode.
- tokenizer : PreTrainedTokenizerFast
- Tokenizer corresponding to the decoder part of the model.
- feature_extractor : callable
- Feature extractor converting PIL images into model inputs.
- device : torch.device
- The device (CPU or CUDA) used for inference.
- """
  model_path = "AnjaliSarawgi/model-oct"
- # In an offline environment the HF token is None; if you wish
- # to use a private model you can set HF_TOKEN in your environment.
  hf_token = os.environ.get("HF_TOKEN")
  model = VisionEncoderDecoderModel.from_pretrained(model_path, token=hf_token)
  tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path, token=hf_token)
@@ -117,45 +43,13 @@ def load_model():
  return model, tokenizer, processor.feature_extractor, device
 
 
- # ----------------------------------------------------------------------
- # Utility functions
- #
-
  def clean_text(text: str) -> str:
- """Normalize and collapse whitespace from a decoded string.
-
- Parameters
- ----------
- text : str
- The raw decoded string from the model.
-
- Returns
- -------
- str
- The cleaned string with Unicode normalization and whitespace
- removed. All whitespace characters are stripped since the
- predictions are later tokenized at the akshara (syllable) level.
- """
  text = unicodedata.normalize("NFC", text)
  text = CLEANUP.sub("", text)
  return re.sub(r"\s+", "", text)
 
 
  def prepare_image(image: Image.Image, max_side: int = RESIZE_MAX_SIDE) -> Image.Image:
- """Resize the image so that its longest side equals max_side.
-
- Parameters
- ----------
- image : PIL.Image
- Input image.
- max_side : int, optional
- Maximum allowed size for the longest side of the image.
-
- Returns
- -------
- PIL.Image
- The resized image.
- """
  img = image.convert("RGB")
  w, h = img.size
  if max(w, h) > max_side:
@@ -164,36 +58,10 @@ def prepare_image(image: Image.Image, max_side: int = RESIZE_MAX_SIDE) -> Image.
 
 
  def get_amp_ctx():
- """Return the appropriate context manager for automatic mixed precision."""
  return torch.cuda.amp.autocast if torch.cuda.is_available() else contextlib.nullcontext
 
-
- # ----------------------------------------------------------------------
- # XML parsing and segmentation
  #
  def parse_boxes_from_xml(xml_bytes: bytes, level: str = "line", image_size: tuple | None = None):
- """Parse ALTO or PAGE XML to extract bounding boxes.
-
- Parameters
- ----------
- xml_bytes : bytes
- Raw XML bytes.
- level : {"block", "line", "word"}, optional
- The segmentation level to extract. For OCR we use "line".
- image_size : tuple or None
- If provided, image_size=(width, height) allows rescaling
- coordinates to match the actual image. ALTO files often store
- absolute page sizes that differ from the image dimensions.
-
- Returns
- -------
- list of dict
- Each dict represents a bounding box with keys:
- - "bbox": [x1, y1, x2, y2]
- - "points": list of (x, y) if polygonal coordinates exist
- - "id": line identifier (string)
- - "label": the type of element (e.g. TextLine)
- """
  def _strip_ns(elem):
  for e in elem.iter():
  if isinstance(e.tag, str) and e.tag.startswith("{"):
@@ -323,7 +191,6 @@ def parse_boxes_from_xml(xml_bytes: bytes, level: str = "line", image_size: tupl
 
 
  def sort_boxes_reading_order(boxes, y_tol: int = 10):
- """Sort bounding boxes top‑to‑bottom then left‑to‑right."""
  def key(b):
  x1, y1, x2, y2 = b["bbox"]
  return (round(y1 / max(1, y_tol)), y1, x1)
@@ -331,21 +198,6 @@ def sort_boxes_reading_order(boxes, y_tol: int = 10):
 
 
  def draw_boxes(img: Image.Image, boxes):
- """Overlay semi‑transparent red polygons or rectangles on an image.
-
- Parameters
- ----------
- img : PIL.Image
- The base image.
- boxes : list of dict
- Segmentation boxes with either 'points' or 'bbox' keys.
-
- Returns
- -------
- PIL.Image
- An image with red overlays marking each box. Boxes are numbered
- starting from 1.
- """
  base = img.convert("RGBA")
  overlay = Image.new("RGBA", base.size, (0, 0, 0, 0))
  draw = ImageDraw.Draw(overlay)
@@ -366,33 +218,7 @@
  return Image.alpha_composite(base, overlay).convert("RGB")
 
 
- # ----------------------------------------------------------------------
- # OCR inference per line
- #
  def predict_and_score_once(image: Image.Image, line_id: int = 1, topk: int = TOPK):
- """Run the model on a single cropped line and return predictions and scores.
-
- This helper wraps the model.generate call to obtain per‑token
- probabilities and derives a DataFrame summarizing each decoding step.
-
- Parameters
- ----------
- image : PIL.Image
- Cropped segment to process.
- line_id : int, optional
- Identifier used in the output DataFrame.
- topk : int, optional
- Number of alternative tokens to keep for each decoding position.
-
- Returns
- -------
- decoded_text : str
- Cleaned predicted string for the line.
- df : pandas.DataFrame
- Table with one row per generated token containing the following
- columns: line_id, seq_pos, token_id, token, confidence,
- rel_prob, entropy, gap12, alt_tokens, alt_probs.
- """
  model, tokenizer, feature_extractor, device = load_model()
  img = prepare_image(image)
  pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(device)
@@ -479,13 +305,6 @@
  return decoded_text, df
 
 
- # ----------------------------------------------------------------------
- # Text splitting into aksharas (syllable units) for highlighting
- #
- # The following regex and helper functions split a Devanagari string into
- # aksharas. This is necessary to map model tokens back to spans of
- # characters when highlighting uncertain predictions.
-
  DEV_CONS = "\u0915-\u0939\u0958-\u095F\u0978-\u097F" # consonants incl. nukta variants range
  INDEP_VOW = "\u0904-\u0914" # independent vowels
  NUKTA = "\u093C" # nukta
@@ -533,29 +352,6 @@ def parse_alt_tokens(s: str):
  def highlight_tokens_with_tooltips(
  line_text: str, df_tok: pd.DataFrame, red_threshold: float, metric_column: str
  ) -> str:
- """Insert HTML spans around tokens whose chosen metric exceeds threshold.
-
- The metric column can be "rel_prob" (relative probability) or
- "entropy". Tokens with a value strictly greater than red_threshold
- will be wrapped in a span with a tooltip listing alternative
- predictions and their probabilities.
-
- Parameters
- ----------
- line_text : str
- The cleaned line prediction.
- df_tok : pandas.DataFrame
- DataFrame of token statistics for the corresponding line.
- red_threshold : float
- Values above this threshold will be highlighted.
- metric_column : str
- Column name in df_tok used for thresholding.
-
- Returns
- -------
- str
- An HTML string with <span> elements inserted.
- """
  aks, spans = split_aksharas(line_text)
  joined = "".join(aks)
  used_ranges = []
@@ -639,40 +435,7 @@ def run_ocr(
  apply_bin: bool,
  highlight_metric: str,
  ):
- """Run the OCR pipeline on user inputs and return results for Gradio.
-
- Parameters
- ----------
- image : numpy.ndarray or None
- The uploaded image converted to a NumPy array by Gradio. If
- None, the function returns empty results.
- xml_file : tuple or None
- A tuple representing the uploaded XML file as provided by
- gr.File. The first element is the file name and the second is
- bytes. If None, no segmentation is applied and the entire
- image is processed as a single line.
- apply_gray : bool
- Whether to convert the image to grayscale before OCR.
- apply_bin : bool
- Whether to apply binarization (Otsu threshold) before OCR. If
- selected, grayscale conversion is applied first automatically.
- highlight_metric : str
- Which metric to use for highlighting ("Relative Probability" or
- "Entropy").
-
- Returns
- -------
- overlay_img : PIL.Image or None
- Image with segmentation boxes drawn. None if no input image.
- predictions_html : str
- HTML formatted predicted text with highlighted tokens.
- df_scores : pandas.DataFrame or None
- DataFrame of per‑token statistics. None if no input image.
- txt_file_path : str or None
- Path to a temporary .txt file containing the plain predicted text.
- csv_file_path : str or None
- Path to a temporary CSV file containing the extended token scores.
- """
+
  if image is None:
  return None, "", None, None, None
  # Convert the numpy array to a PIL image
@@ -806,7 +569,10 @@ def run_ocr(
  csv_fd.close()
  except Exception:
  csv_path = None
- return overlay_img, predicted_html, df_all, txt_path, csv_path
+ # return overlay_img, predicted_html, df_all, txt_path, csv_path
+ txt_bytes = plain_text.encode("utf-8")
+ csv_bytes = df_all.to_csv(index=False).encode("utf-8")
+ return overlay_img, predicted_html, df_all, txt_bytes, csv_bytes
 
 
  # ----------------------------------------------------------------------
@@ -815,7 +581,7 @@ def run_ocr(
  def create_gradio_interface():
  """Create and return the Gradio Blocks interface."""
  with gr.Blocks(title="Old Nepali HTR") as demo:
- gr.Markdown("""# Old Nepali HTR (Gradio)\n\nUpload a scanned image and (optionally) a segmentation XML file. Choose preprocessing\nsteps and a highlight metric, then click **Run OCR** to extract the text.\nUncertain tokens are highlighted with tooltips showing alternative predictions.\nYou can edit the plain text below and download it or the full token scores.""")
+ gr.Markdown("""# Old Nepali HTR \n\nUpload a scanned image and (optionally) a segmentation XML file. Choose preprocessing\nsteps and a highlight metric, then click **Run OCR** to extract the text.\nUncertain tokens are highlighted with tooltips showing alternative predictions.\nYou can edit the plain text below and download it or the full token scores.""")
  with gr.Row():
  image_input = gr.Image(type="numpy", label="Upload Image")
  # When used as an input, gr.File returns either a file path or bytes
@@ -841,8 +607,11 @@ def create_gradio_interface():
  predictions_output = gr.HTML(label="Predictions (HTML)")
  df_output = gr.DataFrame(label="Token Scores", interactive=False)
  # Separate file outputs for the OCR prediction, token scores and edited text.
- ocr_txt_output = gr.File(label="Download OCR Prediction (.txt)")
- ocr_csv_output = gr.File(label="Download Token Scores (.csv)")
+ # ocr_txt_output = gr.File(label="Download OCR Prediction (.txt)")
+ # ocr_csv_output = gr.File(label="Download Token Scores (.csv)")
+ ocr_txt_output = gr.File(label="Download OCR Prediction (.txt)", type="binary")
+ ocr_csv_output = gr.File(label="Download Token Scores (.csv)", type="binary")
+
  edited_txt_output = gr.File(label="Download edited text (.txt)")
 
  # Editable text area
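
The functional change in this commit: run_ocr no longer returns temp-file paths for the downloads but UTF-8 encoded bytes, and the two gr.File outputs are declared with type="binary". A minimal, self-contained sketch of the same wiring, assuming (as the diff does) that a gr.File output declared with type="binary" accepts raw bytes returned from a callback; the callback name and payload below are illustrative, not the app's exact code:

    import gradio as gr

    def export_prediction() -> bytes:
        # Build the download payload in memory instead of writing a
        # temporary file, mirroring the bytes-returning run_ocr above.
        return "predicted text".encode("utf-8")

    with gr.Blocks() as demo:
        run_button = gr.Button("Export")
        txt_output = gr.File(label="Download OCR Prediction (.txt)", type="binary")
        run_button.click(export_prediction, inputs=None, outputs=txt_output)

    demo.launch()

Keeping the payload in memory avoids leaving temporary .txt/.csv files behind on the Space between requests.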
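For context on the token-scores table these downloads carry: the removed predict_and_score_once docstring describes wrapping model.generate to obtain per-token statistics (confidence, the Top2/Top1 rel_prob ratio compared against REL_PROB_TH, entropy, and top-k alternatives). A sketch of that scoring loop using the standard transformers generation outputs; the column names mirror the app's, but this is an illustration rather than the app's exact code:

    import torch

    def score_line(model, tokenizer, pixel_values, max_len=128, topk=3):
        out = model.generate(
            pixel_values,
            max_length=max_len,
            output_scores=True,            # keep per-step logits
            return_dict_in_generate=True,  # so out.scores is available
        )
        rows = []
        for pos, step_logits in enumerate(out.scores):  # one tensor per decoded step
            probs = torch.softmax(step_logits[0], dim=-1)  # batch size 1
            top_p, top_i = probs.topk(topk)
            rows.append({
                "seq_pos": pos,
                "token": tokenizer.decode([top_i[0].item()]),
                "confidence": top_p[0].item(),
                "rel_prob": (top_p[1] / top_p[0]).item(),  # Top2/Top1 ratio
                "entropy": -(probs * probs.clamp_min(1e-12).log()).sum().item(),
                "alt_tokens": [tokenizer.decode([i.item()]) for i in top_i],
            })
        return rows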
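The akshara constants kept by the diff (DEV_CONS, INDEP_VOW, NUKTA) exist because, as the removed comment block explains, per-token scores must be mapped back to character spans before highlighting. A simplified splitter in that spirit; the MATRA and SIGNS ranges here are assumptions, and real Devanagari segmentation has more cases, so treat this as an illustration of the technique, not the app's split_aksharas:

    import re

    DEV_CONS = "\u0915-\u0939\u0958-\u095F\u0978-\u097F"  # consonants incl. nukta variants
    INDEP_VOW = "\u0904-\u0914"                           # independent vowels
    MATRA = "\u093E-\u094C\u0962\u0963"                   # dependent vowel signs (assumed range)
    VIRAMA = "\u094D"
    SIGNS = "\u0900-\u0903\u093C"                         # candrabindu/anusvara/visarga/nukta

    # One akshara: a consonant cluster joined by viramas with optional signs
    # and a final matra, or an independent vowel, or any other character.
    AKSHARA = re.compile(
        f"[{DEV_CONS}][{SIGNS}]?(?:{VIRAMA}[{DEV_CONS}][{SIGNS}]?)*[{MATRA}]?[{SIGNS}]?"
        f"|[{INDEP_VOW}][{SIGNS}]?"
        "|."
    )

    def split_aksharas(text: str) -> list[str]:
        return AKSHARA.findall(text)

    print(split_aksharas("नमस्ते"))  # -> ['न', 'म', 'स्ते']

Splitting at this level is what lets highlight_tokens_with_tooltips wrap whole syllables, rather than stray combining marks, in its <span> elements.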