# Source: Hugging Face Space google/radextract (running on CPU Upgrade)
# File size: 3,087 bytes; revision fab8051

"""Text preprocessing for radiology reports with complex Unicode and formatting.

Handles reports containing complex Unicode symbolic characters and non-standard
structural formatting that are not currently supported by the prompt and LangExtract
library. Prevents timeout issues by normalizing problematic characters and structures
to formats compatible with downstream processing.

Typical usage example:

    from sanitize import preprocess_report
    
    clean_text = preprocess_report(raw_report)
"""

from __future__ import annotations

import re

import ftfy

# Translation table applied with ``str.translate``: each listed symbol is
# replaced by the plain-ASCII text on the right.
_TRANSLATE = str.maketrans(
    {
        "\u2022": "*",  # BULLET
        "\u25cf": "*",  # BLACK CIRCLE
        "\u27a1": "->",  # BLACK RIGHTWARDS ARROW
        "\uf0e0": "->",  # private-use code point; presumably a symbol-font arrow (TODO confirm)
        "\u2192": "->",  # RIGHTWARDS ARROW
        "\u2190": "<-",  # LEFTWARDS ARROW
        "\u00d7": "x",  # MULTIPLICATION SIGN
        "\u2191": "up",  # UPWARDS ARROW
        "\u2642": "male",  # MALE SIGN
        "\u2640": "female",  # FEMALE SIGN
        "\u2010": "-",  # HYPHEN
        "\u2013": "-",  # EN DASH
        "\u2014": "-",  # EM DASH
        "\u00a0": " ",  # NO-BREAK SPACE
    }
)

# A run of one or more spaces/tabs; newlines are deliberately not included.
_WS = re.compile(r"[ \t]+")
# Three or more newlines, optionally separated by other whitespace
# (i.e. two or more consecutive blank lines).
_BLANKS = re.compile(r"\n\s*\n\s*\n+")

# Structure normalization patterns
# Opening wrapper such as "--- BEGIN REPORT ---", plus any newlines after it.
_BEGIN = re.compile(r"---\s*BEGIN [^-]+---\n*", re.I)
# Closing wrapper such as "--- END REPORT ---", plus surrounding whitespace.
_END = re.compile(r"\n*---\s*END [^-]+---\s*", re.I)
# Header wrapped in triple asterisks, e.g. "*** FINDINGS ***"; group 1 is the title.
_HEADER = re.compile(r"\*{3}\s*([^*]+?)\s*\*{3}", re.I)
# Leading run of bullet characters (*, U+2022, U+25CF, -) at the start of a line.
# The (?!>) guard makes the run stop short of a "->" arrow, so the ASCII arrows
# this module substitutes for Unicode arrows are not mangled into a bare ">".
_BULLET_HDR = re.compile(r"^[ \t]*[\*\u2022\u25CF-]+(?!>)\s*", re.M)
# Enumeration marker "N)" or "N." at the start of a line; group 1 is the number.
_ENUM = re.compile(r"^[ \t]*(\d+)[\)\.][ \t]+", re.M)


def sanitize_text(text: str) -> str:
    """Repairs Unicode problems and tidies whitespace in a piece of text.

    The steps run in a fixed order: ftfy repair (control characters removed,
    NFC normalization), symbol translation through the module's translation
    table, collapsing of space/tab runs, line-ending unification, and
    squeezing of repeated blank lines.

    Args:
        text: The input text to sanitize.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    repaired = ftfy.fix_text(text, remove_control_chars=True, normalization="NFC")
    translated = repaired.translate(_TRANSLATE)

    # Collapse each run of spaces/tabs into a single space.
    single_spaced = _WS.sub(" ", translated)

    # Unify Windows (CRLF) and bare-CR line endings to LF; CRLF is handled
    # first so it does not turn into two newlines.
    unix_lines = single_spaced.replace("\r\n", "\n").replace("\r", "\n")

    # Runs of blank lines are reduced to a single blank line.
    return _BLANKS.sub("\n\n", unix_lines).strip()


def normalize_structure(text: str) -> str:
    """Rewrites report structure into a uniform plain-text layout.

    Applies the module's structure patterns in order: BEGIN/END report
    wrappers are dropped, triple-asterisk headers become ``Title:``, leading
    bullet markers are removed, and ``N)`` / ``N.`` enumerations are rewritten
    as ``N. ``.

    Args:
        text: The input text to normalize.

    Returns:
        The restructured text with leading and trailing whitespace stripped.
    """

    def _header_label(match: re.Match[str]) -> str:
        # "*** TITLE ***" -> "TITLE:"
        return f"{match.group(1).strip()}:"

    def _enum_label(match: re.Match[str]) -> str:
        # "3)" or "3." followed by spaces/tabs -> "3. "
        return f"{match.group(1)}. "

    # Order matters: headers are converted before bullet stripping so their
    # leading asterisks are not consumed as bullet markers.
    rewrites = (
        (_BEGIN, ""),
        (_END, ""),
        (_HEADER, _header_label),
        (_BULLET_HDR, ""),
        (_ENUM, _enum_label),
    )
    for pattern, replacement in rewrites:
        text = pattern.sub(replacement, text)
    return text.strip()


def preprocess_report(raw: str) -> str:
    """Runs the full preprocessing pipeline on a raw radiology report.

    This is the main entry point: the text is first passed through
    ``sanitize_text`` and the result is then passed through
    ``normalize_structure``.

    Args:
        raw: The raw radiology report text.

    Returns:
        The sanitized and structurally normalized report text.
    """
    sanitized = sanitize_text(raw)
    return normalize_structure(sanitized)