Commit e332feb · 1 Parent(s): 0f59b51
Vik Paruchuri committed

Merge plus form processor

marker/config/parser.py CHANGED
@@ -34,45 +34,39 @@ class ConfigParser:
         fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
         fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
         fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
-        fn = click.option("--high_quality", is_flag=True, default=False, help="Enable high quality processing with Gemini.")(fn)
+        fn = click.option("--high_quality", is_flag=True, default=False, help="Enable high quality processing with LLMs.")(fn)
         return fn

     def generate_config_dict(self) -> Dict[str, any]:
         config = {}
         output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
         for k, v in self.cli_options.items():
+            if not v:
+                continue
+
             match k:
                 case "debug":
-                    if v:
-                        config["debug_pdf_images"] = True
-                        config["debug_layout_images"] = True
-                        config["debug_json"] = True
-                        config["debug_data_folder"] = output_dir
+                    config["debug_pdf_images"] = True
+                    config["debug_layout_images"] = True
+                    config["debug_json"] = True
+                    config["debug_data_folder"] = output_dir
                 case "page_range":
-                    if v:
-                        config["page_range"] = parse_range_str(v)
+                    config["page_range"] = parse_range_str(v)
                 case "force_ocr":
-                    if v:
-                        config["force_ocr"] = True
+                    config["force_ocr"] = True
                 case "languages":
-                    if v:
-                        config["languages"] = v.split(",")
+                    config["languages"] = v.split(",")
                 case "config_json":
-                    if v:
-                        with open(v, "r") as f:
-                            config.update(json.load(f))
+                    with open(v, "r") as f:
+                        config.update(json.load(f))
                 case "disable_multiprocessing":
-                    if v:
-                        config["pdftext_workers"] = 1
+                    config["pdftext_workers"] = 1
                 case "paginate_output":
-                    if v:
-                        config["paginate_output"] = True
+                    config["paginate_output"] = True
                 case "disable_image_extraction":
-                    if v:
-                        config["extract_images"] = False
+                    config["extract_images"] = False
                 case "high_quality":
-                    if v:
-                        config["high_quality"] = True
+                    config["high_quality"] = True
         return config

     def get_renderer(self):

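For orientation, here is a minimal sketch of how the refactored `generate_config_dict` behaves, assuming `ConfigParser` is constructed from a plain dict of CLI options (its constructor is not part of this diff):

```python
# Hypothetical usage; ConfigParser(cli_options) is an assumption, not shown in this diff.
from marker.config.parser import ConfigParser

parser = ConfigParser({"force_ocr": True, "languages": "en,fr", "high_quality": True, "debug": False})
config = parser.generate_config_dict()
# Falsy options are now skipped up front by `if not v: continue`, so the result is:
# {"force_ocr": True, "languages": ["en", "fr"], "high_quality": True}
print(config)
```
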
marker/converters/pdf.py CHANGED
@@ -1,5 +1,5 @@
 import os
-os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
+os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning

 import inspect
 from collections import defaultdict
@@ -17,6 +17,8 @@ from marker.processors.debug import DebugProcessor
 from marker.processors.document_toc import DocumentTOCProcessor
 from marker.processors.equation import EquationProcessor
 from marker.processors.footnote import FootnoteProcessor
+from marker.processors.llm.highqualityformprocessor import HighQualityFormProcessor
+from marker.processors.llm.highqualitytableprocessor import HighQualityTableProcessor
 from marker.processors.high_quality_text import HighQualityTextProcessor
 from marker.processors.ignoretext import IgnoreTextProcessor
 from marker.processors.line_numbers import LineNumbersProcessor
@@ -68,6 +70,8 @@ class PdfConverter(BaseConverter):
         PageHeaderProcessor,
         SectionHeaderProcessor,
         TableProcessor,
+        HighQualityTableProcessor,
+        HighQualityFormProcessor,
         TextProcessor,
         HighQualityTextProcessor,
         DebugProcessor,

marker/llm.py ADDED
@@ -0,0 +1,55 @@
+import json
+import time
+
+import PIL
+import google.generativeai as genai
+from google.ai.generativelanguage_v1beta.types import content
+from google.api_core.exceptions import ResourceExhausted
+
+
+class GoogleModel:
+    def __init__(self, api_key: str, model_name: str):
+        if api_key is None:
+            raise ValueError("Google API key is not set")
+
+        self.api_key = api_key
+        self.model_name = model_name
+        self.model = self.configure_google_model()
+
+    def configure_google_model(self):
+        genai.configure(api_key=self.api_key)
+        return genai.GenerativeModel(self.model_name)
+
+    def generate_response(
+        self,
+        prompt: str,
+        image: PIL.Image.Image,
+        response_schema: content.Schema,
+        max_retries: int = 3,
+        timeout: int = 60
+    ):
+        tries = 0
+        while tries < max_retries:
+            try:
+                responses = self.model.generate_content(
+                    [prompt, image],
+                    stream=False,
+                    generation_config={
+                        "temperature": 0,
+                        "response_schema": response_schema,
+                        "response_mime_type": "application/json",
+                    },
+                    request_options={'timeout': timeout}
+                )
+                output = responses.candidates[0].content.parts[0].text
+                return json.loads(output)
+            except ResourceExhausted as e:
+                tries += 1
+                wait_time = tries * 3
+                print(f"ResourceExhausted: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{max_retries})")
+                time.sleep(wait_time)
+            except Exception as e:
+                print(e)
+                break
+
+        return {}

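For context, a minimal usage sketch of the new `GoogleModel` wrapper, assuming a valid API key and a PIL image of a cropped block (the prompt, file name, and schema here are illustrative, modeled on the processors below):

```python
# Hypothetical usage sketch; key, prompt, and image path are placeholders.
from PIL import Image
from google.ai.generativelanguage_v1beta.types import content
from marker.llm import GoogleModel

model = GoogleModel(api_key="your-google-api-key", model_name="gemini-1.5-flash")
schema = content.Schema(
    type=content.Type.OBJECT,
    required=["corrected_markdown"],
    properties={"corrected_markdown": content.Schema(type=content.Type.STRING)},
)
result = model.generate_response("Correct this table...", Image.open("block.png"), schema)
# On success, `result` is the parsed JSON dict; after repeated ResourceExhausted
# errors or any other exception, it falls back to {}.
```
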
marker/processors/llm/__init__.py ADDED
File without changes
marker/processors/llm/highqualityformprocessor.py ADDED
@@ -0,0 +1,151 @@
+import markdown2
+
+from marker.llm import GoogleModel
+from marker.processors import BaseProcessor
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Optional
+
+from google.ai.generativelanguage_v1beta.types import content
+from tqdm import tqdm
+from tabled.formats import markdown_format
+
+from marker.schema import BlockTypes
+from marker.schema.blocks import Block
+from marker.schema.document import Document
+from marker.schema.groups.page import PageGroup
+from marker.settings import settings
+
+
+class HighQualityFormProcessor(BaseProcessor):
+    """
+    A processor for converting form blocks in a document to markdown.
+    Attributes:
+        google_api_key (str):
+            The Google API key to use for the Gemini model.
+            Default is None.
+        model_name (str):
+            The name of the Gemini model to use.
+            Default is "gemini-1.5-flash".
+        max_retries (int):
+            The maximum number of retries to use for the Gemini model.
+            Default is 3.
+        max_concurrency (int):
+            The maximum number of concurrent requests to make to the Gemini model.
+            Default is 3.
+        timeout (int):
+            The timeout for requests to the Gemini model.
+        gemini_rewriting_prompt (str):
+            The prompt to use for rewriting text.
+            Default is a string containing the Gemini rewriting prompt.
+    """
+
+    block_types = (BlockTypes.Form,)
+    google_api_key: Optional[str] = settings.GOOGLE_API_KEY
+    model_name: str = "gemini-1.5-flash"
+    high_quality: bool = False
+    max_retries: int = 3
+    max_concurrency: int = 3
+    timeout: int = 60
+
+    gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
+You will receive an image of a text block and a markdown representation of the form in the image.
+Your task is to correct any errors in the markdown representation, and format it properly.
+Values and labels should appear in markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables.
+**Instructions:**
+1. Carefully examine the provided form block image.
+2. Analyze the markdown representation of the form.
+3. If the markdown representation is largely correct, then write "No corrections needed."
+4. If the markdown representation contains errors, generate the corrected markdown representation.
+5. Output only either the corrected markdown representation or "No corrections needed."
+**Example:**
+Input:
+```markdown
+| Label 1 | Label 2 | Label 3 |
+|----------|----------|----------|
+| Value 1 | Value 2 | Value 3 |
+```
+Output:
+```markdown
+| Labels | Values |
+|--------|--------|
+| Label 1 | Value 1 |
+| Label 2 | Value 2 |
+| Label 3 | Value 3 |
+```
+**Input:**
+"""
+
+    def __init__(self, config=None):
+        super().__init__(config)
+
+        self.model = None
+        if not self.high_quality:
+            return
+
+        self.model = GoogleModel(self.google_api_key, self.model_name)
+
+    def __call__(self, document: Document):
+        if not self.high_quality or self.model is None:
+            return
+
+        self.rewrite_blocks(document)
+
+    def rewrite_blocks(self, document: Document):
+        pbar = tqdm(desc="High quality form processor")
+        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
+            for future in as_completed([
+                executor.submit(self.process_rewriting, page, block)
+                for page in document.pages
+                for block in page.contained_blocks(document, self.block_types)
+            ]):
+                future.result() # Raise exceptions if any occurred
+                pbar.update(1)
+
+        pbar.close()
+
+    def process_rewriting(self, page: PageGroup, block: Block):
+        cells = block.cells
+        if cells is None:
+            # Happens if table/form processors didn't run
+            return
+
+        prompt = self.gemini_rewriting_prompt + '```markdown\n`' + markdown_format(cells) + '`\n```\n'
+        image = self.extract_image(page, block)
+        response_schema = content.Schema(
+            type=content.Type.OBJECT,
+            enum=[],
+            required=["corrected_markdown"],
+            properties={
+                "corrected_markdown": content.Schema(
+                    type=content.Type.STRING
+                )
+            },
+        )
+
+        response = self.model.generate_response(prompt, image, response_schema)
+
+        if not response or "corrected_markdown" not in response:
+            return
+
+        corrected_markdown = response["corrected_markdown"]
+
+        # The original table is okay
+        if "no corrections" in corrected_markdown.lower():
+            return
+
+        orig_cell_text = "".join([cell.text for cell in cells])
+
+        # Potentially a partial response
+        if len(corrected_markdown) < len(orig_cell_text) * .5:
+            return
+
+        # Convert LLM markdown to html
+        block.html = markdown2.markdown(corrected_markdown)
+
+    def extract_image(self, page: PageGroup, image_block: Block, expand: float = 0.01):
+        page_img = page.lowres_image
+        image_box = image_block.polygon\
+            .rescale(page.polygon.size, page_img.size)\
+            .expand(expand, expand)
+        cropped = page_img.crop(image_box.bbox)
+        return cropped

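To make the hand-off concrete, here is a small illustrative sketch of what happens once a corrected markdown string comes back from the model (the response is hard-coded here):

```python
# Illustrative only: mirrors the tail of process_rewriting with a canned response.
import markdown2

corrected_markdown = "| Labels | Values |\n|--------|--------|\n| Name | Jane Doe |"
html = markdown2.markdown(corrected_markdown)  # convert the LLM's markdown to an HTML fragment
# The processor stores this on block.html; Form.assemble_html (changed below)
# then returns it instead of re-rendering block.cells.
print(html)
```
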
marker/processors/llm/highqualitytableprocessor.py ADDED
@@ -0,0 +1,188 @@
+from tabled.schema import SpanTableCell
+
+from marker.llm import GoogleModel
+from marker.processors import BaseProcessor
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Optional, List
+
+from google.ai.generativelanguage_v1beta.types import content
+from tqdm import tqdm
+from tabled.formats import markdown_format
+
+from marker.schema import BlockTypes
+from marker.schema.blocks import Block
+from marker.schema.document import Document
+from marker.schema.groups.page import PageGroup
+from marker.schema.polygon import PolygonBox
+from marker.settings import settings
+
+
+class HighQualityTableProcessor(BaseProcessor):
+    """
+    A processor for converting table blocks in a document to markdown.
+    Attributes:
+        google_api_key (str):
+            The Google API key to use for the Gemini model.
+            Default is None.
+        model_name (str):
+            The name of the Gemini model to use.
+            Default is "gemini-1.5-flash".
+        max_retries (int):
+            The maximum number of retries to use for the Gemini model.
+            Default is 3.
+        max_concurrency (int):
+            The maximum number of concurrent requests to make to the Gemini model.
+            Default is 3.
+        timeout (int):
+            The timeout for requests to the Gemini model.
+        gemini_rewriting_prompt (str):
+            The prompt to use for rewriting text.
+            Default is a string containing the Gemini rewriting prompt.
+    """
+
+    block_types = (BlockTypes.Table,)
+    google_api_key: Optional[str] = settings.GOOGLE_API_KEY
+    model_name: str = "gemini-1.5-flash"
+    high_quality: bool = False
+    max_retries: int = 3
+    max_concurrency: int = 3
+    timeout: int = 60
+
+    gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
+You will receive an image of a text block and a markdown representation of the table in the image.
+Your task is to correct any errors in the markdown representation. The markdown representation should be as faithful to the original table as possible.
+**Instructions:**
+1. Carefully examine the provided text block image.
+2. Analyze the markdown representation of the table.
+3. If the markdown representation is largely correct, then write "No corrections needed."
+4. If the markdown representation contains errors, generate the corrected markdown representation.
+5. Output only either the corrected markdown representation or "No corrections needed."
+**Example:**
+Input:
+```markdown
+| Column 1 | Column 2 | Column 3 |
+|----------|----------|----------|
+| Value 1 | Value 2 | Value 3 |
+```
+Output:
+```markdown
+No corrections needed.
+```
+**Input:**
+"""
+
+    def __init__(self, config=None):
+        super().__init__(config)
+
+        self.model = None
+        if not self.high_quality:
+            return
+
+        self.model = GoogleModel(self.google_api_key, self.model_name)
+
+    def __call__(self, document: Document):
+        if not self.high_quality or self.model is None:
+            return
+
+        self.rewrite_blocks(document)
+
+    def rewrite_blocks(self, document: Document):
+        pbar = tqdm(desc="High quality table processor")
+        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
+            for future in as_completed([
+                executor.submit(self.process_rewriting, page, block)
+                for page in document.pages
+                for block in page.contained_blocks(document, self.block_types)
+            ]):
+                future.result() # Raise exceptions if any occurred
+                pbar.update(1)
+
+        pbar.close()
+
+    def process_rewriting(self, page: PageGroup, block: Block):
+        cells = block.cells
+        if cells is None:
+            # Happens if table/form processors didn't run
+            return
+
+        prompt = self.gemini_rewriting_prompt + '```markdown\n`' + markdown_format(cells) + '`\n```\n'
+        image = self.extract_image(page, block)
+        response_schema = content.Schema(
+            type=content.Type.OBJECT,
+            enum=[],
+            required=["corrected_markdown"],
+            properties={
+                "corrected_markdown": content.Schema(
+                    type=content.Type.STRING
+                )
+            },
+        )
+
+        response = self.model.generate_response(prompt, image, response_schema)
+
+        if not response or "corrected_markdown" not in response:
+            return
+
+        corrected_markdown = response["corrected_markdown"]
+
+        # The original table is okay
+        if "no corrections" in corrected_markdown.lower():
+            return
+
+        parsed_cells = self.parse_markdown_table(corrected_markdown, block)
+        if len(parsed_cells) <= 1:
+            return
+
+        parsed_cell_text = "".join([cell.text for cell in parsed_cells])
+        orig_cell_text = "".join([cell.text for cell in cells])
+
+        # Potentially a partial response
+        if len(parsed_cell_text) < len(orig_cell_text) * .5:
+            return
+
+
+        block.cells = parsed_cells
+
+    def extract_image(self, page: PageGroup, image_block: Block, expand: float = 0.01):
+        page_img = page.lowres_image
+        image_box = image_block.polygon\
+            .rescale(page.polygon.size, page_img.size)\
+            .expand(expand, expand)
+        cropped = page_img.crop(image_box.bbox)
+        return cropped
+
+    def parse_markdown_table(self, markdown_text: str, block: Block) -> List[SpanTableCell]:
+        lines = [line.strip() for line in markdown_text.splitlines() if line.strip()]
+
+        # Remove separator row for headers
+        lines = [line for line in lines if not line.replace('|', ' ').replace('-', ' ').isspace()]
+
+        rows = []
+        for line in lines:
+            # Remove leading/trailing pipes and split by remaining pipes
+            cells = line.strip('|').split('|')
+            # Clean whitespace from each cell
+            cells = [cell.strip() for cell in cells]
+            rows.append(cells)
+
+        cells = []
+        for i, row in enumerate(rows):
+            for j, cell in enumerate(row):
+                cell_bbox = [
+                    block.polygon.bbox[0] + j,
+                    block.polygon.bbox[1] + i,
+                    block.polygon.bbox[0] + j + 1,
+                    block.polygon.bbox[1] + i + 1
+                ]
+                cell_polygon = PolygonBox.from_bbox(cell_bbox)
+                cells.append(
+                    SpanTableCell(
+                        text=cell,
+                        row_ids=[i],
+                        col_ids=[j],
+                        bbox=cell_polygon.bbox
+                    )
+                )


+        return cells

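A short, self-contained illustration of the row/column splitting that `parse_markdown_table` applies to a corrected table (the same string operations as above, with the placeholder cell bboxes omitted):

```python
# Illustrative only; reproduces the parsing steps on a hard-coded markdown table.
corrected_markdown = """
| Column 1 | Column 2 |
|----------|----------|
| A        | B        |
"""
lines = [line.strip() for line in corrected_markdown.splitlines() if line.strip()]
# Drop the header separator row, exactly as the processor does
lines = [line for line in lines if not line.replace('|', ' ').replace('-', ' ').isspace()]
rows = [[cell.strip() for cell in line.strip('|').split('|')] for line in lines]
print(rows)  # [['Column 1', 'Column 2'], ['A', 'B']]
```

Each parsed cell then becomes a `SpanTableCell` whose bbox is a 1x1 placeholder offset from the block polygon, so downstream formatting still has per-cell geometry to work with.
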
marker/schema/blocks/form.py CHANGED
@@ -10,6 +10,11 @@ from marker.schema.blocks import Block
 class Form(Block):
     block_type: str = BlockTypes.Form
     cells: List[SpanTableCell] | None = None
+    html: str | None = None

     def assemble_html(self, child_blocks, parent_structure=None):
+        # Some processors convert the form to html
+        if self.html is not None:
+            return self.html
+
         return str(html_format(self.cells))

marker/settings.py CHANGED
@@ -18,6 +18,9 @@ class Settings(BaseSettings):
     OUTPUT_ENCODING: str = "utf-8"
     OUTPUT_IMAGE_FORMAT: str = "JPEG"

+    # LLM
+    GOOGLE_API_KEY: Optional[str] = None
+
     # General models
     TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU
     GOOGLE_API_KEY: Optional[str] = None

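Since `Settings` extends pydantic's `BaseSettings`, the new key can presumably be supplied through the environment (assuming no env prefix or other custom settings config is in play); a minimal sketch:

```python
# Hypothetical; BaseSettings normally reads matching environment variables.
import os
os.environ["GOOGLE_API_KEY"] = "your-google-api-key"  # placeholder value

from marker.settings import settings  # imported after the env var is set
print(settings.GOOGLE_API_KEY)
```
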
pyproject.toml CHANGED
@@ -40,6 +40,7 @@ tabled-pdf = "~0.2.0"
 markdownify = "^0.13.1"
 click = "^8.1.7"
 google-generativeai = "^0.8.3"
+markdown2 = "^2.5.2"

 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"
  jupyter = "^1.0.0"