Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Oct 30, 2023

Commit

fc65ff4

1 Parent(s): a5c1c2e

Load models externally

Browse files

Files changed (7) hide show

README.md +7 -2
marker/code.py +32 -7
marker/equations.py +6 -12
marker/markdown.py +1 -1
marker/segmentation.py +28 -16
marker/settings.py +5 -0
parse.py +15 -6

README.md CHANGED Viewed

@@ -2,7 +2,7 @@
 This project converts PDF to Markdown, balancing speed with quality:
-- Equations will be detected and converted to Latex.  This is not 100% accurate.
 - All headers/footers/other artifacts will be removed.
@@ -10,4 +10,9 @@ This project converts PDF to Markdown, balancing speed with quality:
 ## Install
 - `poetry install`
-- Set `TESSDATA_PREFIX`

 This project converts PDF to Markdown, balancing speed with quality:
+- Equations will be detected and converted to LaTeX when possible.
 - All headers/footers/other artifacts will be removed.
 ## Install
 - `poetry install`
+- Set `TESSDATA_PREFIX`
+## Usage
+Can work with CPU, MPS, or GPU

marker/code.py CHANGED Viewed

@@ -4,7 +4,7 @@ from typing import List
 import fitz as pymupdf
-def is_code_linelen(lines, thresh=50):
     # Decide based on chars per newline threshold
     total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
     total_newlines = len(lines) - 1
@@ -16,7 +16,20 @@ def is_code_linelen(lines, thresh=50):
     return ratio < thresh
 def identify_code_blocks(blocks: List[Page]):
     for page in blocks:
         try:
             common_height = page.get_line_height_stats().most_common(1)[0][0]
@@ -31,19 +44,30 @@ def identify_code_blocks(blocks: List[Page]):
                 continue
             is_code = []
             for line in block.lines:
                 fonts = [span.font for span in line.spans]
-                monospace_font = any([font for font in fonts if "mono" in font.lower() or "prop" in font.lower()])
                 line_height = line.bbox[3] - line.bbox[1]
                 line_start = line.bbox[0]
-                if line_height <= common_height and line_start > common_start and monospace_font:
                     is_code.append(True)
                 else:
                     is_code.append(False)
             is_code = [
-                sum(is_code) > len(block.lines) / 1.5,
-                len(block.lines) > 4,
-                is_code_linelen(block.lines)
             ]
             if all(is_code):
@@ -54,7 +78,8 @@ def indent_blocks(blocks: List[Page]):
     span_counter = 0
     for page in blocks:
         for block in page.blocks:
-            if block.most_common_block_type() != "Code":
                 continue
             lines = []

 import fitz as pymupdf
+def is_code_linelen(lines, thresh=70):
     # Decide based on chars per newline threshold
     total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
     total_newlines = len(lines) - 1
     return ratio < thresh
+def comment_count(lines):
+    pattern = re.compile(r"^(//|#|'|--|/\*|'''|\"\"\"|--\[\[|<!--|%|%{|\(\*)")
+    return sum([1 for line in lines if pattern.match(line)])
 def identify_code_blocks(blocks: List[Page]):
+    font_info = None
+    for p in blocks:
+        stats = p.get_font_stats()
+        if font_info is None:
+            font_info = stats
+        else:
+            font_info += stats
+    most_common_font = font_info.most_common(1)[0][0]
     for page in blocks:
         try:
             common_height = page.get_line_height_stats().most_common(1)[0][0]
                 continue
             is_code = []
+            line_fonts = []
             for line in block.lines:
                 fonts = [span.font for span in line.spans]
+                line_fonts += fonts
                 line_height = line.bbox[3] - line.bbox[1]
                 line_start = line.bbox[0]
+                if line_start > common_start:
                     is_code.append(True)
                 else:
                     is_code.append(False)
+            comment_lines = comment_count([line.prelim_text for line in block.lines])
             is_code = [
+                len(block.lines) > 2,
+                sum([f != most_common_font for f in line_fonts]) > len(line_fonts) // 1.5,  # More than roughly 2/3 of the span fonts differ from the most common font, since code usually uses a different font from the main body text
+                (
+                    sum(is_code) > len(block.lines) * .2
+                    or
+                    comment_lines > len(block.lines) * .1
+                 ), # More than 20% of lines are indented, or more than 10% of lines are comments
+                (
+                    is_code_linelen(block.lines)
+                    or
+                    comment_lines > len(block.lines) * .1
+                ), # Below the is_code_linelen threshold (default 70 chars per newline), or more than 10% of lines are comments
             ]
             if all(is_code):
     span_counter = 0
     for page in blocks:
         for block in page.blocks:
+            block_types = [span.block_type for line in block.lines for span in line.spans]
+            if "Code" not in block_types:
                 continue
             lines = []

marker/equations.py CHANGED Viewed

@@ -13,7 +13,7 @@ from marker.schema import Page, Span, Line, Block, BlockType
 from nougat.utils.device import move_to_device
-def load_model():
     ckpt = get_checkpoint(None, model_tag="0.1.0-small")
     nougat_model = NougatModel.from_pretrained(ckpt)
     if settings.TORCH_DEVICE != "cpu":
@@ -23,12 +23,6 @@ def load_model():
     return nougat_model
-nougat_model = load_model()
-MODEL_MAX = nougat_model.config.max_length
-NOUGAT_HALLUCINATION_WORDS = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote", "\par\par\par", "## Chapter", "Fig."]
 def contains_equation(text):
     # Define a regular expression pattern to look for operators and symbols commonly found in equations
     pattern = re.compile(r'[=\^\√∑∏∫∂∆π≈≠≤≥∞∩∪∈∉∀∃∅∇λμσαβγδεζηθφχψω]')
@@ -66,18 +60,18 @@ def mask_bbox(png_image, bbox, selected_bboxes):
     return result
-def get_nougat_text(page, old_text, bbox, selected_bboxes, save_id, max_length=MODEL_MAX):
     pix = page.get_pixmap(dpi=settings.DPI, clip=bbox)
     png = pix.pil_tobytes(format="PNG")
     png_image = Image.open(io.BytesIO(png))
     png_image = mask_bbox(png_image, bbox, selected_bboxes)
-    nougat_model.config.max_length = min(max_length, MODEL_MAX)
     output = nougat_model.inference(image=png_image)
     return output["predictions"][0]
-def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]]):
     span_id = 0
     new_blocks = []
     for pnum, page in enumerate(blocks):
@@ -126,10 +120,10 @@ def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]
                 # This prevents hallucinations from running on for a long time
                 max_tokens = len(block_text) + 50
                 max_char_length = 2 * len(block_text) + 100
-                nougat_text = get_nougat_text(doc[pnum], block_text, bbox, selected_bboxes, f"{pnum}_{i}", max_length=max_tokens)
                 conditions = [
                     len(nougat_text) > 0,
-                    not any([word in nougat_text for word in NOUGAT_HALLUCINATION_WORDS]),
                     len(nougat_text) < max_char_length, # Reduce hallucinations
                     len(nougat_text) >= len(block_text) * .8
                 ]

 from nougat.utils.device import move_to_device
+def load_nougat_model():
     ckpt = get_checkpoint(None, model_tag="0.1.0-small")
     nougat_model = NougatModel.from_pretrained(ckpt)
     if settings.TORCH_DEVICE != "cpu":
     return nougat_model
 def contains_equation(text):
     # Define a regular expression pattern to look for operators and symbols commonly found in equations
     pattern = re.compile(r'[=\^\√∑∏∫∂∆π≈≠≤≥∞∩∪∈∉∀∃∅∇λμσαβγδεζηθφχψω]')
     return result
+def get_nougat_text(page, bbox, selected_bboxes, nougat_model, max_length=settings.NOUGAT_MODEL_MAX):
     pix = page.get_pixmap(dpi=settings.DPI, clip=bbox)
     png = pix.pil_tobytes(format="PNG")
     png_image = Image.open(io.BytesIO(png))
     png_image = mask_bbox(png_image, bbox, selected_bboxes)
+    nougat_model.config.max_length = min(max_length, settings.NOUGAT_MODEL_MAX)
     output = nougat_model.inference(image=png_image)
     return output["predictions"][0]
+def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]], nougat_model):
     span_id = 0
     new_blocks = []
     for pnum, page in enumerate(blocks):
                 # This prevents hallucinations from running on for a long time
                 max_tokens = len(block_text) + 50
                 max_char_length = 2 * len(block_text) + 100
+                nougat_text = get_nougat_text(doc[pnum], bbox, selected_bboxes, nougat_model, max_length=max_tokens)
                 conditions = [
                     len(nougat_text) > 0,
+                    not any([word in nougat_text for word in settings.NOUGAT_HALLUCINATION_WORDS]),
                     len(nougat_text) < max_char_length, # Reduce hallucinations
                     len(nougat_text) >= len(block_text) * .8
                 ]

marker/markdown.py CHANGED Viewed

@@ -69,7 +69,7 @@ def block_surround(text, block_type):
         case "List-item":
             pass
         case "Code":
-            text = "```\n" + text + "\n```\n"
         case _:
             pass
     return text

         case "List-item":
             pass
         case "Code":
+            text = "\n" + text + "\n"
         case _:
             pass
     return text

marker/segmentation.py CHANGED Viewed

@@ -16,11 +16,8 @@ processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", app
 CHUNK_KEYS = ["input_ids", "attention_mask", "bbox", "offset_mapping"]
 NO_CHUNK_KEYS = ["pixel_values"]
-MODEL_MAX_LEN = 512
-CHUNK_OVERLAP = 128
-def load_model():
     model = LayoutLMv3ForTokenClassification.from_pretrained("Kwan0/layoutlmv3-base-finetune-DocLayNet-100k").to(settings.TORCH_DEVICE)
     model.config.id2label = {
         0: "Caption",
@@ -40,19 +37,16 @@ def load_model():
     return model
-layoutlm_model = load_model()
-def detect_all_block_types(doc, blocks: List[Page]):
     block_types = []
     for pnum, page in enumerate(doc):
         page_blocks = blocks[pnum]
-        predictions = detect_page_block_types(page, page_blocks)
         block_types.append(predictions)
     return block_types
-def detect_page_block_types(page, page_blocks: Page):
     page_box = page.bound()
     pwidth = page_box[2] - page_box[0]
     pheight = page_box[3] - page_box[1]
@@ -66,7 +60,7 @@ def detect_page_block_types(page, page_blocks: Page):
     boxes = [s.bbox for s in lines]
     text = [s.prelim_text for s in lines]
-    predictions = make_predictions(rgb_image, text, boxes, pwidth, pheight)
     return predictions
@@ -85,10 +79,10 @@ def get_provisional_boxes(pred, box, is_subword, start_idx=0):
     return prov_predictions, prov_boxes
-def make_predictions(rgb_image, text, boxes, pwidth, pheight) -> List[BlockType]:
     # Normalize boxes for model (scale to 1000x1000)
     boxes = [normalize_box(box, pwidth, pheight) for box in boxes]
-    encoding = processor(rgb_image, text=text, boxes=boxes, return_offsets_mapping=True, return_tensors="pt", truncation=True, stride=CHUNK_OVERLAP, padding="max_length", max_length=MODEL_MAX_LEN, return_overflowing_tokens=True)
     offset_mapping = encoding.pop('offset_mapping')
     overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
@@ -108,7 +102,7 @@ def make_predictions(rgb_image, text, boxes, pwidth, pheight) -> List[BlockType]
     predictions = logits.argmax(-1).squeeze().tolist()
     token_boxes = encoding.bbox.squeeze().tolist()
-    if len(token_boxes) == MODEL_MAX_LEN:
         predictions = [predictions]
         token_boxes = [token_boxes]
@@ -118,7 +112,7 @@ def make_predictions(rgb_image, text, boxes, pwidth, pheight) -> List[BlockType]
         is_subword = np.array(mapped.squeeze().tolist())[:, 0] != 0
         overlap_adjust = 0
         if i > 0:
-            overlap_adjust = 1 + CHUNK_OVERLAP - sum(is_subword[:1 + CHUNK_OVERLAP])
         prov_predictions, prov_boxes = get_provisional_boxes(pred, box, is_subword, overlap_adjust)
@@ -135,5 +129,23 @@ def make_predictions(rgb_image, text, boxes, pwidth, pheight) -> List[BlockType]
             if len(predicted_block_types) == 0 or unnorm_box != predicted_block_types[-1].bbox:
                 predicted_block_types.append(block_type)
-    return predicted_block_types

 CHUNK_KEYS = ["input_ids", "attention_mask", "bbox", "offset_mapping"]
 NO_CHUNK_KEYS = ["pixel_values"]
+def load_layout_model():
     model = LayoutLMv3ForTokenClassification.from_pretrained("Kwan0/layoutlmv3-base-finetune-DocLayNet-100k").to(settings.TORCH_DEVICE)
     model.config.id2label = {
         0: "Caption",
     return model
+def detect_all_block_types(doc, blocks: List[Page], layoutlm_model):
     block_types = []
     for pnum, page in enumerate(doc):
         page_blocks = blocks[pnum]
+        predictions = detect_page_block_types(page, page_blocks, layoutlm_model)
         block_types.append(predictions)
     return block_types
+def detect_page_block_types(page, page_blocks: Page, layoutlm_model):
     page_box = page.bound()
     pwidth = page_box[2] - page_box[0]
     pheight = page_box[3] - page_box[1]
     boxes = [s.bbox for s in lines]
     text = [s.prelim_text for s in lines]
+    predictions = make_predictions(rgb_image, text, boxes, pwidth, pheight, layoutlm_model)
     return predictions
     return prov_predictions, prov_boxes
+def make_predictions(rgb_image, text, boxes, pwidth, pheight, layoutlm_model) -> List[BlockType]:
     # Normalize boxes for model (scale to 1000x1000)
     boxes = [normalize_box(box, pwidth, pheight) for box in boxes]
+    encoding = processor(rgb_image, text=text, boxes=boxes, return_offsets_mapping=True, return_tensors="pt", truncation=True, stride=settings.LAYOUT_CHUNK_OVERLAP, padding="max_length", max_length=settings.LAYOUT_MODEL_MAX, return_overflowing_tokens=True)
     offset_mapping = encoding.pop('offset_mapping')
     overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
     predictions = logits.argmax(-1).squeeze().tolist()
     token_boxes = encoding.bbox.squeeze().tolist()
+    if len(token_boxes) == settings.LAYOUT_MODEL_MAX:
         predictions = [predictions]
         token_boxes = [token_boxes]
         is_subword = np.array(mapped.squeeze().tolist())[:, 0] != 0
         overlap_adjust = 0
         if i > 0:
+            overlap_adjust = 1 + settings.LAYOUT_CHUNK_OVERLAP - sum(is_subword[:1 + settings.LAYOUT_CHUNK_OVERLAP])
         prov_predictions, prov_boxes = get_provisional_boxes(pred, box, is_subword, overlap_adjust)
             if len(predicted_block_types) == 0 or unnorm_box != predicted_block_types[-1].bbox:
                 predicted_block_types.append(block_type)
+    # Align bboxes
+    # This will search both lists to find matching bboxes
+    # This will align both sets of bboxes by index
+    # If there are duplicate bboxes, it may result in issues
+    aligned_blocks = []
+    for i in range(len(boxes)):
+        unnorm_box = unnormalize_box(boxes[i], pwidth, pheight)
+        appended = False
+        for j in range(len(predicted_block_types)):
+            if unnorm_box == predicted_block_types[j].bbox:
+                aligned_blocks.append(predicted_block_types[j])
+                appended = True
+                break
+        if not appended:
+            aligned_blocks.append(BlockType(
+                block_type="Text",
+                bbox=unnorm_box
+            ))
+    return aligned_blocks

marker/settings.py CHANGED Viewed

@@ -12,6 +12,11 @@ class Settings(BaseSettings):
     TORCH_DEVICE: str = "cpu"
     TESSDATA_PREFIX: str = ""
     BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
     class Config:
         env_file = find_dotenv("local.env")

     TORCH_DEVICE: str = "cpu"
     TESSDATA_PREFIX: str = ""
     BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
+    NOUGAT_MODEL_MAX: int = 1024 # Max inference length for nougat
+    NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
+                                  "\par\par\par", "## Chapter", "Fig."]
+    LAYOUT_MODEL_MAX: int = 512
+    LAYOUT_CHUNK_OVERLAP: int = 128
     class Config:
         env_file = find_dotenv("local.env")

parse.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import fitz as pymupdf
 from marker.extract_text import get_text_blocks
 from marker.headers import categorize_blocks, filter_header_footer
-from marker.equations import replace_equations
-from marker.segmentation import detect_all_block_types
 from marker.code import identify_code_blocks, indent_blocks
 from marker.markdown import merge_spans, merge_lines, get_full_text
 from marker.schema import Page, BlockType
@@ -17,11 +19,17 @@ def annotate_spans(blocks: List[Page], block_types: List[BlockType]):
 if __name__ == "__main__":
-    fname = "test_data/thinkpython.pdf"
     doc = pymupdf.open(fname)
     blocks, toc = get_text_blocks(doc)
-    block_types = detect_all_block_types(doc, blocks)
     filtered = deepcopy(blocks)
     annotate_spans(filtered, block_types)
@@ -38,12 +46,13 @@ if __name__ == "__main__":
             block.filter_spans(bad_span_ids)
             block.filter_bad_span_types(block_types[page.pnum])
-    filtered = replace_equations(doc, filtered, block_types)
     # Copy to avoid changing original data
     merged_lines = merge_spans(filtered)
     text_blocks = merge_lines(merged_lines, filtered)
     full_text = get_full_text(text_blocks)
-    with open("test_data/thinkpython.md", "w+") as f:
         f.write(full_text)

+import argparse
 import fitz as pymupdf
 from marker.extract_text import get_text_blocks
 from marker.headers import categorize_blocks, filter_header_footer
+from marker.equations import replace_equations, load_nougat_model
+from marker.segmentation import detect_all_block_types, load_layout_model
 from marker.code import identify_code_blocks, indent_blocks
 from marker.markdown import merge_spans, merge_lines, get_full_text
 from marker.schema import Page, BlockType
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("filename", help="PDF file to parse")
+    parser.add_argument("output", help="Output file name")
+    args = parser.parse_args()
+    fname = args.filename
     doc = pymupdf.open(fname)
     blocks, toc = get_text_blocks(doc)
+    layoutlm_model = load_layout_model()
+    block_types = detect_all_block_types(doc, blocks, layoutlm_model)
     filtered = deepcopy(blocks)
     annotate_spans(filtered, block_types)
             block.filter_spans(bad_span_ids)
             block.filter_bad_span_types(block_types[page.pnum])
+    nougat_model = load_nougat_model()
+    filtered = replace_equations(doc, filtered, block_types, nougat_model)
     # Copy to avoid changing original data
     merged_lines = merge_spans(filtered)
     text_blocks = merge_lines(merged_lines, filtered)
     full_text = get_full_text(text_blocks)
+    with open(args.output, "w+") as f:
         f.write(full_text)