zphilip committed on
Commit
9d1fa0d
1 Parent(s): 876fac2

adding part 1

Files changed (42)
  1. .gitattributes +2 -0
  2. app.py +297 -0
  3. nougat/__init__.py +15 -0
  4. nougat/__pycache__/__init__.cpython-310.pyc +0 -0
  5. nougat/__pycache__/_version.cpython-310.pyc +0 -0
  6. nougat/__pycache__/model.cpython-310.pyc +0 -0
  7. nougat/__pycache__/postprocessing.cpython-310.pyc +0 -0
  8. nougat/__pycache__/transforms.cpython-310.pyc +0 -0
  9. nougat/_version.py +8 -0
  10. nougat/dataset/__init__.py +0 -0
  11. nougat/dataset/__pycache__/__init__.cpython-310.pyc +0 -0
  12. nougat/dataset/__pycache__/rasterize.cpython-310.pyc +0 -0
  13. nougat/dataset/create_index.py +173 -0
  14. nougat/dataset/gen_seek.py +36 -0
  15. nougat/dataset/parser/__init__.py +0 -0
  16. nougat/dataset/parser/document.py +703 -0
  17. nougat/dataset/parser/html2md.py +67 -0
  18. nougat/dataset/parser/latexml_parser.py +441 -0
  19. nougat/dataset/parser/markdown.py +396 -0
  20. nougat/dataset/pdffigures.py +71 -0
  21. nougat/dataset/rasterize.py +81 -0
  22. nougat/dataset/split_htmls_to_pages.py +219 -0
  23. nougat/dataset/split_md_to_pages.py +477 -0
  24. nougat/dataset/splitter.py +393 -0
  25. nougat/dataset/staircase.py +314 -0
  26. nougat/dataset/tokenizer.json +0 -0
  27. nougat/dataset/utils/__init__.py +8 -0
  28. nougat/dataset/utils/latex_conversion.py +146 -0
  29. nougat/dataset/utils/pdf_text_extract.py +86 -0
  30. nougat/dataset/utils/utils.py +20 -0
  31. nougat/metrics.py +117 -0
  32. nougat/model.py +702 -0
  33. nougat/postprocessing.py +508 -0
  34. nougat/transforms.py +173 -0
  35. nougat/utils/__init__.py +0 -0
  36. nougat/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  37. nougat/utils/__pycache__/checkpoint.cpython-310.pyc +0 -0
  38. nougat/utils/__pycache__/dataset.cpython-310.pyc +0 -0
  39. nougat/utils/checkpoint.py +119 -0
  40. nougat/utils/dataset.py +280 -0
  41. nougat/utils/device.py +38 -0
  42. predict.py +172 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.bin filter=lfs diff=lfs merge=lfs -text
37
+ *.pdf filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,297 @@
1
+ import gradio as gr
2
+ import subprocess
3
+ import uuid
4
+ import os
5
+ import requests
6
+ import re
7
+
8
+ os.environ['http_proxy'] = ""
9
+ os.environ['https_proxy'] = ""
10
+
11
+ """
12
+ Copyright (c) Meta Platforms, Inc. and affiliates.
13
+
14
+ This source code is licensed under the MIT license found in the
15
+ LICENSE file in the root directory of this source tree.
16
+ """
17
+ import sys
18
+ from pathlib import Path
19
+ import logging
20
+ import re
21
+ import argparse
22
+ import re
23
+ from functools import partial
24
+ import torch
25
+ from torch.utils.data import ConcatDataset
26
+ from tqdm import tqdm
27
+ from nougat import NougatModel
28
+ from nougat.utils.dataset import LazyDataset
29
+ from nougat.utils.checkpoint import get_checkpoint
30
+ from nougat.postprocessing import markdown_compatible
31
+ import fitz
32
+
33
+ logging.basicConfig(level=logging.INFO)
34
+ if torch.cuda.is_available():
35
+ BATCH_SIZE = int(
36
+ torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1000 * 0.3
37
+ )
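+ # BATCH_SIZE works out to roughly 0.3 x the total GPU memory in GB (total_memory is reported in bytes)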
38
+ if BATCH_SIZE == 0:
39
+ logging.warning("GPU VRAM is too small. Computing on CPU.")
40
+ else:
41
+ # don't know what a good value is here. Would not recommend to run on CPU
42
+ BATCH_SIZE = 1
43
+ logging.warning("No GPU found. Conversion on CPU is very slow.")
44
+
45
+ def nougat_predict(input_files, output_path, checkpoint, batchsize, markdown,recompute):
46
+ print(f'*** nougat predict with input :{input_files} ***')
47
+ model = NougatModel.from_pretrained(checkpoint).to(torch.bfloat16)
48
+ if batchsize > 0:
49
+ if torch.cuda.is_available():
50
+ model.to("cuda")
51
+ else:
52
+ # set batch size to 1. Need to check if there are benefits for CPU conversion for >1
53
+ batchsize = 1
54
+ model.eval()
55
+ datasets = []
56
+ for pdf in input_files:
57
+ #if not pdf.exists():
58
+ if not os.path.exists(pdf):
59
+ continue
60
+ if output_path:
61
+ out_path = output_path / pdf.with_suffix(".mmd").name
62
+ print(out_path)
63
+ if out_path.exists() and not recompute:
64
+ logging.info(
65
+ f"Skipping {pdf.name}, already computed. Run with --recompute to convert again."
66
+ )
67
+ continue
68
+ try:
69
+ dataset = LazyDataset(
70
+ pdf, partial(model.encoder.prepare_input, random_padding=False)
71
+ )
72
+ except fitz.fitz.FileDataError:
73
+ logging.info(f"Could not load file {str(pdf)}.")
74
+ continue
75
+ datasets.append(dataset)
76
+ if len(datasets) == 0:
77
+ print(f'*** nougat out files :{out_path} ***')
78
+ return out_path
79
+ dataloader = torch.utils.data.DataLoader(
80
+ ConcatDataset(datasets),
81
+ batch_size=batchsize,
82
+ shuffle=False,
83
+ collate_fn=LazyDataset.ignore_none_collate,
84
+ )
85
+
86
+ predictions = []
87
+ file_index = 0
88
+ page_num = 0
89
+ for i, (sample, is_last_page) in enumerate(tqdm(dataloader)):
90
+ model_output = model.inference(image_tensors=sample)
91
+ # check if model output is faulty
92
+ for j, output in enumerate(model_output["predictions"]):
93
+ if page_num == 0:
94
+ logging.info(
95
+ "Processing file %s with %i pages"
96
+ % (datasets[file_index].name, datasets[file_index].size)
97
+ )
98
+ page_num += 1
99
+ if output.strip() == "[MISSING_PAGE_POST]":
100
+ # uncaught repetitions -- most likely empty page
101
+ predictions.append(f"\n\n[MISSING_PAGE_EMPTY:{page_num}]\n\n")
102
+ elif model_output["repeats"][j] is not None:
103
+ if model_output["repeats"][j] > 0:
104
+ # If we end up here, it means the output is most likely not complete and was truncated.
105
+ logging.warning(f"Skipping page {page_num} due to repetitions.")
106
+ predictions.append(f"\n\n[MISSING_PAGE_FAIL:{page_num}]\n\n")
107
+ else:
108
+ # If we end up here, it means the document page is too different from the training domain.
109
+ # This can happen e.g. for cover pages.
110
+ predictions.append(
111
+ f"\n\n[MISSING_PAGE_EMPTY:{i*args.batchsize+j+1}]\n\n"
112
+ )
113
+ else:
114
+ if markdown:
115
+ output = markdown_compatible(output)
116
+ predictions.append(output)
117
+ if is_last_page[j]:
118
+ out = "".join(predictions).strip()
119
+ out = re.sub(r"\n{3,}", "\n\n", out).strip()
120
+ if output_path:
121
+ out_path = output_path / Path(is_last_page[j]).with_suffix(".mmd").name
122
+ out_path.parent.mkdir(parents=True, exist_ok=True)
123
+ out_path.write_text(out, encoding="utf-8")
124
+ else:
125
+ print(out, "\n\n")
126
+ predictions = []
127
+ page_num = 0
128
+ file_index += 1
129
+ print(f'the generated markdown file is : {out_path}')
130
+ return out_path
131
+
132
+ def get_pdf(pdf_link):
133
+ # Generate a unique filename
134
+ unique_filename = f"input/downloaded_paper_{uuid.uuid4().hex}.pdf"
135
+
136
+ # Send a GET request to the PDF link
137
+ response = requests.get(pdf_link)
138
+
139
+ if response.status_code == 200:
140
+ # Save the PDF content to a local file
141
+ with open(unique_filename, 'wb') as pdf_file:
142
+ pdf_file.write(response.content)
143
+ print("PDF downloaded successfully.")
144
+ else:
145
+ print("Failed to download the PDF.")
146
+ return unique_filename #.split('/')[-1][:-4]
147
+
148
+
149
+ def nougat_ocr(file_name):
150
+
151
+ #unique_filename = f"/content/output/downloaded_paper_{uuid.uuid4().hex}.pdf"
152
+ # Command to run
153
+ cli_command = [
154
+ 'nougat',
155
+ #'--out', unique_filename,
156
+ '--out', 'output',
157
+ 'pdf', f'{file_name}',
158
+ '--checkpoint', 'nougat',
159
+ '--markdown'
160
+ ]
161
+
162
+ # Run the command and capture its output
163
+ #completed_process =
164
+ subprocess.run(cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
165
+
166
+ return #unique_filename
167
+
168
+ import pathlib
169
+ def predict(pdf_file, pdf_link):
170
+ print("*************** inference ******************")
171
+ if pdf_file is None:
172
+ if pdf_link == '':
173
+ print("No file is uploaded and No link is provided")
174
+ return "No data provided. Upload a pdf file or provide a pdf link and try again!"
175
+ else:
176
+ print(f'pdf_link is - {pdf_link}')
177
+ file_name = get_pdf(pdf_link)
178
+ print(f'file_name is - {file_name}')
179
+ else:
180
+ print(pdf_file)
181
+ file_name = pdf_file.name
182
+ print(file_name)
183
+ pdf_name = pdf_file.name.split('/')[-1].split('.')[0]
184
+ print(pdf_name)
185
+
186
+ # Call nougat
187
+ #nougat_ocr(file_name)
188
+ #nougat_predict(file_name)
189
+ input_files = [pathlib.Path(file_name)]  # wrap in a list so nougat_predict can iterate; Path() accepts both str and Path
190
+ #input_files = pathlib.Path(file_name),
191
+ output_path = pathlib.Path("./output")
192
+ checkpoint = pathlib.Path("./config1/")
193
+ config = pathlib.Path("./config1/config.json")
194
+ markdown = True
195
+ batchsize = BATCH_SIZE
196
+ output_files = nougat_predict(input_files=input_files, output_path=output_path, checkpoint = checkpoint, batchsize = batchsize, markdown = markdown, recompute=False)
197
+ print(f'the generated markdown file is : {output_files}')
198
+ #print("BACKKKK")
199
+
200
+ # Open the file for reading
201
+ file_name = file_name.split('/')[-1][:-4]
202
+ #with open(f'output/{file_name}.mmd', 'r') as file:
203
+ with open(output_files, 'r+') as file:
204
+ content = file.read()
205
+ # switch math delimiters
206
+ content = content.replace(r"\(", "\$").replace(r'\)', '\$').replace(r'\[', '\$\$').replace(r'\]', '\$\$')
207
+ print("***********************************")
208
+ print("convert successfully")
209
+ print("***********************************")
210
+
211
+ return content
212
+
213
+
214
+ def nougat_ocr1(file_name):
215
+ print('******* inside nougat_ocr *******')
216
+ # CLI Command to run
217
+ cli_command = [
218
+ 'python', 'predict.py',
219
+ '--out', 'output',
220
+ 'pdf', f'{file_name}',
221
+ '--checkpoint', '../config1/',
222
+ '--markdown'
223
+ ]
224
+
225
+ # Run the command and get .mmd file in an output folder
226
+ subprocess.run(cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
227
+ return
228
+
229
+
230
+ def predict1(pdf_file):
231
+ print('******* inside predict *******')
232
+ print(f"temporary file - {pdf_file.name}")
233
+ pdf_name = pdf_file.name.split('/')[-1].split('.')[0]
234
+ print(f"pdf file name - {pdf_name}")
235
+
236
+ #! Get prediction for a PDF using nougat
237
+ nougat_ocr(pdf_file.name)
238
+ print("BAACCKKK")
239
+
240
+ # Open the multimarkdown (.mmd) file for reading
241
+ with open(f'output/{pdf_name}.mmd', 'r') as file:
242
+ content = file.read()
243
+
244
+ return content
245
+
246
+ def process_example(pdf_file,pdf_link):
247
+ ocr_content = predict(pdf_file,pdf_link)
248
+ return gr.update(value=ocr_content)
249
+
250
+ css = """
251
+ #mkd {
252
+ height: 500px;
253
+ overflow: auto;
254
+ border: 1px solid #ccc;
255
+ }
256
+ """
257
+
258
+ with gr.Blocks(css=css) as demo:
259
+ gr.HTML("<h1><center>Nougat: Neural Optical Understanding for Academic Documents<center><h1>")
260
+ gr.HTML("<h3><center>Lukas Blecher et al. <a href='https://arxiv.org/pdf/2308.13418.pdf' target='_blank'>Paper</a>, <a href='https://facebookresearch.github.io/nougat/'>Project</a><center></h3>")
261
+
262
+ with gr.Row():
263
+ mkd = gr.Markdown('<h4><center>Upload a PDF</center></h4>',scale=1)
264
+ mkd = gr.Markdown('<h4><center><i>OR</i></center></h4>',scale=1)
265
+ mkd = gr.Markdown('<h4><center>Provide a PDF link</center></h4>',scale=1)
266
+
267
+ with gr.Row(equal_height=True):
268
+ pdf_file = gr.File(label='PDF📃', file_count='single', scale=1)
269
+ pdf_link = gr.Textbox(placeholder='Enter an Arxiv link here', label='PDF link🔗🌐', scale=1)
270
+
271
+ with gr.Row():
272
+ btn = gr.Button('Run NOUGAT🍫')
273
+ clr = gr.Button('Clear🚿')
274
+
275
+ output_headline = gr.Markdown("<h3>PDF converted to markup language through Nougat-OCR👇:</h3>")
276
+ parsed_output = gr.Markdown(elem_id='mkd', value='📃🔤OCR Output')
277
+
278
+ btn.click(predict, [pdf_file, pdf_link], parsed_output )
279
+ print('******* 1 *******')
280
+ clr.click(lambda : (gr.update(value=None),
281
+ gr.update(value=None),
282
+ gr.update(value=None)),
283
+ [],
284
+ [pdf_file, pdf_link, parsed_output]
285
+ )
286
+
287
+ gr.Examples(
288
+ [["./input/test.pdf", ""], [None, "https://arxiv.org/pdf/2308.08316.pdf"]],
289
+ inputs = [pdf_file, pdf_link],
290
+ outputs = parsed_output,
291
+ fn=process_example,
292
+ cache_examples=True,
293
+ label='Click on any Examples below to get Nougat OCR results quickly:'
294
+ )
295
+
296
+ demo.queue()
297
+ demo.launch(debug=True,share=True, server_name="0.0.0.0")
nougat/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ Copyright (c) Meta Platforms, Inc. and affiliates.
6
+ """
7
+ from .model import NougatConfig, NougatModel
8
+ from .utils.dataset import NougatDataset
9
+ from ._version import __version__
10
+
11
+ __all__ = [
12
+ "NougatConfig",
13
+ "NougatModel",
14
+ "NougatDataset",
15
+ ]
nougat/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (464 Bytes).
 
nougat/__pycache__/_version.cpython-310.pyc ADDED
Binary file (355 Bytes).
 
nougat/__pycache__/model.cpython-310.pyc ADDED
Binary file (19.9 kB).
 
nougat/__pycache__/postprocessing.cpython-310.pyc ADDED
Binary file (12.6 kB).
 
nougat/__pycache__/transforms.cpython-310.pyc ADDED
Binary file (5.66 kB).
 
nougat/_version.py ADDED
@@ -0,0 +1,8 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ __version__ = "0.1.17"
nougat/dataset/__init__.py ADDED
File without changes
nougat/dataset/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (151 Bytes).
 
nougat/dataset/__pycache__/rasterize.cpython-310.pyc ADDED
Binary file (2.82 kB).
 
nougat/dataset/create_index.py ADDED
@@ -0,0 +1,173 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ """
8
+ This script creates an index of all available pages and parses the meta data for all pages into a separate file.
9
+ Optionally TesseractOCR is called for each image.
10
+ """
11
+ import argparse
12
+ import json
13
+ from typing import Dict, List
14
+ import numpy as np
15
+ from pathlib import Path
16
+ import multiprocessing
17
+ from pebble import ProcessPool
18
+ from PIL import Image
19
+ import pytesseract
20
+ import re
21
+ import logging
22
+ from tqdm import tqdm
23
+
24
+
25
+ logging.basicConfig()
26
+ logger = logging.getLogger()
27
+ logger.setLevel(logging.INFO)
28
+
29
+
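+ # pdffigures2 boundaries are given in PDF points (1/72 inch); convert_pt2px maps them to pixels at the target dpi (dpi / 72 scaling)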
30
+ def convert_pt2px(pt, dpi=96):
31
+ if isinstance(pt, list):
32
+ return [round(dpi / 72 * p) for p in pt]
33
+ elif isinstance(pt, dict):
34
+ for k in pt:
35
+ pt[k] = round(dpi / 72 * pt[k])
36
+ return pt
37
+
38
+
39
+ def read_metadata(data: Dict) -> List[List[Dict]]:
40
+ N = data["num_pages"]
41
+ out = [[] for _ in range(N)]
42
+ # pdffigures2 meta data
43
+ if "pdffigures" in data and data["pdffigures"]:
44
+ for item in data["pdffigures"]:
45
+ p = item.pop("page", None)
46
+ if p is None or p >= N:
47
+ continue
48
+ item["source"] = "fig"
49
+ if "regionBoundary" in item:
50
+ item["regionBoundary"] = convert_pt2px(item["regionBoundary"])
51
+ if "captionBoundary" in item:
52
+ item["captionBoundary"] = convert_pt2px(item["captionBoundary"])
53
+ out[p].append(item)
54
+
55
+ return out
56
+
57
+
58
+ def index_paper(directory: Path, args: argparse.Namespace):
59
+ """
60
+ Pack all image-text pairs into a single h5 file and save it at `args.out`
61
+ """
62
+ paper = directory.name
63
+ markdowns = directory.glob("*.mmd")
64
+ meta_file = directory / "meta.json"
65
+ data_samples = []
66
+ if not meta_file.exists():
67
+ return
68
+ # load meta info
69
+ try:
70
+ meta = read_metadata(json.load(meta_file.open("r", encoding="utf-8")))
71
+ except json.JSONDecodeError:
72
+ return
73
+
74
+ for md_path in markdowns:
75
+ image = md_path.parent / (md_path.stem + ".png")
76
+ i = int(image.stem) - 1
77
+ if not image.exists():
78
+ continue
79
+ if i >= len(meta):
80
+ continue
81
+ data_sample = {}
82
+ ocr_path = image.parent / (image.stem + "_OCR.txt")
83
+ if args.tesseract and not ocr_path.exists():
84
+ try:
85
+ pil = Image.open(image)
86
+ ocr = pytesseract.image_to_string(pil, lang="eng", timeout=2)
87
+ ocr = re.sub(r"\n+\s+?([^\s])", r"\n\n\1", ocr).strip()
88
+ with ocr_path.open("w", encoding="utf-8") as f_ocr:
89
+ f_ocr.write(ocr)
90
+ except RuntimeError:
91
+ logger.info("Page %s of paper %s timed out", image.stem, paper)
92
+ pass
93
+ if ocr_path.exists():
94
+ data_sample["ocr"] = str(ocr_path.relative_to(args.root))
95
+ data_sample["image"] = str(image.relative_to(args.root))
96
+ data_sample["markdown"] = md_path.read_text(encoding="utf8").strip()
97
+ data_sample["meta"] = meta[i]
98
+ data_samples.append(data_sample)
99
+ return data_samples
100
+
101
+
102
+ def create_index(args):
103
+ if not args.dir.exists() or not args.dir.is_dir():
104
+ logger.error("%s does not exist or is no dir.", args.dir)
105
+ return
106
+ papers = []
107
+ depth = 0
108
+ p = args.dir
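+ # walk down the first entry at each level until a file is found, to determine how deep the paper directories are nested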
109
+ while True:
110
+ p = next(p.iterdir())
111
+ if p.is_file():
112
+ break
113
+ else:
114
+ depth += 1
115
+ papers = args.dir.glob("*/" * depth)
116
+ index = []
117
+ with ProcessPool(max_workers=args.workers) as pool:
118
+ tasks = {}
119
+ for j, paper in enumerate(papers):
120
+ fname = paper.name
121
+ tasks[fname] = pool.schedule(
122
+ index_paper,
123
+ args=[paper, args],
124
+ timeout=args.timeout,
125
+ )
126
+
127
+ for fname in tqdm(tasks):
128
+ try:
129
+ res = tasks[fname].result()
130
+ if res is None:
131
+ logger.info("%s is faulty", fname)
132
+ continue
133
+ index.append(res)
134
+ except TimeoutError:
135
+ logger.info("%s timed out", fname)
136
+
137
+ with args.out.open("w", encoding="utf-8") as f:
138
+ for item in index:
139
+ for page in item:
140
+ if len(page) == 0:
141
+ continue
142
+ f.write(json.dumps(page) + "\n")
143
+
144
+
145
+ if __name__ == "__main__":
146
+ parser = argparse.ArgumentParser()
147
+ parser.add_argument("--out", type=Path, required=True, help="Index file")
148
+ parser.add_argument(
149
+ "--dir", type=Path, required=True, help="Parent directory for input dirs"
150
+ )
151
+ parser.add_argument("--root", type=Path, default=None)
152
+ parser.add_argument(
153
+ "--tesseract",
154
+ action="store_true",
155
+ help="Tesseract OCR prediction for each page",
156
+ )
157
+ parser.add_argument(
158
+ "--workers",
159
+ type=int,
160
+ default=multiprocessing.cpu_count(),
161
+ help="How many processes to use",
162
+ )
163
+ parser.add_argument(
164
+ "--dpi", type=int, default=96, help="DPI the images were saved with"
165
+ )
166
+ parser.add_argument("--timeout", type=int, default=240, help="Max time per paper")
167
+ args = parser.parse_args()
168
+ if args.root is None:
169
+ args.root = args.dir
170
+ else:
171
+ # check if dir is subdir of root
172
+ args.dir.relative_to(args.root)
173
+ create_index(args)
nougat/dataset/gen_seek.py ADDED
@@ -0,0 +1,36 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ from tqdm import tqdm
8
+ import json
9
+ from pathlib import Path
10
+ import argparse
11
+
12
+
13
+ def get_args():
14
+ parser = argparse.ArgumentParser()
15
+ parser.add_argument("src_file", nargs="+", type=Path, help="JSONL file in question")
16
+ args = parser.parse_args()
17
+ return args
18
+
19
+
20
+ if __name__ == "__main__":
21
+ args = get_args()
22
+ for file in args.src_file:
23
+ seek_map = []
24
+ seek_pos = 0
25
+ with open(file) as f:
26
+ with tqdm(smoothing=0.0) as pbar:
27
+ line = f.readline()
28
+ while line:
29
+ seek_map.append(seek_pos)
30
+ seek_pos = f.tell()
31
+ line = f.readline()
32
+ pbar.update(1)
33
+
34
+ out_file = file.parent / (file.stem + ".seek.map")
35
+ with open(out_file, "w") as f:
36
+ f.write(json.dumps(seek_map))
nougat/dataset/parser/__init__.py ADDED
File without changes
nougat/dataset/parser/document.py ADDED
@@ -0,0 +1,703 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ from collections import defaultdict
8
+ from copy import copy
9
+ import itertools
10
+ import re
11
+ from dataclasses import dataclass, field, asdict
12
+ from typing import (
13
+ Any,
14
+ List,
15
+ Dict,
16
+ Optional,
17
+ TypeVar,
18
+ Type,
19
+ Generic,
20
+ )
21
+ import numpy as np
22
+
23
+ import logging
24
+
25
+ logger = logging.getLogger()
26
+
27
+ from dataclasses import dataclass, field, asdict
28
+ from typing import List, Dict, TypeVar, Type, Generic
29
+
30
+ T = TypeVar("T")
31
+ EL = TypeVar("EL")
32
+
33
+
34
+ @dataclass
35
+ class Element(Generic[EL]):
36
+ """
37
+ Generic class representing an element with children in a tree-like structure.
38
+
39
+ Attributes:
40
+ parent (Element): The parent element.
41
+ children (List[Element]): List of child elements.
42
+ """
43
+
44
+ parent: "Element" = None
45
+ children: List["Element"] = field(default_factory=list)
46
+
47
+ @property
48
+ def plaintext(self):
49
+ return "".join([child.plaintext for child in self.children])
50
+
51
+ def append(self, child: EL) -> EL:
52
+ self.children.append(child)
53
+ child.parent = self
54
+ return child
55
+
56
+ def find_parent(self, class_or_tuple: Type[T]) -> T:
57
+ elem = self
58
+ while elem:
59
+ if isinstance(elem, class_or_tuple):
60
+ return elem
61
+ elem = elem.parent
62
+ return None
63
+
64
+
65
+ @dataclass
66
+ class UnknownElement(Element):
67
+ pass
68
+
69
+
70
+ @dataclass
71
+ class TextElement(Element):
72
+ content: str = ""
73
+
74
+ @property
75
+ def plaintext(self):
76
+ return self.content
77
+
78
+ def append(self, child: "Element"):
79
+ raise Exception(f"Cannot append elements to {self.__class__.__name__}")
80
+
81
+
82
+ @dataclass
83
+ class Math(Element):
84
+ pass
85
+
86
+
87
+ @dataclass
88
+ class PlaintextMath(Math):
89
+ pass
90
+
91
+
92
+ @dataclass
93
+ class LatexMath(Math):
94
+ inline: bool = True
95
+ code: str = ""
96
+
97
+ @property
98
+ def plaintext(self):
99
+ return self.code
100
+
101
+
102
+ @dataclass
103
+ class Author:
104
+ fullname: str = None
105
+ lastname: str = None
106
+ affiliation: str = None
107
+
108
+
109
+ @dataclass
110
+ class Link(Element):
111
+ target: str = None
112
+
113
+
114
+ @dataclass
115
+ class InlineRef(Element):
116
+ target: str = None
117
+
118
+ def as_dict(self):
119
+ return {
120
+ "target": self.target,
121
+ }
122
+
123
+
124
+ @dataclass
125
+ class Reference:
126
+ """
127
+ Data class representing a reference with various attributes.
128
+
129
+ Attributes:
130
+ title (Element): The title of the reference.
131
+ authors (List[Author]): List of authors of the reference.
132
+ ids (Dict[str, str]): Dictionary of identification information.
133
+ date (str): The publication date of the reference.
134
+ url (str): The URL link to the reference.
135
+ journal (str): The journal where the reference is published.
136
+ full_text (str): The full text content of the reference.
137
+
138
+ Methods:
139
+ as_dict(): Convert the reference object to a dictionary.
140
+ """
141
+
142
+ title: Element = None
143
+ authors: List[Author] = field(default_factory=list)
144
+ ids: Dict[str, str] = field(default_factory=dict)
145
+ date: str = None
146
+ url: str = None
147
+ journal: str = None
148
+ full_text: str = None
149
+
150
+ def as_dict(self):
151
+ return {
152
+ "title": self.title.plaintext,
153
+ "authors": [asdict(auth) for auth in self.authors],
154
+ "ids": self.ids,
155
+ "date": self.date,
156
+ "url": self.url,
157
+ "journal": self.journal,
158
+ "full_text": self.full_text,
159
+ }
160
+
161
+
162
+ @dataclass
163
+ class SpanElement(Element):
164
+ pass
165
+
166
+
167
+ @dataclass
168
+ class Italic(SpanElement):
169
+ pass
170
+
171
+
172
+ @dataclass
173
+ class Bold(SpanElement):
174
+ pass
175
+
176
+
177
+ @dataclass
178
+ class Superscript(SpanElement):
179
+ pass
180
+
181
+
182
+ @dataclass
183
+ class Subscript(SpanElement):
184
+ pass
185
+
186
+
187
+ @dataclass
188
+ class Paragraph(Element):
189
+ pass
190
+
191
+
192
+ @dataclass
193
+ class TableRow(Element):
194
+ cells: List[Element] = field(default_factory=list)
195
+
196
+ def add_cell(self, cell: Element):
197
+ self.cells.append(cell)
198
+ cell.parent = self
199
+ return cell
200
+
201
+ @property
202
+ def plaintext(self):
203
+ return "\t".join([cell.plaintext for cell in self.cells])
204
+
205
+
206
+ @dataclass
207
+ class TableHead(TableRow):
208
+ pass
209
+
210
+
211
+ @dataclass
212
+ class Table(Element):
213
+ id: str = None
214
+ header: Element = None
215
+ caption: Element = None
216
+ rows: List[TableRow] = field(default_factory=list)
217
+ keep_table: bool = False
218
+
219
+ def add_row(self, row: TableRow) -> TableRow:
220
+ self.rows.append(row)
221
+ row.parent = self
222
+ return row
223
+
224
+ @property
225
+ def plaintext(self):
226
+ return "\n".join([row.plaintext for row in self.rows])
227
+
228
+
229
+ @dataclass
230
+ class Equation(Element):
231
+ pass
232
+
233
+
234
+ @dataclass
235
+ class EquationList(Element):
236
+ equations: List[Equation] = field(default_factory=list)
237
+
238
+ def add_equation(self, eqn: Equation) -> Equation:
239
+ self.equations.append(eqn)
240
+ eqn.parent = self
241
+ return eqn
242
+
243
+ @property
244
+ def plaintext(self):
245
+ return "\n".join([eqn.plaintext for eqn in self.equations])
246
+
247
+
248
+ @dataclass
249
+ class Algorithm(Element):
250
+ caption: Element = None
251
+ lines: List[Element] = field(default_factory=list)
252
+ inline: bool = False
253
+
254
+ def add_line(self, line: Element) -> Element:
255
+ self.lines.append(line)
256
+ line.parent = self
257
+ return line
258
+
259
+ @property
260
+ def plaintext(self):
261
+ return "\n".join([line.plaintext for line in self.lines])
262
+
263
+
264
+ @dataclass
265
+ class Definition(Element):
266
+ term: Element = None
267
+ definition: Element = None
268
+
269
+ @property
270
+ def plaintext(self):
271
+ parts = []
272
+ if self.term:
273
+ parts.append(f"{self.term.plaintext}:")
274
+ if self.definition:
275
+ parts.append(self.definition.plaintext)
276
+ return " ".join(parts)
277
+
278
+
279
+ @dataclass
280
+ class DefinitionList(Element):
281
+ """
282
+ Data class representing a list of definitions with an optional header.
283
+
284
+ Attributes:
285
+ header (Element): The header element for the definition list.
286
+ items (List[Definition]): List of Definition elements.
287
+
288
+ Methods:
289
+ add_item(item: Definition) -> Definition: Add a definition item to the list.
290
+ """
291
+
292
+ header: Element = None
293
+ items: List[Element] = field(default_factory=list)
294
+
295
+ def add_item(self, item: Definition) -> Definition:
296
+ self.items.append(item)
297
+ item.parent = self
298
+ return item
299
+
300
+ @property
301
+ def plaintext(self):
302
+ parts = []
303
+ if self.header:
304
+ parts.append(self.header.plaintext)
305
+ parts.extend([df.plaintext for df in self.items])
306
+ return "\n".join(parts)
307
+
308
+
309
+ @dataclass
310
+ class Figure(Element):
311
+ id: str = None
312
+ header: Element = None
313
+ caption: Element = None
314
+
315
+
316
+ @dataclass
317
+ class Section(Element):
318
+ id: str = None
319
+ header: Element = None
320
+ level: int = 0
321
+ hnum: int = 1
322
+
323
+
324
+ @dataclass
325
+ class SectionHeader(Element):
326
+ id: str = None
327
+ header: Element = None
328
+ level: int = 0
329
+
330
+
331
+ @dataclass
332
+ class ListItem(Element):
333
+ label: str = ""
334
+
335
+
336
+ @dataclass
337
+ class ListContainer(Element):
338
+ level: int = 0
339
+ ordered: bool = False
340
+ items: List[Element] = field(default_factory=list)
341
+
342
+ def add_item(self, item: ListItem) -> ListItem:
343
+ self.items.append(item)
344
+ item.parent = self
345
+ return item
346
+
347
+ @property
348
+ def plaintext(self):
349
+ return "\n".join([item.plaintext for item in self.items])
350
+
351
+
352
+ @dataclass
353
+ class Footnote(Element):
354
+ id: str = None
355
+
356
+
357
+ @dataclass
358
+ class Document(Element, Reference):
359
+ abstract: Element = None
360
+ language: str = None
361
+ keywords: List[Element] = field(default_factory=list)
362
+ references: List[Reference] = field(default_factory=list)
363
+ inline_refs: List[InlineRef] = field(default_factory=list)
364
+ bib: Reference = None
365
+
366
+ def add_reference(self, reference):
367
+ self.references.append(reference)
368
+
369
+ def add_inline_ref(self, in_ref):
370
+ self.inline_refs.append(in_ref)
371
+
372
+ def set_bib(self, reference):
373
+ self.bib = reference
374
+
375
+
376
+ @dataclass
377
+ class Spec:
378
+ """
379
+ Data class representing specifications for table cells.
380
+
381
+ Attributes:
382
+ t (int): The top border size.
383
+ b (int): The bottom border size.
384
+ l (int): The left border size.
385
+ r (int): The right border size.
386
+ align (str): The alignment of the cell content ('c' for center, 'l' for left, 'r' for right,
387
+ or 'p{width}' for justified with a specified width).
388
+
389
+ Methods:
390
+ __hash__() -> int: Compute the hash of the specification.
391
+ __eq__(__o: object) -> bool: Check if two specifications are equal.
392
+ set_align(classes: List[str], style: Optional[str] = None) -> None:
393
+ Extract alignment information from HTML classes.
394
+ set_border(classes: List[str]) -> None: Automatically set border specifications.
395
+ set_attrs(attrs: Dict[str, Any]) -> None: Automatically set all attributes from HTML class attributes.
396
+ __str__() -> str: Get the string representation of the specification.
397
+ """
398
+
399
+ t: int = field(default=0, repr=False)
400
+ b: int = field(default=0, repr=False)
401
+ l: int = field(default=0)
402
+ r: int = field(default=0)
403
+ align: str = field(default="")
404
+
405
+ def __hash__(self) -> int:
406
+ return hash(repr(self))
407
+
408
+ def __eq__(self, __o: object) -> bool:
409
+ return repr(self) == repr(__o)
410
+
411
+ def set_align(self, classes: List[str], style: Optional[str] = None) -> None:
412
+ """extract alignment information from available classes (html)"""
413
+ aligns = [s for s in classes if "align" in s]
414
+ if len(aligns) == 0:
415
+ return
416
+ elif len(aligns) > 1:
417
+ logger.warn("Found multiple aligns in classes: %s", ", ".join(classes))
418
+ align = aligns[0]
419
+ if "center" in align or align == "c":
420
+ self.align = "c"
421
+ elif "left" in align or align == "l":
422
+ self.align = "l"
423
+ elif "right" in align or align == "r":
424
+ self.align = "r"
425
+ elif "justify" in align or align == "p":
426
+ # assert style is not None, "justify without style information"
427
+ if style is None:
428
+ self.align = "c"
429
+ else:
430
+ width = style.partition("width:")[2].partition(";")[0]
431
+ self.align = "p{%s}" % width
432
+ else:
433
+ logger.warn(
434
+ "only center, left, right, justify supported at the moment. Found %s",
435
+ align,
436
+ )
437
+ self.align = "c"
438
+
439
+ def set_border(self, classes: List[str]) -> None:
440
+ """automatically set spec with border classes e.g 'ltx_border_t'"""
441
+ for border in classes:
442
+ orientation = border.partition("border_")[2]
443
+ if len(orientation) > 0 and orientation[0] in "tbrl":
444
+ setattr(self, orientation[0], len(orientation))
445
+
446
+ def set_attrs(self, attrs: Dict[str, Any]) -> None:
447
+ """automatically set all attr from html class attributes"""
448
+ classes = attrs["class"]
449
+ style = attrs["style"] if "style" in attrs else None
450
+
451
+ self.set_align(classes, style=style)
452
+ self.set_border(classes)
453
+
454
+ def __str__(self) -> str:
455
+ if self.align:
456
+ return "|" * self.l + self.align + "|" * self.r
457
+ else:
458
+ # default center
459
+ return "|" * self.l + "c" + "|" * self.r
460
+
461
+
462
+ @dataclass
463
+ class TableCell(Element):
464
+ """
465
+ Represents a cell in an HTML table.
466
+
467
+ Attributes:
468
+ multicolumn (Optional[int]): The number of columns spanned by the cell.
469
+ multirow (Optional[int]): The number of rows spanned by the cell.
470
+ spec (Spec): The specification for the cell's formatting.
471
+ content (Element): The content of the cell.
472
+
473
+ Methods:
474
+ __post_init__(*args, **kwargs) -> None: Initialize the cell, ensuring that the spec property is not None.
475
+ __hash__() -> int: Compute the hash of the cell.
476
+ __eq__(__o: object) -> bool: Check if two cells are equal.
477
+ set_attrs(attrs: Dict[str, Any]) -> None: Set attributes for the cell from HTML attributes.
478
+ plaintext() -> str: Get the plaintext content of the cell.
479
+ """
480
+
481
+ multicolumn: Optional[int] = None
482
+ multirow: Optional[int] = None
483
+ spec: Spec = None
484
+ content: Element = None
485
+
486
+ def __post_init__(self, *args, **kwargs) -> None:
487
+ # spec property cannot be None
488
+ if self.spec is None:
489
+ self.spec = Spec()
490
+
491
+ def __hash__(self) -> int:
492
+ return hash(repr(self))
493
+
494
+ def __eq__(self, __o: object) -> bool:
495
+ return repr(self) == repr(__o)
496
+
497
+ def set_attrs(self, attrs: Dict[str, Any]) -> None:
498
+ if "colspan" in attrs:
499
+ self.multicolumn = int(attrs["colspan"])
500
+ if "rowspan" in attrs:
501
+ self.multirow = int(attrs["rowspan"])
502
+ self.spec.set_attrs(attrs)
503
+
504
+ @property
505
+ def plaintext(self):
506
+ if self.content is None:
507
+ return ""
508
+ return self.content.plaintext
509
+
510
+
511
+ @dataclass
512
+ class TableRow(Element):
513
+ """
514
+ Represents a row in an HTML table.
515
+
516
+ Attributes:
517
+ cells (List[TableCell]): The list of cells in the row.
518
+
519
+ Methods:
520
+ add_cell(cell: TableCell) -> TableCell: Add a cell to the row.
521
+ __iter__() -> Iterator: Iterate through the cells in the row.
522
+ __len__() -> int: Get the number of cells in the row.
523
+ __bool__() -> bool: Check if the row is not empty.
524
+ cum_cell_widths() -> List[int]: Get the cumulative cell widths.
525
+ cell_widths() -> List[int]: Get the widths of individual cells.
526
+ width() -> int: Get the total width of the row.
527
+ _hline(orientation: str) -> str: Determine horizontal lines to be inserted.
528
+ hline_above() -> str: Get the horizontal line description for the top of the row.
529
+ hline_below() -> str: Get the horizontal line description for the bottom of the row.
530
+ plaintext() -> str: Get the plaintext content of the row.
531
+ """
532
+
533
+ cells: List[TableCell] = field(default_factory=list)
534
+
535
+ def add_cell(self, cell: TableCell):
536
+ self.cells.append(cell)
537
+ cell.parent = self
538
+ return cell
539
+
540
+ def __iter__(self):
541
+ return iter(self.cells)
542
+
543
+ def __len__(self) -> int:
544
+ return len(self.cells)
545
+
546
+ def __bool__(self) -> bool:
547
+ return True
548
+
549
+ @property
550
+ def cum_cell_widths(self) -> List[int]:
551
+ return np.cumsum(self.cell_widths)
552
+
553
+ @property
554
+ def cell_widths(self) -> List[int]:
555
+ return [(cell.multicolumn or 1) for cell in self.cells]
556
+
557
+ @property
558
+ def width(self) -> int:
559
+ return sum(self.cell_widths)
560
+
561
+ def _hline(self, orientation: str) -> str:
562
+ """Figure out if and where horizontal lines need to be inserted.
563
+
564
+ Args:
565
+ orientation (str): Either 't' (top) or 'b' (bottom)
566
+
567
+ Returns:
568
+ str: Correct vertical line description for latex tables.
569
+ """
570
+ assert orientation == "t" or orientation == "b"
571
+ lines = []
572
+ for cell in self.cells:
573
+ lines.extend([getattr(cell.spec, orientation)] * (cell.multicolumn or 1))
574
+ lines.append(0)
575
+ indices = []
576
+ start = None
577
+ for i, v in enumerate(lines):
578
+ if v and start is None:
579
+ start = i
580
+ elif start is not None and not v:
581
+ indices.append((start, i - 1))
582
+ start = None
583
+ s = ""
584
+ for a, b in indices:
585
+ if b - a + 1 == self.width:
586
+ s += "\\hline " * lines[0]
587
+ else:
588
+ s += "\\cline{%i-%i} " % (a + 1, b + 1)
589
+ return s.strip()
590
+
591
+ @property
592
+ def hline_above(self) -> str:
593
+ return self._hline("t")
594
+
595
+ @property
596
+ def hline_below(self) -> str:
597
+ return self._hline("b")
598
+
599
+ @property
600
+ def plaintext(self) -> str:
601
+ return "\t".join([cell.plaintext for cell in self.cells])
602
+
603
+
604
+ @dataclass
605
+ class Tabular(Element):
606
+ rows: List[TableRow] = field(default_factory=list)
607
+ """
608
+ Represents a tabular structure, such as an HTML table.
609
+
610
+ Attributes:
611
+ rows (List[TableRow]): The list of rows in the tabular structure.
612
+
613
+ Methods:
614
+ add_row(row: TableRow) -> TableRow: Add a row to the tabular structure.
615
+ width() -> int: Get the maximum width of the tabular structure.
616
+ cols() -> List[List[TableCell]]: Get a list of columns in the tabular structure.
617
+ _square_table() -> None: Ensure the table has an equal number of columns in each row.
618
+ get_table_spec() -> str: Generate a LaTeX table specification based on cell alignments.
619
+ plaintext() -> str: Get the plaintext content of the tabular structure.
620
+ """
621
+
622
+ def add_row(self, row: TableRow) -> TableRow:
623
+ self.rows.append(row)
624
+ row.parent = self
625
+ return row
626
+
627
+ @property
628
+ def width(self) -> int:
629
+ if len(self.rows) > 0:
630
+ return max([r.width for r in self.rows])
631
+ else:
632
+ return 0
633
+
634
+ @property
635
+ def cols(self) -> List[List[TableCell]]:
636
+ return list(
637
+ map(
638
+ list,
639
+ itertools.zip_longest(*[r.cells for r in self.rows], fillvalue=None),
640
+ )
641
+ )
642
+
643
+ def _square_table(self) -> None:
644
+ """check if number of columns is equal for every row. Add placeholders for `\multirow` instances"""
645
+ for i, row in enumerate(self.rows):
646
+ for j, cell in enumerate(row.cells):
647
+ if cell.multirow is not None and cell.multirow > 1:
648
+ spec = copy(cell.spec)
649
+ # assume no hlines in multi cells: disable bottom lines for top and top lines for lower cells.
650
+ spec.t = 0
651
+ cell.spec.b = 0
652
+ for k in range(i + 1, i + cell.multirow):
653
+ if k < len(self.rows):
654
+ for _ in range(row.cell_widths[j]):
655
+ # add empty cell
656
+ self.rows[k].cells.insert(
657
+ j, TableCell(parent=self.rows[k], spec=spec)
658
+ )
659
+
660
+ def get_table_spec(self) -> str:
661
+ """Generates a LaTeX table spec."""
662
+ # First make table square
663
+ self._square_table()
664
+ # Find the most used spec in regular cells (no multi-col/row)
665
+ specs = [Spec() for _ in range(self.width)]
666
+ for i, col in enumerate(self.cols):
667
+ counts = defaultdict(int)
668
+ for cell in col:
669
+ if cell is None or cell.spec.align == "":
670
+ continue
671
+ if cell.multicolumn is None and cell.multirow is None:
672
+ counts[cell.spec] += 1
673
+ if len(counts) > 0:
674
+ specs[i] = max(counts, key=counts.get)
675
+ # convert all cells that don't match the column style into a multicol{1}{custom_spec}
676
+ for i, col in enumerate(self.cols):
677
+ for cell in col:
678
+ if cell is not None and cell.spec != specs[i]:
679
+ # check if there is text in the cell. If not alignment doesn't matter
680
+ if (
681
+ len(cell.children) == 0
682
+ and cell.spec.l == specs[i].l
683
+ and cell.spec.r == specs[i].r
684
+ ):
685
+ continue
686
+ # convert any standard cell into a multicol cell of width 1
687
+ if cell.multicolumn is None:
688
+ cell.multicolumn = 1
689
+ # generate final latex table spec
690
+ out = " ".join([str(spec) for spec in specs])
691
+ out = re.sub(r"(\|) +(\w)", r"\1\2", out)
692
+ out = re.sub(r"(\w) +(\|)", r"\1\2", out)
693
+ return out
694
+
695
+ @property
696
+ def plaintext(self):
697
+ return "\n".join([row.plaintext for row in self.rows])
698
+
699
+
700
+ @dataclass
701
+ class Table(Element):
702
+ id: str = None
703
+ caption: Element = None
nougat/dataset/parser/html2md.py ADDED
@@ -0,0 +1,67 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import argparse
8
+ from pathlib import Path
9
+ from typing import List, Optional
10
+ from bs4 import BeautifulSoup
11
+ from tqdm import tqdm
12
+ import htmlmin
13
+ from nougat.dataset.parser.latexml_parser import parse_latexml, _clean_html_whitespace
14
+ from nougat.dataset.parser.markdown import format_document
15
+
16
+
17
+ def check_file_path(paths: List[Path], wdir: Optional[Path] = None) -> List[str]:
18
+ """
19
+ Checks if the given file paths exist.
20
+
21
+ Args:
22
+ paths: A list of file paths.
23
+ wdir: The working directory. If None, the current working directory is used.
24
+
25
+ Returns:
26
+ A list of file paths that exist.
27
+ """
28
+ files = []
29
+ for path in paths:
30
+ if type(path) == str:
31
+ if path == "":
32
+ continue
33
+ path = Path(path)
34
+ pathsi = [path] if wdir is None else [path, wdir / path]
35
+ for p in pathsi:
36
+ if p.exists():
37
+ files.append((p.resolve()))
38
+ elif "*" in path.name:
39
+ files.extend([(pi.resolve()) for pi in p.parent.glob(p.name)])
40
+ return list(set(files))
41
+
42
+
43
+ if __name__ == "__main__":
44
+ parser = argparse.ArgumentParser()
45
+ parser.add_argument("--html", type=Path, nargs="+", help="HTML file", required=True)
46
+ parser.add_argument("--out", type=Path, help="Output file", required=True)
47
+ args = parser.parse_args()
48
+ args.html = check_file_path(args.html)
49
+ for f in tqdm(args.html):
50
+ html = BeautifulSoup(
51
+ htmlmin.minify(
52
+ open(f, "r", encoding="utf-8").read().replace("\xa0", " "),
53
+ remove_all_empty_space=1,
54
+ ),
55
+ features="html.parser",
56
+ )
57
+ try:
58
+ doc = parse_latexml(html)
59
+ except ValueError as e:
60
+ print(e)
61
+ continue
62
+ if doc is None:
63
+ continue
64
+ out, fig = format_document(doc, keep_refs=True)
65
+ outp = (args.out if args.out.is_dir() else args.out.parent) / (f.stem + ".mmd")
66
+ with open(outp, "w", encoding="utf-8") as f:
67
+ f.write(out)
nougat/dataset/parser/latexml_parser.py ADDED
@@ -0,0 +1,441 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import re
8
+ import sys
9
+ import requests
10
+ from typing import Optional, Set
11
+ from bs4 import BeautifulSoup, NavigableString
12
+ import soupsieve as sv
13
+
14
+ from nougat.dataset.parser.document import *
15
+
16
+
17
+ def printerr(*args, **kwargs):
18
+ # uncomment for debugging
19
+ # print(*args, **kwargs)
20
+ pass
21
+
22
+
23
+ latexml_wrapper_selector = sv.compile(
24
+ ", ".join(
25
+ [
26
+ ".ltx_engrafo_equation_container",
27
+ "tbody",
28
+ ".ltx_note_content",
29
+ ".ltx_role_footnote",
30
+ ".ltx_note_type",
31
+ ".ltx_theorem",
32
+ ".ltx_proof",
33
+ ".ltx_quote",
34
+ "blockquote",
35
+ ".ltx_inline-para",
36
+ ".ltx_inline-block",
37
+ ]
38
+ )
39
+ )
40
+ latexml_ignore_selector = sv.compile(".ltx_rule, .ltx_pagination.ltx_role_newpage")
41
+
42
+
43
+ def is_wrapper_element(element: BeautifulSoup) -> bool:
44
+ return latexml_wrapper_selector.match(element)
45
+
46
+
47
+ def ignore_element(element: BeautifulSoup) -> bool:
48
+ return latexml_ignore_selector.match(element)
49
+
50
+
51
+ def _get_classes(el: BeautifulSoup) -> Set[str]:
52
+ if not hasattr(el, "attrs"):
53
+ return set()
54
+ classes = el.attrs.get("class")
55
+ if classes is None:
56
+ return set()
57
+ return set(classes)
58
+
59
+
60
+ def _detach_selected(element: BeautifulSoup, selector: str) -> None:
61
+ for elem in element.select(selector):
62
+ elem.extract()
63
+
64
+
65
+ def parse_latexml_authors(ltx_authors: BeautifulSoup) -> List[Author]:
66
+ authors = Paragraph()
67
+ parse_latexml_children(ltx_authors, authors)
68
+ return authors
69
+
70
+
71
+ def parse_latexml_citations(cite: BeautifulSoup, parent: Element) -> None:
72
+ """
73
+ Parses LaTeXML citations and appends them as children to the given parent element.
74
+
75
+ Args:
76
+ cite (BeautifulSoup): The BeautifulSoup object containing the citation data.
77
+ parent (Element): The parent element to which the citations will be added as children.
78
+ """
79
+ parse_latexml_children(cite, parent)
80
+ if ("[" in parent.plaintext and "]" in parent.plaintext) or re.search(
81
+ r"[A-Za-z]", parent.plaintext
82
+ ):
83
+ return
84
+
85
+ parent.children.insert(0, TextElement(content="["))
86
+ parent.children.append(TextElement(content="]"))
87
+
88
+
89
+ def _clean_html_whitespace(text: str) -> str:
90
+ if text.strip():
91
+ text = re.sub(r"(^\n+|\n+$)", "\n", text)
92
+ else:
93
+ text = text.strip("\n")
94
+ text = re.sub(r"[ \t]+", " ", text)
95
+ return text
96
+
97
+
98
+ def parse_latexml_children(html: BeautifulSoup, parent: Element) -> None:
99
+ """
100
+ Parses LaTeXML children and appends them as appropriate elements to the given parent element.
101
+
102
+ Args:
103
+ html (BeautifulSoup): The BeautifulSoup object containing the HTML data.
104
+ parent (Element): The parent element to which the parsed children will be added.
105
+ """
106
+ if html is None:
107
+ return
108
+ for child in html.children:
109
+ classes = _get_classes(child)
110
+ if isinstance(child, NavigableString):
111
+ parent.append(TextElement(content=_clean_html_whitespace(str(child))))
112
+ elif sv.match(
113
+ "p, .ltx_p, div.ltx_para, span.ltx_para, section.ltx_paragraph", child
114
+ ):
115
+ paragraph = parent.append(Paragraph())
116
+ parse_latexml_children(child, paragraph)
117
+ elif sv.match(".ltx_tag", child):
118
+ if "ltx_tag_note" not in classes:
119
+ if sv.match(".ltx_tag_section", child):
120
+ child.string = child.string.upper()
121
+ elif sv.match(".ltx_tag_subsection", child):
122
+ child.string = ""
123
+ parse_latexml_children(child, parent)
124
+ elif "ltx_tag_bibitem" in classes:
125
+ parse_latexml_children(child, parent.append(SpanElement()))
126
+ elif sv.match(".ltx_note_outer", child):
127
+ # try to place the footnote outside the current paragraph
128
+ paragraph = parent.find_parent(Paragraph)
129
+ if paragraph is not None and paragraph.parent is not None:
130
+ footnote = paragraph.parent.append(Footnote())
131
+ else:
132
+ footnote = parent.append(Footnote())
133
+ parse_latexml_children(child, footnote)
134
+ elif sv.match(".ltx_note_content > .ltx_note_mark", child):
135
+ footnote = parent.find_parent(Footnote)
136
+ if footnote is not None:
137
+ footnote.id = child.get_text(strip=True)
138
+ else:
139
+ printerr("Unable to find footnote to set its id", file=sys.stderr)
140
+ parse_latexml_children(child, parent)
141
+ elif sv.match("sup", child):
142
+ sup = parent.append(Superscript())
143
+ parse_latexml_children(child, sup)
144
+ elif sv.match("sub", child):
145
+ sub = parent.append(Subscript())
146
+ parse_latexml_children(child, sub)
147
+ elif sv.match("span.ltx_Math, span.ltx_DisplayMath", child):
148
+ inline = "ltx_DisplayMath" not in classes
149
+ math_elem = child.select_one(".mjx-math")
150
+ if math_elem:
151
+ tex = math_elem.attrs["aria-label"]
152
+ if inline:
153
+ tex = rf"\({tex}\)"
154
+ else:
155
+ tex = rf"\[{tex}\]"
156
+ parent.append(LatexMath(code=tex, inline=inline))
157
+ elif sv.match("math.ltx_Math", child):
158
+ # not sure if the math tag LaTeXML version specific, but that seems to work
159
+ inline = True
160
+ if "display" in child.attrs:
161
+ inline = child.attrs["display"] == "inline"
162
+ tex = child.attrs["alttext"]
163
+ if inline:
164
+ tex = rf"\({tex}\)"
165
+ else:
166
+ tex = rf"\[{tex}\]"
167
+ parent.append(LatexMath(code=tex, inline=inline))
168
+ elif sv.match("a.ref", child):
169
+ link = parent.append(Link())
170
+ link.target = child.attrs.get("href")
171
+ parse_latexml_children(child, link)
172
+ elif sv.match(
173
+ ".ltx_ref.ltx_missing_citation, .ltx_ref.ltx_missing_label", child
174
+ ):
175
+ placeholder = child.get_text().strip()
176
+ resolved = False
177
+ if placeholder.isnumeric():
178
+ parent.append(TextElement(content=placeholder))
179
+ resolved = True
180
+ else:
181
+ target = child.attrs.get("href")
182
+ if target is not None:
183
+ potential_num = target.partition(".bib")[2]
184
+ if potential_num.isnumeric():
185
+ parent.append(TextElement(content=potential_num))
186
+ resolved = True
187
+ if not resolved:
188
+ raise ValueError("missing reference detected")
189
+ elif sv.match(
190
+ ".ltx_bibblock, .ltx_role_author, .ltx_contact, .ltx_role_email, .ltx_role_affiliation",
191
+ child,
192
+ ):
193
+ parse_latexml_children(child, parent.append(SpanElement()))
194
+ parent.append(TextElement(content="\n"))
195
+ elif sv.match(
196
+ ".ltx_authors, .ltx_personname, .ltx_role_creation.ltx_date, .ltx_engrafo_author_notes, .ltx_author_notes, .ltx_date.ltx_role_creation",
197
+ child,
198
+ ):
199
+ parse_latexml_children(child, parent.append(Paragraph()))
200
+ parent.append(TextElement(content="\n"))
201
+ elif sv.match(
202
+ ".ltx_author_before, .ltx_role_pubyear, .ltx_role_pagerange", child
203
+ ):
204
+ pass
205
+ elif sv.match("h1.ltx_title_document", child):
206
+ doc = parent.find_parent(Document)
207
+ if doc is not None:
208
+ if doc.title is None:
209
+ doc.title = SectionHeader(parent=doc)
210
+ doc.title.hnum = int(child.name[1])
211
+ parse_latexml_children(child, doc.title)
212
+ else:
213
+ printerr("Document title is already set", file=sys.stderr)
214
+ else:
215
+ printerr("Unable to find document to set title", file=sys.stderr)
216
+ elif sv.match("section", child):
217
+ if ".ltx_bibliography" not in classes:
218
+ section = parent.append(Section())
219
+ parse_latexml_children(child, section)
220
+ elif sv.match("h1, h2, h3, h4, h5, h6", child) and "ltx_title" in classes:
221
+ if {"ltx_title_theorem", "ltx_title_proof"} & classes:
222
+ parse_latexml_children(child, parent)
223
+ parent.append(TextElement(content=": "))
224
+ elif isinstance(parent, Section):
225
+ parent.hnum = int(child.name[1])
226
+ if parent.header is None:
227
+ parent.header = SpanElement()
228
+ parse_latexml_children(child, parent.header)
229
+ else:
230
+ printerr("Dangling title element", file=sys.stderr)
231
+ parse_latexml_children(child, parent)
232
+ elif sv.match(".ltx_TOC.ltx_toc_toc", child):
233
+ s = parent.append(Section(hnum=6, header=TextElement(content="Contents")))
234
+ parse_latexml_children(child, s.append(Paragraph()))
235
+ elif sv.match(
236
+ "ul.ltx_itemize, ul.ltx_toclist, ul.ltx_biblist, ol.ltx_enumerate", child
237
+ ):
238
+ lst = parent.append(ListContainer())
239
+ lst.ordered = child.name == "ol"
240
+ parent_list = parent.find_parent(ListContainer)
241
+ lst.level = parent_list.level + 1 if parent_list is not None else 1
242
+ parse_latexml_children(child, lst)
243
+ elif sv.match("li.ltx_item, li.ltx_tocentry, li.ltx_bibitem", child):
244
+ lst = parent.find_parent(ListContainer)
245
+ if lst is not None:
246
+ item = lst.add_item(ListItem())
247
+ parse_latexml_children(child, item)
248
+ else:
249
+ printerr("List item outside list", file=sys.stderr)
250
+ elif sv.match("cite", child):
251
+ span = parent.append(SpanElement())
252
+ parse_latexml_citations(child, span)
253
+ elif sv.match("a.ltx_ref", child):
254
+ target = child.attrs.get("href")
255
+ if target.startswith("#bib"): # citation link
256
+ in_ref = parent.append(InlineRef())
257
+ in_ref.target = target
258
+ text = child.get_text()
259
+ in_ref.target = target
260
+ if text.strip().isnumeric():
261
+ in_ref.append(TextElement(content=text))
262
+ elif re.search(r"[A-Za-z][:;.,_]?\d", text):
263
+ # probably a broken citation, go with link number instead
264
+ in_ref.append(
265
+ TextElement(
266
+ content=re.sub(r"\D", "", target.partition(".bib")[2])
267
+ )
268
+ )
269
+ else:
270
+ raise ValueError('unusable reference "%s"' % text)
271
+ doc = parent.find_parent(Document)
272
+ if doc:
273
+ doc.add_inline_ref(in_ref)
274
+ else:
275
+ link = parent.append(Link())
276
+ link.target = target
277
+ parse_latexml_children(child, link)
278
+ elif sv.match("a", child) and len(classes) == 0:
279
+ target = child.attrs.get("href")
280
+ parse_latexml_children(child, parent.append(Link(target=target)))
281
+ elif sv.match(".ltx_eqn_table", child):
282
+ eqn_list = parent.append(EquationList())
283
+ parse_latexml_children(child, eqn_list)
284
+ elif sv.match(".ltx_eqn_row", child):
285
+ eqn_list = parent.find_parent(EquationList)
286
+ if eqn_list is not None:
287
+ eqn = eqn_list.add_equation(Equation())
288
+ parse_latexml_children(child, eqn)
289
+ else:
290
+ printerr("Dangling equation row", file=sys.stderr)
291
+ parse_latexml_children(child, parent)
292
+ elif sv.match(".ltx_eqn_cell", child):
293
+ parse_latexml_children(child, parent)
294
+ elif sv.match("table, span.ltx_tabular, div.ltx_tabular", child):
295
+ tabular = parent.append(Tabular())
296
+ parse_latexml_children(child, tabular)
297
+ elif sv.match("thead.ltx_thead", child):
298
+ table = parent.find_parent(Tabular)
299
+ if table is not None:
300
+ parse_latexml_children(child, table)
301
+ else:
302
+ printerr("Table header element outside table", file=sys.stderr)
303
+ elif sv.match("tbody.ltx_tbody", child):
304
+ parse_latexml_children(child, parent)
305
+ elif sv.match("tr.ltx_tr", child):
306
+ table = parent.find_parent(Tabular)
307
+ if table is not None:
308
+ row = table.add_row(TableRow())
309
+ parse_latexml_children(child, row)
310
+ else:
311
+ printerr("TableRow element outside table", file=sys.stderr)
312
+ elif sv.match("td.ltx_td, th.ltx_th", child):
313
+ row = parent.find_parent(TableRow)
314
+ if row is not None:
315
+ cell = TableCell()
316
+ cell.set_attrs(child.attrs)
317
+ row.add_cell(cell)
318
+ parse_latexml_children(child, cell)
319
+ else:
320
+ printerr("TableData element outside table row", file=sys.stderr)
321
+ elif sv.match("span.ltx_text, em.ltx_emph", child):
322
+ if (
323
+ child.find_parent(ListItem) is None
324
+ or child.get_text() != "[label=0)]"
325
+ or child.get_text() != "[leftmargin=*] "
326
+ ):
327
+ if "ltx_font_italic" in classes:
328
+ elem = Italic()
329
+ elif "ltx_font_bold" in classes:
330
+ elem = Bold()
331
+ else:
332
+ elem = SpanElement()
333
+ parent.append(elem)
334
+ parse_latexml_children(child, elem)
335
+ else:
336
+ parent.find_parent(ListContainer).items.pop()
337
+ elif sv.match("figure.ltx_table", child):
338
+ figure = parent.append(Table())
339
+ if "id" in child.attrs:
340
+ figure.id = child.attrs["id"]
341
+ parse_latexml_children(child, figure)
342
+ elif sv.match("figure.ltx_figure", child):
343
+ figure = parent.append(Figure())
344
+ if "id" in child.attrs:
345
+ figure.id = child.attrs["id"]
346
+ parse_latexml_children(child, figure)
347
+ elif sv.match("figure.ltx_float", child):
348
+ parse_latexml_children(child, parent)
349
+ elif sv.match(".ltx_listing", child):
350
+ alg = parent.append(Algorithm())
351
+ parse_latexml_children(child, alg)
352
+ elif sv.match(".ltx_listingline", child):
353
+ alg = parent.find_parent(Algorithm)
354
+ if alg is not None:
355
+ line = alg.add_line(Element())
356
+ parse_latexml_children(child, line)
357
+ else:
358
+ printerr("Listing line outside algorithm environment", file=sys.stderr)
359
+ elif sv.match("dl.ltx_description", child):
360
+ def_list = parent.append(DefinitionList())
361
+ parse_latexml_children(child, def_list)
362
+ elif sv.match("dt.ltx_item", child):
363
+ def_list = parent.find_parent(DefinitionList)
364
+ if def_list is not None:
365
+ item = def_list.add_item(Definition())
366
+ item.term = SpanElement(parent=item)
367
+ parse_latexml_children(child, item.term)
368
+ else:
369
+ printerr("Found dangling definition term", file=sys.stderr)
370
+ elif sv.match("dd.ltx_item", child):
371
+ def_list = parent.find_parent(DefinitionList)
372
+ if def_list is not None:
373
+ if def_list.items and def_list.items[-1].definition is None:
374
+ item = def_list.items[-1]
375
+ else:
376
+ printerr("Found definition without term", file=sys.stderr)
377
+ item = def_list.add_item(Definition())
378
+ item.definition = SpanElement(parent=item)
379
+ parse_latexml_children(child, item.definition)
380
+ else:
381
+ printerr("Found dangling definition", file=sys.stderr)
382
+ parse_latexml_children(child, parent)
383
+ elif sv.match("figcaption", child):
384
+ fig = parent.find_parent((Figure, Table))
385
+ if fig is not None:
386
+ if fig.caption is None:
387
+ fig.caption = Paragraph(parent=fig)
388
+ parse_latexml_children(child, fig.caption)
389
+ fig.caption.append(TextElement(content="\n"))
390
+ else:
391
+ printerr("Figure caption outside figure element", file=sys.stderr)
392
+ para = parent.append(Paragraph())
393
+ parse_latexml_children(child, para)
394
+ elif sv.match(".ltx_break", child):
395
+ parent.append(TextElement(content="\n\n"))
396
+ elif sv.match(".ltx_abstract, .ltx_acknowledgements", child):
397
+ abstract = parent.append(Section())
398
+ parse_latexml_children(child, abstract)
399
+ elif sv.match(".ltx_ERROR", child):
400
+ printerr(
401
+ f"LaTeX error element: {child.get_text(strip=True)}", file=sys.stderr
402
+ )
403
+ elif is_wrapper_element(child):
404
+ parse_latexml_children(child, parent)
405
+ elif ignore_element(child):
406
+ continue
407
+ else:
408
+ printerr(
409
+ f"Unknown LaTeXML element <{child.name}> with classes {', '.join(classes)}",
410
+ file=sys.stderr,
411
+ )
412
+ elem = parent.append(UnknownElement())
413
+ parse_latexml_children(child, elem)
414
+
415
+
416
+ # TODO: move this somewhere else, so I can use it with plaintext too
417
+ sess = requests.Session()
418
+
419
+
420
+ def parse_latexml_references(html: BeautifulSoup, doc: Document) -> None:
421
+ for child in html.select("li.ltx_bibitem"):
422
+ child.attrs.get("id")
423
+ ref_text = child.get_text(strip=False).replace("\n", " ")
424
+ reference = Reference()
425
+ reference.title = TextElement(content=child.get_text(strip=True))
426
+ doc.add_reference(reference)
427
+
428
+
429
+ def parse_latexml(
430
+ html: BeautifulSoup,
431
+ ) -> Optional[Document]:
432
+ if html.article is None:
433
+ printerr("Missing article element", file=sys.stderr)
434
+ return None
435
+ doc = Document()
436
+ parse_latexml_children(html.article, doc)
437
+ parse_latexml_references(
438
+ html.article,
439
+ doc,
440
+ )
441
+ return doc
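The parser above turns a LaTeXML-generated HTML article into the intermediate `Document` tree. A minimal usage sketch (not part of the committed files; the input file name is a placeholder):

```python
# Sketch only: parse a LaTeXML HTML file into a Document tree.
from bs4 import BeautifulSoup
from nougat.dataset.parser.latexml_parser import parse_latexml

with open("paper.html", "r", encoding="utf-8") as f:  # placeholder path
    html = BeautifulSoup(f.read().replace("\xa0", " "), features="html.parser")

doc = parse_latexml(html)  # returns None if the page has no <article> element
if doc is not None:
    print("document parsed, title set:", doc.title is not None)
```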
nougat/dataset/parser/markdown.py ADDED
@@ -0,0 +1,396 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ from typing import Iterable, List, Optional, Tuple
8
+ import re
9
+ from uuid import uuid4
10
+ from nougat.dataset.utils import normalize_tex
11
+ from nougat.dataset.parser.document import *
12
+ from nougat.dataset.parser.latexml_parser import _clean_html_whitespace
13
+ from unidecode import unidecode
14
+
15
+ SUPERSCRIPT_MAP = str.maketrans("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")
16
+ SUBSCRIPT_MAP = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")
17
+ figure_regex = re.compile(r"\[(FOOTNOTE|FIGURE|TABLE)(.*?)\](.*?)\[END\1\]", re.S)
18
+ conv = {
19
+ "&": r"\&",
20
+ "%": r"\%",
21
+ "$": r"\$",
22
+ "#": r"\#",
23
+ "_": r"\_",
24
+ "{": r"\{",
25
+ "}": r"\}",
26
+ "~": r"\textasciitilde{}",
27
+ "^": r"\^{}",
28
+ "\\": r"\textbackslash{}",
29
+ "<": r"\textless{}",
30
+ ">": r"\textgreater{}",
31
+ }
32
+ regex = re.compile(
33
+ "|".join(
34
+ re.escape(str(key)) for key in sorted(conv.keys(), key=lambda item: -len(item))
35
+ )
36
+ )
37
+
38
+
39
+ def remove_trailing_whitespace(parts: List[str]) -> None:
40
+ """Removes whitespace elements in list inplace"""
41
+ for s in reversed(parts):
42
+ if s.rstrip() == "":
43
+ del parts[-1]
44
+ else:
45
+ break
46
+
47
+
48
+ def remove_line_breaks(parts: List[str]):
49
+ out = []
50
+ for s in parts:
51
+ out.append(s.replace("\n", " "))
52
+ return out
53
+
54
+
55
+ def leading_trailing_whitespace(
56
+ parts: List[str],
57
+ ) -> Tuple[List[str], List[str], List[str]]:
58
+ """splits the list into three parts. The first and last return elements are made up only of whitespace
59
+
60
+ Args:
61
+ parts (List[str]): List to split.
62
+
63
+ Returns:
64
+ Tuple[List[str], List[str], List[str]]: The split list parts.
65
+ """
66
+ lead = []
67
+ trail = []
68
+ out_slice = [None, None]
69
+ for i, s in enumerate(parts):
70
+ if s.strip() == "":
71
+ lead.append(s)
72
+ out_slice[0] = i + 1
73
+ else:
74
+ break
75
+ for i, s in enumerate(reversed(parts)):
76
+ if s.strip() == "":
77
+ trail.append(s)
78
+ out_slice[1] = -1 - i
79
+ else:
80
+ break
81
+ return lead, parts[slice(*out_slice)], trail[::-1]
82
+
83
+
84
+ def latex_escape(string: str) -> str:
85
+ return regex.sub(lambda match: conv[match.group()], string)
86
+
87
+
88
+ def is_empty(content: List) -> bool:
89
+ """Used to determine if a Section is empty"""
90
+ empty = True
91
+ for part in content:
92
+ if len(part.strip()):
93
+ empty = False
94
+ break
95
+ return empty
96
+
97
+
98
+ def format_element(
99
+ element: Element, keep_refs: bool = False, latex_env: bool = False
100
+ ) -> List[str]:
101
+ """
102
+ Formats a given Element into a list of formatted strings.
103
+
104
+ Args:
105
+ element (Element): The element to be formatted.
106
+ keep_refs (bool, optional): Whether to keep references in the formatting. Default is False.
107
+ latex_env (bool, optional): Whether to use LaTeX environment formatting. Default is False.
108
+
109
+ Returns:
110
+ List[str]: A list of formatted strings representing the formatted element.
111
+ """
112
+ if isinstance(element, TextElement):
113
+ if latex_env:
114
+ return [latex_escape(element.content)]
115
+ else:
116
+ return [element.content]
117
+ if isinstance(element, Bold):
118
+ parts = format_children(element, keep_refs, latex_env)
119
+ if element.find_parent(Algorithm) is not None:
120
+ return parts
121
+ lead, text, tail = leading_trailing_whitespace("".join(parts))
122
+ return [*lead, "**", *remove_line_breaks(text), "**", *tail]
123
+ if isinstance(element, Italic):
124
+ parts = format_children(element, keep_refs, latex_env)
125
+ if element.find_parent(Algorithm) is not None:
126
+ return parts
127
+ lead, text, tail = leading_trailing_whitespace("".join(parts))
128
+ return [*lead, "_", *remove_line_breaks(text), "_", *tail]
129
+ if isinstance(element, PlaintextMath):
130
+ return format_children(element, keep_refs) + ["\n"]
131
+ if isinstance(element, Paragraph):
132
+ return format_children(element, keep_refs, latex_env) + ["\n\n"]
133
+ if isinstance(element, TableCell):
134
+ parts = format_children(element, keep_refs, latex_env)
135
+ remove_trailing_whitespace(parts)
136
+ if element.multirow is not None:
137
+ parts.insert(0, "\\multirow{%i}{*}{" % (element.multirow))
138
+ parts.append("}")
139
+ if element.multicolumn is not None:
140
+ parts.insert(
141
+ 0, "\\multicolumn{%i}{%s}{" % (element.multicolumn, element.spec)
142
+ )
143
+ parts.append("}")
144
+ return parts
145
+ if isinstance(element, TableRow):
146
+ parts = []
147
+ if element.hline_above:
148
+ parts.append(element.hline_above + "\n")
149
+ parts.extend(
150
+ remove_line_breaks(
151
+ format_iterator(element.cells, keep_refs, latex_env, join=" & ")
152
+ )
153
+ )
154
+ parts.append(r" \\")
155
+ parts.append((" " + element.hline_below).rstrip())
156
+ return parts
157
+ if isinstance(element, Tabular):
158
+ parts = [
159
+ "\\begin{tabular}",
160
+ "{%s}\n" % element.get_table_spec(),
161
+ ]
162
+ parts.extend(format_iterator(element.rows, keep_refs, True, join="\n"))
163
+ parts.append("\n\\end{tabular}\n")
164
+ return parts
165
+ if isinstance(element, Table):
166
+ parts = [
167
+ "[TABLE%s]\n\\begin{table}\n"
168
+ % (str(uuid4())[:5] if element.id is None else ":" + str(element.id))
169
+ ]
170
+ parts.extend(format_children(element, keep_refs, latex_env))
171
+ caption_parts = format_element(element.caption, keep_refs, latex_env)
172
+ remove_trailing_whitespace(caption_parts)
173
+ parts.append("\\end{table}\n")
174
+ if len(caption_parts) > 0:
175
+ parts.extend(caption_parts + ["\n"])
176
+ parts.append("[ENDTABLE]\n\n")
177
+ return parts
178
+ if isinstance(element, Figure):
179
+ parts = format_element(element.caption, keep_refs)
180
+ remove_trailing_whitespace(parts)
181
+ return (
182
+ [
183
+ "[FIGURE%s]\n"
184
+ % (str(uuid4())[:5] if element.id is None else ":" + str(element.id))
185
+ ]
186
+ + parts
187
+ + ["\n[ENDFIGURE]\n\n"]
188
+ )
189
+ if isinstance(element, SectionHeader):
190
+ parts = ["# "]
191
+ if element.id:
192
+ parts.append(f"{element.id.upper()} ")
193
+ if element.header:
194
+ header = format_element(element.header, keep_refs)
195
+ else:
196
+ header = format_iterator(element.children, keep_refs)
197
+ _, title, _ = leading_trailing_whitespace("".join(header))
198
+ parts.append(title)
199
+ parts.append("\n\n")
200
+ return parts
201
+ if isinstance(element, Section):
202
+ children_parts = format_children(element, keep_refs)
203
+ if is_empty(children_parts):
204
+ return []
205
+ if element.header:
206
+ parts = [f"\n\n{'#'*element.hnum} "]
207
+ _, title, _ = leading_trailing_whitespace(
208
+ "".join(format_element(element.header, keep_refs))
209
+ )
210
+ parts.append(title)
211
+ parts.append("\n\n")
212
+ else:
213
+ parts = []
214
+ return parts + children_parts
215
+ if isinstance(element, Footnote):
216
+ if element.id is not None:
217
+ foot = f"\n[FOOTNOTE:{element.id}]Footnote {element.id}: "
218
+ else:
219
+ foot = "\n[FOOTNOTE:%s]Footnote: " % (str(uuid4())[:5])
220
+ return [foot] + format_children(element, keep_refs) + ["[ENDFOOTNOTE]\n\n"]
221
+ if isinstance(element, ListContainer):
222
+ items = [
223
+ (
224
+ item.label,
225
+ "".join(format_element(item, keep_refs)).strip().replace("\n", " "),
226
+ )
227
+ for item in element.items
228
+ ]
229
+ parts = ["\n"]
230
+ indent = " " * max(element.level - 1, 0)
231
+ for i, (label, item) in enumerate(items, 1):
232
+ if label:
233
+ bullet = label
234
+ else:
235
+ bullet = f"{i}." if element.ordered else "*"
236
+ parts.append(f"{indent}{bullet} {item}\n")
237
+ parts.append("\n")
238
+ return parts
239
+ if isinstance(element, Equation):
240
+ # an equation consists of multiple display-style TeX formulas and an optional equation label
241
+ parts = []
242
+ for child in element.children:
243
+ if isinstance(child, LatexMath):
244
+ tex = normalize_tex(
245
+ "".join(format_element(child, keep_refs)).strip(" \n"), inline=False
246
+ )
247
+ parts.append(tex)
248
+ else:
249
+ text = "".join(format_element(child, keep_refs))
250
+ if text:
251
+ parts.append(text)
252
+ lead, eqs, tail = leading_trailing_whitespace(parts)
253
+ s = " ".join(eqs).replace(r"\] \[", " ")
254
+ return [*lead, s, *tail]
255
+ if isinstance(element, EquationList):
256
+ parts = ["\n"]
257
+ items = element.equations
258
+ items = ["".join(format_element(item, keep_refs)).rstrip() for item in items]
259
+ items = [item + "\n" for item in items if item]
260
+ if items:
261
+ parts.extend(items)
262
+ parts.append("\n")
263
+ return parts
264
+ if isinstance(element, Algorithm):
265
+ parts = []
266
+ items = element.lines
267
+ items = ["".join(format_element(item, keep_refs)).rstrip() for item in items]
268
+ if element.inline:
269
+ items = [item for item in items if item]
270
+ else:
271
+ items = [item + "\n" for item in items if item]
272
+ if items:
273
+ prepend = "`" if element.inline else "\n```\n"
274
+ parts.append(prepend)
275
+ parts.extend(items)
276
+ append = "`" if element.inline else "```\n\n"
277
+ parts.append(append)
278
+ return parts
279
+ if isinstance(element, DefinitionList):
280
+ parts = ["\n"]
281
+ if element.header is not None:
282
+ parts.extend(format_element(element.header, keep_refs))
283
+ parts.append("\n")
284
+ items = [
285
+ "".join(format_element(item, keep_refs)).rstrip() for item in element.items
286
+ ]
287
+ items = [item + "\n" for item in items if item]
288
+ if items:
289
+ parts.extend(items)
290
+ parts.append("\n")
291
+ return parts
292
+ if isinstance(element, Definition):
293
+ parts = []
294
+ if element.term is not None:
295
+ term = (
296
+ "".join(format_element(element.term, keep_refs)).rstrip(" \n\t:") + ": "
297
+ )
298
+ # maths in wiki might be inside a definition without a term
299
+ if term.strip() != ":":
300
+ parts.append(term)
301
+ if element.definition is not None:
302
+ definition = "".join(format_element(element.definition, keep_refs)).rstrip()
303
+ parts.append(definition)
304
+ if parts:
305
+ parts.append("\n")
306
+ return parts
307
+ if isinstance(element, LatexMath):
308
+ parts = []
309
+ if not element.inline:
310
+ parts.append("\n\n")
311
+ parts.append(normalize_tex(element.code, element.inline).strip())
312
+ if not element.inline:
313
+ parts.append("\n\n")
314
+ return parts
315
+ if isinstance(element, (Superscript, Subscript)):
316
+ content = element.plaintext
317
+ if content.strip().isdigit():
318
+ script_map = (
319
+ SUBSCRIPT_MAP if isinstance(element, Subscript) else SUPERSCRIPT_MAP
320
+ )
321
+ return [content.translate(script_map)]
322
+ else:
323
+ return format_children(element, keep_refs)
324
+ if isinstance(element, InlineRef):
325
+ parts = format_children(element, keep_refs)
326
+ return parts
327
+ return format_children(element, keep_refs, latex_env)
328
+
329
+
330
+ def format_iterator(
331
+ iterator: Iterable,
332
+ keep_refs: bool = False,
333
+ latex_env: bool = False,
334
+ join: Optional[str] = None,
335
+ ) -> List[str]:
336
+ """
337
+ The `format_iterator` function takes an iterator and formats its elements, optionally joining them with a specified string.
338
+
339
+ :param iterator: The `iterator` parameter is an iterable object that contains the elements to be formatted. It could be a list, tuple, set, or any other iterable object
340
+ :type iterator: Iterable
341
+ :param keep_refs: The `keep_refs` parameter is a boolean flag that determines whether references to other elements should be preserved in the formatted output. If `keep_refs` is set to `True`, the references will be included in the output. If `keep_refs` is set to `False` (default), the, defaults to False
342
+ :type keep_refs: bool (optional)
343
+ :param latex_env: The `latex_env` parameter is a boolean flag that determines whether the output should be formatted as LaTeX code. If `latex_env` is set to `True`, the output will be formatted using LaTeX syntax. If `latex_env` is set to `False` (default), the output will be, defaults to False
344
+ :type latex_env: bool (optional)
345
+ :param join: The `join` parameter is an optional string that specifies the delimiter to be used when joining the formatted elements of the iterator into a single string. If `join` is provided, it will be inserted between each formatted element. If `join` is not provided, the formatted elements will be returned as
346
+ :type join: Optional[str]
347
+ :return: The function `format_iterator` returns a list of strings.
348
+ """
349
+ parts = []
350
+ for child in iterator:
351
+ parts.extend(format_element(child, keep_refs, latex_env))
352
+ if join is not None:
353
+ parts.append(join)
354
+ if join is not None:
355
+ parts = parts[:-1]
356
+ return parts
357
+
358
+
359
+ def format_children(
360
+ element: Element, keep_refs: bool = False, latex_env: bool = False
361
+ ) -> List[str]:
362
+ if element is None:
363
+ return []
364
+ return format_iterator(element.children, keep_refs, latex_env)
365
+
366
+
367
+ def format_document(
368
+ doc: Document, keep_refs: bool = False
369
+ ) -> Tuple[str, Dict[str, str]]:
370
+ """
371
+ The `format_document` function takes a `doc` object of type `Document` and a boolean `keep_refs` as input and returns a tuple containing the formatted text of the document and a dictionary of figures found in the document.
372
+
373
+ :param doc: The parsed `Document` tree to format
374
+ :type doc: Document
375
+ :param keep_refs: The `keep_refs` parameter is a boolean flag that determines whether to keep references in the formatted document or not. If `keep_refs` is set to `True`, the references will be included in the formatted document. If `keep_refs` is set to `False`, the references will be excluded, defaults to False
376
+ :type keep_refs: bool (optional)
377
+ :return: The function `format_document` returns a tuple containing two elements: a formatted text document and a dictionary of figures.
378
+ """
379
+ parts = []
380
+
381
+ if doc.title:
382
+ parts.extend([*format_element(doc.title), "\n"])
383
+ parts.append("\n")
384
+ parts.extend(format_children(doc, keep_refs))
385
+ text = "".join(parts)
386
+ text = text.replace("\xa0", " ") # replace non-breakable spaces
387
+ text = re.sub(r" $", "", text, flags=re.MULTILINE)
388
+ text = re.sub(r"\n[\t ]*$", "\n", text, flags=re.MULTILINE)
389
+ text = re.sub(r"(?<!\n) {2,}", " ", text)
390
+ text = re.sub(r"\n{3,}", "\n\n", text).lstrip()
391
+ figures = {unidecode(m[0] + m[1]): m[2].strip() for m in figure_regex.findall(text)}
392
+ text = figure_regex.sub(
393
+ r"[\1\2][END\1]",
394
+ text,
395
+ )
396
+ return text, figures
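Together with the LaTeXML parser, `format_document` yields the page-unsplit markdown plus a map of figure/table/footnote placeholders. A rough sketch of that pipeline (file names are placeholders; this mirrors how `split_htmls_to_pages.py` below uses it):

```python
# Sketch: Document -> markdown text + figure/footnote map.
from bs4 import BeautifulSoup
from nougat.dataset.parser.latexml_parser import parse_latexml
from nougat.dataset.parser.markdown import format_document

html = BeautifulSoup(open("paper.html", encoding="utf-8").read(), features="html.parser")
doc = parse_latexml(html)
if doc is not None:
    text, figures = format_document(doc, keep_refs=True)
    # `figures` maps placeholders such as "FIGURE:S1.F1" to their caption text,
    # which split_markdown() later re-inserts on the matching page.
    with open("paper.mmd", "w", encoding="utf-8") as f:
        f.write(text)
```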
nougat/dataset/pdffigures.py ADDED
@@ -0,0 +1,71 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import os
8
+ import subprocess
9
+ import logging
10
+
11
+ PDFFIGURES2_JAR_PATH = os.environ.get("PDFFIGURES_PATH", None)
12
+ logger = logging.getLogger()
13
+ if PDFFIGURES2_JAR_PATH is None:
14
+ logger.warning(
15
+ "You need to configure the path to the pdffigures2 executable in this file (nougat/dataset/pdffigures.py) or set the environment variable 'PDFFIGURES_PATH'."
16
+ )
17
+
18
+
19
+ def call_pdffigures(
20
+ pdf_path: str, figures_dir: str, timeout: int = 30, verbose: bool = False
21
+ ):
22
+ """
23
+ Extract figures from a PDF file using pdffigures2.
24
+
25
+ Args:
26
+ pdf_path (str): The path to the PDF file.
27
+ figures_dir (str): The directory where the figures will be extracted.
28
+ timeout (int, optional): The timeout in seconds for the pdffigures2 command. Defaults to 30.
29
+ verbose (bool, optional): Whether to print the output of the pdffigures2 command. Defaults to False.
30
+
31
+ Returns:
32
+ str: The path to the JSON file containing the extracted figures.
33
+ """
34
+ os.makedirs(figures_dir, exist_ok=True)
35
+ kwargs = (
36
+ {} if verbose else {"stderr": subprocess.DEVNULL, "stdout": subprocess.DEVNULL}
37
+ )
38
+ if PDFFIGURES2_JAR_PATH is None:
39
+ return
40
+ process = subprocess.Popen(
41
+ "java"
42
+ " -jar {pdffigures_jar_path}"
43
+ " -d {figures_dir}/"
44
+ " -c"
45
+ " -q"
46
+ " {pdf_path}".format(
47
+ pdffigures_jar_path=PDFFIGURES2_JAR_PATH,
48
+ pdf_path=pdf_path,
49
+ figures_dir=figures_dir,
50
+ ),
51
+ shell=True,
52
+ **kwargs
53
+ )
54
+
55
+ try:
56
+ exit_code = process.wait(timeout=timeout)
57
+ if exit_code != 0:
58
+ logger.error("Extracting figures from file %s failed.", pdf_path)
59
+ return False
60
+ except subprocess.TimeoutExpired as e:
61
+ logger.error(
62
+ "pdffigures2 command did not terminate in 30 seconds, "
63
+ "terminating. Error: %s",
64
+ e,
65
+ )
66
+ process.terminate() # give up
67
+ return False
68
+ pdf_name = os.path.basename(pdf_path).partition(".pdf")[0]
69
+ dest_file = os.path.join(figures_dir, (pdf_name + ".json"))
70
+
71
+ return dest_file
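`call_pdffigures` shells out to the pdffigures2 jar, and the jar location is read at import time, so the environment variable must be set before importing the module. A hypothetical invocation (jar path and file names are placeholders):

```python
# Sketch: PDFFIGURES_PATH must be set before the module is imported,
# because PDFFIGURES2_JAR_PATH is resolved at import time.
import os
os.environ["PDFFIGURES_PATH"] = "/opt/pdffigures2/pdffigures2-assembly.jar"  # placeholder

from nougat.dataset.pdffigures import call_pdffigures

json_file = call_pdffigures("paper.pdf", "figures", timeout=60)
# Expected: "figures/paper.json" on success, False on failure, None if no jar is configured.
print(json_file)
```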
nougat/dataset/rasterize.py ADDED
@@ -0,0 +1,81 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import argparse
8
+ import logging
9
+ import pypdfium2
10
+ from pathlib import Path
11
+ from tqdm import tqdm
12
+ import io
13
+ from typing import Optional, List, Union
14
+
15
+ logging.getLogger("pypdfium2").setLevel(logging.WARNING)
16
+
17
+
18
+ def rasterize_paper(
19
+ pdf: Union[Path, bytes],
20
+ outpath: Optional[Path] = None,
21
+ dpi: int = 96,
22
+ return_pil=False,
23
+ pages=None,
24
+ ) -> Optional[List[io.BytesIO]]:
25
+ """
26
+ Rasterize a PDF file to PNG images.
27
+
28
+ Args:
29
+ pdf (Path): The path to the PDF file.
30
+ outpath (Optional[Path], optional): The output directory. If None, the PIL images will be returned instead. Defaults to None.
31
+ dpi (int, optional): The output DPI. Defaults to 96.
32
+ return_pil (bool, optional): Whether to return the PIL images instead of writing them to disk. Defaults to False.
33
+ pages (Optional[List[int]], optional): The pages to rasterize. If None, all pages will be rasterized. Defaults to None.
34
+
35
+ Returns:
36
+ Optional[List[io.BytesIO]]: The rendered pages as in-memory BMP buffers if `return_pil` is True, otherwise None.
37
+ """
38
+ pils = []
39
+ if outpath is None:
40
+ return_pil = True
41
+ try:
42
+ if isinstance(pdf, (str, Path)):
43
+ pdf = pypdfium2.PdfDocument(pdf)
44
+ if pages is None:
45
+ pages = range(len(pdf))
46
+ renderer = pdf.render(
47
+ pypdfium2.PdfBitmap.to_pil,
48
+ page_indices=pages,
49
+ scale=dpi / 72,
50
+ )
51
+ for i, image in zip(pages, renderer):
52
+ if return_pil:
53
+ page_bytes = io.BytesIO()
54
+ image.save(page_bytes, "bmp")
55
+ pils.append(page_bytes)
56
+ else:
57
+ image.save((outpath / ("%02d.png" % (i + 1))), "png")
58
+ except Exception as e:
59
+ logging.error(e)
60
+ if return_pil:
61
+ return pils
62
+
63
+
64
+ if __name__ == "__main__":
65
+ parser = argparse.ArgumentParser()
66
+ parser.add_argument("--pdfs", nargs="+", type=Path, help="PDF files", required=True)
67
+ parser.add_argument("--out", type=Path, help="Output dir", default=None)
68
+ parser.add_argument(
69
+ "--dpi", type=int, default=96, help="What resolution the pages will be saved"
70
+ )
71
+ parser.add_argument(
72
+ "--pages", type=int, nargs="+", default=None, help="list of page numbers"
73
+ )
74
+ args = parser.parse_args()
75
+ if args.pages:
76
+ args.pages = [p - 1 for p in args.pages]
77
+ for pdf_file in tqdm(args.pdfs):
78
+ assert pdf_file.exists() and pdf_file.is_file()
79
+ outpath: Path = args.out or (pdf_file.parent / pdf_file.stem)
80
+ outpath.mkdir(exist_ok=True)
81
+ rasterize_paper(pdf_file, outpath, pages=args.pages, dpi=args.dpi)
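For quick checks, `rasterize_paper` can also be called directly to get the rendered pages in memory instead of PNG files on disk; the path below is a placeholder:

```python
# Sketch: render the first two pages at 96 DPI and load them with PIL.
from pathlib import Path
from PIL import Image
from nougat.dataset.rasterize import rasterize_paper

buffers = rasterize_paper(Path("paper.pdf"), outpath=None, dpi=96, pages=[0, 1])
if buffers:  # each element is an io.BytesIO holding a BMP-encoded page
    img = Image.open(buffers[0])
    print(img.size)
```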
nougat/dataset/split_htmls_to_pages.py ADDED
@@ -0,0 +1,219 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import argparse
8
+ from io import BytesIO
9
+ import multiprocessing
10
+ from pebble import ProcessPool
11
+ from concurrent.futures import TimeoutError
12
+ from tqdm import tqdm
13
+ from typing import Tuple
14
+ import os
15
+ from pathlib import Path
16
+ import logging
17
+ import pypdf
18
+ from PIL import Image
19
+ import pytesseract
20
+ from nougat.dataset.split_md_to_pages import *
21
+ from nougat.dataset.parser.html2md import *
22
+ from nougat.dataset.pdffigures import call_pdffigures
23
+
24
+ logging.basicConfig()
25
+ logger = logging.getLogger()
26
+ logger.setLevel(logging.INFO)
27
+
28
+
29
+ def process_paper(
30
+ fname: str,
31
+ pdf_file: Path,
32
+ html_file: Path,
33
+ json_file: Path,
34
+ args: argparse.Namespace,
35
+ ) -> Tuple[int, int]:
36
+ """
37
+ Process a single paper.
38
+
39
+ Args:
40
+ fname (str): The paper's filename.
41
+ pdf_file (Path): The path to the PDF file.
42
+ html_file (Path): The path to the HTML file.
43
+ json_file (Path): The path to the JSON file containing the extracted figures.
44
+ args (argparse.Namespace): The command-line arguments.
45
+
46
+ Returns:
47
+ Tuple[int, int]: The number of total pages and the number of recognized pages.
48
+ """
49
+ total_pages = 0
50
+ num_recognized_pages = 0
51
+ try:
52
+ pdf = pypdf.PdfReader(pdf_file)
53
+ total_pages = len(pdf.pages)
54
+ outpath: Path = args.out / fname
55
+ # skip this paper if already processed
56
+ dirs_with_same_stem = list(args.out.glob(fname.partition("v")[0] + "*"))
57
+ if (
58
+ len(dirs_with_same_stem) > 0
59
+ and len(list(dirs_with_same_stem[0].iterdir())) > 0
60
+ and not args.recompute
61
+ ):
62
+ logger.info(
63
+ "%s (or another version thereof) already processed. Skipping paper",
64
+ fname,
65
+ )
66
+ return total_pages, len(list(outpath.glob("*.mmd")))
67
+ html = BeautifulSoup(
68
+ htmlmin.minify(
69
+ open(html_file, "r", encoding="utf-8").read().replace("\xa0", " "),
70
+ remove_all_empty_space=True,
71
+ ),
72
+ features="html.parser",
73
+ )
74
+ doc = parse_latexml(html)
75
+ if doc is None:
76
+ return
77
+ out, fig = format_document(doc, keep_refs=True)
78
+
79
+ if args.markdown:
80
+ md_out = args.markdown / (fname + ".mmd")
81
+ with open(md_out, "w", encoding="utf-8") as f:
82
+ f.write(out)
83
+
84
+ if json_file is None:
85
+ json_file = call_pdffigures(pdf_file, args.figure)
86
+ if json_file:
87
+ figure_info = json.load(open(json_file, "r", encoding="utf-8"))
88
+ else:
89
+ figure_info = None
90
+ split = split_markdown(
91
+ out, pdf_file, figure_info=figure_info, doc_fig=fig, min_score=0.9
92
+ )
93
+ if split is None:
94
+ return
95
+ pages, meta = split
96
+ num_recognized_pages = sum([len(p) > 0 for p in pages])
97
+ if all([len(p) == 0 for p in pages]):
98
+ return
99
+ os.makedirs(outpath, exist_ok=True)
100
+ recognized_indices = []
101
+ for i, content in enumerate(pages):
102
+ with (outpath / "meta.json").open("w", encoding="utf-8") as f:
103
+ f.write(json.dumps(meta))
104
+ if content:
105
+ if re.search(r"\[(?:\?\?(?:. )?)+\]", content):
106
+ # there are wrongly parsed references in the page eg [??].
107
+ continue
108
+ with (outpath / ("%02d.mmd" % (i + 1))).open(
109
+ "w", encoding="utf-8"
110
+ ) as f:
111
+ f.write(content)
112
+ recognized_indices.append(i)
113
+ rasterize_paper(pdf_file, outpath, dpi=args.dpi, pages=recognized_indices)
114
+ if args.tesseract:
115
+ for i in recognized_indices:
116
+ ocr = pytesseract.image_to_string(
117
+ Image.open((outpath / ("%02d.png" % (i + 1)))), lang="eng"
118
+ )
119
+ ocr = re.sub(r"\n+\s+?([^\s])", r"\n\n\1", ocr).strip()
120
+ with (outpath / ("%02d_OCR.txt" % (i + 1))).open(
121
+ "w", encoding="utf-8"
122
+ ) as f_ocr:
123
+ f_ocr.write(ocr)
124
+ except Exception as e:
125
+ logger.error(e)
126
+
127
+ return total_pages, num_recognized_pages
128
+
129
+
130
+ def process_htmls(args):
131
+ for input_dir in (args.pdfs, args.html):
132
+ if not input_dir.exists() or not input_dir.is_dir():
133
+ logger.error("%s does not exist or is not a directory.", input_dir)
134
+ return
135
+ htmls: List[Path] = args.html.glob("*.html")
136
+ args.out.mkdir(exist_ok=True)
137
+ if args.markdown:
138
+ args.markdown.mkdir(exist_ok=True)
139
+
140
+ with ProcessPool(max_workers=args.workers) as pool:
141
+ total_pages, total_pages_extracted = 0, 0
142
+ tasks = {}
143
+ for j, html_file in enumerate(htmls):
144
+ fname = html_file.stem
145
+ pdf_file = args.pdfs / (fname + ".pdf")
146
+ if not pdf_file.exists():
147
+ logger.info("%s pdf could not be found.", fname)
148
+ continue
149
+ json_file = args.figure / (fname + ".json")
150
+ if not json_file.exists():
151
+ logger.info("%s figure json could not be found.", fname)
152
+ json_file = None
153
+ tasks[fname] = pool.schedule(
154
+ process_paper,
155
+ args=[fname, pdf_file, html_file, json_file, args],
156
+ timeout=args.timeout,
157
+ )
158
+
159
+ for fname in tqdm(tasks):
160
+ try:
161
+ res = tasks[fname].result()
162
+ if res is None:
163
+ logger.info("%s is faulty", fname)
164
+ continue
165
+ num_pages, num_recognized_pages = res
166
+ total_pages += num_pages
167
+ total_pages_extracted += num_recognized_pages
168
+ logger.info(
169
+ "%s: %i/%i pages recognized. Percentage: %.2f%%",
170
+ fname,
171
+ num_recognized_pages,
172
+ num_pages,
173
+ (100 * num_recognized_pages / max(1, num_pages)),
174
+ )
175
+ except TimeoutError:
176
+ logger.info("%s timed out", fname)
177
+ if total_pages > 0:
178
+ logger.info(
179
+ "In total: %i/%i pages recognized. Percentage: %.2f%%",
180
+ total_pages_extracted,
181
+ total_pages,
182
+ (100 * total_pages_extracted / max(1, total_pages)),
183
+ )
184
+
185
+
186
+ if __name__ == "__main__":
187
+ parser = argparse.ArgumentParser()
188
+ parser.add_argument("--html", type=Path, help="HTML files", required=True)
189
+ parser.add_argument("--pdfs", type=Path, help="PDF files", required=True)
190
+ parser.add_argument("--out", type=Path, help="Output dir", required=True)
191
+ parser.add_argument("--recompute", action="store_true", help="recompute all splits")
192
+ parser.add_argument(
193
+ "--markdown", type=Path, help="Markdown output dir", default=None
194
+ )
195
+ parser.add_argument(
196
+ "--figure",
197
+ type=Path,
198
+ help="Figure info JSON dir",
199
+ )
200
+ parser.add_argument(
201
+ "--workers",
202
+ type=int,
203
+ default=multiprocessing.cpu_count(),
204
+ help="How many processes to use",
205
+ )
206
+ parser.add_argument(
207
+ "--dpi", type=int, default=96, help="What resolution the pages will be saved at"
208
+ )
209
+ parser.add_argument(
210
+ "--timeout", type=float, default=120, help="max time per paper in seconds"
211
+ )
212
+ parser.add_argument(
213
+ "--tesseract",
214
+ action="store_true",
215
+ help="Tesseract OCR prediction for each page",
216
+ )
217
+ args = parser.parse_args()
218
+ print(args)
219
+ process_htmls(args)
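The script above is normally run from the command line, but `process_htmls` can also be driven programmatically by handing it an `argparse.Namespace` with the same fields. All directory names here are placeholders:

```python
# Sketch: programmatic equivalent of running split_htmls_to_pages.py with CLI flags.
import argparse
from pathlib import Path
from nougat.dataset.split_htmls_to_pages import process_htmls

args = argparse.Namespace(
    html=Path("htmls"),         # LaTeXML HTML files, one per paper
    pdfs=Path("pdfs"),          # matching PDFs (same file stem)
    out=Path("pages"),          # per-paper output directories with NN.mmd / NN.png
    markdown=Path("markdown"),  # optional full-document .mmd dumps
    figure=Path("figures"),     # pdffigures2 JSON output
    recompute=False,
    workers=4,
    dpi=96,
    timeout=120.0,
    tesseract=False,
)
process_htmls(args)
```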
nougat/dataset/split_md_to_pages.py ADDED
@@ -0,0 +1,477 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import argparse
8
+ from collections import Counter
9
+ from copy import deepcopy
10
+ import json
11
+ import math
12
+ from operator import itemgetter
13
+ import re
14
+ from typing import Dict, List, Tuple, Union, Optional
15
+ import os
16
+ import pypdf
17
+ from unidecode import unidecode
18
+ import Levenshtein
19
+
20
+ import numpy as np
21
+ from sklearn.feature_extraction.text import CountVectorizer
22
+ from sklearn.feature_extraction.text import TfidfTransformer
23
+ from sklearn.linear_model import SGDClassifier
24
+
25
+ from nougat.dataset.staircase import Staircase
26
+ from nougat.dataset.splitter import (
27
+ Splitter,
28
+ get_first_last,
29
+ get_glob_index,
30
+ )
31
+ from nougat.dataset.utils import unicode_to_latex, remove_pretty_linebreaks
32
+ from nougat.dataset.utils.pdf_text_extract import get_pages, get_paragraphs
33
+ from nougat.dataset.rasterize import rasterize_paper
34
+
35
+
36
+ class BagOfWords:
37
+ """
38
+ A bag-of-words model for text classification.
39
+
40
+ Args:
41
+ sentences (List[str]): The training sentences.
42
+ target (Optional[List[int]]): The target labels for the training sentences. Defaults to None.
43
+
44
+ """
45
+
46
+ def __init__(
47
+ self,
48
+ sentences: List[str],
49
+ target: Optional[List[int]] = None,
50
+ ) -> None:
51
+ self.sentences = sentences
52
+ self.target = target
53
+ self.train()
54
+
55
+ def train(self):
56
+ if self.target is None:
57
+ self.target = np.arange(len(self.sentences))
58
+ self.count_vect = CountVectorizer()
59
+ X_train_counts = self.count_vect.fit_transform(self.sentences)
60
+ self.tfidf_transformer = TfidfTransformer(use_idf=True)
61
+ X_train_tfidf = self.tfidf_transformer.fit_transform(X_train_counts)
62
+ self.clf = SGDClassifier(
63
+ loss="hinge",
64
+ penalty="l2",
65
+ alpha=1e-3,
66
+ random_state=42,
67
+ max_iter=5,
68
+ tol=None,
69
+ )
70
+ self.clf.fit(X_train_tfidf, self.target)
71
+
72
+ def __call__(
73
+ self, text: Union[str, List[str]], lob_probs: bool = False
74
+ ) -> np.ndarray:
75
+ if type(text) == str:
76
+ text = [text]
77
+ X_new_counts = self.count_vect.transform(text)
78
+ X_new_tfidf = self.tfidf_transformer.transform(X_new_counts)
79
+ if lob_probs:
80
+ return self.clf.predict_log_proba(X_new_tfidf)
81
+ else:
82
+ return self.clf.predict(X_new_tfidf)
83
+
84
+
85
+ def remove_short_seqs(seqs: List[str], minimum: int = 10) -> List[str]:
86
+ """Remove sequences shorter than the specified minimum length."""
87
+ out = []
88
+ for seq in seqs:
89
+ if len(seq) > minimum:
90
+ out.append(seq)
91
+ return out
92
+
93
+
94
+ def find_figures(
95
+ pdf_pages: List[List[str]], figure_info: Union[Dict, List]
96
+ ) -> List[Tuple[int, int, int, int]]:
97
+ """ "
98
+ Find the locations of figures in a PDF file.
99
+
100
+ Args:
101
+ pdf_pages (List[List[str]]): The text of the PDF pages.
102
+ figure_info (Union[Dict, List]): A dictionary or list of dictionaries, where each dictionary
103
+ specifies the information about a figure, such as its caption, page number, and bounding box.
104
+
105
+ Returns:
106
+ List[Tuple[int, int, int, int]]: A list of tuples, where each tuple contains the figure index, page number,
107
+ start position, and match length of the figure caption within the page text.
108
+ """
109
+ figure_locations = []
110
+ iterator = figure_info.values() if type(figure_info) == dict else [figure_info]
111
+ for figure_list in iterator:
112
+ for i, f in enumerate(figure_list):
113
+ if "caption" in f:
114
+ fig_string = f["caption"]
115
+ elif "text" in f:
116
+ fig_string = f["text"]
117
+ else:
118
+ continue
119
+ fig_string = unicode_to_latex(fig_string)
120
+ if f["page"] >= len(pdf_pages):
121
+ continue
122
+ block, score = Splitter.fuzzysearch(
123
+ "\n".join(pdf_pages[f["page"]]),
124
+ fig_string,
125
+ )
126
+ if score > 0.8 and block[2] > 0:
127
+ figure_locations.append((i, f["page"], block[0], block[2]))
128
+ return figure_locations
129
+
130
+
131
+ def flatten(l: List) -> List:
132
+ return [item for sublist in l for item in sublist]
133
+
134
+
135
+ def get_doc_text(
136
+ pdf: str,
137
+ splitn: bool = True,
138
+ split_block: bool = True,
139
+ minlen: Optional[int] = 10,
140
+ ) -> List[List[str]]:
141
+ """
142
+ Get the text from a PDF document.
143
+
144
+ Args:
145
+ pdf (str): Path to the PDF document.
146
+ splitn (bool): Whether to split the text into lines. Defaults to True.
147
+ split_block (bool): Whether to split the text into blocks. Defaults to True.
148
+ minlen (Optional[int]): The minimum length of a line or block. Defaults to 10.
149
+
150
+ Returns:
151
+ List[List[str]]: The text of the PDF document, either as a list of lines or a list of blocks.
152
+ """
153
+ document_lines = []
154
+ if split_block:
155
+ pages = get_paragraphs(pdf)
156
+ else:
157
+ pages = [get_pages(pdf)]
158
+ for blocks in pages:
159
+ page_lines = []
160
+ for block in blocks:
161
+ if splitn:
162
+ page_lines.extend(block.split("\n"))
163
+ else:
164
+ page_lines.append(block)
165
+ if splitn:
166
+ page_lines = remove_short_seqs(page_lines, minlen)
167
+ document_lines.append(page_lines)
168
+ return document_lines
169
+
170
+
171
+ def clean_pdf_text(pages: List[List[str]], num_words: int = 10) -> List[List[str]]:
172
+ """
173
+ Clean the text of a PDF document by removing frequent words from the beginning and end of each page.
174
+
175
+ Args:
176
+ pages (List[List[str]]): The text of the PDF document, as a list of lists of strings.
177
+ num_words (int, optional): The number of words to consider at the beginning and end of each page. Defaults to 10.
178
+
179
+ Returns:
180
+ List[List[str]]: The cleaned text of the PDF document.
181
+ """
182
+ words = []
183
+ for page in pages:
184
+ first = get_first_last(
185
+ " ".join(page).lower(), num_words=num_words, first_only=True
186
+ )
187
+ words.extend(first.split(" "))
188
+ word_counts = Counter(words)
189
+ common_words = [
190
+ "the",
191
+ "of",
192
+ "a",
193
+ "and",
194
+ "to",
195
+ "in",
196
+ "is",
197
+ "that",
198
+ "for",
199
+ "are",
200
+ "this",
201
+ "we",
202
+ "figure",
203
+ "fig.",
204
+ "",
205
+ ]
206
+ frequent_words = []
207
+ for w, f in word_counts.items():
208
+ if w in common_words or w.startswith("\\"):
209
+ continue
210
+ if f / len(pages) >= 0.4:
211
+ frequent_words.append(w)
212
+ if len(frequent_words) == 0:
213
+ return pages
214
+ # remove frequent words from page beginning/end
215
+ for i in range(len(pages)):
216
+ page = pages[i]
217
+ stop = 0
218
+ page_num_words = 0
219
+ for p in page:
220
+ page_num_words += len(p.split(" "))
221
+ stop += 1
222
+ if page_num_words >= num_words:
223
+ break
224
+ for w in frequent_words:
225
+ for j in range(stop):
226
+ if w == "-": # probably page number - \d -
227
+ pages[i][j] = re.sub(
228
+ r"-\s*\d{1,3}\s*-", "", pages[i][j], flags=re.IGNORECASE
229
+ )
230
+ pages[i][j] = re.sub(re.escape(w), "", pages[i][j], flags=re.IGNORECASE)
231
+ return pages
232
+
233
+
234
+ def split_markdown(
235
+ doc: str,
236
+ pdf_file: str,
237
+ figure_info: Optional[List[Dict]] = None,
238
+ doc_fig: Dict[str, str] = {},
239
+ minlen: int = 3,
240
+ min_num_words: int = 22,
241
+ doc_paragraph_chars: int = 1000,
242
+ min_score: float = 0.75,
243
+ staircase: bool = True,
244
+ ) -> Tuple[List[str], Dict]:
245
+ """
246
+ Split a PDF document into Markdown paragraphs.
247
+
248
+ Args:
249
+ doc (str): The text of the Markdown document.
250
+ pdf_file (str): Path to the PDF file.
251
+ figure_info (Optional[List[Dict]]): A list of dictionaries, where each dictionary
252
+ specifies the information about a figure, such as its caption, page number, and bounding box.
253
+ doc_fig (Dict[str, str]): A dictionary mapping figure ids to LaTeX code.
254
+ minlen (int): The minimum length of a Markdown paragraph.
255
+ min_num_words: The minimum number of words in a Markdown paragraph.
256
+ doc_paragraph_chars: The maximum number of characters in a Markdown paragraph.
257
+ min_score: The minimum score for a Markdown paragraph to be split.
258
+ staircase: Whether to split the document into paragraphs with a staircase pattern.
259
+
260
+ Returns:
261
+ Tuple[List[str], Dict]: The list of Markdown paragraphs and the metadata.
262
+ """
263
+ pdf = pypdf.PdfReader(pdf_file)
264
+ doc_paragraphs_full: List[str] = doc.split("\n")
265
+ doc_paragraph_lengths = [len(p) for p in doc_paragraphs_full if len(p) > 1]
266
+ num_lines = 1 + int(doc_paragraph_chars / np.mean(doc_paragraph_lengths))
267
+ doc_paragraphs_full = [
268
+ unidecode("\n".join(doc_paragraphs_full[i : i + num_lines]))
269
+ for i in range(0, len(doc_paragraphs_full), num_lines)
270
+ ]
271
+ doc_paragraphs: List[str] = []
272
+ doc_paragraph_indices: List[int] = []
273
+ for i, p in enumerate(doc_paragraphs_full):
274
+ if len(p) > 1:
275
+ doc_paragraphs.append(
276
+ re.sub(r"(\[(FOOTNOTE|FIGURE|TABLE).*?END\2\])", "", p)
277
+ )
278
+ doc_paragraph_indices.append(i)
279
+ meta = {"pdffigures": figure_info}
280
+ if len(pdf.pages) > 1:
281
+ pdf_text = get_doc_text(pdf_file, True, True, minlen)
282
+ pdf_content = [
283
+ [unicode_to_latex(q).replace("\n", " ") for q in p if len(q) >= minlen]
284
+ for p in pdf_text
285
+ ]
286
+
287
+ pdf_content = clean_pdf_text(pdf_content)
288
+ if figure_info is not None:
289
+ figure_locations = sorted(
290
+ find_figures(pdf_content, figure_info), key=itemgetter(2), reverse=True
291
+ )
292
+ clean_pdf_content = deepcopy(pdf_content)
293
+ for i, page_content in enumerate(pdf_content):
294
+ len_sentences = np.cumsum([0] + [len(p) for p in page_content])
295
+ for match in figure_locations:
296
+ _, page, start, len_ = match
297
+ if i != page:
298
+ continue
299
+ a, b = (
300
+ get_glob_index(len_sentences, start),
301
+ get_glob_index(len_sentences, start + len_) + 1,
302
+ )
303
+ for j, k in enumerate(range(a, b + 1)):
304
+ if len(clean_pdf_content[i]) == k:
305
+ break
306
+ if j == 0:
307
+ clean_pdf_content[i][k] = clean_pdf_content[i][k][
308
+ : start - len_sentences[k]
309
+ ]
310
+ elif k == b:
311
+ clean_pdf_content[i][k] = clean_pdf_content[i][k][
312
+ start + len_ - len_sentences[k] :
313
+ ]
314
+ else:
315
+ clean_pdf_content[i][k] = ""
316
+ clean_pdf_content[i] = remove_short_seqs(clean_pdf_content[i], 0)
317
+ pdf_content = clean_pdf_content
318
+ paragraphs = flatten(pdf_content)
319
+ num_paragraphs = np.cumsum([0] + [len(page) for page in pdf_content])
320
+ if staircase:
321
+ # train bag of words
322
+ page_target = np.zeros(len(paragraphs))
323
+ page_target[num_paragraphs[1:-1] - 1] = 1
324
+ page_target = np.cumsum(page_target).astype(int)
325
+ model = BagOfWords(paragraphs, target=page_target)
326
+ labels = model(doc_paragraphs)
327
+
328
+ # fit stair case function
329
+ x = np.arange(len(labels))
330
+ stairs = Staircase(len(labels), labels.max() + 1)
331
+ stairs.fit(x, labels)
332
+ boundaries = (stairs.get_boundaries().astype(int)).tolist()
333
+ boundaries.insert(0, 0)
334
+ else:
335
+ boundaries = [0] * (len(pdf.pages))
336
+ splitter = Splitter(doc_paragraphs)
337
+ pages = [(0, 0, 1.0)]
338
+ meta["first_words"] = []
339
+ meta["last_words"] = []
340
+ for i in range(1, len(boundaries)):
341
+ delta = (
342
+ math.ceil(stairs.uncertainty[i - 1]) + 5
343
+ if staircase
344
+ else len(doc_paragraphs)
345
+ )
346
+ words_f = []
347
+ words_l = []
348
+ for p in pdf_content[i]:
349
+ words_f.extend(p.split(" "))
350
+ if len(words_f) >= min_num_words:
351
+ break
352
+ for p in pdf_content[i - 1][::-1]:
353
+ words_l.extend(p.split(" ")[::-1])
354
+ if len(words_l) >= min_num_words:
355
+ words_l = words_l[::-1]
356
+ break
357
+ if len(words_f) < 2:
358
+ pages.append(pages[-1])
359
+ first_words = " ".join(words_f[:min_num_words]).strip()
360
+ last_words = " ".join(words_l[-min_num_words:]).strip()
361
+ meta["first_words"].append(first_words)
362
+ meta["last_words"].append(last_words)
363
+ if len(first_words) < minlen and len(last_words) < minlen:
364
+ pages.append(pages[-1])
365
+ continue
366
+ pages.append(
367
+ splitter.split_first_last(
368
+ boundaries[i],
369
+ first_words,
370
+ last_words,
371
+ delta=delta,
372
+ )
373
+ )
374
+ elif len(pdf.pages) == 1: # single page
375
+ pages = [(0, 0, 1)]
376
+ else:
377
+ return
378
+ pages.append((len(doc_paragraphs), -1, 1.0))
379
+ out = []
380
+ page_scores = {}
381
+ for i in range(len(pages) - 1):
382
+ score = (pages[i][2] + pages[i + 1][2]) * 0.5
383
+ if score >= min_score:
384
+ end = pages[i + 1][0]
385
+ if end >= len(doc_paragraph_indices):
386
+ end = None
387
+ else:
388
+ end = doc_paragraph_indices[pages[i + 1][0]] + 1
389
+ lines = doc_paragraphs_full[doc_paragraph_indices[pages[i][0]] : end]
390
+ if len(lines) > 0:
391
+ lines[0] = lines[0][pages[i][1] :]
392
+ lines[-1] = lines[-1][: pages[i + 1][1]]
393
+ else:
394
+ lines = []
395
+ page_content = "\n".join(lines)
396
+ page_content = remove_pretty_linebreaks(page_content)
397
+ page_scores[i] = score
398
+ out.append(page_content)
399
+
400
+ meta["page_splits"] = pages
401
+ meta["page_scores"] = page_scores
402
+ meta["num_pages"] = len(pdf.pages)
403
+
404
+ # Reintroduce figures, tables and footnotes
405
+ figure_tex = list(doc_fig.keys()), list(doc_fig.values())
406
+ if len(doc_fig) > 0:
407
+ iterator = figure_info.values() if type(figure_info) == dict else [figure_info]
408
+ for figure_list in iterator:
409
+ if not figure_list:
410
+ continue
411
+ for i, f in enumerate(figure_list):
412
+ if "caption" in f:
413
+ fig_string = f["caption"]
414
+ elif "text" in f:
415
+ fig_string = f["text"]
416
+ else:
417
+ continue
418
+ ratios = []
419
+ for tex in figure_tex[1]:
420
+ if f["figType"] == "Table":
421
+ tex = tex.partition(r"\end{table}")[2]
422
+ ratios.append(Levenshtein.ratio(tex, fig_string))
423
+ k = np.argmax(ratios)
424
+ if ratios[k] < 0.8:
425
+ continue
426
+ if f["page"] < len(out) and out[f["page"]] != "":
427
+ out[f["page"]] += "\n\n" + remove_pretty_linebreaks(
428
+ figure_tex[1][k].strip()
429
+ )
430
+
431
+ for i in range(len(out)):
432
+ foot_match = re.findall(r"\[FOOTNOTE(.*?)\]\[ENDFOOTNOTE\]", out[i])
433
+ for match in foot_match:
434
+ out[i] = out[i].replace(
435
+ "[FOOTNOTE%s][ENDFOOTNOTE]" % match,
436
+ doc_fig.get("FOOTNOTE%s" % match, ""),
437
+ )
438
+
439
+ out[i] = re.sub(r"\[(FIGURE|TABLE)(.*?)\](.*?)\[END\1\]", "", out[i])
440
+ return out, meta
441
+
442
+
443
+ if __name__ == "__main__":
444
+ parser = argparse.ArgumentParser()
445
+ parser.add_argument("--md", type=str, help="Markdown file", required=True)
446
+ parser.add_argument("--pdf", type=str, help="PDF File", required=True)
447
+ parser.add_argument("--out", type=str, help="Out dir", required=True)
448
+ parser.add_argument(
449
+ "--figure",
450
+ type=str,
451
+ help="Figure info JSON",
452
+ )
453
+ parser.add_argument("--dpi", type=int, default=96)
454
+ args = parser.parse_args()
455
+ md = open(args.md, "r", encoding="utf-8").read().replace("\xa0", " ")
456
+ pdf = pypdf.PdfReader(args.pdf)
457
+ try:
458
+ fig_info = json.load(open(args.figure, "r", encoding="utf-8"))
459
+ except FileNotFoundError:
460
+ fig_info = None
461
+ pages, meta = split_markdown(md, pdf, fig_info)
462
+ if args.out:
463
+ outpath = os.path.join(args.out, os.path.basename(args.pdf).partition(".")[0])
464
+ os.makedirs(outpath, exist_ok=True)
465
+ found_pages = []
466
+ for i, content in enumerate(pages):
467
+ if content:
468
+ with open(
469
+ os.path.join(
470
+ outpath, "%02d_s=%.2f.mmd" % (i + 1, meta["page_scores"][i])
471
+ ),
472
+ "w",
473
+ encoding="utf-8",
474
+ ) as f:
475
+ f.write(content)
476
+ found_pages.append(i)
477
+ rasterize_paper(pdf, outpath, dpi=args.dpi, pages=found_pages)
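The splitting strategy in `split_markdown` is: fit a TF-IDF bag-of-words classifier (`BagOfWords` above) on the PDF's own text-layer paragraphs, using each paragraph's page index as its label; predict a page label for every markdown paragraph; then fit a monotonic `Staircase` to those noisy labels to locate the page boundaries. A toy illustration of just the classifier half (the sentences and labels below are invented):

```python
# Toy sketch of the BagOfWords page classifier; not real training data.
import numpy as np
from nougat.dataset.split_md_to_pages import BagOfWords

pdf_paragraphs = [
    "we introduce a neural model for converting documents to markup",  # page 0
    "the encoder operates directly on the rendered page image",        # page 0
    "we evaluate the model on a collection of arxiv papers",           # page 1
    "quantitative results and ablations are reported below",           # page 1
]
page_target = np.array([0, 0, 1, 1])

model = BagOfWords(pdf_paragraphs, target=page_target)
# For a markdown paragraph, the predicted label is the page it most plausibly came from.
print(model(["evaluation results on arxiv papers"]))
```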
nougat/dataset/splitter.py ADDED
@@ -0,0 +1,393 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ from difflib import SequenceMatcher
8
+ from operator import itemgetter
9
+ from typing import List, Tuple, Union
10
+ import re
11
+ import numpy as np
12
+ from Levenshtein.StringMatcher import StringMatcher
13
+ import Levenshtein
14
+ from fuzzysearch import find_near_matches
15
+
16
+ math_start_regex = re.compile(r"(?<!\\)\\[\[\(]", re.M)
17
+ math_end_regex = re.compile(r"(?<!\\)\\[\]\)]", re.M)
18
+
19
+
20
+ def reverse(lst: List[str]) -> List[str]:
21
+ """Reverses a list and the strings inside
22
+
23
+ Args:
24
+ lst (List[str]): List to process
25
+
26
+ Returns:
27
+ List[str]: Reversed list
28
+ """
29
+ out = lst[::-1]
30
+ for i in range(len(out)):
31
+ out[i] = out[i][::-1]
32
+ return out
33
+
34
+
35
+ def get_first_last(
36
+ s: str,
37
+ num_words: int = 8,
38
+ delim: str = " ",
39
+ first_only: bool = False,
40
+ last_only: bool = False,
41
+ ) -> Union[Tuple[str, str], str]:
42
+ """
43
+ Get the first and last `num_words` from a string `s`.
44
+
45
+ Args:
46
+ s (str): The string.
47
+ num_words (int): The number of words.
48
+ delim (str): The delimiter between words.
49
+ first_only (bool): Whether to only get the first `num_words`.
50
+ last_only (bool): Whether to only get the last `num_words`.
51
+
52
+ Returns:
53
+ Union[Tuple[str, str], str]: The first and last `num_words` from `s`, or `s` if `num_words` is 0.
54
+ """
55
+ s = s.split(delim)
56
+ if not first_only and not last_only:
57
+ return delim.join(s[:num_words]), delim.join(s[-num_words:])
58
+ elif first_only:
59
+ return delim.join(s[:num_words])
60
+ elif last_only:
61
+ return delim.join(s[-num_words:])
62
+
63
+
64
+ def get_glob_index(
65
+ lengths: List[int], ind: int, return_breakpoints: bool = False
66
+ ) -> int:
67
+ """returns the index where ind is closest and greater than the lengths"""
68
+ breakpoints = np.cumsum(lengths)
69
+ overlap = breakpoints - ind
70
+ overlap[overlap > 0] = -int(1e5)
71
+ indices = overlap.argmax(0)
72
+ if return_breakpoints:
73
+ return indices, breakpoints
74
+ else:
75
+ return indices
76
+
77
+
78
+ # table-header-figure regex
79
+ # thf_regex = re.compile(r"(\[(FOOTNOTE|FIGURE|TABLE).*?END\2\])")
80
+
81
+
82
+ class Splitter:
83
+ _split_locs: List[Tuple[int, int]] = None
84
+
85
+ def __init__(self, paragraphs: List[str]) -> None:
86
+ self.paragraphs = paragraphs
87
+ self.paragraphs_no_space = [self.remove_special_chars(h) for h in paragraphs]
88
+ self._split_locs = [(0, 0)]
89
+ self.paragraphs_rev = reverse(self.paragraphs)
90
+ self.paragraphs_rev_no_space = reverse(self.paragraphs_no_space)
91
+
92
+ @staticmethod
93
+ def remove_special_chars(string: str) -> str:
94
+ # string = thf_regex.sub(r"", string)
95
+ return (
96
+ string.replace("\\ ", "")
97
+ .replace(" ", "")
98
+ .replace("\n", "")
99
+ .replace("*", "")
100
+ .replace("_", "")
101
+ .replace("^", "")
102
+ .replace("\\[", "")
103
+ .replace("\\]", "")
104
+ .replace("\\(", "")
105
+ .replace("\\)", "")
106
+ .replace("\\right", "")
107
+ .replace("\\left", "")
108
+ .replace("\\sum", "X") # old latex unicode encoding issue
109
+ .replace("{", "")
110
+ .replace("}", "")
111
+ .replace("#", "")
112
+ .replace("[REF]", "")
113
+ .replace("[ENDREF]", "")
114
+ .replace("\\varphi", "\\phi") # https://meta.stackexchange.com/a/349360
115
+ .replace("\\quad", "")
116
+ .replace("\\qquad", "")
117
+ .replace("\\hskip", "")
118
+ .replace("\\vskip", "")
119
+ .replace("\\frac", "")
120
+ .replace("\\rm", "")
121
+ .replace("\\,", "")
122
+ .replace("-", "")
123
+ .lower()
124
+ )
125
+
126
+ @staticmethod
127
+ def count_special_chars(string: str, char_ind: int) -> int:
128
+ if len(string) == 0:
129
+ return 0
130
+ add_space_ind = 0
131
+ while True:
132
+ string_ = string[: char_ind + add_space_ind]
133
+ # last_first = string[: char_ind + add_space_ind+]
134
+ add = (
135
+ string_.count(" ")
136
+ + string_.count("\\ ") * 2
137
+ + string_.count("\n")
138
+ + string_.count("*")
139
+ + string_.count("_")
140
+ + string_.count("^")
141
+ + string_.count("\\[") * 2
142
+ + string_.count("\\]") * 2
143
+ + string_.count("\\(") * 2
144
+ + string_.count("\\)") * 2
145
+ + string_.count("\\right") * 6
146
+ + string_.count("\\left") * 5
147
+ + string_.count("\\sum") * 3 # replaced to X that's why not 4
148
+ + string_.count("{")
149
+ + string_.count("}")
150
+ + string_.count("#")
151
+ + string_.count("[REF]") * 5
152
+ + string_.count("[ENDREF]") * 8
153
+ + string_.count("\\varphi") * 3
154
+ + string_.count("\\quad") * 5
155
+ + string_.count("\\qquad") * 6
156
+ + string_.count("\\hskip") * 6
157
+ + string_.count("\\vskip") * 6
158
+ + string_.count("\\frac") * 5
159
+ + string_.count("\\rm") * 3
160
+ + string_.count("\\,") * 2
161
+ + string_.count("-")
162
+ )
163
+ if add == add_space_ind:
164
+ break
165
+ add_space_ind = add
166
+ if len(string) <= char_ind + add_space_ind:
167
+ add_space_ind = max(0, len(string) - 1 - char_ind)
168
+
169
+ # check first chars of rest if they match closing expressions
170
+ while True:
171
+ rest = string[char_ind + add_space_ind :]
172
+ string_ = string[: char_ind + add_space_ind]
173
+ section_title = re.match(r"#+\s?\d*\s*$", string_)
174
+ if rest.startswith("\\]") or rest.startswith("\\)"):
175
+ add_space_ind += 2
176
+ elif (rest.startswith(")") or rest.startswith("]")) and string_.endswith(
177
+ "\\"
178
+ ):
179
+ add_space_ind += 1
180
+ elif (rest.startswith("(") or rest.startswith("[")) and string_.endswith(
181
+ "\\"
182
+ ):
183
+ add_space_ind -= 1
184
+ elif rest.startswith(" "):
185
+ add_space_ind += 1
186
+ elif section_title:
187
+ add_space_ind -= section_title.end() - section_title.start()
188
+ elif (
189
+ re.match(r"^[^\w\s]*_\s", rest)
190
+ or re.match(r"^[^\w\s]*\*\*?\s", rest)
191
+ or re.match(r"^.\n", rest)
192
+ ):
193
+ add_space_ind += 1
194
+ else:
195
+ break
196
+ # check if it starts in a math env and include everything before
197
+ end = math_end_regex.search(rest)
198
+ if end is not None:
199
+ start = math_start_regex.search(rest)
200
+ if start is None or start.start() > end.start():
201
+ inds = [
202
+ m.start()
203
+ for m in math_start_regex.finditer(string_)
204
+ if m.start() < end.start() + len(string_)
205
+ ]
206
+ if len(inds) > 0:
207
+ add_space_ind = inds[-1] - char_ind
208
+ # assert string_[char_ind+add_space_ind]=='\\'
209
+ return add_space_ind
210
+
211
+ def split_first_last(
212
+ self, index: int, first: str, last: str, delta: int = 5
213
+ ) -> Tuple[int, int, float]:
214
+ """Refines a split by looking at both the first words from a new page and the last words from the previous page.
215
+
216
+ Args:
217
+ index (int): paragraph index
218
+ first (str): first words
219
+ last (str): last words
220
+ delta (int, optional): paragraph search radius. Defaults to 5.
221
+
222
+ Returns:
223
+ Tuple[int, int, float]: split prediction
224
+ """
225
+ if first:
226
+ first_split = glob_f, char_f, score_f = self.split(
227
+ index, first, delta=delta
228
+ )
229
+ if last:
230
+ last_split = glob_l, char_l, score_l = self.split(
231
+ index, last, delta=delta, reverse=True
232
+ )
233
+ if first and not last:
234
+ return first_split
235
+ elif not first and last:
236
+ return last_split
237
+ elif not first and not last:
238
+ return index, 0, 0.0
239
+ if char_f == char_l and glob_f == glob_l and (score_f > 0.5 or score_l > 0.5):
240
+ return glob_l, char_l, 1.0
241
+
242
+ # score calculation
243
+ first, last = self.remove_special_chars(first), self.remove_special_chars(last)
244
+ matching = []
245
+ for split in (first_split, last_split):
246
+ first_source = []
247
+ num_chars_first = len(first)
248
+ num_chars_last = len(last)
249
+ last_source = []
250
+ for i, p in enumerate(self.paragraphs[split[0] :]):
251
+ if i == 0:
252
+ p = p[split[1] :]
253
+ first_source.append(self.remove_special_chars(p))
254
+ if sum([len(s) for s in first_source]) >= num_chars_first:
255
+ break
256
+ first_source = "".join(first_source)[:num_chars_first]
257
+ for i, p in enumerate(self.paragraphs[split[0] :: -1]):
258
+ if i == 0:
259
+ p = p[: split[1]]
260
+ last_source.insert(0, self.remove_special_chars(p))
261
+ if sum([len(s) for s in last_source]) >= num_chars_last:
262
+ last_source = last_source
263
+ break
264
+ last_source = "".join(last_source)[-num_chars_last:]
265
+ matching.append(
266
+ [
267
+ Levenshtein.ratio(first, first_source)
268
+ * Levenshtein.ratio(first[:10], first_source[:10]),
269
+ Levenshtein.ratio(last, last_source)
270
+ * Levenshtein.ratio(last[-10:], last_source[-10:]),
271
+ ]
272
+ )
273
+ scores = np.asarray(matching).max(0)
274
+ return (
275
+ (glob_l, char_l, scores[1])
276
+ if scores.argmax()
277
+ else (glob_f, char_f, scores[0])
278
+ )
279
+
280
+ def split(
281
+ self, index: int, string: str, delta: int = 5, reverse: bool = False
282
+ ) -> Tuple[int, int, float]:
283
+ """
284
+ Refine a split prediction. `string` holds the first words of the new page.
285
+ `delta` is the paragraph search radius and doubles as an uncertainty measure.
286
+ Returns the refined paragraph index, the character split position and a score.
287
+ """
288
+ if reverse:
289
+ index = len(self.paragraphs) - 1 - index
290
+ string = string[::-1]
291
+ paragraphs = self.paragraphs_rev
292
+ paragraphs_no_space = self.paragraphs_rev_no_space
293
+ else:
294
+ paragraphs = self.paragraphs
295
+ paragraphs_no_space = self.paragraphs_no_space
296
+
297
+ string_ = self.remove_special_chars(string)
298
+ start_ind = max(0, index - delta)
299
+ search_corpus = paragraphs_no_space[start_ind : index + delta + 1]
300
+ lengths = np.asarray([0] + [len(p) for p in search_corpus])
301
+ corp = "".join(search_corpus)
302
+ if len(corp) == 0:
303
+ self._split_locs.append((index, 0))
304
+ return index, 0, 1
305
+ ind, score = self._find_match(corp, string_)
306
+ indices, breakpoints = get_glob_index(lengths, ind, True)
307
+ global_ind, char_ind = int(start_ind + indices), int(ind - breakpoints[indices])
308
+ self._split_locs.append((global_ind, char_ind))
309
+ if reverse:
310
+ char_ind = len(paragraphs_no_space[global_ind]) - char_ind
311
+ global_ind = len(paragraphs) - global_ind - 1
312
+ add_space_ind = self.count_special_chars(self.paragraphs[global_ind], char_ind)
313
+ return global_ind, char_ind + add_space_ind, score
314
+
315
+ def _find_match(
316
+ self, corp: str, key: str, get_start: bool = True
317
+ ) -> Tuple[int, float]:
318
+ block, score = self._fuzzy(corp, key)
319
+ index = max(0, block[0])
320
+ if not get_start:
321
+ index += block[2]
322
+ return index, score
323
+
324
+ @staticmethod
325
+ def _fuzzy(
326
+ corpus: str, string: str, max_error_rate: float = 0.025
327
+ ) -> Tuple[Tuple[int, int, int], float]:
328
+ max_dist = min(len(string) - 1, int(len(string) * min(0.9, max_error_rate)) + 5)
329
+ matches = find_near_matches(string, corpus, max_l_dist=max_dist)
330
+ if len(matches) > 0 and max_dist > 0:
331
+ match = min(matches, key=lambda x: x.dist)
332
+ block = (match.start, 0, match.end - match.start)
333
+ score = 1 - match.dist / max_dist
334
+ return block, score
335
+ return (0, 0, 0), 0
336
+
337
+ @staticmethod
338
+ def fuzzysearch(
339
+ corpus: str, string: str, max_error_rate: float = 0.025
340
+ ) -> Tuple[Tuple[int, int, int], float]:
341
+ corpus_ = Splitter.remove_special_chars(corpus)
342
+ string_ = Splitter.remove_special_chars(string)
343
+ (start, _, dist), score = Splitter._fuzzy(
344
+ corpus_, string_, max_error_rate=max_error_rate
345
+ )
346
+ end = Splitter.count_special_chars(corpus, start + dist) + start + dist
347
+ start = start + Splitter.count_special_chars(corpus, start)
348
+ return (start, _, end - start), score
349
+
350
+ @staticmethod
351
+ def oldfuzz(corpus, string):
352
+ res = []
353
+ for Matcher in [StringMatcher, SequenceMatcher]:
354
+ m = Matcher(None, corpus, string)
355
+ blocks = m.get_matching_blocks()
356
+ scores = []
357
+ for i, block in enumerate(blocks):
358
+ m2 = Matcher(
359
+ None,
360
+ corpus[block[0] : block[0] + max(block[2], len(string))],
361
+ string,
362
+ )
363
+ r = m2.ratio()
364
+ if r > 0.995:
365
+ return blocks[i], r
366
+ else:
367
+ scores.append(r)
368
+ ind = np.argmax(scores)
369
+ res.append((blocks[ind], scores[ind]))
370
+ return max(res, key=itemgetter(1))
371
+
372
+ def evaluate_split(self, page_num: int, page_content: str) -> float:
373
+ if page_num > len(self._split_locs) or page_num < 1:
374
+ return 0
375
+ page_content = self.remove_special_chars(page_content)
376
+ if page_num == len(self._split_locs):
377
+ start, end = self._split_locs[-1], (-1, -1)
378
+ else:
379
+ start, end = self._split_locs[page_num - 1], self._split_locs[page_num]
380
+ if (end[0] + 1) - start[0] < 0:
381
+ return 0
382
+ doc_content = self.paragraphs_no_space[start[0] : (end[0] + 1) or None]
383
+ if (
384
+ len(doc_content) < 1
385
+ or len(doc_content[0]) < start[1]
386
+ or len(doc_content[-1]) < end[1]
387
+ ):
388
+ return 0
389
+ doc_content[0] = doc_content[0][start[1] :]
390
+ doc_content[-1] = doc_content[-1][: end[1]]
391
+ doc_content = "".join(doc_content)
392
+ match = StringMatcher(None, page_content, doc_content).ratio()
393
+ return match
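A usage sketch for the class above (assuming this repo and its `fuzzysearch`/`Levenshtein` dependencies are installed; the strings are invented for illustration):

from nougat.dataset.splitter import Splitter

corpus = "We evaluate the model on held-out documents from arXiv."
query = "evaluate the model on held out documents"

# normalization used for matching: spaces, markup and hyphens are stripped
print(Splitter.remove_special_chars(query))   # 'evaluatethemodelonheldoutdocuments'

# fuzzy-locate the query inside the corpus; returns a (start, _, length) block and a score
(start, _, length), score = Splitter.fuzzysearch(corpus, query)
print(corpus[start:start + length], round(score, 2))
# should print roughly 'evaluate the model on held-out documents' and a score near 1.0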
nougat/dataset/staircase.py ADDED
@@ -0,0 +1,314 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ from collections import deque
8
+ import operator
9
+ import itertools
10
+ from typing import Optional, List, Tuple
11
+ import numpy as np
12
+ import warnings
13
+
14
+ warnings.filterwarnings("ignore", message="All-NaN slice encountered")
15
+
16
+
17
+ def stair_func(x: np.ndarray, thresholds: np.ndarray) -> np.ndarray:
18
+ return np.heaviside(x[:, None] - np.floor(thresholds)[None, :], 0).sum(1)
19
+
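A minimal, self-contained check of the step function above (the values are chosen for illustration):

import numpy as np

def stair_func(x, thresholds):
    # count how many (floored) thresholds lie at or below each x -> class index
    return np.heaviside(x[:, None] - np.floor(thresholds)[None, :], 0).sum(1)

x = np.array([0.0, 3.0, 6.0])
thresholds = np.array([2.5, 5.5])   # two boundaries -> three classes (pages)
print(stair_func(x, thresholds))    # [0. 1. 2.]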
20
+
21
+ def compute_gini(labels: np.ndarray) -> float:
22
+ N = len(labels)
23
+ if N == 0:
24
+ return 0
25
+ G = N - np.square(np.bincount(labels)).sum() / N
26
+ return G
27
+
28
+
29
+ def compute_binary_gini(labels: np.ndarray) -> float:
30
+ N = len(labels)
31
+ if N == 0:
32
+ return 0
33
+ G = N - labels.sum() ** 2 / N
34
+ return G
35
+
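A quick sanity check of the (unnormalized) Gini impurity used above: a pure label set scores 0, a maximally mixed one scores highest.

import numpy as np

def compute_gini(labels):
    N = len(labels)
    if N == 0:
        return 0
    return N - np.square(np.bincount(labels)).sum() / N

print(compute_gini(np.array([0, 0, 0, 0])))   # 0.0
print(compute_gini(np.array([0, 0, 1, 1])))   # 2.0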
36
+
37
+ def gini_impurity(
38
+ thresholds: np.ndarray,
39
+ data: np.ndarray,
40
+ labels: np.ndarray,
41
+ classes: Optional[List[int]] = None,
42
+ reduction: Optional[str] = "sum",
43
+ padded: bool = True,
44
+ ) -> float:
45
+ """
46
+ Calculate the Gini impurity of a dataset split on a set of thresholds.
47
+
48
+ Args:
49
+ thresholds (np.ndarray): The thresholds to split the data on.
50
+ data (np.ndarray): The data to split.
51
+ labels (np.ndarray): The labels for the data.
52
+ classes (Optional[List[int]]): The classes to consider. If None, all classes are used.
53
+ reduction (Optional[str]): The reduction to apply to the impurity. One of "none", "sum", or "mean".
54
+ padded (bool): Whether the thresholds already include the outer boundaries `[-0.5, data.max() + 0.5]`; if False, they are inserted first.
55
+
56
+ Returns:
57
+ float: The Gini impurity.
58
+ """
59
+ G = []
60
+ if not padded:
61
+ thresholds = np.insert(
62
+ thresholds, [0, len(thresholds)], [-0.5, data.max() + 0.5]
63
+ )
64
+ if classes is None:
65
+ classes = np.arange(len(thresholds) - 1)
66
+ else:
67
+ classes = np.asarray(classes)
68
+ if data.ndim == 1:
69
+ data = np.expand_dims(data, 0)
70
+ masks = np.logical_and(
71
+ data > thresholds[classes, None],
72
+ data <= thresholds[classes + 1, None],
73
+ )
74
+ for i, c in enumerate(classes):
75
+ G.append(compute_binary_gini(np.where(labels[masks[i]] == c, 1, 0)))
76
+
77
+ if reduction is None or reduction == "none":
78
+ return G
79
+ elif reduction == "sum":
80
+ return sum(G)
81
+ elif reduction == "mean":
82
+ return sum(G) / len(G)
83
+ else:
84
+ raise NotImplementedError
85
+
86
+
87
+ def step_impurity(
88
+ thresholds,
89
+ data: np.ndarray,
90
+ labels: np.ndarray,
91
+ classes: Optional[List[int]] = None,
92
+ ) -> float:
93
+ """
94
+ Calculate the step-wise Gini impurity of a dataset split on a set of thresholds.
95
+
96
+ Args:
97
+ thresholds (np.ndarray): The thresholds to split the data on.
98
+ data (np.ndarray): The data to split.
99
+ labels (np.ndarray): The labels for the data.
100
+ classes (Optional[List[int]]): The classes to consider. If None, all classes are used.
101
+
102
+ Returns:
103
+ float: The step-wise Gini impurity.
104
+ """
105
+ G = gini_impurity(thresholds, data, labels, reduction=None, classes=classes)
106
+ out = []
107
+ for i in range(len(G) - 1):
108
+ out.append(G[i] + G[i + 1])
109
+ return out
110
+
111
+
112
+ class PaddedArray:
113
+ """
114
+ A wrapper class for an array that allows for relative indexing.
115
+
116
+ Args:
117
+ array (np.ndarray): The array to wrap.
118
+ range (Optional[Tuple[int, int]]): The range of the array to expose. Defaults to (1, -1).
119
+ """
120
+
121
+ def __init__(
122
+ self, array: np.ndarray, range: Optional[Tuple[int, int]] = (1, -1)
123
+ ) -> None:
124
+ self.array = array
125
+ mi, ma = range
126
+ assert ma <= 0, "relative assignment only"
127
+ self.range = mi, ma
128
+
129
+ def __len__(self):
130
+ return len(self.array) + self.range[1] - self.range[0]
131
+
132
+ def _process_index(self, index):
133
+ if isinstance(index, slice):
134
+ index = slice(
135
+ (index.start or 0) + self.range[0],
136
+ self.range[0] + (len(self) if index.stop is None else index.stop),
137
+ index.step,
138
+ )
139
+ if index.stop > len(self.array):
140
+ raise IndexError
141
+ else:
142
+ index = index + self.range[0]
143
+ if index > len(self):
144
+ raise IndexError
145
+ return index
146
+
147
+ def __getitem__(self, index):
148
+ index = self._process_index(index)
149
+ return self.array[index]
150
+
151
+ def __setitem__(self, index, value):
152
+ self.array[self._process_index(index)] = value
153
+
154
+ def copy(self):
155
+ return PaddedArray(self.array.copy(), self.range)
156
+
157
+ def toarray(self):
158
+ return self.array[self.range[0] : self.range[1]]
159
+
160
+
161
+ class Staircase:
162
+ """
163
+ A class for learning a staircase decision tree.
164
+
165
+ Args:
166
+ domain: The number of points in the domain.
167
+ n_classes: The number of classes.
168
+ """
169
+
170
+ def __init__(self, domain: int, n_classes: int) -> None:
171
+ self.domain = domain
172
+ self.classes = n_classes
173
+ assert domain > 0
174
+ assert n_classes > 0
175
+ self.thresholds = self._back_thres = self._forward_thres = np.linspace(
176
+ domain / n_classes, domain, n_classes - 1, endpoint=False
177
+ )
178
+ self.uncertainty = np.zeros_like(self.thresholds)
179
+
180
+ def statistic_fit(
181
+ self,
182
+ data: np.ndarray,
183
+ labels: np.ndarray,
184
+ ):
185
+ """
186
+ Fit statistical thresholds for anomaly detection.
187
+
188
+ This method fits statistical thresholds for anomaly detection based on input data and labels.
189
+
190
+ Args:
191
+ data (np.ndarray): The input data.
192
+ labels (np.ndarray): The labels corresponding to the data.
193
+
194
+ Note:
195
+ This method modifies the internal state of the object to set statistical thresholds.
196
+ """
197
+ onehot = np.eye(self.classes)[labels.reshape(-1)]
198
+ onehot.reshape(list(labels.shape) + [self.classes])
199
+ k = onehot * data.T.repeat(self.classes, 1)
200
+ k[k == 0] = np.nan
201
+ med = np.nanmedian(k, 0)
202
+ for i in range(len(med)):
203
+ if med[i] != med[i]:
204
+ med[i] = 0 if i == 0 else med[i - 1]
205
+ mad = 5 * np.nan_to_num(
206
+ np.nanmedian(np.absolute(k - np.nanmedian(k, 0)), 0),
207
+ nan=self.domain / self.classes / 2,
208
+ )
209
+ arr = np.vstack(((med - mad)[:-1], (med + mad)[1:]))
210
+ self._forward_thres[:] = arr.max(0)
211
+ self._back_thres[:] = arr.min(0)
212
+
213
+ self._stat_forward = self._forward_thres.copy()
214
+ self._stat_back = self._back_thres.copy()
215
+
216
+ def fit(
217
+ self,
218
+ data: np.ndarray,
219
+ labels: np.ndarray,
220
+ early_stop_after: int = 10,
221
+ fixed: bool = True,
222
+ ) -> None:
223
+ """
224
+ Fit the staircase thresholds to the data.
225
+
226
+ This method refines the class boundaries by minimizing the step-wise Gini impurity of the labels along the data axis, starting from the statistical estimate.
227
+
228
+ Args:
229
+ data (np.ndarray): The input data.
230
+ labels (np.ndarray): The labels corresponding to the data.
231
+ early_stop_after (int, optional): The number of consecutive early stops to consider. Default is 10.
232
+ fixed (bool, optional): Whether to use fixed thresholds. Default is True.
233
+
234
+ Note:
235
+ This method modifies the internal state of the object to set statistical thresholds.
236
+ """
237
+ assert data.ndim == 1
238
+ assert labels.ndim <= 2
239
+ if self.classes == 1:
240
+ self.thresholds = np.array([0.5 + data.max()])
241
+ self.uncertainty = np.zeros_like(self.thresholds)
242
+ if data.ndim == 1:
243
+ data = np.expand_dims(data, 0)
244
+ thresholds = PaddedArray(
245
+ np.insert(
246
+ np.arange(self.domain - self.classes + 1, self.domain) - 1,
247
+ [0, self.classes - 1],
248
+ [-0.5, self.domain + 0.5],
249
+ ).astype(int)
250
+ )
251
+ self._back_thres = thresholds.copy()
252
+ self._forward_thres = thresholds.copy()
253
+ self.statistic_fit(data, labels)
254
+ last = -0.5
255
+ for n in range(self.classes):
256
+ G = np.inf
257
+ Gis = deque([], early_stop_after)
258
+ # forward pass
259
+ if n < self.classes - 1:
260
+ new_forward_n: float = self._forward_thres[n]
261
+ for i in range(
262
+ max(0, self._back_thres[n - 1]) if n - 1 >= 0 else int(last),
263
+ min(self.domain, self._forward_thres[n + 1])
264
+ if n + 2 < self.classes
265
+ else self.domain - 1,
266
+ ):
267
+ thresholds.array[n + 1] = i + 0.5
268
+ Gi = step_impurity(
269
+ thresholds.array, data, labels, classes=[n, n + 1]
270
+ )[0]
271
+ Gis.append(Gi)
272
+ if Gi <= G:
273
+ last = i + 0.5
274
+ new_forward_n = last
275
+ G = Gi
276
+ elif (
277
+ (not fixed or i - last > self.domain / self.classes)
278
+ and len(Gis) == early_stop_after
279
+ and all(
280
+ itertools.starmap(
281
+ operator.ge,
282
+ zip(Gis, itertools.islice(Gis, 1, early_stop_after)),
283
+ )
284
+ )
285
+ ):
286
+ break
287
+ thresholds.array[n + 1] = new_forward_n
288
+ self._forward_thres.array[n + 1] = new_forward_n
289
+ self._back_thres.array[n + 1] = new_forward_n
290
+ G = np.inf
291
+ self._forward_thres = self._forward_thres.toarray().clip(
292
+ min=0, max=self.domain - 1
293
+ )
294
+ self._back_thres = self._back_thres.toarray().clip(min=0, max=self.domain - 1)
295
+ self.thresholds = (self._forward_thres + self._back_thres) / 2
296
+ self.uncertainty = np.abs(self._forward_thres - self._back_thres) / 2
297
+
298
+ @property
299
+ def score(self):
300
+ try:
301
+ return gini_impurity(self.thresholds, self._data, self._labels) / len(
302
+ self._data
303
+ )
304
+ except AttributeError:
305
+ return np.inf
306
+
307
+ def predict(self, x: np.ndarray) -> np.ndarray:
308
+ return stair_func(x, self.get_boundaries())
309
+
310
+ def __call__(self, *args):
311
+ return self.predict(*args)
312
+
313
+ def get_boundaries(self) -> np.ndarray:
314
+ return self.thresholds.astype(int).clip(min=0, max=self.domain - 1) + 0.5
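A rough end-to-end sketch of how the fitter above might be driven with synthetic, monotone page labels (assuming this repo and its dependencies are installed; the numbers are illustrative and the exact fitted boundaries depend on the data):

import numpy as np
from nougat.dataset.staircase import Staircase

# 10 text positions belonging to 3 consecutive pages
data = np.arange(10)
labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2])

sc = Staircase(domain=10, n_classes=3)
sc.fit(data, labels)
print(sc.get_boundaries())               # fitted page boundaries, expected near the label changes
print(sc.predict(np.array([1, 4, 8])))   # predicted page index per position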
nougat/dataset/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nougat/dataset/utils/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ from nougat.dataset.utils.latex_conversion import *
8
+ from nougat.dataset.utils.utils import *
nougat/dataset/utils/latex_conversion.py ADDED
@@ -0,0 +1,146 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import re
8
+ from pylatexenc.latexencode import UnicodeToLatexEncoder
9
+ from pylatexenc.latex2text import LatexNodes2Text
10
+ from unidecode import unidecode
11
+
12
+ syn = [
13
+ ("\\rbrack ", "] "),
14
+ ("\\lbrack ", "[ "),
15
+ ("\\lbrace ", "\\} "),
16
+ ("\\rbrace ", "\\{ "),
17
+ ("\\lnot ", "\\neg "),
18
+ ("\\land ", "\\wedge "),
19
+ ("\\vee ", "\\lor "),
20
+ ("\\doublecup ", "\\Cup "),
21
+ ("\\doublecap ", "\\Cap "),
22
+ ("\\llless ", "\\lll "),
23
+ ("\\gggtr ", "\\ggg "),
24
+ ("\\doteqdot ", "\\Doteq "),
25
+ ("\\ne ", "\\neq "),
26
+ ("\\le ", "\\leq "),
27
+ ("\\ge ", "\\geq "),
28
+ ("\\leftarrow ", "\\gets "),
29
+ ("\\rightarrow ", "\\to "),
30
+ ("\\restriction ", "\\upharpoonright "),
31
+ ("\\owns ", "\\ni "),
32
+ ("\\textlnot ", "\\neg "),
33
+ ("\\textellipsis ", "\\ldots "),
34
+ ("\\textbullet ", "\\bullet "),
35
+ ("\\plusmn ", "\\pm "),
36
+ ("\\texttimes", "\\times"),
37
+ ("\\textmu", "\\mu"),
38
+ ("\\textendash", "-"),
39
+ ("\\textemdash", "---"),
40
+ ("\\>", "\\:"),
41
+ ("\\medspace", "\\:"),
42
+ ("\\thinspace", "\\,"),
43
+ ("\\negthinspace", "\\!"),
44
+ ("\\thickspace", "\\;"),
45
+ ]
46
+ umlaut_mapping = {
47
+ "textasciicircum": "^",
48
+ "ddot": '"',
49
+ "textasciidieresis": '"',
50
+ "textasciicaron": "v ",
51
+ }
52
+ umlaut_keys = "|".join(umlaut_mapping.keys())
53
+ umlaut_regex = re.compile(rf"\s?\\({umlaut_keys})\s(\w)")
54
+ latex_comments = re.compile(r"(?<!\\)%.*\n")
55
+ toascii = UnicodeToLatexEncoder(
56
+ non_ascii_only=True, unknown_char_policy="ignore", unknown_char_warning=False
57
+ )
58
+
59
+
60
+ def remove_style(string: str) -> str:
61
+ return (
62
+ string.replace("\\displaystyle", "")
63
+ .replace("\\scriptstyle", "")
64
+ .replace("\\scriptscriptstyle", "")
65
+ .replace("\\textstyle", "")
66
+ )
67
+
68
+
69
+ def replace_duplicate_definitions(string: str) -> str:
70
+ """In Latex there are many commands that are interchangeable. Use just one of them"""
71
+ for pair in syn:
72
+ string = string.replace(pair[0], pair[1])
73
+ return string
74
+
75
+
76
+ def unicode_to_latex(s: str) -> str:
77
+ s = re.sub(
78
+ r"\s{2,}",
79
+ " ",
80
+ re.sub(
81
+ r"\\ensuremath\s?\{\s?(.+?)\s?\}\s?",
82
+ r" \1 ",
83
+ toascii.unicode_to_latex(s.strip()),
84
+ )
85
+ .replace("}", " ")
86
+ .replace("{", " "),
87
+ )
88
+ s = (
89
+ s.strip()
90
+ .replace(
91
+ "\\textperiodcentered \\textperiodcentered \\textperiodcentered", "\\cdots"
92
+ )
93
+ .replace("\\textperiodcentered", "\\cdot")
94
+ .replace("\\textquoteright", "'")
95
+ .replace("\\textquoteleft", "'")
96
+ .replace("\\textquotedblleft", '"')
97
+ .replace("\\textquotedblright", '"')
98
+ )
99
+ s = umlaut_regex.sub(lambda x: "\\" + umlaut_mapping[x.group(1)] + x.group(2), s)
100
+ s = replace_duplicate_definitions(s)
101
+ s = unidecode(s)
102
+ return s.replace("\u2009", " ")
103
+
104
+
105
+ latex_to_unicode = LatexNodes2Text()
106
+
107
+
108
+ def remove_line_breaks(string: str) -> str:
109
+ string = latex_comments.sub("\n", string)
110
+ return string.replace("\n", " ")
111
+
112
+
113
+ def normalize_tex(math: str, inline: bool) -> str:
114
+ """
115
+ Normalize TeX math expressions.
116
+
117
+ This function takes a TeX math expression and performs various normalization steps to ensure
118
+ consistency and proper formatting.
119
+
120
+ Args:
121
+ math (str): The input TeX math expression.
122
+ inline (bool): Indicates whether the expression should be inline (True) or displayed (False).
123
+
124
+ Returns:
125
+ str: The normalized TeX math expression.
126
+ """
127
+ math = math.strip()
128
+ if not math:
129
+ return ""
130
+ if math.startswith(r"\(") or math.startswith(r"\[") or math.startswith("$$"):
131
+ math = math[2:]
132
+ elif math.startswith("$"):
133
+ math = math[1:]
134
+ if math.endswith(r"\)") or math.endswith(r"\]") or math.endswith("$$"):
135
+ math = math[:-2]
136
+ elif math.endswith("$"):
137
+ math = math[:-1]
138
+ math = math.strip()
139
+ if not math:
140
+ return ""
141
+ math = remove_line_breaks(math.strip())
142
+ math = replace_duplicate_definitions(math)
143
+ math = remove_style(math)
144
+ if inline:
145
+ return rf"\({math}\)"
146
+ return rf"\[{math}\]"
nougat/dataset/utils/pdf_text_extract.py ADDED
@@ -0,0 +1,86 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ from io import StringIO
8
+ from typing import List
9
+ import re
10
+ from pdfminer.converter import TextConverter
11
+ from pdfminer.layout import LAParams
12
+ from pdfminer.pdfdocument import PDFDocument
13
+ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
14
+ from pdfminer.pdfpage import PDFPage
15
+ from pdfminer.pdfparser import PDFParser
16
+
17
+
18
+ def replace_ligatures(text: str) -> str:
19
+ ligatures = {
20
+ "ff": "ff",
21
+ "fi": "fi",
22
+ "fl": "fl",
23
+ "ffi": "ffi",
24
+ "ffl": "ffl",
25
+ "ſt": "ft",
26
+ "st": "st",
27
+ # "Ꜳ": "AA",
28
+ # "Æ": "AE",
29
+ "ꜳ": "aa",
30
+ }
31
+ for search, replace in ligatures.items():
32
+ text = text.replace(search, replace)
33
+ return text
34
+
35
+
36
+ def remove_hyphens(text: str) -> str:
37
+ """
38
+
39
+ This fails for:
40
+ * Natural dashes: well-known, self-replication, use-cases, non-semantic,
41
+ Post-processing, Window-wise, viewpoint-dependent
42
+ * Trailing math operands: 2 - 4
43
+ * Names: Lopez-Ferreras, VGG-19, CIFAR-100
44
+ """
45
+ lines = [line.rstrip() for line in text.split("\n")]
46
+
47
+ # Find dashes
48
+ line_numbers = []
49
+ for line_no, line in enumerate(lines[:-1]):
50
+ if line.endswith("-"):
51
+ line_numbers.append(line_no)
52
+
53
+ # Replace
54
+ for line_no in line_numbers:
55
+ lines = dehyphenate(lines, line_no)
56
+ return "\n".join(lines)
57
+
58
+
59
+ def dehyphenate(lines: List[str], line_no: int) -> List[str]:
60
+ next_line = lines[line_no + 1]
61
+ word_suffix = next_line.split(" ")[0]
62
+
63
+ lines[line_no] = lines[line_no][:-1] + word_suffix
64
+ lines[line_no + 1] = lines[line_no + 1][len(word_suffix) :]
65
+ return lines
66
+
67
+
68
+ def get_pages(pdf: str) -> List[str]:
69
+ out = []
70
+ with open(pdf, "rb") as in_file:
71
+ parser = PDFParser(in_file)
72
+ doc = PDFDocument(parser)
73
+ rsrcmgr = PDFResourceManager()
74
+
75
+ for page in PDFPage.create_pages(doc):
76
+ output_string = StringIO()
77
+ device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
78
+ interpreter = PDFPageInterpreter(rsrcmgr, device)
79
+ interpreter.process_page(page)
80
+ out.append(remove_hyphens(replace_ligatures(output_string.getvalue())))
81
+ return out
82
+
83
+
84
+ def get_paragraphs(pdf: str) -> List[List[str]]:
85
+ pages = get_pages(pdf)
86
+ return [re.sub(r"\n{3,}", "\n\n", txt).split("\n\n") for txt in pages]
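The dehyphenation step in isolation (this simply re-uses `dehyphenate` from above on a made-up pair of lines):

def dehyphenate(lines, line_no):
    # move the wrapped word fragment from the next line back onto the hyphenated line
    word_suffix = lines[line_no + 1].split(" ")[0]
    lines[line_no] = lines[line_no][:-1] + word_suffix
    lines[line_no + 1] = lines[line_no + 1][len(word_suffix):]
    return lines

lines = ["The trans-", "former architecture is used."]
print(dehyphenate(lines, 0))
# ['The transformer', ' architecture is used.']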
nougat/dataset/utils/utils.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import re
8
+
9
+
10
+ def remove_pretty_linebreaks(string: str) -> str:
11
+ """replaces linebreaks with spaces when there would be no
12
+ difference between them in the markdown format
13
+
14
+ Args:
15
+ string (str): String to process
16
+
17
+ Returns:
18
+ str: Formatted string
19
+ """
20
+ return re.sub(r"(?<!\n)\n([^\n\d\*#\[])", r" \1", string).strip()
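A self-contained illustration of the regex above: soft line wraps are joined, while blank lines and list items are preserved.

import re

def remove_pretty_linebreaks(string):
    return re.sub(r"(?<!\n)\n([^\n\d\*#\[])", r" \1", string).strip()

text = "This sentence is wrapped\nacross two lines.\n\n* this list item is kept"
print(remove_pretty_linebreaks(text))
# This sentence is wrapped across two lines.
#
# * this list item is kept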
nougat/metrics.py ADDED
@@ -0,0 +1,117 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import argparse
8
+ from multiprocessing import Pool
9
+ import re
10
+ from pathlib import Path
11
+ from collections import defaultdict
12
+ from typing import List
13
+
14
+ import numpy as np
15
+
16
+ import nltk
17
+ from nltk import edit_distance
18
+ from tqdm import tqdm
19
+
20
+ import orjson
21
+
22
+ inline_reg = re.compile(r"\\\((.*?)(?<!\\)\\\)")
23
+ display_reg = re.compile(r"\\\[(.+?)(?<!\\)\\\]")
24
+ table_reg = re.compile(r"\\begin\{tabular\}(.+?)(?:\\end\{tabular\}|$)", re.S)
25
+
26
+
27
+ def compute_metrics(pred, gt, minlen=4):
28
+ metrics = {}
29
+ if len(pred) < minlen or len(gt) < minlen:
30
+ return metrics
31
+ metrics["edit_dist"] = edit_distance(pred, gt) / max(len(pred), len(gt))
32
+ reference = gt.split()
33
+ hypothesis = pred.split()
34
+ metrics["bleu"] = nltk.translate.bleu([reference], hypothesis)
35
+ try:
36
+ metrics["meteor"] = nltk.translate.meteor([reference], hypothesis)
37
+ except LookupError:
38
+ metrics["meteor"] = np.nan
39
+ reference = set(reference)
40
+ hypothesis = set(hypothesis)
41
+ metrics["precision"] = nltk.scores.precision(reference, hypothesis)
42
+ metrics["recall"] = nltk.scores.recall(reference, hypothesis)
43
+ metrics["f_measure"] = nltk.scores.f_measure(reference, hypothesis)
44
+ return metrics
45
+
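A usage sketch for `compute_metrics` (assuming this repo and its dependencies, including the required nltk data, are installed; the strings are invented):

from nougat.metrics import compute_metrics

pred = "The model reaches state of the art results on this benchmark ."
gt = "The model reaches state-of-the-art results on this benchmark ."
scores = compute_metrics(pred, gt)
print(sorted(scores))   # ['bleu', 'edit_dist', 'f_measure', 'meteor', 'precision', 'recall']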
46
+
47
+ def get_parser():
48
+ parser = argparse.ArgumentParser()
49
+ parser.add_argument("json", type=Path, help="results file")
50
+ parser.add_argument(
51
+ "-N", dest="N", type=int, help="number of samples", default=None
52
+ )
53
+ args = parser.parse_args()
54
+ d = orjson.loads(args.json.read_text(encoding="utf-8"))
55
+ args.pred = d["predictions"]
56
+ args.gt = d["ground_truths"]
57
+ if args.N is not None:
58
+ args.pred = args.pred[: args.N]
59
+ args.gt = args.gt[: args.N]
60
+ return args
61
+
62
+
63
+ def split_text(pages: List[str]):
64
+ """
65
+ Split a list of pages into text, inline math, display math, and table blocks.
66
+
67
+ Args:
68
+ pages: The pages to split.
69
+ """
70
+ text, math, table = [], [], []
71
+ for page in pages:
72
+ for i, reg in enumerate([inline_reg, display_reg, table_reg]):
73
+ matches = "\n".join(reg.findall(page))
74
+ if i == 2:
75
+ table.append(matches)
76
+ elif i == 1:
77
+ math[-1] += matches
78
+ else:
79
+ math.append(matches)
80
+ page = reg.sub("", page)
81
+ text.append(page.strip())
82
+
83
+ return text, math, table
84
+
85
+
86
+ def get_metrics(gt: List[str], pred: List[str], pool: bool = True):
87
+ metrics = defaultdict(list)
88
+ if pool:
89
+ with Pool() as p:
90
+ _metrics = p.starmap(compute_metrics, iterable=zip(pred, gt))
91
+ else:
92
+ _metrics = [compute_metrics(p, g) for p, g in zip(pred, gt)]
93
+ for m in _metrics:
94
+ for key, value in m.items():
95
+ metrics[key].append(value)
96
+ return dict(metrics)
97
+
98
+
99
+ if __name__ == "__main__":
100
+ args = get_parser()
101
+ for name, entries in zip(["gt", "pred"], [args.gt, args.pred]):
102
+ full: Path = args.json.parent / (args.json.stem + "_" + name + "_full.mmd")
103
+ full.write_text("\n\n------------------\n\n".join(entries))
104
+ for i, (gt, pr) in enumerate(zip(split_text(args.gt), split_text(args.pred))):
105
+ sub = ["Text", "Math", "Tables"][i]
106
+ prpath: Path = args.json.parent / (
107
+ args.json.stem + "_pred_" + sub.lower() + ".mmd"
108
+ )
109
+ prpath.write_text("\n\n------------------\n\n".join(pr))
110
+ gtpath: Path = args.json.parent / (
111
+ args.json.stem + "_gt_" + sub.lower() + ".mmd"
112
+ )
113
+ gtpath.write_text("\n\n------------------\n\n".join(gt))
114
+ print("Results for", sub)
115
+
116
+ metrics = get_metrics(gt, pr)
117
+ print({key: sum(values) / len(values) for key, values in metrics.items()})
nougat/model.py ADDED
@@ -0,0 +1,702 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ Copyright (c) Meta Platforms, Inc. and affiliates.
6
+ """
7
+ import logging
8
+ import math
9
+ import os
10
+ from typing import List, Optional, Union
11
+ from collections import defaultdict
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+ from PIL import Image
16
+ import cv2
17
+ import timm
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from PIL import ImageOps
22
+ from timm.models.swin_transformer import SwinTransformer
23
+ from torchvision.transforms.functional import resize, rotate
24
+ from transformers import (
25
+ PreTrainedTokenizerFast,
26
+ StoppingCriteria,
27
+ StoppingCriteriaList,
28
+ MBartConfig,
29
+ MBartForCausalLM,
30
+ )
31
+ from transformers.file_utils import ModelOutput
32
+ from transformers.modeling_utils import PretrainedConfig, PreTrainedModel
33
+ from nougat.postprocessing import postprocess
34
+ from nougat.transforms import train_transform, test_transform
35
+
36
+
37
+ class SwinEncoder(nn.Module):
38
+ r"""
39
+ Encoder based on SwinTransformer
40
+ Set the initial weights and configuration with a pretrained SwinTransformer and then
41
+ modify the detailed configurations
42
+
43
+ Args:
44
+ input_size: Input image size (width, height)
45
+ align_long_axis: Whether to rotate image if height is greater than width
46
+ window_size: Window size(=patch size) of SwinTransformer
47
+ encoder_layer: Number of layers of SwinTransformer encoder
48
+ name_or_path: Name of a pretrained model registered on huggingface.co or a local path;
49
+ otherwise, `swin_base_patch4_window12_384` will be set (using `timm`).
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ input_size: List[int],
55
+ align_long_axis: bool,
56
+ window_size: int,
57
+ encoder_layer: List[int],
58
+ patch_size: int,
59
+ embed_dim: int,
60
+ num_heads: List[int],
61
+ name_or_path: Union[str, bytes, os.PathLike] = None,
62
+ ):
63
+ super().__init__()
64
+ self.input_size = input_size
65
+ self.align_long_axis = align_long_axis
66
+ self.window_size = window_size
67
+ self.encoder_layer = encoder_layer
68
+ self.patch_size = patch_size
69
+ self.embed_dim = embed_dim
70
+ self.num_heads = num_heads
71
+
72
+ self.model = SwinTransformer(
73
+ img_size=self.input_size,
74
+ depths=self.encoder_layer,
75
+ window_size=self.window_size,
76
+ patch_size=self.patch_size,
77
+ embed_dim=self.embed_dim,
78
+ num_heads=self.num_heads,
79
+ num_classes=0,
80
+ )
81
+
82
+ # weight init with swin
83
+ if not name_or_path:
84
+ swin_state_dict = timm.create_model(
85
+ "swin_base_patch4_window12_384", pretrained=True
86
+ ).state_dict()
87
+ new_swin_state_dict = self.model.state_dict()
88
+ for x in new_swin_state_dict:
89
+ if x.endswith("relative_position_index") or x.endswith("attn_mask"):
90
+ pass
91
+ elif (
92
+ x.endswith("relative_position_bias_table")
93
+ and self.model.layers[0].blocks[0].attn.window_size[0] != 12
94
+ ):
95
+ pos_bias = swin_state_dict[x].unsqueeze(0)[0]
96
+ old_len = int(math.sqrt(len(pos_bias)))
97
+ new_len = int(2 * window_size - 1)
98
+ pos_bias = pos_bias.reshape(1, old_len, old_len, -1).permute(
99
+ 0, 3, 1, 2
100
+ )
101
+ pos_bias = F.interpolate(
102
+ pos_bias,
103
+ size=(new_len, new_len),
104
+ mode="bicubic",
105
+ align_corners=False,
106
+ )
107
+ new_swin_state_dict[x] = (
108
+ pos_bias.permute(0, 2, 3, 1)
109
+ .reshape(1, new_len**2, -1)
110
+ .squeeze(0)
111
+ )
112
+ else:
113
+ new_swin_state_dict[x] = swin_state_dict[x]
114
+ self.model.load_state_dict(new_swin_state_dict)
115
+
116
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
117
+ """
118
+ Args:
119
+ x: (batch_size, num_channels, height, width)
120
+ """
121
+ x = self.model.patch_embed(x)
122
+ x = self.model.pos_drop(x)
123
+ x = self.model.layers(x)
124
+ return x
125
+
126
+ @staticmethod
127
+ def crop_margin(img: Image.Image) -> Image.Image:
128
+ data = np.array(img.convert("L"))
129
+ data = data.astype(np.uint8)
130
+ max_val = data.max()
131
+ min_val = data.min()
132
+ if max_val == min_val:
133
+ return img
134
+ data = (data - min_val) / (max_val - min_val) * 255
135
+ gray = 255 * (data < 200).astype(np.uint8)
136
+
137
+ coords = cv2.findNonZero(gray) # Find all non-zero points (text)
138
+ a, b, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box
139
+ return img.crop((a, b, w + a, h + b))
140
+
141
+ @property
142
+ def to_tensor(self):
143
+ if self.training:
144
+ return train_transform
145
+ else:
146
+ return test_transform
147
+
148
+ def prepare_input(
149
+ self, img: Image.Image, random_padding: bool = False
150
+ ) -> torch.Tensor:
151
+ """
152
+ Convert a PIL Image to a tensor of the specified input_size by applying the following steps:
153
+ - resize
154
+ - rotate (if align_long_axis is True and the image's long axis is not aligned with the canvas)
155
+ - pad
156
+ """
157
+ if img is None:
158
+ return
159
+ # crop margins
160
+ try:
161
+ img = self.crop_margin(img.convert("RGB"))
162
+ except OSError:
163
+ # might throw an error for broken files
164
+ return
165
+ if img.height == 0 or img.width == 0:
166
+ return
167
+ if self.align_long_axis and (
168
+ (self.input_size[0] > self.input_size[1] and img.width > img.height)
169
+ or (self.input_size[0] < self.input_size[1] and img.width < img.height)
170
+ ):
171
+ img = rotate(img, angle=-90, expand=True)
172
+ img = resize(img, min(self.input_size))
173
+ img.thumbnail((self.input_size[1], self.input_size[0]))
174
+ delta_width = self.input_size[1] - img.width
175
+ delta_height = self.input_size[0] - img.height
176
+ if random_padding:
177
+ pad_width = np.random.randint(low=0, high=delta_width + 1)
178
+ pad_height = np.random.randint(low=0, high=delta_height + 1)
179
+ else:
180
+ pad_width = delta_width // 2
181
+ pad_height = delta_height // 2
182
+ padding = (
183
+ pad_width,
184
+ pad_height,
185
+ delta_width - pad_width,
186
+ delta_height - pad_height,
187
+ )
188
+ return self.to_tensor(ImageOps.expand(img, padding))
189
+
190
+
191
+ class BARTDecoder(nn.Module):
192
+ """
193
+ Decoder based on Multilingual BART
194
+ Set the initial weights and configuration with a pretrained multilingual BART model,
195
+ and modify the detailed configurations as a Nougat decoder
196
+
197
+ Args:
198
+ decoder_layer:
199
+ Number of layers of BARTDecoder
200
+ max_position_embeddings:
201
+ The maximum sequence length to be trained
202
+ name_or_path:
203
+ Name of a pretrained model registered on huggingface.co or a local path;
204
+ otherwise, `facebook/mbart-large-50` will be set (using `transformers`)
205
+ """
206
+
207
+ def __init__(
208
+ self,
209
+ decoder_layer: int,
210
+ max_position_embeddings: int,
211
+ hidden_dimension: int = 1024,
212
+ name_or_path: Union[str, bytes, os.PathLike] = None,
213
+ ):
214
+ super().__init__()
215
+ self.decoder_layer = decoder_layer
216
+ self.max_position_embeddings = max_position_embeddings
217
+ if not name_or_path:
218
+ tokenizer_file = Path(__file__).parent / "dataset" / "tokenizer.json"
219
+ else:
220
+ tokenizer_file = Path(name_or_path) / "tokenizer.json"
221
+ if not tokenizer_file.exists():
222
+ raise ValueError("Could not find tokenizer file")
223
+ self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_file))
224
+ self.tokenizer.pad_token = "<pad>"
225
+ self.tokenizer.bos_token = "<s>"
226
+ self.tokenizer.eos_token = "</s>"
227
+ self.tokenizer.unk_token = "<unk>"
228
+
229
+ self.model = MBartForCausalLM(
230
+ config=MBartConfig(
231
+ is_decoder=True,
232
+ is_encoder_decoder=False,
233
+ add_cross_attention=True,
234
+ decoder_layers=self.decoder_layer,
235
+ max_position_embeddings=self.max_position_embeddings,
236
+ vocab_size=len(self.tokenizer),
237
+ scale_embedding=True,
238
+ add_final_layer_norm=True,
239
+ d_model=hidden_dimension,
240
+ )
241
+ )
242
+ self.model.config.is_encoder_decoder = True # to get cross-attention
243
+ self.model.model.decoder.embed_tokens.padding_idx = self.tokenizer.pad_token_id
244
+ self.model.prepare_inputs_for_generation = self.prepare_inputs_for_inference
245
+
246
+ if not name_or_path:
247
+ bart_state_dict = MBartForCausalLM.from_pretrained(
248
+ "facebook/mbart-large-50"
249
+ ).state_dict()
250
+ new_bart_state_dict = self.model.state_dict()
251
+ for x in new_bart_state_dict:
252
+ if (
253
+ x.endswith("embed_positions.weight")
254
+ and self.max_position_embeddings != 1024
255
+ ):
256
+ new_bart_state_dict[x] = torch.nn.Parameter(
257
+ self.resize_bart_abs_pos_emb(
258
+ bart_state_dict[x],
259
+ self.max_position_embeddings
260
+ + 2, # https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L118-L119
261
+ )
262
+ )
263
+ elif x.endswith("embed_tokens.weight") or x.endswith("lm_head.weight"):
264
+ new_bart_state_dict[x] = bart_state_dict[x][
265
+ : len(self.tokenizer), :
266
+ ]
267
+ else:
268
+ new_bart_state_dict[x] = bart_state_dict[x]
269
+ self.model.load_state_dict(new_bart_state_dict, strict=False)
270
+
271
+ def add_special_tokens(self, list_of_tokens: List[str]):
272
+ """
273
+ Add special tokens to tokenizer and resize the token embeddings
274
+ """
275
+ newly_added_num = self.tokenizer.add_special_tokens(
276
+ {"additional_special_tokens": sorted(set(list_of_tokens))}
277
+ )
278
+ if newly_added_num > 0:
279
+ self.model.resize_token_embeddings(len(self.tokenizer))
280
+
281
+ def prepare_inputs_for_inference(
282
+ self,
283
+ input_ids: torch.Tensor,
284
+ encoder_outputs: torch.Tensor,
285
+ past=None,
286
+ past_key_values=None,
287
+ use_cache: bool = None,
288
+ attention_mask: torch.Tensor = None,
289
+ ):
290
+ """
291
+ Args:
292
+ input_ids: (batch_size, sequence_length)
293
+
294
+ Returns:
295
+ input_ids: (batch_size, sequence_length)
296
+ attention_mask: (batch_size, sequence_length)
297
+ encoder_hidden_states: (batch_size, sequence_length, embedding_dim)
298
+ """
299
+ attention_mask = input_ids.ne(self.tokenizer.pad_token_id).long()
300
+ past = past or past_key_values
301
+ if past is not None:
302
+ input_ids = input_ids[:, -1:]
303
+ output = {
304
+ "input_ids": input_ids,
305
+ "attention_mask": attention_mask,
306
+ "past_key_values": past,
307
+ "use_cache": use_cache,
308
+ "encoder_hidden_states": encoder_outputs.last_hidden_state,
309
+ }
310
+ return output
311
+
312
+ def forward(
313
+ self,
314
+ input_ids,
315
+ attention_mask: Optional[torch.Tensor] = None,
316
+ encoder_hidden_states: Optional[torch.Tensor] = None,
317
+ past_key_values: Optional[torch.Tensor] = None,
318
+ labels: Optional[torch.Tensor] = None,
319
+ use_cache: bool = None,
320
+ output_attentions: Optional[torch.Tensor] = None,
321
+ output_hidden_states: Optional[torch.Tensor] = None,
322
+ return_dict: bool = None,
323
+ ):
324
+ return self.model.forward(
325
+ input_ids,
326
+ attention_mask=attention_mask,
327
+ labels=labels,
328
+ encoder_hidden_states=encoder_hidden_states,
329
+ past_key_values=past_key_values,
330
+ use_cache=use_cache,
331
+ output_attentions=output_attentions,
332
+ output_hidden_states=output_hidden_states,
333
+ return_dict=return_dict,
334
+ )
335
+
336
+ @staticmethod
337
+ def resize_bart_abs_pos_emb(weight: torch.Tensor, max_length: int) -> torch.Tensor:
338
+ """
339
+ Resize position embeddings
340
+ Truncate if sequence length of MBart backbone is greater than given max_length,
341
+ else interpolate to max_length
342
+ """
343
+ if weight.shape[0] > max_length:
344
+ weight = weight[:max_length, ...]
345
+ else:
346
+ weight = (
347
+ F.interpolate(
348
+ weight.permute(1, 0).unsqueeze(0),
349
+ size=max_length,
350
+ mode="linear",
351
+ align_corners=False,
352
+ )
353
+ .squeeze(0)
354
+ .permute(1, 0)
355
+ )
356
+ return weight
357
+
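A small demonstration of the position-embedding resizing above on a toy tensor (assuming this repo and its dependencies are installed; the shapes are illustrative):

import torch
from nougat.model import BARTDecoder

weight = torch.randn(1026, 8)   # mBART's 1024 positions (+2 offset), toy hidden size 8
longer = BARTDecoder.resize_bart_abs_pos_emb(weight, 4096 + 2)   # interpolated
shorter = BARTDecoder.resize_bart_abs_pos_emb(weight, 512 + 2)   # truncated
print(longer.shape, shorter.shape)   # torch.Size([4098, 8]) torch.Size([514, 8])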
358
+
359
+ class NougatConfig(PretrainedConfig):
360
+ r"""
361
+ This is the configuration class to store the configuration of a [`NougatModel`]. It is used to
362
+ instantiate a Nougat model according to the specified arguments, defining the model architecture
363
+
364
+ Args:
365
+ input_size:
366
+ Input image size (canvas size) of Nougat.encoder, SwinTransformer in this codebase
367
+ align_long_axis:
368
+ Whether to rotate image if height is greater than width
369
+ window_size:
370
+ Window size of Nougat.encoder, SwinTransformer in this codebase
371
+ encoder_layer:
372
+ Depth of each Nougat.encoder Encoder layer, SwinTransformer in this codebase
373
+ decoder_layer:
374
+ Number of hidden layers in the Nougat.decoder, such as BART
375
+ max_position_embeddings:
376
+ Trained max position embeddings in the Nougat decoder,
377
+ if not specified, it will have same value with max_length
378
+ max_length:
379
+ Maximum sequence length (max position embeddings) to train with
380
+ name_or_path:
381
+ Name of a pretrained model registered on huggingface.co or a local path
382
+ """
383
+ model_type = "nougat"
384
+
385
+ def __init__(
386
+ self,
387
+ input_size: List[int] = [896, 672],
388
+ align_long_axis: bool = False,
389
+ window_size: int = 7,
390
+ encoder_layer: List[int] = [2, 2, 14, 2],
391
+ decoder_layer: int = 10,
392
+ max_position_embeddings: int = None,
393
+ max_length: int = 4096,
394
+ name_or_path: Union[str, bytes, os.PathLike] = "",
395
+ patch_size: int = 4,
396
+ embed_dim: int = 128,
397
+ num_heads: List[int] = [4, 8, 16, 32],
398
+ hidden_dimension: int = 1024,
399
+ **kwargs,
400
+ ):
401
+ super().__init__()
402
+ self.input_size = input_size
403
+ self.align_long_axis = align_long_axis
404
+ self.window_size = window_size
405
+ self.encoder_layer = encoder_layer
406
+ self.decoder_layer = decoder_layer
407
+ self.max_position_embeddings = (
408
+ max_length if max_position_embeddings is None else max_position_embeddings
409
+ )
410
+ self.max_length = max_length
411
+ self.name_or_path = name_or_path
412
+ self.patch_size = patch_size
413
+ self.embed_dim = embed_dim
414
+ self.num_heads = num_heads
415
+ self.hidden_dimension = hidden_dimension
416
+
417
+
418
+ class RunningVarTorch:
419
+ def __init__(self, L=15, norm=False):
420
+ self.values = None
421
+ self.L = L
422
+ self.norm = norm
423
+
424
+ def push(self, x: torch.Tensor):
425
+ assert x.dim() == 1
426
+ if self.values is None:
427
+ self.values = x[:, None]
428
+ elif self.values.shape[1] < self.L:
429
+ self.values = torch.cat((self.values, x[:, None]), 1)
430
+ else:
431
+ self.values = torch.cat((self.values[:, 1:], x[:, None]), 1)
432
+
433
+ def variance(self):
434
+ if self.values is None:
435
+ return
436
+ if self.norm:
437
+ return torch.var(self.values, 1) / self.values.shape[1]
438
+ else:
439
+ return torch.var(self.values, 1)
440
+
441
+
442
+ class StoppingCriteriaScores(StoppingCriteria):
443
+ def __init__(self, threshold: float = 0.015, window_size: int = 200):
444
+ super().__init__()
445
+ self.threshold = threshold
446
+ self.vars = RunningVarTorch(norm=True)
447
+ self.varvars = RunningVarTorch(L=window_size)
448
+ self.stop_inds = defaultdict(int)
449
+ self.stopped = defaultdict(bool)
450
+ self.size = 0
451
+ self.window_size = window_size
452
+
453
+ @torch.no_grad()
454
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
455
+ last_scores = scores[-1]
456
+ self.vars.push(last_scores.max(1)[0].float().cpu())
457
+ self.varvars.push(self.vars.variance())
458
+ self.size += 1
459
+ if self.size < self.window_size:
460
+ return False
461
+
462
+ varvar = self.varvars.variance()
463
+ for b in range(len(last_scores)):
464
+ if varvar[b] < self.threshold:
465
+ if self.stop_inds[b] > 0 and not self.stopped[b]:
466
+ self.stopped[b] = self.stop_inds[b] >= self.size
467
+ else:
468
+ self.stop_inds[b] = int(
469
+ min(max(self.size, 1) * 1.15 + 150 + self.window_size, 4095)
470
+ )
471
+ else:
472
+ self.stop_inds[b] = 0
473
+ self.stopped[b] = False
474
+ return all(self.stopped.values()) and len(self.stopped) > 0
475
+
476
+
477
+ def batch(l, b=15):
478
+ subs = []
479
+ for i in range(len(l) - b):
480
+ subs.append(l[i : i + b])
481
+ return subs
482
+
483
+
484
+ def subdiv(l, b=10):
485
+ subs = []
486
+ for i in range(len(l) - b):
487
+ subs.append(l[: i + b])
488
+ return subs
489
+
490
+
491
+ class NougatModel(PreTrainedModel):
492
+ r"""
493
+ Nougat: Neural Optical UnderstandinG for Academic documents.
494
+ The encoder converts an image of an academic document into a series of embeddings.
495
+ Then, the decoder generates a sequence of tokens based on encoder's output.
496
+ This sequence can be translated into a structured markup language format.
497
+ """
498
+ config_class = NougatConfig
499
+ base_model_prefix = "nougat"
500
+
501
+ def __init__(self, config: NougatConfig):
502
+ super().__init__(config)
503
+ self.config = config
504
+ self.encoder = SwinEncoder(
505
+ input_size=self.config.input_size,
506
+ align_long_axis=self.config.align_long_axis,
507
+ window_size=self.config.window_size,
508
+ encoder_layer=self.config.encoder_layer,
509
+ name_or_path=self.config.name_or_path,
510
+ patch_size=self.config.patch_size,
511
+ embed_dim=self.config.embed_dim,
512
+ num_heads=self.config.num_heads,
513
+ )
514
+ self.decoder = BARTDecoder(
515
+ max_position_embeddings=self.config.max_position_embeddings,
516
+ decoder_layer=self.config.decoder_layer,
517
+ name_or_path=self.config.name_or_path,
518
+ hidden_dimension=self.config.hidden_dimension,
519
+ )
520
+
521
+ def forward(
522
+ self,
523
+ image_tensors: torch.Tensor,
524
+ decoder_input_ids: torch.Tensor,
525
+ attention_mask: Optional[torch.Tensor] = None,
526
+ ):
527
+ """
528
+ Calculate a loss given an input image and a desired token sequence,
529
+ the model will be trained in a teacher-forcing manner
530
+
531
+ Args:
532
+ image_tensors: (batch_size, num_channels, height, width)
533
+ decoder_input_ids: (batch_size, sequence_length, embedding_dim)
534
+ """
535
+ encoder_outputs = self.encoder(image_tensors)
536
+ decoder_outputs = self.decoder(
537
+ input_ids=decoder_input_ids[:, :-1].contiguous(),
538
+ encoder_hidden_states=encoder_outputs,
539
+ attention_mask=attention_mask[:, :-1],
540
+ labels=decoder_input_ids[:, 1:].contiguous(),
541
+ )
542
+ return decoder_outputs
543
+
544
+ def _init_weights(self, *args, **kwargs):
545
+ return
546
+
547
+ def inference(
548
+ self,
549
+ image: Image.Image = None,
550
+ image_tensors: Optional[torch.Tensor] = None,
551
+ return_attentions: bool = False,
552
+ early_stopping: bool = True,
553
+ ):
554
+ """
555
+ Generate a token sequence in an auto-regressive manner.
556
+
557
+ Args:
558
+ image: input document image (PIL.Image)
559
+ image_tensors: (1, num_channels, height, width)
560
+ convert prompt to tensor if image_tensor is not fed
561
+ """
562
+ output = {
563
+ "predictions": list(),
564
+ "sequences": list(),
565
+ "repeats": list(),
566
+ "repetitions": list(),
567
+ }
568
+ if image is None and image_tensors is None:
569
+ logging.warn("Image not found")
570
+ return output
571
+
572
+ if image_tensors is None:
573
+ image_tensors = self.encoder.prepare_input(image).unsqueeze(0)
574
+
575
+ if self.device.type != "mps":
576
+ image_tensors = image_tensors.to(next(self.parameters()).dtype)
577
+
578
+ image_tensors = image_tensors.to(self.device)
579
+
580
+ last_hidden_state = self.encoder(image_tensors)
581
+
582
+ encoder_outputs = ModelOutput(
583
+ last_hidden_state=last_hidden_state, attentions=None
584
+ )
585
+
586
+ if len(encoder_outputs.last_hidden_state.size()) == 1:
587
+ encoder_outputs.last_hidden_state = (
588
+ encoder_outputs.last_hidden_state.unsqueeze(0)
589
+ )
590
+
591
+ # get decoder output
592
+ decoder_output = self.decoder.model.generate(
593
+ encoder_outputs=encoder_outputs,
594
+ min_length=1,
595
+ max_length=self.config.max_length,
596
+ pad_token_id=self.decoder.tokenizer.pad_token_id,
597
+ eos_token_id=self.decoder.tokenizer.eos_token_id,
598
+ use_cache=True,
599
+ bad_words_ids=[
600
+ [self.decoder.tokenizer.unk_token_id],
601
+ ],
602
+ return_dict_in_generate=True,
603
+ output_scores=True,
604
+ output_attentions=return_attentions,
605
+ do_sample=False,
606
+ stopping_criteria=StoppingCriteriaList(
607
+ [StoppingCriteriaScores()] if early_stopping else []
608
+ ),
609
+ )
610
+ output["repetitions"] = decoder_output.sequences.clone()
611
+ output["sequences"] = decoder_output.sequences.clone()
612
+ batch_size = len(decoder_output.sequences)
613
+
614
+ logits = torch.stack(decoder_output.scores, 1).cpu().max(-1)
615
+ values = logits.values
616
+ indices = logits.indices
617
+
618
+ for b in range(batch_size):
619
+ mask = indices[b] != self.decoder.tokenizer.pad_token_id
620
+ N = mask.sum().item()
621
+ var = np.array(
622
+ [np.var(s) / len(s) for s in batch(values[b, mask].float().numpy())]
623
+ )
624
+ if len(var) < 10:
625
+ output["repeats"].append(None)
626
+ continue
627
+ varvar = np.array([np.var(v) for v in subdiv(var[::-1])][::-1])
628
+ minlen = 120
629
+ if (
630
+ indices[b] == self.decoder.tokenizer.eos_token_id
631
+ ).any() and N + 1 < indices.shape[1]:
632
+ # there is an end to the generation, likely no repetitions
633
+ output["repeats"].append(None)
634
+ continue
635
+ small_var = np.where(varvar < 0.045)[0]
636
+ if early_stopping and len(small_var) > 1:
637
+ if np.all(np.diff(small_var) < 2):
638
+ idx = int(min(max(small_var[0], 1) * 1.08 + minlen, 4095))
639
+ if idx / N > 0.9: # at most last bit
640
+ output["repeats"].append(None)
641
+ continue
642
+ elif small_var[0] < 30:
643
+ idx = 0
644
+ logging.warn("Found repetitions in sample %i" % b)
645
+ output["repeats"].append(idx)
646
+ output["sequences"][b, idx:] = self.decoder.tokenizer.pad_token_id
647
+ output["repetitions"][b, :idx] = self.decoder.tokenizer.pad_token_id
648
+ else:
649
+ output["repeats"].append(None)
650
+ else:
651
+ output["repeats"].append(None)
652
+ output["repetitions"] = self.decoder.tokenizer.batch_decode(
653
+ output["repetitions"], skip_special_tokens=True
654
+ )
655
+ output["predictions"] = postprocess(
656
+ self.decoder.tokenizer.batch_decode(
657
+ output["sequences"], skip_special_tokens=True
658
+ ),
659
+ markdown_fix=False,
660
+ )
661
+
662
+ if return_attentions:
663
+ output["attentions"] = {
664
+ "self_attentions": decoder_output.decoder_attentions,
665
+ "cross_attentions": decoder_output.cross_attentions,
666
+ }
667
+
668
+ return output
669
+
670
+ @classmethod
671
+ def from_pretrained(
672
+ cls,
673
+ model_path: Union[str, bytes, os.PathLike],
674
+ *model_args,
675
+ **kwargs,
676
+ ):
677
+ r"""
678
+ Instantiate a pretrained nougat model from a pre-trained model configuration
679
+
680
+ Args:
681
+ model_path:
682
+ Name of a pretrained model registered on huggingface.co or a local path.
683
+ """
684
+ model = super(NougatModel, cls).from_pretrained(
685
+ model_path, *model_args, **kwargs
686
+ )
687
+
688
+ # truncate or interpolate position embeddings of decoder
689
+ max_length = kwargs.get("max_length", model.config.max_position_embeddings)
690
+ if (
691
+ max_length != model.config.max_position_embeddings
692
+ ): # if the max_length of the trained model differs from the max_length you want to use
693
+ model.decoder.model.model.decoder.embed_positions.weight = torch.nn.Parameter(
694
+ model.decoder.resize_bart_abs_pos_emb(
695
+ model.decoder.model.model.decoder.embed_positions.weight,
696
+ max_length
697
+ + 2, # https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L118-L119
698
+ )
699
+ )
700
+ model.config.max_position_embeddings = max_length
701
+
702
+ return model
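A minimal usage sketch for the model above (the checkpoint directory and the page image are placeholders; the single page is batched by hand before calling `inference`):

```python
# Sketch only: checkpoint path and page image are placeholders.
import torch
from PIL import Image
from nougat import NougatModel

model = NougatModel.from_pretrained("path/to/nougat-checkpoint").eval()

page = Image.open("page_1.png")  # a rasterized PDF page
image_tensor = model.encoder.prepare_input(page, random_padding=False)

with torch.no_grad():
    out = model.inference(image_tensors=image_tensor.unsqueeze(0))
print(out["predictions"][0])  # postprocessed markup for the page
```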
nougat/postprocessing.py ADDED
@@ -0,0 +1,508 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ from typing import Union, List
8
+ import re
9
+ import os
10
+ import numpy as np
11
+ from nltk.corpus import words
12
+ from multiprocessing import Pool
13
+ from functools import partial
14
+ from Levenshtein import ratio
15
+
16
+
17
+ reference_pattern = re.compile(r"^\* \[\d+\]", flags=re.M)
18
+
19
+
20
+ def markdown_compatible(s: str) -> str:
21
+ """
22
+ Make text compatible with Markdown formatting.
23
+
24
+ This function makes various text formatting adjustments to make it compatible with Markdown.
25
+
26
+ Args:
27
+ s (str): The input text to be made Markdown-compatible.
28
+
29
+ Returns:
30
+ str: The Markdown-compatible text.
31
+ """
32
+ # equation tag
33
+ s = re.sub(
34
+ r"^\(([\d.]+[a-zA-Z]?)\) \\\[(.+?)\\\]$", r"\[\2 \\tag{\1}\]", s, flags=re.M
35
+ )
36
+ s = re.sub(
37
+ r"^\\\[(.+?)\\\] \(([\d.]+[a-zA-Z]?)\)$", r"\[\1 \\tag{\2}\]", s, flags=re.M
38
+ )
39
+ s = re.sub(
40
+ r"^\\\[(.+?)\\\] \(([\d.]+[a-zA-Z]?)\) (\\\[.+?\\\])$",
41
+ r"\[\1 \\tag{\2}\] \3",
42
+ s,
43
+ flags=re.M,
44
+ ) # multi line
45
+ s = s.replace(r"\. ", ". ")
46
+ # bold formatting
47
+ s = s.replace(r"\bm{", r"\mathbf{").replace(r"{\\bm ", r"\mathbf{")
48
+ # s = s.replace(r"\it{", r"\mathit{").replace(r"{\\it ", r"\mathit{") # not needed
49
+ s = re.sub(r"\\mbox{ ?\\boldmath\$(.*?)\$}", r"\\mathbf{\1}", s)
50
+ # s=re.sub(r'\\begin{table}(.+?)\\end{table}\nTable \d+: (.+?)\n',r'\\begin{table}\1\n\\caption{\2}\n\\end{table}\n',s,flags=re.S)
51
+ # s=re.sub(r'###### Abstract\n(.*?)\n\n',r'\\begin{abstract}\n\1\n\\end{abstract}\n\n',s,flags=re.S)
52
+ # urls
53
+ s = re.sub(
54
+ r"((?:http|ftp|https):\/\/(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
55
+ r"[\1](\1)",
56
+ s,
57
+ )
58
+ # algorithms
59
+ s = re.sub(r"```\s*(.+?)\s*```", r"```\n\1\n```", s, flags=re.S)
60
+ # lists
61
+
62
+ return s
63
+
64
+
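For illustration, the equation-tag rewrite above turns a leading equation number into a `\tag{}` (the input string is made up):

```python
from nougat.postprocessing import markdown_compatible

raw = r"(3.1) \[E=mc^{2}\]"     # model output with a leading equation number
print(markdown_compatible(raw))  # -> \[E=mc^{2} \tag{3.1}\]
```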
65
+ def find_next_punctuation(s: str, start_inx=0):
66
+ """
67
+ Find the index of the next punctuation mark
68
+
69
+ Args:
70
+ s: String to examine
71
+ start_inx: Index where to start
72
+ """
73
+
74
+ for i in range(start_inx, len(s)):
75
+ if s[i] in [".", "?", "!", "\n"]:
76
+ return i
77
+
78
+ return None
79
+
80
+
81
+ def find_last_punctuation(s: str, start_inx=0):
82
+ """
83
+ Find the index of the last punctuation mark before start_inx
84
+
85
+ Args:
86
+ s: String to examine
87
+ start_inx: Index where to look before
88
+ """
89
+
90
+ for i in range(start_inx - 1, 0, -1):
91
+ if s[i] in [".", "?", "!", "\n"]:
92
+ return i
93
+
94
+ return None
95
+
96
+
97
+ def truncate_repetitions(s: str, min_len=30):
98
+ """
99
+ Attempt to truncate repeating segments in the input string.
100
+
101
+ This function looks for the longest repeating substring at the end of the input string and truncates
102
+ it to appear only once. To be considered for removal, repetitions need to be continuous.
103
+
104
+ Args:
105
+ s (str): The input raw prediction to be truncated.
106
+ min_len (int): The minimum length of the repeating segment.
107
+
108
+ Returns:
109
+ str: The input string with repeated segments truncated.
110
+ """
111
+ s_lower = s.lower()
112
+ s_len = len(s_lower)
113
+
114
+ if s_len < 2 * min_len:
115
+ return s
116
+
117
+ # try to find a length at which the tail is repeating
118
+ max_rep_len = None
119
+ for rep_len in range(min_len, int(s_len / 2)):
120
+ # check if there is a repetition at the end
121
+ same = True
122
+ for i in range(0, rep_len):
123
+ if s_lower[s_len - rep_len - i - 1] != s_lower[s_len - i - 1]:
124
+ same = False
125
+ break
126
+
127
+ if same:
128
+ max_rep_len = rep_len
129
+
130
+ if max_rep_len is None:
131
+ return s
132
+
133
+ lcs = s_lower[-max_rep_len:]
134
+
135
+ # remove all but the last repetition
136
+ st = s
137
+ st_lower = s_lower
138
+ while st_lower.endswith(lcs):
139
+ st = st[:-max_rep_len]
140
+ st_lower = st_lower[:-max_rep_len]
141
+
142
+ # this is the tail with the repetitions
143
+ repeating_tail = s_lower[len(st_lower) :]
144
+
145
+ # add until next punctuation and make sure last sentence is not repeating
146
+ st_lower_out = st_lower
147
+ while True:
148
+ sentence_end = find_next_punctuation(s_lower, len(st_lower_out))
149
+ sentence_start = find_last_punctuation(s_lower, len(st_lower_out))
150
+ if sentence_end and sentence_start:
151
+ sentence = s_lower[sentence_start:sentence_end]
152
+ st_lower_out = s_lower[: sentence_end + 1]
153
+ if sentence in repeating_tail:
154
+ break
155
+ else:
156
+ break
157
+
158
+ s_out = s[: len(st_lower_out)]
159
+
160
+ return s_out
161
+
162
+
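A hedged illustration of `truncate_repetitions` on a made-up degenerate generation:

```python
from nougat.postprocessing import truncate_repetitions

# A degenerate generation that loops on the same sentence.
looped = "Results are shown in Table 2. " + "The method converges quickly. " * 20
trimmed = truncate_repetitions(looped)
# The repeating tail is cut back to a single occurrence.
print(len(looped), len(trimmed))
```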
163
+ def close_envs(s: str) -> str:
164
+ """checks if table envs are opened but not closed. Appends the closing statements and returns the new string"""
165
+ envs = ("bmatrix", "pmatrix", "matrix", "tabular", "table")
166
+ for env in envs:
167
+ begins, ends = s.count(r"\begin{%s}" % env), s.count(r"\end{%s}" % env)
168
+ if begins > ends:
169
+ s += (r"\end{%s}" % env) * (begins - ends)
170
+ return s
171
+
172
+
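For example, a truncated table is closed like this (made-up input):

```python
from nougat.postprocessing import close_envs

truncated = r"\begin{table} \begin{tabular}{ll} a & b \\"
print(close_envs(truncated))
# appends the missing \end{tabular} and \end{table}
```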
173
+ def remove_numbers(lines):
174
+ def _clean(s):
175
+ return re.sub(r"(?:[\d_]|\*\*)", "", s).strip()
176
+
177
+ if type(lines) is str:
178
+ return _clean(lines)
179
+ out = []
180
+ for l in lines:
181
+ out.append(_clean(l))
182
+ return out
183
+
184
+
185
+ def get_slices(lines, clean_lines):
186
+ """
187
+ Get slices of text based on specific criteria within the lines.
188
+
189
+ This function finds runs of consecutive near-duplicate lines (typically repeated reference entries) and returns their index ranges.
190
+
191
+ Args:
192
+ lines (list of str): The list of lines containing the text.
193
+ clean_lines (list of str): A cleaned version of the text (without numbers).
194
+
195
+ Returns:
196
+ list of tuple: A list of tuples representing the start and end indices of text slices.
197
+ """
198
+ inds = np.zeros(len(lines))
199
+ for i in range(len(lines) - 1):
200
+ j = i + 1
201
+ while not clean_lines[j] and j < len(lines) - 1:
202
+ j += 1
203
+ if (
204
+ len(clean_lines[i]) < 200
205
+ and len(clean_lines[i]) > 3
206
+ and len(clean_lines[j]) < 200
207
+ and len(clean_lines[j]) > 3
208
+ and not clean_lines[i].startswith("[MISSING_PAGE")
209
+ and (
210
+ clean_lines[i] == clean_lines[j]
211
+ or ratio(clean_lines[i], clean_lines[j]) > 0.9
212
+ )
213
+ ):
214
+ inds[i:j] = 1
215
+ ids = np.where(inds)[0]
216
+ slices = []
217
+ if len(ids) == 0:
218
+ return slices
219
+ j0 = 0
220
+ for j, x in enumerate(np.diff(ids) > 3):
221
+ if x:
222
+ slices.append((ids[j0], ids[j] + 2))
223
+ j0 = j + 1
224
+ slices.append((ids[j0], ids[-1] + 2))
225
+ return [sli for sli in slices if sli[1] - sli[0] > 15]
226
+
227
+
228
+ def remove_slice_from_lines(lines, clean_text, sli) -> str:
229
+ """
230
+ Remove a slice of text from the lines based on specific criteria.
231
+
232
+ This function expands the given slice to cover the surrounding repeated block and returns the text that should be removed.
233
+
234
+ Args:
235
+ lines (list of str): The list of lines containing the text.
236
+ clean_text (list of str): A cleaned version of the text (without numbers).
237
+ sli (tuple): A tuple representing the start and end indices of the slice to be removed.
238
+
239
+ Returns:
240
+ str: The removed slice of text as a single string.
241
+ """
242
+ base = clean_text[sli[0]]
243
+ section = list(sli)
244
+ check_start_flag = False
245
+ # backwards pass
246
+ for i in range(max(0, sli[0] - 1), max(0, sli[0] - 5), -1):
247
+ if not lines[i]:
248
+ continue
249
+ if lines[i] == "## References":
250
+ section[0] = i
251
+ break
252
+ elif ratio(base, remove_numbers(lines[i])) < 0.9:
253
+ section[0] = i + 1
254
+ potential_ref = remove_numbers(lines[max(0, i - 1)].partition("* [")[-1])
255
+ if (
256
+ len(potential_ref) >= 0.75 * len(base)
257
+ and ratio(base, potential_ref) < 0.9
258
+ ):
259
+ section[0] = i
260
+ check_start_flag = True
261
+ break
262
+ # forward pass
263
+ for i in range(min(len(lines), sli[1]), min(len(lines), sli[1] + 5)):
264
+ if ratio(base, remove_numbers(lines[i])) < 0.9:
265
+ section[1] = i
266
+ break
267
+ if len(lines) <= section[1]:
268
+ section[1] = len(lines) - 1
269
+ to_delete = "\n".join(lines[section[0] : section[1] + 1])
270
+ # cut off next page content
271
+ itera, iterb = enumerate(lines[section[1] - 1]), enumerate(lines[section[1]])
272
+ while True:
273
+ try:
274
+ (ia, a) = next(itera)
275
+ while a.isnumeric():
276
+ (ia, a) = next(itera)
277
+ (ib, b) = next(iterb)
278
+ while b.isnumeric():
279
+ (ib, b) = next(iterb)
280
+ if a != b:
281
+ break
282
+ except StopIteration:
283
+ break
284
+ if check_start_flag and "* [" in to_delete:
285
+ to_delete = "* [" + to_delete.partition("* [")[-1]
286
+ try:
287
+ delta = len(lines[section[1]]) - ib - 1
288
+ if delta > 0:
289
+ to_delete = to_delete[:-delta]
290
+ except UnboundLocalError:
291
+ pass
292
+
293
+ return to_delete.strip()
294
+
295
+
296
+ def remove_hallucinated_references(text: str) -> str:
297
+ """
298
+ Remove hallucinated or missing references from the text.
299
+
300
+ This function identifies and removes references that are marked as missing or hallucinated
301
+ from the input text.
302
+
303
+ Args:
304
+ text (str): The input text containing references.
305
+
306
+ Returns:
307
+ str: The text with hallucinated references removed.
308
+ """
309
+ lines = text.split("\n")
310
+ if len(lines) == 0:
311
+ return ""
312
+ clean_lines = remove_numbers(lines)
313
+ slices = get_slices(lines, clean_lines)
314
+ to_delete = []
315
+ for sli in slices:
316
+ to_delete.append(remove_slice_from_lines(lines, clean_lines, sli))
317
+ for to_delete in reversed(to_delete):
318
+ text = text.replace(to_delete, "\n\n[MISSING_PAGE_POST]\n\n")
319
+ text = re.sub(
320
+ r"## References\n+\[MISSING_PAGE_POST(:\d+)?\]",
321
+ "\n\n[MISSING_PAGE_POST\\1]",
322
+ text,
323
+ )
324
+ return text
325
+
326
+
327
+ def postprocess_single(generation: str, markdown_fix: bool = True) -> str:
328
+ """
329
+ Postprocess a single generated text.
330
+
331
+ Args:
332
+ generation (str): The generated text to be postprocessed.
333
+ markdown_fix (bool, optional): Whether to perform Markdown formatting fixes. Default is True.
334
+
335
+ Returns:
336
+ str: The postprocessed text.
337
+ """
338
+ generation = re.sub(
339
+ r"(?:\n|^)#+ \d*\W? ?(.{100,})", r"\n\1", generation
340
+ ) # overly long section titles are most likely not real titles
341
+ generation = generation.strip()
342
+ generation = generation.replace("\n* [leftmargin=*]\n", "\n")
343
+ generation = re.sub(
344
+ r"^#+ (?:\.?(?:\d|[ixv])+)*\s*(?:$|\n\s*)", "", generation, flags=re.M
345
+ )
346
+ # most likely hallucinated titles
347
+ lines = generation.split("\n")
348
+ if (
349
+ lines[-1].startswith("#")
350
+ and lines[-1].lstrip("#").startswith(" ")
351
+ and len(lines) > 1
352
+ ):
353
+ print("INFO: likely hallucinated title at the end of the page: " + lines[-1])
354
+ generation = "\n".join(lines[:-1])
355
+ # obvious repetition detection
356
+ generation = truncate_repetitions(generation)
357
+ # Reference corrections
358
+ generation = remove_hallucinated_references(generation)
359
+ generation = re.sub(
360
+ r"^\* \[\d+\](\s?[A-W]\.+\s?){10,}.*$", "", generation, flags=re.M
361
+ )
362
+ generation = re.sub(r"^(\* \[\d+\])\[\](.*)$", r"\1\2", generation, flags=re.M)
363
+ generation = re.sub(r"(^\w\n\n|\n\n\w$)", "", generation)
364
+ # pmc math artifact correction
365
+ generation = re.sub(
366
+ r"([\s.,()])_([a-zA-Z0-9])__([a-zA-Z0-9]){1,3}_([\s.,:()])",
367
+ r"\1\(\2_{\3}\)\4",
368
+ generation,
369
+ )
370
+ generation = re.sub(
371
+ r"([\s.,\d])_([a-zA-Z0-9])_([\s.,\d;])", r"\1\(\2\)\3", generation
372
+ )
373
+ # footnote mistakes
374
+ generation = re.sub(
375
+ r"(\nFootnote .*?:) (?:footnotetext|thanks):\W*(.*(?:\n\n|$))",
376
+ r"\1 \2",
377
+ generation,
378
+ )
379
+ # TODO Come up with footnote formatting inside a table
380
+ generation = re.sub(r"\[FOOTNOTE:.+?\](.*?)\[ENDFOOTNOTE\]", "", generation)
381
+ # itemize post processing
382
+ for match in reversed(
383
+ list(
384
+ re.finditer(
385
+ r"(?:^)(-|\*)?(?!-|\*) ?((?:\d|[ixv])+ )?.+? (-|\*) (((?:\d|[ixv])+)\.(\d|[ixv]) )?.*(?:$)",
386
+ generation,
387
+ flags=re.I | re.M,
388
+ )
389
+ )
390
+ ):
391
+ start, stop = match.span()
392
+ delim = match.group(3) + " "
393
+ splits = match.group(0).split(delim)
394
+ replacement = ""
395
+ if match.group(1) is not None:
396
+ splits = splits[1:]
397
+ delim1 = match.group(1) + " "
398
+ else:
399
+ delim1 = ""
400
+ # too many false positives
401
+ continue
402
+ pre, post = generation[:start], generation[stop:]
403
+ for i, item in enumerate(splits):
404
+ level = 0
405
+ potential_numeral, _, rest = item.strip().partition(" ")
406
+ if not rest:
407
+ continue
408
+ if re.match(
409
+ r"^[\dixv]+((?:\.[\dixv])?)+$", potential_numeral, flags=re.I | re.M
410
+ ):
411
+ level = potential_numeral.count(".")
412
+
413
+ replacement += (
414
+ ("\n" if i > 0 else "")
415
+ + ("\t" * level)
416
+ + (delim if i > 0 or start == 0 else delim1)
417
+ + item.strip()
418
+ )
419
+ if post == "":
420
+ post = "\n"
421
+ generation = pre + replacement + post
422
+
423
+ if generation.endswith((".", "}")):
424
+ generation += "\n\n"
425
+ if re.match(r"[A-Z0-9,;:]$", generation):
426
+ # add a space in case there is a trailing comma or word ending
427
+ generation += " "
428
+ elif generation.startswith(("#", "**", "\\begin")):
429
+ generation = "\n\n" + generation
430
+ elif generation.split("\n")[-1].startswith(("#", "Figure", "Table")):
431
+ generation = generation + "\n\n"
432
+ else:
433
+ try:
434
+ last_word = generation.split(" ")[-1]
435
+ if last_word in words.words():
436
+ generation += " "
437
+ except LookupError:
438
+ # add space just in case. Will split words but better than concatenating them
439
+ generation += " "
440
+ # download for the next time
441
+ import nltk
442
+
443
+ nltk.download("words")
444
+ # table corrections
445
+ # remove obvious wrong tables
446
+ for l in generation.split("\n"):
447
+ if (
448
+ l.count("\\begin{tabular}") > 15
449
+ or l.count("\\multicolumn") > 60
450
+ or l.count("&") > 400
451
+ ):
452
+ generation = generation.replace(l, "")
453
+ # whitespace corrections
454
+ generation = generation.replace(
455
+ "\\begin{table} \\begin{tabular}", "\\begin{table}\n\\begin{tabular}"
456
+ )
457
+ generation = generation.replace(
458
+ "\\end{tabular} \\end{table}", "\\end{tabular}\n\\end{table}"
459
+ )
460
+ generation = generation.replace("\\end{table} Tab", "\\end{table}\nTab")
461
+ generation = re.sub(r"(^.+)\\begin{tab", r"\1\n\\begin{tab", generation, flags=re.M)
462
+
463
+ generation = generation.replace(
464
+ r"\begin{tabular}{l l} & \\ \end{tabular}", ""
465
+ ).replace("\\begin{tabular}{}\n\n\\end{tabular}", "")
466
+ generation = generation.replace("\\begin{array}[]{", "\\begin{array}{")
467
+ generation = re.sub(
468
+ r"\\begin{tabular}{([clr ]){2,}}\s*[& ]*\s*(\\\\)? \\end{tabular}",
469
+ "",
470
+ generation,
471
+ )
472
+ generation = re.sub(r"(\*\*S\. A\. B\.\*\*\n+){2,}", "", generation)
473
+ generation = re.sub(r"^#+( [\[\d\w])?$", "", generation, flags=re.M)
474
+ generation = re.sub(r"^\.\s*$", "", generation, flags=re.M)
475
+ generation = re.sub(r"\n{3,}", "\n\n", generation)
476
+ if markdown_fix:
477
+ return markdown_compatible(generation)
478
+ else:
479
+ return generation
480
+
481
+
482
+ def postprocess(
483
+ generation: Union[str, List[str]], markdown_fix: bool = True
484
+ ) -> Union[str, List[str]]:
485
+ """
486
+ Postprocess generated text or a list of generated texts.
487
+
488
+ This function can be used to perform postprocessing on generated text, such as fixing Markdown formatting.
489
+
490
+ Args:
491
+ generation (Union[str, List[str]]): The generated text or a list of generated texts.
492
+ markdown_fix (bool, optional): Whether to perform Markdown formatting fixes. Default is True.
493
+
494
+ Returns:
495
+ Union[str, List[str]]: The postprocessed text or list of postprocessed texts.
496
+ """
497
+ if type(generation) == list:
498
+ if os.environ.get("NOUGAT_MULTIPROCESSING"):
499
+ with Pool(int(os.environ.get("NOUGAT_MULTIPROCESSING"))) as p:
500
+ return p.map(
501
+ partial(postprocess_single, markdown_fix=markdown_fix), generation
502
+ )
503
+ else:
504
+ return [
505
+ postprocess_single(s, markdown_fix=markdown_fix) for s in generation
506
+ ]
507
+ else:
508
+ return postprocess_single(generation, markdown_fix=markdown_fix)
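A short sketch of calling `postprocess` on a list of page predictions; the `NOUGAT_MULTIPROCESSING` value is an arbitrary worker count chosen for illustration:

```python
import os
from nougat.postprocessing import postprocess

if __name__ == "__main__":
    os.environ["NOUGAT_MULTIPROCESSING"] = "4"  # optional: number of worker processes
    pages = [
        "# 1 Introduction\nSome generated text.",
        r"(2.1) \[a^{2}+b^{2}=c^{2}\]",
    ]
    cleaned = postprocess(pages, markdown_fix=True)
```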
nougat/transforms.py ADDED
@@ -0,0 +1,173 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ # Implements image augmentation
8
+
9
+ import albumentations as alb
10
+ from albumentations.pytorch import ToTensorV2
11
+ import cv2
12
+ import numpy as np
13
+ from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
14
+
15
+
16
+ def alb_wrapper(transform):
17
+ def f(im):
18
+ return transform(image=np.asarray(im))["image"]
19
+
20
+ return f
21
+
22
+
23
+ class Erosion(alb.ImageOnlyTransform):
24
+ """
25
+ Apply erosion operation to an image.
26
+
27
+ Erosion is a morphological operation that shrinks the white regions in a binary image.
28
+
29
+ Args:
30
+ scale (int or tuple/list of int): The scale or range for the size of the erosion kernel.
31
+ If an integer is provided, a square kernel of that size will be used.
32
+ If a tuple or list is provided, it should contain two integers representing the minimum
33
+ and maximum sizes for the erosion kernel.
34
+ always_apply (bool, optional): Whether to always apply this transformation. Default is False.
35
+ p (float, optional): The probability of applying this transformation. Default is 0.5.
36
+
37
+ Returns:
38
+ numpy.ndarray: The transformed image.
39
+ """
40
+
41
+ def __init__(self, scale, always_apply=False, p=0.5):
42
+ super().__init__(always_apply=always_apply, p=p)
43
+ if type(scale) is tuple or type(scale) is list:
44
+ assert len(scale) == 2
45
+ self.scale = scale
46
+ else:
47
+ self.scale = (scale, scale)
48
+
49
+ def apply(self, img, **params):
50
+ kernel = cv2.getStructuringElement(
51
+ cv2.MORPH_ELLIPSE, tuple(np.random.randint(self.scale[0], self.scale[1], 2))
52
+ )
53
+ img = cv2.erode(img, kernel, iterations=1)
54
+ return img
55
+
56
+
57
+ class Dilation(alb.ImageOnlyTransform):
58
+ """
59
+ Apply dilation operation to an image.
60
+
61
+ Dilation is a morphological operation that expands the white regions in a binary image.
62
+
63
+ Args:
64
+ scale (int or tuple/list of int): The scale or range for the size of the dilation kernel.
65
+ If an integer is provided, a square kernel of that size will be used.
66
+ If a tuple or list is provided, it should contain two integers representing the minimum
67
+ and maximum sizes for the dilation kernel.
68
+ always_apply (bool, optional): Whether to always apply this transformation. Default is False.
69
+ p (float, optional): The probability of applying this transformation. Default is 0.5.
70
+
71
+ Returns:
72
+ numpy.ndarray: The transformed image.
73
+ """
74
+
75
+ def __init__(self, scale, always_apply=False, p=0.5):
76
+ super().__init__(always_apply=always_apply, p=p)
77
+ if type(scale) is tuple or type(scale) is list:
78
+ assert len(scale) == 2
79
+ self.scale = scale
80
+ else:
81
+ self.scale = (scale, scale)
82
+
83
+ def apply(self, img, **params):
84
+ kernel = cv2.getStructuringElement(
85
+ cv2.MORPH_ELLIPSE, tuple(np.random.randint(self.scale[0], self.scale[1], 2))
86
+ )
87
+ img = cv2.dilate(img, kernel, iterations=1)
88
+ return img
89
+
90
+
91
+ class Bitmap(alb.ImageOnlyTransform):
92
+ """
93
+ Apply a bitmap-style transformation to an image.
94
+
95
+ This transformation replaces all pixel values below a certain threshold with a specified value.
96
+
97
+ Args:
98
+ value (int, optional): The value to replace pixels below the threshold with. Default is 0.
99
+ lower (int, optional): The threshold value below which pixels will be replaced. Default is 200.
100
+ always_apply (bool, optional): Whether to always apply this transformation. Default is False.
101
+ p (float, optional): The probability of applying this transformation. Default is 0.5.
102
+
103
+ Returns:
104
+ numpy.ndarray: The transformed image.
105
+ """
106
+
107
+ def __init__(self, value=0, lower=200, always_apply=False, p=0.5):
108
+ super().__init__(always_apply=always_apply, p=p)
109
+ self.lower = lower
110
+ self.value = value
111
+
112
+ def apply(self, img, **params):
113
+ img = img.copy()
114
+ img[img < self.lower] = self.value
115
+ return img
116
+
117
+
118
+ train_transform = alb_wrapper(
119
+ alb.Compose(
120
+ [
121
+ Bitmap(p=0.05),
122
+ alb.OneOf([Erosion((2, 3)), Dilation((2, 3))], p=0.02),
123
+ alb.Affine(shear={"x": (0, 3), "y": (-3, 0)}, cval=(255, 255, 255), p=0.03),
124
+ alb.ShiftScaleRotate(
125
+ shift_limit_x=(0, 0.04),
126
+ shift_limit_y=(0, 0.03),
127
+ scale_limit=(-0.15, 0.03),
128
+ rotate_limit=2,
129
+ border_mode=0,
130
+ interpolation=2,
131
+ value=(255, 255, 255),
132
+ p=0.03,
133
+ ),
134
+ alb.GridDistortion(
135
+ distort_limit=0.05,
136
+ border_mode=0,
137
+ interpolation=2,
138
+ value=(255, 255, 255),
139
+ p=0.04,
140
+ ),
141
+ alb.Compose(
142
+ [
143
+ alb.Affine(
144
+ translate_px=(0, 5), always_apply=True, cval=(255, 255, 255)
145
+ ),
146
+ alb.ElasticTransform(
147
+ p=1,
148
+ alpha=50,
149
+ sigma=120 * 0.1,
150
+ alpha_affine=120 * 0.01,
151
+ border_mode=0,
152
+ value=(255, 255, 255),
153
+ ),
154
+ ],
155
+ p=0.04,
156
+ ),
157
+ alb.RandomBrightnessContrast(0.1, 0.1, True, p=0.03),
158
+ alb.ImageCompression(95, p=0.07),
159
+ alb.GaussNoise(20, p=0.08),
160
+ alb.GaussianBlur((3, 3), p=0.03),
161
+ alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
162
+ ToTensorV2(),
163
+ ]
164
+ )
165
+ )
166
+ test_transform = alb_wrapper(
167
+ alb.Compose(
168
+ [
169
+ alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
170
+ ToTensorV2(),
171
+ ]
172
+ )
173
+ )
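A usage sketch for the transforms above; the blank image is a stand-in for a rasterized page:

```python
import numpy as np
from PIL import Image
from nougat.transforms import train_transform, test_transform

page = Image.fromarray(np.full((896, 672, 3), 255, dtype=np.uint8))  # stand-in page

augmented = train_transform(page)    # random augmentations + normalization + tensor
clean = test_transform(page)         # normalization + tensor only
print(augmented.shape, clean.shape)  # torch.Size([3, 896, 672]) for both
```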
nougat/utils/__init__.py ADDED
File without changes
nougat/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (149 Bytes).
 
nougat/utils/__pycache__/checkpoint.cpython-310.pyc ADDED
Binary file (3.84 kB).
 
nougat/utils/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (8.96 kB).
 
nougat/utils/checkpoint.py ADDED
@@ -0,0 +1,119 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ from typing import Optional
8
+ import requests
9
+ import os
10
+ import tqdm
11
+ import io
12
+ from pathlib import Path
13
+ import torch
14
+
15
+ BASE_URL = "https://github.com/facebookresearch/nougat/releases/download"
16
+ MODEL_TAG = "0.1.0-small"
17
+
18
+
19
+ # source: https://stackoverflow.com/a/71459251
20
+ def download_as_bytes_with_progress(url: str, name: str = None) -> bytes:
21
+ """
22
+ Download a file from a URL and return the contents as bytes, with progress bar.
23
+
24
+ Args:
25
+ url: The URL of the file to download.
26
+ name: Label shown in the progress bar. If None, the URL is used.
27
+
28
+ Returns:
29
+ bytes: The contents of the file.
30
+ """
31
+ resp = requests.get(url, stream=True, allow_redirects=True)
32
+ total = int(resp.headers.get("content-length", 0))
33
+ bio = io.BytesIO()
34
+ if name is None:
35
+ name = url
36
+ with tqdm.tqdm(
37
+ desc=name,
38
+ total=total,
39
+ unit="b",
40
+ unit_scale=True,
41
+ unit_divisor=1024,
42
+ ) as bar:
43
+ for chunk in resp.iter_content(chunk_size=65536):
44
+ bar.update(len(chunk))
45
+ bio.write(chunk)
46
+ return bio.getvalue()
47
+
48
+
49
+ def download_checkpoint(checkpoint: Path, model_tag: str = MODEL_TAG):
50
+ """
51
+ Download the Nougat model checkpoint.
52
+
53
+ This function downloads the Nougat model checkpoint from GitHub.
54
+
55
+ Args:
56
+ checkpoint (Path): The path to the checkpoint.
57
+ model_tag (str): The model tag to download. Default is "0.1.0-small".
58
+ """
59
+ print("downloading nougat checkpoint version", model_tag, "to path", checkpoint)
60
+ files = [
61
+ "config.json",
62
+ "pytorch_model.bin",
63
+ "special_tokens_map.json",
64
+ "tokenizer.json",
65
+ "tokenizer_config.json",
66
+ ]
67
+ for file in files:
68
+ download_url = f"{BASE_URL}/{model_tag}/{file}"
69
+ binary_file = download_as_bytes_with_progress(download_url, file)
70
+ if len(binary_file) > 15: # sanity check
71
+ (checkpoint / file).write_bytes(binary_file)
72
+
73
+
74
+ def torch_hub(model_tag: Optional[str] = MODEL_TAG) -> Path:
75
+ old_path = Path(torch.hub.get_dir() + "/nougat")
76
+ if model_tag is None:
77
+ model_tag = MODEL_TAG
78
+ hub_path = old_path.with_name(f"nougat-{model_tag}")
79
+ if old_path.exists():
80
+ # move to new format
81
+ old_path.rename(old_path.with_name("nougat-0.1.0-small"))
82
+ return hub_path
83
+
84
+
85
+ def get_checkpoint(
86
+ checkpoint_path: Optional[os.PathLike] = None,
87
+ model_tag: str = MODEL_TAG,
88
+ download: bool = True,
89
+ ) -> Path:
90
+ """
91
+ Get the path to the Nougat model checkpoint.
92
+
93
+ This function retrieves the path to the Nougat model checkpoint. If the checkpoint does not
94
+ exist or is empty, it can optionally download the checkpoint.
95
+
96
+ Args:
97
+ checkpoint_path (Optional[os.PathLike]): The path to the checkpoint. If not provided,
98
+ it will check the "NOUGAT_CHECKPOINT" environment variable or use the default location.
99
+ Default is None.
100
+ model_tag (str): The model tag to download. Default is "0.1.0-small".
101
+ download (bool): Whether to download the checkpoint if it doesn't exist or is empty.
102
+ Default is True.
103
+
104
+ Returns:
105
+ Path: The path to the Nougat model checkpoint.
106
+ """
107
+ checkpoint = Path(
108
+ checkpoint_path or os.environ.get("NOUGAT_CHECKPOINT", torch_hub(model_tag))
109
+ )
110
+ if checkpoint.exists() and checkpoint.is_file():
111
+ checkpoint = checkpoint.parent
112
+ if download and (not checkpoint.exists() or len(os.listdir(checkpoint)) < 5):
113
+ checkpoint.mkdir(parents=True, exist_ok=True)
114
+ download_checkpoint(checkpoint, model_tag=model_tag or MODEL_TAG)
115
+ return checkpoint
116
+
117
+
118
+ if __name__ == "__main__":
119
+ get_checkpoint()
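A sketch of resolving the checkpoint; the explicit local path is a placeholder:

```python
from nougat.utils.checkpoint import get_checkpoint

# Download (if necessary) and return the default small checkpoint directory;
# the NOUGAT_CHECKPOINT environment variable takes precedence if set.
checkpoint = get_checkpoint()

# Or point at an existing local directory without downloading.
local = get_checkpoint("path/to/checkpoint", download=False)
```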
nougat/utils/dataset.py ADDED
@@ -0,0 +1,280 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ Copyright (c) Meta Platforms, Inc. and affiliates.
6
+ """
7
+ import logging
8
+ import os
9
+ from math import prod
10
+ from pathlib import Path
11
+ from functools import partial
12
+ import random
13
+ from typing import Dict, Tuple, Callable
14
+ from PIL import Image, UnidentifiedImageError
15
+ from typing import List, Optional
16
+
17
+ import torch
18
+ import pypdf
19
+ import orjson
20
+ from torch.utils.data import Dataset
21
+ from transformers.modeling_utils import PreTrainedModel
22
+ from nougat.dataset.rasterize import rasterize_paper
23
+
24
+
25
+ class ImageDataset(torch.utils.data.Dataset):
26
+ """
27
+ Dataset for processing a list of images using a preparation function.
28
+
29
+ This dataset takes a list of image paths and applies a preparation function to each image.
30
+
31
+ Args:
32
+ img_list (list): List of image paths.
33
+ prepare (Callable): A preparation function to process the images.
34
+
35
+ Attributes:
36
+ img_list (list): List of image paths.
37
+ prepare (Callable): The preparation function.
38
+ """
39
+
40
+ def __init__(self, img_list, prepare: Callable):
41
+ super().__init__()
42
+ self.img_list = img_list
43
+ self.prepare = prepare
44
+
45
+ def __len__(self):
46
+ return len(self.img_list)
47
+
48
+ @staticmethod
49
+ def ignore_none_collate(batch):
50
+ if batch is None:
51
+ return
52
+ try:
53
+ batch = [x for x in batch if x is not None and x[0] is not None]
54
+ if len(batch) == 0:
55
+ return
56
+ return torch.utils.data.dataloader.default_collate(batch)
57
+ except AttributeError:
58
+ pass
59
+
60
+ def __getitem__(self, idx):
61
+ try:
62
+ img = Image.open(self.img_list[idx])
63
+ return self.prepare(img)
64
+ except Exception as e:
65
+ logging.error(e)
66
+
67
+
68
+ class LazyDataset(Dataset):
69
+ """
70
+ Lazy loading dataset for processing PDF documents.
71
+
72
+ This dataset allows lazy loading of PDF documents and provides access to processed images
73
+ using a specified preparation function.
74
+
75
+ Args:
76
+ pdf (str): Path to the PDF document.
77
+ prepare (Callable): A preparation function to process the images.
78
+
79
+ Attributes:
80
+ name (str): Name of the PDF document.
81
+ """
82
+
83
+ def __init__(self, pdf, prepare: Callable, pages: Optional[List[int]] = None):
84
+ super().__init__()
85
+ self.prepare = prepare
86
+ self.name = str(pdf)
87
+ self.init_fn = partial(rasterize_paper, pdf, pages=pages)
88
+ self.dataset = None
89
+ self.size = len(pypdf.PdfReader(pdf).pages) if pages is None else len(pages)
90
+
91
+ def __len__(self):
92
+ return self.size
93
+
94
+ def __getitem__(self, i):
95
+ if i == 0 or self.dataset is None:
96
+ self.dataset = ImageDataset(self.init_fn(), self.prepare)
97
+ if i <= self.size and i >= 0:
98
+ return self.dataset[i], self.name if i == self.size - 1 else ""
99
+ else:
100
+ raise IndexError
101
+
102
+ @staticmethod
103
+ def ignore_none_collate(batch):
104
+ if batch is None:
105
+ return None, None
106
+ try:
107
+ _batch = []
108
+ for i, x in enumerate(batch):
109
+ image, name = x
110
+ if image is not None:
111
+ _batch.append(x)
112
+ elif name:
113
+ if i > 0:
114
+ _batch[-1] = (_batch[-1][0], name)
115
+ elif len(batch) > 1:
116
+ _batch.append((batch[1][0] * 0, name))
117
+ if len(_batch) == 0:
118
+ return None, None
119
+ return torch.utils.data.dataloader.default_collate(_batch)
120
+ except AttributeError:
121
+ pass
122
+ return None, None
123
+
124
+
125
+ class SciPDFDataset(Dataset):
126
+ """
127
+ Custom dataset for scientific PDF data.
128
+
129
+ This dataset loads data from JSONL files and provides access to images, ground truth,
130
+ and metadata.
131
+
132
+ Args:
133
+ path_to_index (str): Path to the index file.
134
+ split (str, optional): Split of the dataset (e.g., "train", "test"). Default is "train".
135
+ root_name (str, optional): Root directory name. Default is an empty string.
136
+ template (str, optional): Template for split naming. Default is "%s".
137
+
138
+ Attributes:
139
+ empty_sample: Placeholder for empty samples.
140
+ """
141
+
142
+ empty_sample = None
143
+
144
+ def __init__(
145
+ self,
146
+ path_to_index: str,
147
+ split: str = "train",
148
+ root_name="",
149
+ template="%s",
150
+ ) -> None:
151
+ super().__init__()
152
+ self.path_to_index = Path(path_to_index)
153
+ self.root_name = root_name
154
+ self.path_to_root = self.path_to_index.parent
155
+ if not split in self.path_to_index.stem:
156
+ pti = self.path_to_root / (template % split + ".jsonl")
157
+ if pti.exists():
158
+ self.path_to_index = pti
159
+ else:
160
+ raise ValueError(f'Dataset file for split "{split}" not found: {pti}')
161
+ self.dataset_file = None # multiprocessing: the file handle is opened lazily in __getitem__
162
+ # load seek map
163
+ seek_path = self.path_to_root / (self.path_to_index.stem + ".seek.map")
164
+ if seek_path.exists():
165
+ self.seek_map = orjson.loads(seek_path.open().read())
166
+ else:
167
+ raise ValueError(
168
+ 'No "%s" found in %s' % (seek_path.name, str(self.path_to_root))
169
+ )
170
+ self.dataset_length = len(self.seek_map)
171
+
172
+ def __len__(self) -> int:
173
+ return self.dataset_length
174
+
175
+ def __getitem__(self, index: int) -> Dict:
176
+ position = self.seek_map[index]
177
+ if self.dataset_file is None:
178
+ self.dataset_file = self.path_to_index.open()
179
+ self.dataset_file.seek(position)
180
+ line = self.dataset_file.readline()
181
+ try:
182
+ data: Dict = orjson.loads(line)
183
+ except Exception as e:
184
+ logging.info(
185
+ "JSONL for sample %i could not be loaded at position %i: %s\n%s",
186
+ index,
187
+ position,
188
+ str(e),
189
+ line,
190
+ )
191
+ return self.empty_sample
192
+ img_path: Path = self.path_to_root / self.root_name / data.pop("image")
193
+ if not img_path.exists():
194
+ logging.info("Sample %s could not be found.", img_path)
195
+ return self.empty_sample
196
+ try:
197
+ img = Image.open(img_path)
198
+ except UnidentifiedImageError:
199
+ logging.info("Image %s could not be opened.", img_path)
200
+ return self.empty_sample
201
+ return {"image": img, "ground_truth": data.pop("markdown"), "meta": data}
202
+
203
+ def __iter__(self):
204
+ for i in range(self.dataset_length):
205
+ yield self[i]
206
+
207
+
208
+ class NougatDataset(Dataset):
209
+ """
210
+ Args:
211
+ dataset_path: the path to the jsonl file
212
+ """
213
+
214
+ def __init__(
215
+ self,
216
+ dataset_path: str,
217
+ nougat_model: PreTrainedModel,
218
+ max_length: int,
219
+ split: str = "train",
220
+ root_name: str = "arxiv",
221
+ ):
222
+ super().__init__()
223
+ self.nougat_model = nougat_model
224
+ self.max_length = max_length
225
+ self.split = split
226
+ self.perturb = "NOUGAT_PERTURB" in os.environ and os.environ["NOUGAT_PERTURB"]
227
+ # TODO improve naming conventions
228
+ template = "%s"
229
+ self.dataset = SciPDFDataset(
230
+ dataset_path, split=self.split, template=template, root_name=root_name
231
+ )
232
+ self.dataset_length = len(self.dataset)
233
+
234
+ def __len__(self) -> int:
235
+ return self.dataset_length
236
+
237
+ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
238
+ """
239
+ Load image from image_path of given dataset_path and convert into input_tensor and labels.
240
+ Convert gt data into input_ids (tokenized string)
241
+
242
+ Returns:
243
+ input_tensor : preprocessed image
244
+ input_ids : tokenized gt_data
245
+ """
246
+ sample = self.dataset[idx]
247
+ if sample is None:
248
+ # if sample is broken choose another randomly
249
+ return self[random.randint(0, self.dataset_length - 1)]
250
+ if sample is None or sample["image"] is None or prod(sample["image"].size) == 0:
251
+ input_tensor = None
252
+ else:
253
+ input_tensor = self.nougat_model.encoder.prepare_input(
254
+ sample["image"], random_padding=self.split == "train"
255
+ )
256
+
257
+ tokenizer_out = self.nougat_model.decoder.tokenizer(
258
+ sample["ground_truth"],
259
+ max_length=self.max_length,
260
+ padding="max_length",
261
+ return_token_type_ids=False,
262
+ truncation=True,
263
+ return_tensors="pt",
264
+ )
265
+ input_ids = tokenizer_out["input_ids"].squeeze(0)
266
+ attention_mask = tokenizer_out["attention_mask"].squeeze(0)
267
+ # randomly perturb ground truth tokens
268
+ if self.split == "train" and self.perturb:
269
+ # check if we perturb tokens
270
+ unpadded_length = attention_mask.sum()
271
+ while random.random() < 0.1:
272
+ try:
273
+ pos = random.randint(1, unpadded_length - 2)
274
+ token = random.randint(
275
+ 23, len(self.nougat_model.decoder.tokenizer) - 1
276
+ )
277
+ input_ids[pos] = token
278
+ except ValueError:
279
+ break
280
+ return input_tensor, input_ids, attention_mask
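A sketch of wiring `LazyDataset` into a `DataLoader`, mirroring `predict.py` below; the PDF path is a placeholder:

```python
from functools import partial
import torch
from nougat import NougatModel
from nougat.utils.checkpoint import get_checkpoint
from nougat.utils.dataset import LazyDataset

model = NougatModel.from_pretrained(get_checkpoint()).eval()
dataset = LazyDataset(
    "paper.pdf",  # placeholder input PDF
    partial(model.encoder.prepare_input, random_padding=False),
)
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=LazyDataset.ignore_none_collate,
)
for images, is_last_page in loader:
    # is_last_page[j] holds the PDF path on the document's final page, "" otherwise.
    pass
```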
nougat/utils/device.py ADDED
@@ -0,0 +1,38 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import torch
8
+ import logging
9
+
10
+
11
+ def default_batch_size():
12
+ if torch.cuda.is_available():
13
+ batch_size = int(
14
+ torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1000 * 0.3
15
+ )
16
+ if batch_size == 0:
17
+ logging.warning("GPU VRAM is too small. Computing on CPU.")
18
+ elif torch.backends.mps.is_available():
19
+ # I don't know if there's an equivalent API so heuristically choosing bs=4
20
+ batch_size = 4
21
+ else:
22
+ # don't know what a good value is here. Would not recommend to run on CPU
23
+ batch_size = 1
24
+ logging.warning("No GPU found. Conversion on CPU is very slow.")
25
+ return batch_size
26
+
27
+
28
+ def move_to_device(model, bf16: bool = True, cuda: bool = True):
29
+ try:
30
+ if torch.backends.mps.is_available():
31
+ return model.to("mps")
32
+ except AttributeError:
33
+ pass
34
+ if bf16:
35
+ model = model.to(torch.bfloat16)
36
+ if cuda and torch.cuda.is_available():
37
+ model = model.to("cuda")
38
+ return model
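A sketch of the device helpers in use:

```python
from nougat import NougatModel
from nougat.utils.checkpoint import get_checkpoint
from nougat.utils.device import default_batch_size, move_to_device

model = NougatModel.from_pretrained(get_checkpoint())
model = move_to_device(model, bf16=True, cuda=True)  # prefers MPS, else bf16 + CUDA
batch_size = max(1, default_batch_size())  # heuristic; 0 means the GPU VRAM is too small
```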
predict.py ADDED
@@ -0,0 +1,172 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+
4
+ This source code is licensed under the MIT license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+ import sys
8
+ from pathlib import Path
9
+ import logging
10
+ import re
11
+ import argparse
12
+ import re
13
+ from functools import partial
14
+ import torch
15
+ from torch.utils.data import ConcatDataset
16
+ from tqdm import tqdm
17
+ from nougat import NougatModel
18
+ from nougat.utils.dataset import LazyDataset
19
+ from nougat.utils.checkpoint import get_checkpoint
20
+ from nougat.postprocessing import markdown_compatible
21
+ import fitz
22
+
23
+ logging.basicConfig(level=logging.INFO)
24
+
25
+ if torch.cuda.is_available():
26
+ BATCH_SIZE = int(
27
+ torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1000 * 0.3
28
+ )
29
+ if BATCH_SIZE == 0:
30
+ logging.warning("GPU VRAM is too small. Computing on CPU.")
31
+ else:
32
+ # don't know what a good value is here. Would not recommend to run on CPU
33
+ BATCH_SIZE = 1
34
+ logging.warning("No GPU found. Conversion on CPU is very slow.")
35
+
36
+
37
+ def get_args():
38
+ parser = argparse.ArgumentParser()
39
+ parser.add_argument(
40
+ "--batchsize",
41
+ "-b",
42
+ type=int,
43
+ default=BATCH_SIZE,
44
+ help="Batch size to use.",
45
+ )
46
+ parser.add_argument(
47
+ "--checkpoint",
48
+ "-c",
49
+ type=Path,
50
+ default=None,
51
+ help="Path to checkpoint directory.",
52
+ )
53
+ parser.add_argument("--out", "-o", type=Path, help="Output directory.")
54
+ parser.add_argument(
55
+ "--recompute",
56
+ action="store_true",
57
+ help="Recompute already computed PDF, discarding previous predictions.",
58
+ )
59
+ parser.add_argument(
60
+ "--markdown",
61
+ action="store_true",
62
+ help="Add postprocessing step for markdown compatibility.",
63
+ )
64
+ parser.add_argument("pdf", nargs="+", type=Path, help="PDF(s) to process.")
65
+ args = parser.parse_args()
66
+ if args.checkpoint is None or not args.checkpoint.exists():
67
+ args.checkpoint = get_checkpoint(args.checkpoint)
68
+ if args.out is None:
69
+ logging.warning("No output directory. Output will be printed to console.")
70
+ else:
71
+ if not args.out.exists():
72
+ logging.info("Output directory does not exist. Creating output directory.")
73
+ args.out.mkdir(parents=True)
74
+ if not args.out.is_dir():
75
+ logging.error("Output has to be directory.")
76
+ sys.exit(1)
77
+ if len(args.pdf) == 1 and not args.pdf[0].suffix == ".pdf":
78
+ # input is a list of pdfs
79
+ try:
80
+ args.pdf = [
81
+ Path(l) for l in open(args.pdf[0]).read().split("\n") if len(l) > 0
82
+ ]
83
+ except:
84
+ pass
85
+ return args
86
+
87
+
88
+ def main():
89
+ args = get_args()
90
+ model = NougatModel.from_pretrained(args.checkpoint).to(torch.bfloat16)
91
+ if args.batchsize > 0:
92
+ if torch.cuda.is_available():
93
+ model.to("cuda")
94
+ else:
95
+ # set batch size to 1. Need to check if there are benefits for CPU conversion for >1
96
+ args.batchsize = 1
97
+ model.eval()
98
+ datasets = []
99
+ for pdf in args.pdf:
100
+ if not pdf.exists():
101
+ continue
102
+ if args.out:
103
+ out_path = args.out / pdf.with_suffix(".mmd").name
104
+ if out_path.exists() and not args.recompute:
105
+ logging.info(
106
+ f"Skipping {pdf.name}, already computed. Run with --recompute to convert again."
107
+ )
108
+ continue
109
+ try:
110
+ dataset = LazyDataset(
111
+ pdf, partial(model.encoder.prepare_input, random_padding=False)
112
+ )
113
+ except fitz.fitz.FileDataError:
114
+ logging.info(f"Could not load file {str(pdf)}.")
115
+ continue
116
+ datasets.append(dataset)
117
+ if len(datasets) == 0:
118
+ return
119
+ dataloader = torch.utils.data.DataLoader(
120
+ ConcatDataset(datasets),
121
+ batch_size=args.batchsize,
122
+ shuffle=False,
123
+ collate_fn=LazyDataset.ignore_none_collate,
124
+ )
125
+
126
+ predictions = []
127
+ file_index = 0
128
+ page_num = 0
129
+ for i, (sample, is_last_page) in enumerate(tqdm(dataloader)):
130
+ model_output = model.inference(image_tensors=sample)
131
+ # check if model output is faulty
132
+ for j, output in enumerate(model_output["predictions"]):
133
+ if page_num == 0:
134
+ logging.info(
135
+ "Processing file %s with %i pages"
136
+ % (datasets[file_index].name, datasets[file_index].size)
137
+ )
138
+ page_num += 1
139
+ if output.strip() == "[MISSING_PAGE_POST]":
140
+ # uncaught repetitions -- most likely empty page
141
+ predictions.append(f"\n\n[MISSING_PAGE_EMPTY:{page_num}]\n\n")
142
+ elif model_output["repeats"][j] is not None:
143
+ if model_output["repeats"][j] > 0:
144
+ # If we end up here, it means the output is most likely not complete and was truncated.
145
+ logging.warning(f"Skipping page {page_num} due to repetitions.")
146
+ predictions.append(f"\n\n[MISSING_PAGE_FAIL:{page_num}]\n\n")
147
+ else:
148
+ # If we end up here, it means the document page is too different from the training domain.
149
+ # This can happen e.g. for cover pages.
150
+ predictions.append(
151
+ f"\n\n[MISSING_PAGE_EMPTY:{i*args.batchsize+j+1}]\n\n"
152
+ )
153
+ else:
154
+ if args.markdown:
155
+ output = markdown_compatible(output)
156
+ predictions.append(output)
157
+ if is_last_page[j]:
158
+ out = "".join(predictions).strip()
159
+ out = re.sub(r"\n{3,}", "\n\n", out).strip()
160
+ if args.out:
161
+ out_path = args.out / Path(is_last_page[j]).with_suffix(".mmd").name
162
+ out_path.parent.mkdir(parents=True, exist_ok=True)
163
+ out_path.write_text(out, encoding="utf-8")
164
+ else:
165
+ print(out, "\n\n")
166
+ predictions = []
167
+ page_num = 0
168
+ file_index += 1
169
+
170
+
171
+ if __name__ == "__main__":
172
+ main()
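The script is normally run from the command line (e.g. `python predict.py paper.pdf --out out --markdown`); a sketch of driving it programmatically, with placeholder paths:

```python
import sys
import predict

# Equivalent to: python predict.py paper.pdf --out out --markdown
sys.argv = ["predict.py", "paper.pdf", "--out", "out", "--markdown"]
predict.main()
```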