import argparse
import tempfile
import time
from collections import defaultdict

from tqdm import tqdm
import pypdfium2 as pdfium

from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.benchmark.scoring import score_text
from marker.pdf.extract_text import naive_get_text
import json
import logging
import os
import subprocess
import shutil
from tabulate import tabulate
import torch

configure_logging()
logger = logging.getLogger(__name__)


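# Thin wrappers around torch.cuda.memory's memory-history recorder. The dumped
# .pickle snapshots can be inspected with PyTorch's memory snapshot tooling
# (e.g. pytorch.org/memory_viz).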
def start_memory_profiling():
    torch.cuda.memory._record_memory_history(
        max_entries=100000
    )


def stop_memory_profiling(memory_file):
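    # Dump the recorded allocation history to a pickle file, then stop recording.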
    try:
        torch.cuda.memory._dump_snapshot(memory_file)
    except Exception as e:
        logger.error(f"Failed to capture memory snapshot {e}")

    # Stop recording memory snapshot history.
    torch.cuda.memory._record_memory_history(enabled=None)


def nougat_prediction(pdf_filename, batch_size=1):
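    # Run the nougat CLI on a single PDF, writing markdown into a temp directory,
    # then read the result back and clean up.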
    out_dir = tempfile.mkdtemp()
    subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
    md_file = os.listdir(out_dir)[0]
    with open(os.path.join(out_dir, md_file), "r") as f:
        data = f.read()
    shutil.rmtree(out_dir)
    return data


def main():
    parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a reference folder with the correct markdown.")
    parser.add_argument("in_folder", help="Input folder with PDF files")
    parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
    parser.add_argument("out_file", help="Output filename")
    parser.add_argument("--nougat", action="store_true", help="Run nougat and compare", default=False)
    # Nougat batch size 1 uses about as much VRAM as default marker settings
    parser.add_argument("--marker_batch_multiplier", type=int, default=1, help="Batch size multiplier to use for marker when making predictions.")
    parser.add_argument("--nougat_batch_size", type=int, default=1, help="Batch size to use for nougat when making predictions.")
    parser.add_argument("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
    parser.add_argument("--profile_memory", action="store_true", help="Profile memory usage", default=False)

    args = parser.parse_args()

    methods = ["marker"]
    if args.nougat:
        methods.append("nougat")

    if args.profile_memory:
        start_memory_profiling()

    model_lst = load_all_models()

    if args.profile_memory:
        stop_memory_profiling("model_load.pickle")

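    # Results are keyed by method, then filename; pages maps each filename to its page count.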
    scores = defaultdict(dict)
    benchmark_files = os.listdir(args.in_folder)
    benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
    times = defaultdict(dict)
    pages = defaultdict(int)

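    # Convert each benchmark PDF with every method, timing each run and scoring
    # the output against the reference markdown.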
    for idx, fname in tqdm(enumerate(benchmark_files), total=len(benchmark_files)):
        md_filename = fname.rsplit(".", 1)[0] + ".md"

        reference_filename = os.path.join(args.reference_folder, md_filename)
        with open(reference_filename, "r", encoding="utf-8") as f:
            reference = f.read()

        pdf_filename = os.path.join(args.in_folder, fname)
        doc = pdfium.PdfDocument(pdf_filename)
        pages[fname] = len(doc)

        for method in methods:
            start = time.time()
            if method == "marker":
                if args.profile_memory:
                    start_memory_profiling()
                full_text, _, out_meta = convert_single_pdf(pdf_filename, model_lst, batch_multiplier=args.marker_batch_multiplier)
                if args.profile_memory:
                    stop_memory_profiling(f"marker_memory_{idx}.pickle")
            elif method == "nougat":
                full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
            elif method == "naive":
                full_text = naive_get_text(doc)
            else:
                raise ValueError(f"Unknown method {method}")

            times[method][fname] = time.time() - start

            score = score_text(full_text, reference)
            scores[method][fname] = score

            if args.md_out_path:
                md_out_filename = f"{method}_{md_filename}"
                os.makedirs(args.md_out_path, exist_ok=True)
                with open(os.path.join(args.md_out_path, md_out_filename), "w+", encoding="utf-8") as f:
                    f.write(full_text)

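    # Aggregate per-file times and scores into per-method summaries and write them as JSON.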
    total_pages = sum(pages.values())
    with open(args.out_file, "w+") as f:
        write_data = defaultdict(dict)
        for method in methods:
            total_time = sum(times[method].values())
            file_stats = {
                fname:
                {
                    "time": times[method][fname],
                    "score": scores[method][fname],
                    "pages": pages[fname]
                }

                for fname in benchmark_files
            }
            write_data[method] = {
                "files": file_stats,
                "avg_score": sum(scores[method].values()) / len(scores[method]),
                "time_per_page": total_time / total_pages,
                "time_per_doc": total_time / len(scores[method])
            }

        json.dump(write_data, f, indent=4)

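    # Print a per-method summary plus a per-file score breakdown.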
    summary_table = []
    score_table = []
    score_headers = benchmark_files
    for method in methods:
        summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
        score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])

    print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
    print("")
    print("Scores by file")
    print(tabulate(score_table, headers=["Method", *score_headers]))


if __name__ == "__main__":
    main()