File size: 4,840 Bytes
0933b39
77fbded
 
 
 
 
 
 
 
 
 
 
 
 
 
0933b39
acbe414
77fbded
 
 
 
 
 
acbe414
77fbded
 
 
 
 
 
 
188f052
 
6c0ba50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188f052
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d8e906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703dc2e
 
 
 
 
 
 
 
 
7e604f0
703dc2e
df456bd
 
 
 
 
 
 
703dc2e
8d8e906
703dc2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import functools
import re
from pathlib import Path
from shutil import copy2

import pymupdf


def remove_images_from_markdown(markdown_text):
    """Strip image references from a markdown string.

    Two forms are deleted: HTML ``<img ...>`` tags and markdown image
    syntax ``![alt](target)``. All other text is returned unchanged.
    """
    image_patterns = (
        r"<img[^>]*>",  # HTML image tag
        r"!\[[^\]]*\]\([^)]*\)",  # markdown image: ![alt](target)
    )
    for pattern in image_patterns:
        markdown_text = re.sub(pattern, "", markdown_text)
    return markdown_text


@functools.lru_cache(maxsize=None)
def trim_pages(pdf_path, output_path, start_page=0, trim_pages=5):
    """Write a page-limited copy of a PDF and return the new file's path.

    The output file is named after the directory that contains ``pdf_path``
    (``<parent dir name>.pdf``) and is placed inside ``output_path``.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Directory that receives the output file.
        start_page: Zero-based index of the first page to keep.
        trim_pages: Maximum number of pages to keep.

    Returns:
        The output file path as a string.

    Note:
        Results are memoised by ``functools.lru_cache``: all arguments must be
        hashable, and a repeated call with the same arguments returns the
        cached path without touching the file system again.
    """
    parent_dir_name = Path(pdf_path).parent.name
    output_file_path = Path(output_path) / f"{parent_dir_name}.pdf"

    # The context manager closes the document (and its file handle) on every
    # path, including when select/save raises.
    with pymupdf.open(pdf_path) as doc:
        num_pages = len(doc)
        if num_pages > trim_pages:
            last_page = min(start_page + trim_pages, num_pages)
            to_select = list(range(start_page, last_page))
            doc.select(to_select)
            doc.ez_save(output_file_path)
            print("Trimmed pdf to with pages", to_select, "path", output_file_path)
            return str(output_file_path)

    # Document is short enough already: copy it verbatim. Note that
    # start_page is not applied on this path.
    copy2(pdf_path, str(output_file_path))
    return str(output_file_path)


def patch_unimernet_model():
    """Monkey patch ``CustomMBartForCausalLM`` so it is built with eager attention.

    The class's ``__init__`` is swapped for a wrapper that sets
    ``config._attn_implementation = "eager"`` and then delegates to the
    constructor that was in place before the patch.
    """
    from unimernet.models.unimernet.encoder_decoder import CustomMBartForCausalLM

    model_cls = CustomMBartForCausalLM
    # Keep a handle on the pre-patch constructor so the wrapper can call it.
    wrapped_init = model_cls.__init__

    def init_with_eager_attention(self, config):
        config._attn_implementation = "eager"
        wrapped_init(self, config)

    model_cls.__init__ = init_with_eager_attention


def fix_problematic_imports():
    """Install a stub for the ``...mfr.unimernet.Unimernet`` module.

    A placeholder module exposing an empty ``UnimernetModel`` class is put
    into ``sys.modules`` under the real module's dotted name, so a later
    import of that name resolves to the stub.
    """
    import sys
    import types

    module_name = "magic_pdf.model.sub_modules.mfr.unimernet.Unimernet"

    # Empty stand-in class; it only needs to exist under the expected name.
    stub_class = type("UnimernetModel", (), {})

    stub_module = types.ModuleType(module_name)
    stub_module.UnimernetModel = stub_class  # type: ignore

    sys.modules[module_name] = stub_module
    
    
def setup_mineru_config():
    """Download the MinerU model snapshots and write ``~/magic-pdf.json``.

    Steps:
      1. Fetch the required model files from the ``opendatalab/PDF-Extract-Kit-1.0``
         and ``hantian/layoutreader`` Hugging Face repositories.
      2. Load the existing ``~/magic-pdf.json`` if its ``config_version`` is at
         least 1.2.0; otherwise download the template config from GitHub.
      3. Point ``models-dir`` and ``layoutreader-model-dir`` at the downloaded
         snapshots and write the config back as UTF-8 JSON.

    Raises:
        requests.HTTPError: If the template download returns an error status.
    """
    import json
    import os
    import re
    import requests
    from huggingface_hub import snapshot_download

    # Oldest local config version that is reused instead of re-downloaded.
    min_config_version = (1, 2, 0)

    def version_key(version):
        """Convert a dotted version string into a tuple of ints.

        Comparing the raw strings is lexicographic ('1.10.0' < '1.2.0'), so
        versions are compared numerically instead. Non-numeric suffixes in a
        component are ignored; a component without leading digits counts as 0.
        """
        parts = []
        for piece in str(version).split('.'):
            leading_digits = re.match(r'\d+', piece)
            parts.append(int(leading_digits.group()) if leading_digits else 0)
        # Pad short versions so that '1.2' compares equal to '1.2.0'.
        parts.extend([0] * (len(min_config_version) - len(parts)))
        return tuple(parts)

    def download_json(url):
        """Fetch ``url`` and return the decoded JSON body."""
        # A timeout keeps a stalled connection from hanging setup forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.json()

    def download_and_modify_json(url, local_filename, modifications):
        """Load (or download) the config, apply ``modifications``, and save it."""
        if os.path.exists(local_filename):
            # Read with the same encoding the file is written with below.
            with open(local_filename, encoding='utf-8') as f:
                data = json.load(f)
            config_version = data.get('config_version', '0.0.0')
            if version_key(config_version) < min_config_version:
                data = download_json(url)
        else:
            data = download_json(url)

        data.update(modifications)

        with open(local_filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

    mineru_patterns = [
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
    ]
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)

    # The snapshot root contains a 'models' sub-directory (see the patterns above).
    model_dir = model_dir + '/models'
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    json_url = 'https://raw.githubusercontent.com/opendatalab/MinerU/refs/heads/release-1.3.12/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')


def prepare_env_mineru():
    """Prepare the local environment for MinerU.

    Always downloads the required NLTK data. If ``~/magic-pdf.json`` already
    exists, nothing else is done. Otherwise the models are downloaded via
    ``setup_mineru_config()``, ``device-mode`` in the config is set to
    ``"cuda"``, and the local ``resources`` and ``paddleocr`` directories
    (relative to the current working directory) are copied into place.
    """
    import json
    import subprocess
    import nltk

    def copy_tree(source, destination):
        """Run ``cp -r source destination`` without going through a shell.

        Passing an argument list avoids quoting problems when the home path
        contains spaces or shell metacharacters. Copying stays best-effort:
        a failure is reported, not raised.
        """
        command = ["cp", "-r", str(source), str(destination)]
        try:
            result = subprocess.run(command, check=False)
        except OSError as err:
            print(f"Could not copy {source} to {destination}: {err}")
            return
        if result.returncode != 0:
            print(
                f"Copying {source} to {destination} failed "
                f"with exit code {result.returncode}"
            )

    # download nltk data
    nltk.download("punkt_tab")
    nltk.download("averaged_perceptron_tagger_eng")

    home_path = Path.home()
    config_path = home_path / "magic-pdf.json"
    # skip download if config file exists
    if config_path.exists():
        print("Config file exists, skipping models download")
        return

    # download models
    setup_mineru_config()

    # The config is written as UTF-8 by setup_mineru_config, so read it the same way.
    with open(config_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    data["device-mode"] = "cuda"
    with open(config_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

    # NOTE(review): the destination assumes a user-site install under
    # Python 3.10 — confirm this matches the deployment environment.
    resources_target = (
        home_path / ".local" / "lib" / "python3.10"
        / "site-packages" / "magic_pdf" / "resources"
    )
    copy_tree("resources", resources_target)

    # copy OCR model weight
    target_model_path = home_path / ".paddleocr"
    copy_tree("paddleocr", target_model_path)