File size: 4,840 Bytes
0933b39 77fbded 0933b39 acbe414 77fbded acbe414 77fbded 188f052 6c0ba50 188f052 8d8e906 703dc2e 7e604f0 703dc2e df456bd 703dc2e 8d8e906 703dc2e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import functools
import re
from pathlib import Path
from shutil import copy2
import pymupdf
def remove_images_from_markdown(markdown_text):
    """Return ``markdown_text`` with image references stripped out.

    Two forms are removed, in this order: HTML ``<img ...>`` tags and
    Markdown inline images written as ``![alt](target)``. All other text,
    including ordinary ``[text](target)`` links, is left untouched.
    """
    image_patterns = (
        r"<img[^>]*>",  # HTML image tag
        r"!\[[^\]]*\]\([^)]*\)",  # Markdown inline image
    )
    cleaned = markdown_text
    for pattern in image_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned
@functools.lru_cache(maxsize=None)
def trim_pages(pdf_path, output_path, start_page=0, trim_pages=5):
    """Write a page-limited copy of a PDF and return the new file's path.

    The output file is ``<output_path>/<name of pdf_path's parent dir>.pdf``.
    When the document has more than ``trim_pages`` pages, only the pages
    ``start_page .. start_page + trim_pages - 1`` (clamped to the document
    length) are kept; otherwise the source file is copied unchanged.

    Results are memoized per argument tuple by ``lru_cache``, so a repeated
    call with the same arguments returns the cached path without touching
    the file system again.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Directory that receives the output file.
        start_page: Zero-based index of the first page to keep.
        trim_pages: Maximum number of pages to keep.

    Returns:
        The output file path as a string.
    """
    output_file_path = Path(output_path) / f"{Path(pdf_path).parent.name}.pdf"

    # Open the document in a context manager so its file handle is released
    # on every path, including when select/save raises.
    with pymupdf.open(pdf_path) as doc:
        num_pages = len(doc)
        if num_pages > trim_pages:
            to_select = list(
                range(start_page, min(start_page + trim_pages, num_pages))
            )
            doc.select(to_select)
            doc.ez_save(output_file_path)
            print("Trimmed pdf to with pages", to_select, "path", output_file_path)
            return str(output_file_path)

    # NOTE: a document with ``trim_pages`` pages or fewer is copied whole;
    # ``start_page`` is not applied on this branch.
    copy2(pdf_path, str(output_file_path))
    return str(output_file_path)
def patch_unimernet_model():
    """Force ``CustomMBartForCausalLM`` to be constructed with eager attention.

    The class's ``__init__`` is replaced by a wrapper that sets
    ``config._attn_implementation = "eager"`` and then delegates to the
    constructor that was installed before the patch. The replacement is made
    on the class object itself, so it applies to every instance created
    afterwards.
    """
    from unimernet.models.unimernet.encoder_decoder import (
        CustomMBartForCausalLM as decoder_cls,
    )

    # Keep a handle on the constructor currently installed on the class.
    stock_init = decoder_cls.__init__

    def eager_init(self, config):
        # Override the attention backend before the real constructor runs.
        config._attn_implementation = "eager"
        stock_init(self, config)

    decoder_cls.__init__ = eager_init
def fix_problematic_imports():
    """Register a stub for the MinerU ``Unimernet`` module in ``sys.modules``.

    A bare module object exposing an empty ``UnimernetModel`` class is stored
    under the dotted name
    ``magic_pdf.model.sub_modules.mfr.unimernet.Unimernet``, overwriting any
    entry already present. Presumably this keeps the real module (and its
    dependencies) from being loaded -- confirm against the importing code.
    """
    import sys
    import types

    stub_name = "magic_pdf.model.sub_modules.mfr.unimernet.Unimernet"

    # Build the placeholder module and give it the one attribute importers need.
    stub = types.ModuleType(stub_name)
    stub.UnimernetModel = type("UnimernetModel", (), {})  # type: ignore

    sys.modules[stub_name] = stub
def setup_mineru_config():
    """Download the MinerU model files and write ``~/magic-pdf.json``.

    Steps:
      1. Fetch the layout, formula-detection, formula-recognition and OCR
         files from the ``opendatalab/PDF-Extract-Kit-1.0`` Hugging Face
         repository, and the ``*.json`` / ``*.safetensors`` files from
         ``hantian/layoutreader``.
      2. Load the configuration: reuse ``~/magic-pdf.json`` when it exists
         and its ``config_version`` is at least 1.2.0, otherwise download
         the template from the MinerU repository.
      3. Set ``models-dir`` and ``layoutreader-model-dir`` to the downloaded
         snapshot directories and save the file as UTF-8 JSON.

    Raises:
        requests.RequestException: If the template download fails, times
            out, or returns an HTTP error status.
    """
    import json
    import os
    import re

    import requests
    from huggingface_hub import snapshot_download

    min_config_version = (1, 2, 0)

    def version_key(version):
        """Turn a dotted version string into a tuple of ints for comparison."""
        return tuple(int(number) for number in re.findall(r'\d+', str(version)))

    def download_json(url):
        """Fetch ``url`` and return its decoded JSON body."""
        # A timeout keeps a stalled connection from hanging setup forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        return response.json()

    def download_and_modify_json(url, local_filename, modifications):
        """Load (or download) the config, apply ``modifications``, save it."""
        if os.path.exists(local_filename):
            # The file is written as UTF-8 below, so read it the same way.
            with open(local_filename, encoding='utf-8') as f:
                data = json.load(f)
            # Compare numerically: as plain strings '1.10.0' would sort
            # before '1.2.0' and trigger a needless re-download.
            config_version = data.get('config_version', '0.0.0')
            if version_key(config_version) < min_config_version:
                data = download_json(url)
        else:
            data = download_json(url)

        data.update(modifications)

        with open(local_filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

    mineru_patterns = [
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
    ]
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)

    # The model files live in the ``models`` sub-directory of the snapshot.
    model_dir = os.path.join(model_dir, 'models')
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    json_url = 'https://raw.githubusercontent.com/opendatalab/MinerU/refs/heads/release-1.3.12/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
def prepare_env_mineru():
    """Prepare NLTK data, MinerU models/config and local resource copies.

    The NLTK ``punkt_tab`` and ``averaged_perceptron_tagger_eng`` packages
    are always requested. If ``~/magic-pdf.json`` already exists the function
    stops there. Otherwise it runs ``setup_mineru_config()``, sets
    ``device-mode`` to ``"cuda"`` in the generated config, and copies the
    ``resources`` and ``paddleocr`` directories from the current working
    directory into the user's home directory.

    The two directory copies are best effort: a failure is printed but does
    not raise.
    """
    import json
    import subprocess

    import nltk

    def copy_best_effort(source, destination):
        """Run ``cp -r source destination`` without a shell; report failures."""
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        try:
            completed = subprocess.run(
                ["cp", "-r", str(source), str(destination)], check=False
            )
        except OSError as exc:
            print(f"Could not run cp for {source}: {exc}")
            return
        if completed.returncode != 0:
            print(
                f"Copying {source} to {destination} failed "
                f"with exit code {completed.returncode}"
            )

    # download nltk data
    nltk.download("punkt_tab")
    nltk.download("averaged_perceptron_tagger_eng")

    home_path = Path.home()
    config_path = home_path / "magic-pdf.json"

    # skip download if config file exists
    if config_path.exists():
        print("Config file exists, skipping models download")
        return

    # download models
    setup_mineru_config()

    # The config is written as UTF-8 by setup_mineru_config, so read and
    # write it with an explicit encoding instead of the locale default.
    with open(config_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    data["device-mode"] = "cuda"

    with open(config_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

    # NOTE(review): the destination hard-codes a python3.10 user
    # site-packages layout -- confirm it matches the deployment interpreter.
    copy_best_effort(
        "resources",
        f"{home_path}/.local/lib/"
        "python3.10/site-packages/magic_pdf/resources",
    )

    # copy OCR model weight
    target_model_path = home_path / ".paddleocr"
    copy_best_effort("paddleocr", target_model_path)
|