kitjesen committed on
Commit
c135d9e
·
verified ·
1 Parent(s): c0b9adc

Upload 14 files

Browse files
Files changed (4) hide show
  1. README.md +2 -2
  2. config.json +2 -2
  3. metadata.json +12 -0
  4. pipeline.py +79 -0
README.md CHANGED
@@ -4,10 +4,10 @@ language:
4
  - en
5
  license: apache-2.0
6
  library_name: transformers
7
- pipeline_tag: document-conversion
8
  tags:
9
  - pdf-to-markdown
10
- - document-conversion
11
  ---
12
 
13
  # MinerU PDF to Markdown Model
 
4
  - en
5
  license: apache-2.0
6
  library_name: transformers
7
+ pipeline_tag: feature-extraction
8
  tags:
9
  - pdf-to-markdown
10
+ - feature-extraction
11
  ---
12
 
13
  # MinerU PDF to Markdown Model
config.json CHANGED
@@ -2,8 +2,8 @@
2
  "architectures": ["MinerUModel"],
3
  "model_type": "mineru",
4
  "framework": "pytorch",
5
- "task": "document-conversion",
6
- "pipeline_tag": "document-conversion",
7
  "model_name_or_path": "kitjesen/MinerU",
8
  "auto_map": {
9
  "AutoModel": "modeling.MinerUModel",
 
2
  "architectures": ["MinerUModel"],
3
  "model_type": "mineru",
4
  "framework": "pytorch",
5
+ "task": "feature-extraction",
6
+ "pipeline_tag": "feature-extraction",
7
  "model_name_or_path": "kitjesen/MinerU",
8
  "auto_map": {
9
  "AutoModel": "modeling.MinerUModel",
metadata.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": ["zh", "en"],
3
+ "license": "apache-2.0",
4
+ "tags": ["pdf-to-markdown", "feature-extraction"],
5
+ "pipeline_tag": "feature-extraction",
6
+ "library_name": "transformers",
7
+ "task_specific_params": {
8
+ "pdf-to-markdown": {
9
+ "max_length": 1024
10
+ }
11
+ }
12
+ }
pipeline.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import Pipeline
2
+ import torch
3
+ from typing import Union, List
4
+ import fitz
5
+ import os
6
+ from detectron2.config import get_cfg
7
+ from detectron2.engine import DefaultPredictor
8
+
9
+ class MinerUPipeline(Pipeline):
10
+ def __init__(self, model_path, **kwargs):
11
+ super().__init__(**kwargs)
12
+ # 加载Layout模型
13
+ cfg = get_cfg()
14
+ cfg.merge_from_file(os.path.join(model_path, "models/Layout/config.json"))
15
+ cfg.MODEL.WEIGHTS = os.path.join(model_path, "models/Layout/model_final.pth")
16
+ self.layout_model = DefaultPredictor(cfg)
17
+
18
+ # 加载其他模型
19
+ self.formula_detector = torch.load(os.path.join(model_path, "models/MFD/weights.pt"))
20
+ self.formula_recognizer = AutoModel.from_pretrained(os.path.join(model_path, "models/MFR/UniMERNet"))
21
+ self.table_recognizer = AutoModel.from_pretrained(os.path.join(model_path, "TabRec/StructEqTable"))
22
+
23
+ def preprocess(self, pdf_path):
24
+ """处理PDF输入"""
25
+ doc = fitz.open(pdf_path)
26
+ pages = []
27
+ for page in doc:
28
+ # 获取页面图像
29
+ pix = page.get_pixmap()
30
+ # 转换为模型所需格式
31
+ img = torch.tensor(pix.samples).permute(2, 0, 1).float()
32
+ pages.append(img)
33
+ return pages
34
+
35
+ def _forward(self, pages):
36
+ results = []
37
+ for page in pages:
38
+ # 1. 布局分析
39
+ layout = self.layout_model(page)
40
+
41
+ # 2. 根据布局结果处理不同区域
42
+ text_regions = []
43
+ formula_regions = []
44
+ table_regions = []
45
+
46
+ for region in layout:
47
+ if region.type == "text":
48
+ text_regions.append(self._process_text(region))
49
+ elif region.type == "formula":
50
+ formula_regions.append(self._process_formula(region))
51
+ elif region.type == "table":
52
+ table_regions.append(self._process_table(region))
53
+
54
+ results.append({
55
+ "text": text_regions,
56
+ "formulas": formula_regions,
57
+ "tables": table_regions
58
+ })
59
+
60
+ return results
61
+
62
+ def _process_formula(self, region):
63
+ # 公式检测和识别
64
+ detected = self.formula_detector(region.image)
65
+ return self.formula_recognizer(detected)
66
+
67
+ def _process_table(self, region):
68
+ # 表格识别
69
+ return self.table_recognizer(region.image)
70
+
71
+ def postprocess(self, model_outputs):
72
+ """转换为Markdown"""
73
+ markdown = []
74
+ for page in model_outputs:
75
+ # 组合文本、公式和表格
76
+ markdown.extend(page["text"])
77
+ markdown.extend([f"$${formula}$$" for formula in page["formulas"]])
78
+ markdown.extend([table.to_markdown() for table in page["tables"]])
79
+ return "\n\n".join(markdown)