Upload 6 files
Browse files- README.md +38 -3
- app.py +1 -0
- config.json +28 -0
- metadata.json +6 -0
- model_loader.py +31 -0
- requirements.txt +7 -0
README.md
CHANGED
@@ -1,3 +1,38 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MinerU PDF to Markdown Model
|
2 |
+
|
3 |
+
这个模型可以将PDF文档转换为Markdown格式。
|
4 |
+
|
5 |
+
## 模型架构
|
6 |
+
MinerU使用多模型组合架构:
|
7 |
+
- Layout: 文档布局分析
|
8 |
+
- MFD: 数学公式检测
|
9 |
+
- MFR: 数学公式识别
|
10 |
+
- TabRec: 表格识别与重建
|
11 |
+
|
12 |
+
## 使用方法
|
13 |
+
|
14 |
+
```python
|
15 |
+
from transformers import pipeline
|
16 |
+
|
17 |
+
# 初始化转换器
|
18 |
+
converter = pipeline("pdf-to-markdown", model="your-username/MinerU")
|
19 |
+
|
20 |
+
# 转换PDF文件
|
21 |
+
markdown = converter("document.pdf")
|
22 |
+
```
|
23 |
+
|
24 |
+
## 模型信息
|
25 |
+
- 任务: PDF到Markdown转换
|
26 |
+
- 框架: PyTorch
|
27 |
+
- 许可: Apache 2.0
|
28 |
+
|
29 |
+
## 系统要求
|
30 |
+
- Python >= 3.7
|
31 |
+
- PyTorch >= 1.9.0
|
32 |
+
- transformers >= 4.28.0
|
33 |
+
- detectron2
|
34 |
+
|
35 |
+
## 限制说明
|
36 |
+
- 支持的最大页数: XX页
|
37 |
+
- 支持的PDF最大大小: XX MB
|
38 |
+
- 支持的语言: 中文、英文
|
app.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
config.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": ["MinerUModel"],
|
3 |
+
"model_type": "mineru",
|
4 |
+
"framework": "pytorch",
|
5 |
+
"task": "document-conversion",
|
6 |
+
"pipeline_tag": "document-conversion",
|
7 |
+
"submodels": {
|
8 |
+
"layout": {
|
9 |
+
"type": "detectron2",
|
10 |
+
"path": "models/Layout/model_final.pth",
|
11 |
+
"config": "models/Layout/config.json"
|
12 |
+
},
|
13 |
+
"formula_detection": {
|
14 |
+
"type": "pytorch",
|
15 |
+
"path": "models/MFD/weights.pt"
|
16 |
+
},
|
17 |
+
"formula_recognition": {
|
18 |
+
"type": "transformers",
|
19 |
+
"path": "models/MFR/UniMERNet",
|
20 |
+
"model_type": "bert"
|
21 |
+
},
|
22 |
+
"table_recognition": {
|
23 |
+
"type": "transformers",
|
24 |
+
"path": "models/TabRec/StructEqTable",
|
25 |
+
"model_type": "t5"
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
metadata.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"language": ["zh", "en"],
|
3 |
+
"license": "apache-2.0",
|
4 |
+
"tags": ["document-conversion", "pdf-to-markdown"],
|
5 |
+
"pipeline_tag": "document-conversion"
|
6 |
+
}
|
model_loader.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from transformers import AutoModel, AutoTokenizer
|
3 |
+
from detectron2.config import get_cfg
|
4 |
+
from detectron2.engine import DefaultPredictor
|
5 |
+
import os
|
6 |
+
|
7 |
+
class MinerUModelLoader:
    """Loader for the MinerU sub-models used for PDF-to-Markdown conversion."""

    @staticmethod
    def load_models(base_path):
        """Load all MinerU sub-models stored under ``base_path``.

        Args:
            base_path: Directory containing the ``models/`` tree, i.e.
                ``models/Layout``, ``models/MFD``, ``models/MFR/UniMERNet``
                and ``models/TabRec/StructEqTable``.

        Returns:
            A dict with the keys ``"layout"`` (detectron2
            ``DefaultPredictor``), ``"formula_detector"`` (whatever object
            ``torch.load`` deserializes from ``weights.pt``),
            ``"formula_recognizer"`` and ``"table_recognizer"`` (objects
            returned by ``AutoModel.from_pretrained``).
        """
        models = {}

        # Decide once whether a GPU is usable; on CPU-only hosts both the
        # detectron2 predictor and torch.load would otherwise try to place
        # tensors on CUDA and fail.
        cuda_available = torch.cuda.is_available()

        # Layout analysis model.
        cfg = get_cfg()
        cfg.merge_from_file(os.path.join(base_path, "models/Layout/config.json"))
        cfg.MODEL.WEIGHTS = os.path.join(base_path, "models/Layout/model_final.pth")
        if not cuda_available:
            # Only override when CUDA is missing so that GPU hosts keep the
            # device selected by the config file.
            cfg.MODEL.DEVICE = "cpu"
        models["layout"] = DefaultPredictor(cfg)

        # Formula detection model.
        # NOTE(security): torch.load unpickles the file, which can execute
        # arbitrary code; only load weights from a trusted source.
        models["formula_detector"] = torch.load(
            os.path.join(base_path, "models/MFD/weights.pt"),
            # None keeps the storage locations recorded in the checkpoint;
            # "cpu" remaps GPU-saved tensors when no GPU is present.
            map_location=None if cuda_available else "cpu",
        )

        # Formula recognition model.
        models["formula_recognizer"] = AutoModel.from_pretrained(
            os.path.join(base_path, "models/MFR/UniMERNet")
        )

        # Table recognition model.
        models["table_recognizer"] = AutoModel.from_pretrained(
            os.path.join(base_path, "models/TabRec/StructEqTable")
        )

        return models
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers>=4.28.0
|
2 |
+
torch>=1.9.0
|
3 |
+
PyMuPDF
|
4 |
+
detectron2
|
5 |
+
numpy
|
6 |
+
opencv-python
|
7 |
+
pandas
|