derful committed on
Commit 240e0a0 · verified · 1 parent: b584747

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.

Files changed (50)
  1. .gitattributes +1 -0
  2. .github/ISSUE_TEMPLATE/bug_report.yml +85 -0
  3. .github/ISSUE_TEMPLATE/feature_request.md +28 -0
  4. .github/workflows/cla.yml +43 -0
  5. .github/workflows/cli.yml +46 -0
  6. .github/workflows/python-package.yml +126 -0
  7. .github/workflows/rerun.yml +23 -0
  8. .github/workflows/update_base.yml +22 -0
  9. .gitignore +37 -0
  10. LICENSE.md +661 -0
  11. README.md +286 -6
  12. README_zh-CN.md +277 -0
  13. demo/app.py +67 -0
  14. demo/demo.py +31 -0
  15. demo/demo1.json +0 -0
  16. demo/demo1.pdf +0 -0
  17. demo/demo2.json +0 -0
  18. demo/demo2.pdf +3 -0
  19. docs/FAQ_zh_cn.md +85 -0
  20. docs/how_to_download_models_en.md +60 -0
  21. docs/how_to_download_models_zh_cn.md +61 -0
  22. docs/images/flowchart_en.png +0 -0
  23. docs/images/flowchart_zh_cn.png +0 -0
  24. docs/images/project_panorama_en.png +0 -0
  25. docs/images/project_panorama_zh_cn.png +0 -0
  26. magic-pdf.template.json +9 -0
  27. magic_pdf/__init__.py +0 -0
  28. magic_pdf/cli/__init__.py +0 -0
  29. magic_pdf/cli/magicpdf.py +359 -0
  30. magic_pdf/dict2md/__init__.py +0 -0
  31. magic_pdf/dict2md/mkcontent.py +397 -0
  32. magic_pdf/dict2md/ocr_mkcontent.py +363 -0
  33. magic_pdf/filter/__init__.py +0 -0
  34. magic_pdf/filter/pdf_classify_by_type.py +393 -0
  35. magic_pdf/filter/pdf_meta_scan.py +388 -0
  36. magic_pdf/layout/__init__.py +0 -0
  37. magic_pdf/layout/bbox_sort.py +681 -0
  38. magic_pdf/layout/layout_det_utils.py +182 -0
  39. magic_pdf/layout/layout_sort.py +732 -0
  40. magic_pdf/layout/layout_spiler_recog.py +101 -0
  41. magic_pdf/layout/mcol_sort.py +336 -0
  42. magic_pdf/libs/Constants.py +11 -0
  43. magic_pdf/libs/MakeContentConfig.py +10 -0
  44. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  45. magic_pdf/libs/__init__.py +0 -0
  46. magic_pdf/libs/boxbase.py +408 -0
  47. magic_pdf/libs/calc_span_stats.py +239 -0
  48. magic_pdf/libs/commons.py +204 -0
  49. magic_pdf/libs/config_reader.py +73 -0
  50. magic_pdf/libs/convert_utils.py +5 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ demo/demo2.pdf filter=lfs diff=lfs merge=lfs -text
.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,85 @@
+ name: Bug Report | 反馈 Bug
+ description: Create a bug report for MinerU | MinerU 的 Bug 反馈
+ labels: bug
+
+ # We omit `title: "..."` so that the field defaults to blank. If we set it to
+ # an empty string, GitHub seems to reject this .yml file.
+
+ body:
+
+   - type: textarea
+     id: description
+     attributes:
+       label: Description of the bug | 错误描述
+       description: |
+         A clear and concise description of the bug. | 简单描述遇到的问题
+     validations:
+       required: true
+
+   - type: textarea
+     id: reproduce
+     attributes:
+       label: How to reproduce the bug | 如何复现
+
+       # Should not word-wrap this description here.
+       description: |
+         * Explain the steps required to reproduce the bug. | 说明复现此错误所需的步骤。
+         * Include required code snippets, example files, etc. | 包含必要的代码片段、示例文件等。
+         * Describe what you expected to happen (if not obvious). | 描述你期望发生的情况。
+         * If applicable, add screenshots to help explain the problem. | 添加截图以帮助解释问题。
+         * Include any other information that could be relevant, for example information about the Python environment. | 包括任何其他可能相关的信息。
+
+         For problems when building or installing MinerU: | 在构建或安装 MinerU 时遇到的问题:
+         * Give the **exact** build/install commands that were run. | 提供**确切**的构建/安装命令。
+         * Give the **complete** output from these commands. | 提供这些命令的**完整**输出。
+
+     validations:
+       required: true
+
+   # - type: markdown
+   #   attributes:
+   #     value: |
+   #       # The information below is required.
+
+   - type: dropdown
+     id: os_name
+     attributes:
+       label: Operating system | 操作系统
+       #multiple: true
+       options:
+         -
+         - Windows
+         - Linux
+         - MacOS
+     validations:
+       required: true
+
+   - type: dropdown
+     id: python_version
+     attributes:
+       label: Python version | Python 版本
+       #multiple: true
+       # Need quotes around `3.10`, otherwise it is treated as a number and shows as `3.1`.
+       options:
+         -
+         - "3.12"
+         - "3.11"
+         - "3.10"
+         - "3.9"
+     validations:
+       required: true
+
+   - type: dropdown
+     id: device_mode
+     attributes:
+       label: Device mode | 设备模式
+       #multiple: true
+       options:
+         -
+         - cpu
+         - cuda
+         - mps
+     validations:
+       required: true
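An aside on the quoting comment in the template above: the behavior is easy to confirm with PyYAML. This snippet is illustrative only and is not part of the commit:

```python
# Demonstrates the YAML gotcha noted in the template comment: an unquoted 3.10
# parses as the float 3.1, while the quoted form survives as the string "3.10".
import yaml  # PyYAML

print(yaml.safe_load("version: 3.10"))    # {'version': 3.1}   -- float, trailing zero lost
print(yaml.safe_load('version: "3.10"'))  # {'version': '3.10'} -- string, preserved
```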
.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,28 @@
+ ---
+ name: Feature request | 功能需求
+ about: Suggest an idea for this project | 提出一个有价值的idea
+ title: ''
+ labels: enhancement
+ assignees: ''
+
+ ---
+
+ **Is your feature request related to a problem? Please describe.**
+ **您的特性请求是否与某个问题相关?请描述。**
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+ 对存在的问题进行清晰且简洁的描述。例如:我一直很困扰的是 [...]
+
+ **Describe the solution you'd like**
+ **描述您期望的解决方案**
+ A clear and concise description of what you want to happen.
+ 清晰且简洁地描述您希望实现的内容。
+
+ **Describe alternatives you've considered**
+ **描述您已考虑的替代方案**
+ A clear and concise description of any alternative solutions or features you've considered.
+ 清晰且简洁地描述您已经考虑过的任何替代解决方案。
+
+ **Additional context**
+ **提供更多细节**
+ Add any other context or screenshots about the feature request here.
+ 请附上任何相关截图、链接或文件,以帮助我们更好地理解您的请求。
.github/workflows/cla.yml ADDED
@@ -0,0 +1,43 @@
+ name: "CLA Assistant"
+ on:
+   issue_comment:
+     types: [created]
+   pull_request_target:
+     types: [opened,closed,synchronize]
+
+ # explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings
+ permissions:
+   actions: write
+   contents: write # this can be 'read' if the signatures are in a remote repository
+   pull-requests: write
+   statuses: write
+
+ jobs:
+   CLAAssistant:
+     runs-on: ubuntu-latest
+     steps:
+       - name: "CLA Assistant"
+         if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target'
+         uses: contributor-assistant/github-action@v2.4.0
+         env:
+           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+           # the token below should have repo scope and must be added manually as a repository secret
+           # it is required only if you have configured signatures to be stored in a remote repository/organization
+           PERSONAL_ACCESS_TOKEN: ${{ secrets.RELEASE_TOKEN }}
+         with:
+           path-to-signatures: 'signatures/version1/cla.json'
+           path-to-document: 'https://github.com/cla-assistant/github-action/blob/master/SAPCLA.md' # e.g. a CLA or a DCO document
+           # branch should not be protected
+           branch: 'main'
+           allowlist: user1,bot*
+
+           # the following inputs are optional - if they are not given, default values are used
+           #remote-organization-name: the remote organization where the signatures should be stored (default: the same repository)
+           #remote-repository-name: the remote repository where the signatures should be stored (default: the same repository)
+           #create-file-commit-message: 'For example: Creating file for storing CLA Signatures'
+           #signed-commit-message: 'For example: $contributorName has signed the CLA in $owner/$repo#$pullRequestNo'
+           #custom-notsigned-prcomment: 'pull request comment with an introductory message asking new contributors to sign'
+           #custom-pr-sign-comment: 'the signature to be committed in order to sign the CLA'
+           #custom-allsigned-prcomment: 'pull request comment when all contributors have signed; defaults to **CLA Assistant Lite bot** All Contributors have signed the CLA.'
+           #lock-pullrequest-aftermerge: false - if you don't want this bot to automatically lock the pull request after merging (default: true)
+           #use-dco-flag: true - if you are using a DCO instead of a CLA
.github/workflows/cli.yml ADDED
@@ -0,0 +1,46 @@
+ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+ name: mineru
+ on:
+   push:
+     branches:
+       - "master"
+     paths-ignore:
+       - "cmds/**"
+       - "**.md"
+   pull_request:
+     branches:
+       - "master"
+     paths-ignore:
+       - "cmds/**"
+       - "**.md"
+   workflow_dispatch:
+ jobs:
+   cli-test:
+     runs-on: ubuntu-latest
+     timeout-minutes: 40
+     strategy:
+       fail-fast: true
+
+     steps:
+       - name: PDF cli
+         uses: actions/checkout@v3
+         with:
+           fetch-depth: 2
+
+       - name: check-requirements
+         run: |
+           pip install -r requirements.txt
+           pip install -r requirements-qa.txt
+           pip install magic-pdf
+       - name: test_cli
+         run: |
+           cp magic-pdf.template.json ~/magic-pdf.json
+           echo $GITHUB_WORKSPACE
+           cd $GITHUB_WORKSPACE && export PYTHONPATH=. && pytest -s -v tests/test_unit.py
+           cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli.py
+
+       - name: benchmark
+         run: |
+           cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_bench.py
.github/workflows/python-package.yml ADDED
@@ -0,0 +1,126 @@
+ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+ name: Python package
+
+ on:
+   push:
+     tags:
+       - '*released'
+   workflow_dispatch:
+
+ jobs:
+
+   update-version:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           ref: master
+           fetch-depth: 0
+
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: "3.10"
+
+       - name: Update version.py
+         run: |
+           python update_version.py
+
+       - name: Verify version.py
+         run: |
+           ls -l magic_pdf/libs/version.py
+           cat magic_pdf/libs/version.py
+
+       - name: Commit changes
+         run: |
+           git config --local user.email "moe@myhloli.com"
+           git config --local user.name "myhloli"
+           git add magic_pdf/libs/version.py
+           if git diff-index --quiet HEAD; then
+             echo "No changes to commit"
+           else
+             git commit -m "Update version.py with new version"
+           fi
+         id: commit_changes
+
+       - name: Push changes
+         if: steps.commit_changes.outcome == 'success'
+         env:
+           GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
+         run: |
+           git push origin HEAD:master
+
+   build:
+     needs: [ update-version ]
+     runs-on: ubuntu-latest
+     strategy:
+       fail-fast: false
+       matrix:
+         python-version: ["3.10"]
+
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v4
+         with:
+           ref: master
+           fetch-depth: 0
+
+       - name: Verify version.py
+         run: |
+           ls -l magic_pdf/libs/version.py
+           cat magic_pdf/libs/version.py
+
+       - name: Set up Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
+       - name: Install wheel
+         run: |
+           python -m pip install wheel
+
+       - name: Build wheel
+         run: |
+           python setup.py bdist_wheel
+
+       - name: Upload artifact
+         uses: actions/upload-artifact@v4
+         with:
+           name: wheel-file
+           path: dist/*.whl
+           retention-days: 30
+
+   release:
+     needs: [ build ]
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v4
+
+       - name: Download artifact
+         uses: actions/download-artifact@v4
+         with:
+           name: wheel-file
+           path: dist
+
+       - name: Create and Upload Release
+         id: create_release
+         uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981
+         with:
+           files: './dist/*.whl'
+         env:
+           GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
+
+       - name: Publish distribution to PyPI
+         run: |
+           pip install twine
+           twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: check-status
2
+
3
+ on:
4
+ workflow_run:
5
+ workflows: [ci]
6
+ types: [completed]
7
+
8
+ jobs:
9
+ on-failure:
10
+ runs-on: pdf
11
+ permissions:
12
+ actions: write
13
+ if: ${{ (github.event.workflow_run.head_branch == 'master') && github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.run_attempt < 3 }}
14
+ steps:
15
+ - run: |
16
+ echo 'The triggering workflow failed'
17
+ sleep 600
18
+ curl -L \
19
+ -X POST \
20
+ -H "Accept: application/vnd.github+json" \
21
+ -H "Authorization: Bearer ${{ github.token }}" \
22
+ -H "X-GitHub-Api-Version: 2022-11-28" \
23
+ https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/rerun-failed-jobs
.github/workflows/update_base.yml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3
+
4
+ name: update-base
5
+ on:
6
+ push:
7
+ tags:
8
+ - '*released'
9
+ workflow_dispatch:
10
+ jobs:
11
+ pdf-test:
12
+ runs-on: pdf
13
+ timeout-minutes: 40
14
+
15
+
16
+ steps:
17
+ - name: update-base
18
+ uses: actions/checkout@v3
19
+ - name: start-update
20
+ run: |
21
+ echo "start test"
22
+
.gitignore ADDED
@@ -0,0 +1,37 @@
+ *.tar
+ *.tar.gz
+ venv*/
+ envs/
+ slurm_logs/
+
+ sync1.sh
+ data_preprocess_pj1
+ data-preparation1
+ __pycache__
+ *.log
+ *.pyc
+ .vscode
+ debug/
+ *.ipynb
+ .idea
+
+ # vscode history
+ .history
+
+ .DS_Store
+ .env
+
+ bad_words/
+ bak/
+
+ app/tests/*
+ temp/
+ tmp/
+ tmp
+ .vscode
+ .vscode/
+ /tests/
+ ocr_demo
+
+ /app/common/__init__.py
+ /magic_pdf/config/__init__.py
LICENSE.md ADDED
@@ -0,0 +1,661 @@
+ GNU AFFERO GENERAL PUBLIC LICENSE
+ Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU Affero General Public License is a free, copyleft license for
+ software and other kinds of works, specifically designed to ensure
+ cooperation with the community in the case of network server software.
+
+ The licenses for most software and other practical works are designed
+ to take away your freedom to share and change the works. By contrast,
+ our General Public Licenses are intended to guarantee your freedom to
+ share and change all versions of a program--to make sure it remains free
+ software for all its users.
+
+ When we speak of free software, we are referring to freedom, not
+ price. Our General Public Licenses are designed to make sure that you
+ have the freedom to distribute copies of free software (and charge for
+ them if you wish), that you receive source code or can get it if you
+ want it, that you can change the software or use pieces of it in new
+ free programs, and that you know you can do these things.
+
+ Developers that use our General Public Licenses protect your rights
+ with two steps: (1) assert copyright on the software, and (2) offer
+ you this License which gives you legal permission to copy, distribute
+ and/or modify the software.
+
+ A secondary benefit of defending all users' freedom is that
+ improvements made in alternate versions of the program, if they
+ receive widespread use, become available for other developers to
+ incorporate. Many developers of free software are heartened and
+ encouraged by the resulting cooperation. However, in the case of
+ software used on network servers, this result may fail to come about.
+ The GNU General Public License permits making a modified version and
+ letting the public access it on a server without ever releasing its
+ source code to the public.
+
+ The GNU Affero General Public License is designed specifically to
+ ensure that, in such cases, the modified source code becomes available
+ to the community. It requires the operator of a network server to
+ provide the source code of the modified version running there to the
+ users of that server. Therefore, public use of a modified version, on
+ a publicly accessible server, gives the public access to the source
+ code of the modified version.
+
+ An older license, called the Affero General Public License and
+ published by Affero, was designed to accomplish similar goals. This is
+ a different license, not a version of the Affero GPL, but Affero has
+ released a new version of the Affero GPL which permits relicensing under
+ this license.
+
+ The precise terms and conditions for copying, distribution and
+ modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU Affero General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+ works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+ License. Each licensee is addressed as "you". "Licensees" and
+ "recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+ in a fashion requiring copyright permission, other than the making of an
+ exact copy. The resulting work is called a "modified version" of the
+ earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+ on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+ permission, would make you directly or secondarily liable for
+ infringement under applicable copyright law, except executing it on a
+ computer or modifying a private copy. Propagation includes copying,
+ distribution (with or without modification), making available to the
+ public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+ parties to make or receive copies. Mere interaction with a user through
+ a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+ to the extent that it includes a convenient and prominently visible
+ feature that (1) displays an appropriate copyright notice, and (2)
+ tells the user that there is no warranty for the work (except to the
+ extent that warranties are provided), that licensees may convey the
+ work under this License, and how to view a copy of this License. If
+ the interface presents a list of user commands or options, such as a
+ menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+ for making modifications to it. "Object code" means any non-source
+ form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+ standard defined by a recognized standards body, or, in the case of
+ interfaces specified for a particular programming language, one that
+ is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+ than the work as a whole, that (a) is included in the normal form of
+ packaging a Major Component, but which is not part of that Major
+ Component, and (b) serves only to enable use of the work with that
+ Major Component, or to implement a Standard Interface for which an
+ implementation is available to the public in source code form. A
+ "Major Component", in this context, means a major essential component
+ (kernel, window system, and so on) of the specific operating system
+ (if any) on which the executable work runs, or a compiler used to
+ produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+ the source code needed to generate, install, and (for an executable
+ work) run the object code and to modify the work, including scripts to
+ control those activities. However, it does not include the work's
+ System Libraries, or general-purpose tools or generally available free
+ programs which are used unmodified in performing those activities but
+ which are not part of the work. For example, Corresponding Source
+ includes interface definition files associated with source files for
+ the work, and the source code for shared libraries and dynamically
+ linked subprograms that the work is specifically designed to require,
+ such as by intimate data communication or control flow between those
+ subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+ can regenerate automatically from other parts of the Corresponding
+ Source.
+
+ The Corresponding Source for a work in source code form is that
+ same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+ copyright on the Program, and are irrevocable provided the stated
+ conditions are met. This License explicitly affirms your unlimited
+ permission to run the unmodified Program. The output from running a
+ covered work is covered by this License only if the output, given its
+ content, constitutes a covered work. This License acknowledges your
+ rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+ convey, without conditions so long as your license otherwise remains
+ in force. You may convey covered works to others for the sole purpose
+ of having them make modifications exclusively for you, or provide you
+ with facilities for running those works, provided that you comply with
+ the terms of this License in conveying all material for which you do
+ not control copyright. Those thus making or running the covered works
+ for you must do so exclusively on your behalf, under your direction
+ and control, on terms that prohibit them from making any copies of
+ your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+ the conditions stated below. Sublicensing is not allowed; section 10
+ makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+ measure under any applicable law fulfilling obligations under article
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
+ similar laws prohibiting or restricting circumvention of such
+ measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+ circumvention of technological measures to the extent such circumvention
+ is effected by exercising rights under this License with respect to
+ the covered work, and you disclaim any intention to limit operation or
+ modification of the work as a means of enforcing, against the work's
+ users, your or third parties' legal rights to forbid circumvention of
+ technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+ receive it, in any medium, provided that you conspicuously and
+ appropriately publish on each copy an appropriate copyright notice;
+ keep intact all notices stating that this License and any
+ non-permissive terms added in accord with section 7 apply to the code;
+ keep intact all notices of the absence of any warranty; and give all
+ recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+ and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+ produce it from the Program, in the form of source code under the
+ terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+ works, which are not by their nature extensions of the covered work,
+ and which are not combined with it such as to form a larger program,
+ in or on a volume of a storage or distribution medium, is called an
+ "aggregate" if the compilation and its resulting copyright are not
+ used to limit the access or legal rights of the compilation's users
+ beyond what the individual works permit. Inclusion of a covered work
+ in an aggregate does not cause this License to apply to the other
+ parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+ of sections 4 and 5, provided that you also convey the
+ machine-readable Corresponding Source under the terms of this License,
+ in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+ from the Corresponding Source as a System Library, need not be
+ included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+ tangible personal property which is normally used for personal, family,
+ or household purposes, or (2) anything designed or sold for incorporation
+ into a dwelling. In determining whether a product is a consumer product,
+ doubtful cases shall be resolved in favor of coverage. For a particular
+ product received by a particular user, "normally used" refers to a
+ typical or common use of that class of product, regardless of the status
+ of the particular user or of the way in which the particular user
+ actually uses, or expects or is expected to use, the product. A product
+ is a consumer product regardless of whether the product has substantial
+ commercial, industrial or non-consumer uses, unless such uses represent
+ the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+ procedures, authorization keys, or other information required to install
+ and execute modified versions of a covered work in that User Product from
+ a modified version of its Corresponding Source. The information must
+ suffice to ensure that the continued functioning of the modified object
+ code is in no case prevented or interfered with solely because
+ modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+ specifically for use in, a User Product, and the conveying occurs as
+ part of a transaction in which the right of possession and use of the
+ User Product is transferred to the recipient in perpetuity or for a
+ fixed term (regardless of how the transaction is characterized), the
+ Corresponding Source conveyed under this section must be accompanied
+ by the Installation Information. But this requirement does not apply
+ if neither you nor any third party retains the ability to install
+ modified object code on the User Product (for example, the work has
+ been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+ requirement to continue to provide support service, warranty, or updates
+ for a work that has been modified or installed by the recipient, or for
+ the User Product in which it has been modified or installed. Access to a
+ network may be denied when the modification itself materially and
+ adversely affects the operation of the network or violates the rules and
+ protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+ in accord with this section must be in a format that is publicly
+ documented (and with an implementation available to the public in
+ source code form), and must require no special password or key for
+ unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+ License by making exceptions from one or more of its conditions.
+ Additional permissions that are applicable to the entire Program shall
+ be treated as though they were included in this License, to the extent
+ that they are valid under applicable law. If additional permissions
+ apply only to part of the Program, that part may be used separately
+ under those permissions, but the entire Program remains governed by
+ this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+ remove any additional permissions from that copy, or from any part of
+ it. (Additional permissions may be written to require their own
+ removal in certain cases when you modify the work.) You may place
+ additional permissions on material, added by you to a covered work,
+ for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+ add to a covered work, you may (if authorized by the copyright holders of
+ that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+ restrictions" within the meaning of section 10. If the Program as you
+ received it, or any part of it, contains a notice stating that it is
+ governed by this License along with a term that is a further
+ restriction, you may remove that term. If a license document contains
+ a further restriction but permits relicensing or conveying under this
+ License, you may add to a covered work material governed by the terms
+ of that license document, provided that the further restriction does
+ not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+ must place, in the relevant source files, a statement of the
+ additional terms that apply to those files, or a notice indicating
+ where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+ form of a separately written license, or stated as exceptions;
+ the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+ provided under this License. Any attempt otherwise to propagate or
+ modify it is void, and will automatically terminate your rights under
+ this License (including any patent licenses granted under the third
+ paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+ license from a particular copyright holder is reinstated (a)
+ provisionally, unless and until the copyright holder explicitly and
+ finally terminates your license, and (b) permanently, if the copyright
+ holder fails to notify you of the violation by some reasonable means
+ prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+ reinstated permanently if the copyright holder notifies you of the
+ violation by some reasonable means, this is the first time you have
+ received notice of violation of this License (for any work) from that
+ copyright holder, and you cure the violation prior to 30 days after
+ your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+ licenses of parties who have received copies or rights from you under
+ this License. If your rights have been terminated and not permanently
+ reinstated, you do not qualify to receive new licenses for the same
+ material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+ run a copy of the Program. Ancillary propagation of a covered work
+ occurring solely as a consequence of using peer-to-peer transmission
+ to receive a copy likewise does not require acceptance. However,
+ nothing other than this License grants you permission to propagate or
+ modify any covered work. These actions infringe copyright if you do
+ not accept this License. Therefore, by modifying or propagating a
+ covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+ receives a license from the original licensors, to run, modify and
+ propagate that work, subject to this License. You are not responsible
+ for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+ organization, or substantially all assets of one, or subdividing an
+ organization, or merging organizations. If propagation of a covered
+ work results from an entity transaction, each party to that
+ transaction who receives a copy of the work also receives whatever
+ licenses to the work the party's predecessor in interest had or could
+ give under the previous paragraph, plus a right to possession of the
+ Corresponding Source of the work from the predecessor in interest, if
+ the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+ rights granted or affirmed under this License. For example, you may
+ not impose a license fee, royalty, or other charge for exercise of
+ rights granted under this License, and you may not initiate litigation
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
+ any patent claim is infringed by making, using, selling, offering for
+ sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+ License of the Program or a work on which the Program is based. The
+ work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+ owned or controlled by the contributor, whether already acquired or
+ hereafter acquired, that would be infringed by some manner, permitted
+ by this License, of making, using, or selling its contributor version,
+ but do not include claims that would be infringed only as a
+ consequence of further modification of the contributor version. For
+ purposes of this definition, "control" includes the right to grant
+ patent sublicenses in a manner consistent with the requirements of
+ this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+ patent license under the contributor's essential patent claims, to
+ make, use, sell, offer for sale, import and otherwise run, modify and
+ propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+ agreement or commitment, however denominated, not to enforce a patent
+ (such as an express permission to practice a patent or covenant not to
+ sue for patent infringement). To "grant" such a patent license to a
+ party means to make such an agreement or commitment not to enforce a
+ patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+ and the Corresponding Source of the work is not available for anyone
+ to copy, free of charge and under the terms of this License, through a
+ publicly available network server or other readily accessible means,
+ then you must either (1) cause the Corresponding Source to be so
+ available, or (2) arrange to deprive yourself of the benefit of the
+ patent license for this particular work, or (3) arrange, in a manner
+ consistent with the requirements of this License, to extend the patent
+ license to downstream recipients. "Knowingly relying" means you have
+ actual knowledge that, but for the patent license, your conveying the
+ covered work in a country, or your recipient's use of the covered work
+ in a country, would infringe one or more identifiable patents in that
+ country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+ arrangement, you convey, or propagate by procuring conveyance of, a
+ covered work, and grant a patent license to some of the parties
+ receiving the covered work authorizing them to use, propagate, modify
+ or convey a specific copy of the covered work, then the patent license
+ you grant is automatically extended to all recipients of the covered
+ work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+ the scope of its coverage, prohibits the exercise of, or is
+ conditioned on the non-exercise of one or more of the rights that are
+ specifically granted under this License. You may not convey a covered
+ work if you are a party to an arrangement with a third party that is
+ in the business of distributing software, under which you make payment
+ to the third party based on the extent of your activity of conveying
+ the work, and under which the third party grants, to any of the
+ parties who would receive the covered work from you, a discriminatory
+ patent license (a) in connection with copies of the covered work
+ conveyed by you (or copies made from those copies), or (b) primarily
+ for and in connection with specific products or compilations that
+ contain the covered work, unless you entered into that arrangement,
+ or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+ any implied license or other defenses to infringement that may
+ otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+ otherwise) that contradict the conditions of this License, they do not
+ excuse you from the conditions of this License. If you cannot convey a
+ covered work so as to satisfy simultaneously your obligations under this
+ License and any other pertinent obligations, then as a consequence you may
+ not convey it at all. For example, if you agree to terms that obligate you
+ to collect a royalty for further conveying from those to whom you convey
+ the Program, the only way you could satisfy both those terms and this
+ License would be to refrain entirely from conveying the Program.
+
+ 13. Remote Network Interaction; Use with the GNU General Public License.
+
+ Notwithstanding any other provision of this License, if you modify the
+ Program, your modified version must prominently offer all users
+ interacting with it remotely through a computer network (if your version
+ supports such interaction) an opportunity to receive the Corresponding
+ Source of your version by providing access to the Corresponding Source
+ from a network server at no charge, through some standard or customary
+ means of facilitating copying of software. This Corresponding Source
+ shall include the Corresponding Source for any work covered by version 3
+ of the GNU General Public License that is incorporated pursuant to the
+ following paragraph.
+
+ Notwithstanding any other provision of this License, you have
+ permission to link or combine any covered work with a work licensed
+ under version 3 of the GNU General Public License into a single
+ combined work, and to convey the resulting work. The terms of this
+ License will continue to apply to the part which is the covered work,
+ but the work with which it is combined will remain governed by version
+ 3 of the GNU General Public License.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+ the GNU Affero General Public License from time to time. Such new versions
+ will be similar in spirit to the present version, but may differ in detail to
+ address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+ Program specifies that a certain numbered version of the GNU Affero General
+ Public License "or any later version" applies to it, you have the
+ option of following the terms and conditions either of that numbered
+ version or of any later version published by the Free Software
+ Foundation. If the Program does not specify a version number of the
+ GNU Affero General Public License, you may choose any version ever published
+ by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+ versions of the GNU Affero General Public License can be used, that proxy's
+ public statement of acceptance of a version permanently authorizes you
+ to choose that version for the Program.
+
+ Later license versions may give you additional or different
+ permissions. However, no additional obligations are imposed on any
+ author or copyright holder as a result of your choosing to follow a
+ later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+ above cannot be given local legal effect according to their terms,
+ reviewing courts shall apply local law that most closely approximates
+ an absolute waiver of all civil liability in connection with the
+ Program, unless a warranty or assumption of liability accompanies a
+ copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+ possible use to the public, the best way to achieve this is to make it
+ free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+ to attach them to the start of each source file to most effectively
+ state the exclusion of warranty; and each file should have at least
+ the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published
+ by the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+ Also add information on how to contact you by electronic and paper mail.
+
+ If your software can interact with users remotely through a computer
+ network, you should also make sure that it provides a way for users to
+ get its source. For example, if your program is a web application, its
+ interface could display a "Source" link that leads users to an archive
+ of the code. There are many ways you could offer source, and different
+ solutions will be better for different programs; see section 13 for the
+ specific requirements.
+
+ You should also get your employer (if you work as a programmer) or school,
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
+ For more information on this, and how to apply and follow the GNU AGPL, see
+ <https://www.gnu.org/licenses/>.
README.md CHANGED
@@ -1,12 +1,292 @@
  ---
  title: MinerU
- emoji: 📈
- colorFrom: red
- colorTo: pink
+ app_file: ./demo/app.py
  sdk: gradio
  sdk_version: 4.39.0
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
7
+ <div id="top"></div>
8
+ <div align="center">
9
 
10
+ [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
11
+ [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
12
+ [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
13
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
14
+ [![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
15
+ [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
16
+ [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
17
+
18
+
19
+
20
+
21
+ [English](README.md) | [简体中文](README_zh-CN.md)
22
+
23
+ </div>
24
+
25
+ <div align="center">
26
+
27
+ </div>
28
+
29
+ # MinerU
30
+
31
+
32
+ ## Introduction
33
+
34
+ MinerU is a one-stop, open-source, high-quality data extraction tool, includes the following primary features:
35
+
36
+ - [Magic-PDF](#Magic-PDF) PDF Document Extraction
37
+ - [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction
38
+
39
+
40
+ # Magic-PDF
41
+
42
+
43
+ ## Introduction
44
+
45
+ Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on S3-compatible object storage.
46
+
47
+ Key features include:
48
+
49
+ - Support for multiple front-end model inputs
50
+ - Removal of headers, footers, footnotes, and page numbers
51
+ - Human-readable layout formatting
52
+ - Retention of the original document's structure and formatting, including headings, paragraphs, lists, and more
53
+ - Extraction and display of images and tables within markdown
54
+ - Conversion of equations into LaTeX format
55
+ - Automatic detection and conversion of garbled PDFs
56
+ - Compatibility with CPU and GPU environments
57
+ - Available for Windows, Linux, and macOS platforms
58
+
59
+
60
+ https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
61
+
62
+
63
+
64
+ ## Project Panorama
65
+
66
+ ![Project Panorama](docs/images/project_panorama_en.png)
67
+
68
+
69
+ ## Flowchart
70
+
71
+ ![Flowchart](docs/images/flowchart_en.png)
72
+
73
+ ### Dependency repositories
74
+
75
+ - [PDF-Extract-Kit : A Comprehensive Toolkit for High-Quality PDF Content Extraction](https://github.com/opendatalab/PDF-Extract-Kit) 🚀🚀🚀
76
+
77
+ ## Getting Started
78
+
79
+ ### Requirements
80
+
81
+ - Python >= 3.9
82
+
83
+ Using a virtual environment is recommended to avoid potential dependency conflicts; both venv and conda are suitable.
84
+ For example:
85
+ ```bash
86
+ conda create -n MinerU python=3.10
87
+ conda activate MinerU
88
+ ```
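+ Development is based on Python 3.10; if you run into problems on another Python version, switch to 3.10.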
89
+
90
+ ### Installation and Configuration
91
+
92
+ #### 1. Install Magic-PDF
93
+
94
+ Install the full-feature package with pip:
95
+ >Note: The pip-installed package is CPU-only and is best suited for quick tests.
96
+ >
97
+ >For CUDA/MPS acceleration in production, see [Acceleration Using CUDA or MPS](#4-Acceleration-Using-CUDA-or-MPS).
98
+
99
+ ```bash
100
+ pip install magic-pdf[full-cpu]
101
+ ```
102
+ The full-feature package depends on detectron2, which must be compiled during installation.
103
+ If you need to compile it yourself, please refer to https://github.com/facebookresearch/detectron2/issues/5114
104
+ Alternatively, you can directly use our precompiled whl package (limited to Python 3.10):
105
+
106
+ ```bash
107
+ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
108
+ ```
109
+
110
+
111
+ #### 2. Downloading model weights files
112
+
113
+ For detailed instructions, see [how_to_download_models](docs/how_to_download_models_en.md).
114
+
115
+ After downloading the model weights, move the 'models' directory to a location with ample disk space, preferably on an SSD.
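+
+ For example (hypothetical paths; adjust them to your own machine), you can move the directory and capture the absolute path to use as "models-dir" in the next step:
+ ```bash
+ mv ./PDF-Extract-Kit/models /mnt/data/models   # assuming the weights were cloned via PDF-Extract-Kit
+ cd /mnt/data/models && pwd                     # "models-dir" must be an absolute path like this
+ ```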
116
+
117
+
118
+ #### 3. Copy the Configuration File and Make Configurations
119
+ You can get the [magic-pdf.template.json](magic-pdf.template.json) file in the repository root directory.
120
+ ```bash
121
+ cp magic-pdf.template.json ~/magic-pdf.json
122
+ ```
123
+ In magic-pdf.json, configure "models-dir" to point to the directory containing the model weight files.
124
+
125
+ ```json
126
+ {
127
+ "models-dir": "/tmp/models"
128
+ }
129
+ ```
130
+
131
+
132
+ #### 4. Acceleration Using CUDA or MPS
133
+ If you have an NVIDIA GPU available, or are using a Mac with Apple Silicon, you can use CUDA or MPS acceleration, respectively.
134
+ ##### CUDA
135
+
136
+ You need to install the corresponding PyTorch version according to your CUDA version.
137
+ This example installs a CUDA 11.8 build. For more information, see https://pytorch.org/get-started/locally/
138
+ ```bash
139
+ pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
140
+ ```
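+
+ As an optional sanity check (not part of the official steps), you can confirm that the CUDA build of PyTorch is active:
+ ```bash
+ python -c "import torch; print(torch.cuda.is_available())"  # should print True
+ ```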
141
+ Also, you need to modify the value of "device-mode" in the configuration file magic-pdf.json.
142
+ ```json
143
+ {
144
+ "device-mode":"cuda"
145
+ }
146
+ ```
147
+
148
+ ##### MPS
149
+
150
+ On macOS devices with M-series chips, you can use MPS for inference acceleration.
151
+ You also need to modify the value of "device-mode" in the configuration file magic-pdf.json.
152
+ ```json
153
+ {
154
+ "device-mode":"mps"
155
+ }
156
+ ```
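+
+ As with CUDA, here is an optional check that PyTorch can see the MPS backend:
+ ```bash
+ python -c "import torch; print(torch.backends.mps.is_available())"  # should print True
+ ```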
157
+
158
+
159
+ ### Usage
160
+
161
+ #### 1. Usage via Command Line
162
+
163
+ ###### Simple
164
+
165
+ ```bash
166
+ magic-pdf pdf-command --pdf "pdf_path" --inside_model true
167
+ ```
168
+ After the program finishes, you can find the generated markdown files under "/tmp/magic-pdf".
169
+ The corresponding xxx_model.json file is located in the same directory as the markdown.
170
+ If you intend to do secondary development on the post-processing pipeline, you can use the command:
171
+ ```bash
172
+ magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
173
+ ```
174
+ This way, you won't need to re-run the model inference, making debugging more convenient.
175
+
176
+
177
+ ###### More
178
+
179
+ ```bash
180
+ magic-pdf --help
181
+ ```
182
+
183
+
184
+ #### 2. Usage via API
185
+
186
+ ###### Local
187
+ ```python
188
+ import os
+ from magic_pdf.pipe.UNIPipe import UNIPipe
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
+
+ image_writer = DiskReaderWriter(local_image_dir)
189
+ image_dir = str(os.path.basename(local_image_dir))
190
+ jso_useful_key = {"_pdf_type": "", "model_list": []}
191
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
192
+ pipe.pipe_classify()
193
+ pipe.pipe_parse()
194
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
195
+ ```
196
+
197
+ ###### Object Storage
198
+ ```python
199
+ from magic_pdf.pipe.UNIPipe import UNIPipe
+ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
+
+ s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
200
+ image_dir = "s3://img_bucket/"
201
+ s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
202
+ pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
203
+ jso_useful_key = {"_pdf_type": "", "model_list": []}
204
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
205
+ pipe.pipe_classify()
206
+ pipe.pipe_parse()
207
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
208
+ ```
209
+
210
+ For a complete demo, see [demo.py](demo/demo.py).
211
+
212
+
213
+ # Magic-Doc
214
+
215
+
216
+ ## Introduction
217
+
218
+ Magic-Doc is a tool designed to convert web pages or multi-format e-books into markdown format.
219
+
220
+ Key features include:
221
+
222
+ - Web Page Extraction
223
+ - Cross-modal precise parsing of text, images, tables, and formula information.
224
+
225
+ - E-Book Document Extraction
226
+ - Supports various document formats, including EPUB and MOBI, with full adaptation for text and images.
227
+
228
+ - Language Type Identification
229
+ - Accurate recognition of 176 languages.
230
+
231
+ https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca
232
+
233
+
234
+
235
+ https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d
236
+
237
+
238
+
239
+ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2
240
+
241
+
242
+
243
+
244
+ ## Project Repository
245
+
246
+ - [Magic-Doc](https://github.com/InternLM/magic-doc)
247
+ Outstanding Webpage and E-book Extraction Tool
248
+
249
+
250
+ # All Thanks To Our Contributors
251
+
252
+ <a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
253
+ <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
254
+ </a>
255
+
256
+
257
+ # License Information
258
+
259
+ [LICENSE.md](LICENSE.md)
260
+
261
+ The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
262
+
263
+
264
+ # Acknowledgments
265
+
266
+ - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
267
+ - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
268
+ - [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
269
+ - [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
270
+
271
+
272
+ # Citation
273
+
274
+ ```bibtex
275
+ @misc{2024mineru,
276
+ title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool},
277
+ author={MinerU Contributors},
278
+ howpublished = {\url{https://github.com/opendatalab/MinerU}},
279
+ year={2024}
280
+ }
281
+ ```
282
+
283
+
284
+ # Star History
285
+
286
+ <a>
287
+ <picture>
288
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date&theme=dark" />
289
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
290
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
291
+ </picture>
292
+ </a>
README_zh-CN.md ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div id="top"></div>
2
+ <div align="center">
3
+
4
+ [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
5
+ [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
6
+ [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
7
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
8
+ [![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
9
+ [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
10
+ [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
11
+
12
+ [English](README.md) | [简体中文](README_zh-CN.md)
13
+
14
+ </div>
15
+
16
+ <div align="center">
17
+
18
+ </div>
19
+
20
+ # MinerU
21
+
22
+
23
+ ## 简介
24
+
25
+ MinerU 是一款一站式、开源、高质量的数据提取工具,主要包含以下功能:
26
+
27
+ - [Magic-PDF](#Magic-PDF) PDF文档提取
28
+ - [Magic-Doc](#Magic-Doc) 网页与电子书提取
29
+
30
+ # Magic-PDF
31
+
32
+
33
+ ## 简介
34
+
35
+ Magic-PDF 是一款将 PDF 转化为 markdown 格式的工具。支持转换本地文档或者位于支持S3协议对象存储上的文件。
36
+
37
+ 主要功能包含
38
+
39
+ - 支持多种前端模型输入
40
+ - 删除页眉、页脚、脚注、页码等元素
41
+ - 符合人类阅读顺序的排版格式
42
+ - 保留原文档的结构和格式,包括标题、段落、列表等
43
+ - 提取图像和表格并在markdown中展示
44
+ - 将公式转换成latex
45
+ - 乱码PDF自动识别并转换
46
+ - 支持cpu和gpu环境
47
+ - 支持windows/linux/mac平台
48
+
49
+
50
+ https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
51
+
52
+
53
+
54
+ ## 项目全景
55
+
56
+ ![项目全景图](docs/images/project_panorama_zh_cn.png)
57
+
58
+ ## 流程图
59
+
60
+ ![流程图](docs/images/flowchart_zh_cn.png)
61
+
62
+ ### 子模块仓库
63
+
64
+ - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
65
+ - 高质量的PDF内容提取工具包
66
+
67
+ ## 上手指南
68
+
69
+ ### 配置要求
70
+
71
+ python >= 3.9
72
+
73
+ 推荐使用虚拟环境,以避免可能发生的依赖冲突,venv和conda均可使用。
74
+ 例如:
75
+ ```bash
76
+ conda create -n MinerU python=3.10
77
+ conda activate MinerU
78
+ ```
79
+ 开发基于python 3.10,如果在其他版本python出现问题请切换至3.10。
80
+
81
+ ### 安装配置
82
+
83
+ #### 1. 安装Magic-PDF
84
+
85
+ 使用pip安装完整功能包:
86
+ >受pypi限制,pip安装的完整功能包仅支持cpu推理,建议只用于快速测试解析能力。
87
+ >
88
+ >如需在生产环境使用CUDA/MPS加速请参考[使用CUDA或MPS加速推理](#4-使用CUDA或MPS加速推理)
89
+ ```bash
90
+ pip install magic-pdf[full-cpu]
91
+ ```
92
+ 完整功能包依赖detectron2,该库需要编译安装,如需自行编译,请参考 https://github.com/facebookresearch/detectron2/issues/5114
93
+ 或是直接使用我们预编译的whl包(仅限python 3.10):
94
+ ```bash
95
+ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
96
+ ```
97
+
98
+ #### 2. 下载模型权重文件
99
+
100
+ 详细参考 [如何下载模型文件](docs/how_to_download_models_zh_cn.md)
101
+ 下载后请将models目录移动到空间较大的ssd磁盘目录
102
+
103
+ #### 3. 拷贝配置文件并进行配置
104
+ 在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 文件
105
+ ```bash
106
+ cp magic-pdf.template.json ~/magic-pdf.json
107
+ ```
108
+ 在magic-pdf.json中配置"models-dir"为模型权重文件所在目录
109
+ ```json
110
+ {
111
+ "models-dir": "/tmp/models"
112
+ }
113
+ ```
114
+
115
+ #### 4. 使用CUDA或MPS加速推理
116
+ 如您有可用的Nvidia显卡或在使用Apple Silicon的Mac,可以使用CUDA或MPS进行加速
117
+ ##### CUDA
118
+
119
+ 需要根据自己的CUDA版本安装对应的pytorch版本
120
+ 以下是对应CUDA 11.8版本的安装命令,更多信息请参考 https://pytorch.org/get-started/locally/
121
+ ```bash
122
+ pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
123
+ ```
124
+
125
+ 同时需要修改配置文件magic-pdf.json中"device-mode"的值
126
+ ```json
127
+ {
128
+ "device-mode":"cuda"
129
+ }
130
+ ```
131
+
132
+ ##### MPS
133
+ 使用macOS(M系列芯片设备)可以使用MPS进行推理加速
134
+ 需要修改配置文件magic-pdf.json中"device-mode"的值
135
+ ```json
136
+ {
137
+ "device-mode":"mps"
138
+ }
139
+ ```
140
+
141
+
142
+ ### 使用说明
143
+
144
+ #### 1. 通过命令行使用
145
+
146
+ ###### 直接使用
147
+
148
+ ```bash
149
+ magic-pdf pdf-command --pdf "pdf_path" --inside_model true
150
+ ```
151
+ 程序运行完成后,你可以在"/tmp/magic-pdf"目录下看到生成的markdown文件,markdown目录中可以找到对应的xxx_model.json文件
152
+ 如果您有意对后处理pipeline进行二次开发,可以使用命令
153
+ ```bash
154
+ magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
155
+ ```
156
+ 这样就不需要重跑模型数据,调试起来更方便
157
+
158
+ ###### 更多用法
159
+
160
+ ```bash
161
+ magic-pdf --help
162
+ ```
163
+
164
+
165
+ #### 2. 通过接口调用
166
+
167
+ ###### 本地使用
168
+ ```python
169
+ import os
+ from magic_pdf.pipe.UNIPipe import UNIPipe
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
+
+ image_writer = DiskReaderWriter(local_image_dir)
170
+ image_dir = str(os.path.basename(local_image_dir))
171
+ jso_useful_key = {"_pdf_type": "", "model_list": model_json}
172
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
173
+ pipe.pipe_classify()
174
+ pipe.pipe_parse()
175
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
176
+ ```
177
+
178
+ ###### 在对象存储上使用
179
+ ```python
180
+ from magic_pdf.pipe.UNIPipe import UNIPipe
+ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
+
+ s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
181
+ image_dir = "s3://img_bucket/"
182
+ s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
183
+ pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
184
+ jso_useful_key = {"_pdf_type": "", "model_list": model_json}
185
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
186
+ pipe.pipe_classify()
187
+ pipe.pipe_parse()
188
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
189
+ ```
190
+
191
+ 详细实现可参考 [demo.py](demo/demo.py)
192
+
193
+
194
+ ### 常见问题处理解答
195
+
196
+ 参考 [FAQ](docs/FAQ_zh_cn.md)
197
+
198
+
199
+ # Magic-Doc
200
+
201
+
202
+ ## 简介
203
+
204
+ Magic-Doc 是一款支持将网页或多格式电子书转换为 markdown 格式的工具。
205
+
206
+ 主要功能包含
207
+
208
+ - Web网页提取
209
+ - 跨模态精准解析图文、表格、公式信息
210
+
211
+ - 电子书文献提取
212
+ - 支持 epub,mobi等多格式文献,文本图片全适配
213
+
214
+ - 语言类型鉴定
215
+ - 支持176种语言的准确识别
216
+
217
+ https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca
218
+
219
+
220
+
221
+ https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d
222
+
223
+
224
+
225
+ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2
226
+
227
+
228
+
229
+
230
+ ## 项目仓库
231
+
232
+ - [Magic-Doc](https://github.com/InternLM/magic-doc)
233
+ 优秀的网页与电子书提取工具
234
+
235
+
236
+ ## 感谢我们的贡献者
237
+
238
+ <a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
239
+ <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
240
+ </a>
241
+
242
+
243
+ ## 版权说明
244
+
245
+ [LICENSE.md](LICENSE.md)
246
+
247
+ 本项目目前采用PyMuPDF以实现高级功能,但因其遵循AGPL协议,可能对某些使用场景构成限制。未来版本迭代中,我们计划探索并替换为许可条款更为宽松的PDF处理库,以提升用户友好度及灵活性。
248
+
249
+
250
+ ## 致谢
251
+ - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
252
+ - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
253
+ - [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
254
+ - [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
255
+
256
+
257
+ # 引用
258
+
259
+ ```bibtex
260
+ @misc{2024mineru,
261
+ title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool},
262
+ author={MinerU Contributors},
263
+ howpublished = {\url{https://github.com/opendatalab/MinerU}},
264
+ year={2024}
265
+ }
266
+ ```
267
+
268
+
269
+ # Star History
270
+
271
+ <a>
272
+ <picture>
273
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date&theme=dark" />
274
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
275
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
276
+ </picture>
277
+ </a>
demo/app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import gradio as gr
4
+ from loguru import logger
5
+ from magic_pdf.pipe.UNIPipe import UNIPipe
6
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
7
+ import magic_pdf.model as model_config
8
+
9
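+ # Use the built-in model analysis (pipe_analyze) so parsing works without an external model JSON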
+ model_config.__use_inside_model__ = True
10
+
11
+
12
+ def process_pdf(file_path):
13
+ try:
14
+ pdf_bytes = open(file_path, "rb").read()
15
+ model_json = [] # model_json传空list使用内置模型解析
16
+ jso_useful_key = {"_pdf_type": "", "model_list": model_json}
17
+ local_image_dir = os.path.join('uploads', 'images')
18
+ if not os.path.exists(local_image_dir):
19
+ os.makedirs(local_image_dir)
20
+ image_dir = str(os.path.basename(local_image_dir))
21
+ image_writer = DiskReaderWriter(local_image_dir)
22
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
23
+ pipe.pipe_classify()
24
+ if len(model_json) == 0:
25
+ if model_config.__use_inside_model__:
26
+ pipe.pipe_analyze()
27
+ else:
28
+ logger.error("need model list input")
29
+ return None
30
+ pipe.pipe_parse()
31
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
32
+ return md_content
33
+ except Exception as e:
34
+ logger.exception(e)
35
+ return None
36
+
37
+
38
+ def extract_markdown_from_pdf(pdf):
39
+ # 保存上传的PDF文件
40
+ os.makedirs('uploads', exist_ok=True)  # ensure the upload directory exists before writing
+ file_path = os.path.join('uploads', os.path.basename(pdf.name))  # basename in case the upload carries a full temp path
41
+ with open(file_path, 'wb') as f:
42
+ f.write(pdf.read())
43
+
44
+ # 处理PDF文件并生成Markdown内容
45
+ md_content = process_pdf(file_path)
46
+ return md_content
47
+
48
+
49
+ def main():
50
+ # 创建Gradio接口
51
+ with gr.Blocks() as demo:
52
+ gr.Markdown("# PDF to Markdown Converter")
53
+
54
+ with gr.Row():
55
+ with gr.Column():
56
+ pdf_file = gr.File(label="Upload PDF", file_types=['.pdf'])
57
+ md_output = gr.Markdown(label="Extracted Markdown")
58
+
59
+ extract_button = gr.Button("Extract Markdown")
60
+ extract_button.click(extract_markdown_from_pdf, inputs=[
61
+ pdf_file], outputs=[md_output])
62
+
63
+ demo.launch(share=True)
64
+
65
+
66
+ if __name__ == '__main__':
67
+ main()
demo/demo.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
+ from loguru import logger
5
+
6
+ from magic_pdf.pipe.UNIPipe import UNIPipe
7
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
8
+
9
+ import magic_pdf.model as model_config
10
+ model_config.__use_inside_model__ = True
11
+
12
+ try:
13
+ current_script_dir = os.path.dirname(os.path.abspath(__file__))
14
+ demo_name = "demo1"
15
+ pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
16
+ model_path = os.path.join(current_script_dir, f"{demo_name}.json")
17
+ pdf_bytes = open(pdf_path, "rb").read()
18
+ # model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
19
+ model_json = [] # model_json传空list使用内置模型解析
20
+ jso_useful_key = {"_pdf_type": "", "model_list": model_json}
21
+ local_image_dir = os.path.join(current_script_dir, 'images')
22
+ image_dir = str(os.path.basename(local_image_dir))
23
+ image_writer = DiskReaderWriter(local_image_dir)
24
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
25
+ pipe.pipe_classify()
+ # when model_json is empty, run the built-in model analysis first (mirrors demo/app.py)
+ if len(model_json) == 0:
+ if model_config.__use_inside_model__:
+ pipe.pipe_analyze()
+ else:
+ logger.error("need model list input")
+ exit(1)
26
+ pipe.pipe_parse()
27
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
28
+ with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
29
+ f.write(md_content)
30
+ except Exception as e:
31
+ logger.exception(e)
demo/demo1.json ADDED
The diff for this file is too large to render. See raw diff
 
demo/demo1.pdf ADDED
Binary file (337 kB). View file
 
demo/demo2.json ADDED
The diff for this file is too large to render. See raw diff
 
demo/demo2.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e94e95637356e1599510436278747d1150a3dfb822233bdc77a9dcb9a4fc6e4
3
+ size 1808096
docs/FAQ_zh_cn.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 常见问题解答
2
+
3
+ ### 1.离线部署首次运行,报错urllib.error.URLError: <urlopen error [Errno 101] Network is unreachable>
4
+
5
+ 首次运行需要在线下载一个小的语言检测模型,如果是离线部署需要手动下载该模型并放到指定目录。
6
+ 参考:https://github.com/opendatalab/MinerU/issues/121
7
+
8
+ ### 2.在较新版本的mac上使用命令安装pip install magic-pdf[full-cpu] zsh: no matches found: magic-pdf[full-cpu]
9
+
10
+ 在 macOS 上,默认的 shell 从 Bash 切换到了 Z shell,而 Z shell 对于某些类型的字符串匹配有特殊的处理逻辑,这可能导致no matches found错误。
11
+ 可以通过在命令行禁用globbing特性,再尝试运行安装命令
12
+ ```bash
13
+ setopt no_nomatch
14
+ pip install magic-pdf[full-cpu]
15
+ ```
16
+
17
+ ### 3.在intel cpu 的mac上 安装最新版的完整功能包 magic-pdf[full-cpu] (0.6.x) 不成功
18
+
19
+ 完整功能包依赖的公式解析库unimernet限制了pytorch的最低版本为2.3.0,而pytorch官方没有为intel cpu的macOS 提供2.3.0版本的预编译包,所以会产生依赖不兼容的问题。
20
+ 可以先尝试安装unimernet的老版本之后再尝试安装完整功能包的其他依赖。(为避免依赖冲突,请激活一个全新的虚拟环境)
21
+ ```bash
22
+ pip install magic-pdf
23
+ pip install unimernet==0.1.0
24
+ pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle
25
+ ```
26
+
27
+ ### 4.在部分较新的M芯片macOS设备上,MPS加速开启失败
28
+
29
+ 卸载torch和torchvision,重新安装nightly构建版torch和torchvision
30
+ ```bash
31
+ pip uninstall torch torchvision
32
+ pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
33
+ ```
34
+ 参考: https://github.com/opendatalab/PDF-Extract-Kit/issues/23
35
+
36
+ ### 5.使用过程中遇到paddle相关的报错FatalError: Illegal instruction is detected by the operating system.
37
+
38
+ paddlepaddle 2.6.1与部分linux系统环境存在兼容性问题。
39
+ 可尝试降级到2.5.2使用,
40
+ ```bash
41
+ pip install paddlepaddle==2.5.2
42
+ ```
43
+ 或卸载paddlepaddle,重新安装paddlepaddle-gpu
44
+ ```bash
45
+ pip uninstall paddlepaddle
46
+ pip install paddlepaddle-gpu
47
+ ```
48
+ 参考:https://github.com/opendatalab/MinerU/issues/146
49
+
50
+ ### 6.使用过程中遇到_pickle.UnpicklingError: invalid load key, 'v'.错误
51
+
52
+ 可能是由于模型文件未下载完整导致,可尝试重新下载模型文件后再试
53
+ 参考:https://github.com/opendatalab/MinerU/issues/143
54
+
55
+ ### 7.程序运行完成后,找不到tmp目录
56
+
57
+ 程序输出目录是在"magic-pdf.json"中通过
58
+ ```json
59
+ {
60
+ "temp-output-dir": "/tmp"
61
+ }
62
+ ```
63
+ 进行配置的。
64
+ 如果没有更改这个参数,使用默认的配置执行程序,在linux/macOS会在绝对路径"/tmp"下创建一个"magic-pdf"文件夹作为输出路径。
65
+ 而在windows下,默认的输出路径与执行命令时,命令行所在的盘符相关,如果命令行在C盘,则默认输出路径为"C://tmp/magic-pdf"。
66
+ 参考:https://github.com/opendatalab/MinerU/issues/149
67
+
68
+ ### 8.模型文件应该下载到哪里/models-dir的配置应该怎么填
69
+
70
+ 模型文件的路径输入是在"magic-pdf.json"中通过
71
+ ```json
72
+ {
73
+ "models-dir": "/tmp/models"
74
+ }
75
+ ```
76
+ 进行配置的。
77
+ 这个路径是绝对路径而不是相对路径,绝对路径的获取可在models目录中通过命令 "pwd" 获取。
78
+ 参考:https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
79
+
80
+ ### 9.命令行中 --model "model_json_path" 指的是什么?
81
+
82
+ model_json 指的是通过模型分析后生成的一种有特定格式的json文件。
83
+ 如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成,该文件一般在项目的output目录下。
84
+ 如果使用 MinerU 的命令行调用内置的模型分析,该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。
85
+ 参考:https://github.com/opendatalab/MinerU/issues/128
docs/how_to_download_models_en.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Install Git LFS
2
+ Before you begin, make sure Git Large File Storage (Git LFS) is installed on your system. Install it using the following command:
3
+
4
+ ```bash
5
+ git lfs install
6
+ ```
7
+
8
+ ### Download the Model from Hugging Face
9
+ To download the `PDF-Extract-Kit` model from Hugging Face, use the following command:
10
+
11
+ ```bash
12
+ git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit
13
+ ```
14
+
15
+ Ensure that Git LFS is enabled during the clone to properly download all large files.
16
+
17
+
18
+
19
+ ### Download the Model from ModelScope
20
+
21
+ #### SDK Download
22
+
23
+ ```bash
24
+ # First, install the ModelScope library using pip:
25
+ pip install modelscope
26
+ ```
27
+
28
+ ```python
29
+ # Use the following Python code to download the model using the ModelScope SDK:
30
+ from modelscope import snapshot_download
31
+ model_dir = snapshot_download('wanderkid/PDF-Extract-Kit')
32
+ ```
33
+
34
+ #### Git Download
35
+ Alternatively, you can use Git to clone the model repository from ModelScope:
36
+
37
+ ```bash
38
+ git clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git
39
+ ```
40
+
41
+
42
+ After downloading, the model folder should have the following structure, containing configuration files and weights for each component:
43
+
44
+ ```
45
+ ./
46
+ ├── Layout
47
+ │ ├── config.json
48
+ │ └── weights.pth
49
+ ├── MFD
50
+ │ └── weights.pt
51
+ ├── MFR
52
+ │ └── UniMERNet
53
+ │ ├── config.json
54
+ │ ├── preprocessor_config.json
55
+ │ ├── pytorch_model.bin
56
+ │ ├── README.md
57
+ │ ├── tokenizer_config.json
58
+ │ └── tokenizer.json
59
+ └── README.md
60
+ ```
docs/how_to_download_models_zh_cn.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### 安装 Git LFS
2
+ 开始之前,请确保您的系统上已安装 Git 大文件存储 (Git LFS)。使用以下命令进行安装
3
+
4
+ ```bash
5
+ git lfs install
6
+ ```
7
+
8
+ ### 从 Hugging Face 下载模型
9
+ 请使用以下命令从 Hugging Face 下载 PDF-Extract-Kit 模型:
10
+
11
+ ```bash
12
+ git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit
13
+ ```
14
+
15
+ 确保在克隆过程中启用了 Git LFS,以便正确下载所有大文件。
16
+
17
+
18
+ ### 从 ModelScope 下载模型
19
+
20
+ #### SDK下载
21
+
22
+ ```bash
23
+ # 首先安装modelscope
24
+ pip install modelscope
25
+ ```
26
+
27
+ ```python
28
+ # 使用modelscope sdk下载模型
29
+ from modelscope import snapshot_download
30
+ model_dir = snapshot_download('wanderkid/PDF-Extract-Kit')
31
+ ```
32
+
33
+ #### Git下载
34
+ 也可以使用git clone从 ModelScope 下载模型:
35
+
36
+ ```bash
37
+ git clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git
38
+ ```
39
+
40
+
41
+ 将 'models' 目录移动到具有较大磁盘空间的目录中,最好是在固态硬盘(SSD)上。
42
+
43
+
44
+ 模型文件夹的结构如下,包含了不同组件的配置文件和权重文件:
45
+ ```
46
+ ./
47
+ ├── Layout
48
+ │ ├── config.json
49
+ │ └── model_final.pth
50
+ ├── MFD
51
+ │ └── weights.pt
52
+ ├── MFR
53
+ │ └── UniMERNet
54
+ │ ├── config.json
55
+ │ ├── preprocessor_config.json
56
+ │ ├── pytorch_model.bin
57
+ │ ├── README.md
58
+ │ ├── tokenizer_config.json
59
+ │ └── tokenizer.json
60
+ └── README.md
61
+ ```
docs/images/flowchart_en.png ADDED
docs/images/flowchart_zh_cn.png ADDED
docs/images/project_panorama_en.png ADDED
docs/images/project_panorama_zh_cn.png ADDED
magic-pdf.template.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bucket_info":{
3
+ "bucket-name-1":["ak", "sk", "endpoint"],
4
+ "bucket-name-2":["ak", "sk", "endpoint"]
5
+ },
6
+ "temp-output-dir":"/tmp",
7
+ "models-dir":"/tmp/models",
8
+ "device-mode":"cpu"
9
+ }
magic_pdf/__init__.py ADDED
File without changes
magic_pdf/cli/__init__.py ADDED
File without changes
magic_pdf/cli/magicpdf.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 这里实现2个click命令:
3
+ 第一个:
4
+ 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
5
+ 1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
6
+ 2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
7
+ 3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
8
+ 4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
9
+
10
+ 最后把以上步骤准备好的对象传入真正的解析API
11
+
12
+ 第二个:
13
+ 接收1)pdf的本地路径。2)模型json文件(可选)。然后:
14
+ 1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
15
+ 2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
16
+ 3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
17
+
18
+
19
+ 效果:
20
+ python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
21
+ python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
22
+ """
23
+
24
+ import os
25
+ import json as json_parse
26
+ import click
27
+ from loguru import logger
28
+ from pathlib import Path
29
+ from magic_pdf.libs.version import __version__
30
+
31
+ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
32
+ from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
33
+ from magic_pdf.pipe.UNIPipe import UNIPipe
34
+ from magic_pdf.pipe.OCRPipe import OCRPipe
35
+ from magic_pdf.pipe.TXTPipe import TXTPipe
36
+ from magic_pdf.libs.path_utils import (
37
+ parse_s3path,
38
+ parse_s3_range_params,
39
+ remove_non_official_s3_args,
40
+ )
41
+ from magic_pdf.libs.config_reader import (
42
+ get_local_dir,
43
+ get_s3_config,
44
+ )
45
+ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
46
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
47
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
48
+ import csv
49
+ import copy
50
+ import magic_pdf.model as model_config
51
+
52
+ parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
53
+
54
+
55
+ def prepare_env(pdf_file_name, method):
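+ """Create the per-document output directories and return (local_image_dir, local_md_dir)."""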
56
+ local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
57
+
58
+ local_image_dir = os.path.join(str(local_parent_dir), "images")
59
+ local_md_dir = local_parent_dir
60
+ os.makedirs(local_image_dir, exist_ok=True)
61
+ os.makedirs(local_md_dir, exist_ok=True)
62
+ return local_image_dir, local_md_dir
63
+
64
+
65
+ def write_to_csv(csv_file_path, csv_data):
66
+ with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
67
+ # 创建csv writer对象
68
+ csv_writer = csv.writer(csvfile)
69
+ # 写入数据
70
+ csv_writer.writerow(csv_data)
71
+ logger.info(f"数据已成功追加到 '{csv_file_path}'")
72
+
73
+
74
+ def do_parse(
75
+ pdf_file_name,
76
+ pdf_bytes,
77
+ model_list,
78
+ parse_method,
79
+ f_draw_span_bbox=True,
80
+ f_draw_layout_bbox=True,
81
+ f_dump_md=True,
82
+ f_dump_middle_json=True,
83
+ f_dump_model_json=True,
84
+ f_dump_orig_pdf=True,
85
+ f_dump_content_list=True,
86
+ f_make_md_mode=MakeMode.MM_MD,
87
+ ):
88
+
89
+ orig_model_list = copy.deepcopy(model_list)
90
+
91
+ local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
92
+ logger.info(f"local output dir is {local_md_dir}")
93
+ image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
94
+ image_dir = str(os.path.basename(local_image_dir))
95
+
96
+ if parse_method == "auto":
97
+ jso_useful_key = {"_pdf_type": "", "model_list": model_list}
98
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
99
+ elif parse_method == "txt":
100
+ pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
101
+ elif parse_method == "ocr":
102
+ pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
103
+ else:
104
+ logger.error("unknown parse method")
105
+ exit(1)
106
+
107
+ pipe.pipe_classify()
108
+
109
+ """如果没有传入有效的模型数据,则使用内置model解析"""
110
+ if len(model_list) == 0:
111
+ if model_config.__use_inside_model__:
112
+ pipe.pipe_analyze()
113
+ orig_model_list = copy.deepcopy(pipe.model_list)
114
+ else:
115
+ logger.error("need model list input")
116
+ exit(1)
117
+
118
+ pipe.pipe_parse()
119
+ pdf_info = pipe.pdf_mid_data["pdf_info"]
120
+ if f_draw_layout_bbox:
121
+ draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
122
+ if f_draw_span_bbox:
123
+ draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
124
+
125
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
126
+ if f_dump_md:
127
+ """写markdown"""
128
+ md_writer.write(
129
+ content=md_content,
130
+ path=f"{pdf_file_name}.md",
131
+ mode=AbsReaderWriter.MODE_TXT,
132
+ )
133
+
134
+ if f_dump_middle_json:
135
+ """写middle_json"""
136
+ md_writer.write(
137
+ content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
138
+ path=f"{pdf_file_name}_middle.json",
139
+ mode=AbsReaderWriter.MODE_TXT,
140
+ )
141
+
142
+ if f_dump_model_json:
143
+ """写model_json"""
144
+ md_writer.write(
145
+ content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
146
+ path=f"{pdf_file_name}_model.json",
147
+ mode=AbsReaderWriter.MODE_TXT,
148
+ )
149
+
150
+ if f_dump_orig_pdf:
151
+ """写源pdf"""
152
+ md_writer.write(
153
+ content=pdf_bytes,
154
+ path=f"{pdf_file_name}_origin.pdf",
155
+ mode=AbsReaderWriter.MODE_BIN,
156
+ )
157
+
158
+ content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
159
+ if f_dump_content_list:
160
+ """写content_list"""
161
+ md_writer.write(
162
+ content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
163
+ path=f"{pdf_file_name}_content_list.json",
164
+ mode=AbsReaderWriter.MODE_TXT,
165
+ )
166
+
167
+
168
+ @click.group()
169
+ @click.version_option(__version__, "--version", "-v", help="显示版本信息")
170
+ @click.help_option("--help", "-h", help="显示帮助信息")
171
+ def cli():
172
+ pass
173
+
174
+
175
+ @cli.command()
176
+ @click.option("--json", type=str, help="输入一个S3路径")
177
+ @click.option(
178
+ "--method",
179
+ type=parse_pdf_methods,
180
+ help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
181
+ default="auto",
182
+ )
183
+ @click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
184
+ @click.option("--model_mode", type=click.STRING, default="full",
185
+ help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
186
+ def json_command(json, method, inside_model, model_mode):
187
+ model_config.__use_inside_model__ = inside_model
188
+ model_config.__model_mode__ = model_mode
189
+
190
+ if not json.startswith("s3://"):
191
+ logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
192
+ exit(1)
193
+
194
+ def read_s3_path(s3path):
195
+ bucket, key = parse_s3path(s3path)
196
+
197
+ s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
198
+ s3_rw = S3ReaderWriter(
199
+ s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
200
+ )
201
+ may_range_params = parse_s3_range_params(s3path)
202
+ if may_range_params is None or 2 != len(may_range_params):
203
+ byte_start, byte_end = 0, None
204
+ else:
205
+ byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
206
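+ # the second range value is a length, so convert (start, length) into an inclusive end offset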
+ byte_end += byte_start - 1
207
+ return s3_rw.read_jsonl(
208
+ remove_non_official_s3_args(s3path),
209
+ byte_start,
210
+ byte_end,
211
+ AbsReaderWriter.MODE_BIN,
212
+ )
213
+
214
+ jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
215
+ s3_file_path = jso.get("file_location")
216
+ if s3_file_path is None:
217
+ s3_file_path = jso.get("path")
218
+ pdf_file_name = Path(s3_file_path).stem
219
+ pdf_data = read_s3_path(s3_file_path)
220
+
221
+ do_parse(
222
+ pdf_file_name,
223
+ pdf_data,
224
+ jso["doc_layout_result"],
225
+ method,
226
+ )
227
+
228
+
229
+ @cli.command()
230
+ @click.option("--local_json", type=str, help="输入一个本地jsonl路径")
231
+ @click.option(
232
+ "--method",
233
+ type=parse_pdf_methods,
234
+ help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
235
+ default="auto",
236
+ )
237
+ @click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
238
+ @click.option("--model_mode", type=click.STRING, default="full",
239
+ help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
240
+ def local_json_command(local_json, method, inside_model, model_mode):
241
+ model_config.__use_inside_model__ = inside_model
242
+ model_config.__model_mode__ = model_mode
243
+
244
+ def read_s3_path(s3path):
245
+ bucket, key = parse_s3path(s3path)
246
+
247
+ s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
248
+ s3_rw = S3ReaderWriter(
249
+ s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
250
+ )
251
+ may_range_params = parse_s3_range_params(s3path)
252
+ if may_range_params is None or 2 != len(may_range_params):
253
+ byte_start, byte_end = 0, None
254
+ else:
255
+ byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
256
+ byte_end += byte_start - 1
257
+ return s3_rw.read_jsonl(
258
+ remove_non_official_s3_args(s3path),
259
+ byte_start,
260
+ byte_end,
261
+ AbsReaderWriter.MODE_BIN,
262
+ )
263
+
264
+ with open(local_json, "r", encoding="utf-8") as f:
265
+ for json_line in f:
266
+ jso = json_parse.loads(json_line)
267
+
268
+ s3_file_path = jso.get("file_location")
269
+ if s3_file_path is None:
270
+ s3_file_path = jso.get("path")
271
+ pdf_file_name = Path(s3_file_path).stem
272
+ pdf_data = read_s3_path(s3_file_path)
273
+ do_parse(
274
+ pdf_file_name,
275
+ pdf_data,
276
+ jso["doc_layout_result"],
277
+ method,
278
+ )
279
+
280
+
281
+ @cli.command()
282
+ @click.option(
283
+ "--pdf", type=click.Path(exists=True), required=True,
284
+ help='pdf 文件路径, 支持单个文件或文件列表, 文件列表需要以".list"结尾, 一行一个pdf文件路径')
285
+ @click.option("--model", type=click.Path(exists=True), help="模型的路径")
286
+ @click.option(
287
+ "--method",
288
+ type=parse_pdf_methods,
289
+ help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
290
+ default="auto",
291
+ )
292
+ @click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
293
+ @click.option("--model_mode", type=click.STRING, default="full",
294
+ help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
295
+ def pdf_command(pdf, model, method, inside_model, model_mode):
296
+ model_config.__use_inside_model__ = inside_model
297
+ model_config.__model_mode__ = model_mode
298
+
299
+ def read_fn(path):
300
+ disk_rw = DiskReaderWriter(os.path.dirname(path))
301
+ return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
302
+
303
+ def get_model_json(model_path, doc_path):
304
+ # 这里处理pdf和模型相关的逻辑
305
+ if model_path is None:
306
+ file_name_without_extension, extension = os.path.splitext(doc_path)
307
+ if extension == ".pdf":
308
+ model_path = file_name_without_extension + ".json"
309
+ else:
310
+ raise Exception("pdf_path input error")
311
+ if not os.path.exists(model_path):
312
+ logger.warning(
313
+ f"not found json {model_path} existed"
314
+ )
315
+ # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
316
+ model_json = "[]"
317
+ else:
318
+ model_json = read_fn(model_path).decode("utf-8")
319
+ else:
320
+ model_json = read_fn(model_path).decode("utf-8")
321
+
322
+ return model_json
323
+
324
+ def parse_doc(doc_path):
325
+ try:
326
+ file_name = str(Path(doc_path).stem)
327
+ pdf_data = read_fn(doc_path)
328
+ jso = json_parse.loads(get_model_json(model, doc_path))
329
+
330
+ do_parse(
331
+ file_name,
332
+ pdf_data,
333
+ jso,
334
+ method,
335
+ )
336
+
337
+ except Exception as e:
338
+ logger.exception(e)
339
+
340
+ if not pdf:
341
+ logger.error(f"Error: Missing argument '--pdf'.")
342
+ exit(f"Error: Missing argument '--pdf'.")
343
+ else:
344
+ '''适配多个文档的list文件输入'''
345
+ if pdf.endswith(".list"):
346
+ with open(pdf, "r") as f:
347
+ for line in f.readlines():
348
+ line = line.strip()
349
+ parse_doc(line)
350
+ else:
351
+ '''适配单个文档的输入'''
352
+ parse_doc(pdf)
353
+
354
+
355
+ if __name__ == "__main__":
356
+ """
357
+ python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
358
+ """
359
+ cli()
magic_pdf/dict2md/__init__.py ADDED
File without changes
magic_pdf/dict2md/mkcontent.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from loguru import logger
3
+
4
+ from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
5
+ from magic_pdf.libs.commons import join_path
6
+ from magic_pdf.libs.ocr_content_type import ContentType
7
+
8
+ TYPE_INLINE_EQUATION = ContentType.InlineEquation
9
+ TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
10
+ UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
11
+
12
+
13
+ @DeprecationWarning
14
+ def mk_nlp_markdown_1(para_dict: dict):
15
+ """
16
+ 对排序后的bboxes拼接内容
17
+ """
18
+ content_lst = []
19
+ for _, page_info in para_dict.items():
20
+ para_blocks = page_info.get("para_blocks")
21
+ if not para_blocks:
22
+ continue
23
+
24
+ for block in para_blocks:
25
+ item = block["paras"]
26
+ for _, p in item.items():
27
+ para_text = p["para_text"]
28
+ is_title = p["is_para_title"]
29
+ title_level = p['para_title_level']
30
+ md_title_prefix = "#"*title_level
31
+ if is_title:
32
+ content_lst.append(f"{md_title_prefix} {para_text}")
33
+ else:
34
+ content_lst.append(para_text)
35
+
36
+ content_text = "\n\n".join(content_lst)
37
+
38
+ return content_text
39
+
40
+
41
+
42
+ # 找到目标字符串在段落中的索引
43
+ def __find_index(paragraph, target):
44
+ index = paragraph.find(target)
45
+ if index != -1:
46
+ return index
47
+ else:
48
+ return None
49
+
50
+
51
+ def __insert_string(paragraph, target, position):
52
+ new_paragraph = paragraph[:position] + target + paragraph[position:]
53
+ return new_paragraph
54
+
55
+
56
+ def __insert_after(content, image_content, target):
57
+ """
58
+ 在content中找到target,将image_content插入到target后面
59
+ """
60
+ index = content.find(target)
61
+ if index != -1:
62
+ content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):]
63
+ else:
64
+ logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
65
+ return content
66
+
67
+ def __insert_before(content, image_content, target):
68
+ """
69
+ 在content中找到target,将image_content插入到target前面
70
+ """
71
+ index = content.find(target)
72
+ if index != -1:
73
+ content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:]
74
+ else:
75
+ logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
76
+ return content
77
+
78
+
79
+ @DeprecationWarning
80
+ def mk_mm_markdown_1(para_dict: dict):
81
+ """拼装多模态markdown"""
82
+ content_lst = []
83
+ for _, page_info in para_dict.items():
84
+ page_lst = [] # 一个page内的段落列表
85
+ para_blocks = page_info.get("para_blocks")
86
+ pymu_raw_blocks = page_info.get("preproc_blocks")
87
+
88
+ all_page_images = []
89
+ all_page_images.extend(page_info.get("images",[]))
90
+ all_page_images.extend(page_info.get("image_backup", []) )
91
+ all_page_images.extend(page_info.get("tables",[]))
92
+ all_page_images.extend(page_info.get("table_backup",[]) )
93
+
94
+ if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
95
+ for img in all_page_images:
96
+ page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
97
+ page_md = "\n\n".join(page_lst)
98
+
99
+ else:
100
+ for block in para_blocks:
101
+ item = block["paras"]
102
+ for _, p in item.items():
103
+ para_text = p["para_text"]
104
+ is_title = p["is_para_title"]
105
+ title_level = p['para_title_level']
106
+ md_title_prefix = "#"*title_level
107
+ if is_title:
108
+ page_lst.append(f"{md_title_prefix} {para_text}")
109
+ else:
110
+ page_lst.append(para_text)
111
+
112
+ """拼装成一个页面的文本"""
113
+ page_md = "\n\n".join(page_lst)
114
+ """插入图片"""
115
+ for img in all_page_images:
116
+ imgbox = img['bbox']
117
+ img_content = f"![]({img['image_path']})"
118
+ # 先看在哪个block内
119
+ for block in pymu_raw_blocks:
120
+ bbox = block['bbox']
121
+ if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在block内
122
+ for l in block['lines']:
123
+ line_box = l['bbox']
124
+ if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的,插入line前面
125
+ line_txt = "".join([s['text'] for s in l['spans']])
126
+ page_md = __insert_before(page_md, img_content, line_txt)
127
+ break
128
+ break
129
+ else:# 在行与行之间
130
+ # 找到图片x0,y0与line的x0,y0最近的line
131
+ min_distance = 100000
132
+ min_line = None
133
+ for l in block['lines']:
134
+ line_box = l['bbox']
135
+ distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
136
+ if distance < min_distance:
137
+ min_distance = distance
138
+ min_line = l
139
+ if min_line:
140
+ line_txt = "".join([s['text'] for s in min_line['spans']])
141
+ img_h = imgbox[3] - imgbox[1]
142
+ if min_distance<img_h: # 文字在图片前面
143
+ page_md = __insert_after(page_md, img_content, line_txt)
144
+ else:
145
+ page_md = __insert_before(page_md, img_content, line_txt)
146
+ else:
147
+ logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #1")
148
+ else:# 应当在两个block之间
149
+ # 找到上方最近的block,如果上方没有就找下方最近的block
150
+ top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
151
+ if top_txt_block:
152
+ line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
153
+ page_md = __insert_after(page_md, img_content, line_txt)
154
+ else:
155
+ bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
156
+ if bottom_txt_block:
157
+ line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
158
+ page_md = __insert_before(page_md, img_content, line_txt)
159
+ else:
160
+ logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #2")
161
+
162
+ content_lst.append(page_md)
163
+
164
+ """拼装成全部页面的文本"""
165
+ content_text = "\n\n".join(content_lst)
166
+
167
+ return content_text
168
+
169
+
170
+ def __insert_after_para(text, type, element, content_list):
171
+ """
172
+ 在content_list中找到text,将image_path作为一个新的node插入到text后面
173
+ """
174
+ for i, c in enumerate(content_list):
175
+ content_type = c.get("type")
176
+ if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
177
+ if type == "image":
178
+ content_node = {
179
+ "type": "image",
180
+ "img_path": element.get("image_path"),
181
+ "img_alt": "",
182
+ "img_title": "",
183
+ "img_caption": "",
184
+ }
185
+ elif type == "table":
186
+ content_node = {
187
+ "type": "table",
188
+ "img_path": element.get("image_path"),
189
+ "table_latex": element.get("text"),
190
+ "table_title": "",
191
+ "table_caption": "",
192
+ "table_quality": element.get("quality"),
193
+ }
194
+ content_list.insert(i+1, content_node)
195
+ break
196
+ else:
197
+ logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
198
+
199
+
200
+
201
+ def __insert_before_para(text, type, element, content_list):
202
+ """
203
+ 在content_list中找到text,将image_path作为一个新的node插入到text前面
204
+ """
205
+ for i, c in enumerate(content_list):
206
+ content_type = c.get("type")
207
+ if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
208
+ if type == "image":
209
+ content_node = {
210
+ "type": "image",
211
+ "img_path": element.get("image_path"),
212
+ "img_alt": "",
213
+ "img_title": "",
214
+ "img_caption": "",
215
+ }
216
+ elif type == "table":
217
+ content_node = {
218
+ "type": "table",
219
+ "img_path": element.get("image_path"),
220
+ "table_latex": element.get("text"),
221
+ "table_title": "",
222
+ "table_caption": "",
223
+ "table_quality": element.get("quality"),
224
+ }
225
+ content_list.insert(i, content_node)
226
+ break
227
+ else:
228
+ logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
229
+
230
+
231
+ def mk_universal_format(pdf_info_list: list, img_buket_path):
232
+ """
233
+ 构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
234
+ """
235
+ content_lst = []
236
+ for page_info in pdf_info_list:
237
+ page_lst = [] # 一个page内的段落列表
238
+ para_blocks = page_info.get("para_blocks")
239
+ pymu_raw_blocks = page_info.get("preproc_blocks")
240
+
241
+ all_page_images = []
242
+ all_page_images.extend(page_info.get("images",[]))
243
+ all_page_images.extend(page_info.get("image_backup", []) )
244
+ # all_page_images.extend(page_info.get("tables",[]))
245
+ # all_page_images.extend(page_info.get("table_backup",[]) )
246
+ all_page_tables = []
247
+ all_page_tables.extend(page_info.get("tables", []))
248
+
249
+ if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
250
+ for img in all_page_images:
251
+ content_node = {
252
+ "type": "image",
253
+ "img_path": join_path(img_buket_path, img['image_path']),
254
+ "img_alt":"",
255
+ "img_title":"",
256
+ "img_caption":""
257
+ }
258
+ page_lst.append(content_node) # TODO 图片顺序
259
+ for table in all_page_tables:
260
+ content_node = {
261
+ "type": "table",
262
+ "img_path": join_path(img_buket_path, table['image_path']),
263
+ "table_latex": table.get("text"),
264
+ "table_title": "",
265
+ "table_caption": "",
266
+ "table_quality": table.get("quality"),
267
+ }
268
+ page_lst.append(content_node) # TODO 图片顺序
269
+ else:
270
+ for block in para_blocks:
271
+ item = block["paras"]
272
+ for _, p in item.items():
273
+ font_type = p['para_font_type']# 对于文本来说,要么是普通文本,要么是个行间公式
274
+ if font_type == TYPE_INTERLINE_EQUATION:
275
+ content_node = {
276
+ "type": "equation",
277
+ "latex": p["para_text"]
278
+ }
279
+ page_lst.append(content_node)
280
+ else:
281
+ para_text = p["para_text"]
282
+ is_title = p["is_para_title"]
283
+ title_level = p['para_title_level']
284
+
285
+ if is_title:
286
+ content_node = {
287
+ "type": f"h{title_level}",
288
+ "text": para_text
289
+ }
290
+ page_lst.append(content_node)
291
+ else:
292
+ content_node = {
293
+ "type": "text",
294
+ "text": para_text
295
+ }
296
+ page_lst.append(content_node)
297
+
298
+ content_lst.extend(page_lst)
299
+
300
+ """插入图片"""
301
+ for img in all_page_images:
302
+ insert_img_or_table("image", img, pymu_raw_blocks, content_lst)
303
+
304
+ """插入表格"""
305
+ for table in all_page_tables:
306
+ insert_img_or_table("table", table, pymu_raw_blocks, content_lst)
307
+ # end for
308
+ return content_lst
309
+
310
+
311
+ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
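+ """Insert an image/table node into content_lst at the position implied by its bbox within the page's text blocks."""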
312
+ element_bbox = element['bbox']
313
+ # 先看在哪个block内
314
+ for block in pymu_raw_blocks:
315
+ bbox = block['bbox']
316
+ if bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1 and bbox[1] - 1 <= element_bbox[1] < bbox[
317
+ 3] + 1: # 确定在这个大的block内,然后进入逐行比较距离
318
+ for l in block['lines']:
319
+ line_box = l['bbox']
320
+ if line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1 and line_box[1] - 1 <= element_bbox[1] < line_box[
321
+ 3] + 1: # 在line内的,插入line前面
322
+ line_txt = "".join([s['text'] for s in l['spans']])
323
+ __insert_before_para(line_txt, type, element, content_lst)
324
+ break
325
+ break
326
+ else: # 在行与行之间
327
+ # 找到图片x0,y0与line的x0,y0最近的line
328
+ min_distance = 100000
329
+ min_line = None
330
+ for l in block['lines']:
331
+ line_box = l['bbox']
332
+ distance = math.sqrt((line_box[0] - element_bbox[0]) ** 2 + (line_box[1] - element_bbox[1]) ** 2)
333
+ if distance < min_distance:
334
+ min_distance = distance
335
+ min_line = l
336
+ if min_line:
337
+ line_txt = "".join([s['text'] for s in min_line['spans']])
338
+ img_h = element_bbox[3] - element_bbox[1]
339
+ if min_distance < img_h: # 文字在图片前面
340
+ __insert_after_para(line_txt, type, element, content_lst)
341
+ else:
342
+ __insert_before_para(line_txt, type, element, content_lst)
343
+ break
344
+ else:
345
+ logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #1")
346
+ else: # 应当在两个block之间
347
+ # 找到上方最近的block,如果上方没有就找下方最近的block
348
+ top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
349
+ if top_txt_block:
350
+ line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
351
+ __insert_after_para(line_txt, type, element, content_lst)
352
+ else:
353
+ bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, element_bbox)
354
+ if bottom_txt_block:
355
+ line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
356
+ __insert_before_para(line_txt, type, element, content_lst)
357
+ else: # TODO ,图片可能独占一列,这种情况上下是没有图片的
358
+ logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #2")
359
+
360
+
361
+ def mk_mm_markdown(content_list):
362
+ """
363
+ 基于同一格式的内容列表,构造markdown,含图片
364
+ """
365
+ content_md = []
366
+ for c in content_list:
367
+ content_type = c.get("type")
368
+ if content_type == "text":
369
+ content_md.append(c.get("text"))
370
+ elif content_type == "equation":
371
+ content = c.get("latex")
372
+ if content.startswith("$$") and content.endswith("$$"):
373
+ content_md.append(content)
374
+ else:
375
+ content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
376
+ elif content_type in UNI_FORMAT_TEXT_TYPE:
377
+ content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
378
+ elif content_type == "image":
379
+ content_md.append(f"![]({c.get('img_path')})")
380
+ return "\n\n".join(content_md)
381
+
382
+ def mk_nlp_markdown(content_list):
383
+ """
384
+ 基于同一格式的内容列表,构造markdown,不含图片
385
+ """
386
+ content_md = []
387
+ for c in content_list:
388
+ content_type = c.get("type")
389
+ if content_type == "text":
390
+ content_md.append(c.get("text"))
391
+ elif content_type == "equation":
392
+ content_md.append(f"$$\n{c.get('latex')}\n$$")
393
+ elif content_type == "table":
394
+ content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
395
+ elif content_type in UNI_FORMAT_TEXT_TYPE:
396
+ content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
397
+ return "\n\n".join(content_md)
magic_pdf/dict2md/ocr_mkcontent.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from loguru import logger
2
+
3
+ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
4
+ from magic_pdf.libs.commons import join_path
5
+ from magic_pdf.libs.language import detect_lang
6
+ from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
7
+ from magic_pdf.libs.ocr_content_type import ContentType, BlockType
8
+ import wordninja
9
+ import re
10
+
11
+
12
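+ # OCR sometimes glues words together; re-segment alphanumeric runs longer than 15 chars using wordninja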
+ def split_long_words(text):
13
+ segments = text.split(' ')
14
+ for i in range(len(segments)):
15
+ words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
16
+ for j in range(len(words)):
17
+ if len(words[j]) > 15:
18
+ words[j] = ' '.join(wordninja.split(words[j]))
19
+ segments[i] = ''.join(words)
20
+ return ' '.join(segments)
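A quick illustration of split_long_words; the exact segmentation depends on wordninja's word-frequency model, so the output shown is indicative only:

    from magic_pdf.dict2md.ocr_mkcontent import split_long_words

    # the 26-character first token exceeds the 15-character limit and is re-segmented
    print(split_long_words("Introductiontodeeplearning is short"))
    # e.g. -> "Introduction to deep learning is short"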
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
    markdown = []
    for page_info in pdf_info_list:
        paras_of_layout = page_info.get("para_blocks")
        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
        markdown.extend(page_markdown)
    return '\n\n'.join(markdown)


def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
    markdown = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get("para_blocks")
        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
        markdown.extend(page_markdown)
    return '\n\n'.join(markdown)


def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
    markdown_with_para_and_pagination = []
    # enumerate keeps page_no aligned with the real page index even when a page without paragraphs is skipped
    for page_no, page_info in enumerate(pdf_info_dict):
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md_content': '\n\n'.join(page_markdown)
        })
    return markdown_with_para_and_pagination


def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
    page_markdown = []
    for paras in paras_of_layout:
        for para in paras:
            para_text = ''
            for line in para:
                for span in line['spans']:
                    span_type = span.get('type')
                    content = ''
                    language = ''
                    if span_type == ContentType.Text:
                        content = span['content']
                        language = detect_lang(content)
                        if language == 'en':  # split long words only for English; word-splitting Chinese text would lose characters
                            content = ocr_escape_special_markdown_char(split_long_words(content))
                        else:
                            content = ocr_escape_special_markdown_char(content)
                    elif span_type == ContentType.InlineEquation:
                        content = f"${span['content']}$"
                    elif span_type == ContentType.InterlineEquation:
                        content = f"\n$$\n{span['content']}\n$$\n"
                    elif span_type in [ContentType.Image, ContentType.Table]:
                        if mode == 'mm':
                            content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                        elif mode == 'nlp':
                            pass
                    if content != '':
                        if language == 'en':  # English needs a space between adjacent pieces of content
                            para_text += content + ' '
                        else:  # Chinese does not
                            para_text += content
            if para_text.strip() == '':
                continue
            else:
                page_markdown.append(para_text.strip() + ' ')
    return page_markdown


def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
    page_markdown = []
    for para_block in paras_of_layout:
        para_text = ''
        para_type = para_block['type']
        if para_type == BlockType.Text:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            para_text = f"# {merge_para_with_text(para_block)}"
        elif para_type == BlockType.InterlineEquation:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Image:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                for block in para_block['blocks']:  # 1st: append the image body
                    if block['type'] == BlockType.ImageBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Image:
                                    para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
                for block in para_block['blocks']:  # 2nd: append the image caption
                    if block['type'] == BlockType.ImageCaption:
                        para_text += merge_para_with_text(block)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                for block in para_block['blocks']:  # 1st: append the table caption
                    if block['type'] == BlockType.TableCaption:
                        para_text += merge_para_with_text(block)
                for block in para_block['blocks']:  # 2nd: append the table body
                    if block['type'] == BlockType.TableBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Table:
                                    para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
                for block in para_block['blocks']:  # 3rd: append the table footnote
                    if block['type'] == BlockType.TableFootnote:
                        para_text += merge_para_with_text(block)

        if para_text.strip() == '':
            continue
        else:
            page_markdown.append(para_text.strip() + ' ')

    return page_markdown


def merge_para_with_text(para_block):
    para_text = ''
    for line in para_block['lines']:
        line_text = ""
        line_lang = ""
        for span in line['spans']:
            span_type = span['type']
            if span_type == ContentType.Text:
                line_text += span['content'].strip()
        if line_text != "":
            line_lang = detect_lang(line_text)
        for span in line['spans']:
            span_type = span['type']
            content = ''
            if span_type == ContentType.Text:
                content = span['content']
                language = detect_lang(content)
                if language == 'en':  # split long words only for English; word-splitting Chinese text would lose characters
                    content = ocr_escape_special_markdown_char(split_long_words(content))
                else:
                    content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
                content = f"${span['content']}$"
            elif span_type == ContentType.InterlineEquation:
                content = f"\n$$\n{span['content']}\n$$\n"

            if content != '':
                if 'zh' in line_lang:  # some documents carry one character per span; language detection on a single character is unreliable, so judge by the whole line's text
                    para_text += content  # Chinese needs no space between adjacent pieces of content
                else:
                    para_text += content + ' '  # English needs a space between adjacent pieces of content
    return para_text
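Why the line-level language check matters: some OCR outputs emit one character per span, and single-character language detection is unreliable. A hypothetical example, assuming detect_lang reports a zh variant for the full line:

    from magic_pdf.libs.ocr_content_type import ContentType
    from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text

    para_block = {"lines": [{"spans": [
        {"type": ContentType.Text, "content": ch} for ch in "深度学习"
    ]}]}
    print(merge_para_with_text(para_block))  # -> "深度学习", joined without spaces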
def para_to_standard_format(para, img_buket_path):
    para_content = {}
    if len(para) == 1:
        para_content = line_to_standard_format(para[0], img_buket_path)
    elif len(para) > 1:
        para_text = ''
        inline_equation_num = 0
        for line in para:
            for span in line['spans']:
                language = ''
                span_type = span.get('type')
                content = ""
                if span_type == ContentType.Text:
                    content = span['content']
                    language = detect_lang(content)
                    if language == 'en':  # split long words only for English; word-splitting Chinese text would lose characters
                        content = ocr_escape_special_markdown_char(split_long_words(content))
                    else:
                        content = ocr_escape_special_markdown_char(content)
                elif span_type == ContentType.InlineEquation:
                    content = f"${span['content']}$"
                    inline_equation_num += 1

                if language == 'en':  # English needs a space between adjacent pieces of content
                    para_text += content + ' '
                else:  # Chinese does not
                    para_text += content
        para_content = {
            'type': 'text',
            'text': para_text,
            'inline_equation_num': inline_equation_num
        }
    return para_content


def para_to_standard_format_v2(para_block, img_buket_path):
    para_content = {}  # fallback for block types not handled below
    para_type = para_block['type']
    if para_type == BlockType.Text:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.Title:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
            'text_level': 1
        }
    elif para_type == BlockType.InterlineEquation:
        para_content = {
            'type': 'equation',
            'text': merge_para_with_text(para_block),
            'text_format': "latex"
        }
    elif para_type == BlockType.Image:
        para_content = {
            'type': 'image',
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.ImageBody:
                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
            if block['type'] == BlockType.ImageCaption:
                para_content['img_caption'] = merge_para_with_text(block)
    elif para_type == BlockType.Table:
        para_content = {
            'type': 'table',
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.TableBody:
                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
            if block['type'] == BlockType.TableCaption:
                para_content['table_caption'] = merge_para_with_text(block)
            if block['type'] == BlockType.TableFootnote:
                para_content['table_footnote'] = merge_para_with_text(block)

    return para_content


def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    content_list = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        for para_block in paras_of_layout:
            para_content = para_to_standard_format_v2(para_block, img_buket_path)
            content_list.append(para_content)
    return content_list


def line_to_standard_format(line, img_buket_path=""):
    # img_buket_path defaults to "" so that ocr_mk_mm_standard_format, which carries no bucket path, can call this too
    line_text = ""
    inline_equation_num = 0
    for span in line['spans']:
        if not span.get('content'):
            if not span.get('image_path'):
                continue
            else:
                if span['type'] == ContentType.Image:
                    content = {
                        'type': 'image',
                        'img_path': join_path(img_buket_path, span['image_path'])
                    }
                    return content
                elif span['type'] == ContentType.Table:
                    content = {
                        'type': 'table',
                        'img_path': join_path(img_buket_path, span['image_path'])
                    }
                    return content
        else:
            if span['type'] == ContentType.InterlineEquation:
                interline_equation = span['content']
                content = {
                    'type': 'equation',
                    'latex': f"$$\n{interline_equation}\n$$"
                }
                return content
            elif span['type'] == ContentType.InlineEquation:
                inline_equation = span['content']
                line_text += f"${inline_equation}$"
                inline_equation_num += 1
            elif span['type'] == ContentType.Text:
                text_content = ocr_escape_special_markdown_char(span['content'])  # escape special markdown characters
                line_text += text_content
    content = {
        'type': 'text',
        'text': line_text,
        'inline_equation_num': inline_equation_num
    }
    return content


def ocr_mk_mm_standard_format(pdf_info_dict: list):
    """
    content_list
    type        string  image/text/table/equation (interline equations are standalone items; inline equations stay merged into the text)
    latex       string  latex source of the content
    text        string  plain-text content
    md          string  markdown-formatted content
    img_path    string  s3://full/path/to/img.jpg
    """
    content_list = []
    for page_info in pdf_info_dict:
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                content = line_to_standard_format(line)
                content_list.append(content)
    return content_list


def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""):
    output_content = []
    for page_info in pdf_info_dict:
        if page_info.get("need_drop", False):
            drop_reason = page_info.get("drop_reason")
            if drop_mode == DropMode.NONE:
                pass
            elif drop_mode == DropMode.WHOLE_PDF:
                raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}")
            elif drop_mode == DropMode.SINGLE_PAGE:
                logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}")
                continue
            else:
                raise Exception("drop_mode can not be null")

        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        if make_mode == MakeMode.MM_MD:
            page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
            output_content.extend(page_markdown)
        elif make_mode == MakeMode.NLP_MD:
            page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
            output_content.extend(page_markdown)
        elif make_mode == MakeMode.STANDARD_FORMAT:
            for para_block in paras_of_layout:
                para_content = para_to_standard_format_v2(para_block, img_buket_path)
                output_content.append(para_content)
    if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
        return '\n\n'.join(output_content)
    elif make_mode == MakeMode.STANDARD_FORMAT:
        return output_content
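A minimal driving sketch for union_make; the two-page pdf_info_dict below is a hand-built skeleton following how the functions read it, not real pipeline output:

    from magic_pdf.dict2md.ocr_mkcontent import union_make
    from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
    from magic_pdf.libs.ocr_content_type import BlockType, ContentType

    kept_page = {"para_blocks": [{
        "type": BlockType.Text,
        "lines": [{"spans": [{"type": ContentType.Text, "content": "Hello MinerU"}]}],
    }]}
    dropped_page = {"need_drop": True, "drop_reason": "demo", "para_blocks": []}
    md = union_make([kept_page, dropped_page], MakeMode.NLP_MD, DropMode.SINGLE_PAGE)
    print(md)  # the dropped page is skipped with a warning; only "Hello MinerU" remains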
magic_pdf/filter/__init__.py ADDED
File without changes
magic_pdf/filter/pdf_classify_by_type.py ADDED
@@ -0,0 +1,393 @@
"""
Classify whether a pdf is text-based, using the results from meta_scan.
Working definitions:
1. A pdf is text-based if it satisfies any of the following:
   1. sample N pages at random; some page carries more than 100 characters of text
   2. at least one page contains zero images
2. A pdf is scanned if it satisfies any of the following:
   1. ~~the largest image on 80% of the pages has the same size and covers more than 0.6 of the page area~~
   2. most pages carry text of identical length.

"""
import json
import sys
from collections import Counter

import click
import numpy as np
from loguru import logger

from magic_pdf.libs.commons import mymax, get_top_percent_list
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min

TEXT_LEN_THRESHOLD = 100
AVG_TEXT_LEN_THRESHOLD = 100
TEXT_LEN_SAMPLE_RATIO = 0.1  # sample 10% of the pages for the text-length statistics


# A stitching scheme: reassemble the split image strips of certain scanned pdfs into one whole-page image
def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
    # first use a set to drop images whose bboxes coincide
    image_list_result = []
    for page_images in image_list:
        page_result = []
        dedup = set()
        for img in page_images:
            x0, y0, x1, y1, img_bojid = img
            if (x0, y0, x1, y1) in dedup:  # duplicate bboxes occur here and add nothing, so drop them
                continue
            else:
                dedup.add((x0, y0, x1, y1))
                page_result.append([x0, y0, x1, y1, img_bojid])
        image_list_result.append(page_result)

    # next, merge the stitchable images on each page
    merged_images = []
    for page_images in image_list_result:
        if not page_images:
            continue

        # sort the page's images top-to-bottom, left-to-right
        page_images.sort(key=lambda img: (img[1], img[0]))

        merged = [page_images[0]]

        for img in page_images[1:]:
            x0, y0, x1, y1, imgid = img

            last_img = merged[-1]
            last_x0, last_y0, last_x1, last_y1, last_imgid = last_img

            # a precondition for stitching: the image covers at least 90% of the page width or height
            full_width = abs(x1 - x0) >= page_width * 0.9
            full_height = abs(y1 - y0) >= page_height * 0.9

            close1 = close2 = False  # reset per image so stale values from earlier iterations cannot leak in

            # if the width qualifies, check whether the images can be stitched vertically
            if full_width:
                # vertical stitching needs two things: the left and right edges may each drift by at most max_offset, and the gap between the first image's bottom edge and the second image's top edge may be at most max_gap
                close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
                        last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)

            # if the height qualifies, check whether the images can be stitched horizontally
            if full_height:
                # horizontal stitching needs two things: the top and bottom edges may each drift by at most max_offset, and the gap between the first image's right edge and the second image's left edge may be at most max_gap
                close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
                        last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)

            # Check if the image can be merged with the last image
            if (full_width and close1) or (full_height and close2):
                # Merge the image with the last image
                merged[-1] = [min(x0, last_x0), min(y0, last_y0),
                              max(x1, last_x1), max(y1, last_y1), imgid]
            else:
                # Add the image as a new image
                merged.append(img)

        merged_images.append(merged)

    return merged_images
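A toy run of merge_images, stitching a page stored as two full-width strips; the coordinates are made up for illustration:

    from magic_pdf.filter.pdf_classify_by_type import merge_images

    page_w, page_h = 600, 800
    strips = [[
        [0, 0, 600, 400, 11],    # top strip, full width
        [0, 401, 600, 800, 12],  # bottom strip, 1pt below the first one's bottom edge
    ]]
    print(merge_images(strips, page_w, page_h))
    # -> [[[0, 0, 600, 800, 12]]]: one merged full-page bbox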
def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text_len_list: list):
    """
    Return False when the largest image on at least half of the pages has the same size and covers more than half of the page area; otherwise return True.
    :param total_page:
    :param page_width:
    :param page_height:
    :param img_sz_list:
    :return:
    """
    # # A pdf is text-based as soon as one page has no images, provided that page also carries no text. Some scanned pdfs contain blank pages with neither images nor text.
    # if any([len(img_sz) == 0 for img_sz in img_sz_list]):  # some page carries no image
    #     # find the indices of those pages
    #     empty_page_index = [i for i, img_sz in enumerate(img_sz_list) if len(img_sz) == 0]
    #     # then check whether those pages carry text
    #     text_len_at_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in empty_page_index and text_len > 0]
    #     if len(text_len_at_page_idx) > TEXT_LEN_THRESHOLD:  # no images but some text suggests a text pdf; with no text we cannot decide here and defer to the next step; we now require the page's text volume to exceed a threshold
    #         return True

    # drop, by objid, images that recur more than the threshold; they are hidden transparent layers, recognizable because they all share one id
    # first count the occurrences of each id
    objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
    # then drop the ones that recur too often
    if total_page >= scan_max_page:  # the new meta_scan only scans the first scan_max_page pages; when the pdf is longer, treat total_page as scan_max_page
        total_page = scan_max_page

    repeat_threshold = 2  # set the bad-image threshold to 2
    # repeat_threshold = min(2, total_page)  # when total_page is 1, repeat_threshold becomes 1 and every img is misjudged as a bad_img
    bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
    # bad_image_page_idx = [i for i, page_img_sz in enumerate(img_sz_list) if any([objid in bad_image_objid for _, _, _, _, objid in page_img_sz])]
    # text_len_at_bad_image_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in bad_image_page_idx and text_len > 0]

    # special case: a text pdf that overlays a huge transparent image on every page, "huge" meaning more than 90% of the page area
    # fake_image_ids = [objid for objid in bad_image_objid if
    #                   any([abs((x1 - x0) * (y1 - y0) / page_width * page_height) > 0.9 for images in img_sz_list for
    #                        x0, y0, x1, y1, _ in images])]  # earlier code; the any() was always true, reason unclear
    # fake_image_ids = [objid for objid in bad_image_objid for images in img_sz_list for x0, y0, x1, y1, img_id in images
    #                   if img_id == objid and abs((x1 - x0) * (y1 - y0)) / (page_width * page_height) > 0.9]

    # if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]):  # the pages holding these transparent images carry text above the threshold
    #     return True

    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
                   img_sz_list]  # filter out the recurring images

    # some scanned pdfs split one page into many strips; stitch them back together before measuring
    img_sz_list = merge_images(img_sz_list, page_width, page_height)

    # compute the largest image area on each page, then its ratio to the page area
    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
                               img_sz_list]
    page_area = page_width * page_height
    max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
    max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]

    if len(max_image_area_per_page) >= 0.5 * total_page:  # threshold lowered from 0.8 to 0.5 to cover the 2-of-3-page and 1-of-2-page cases
        # this holds only because the recurring images were removed above; those are hidden transparent layers that all share one id
        return False
    else:
        return True


def classify_by_text_len(text_len_list: list, total_page: int):
    """
    Sample 10% of the pages at random, or all of them if that is fewer than 5 pages.
    If any sampled page carries more than TEXT_LEN_THRESHOLD characters, the pdf is text-based.
    :param total_page:
    :param text_len_list:
    :return:
    """
    select_page_cnt = int(total_page * TEXT_LEN_SAMPLE_RATIO)  # sample 10% of the pages
    if select_page_cnt < 5:
        select_page_cnt = total_page

    # # skip the first and last 10 pages
    # if total_page > 20:  # if there are more than 20 pages
    #     page_range = list(range(10, total_page - 10))  # from page 11 to the 11th-from-last
    # else:
    #     page_range = list(range(total_page))  # otherwise use all pages
    # page_num = np.random.choice(page_range, min(select_page_cnt, len(page_range)), replace=False)
    # skipping 10 pages at both ends is awkward for pdfs with only 21 or 22 pages: if the one or two sampled middle pages happen to hold no text we misclassify; with the avg_words rule in place this rule can be ignored
    page_num = np.random.choice(total_page, select_page_cnt, replace=False)
    text_len_lst = [text_len_list[i] for i in page_num]
    is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
    return is_text_pdf


def classify_by_avg_words(text_len_list: list):
    """
    Supplementary rule: if the average per-page character count is below AVG_TEXT_LEN_THRESHOLD, the pdf is not text-based.
    This mainly catches image collections.
    :param text_len_list:
    :return:
    """
    sum_words = sum(text_len_list)
    count_of_numbers = len(text_len_list)
    if count_of_numbers == 0:
        is_text_pdf = False
    else:
        avg_words = round(sum_words / count_of_numbers)
        if avg_words > AVG_TEXT_LEN_THRESHOLD:
            is_text_pdf = True
        else:
            is_text_pdf = False

    return is_text_pdf


def classify_by_img_num(img_sz_list: list, img_num_list: list):
    """
    Supplementary rule: some scanned pdfs embed every scanned page on every page; meta_scan deduplicates them,
    so their fingerprint is an img_sz_list made entirely of empty entries while every page's count in img_num_list is large and identical.
    :param img_sz_list:
    :param img_num_list:
    :return:
    """
    # count the non-empty entries in img_sz_list
    count_img_sz_list_not_none = sum(1 for item in img_sz_list if item)
    # take the top 80% of the entries
    top_eighty_percent = get_top_percent_list(img_num_list, 0.8)
    # at most one non-empty entry in img_sz_list, the top 80% of the counts all equal, and the maximum at least junk_limit_min
    if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:

        # use max and min to check whether every value in the list is equal
        # min_imgs = min(img_num_list)
        # max_imgs = max(img_num_list)
        #
        # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
        return False  # if this condition holds, the pdf is definitely not text-based
    else:
        return True  # otherwise it may be text-based; let the other rules decide


def classify_by_text_layout(text_layout_per_page: list):
    """
    Judge whether the text layout is predominantly vertical.

    Args:
        text_layout_per_page (list): text layout per page; 'vertical' means vertical text, 'horizontal' means horizontal text.

    Returns:
        bool: False if the layout is predominantly vertical, True otherwise.
    """
    # count the vertical pages in text_layout_per_page
    count_vertical = sum(1 for item in text_layout_per_page if item == 'vertical')
    # count the horizontal pages in text_layout_per_page
    count_horizontal = sum(1 for item in text_layout_per_page if item == 'horizontal')
    # compute the share of vertical pages
    known_layout_cnt = count_vertical + count_horizontal
    if known_layout_cnt != 0:
        ratio = count_vertical / known_layout_cnt
        if ratio >= 0.5:  # threshold set to 0.5 to cover the 2-of-3-page and 1-of-2-page cases
            return False  # predominantly vertical layout: treat as not text-based
        else:
            return True  # predominantly horizontal layout: treat as text-based
    else:
        return False  # unknown layout: default to not text-based


def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
    """
    Judge whether pages are composed of narrow strips, using two conditions:
    1. an image's width or height reaches 90% of the page's, and its long side is several times its short side
    2. at least 80% of the images on the page satisfy condition 1

    Args:
        page_width (float): page width
        page_height (float): page height
        img_sz_list (list): image rectangles per page; each item is a tuple (x0, y0, x1, y1, size), where (x0, y0) is the top-left corner, (x1, y1) the bottom-right corner, and size the image size

    Returns:
        bool: True if fewer than half of the pages qualify, False otherwise
    """

    def is_narrow_strip(img):
        x0, y0, x1, y1, _ = img
        width, height = x1 - x0, y1 - y0
        return any([
            # the image is at least 90% of the page width and at least 4x as wide as it is tall
            width >= page_width * 0.9 and width >= height * 4,
            # the image is at least 90% of the page height and at least 4x as tall as it is wide
            height >= page_height * 0.9 and height >= width * 4,
        ])

    # number of qualifying pages
    narrow_strip_pages_count = 0

    # iterate over all pages
    for page_img_list in img_sz_list:
        # skip empty pages
        if not page_img_list:
            continue

        # total number of images on the page
        total_images = len(page_img_list)

        # number of narrow-strip images on the page
        narrow_strip_images_count = 0
        for img in page_img_list:
            if is_narrow_strip(img):
                narrow_strip_images_count += 1
        # skip pages with fewer than 5 narrow-strip images
        if narrow_strip_images_count < 5:
            continue
        else:
            # if at least 80% of the images are narrow strips, count the page as qualifying
            if narrow_strip_images_count / total_images >= 0.8:
                narrow_strip_pages_count += 1

    # share of qualifying pages
    narrow_strip_pages_ratio = narrow_strip_pages_count / len(img_sz_list)

    return narrow_strip_pages_ratio < 0.5


def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
             text_layout_list: list, invalid_chars: bool):
    """
    Image and page dimensions here are in pts.
    :param total_page:
    :param page_width:
    :param page_height:
    :param img_sz_list:
    :param text_len_list:
    :param img_num_list:
    :param text_layout_list:
    :param invalid_chars:
    :return:
    """
    results = {
        'by_image_area': classify_by_area(total_page, page_width, page_height, img_sz_list, text_len_list),
        'by_text_len': classify_by_text_len(text_len_list, total_page),
        'by_avg_words': classify_by_avg_words(text_len_list),
        'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
        'by_text_layout': classify_by_text_layout(text_layout_list),
        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
        'by_invalid_chars': invalid_chars,
    }

    if all(results.values()):
        return True, results
    elif not any(results.values()):
        return False, results
    else:
        # loguru already writes to stderr, so the old print-style file=sys.stderr argument is dropped here;
        # this branch makes it easy to spot the unusual pdfs and tune the classifier for them
        logger.warning(
            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
            f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
            f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
            f" by_invalid_chars: {results['by_invalid_chars']}")
        return False, results


@click.command()
@click.option("--json-file", type=str, help="pdf info")
def main(json_file):
    if json_file is None:
        print("json_file is None", file=sys.stderr)
        exit(0)
    try:
        with open(json_file, "r") as f:
            for l in f:
                if l.strip() == "":
                    continue
                o = json.loads(l)
                total_page = o["total_page"]
                page_width = o["page_width_pts"]
                page_height = o["page_height_pts"]
                img_sz_list = o["image_info_per_page"]
                text_len_list = o['text_len_per_page']
                text_layout_list = o['text_layout_per_page']
                pdf_path = o['pdf_path']
                is_encrypted = o['is_encrypted']
                is_needs_password = o['is_needs_password']
                if is_encrypted or total_page == 0 or is_needs_password:  # skip encrypted, password-protected, and zero-page pdfs
                    continue
                # NOTE: this CLI call predates the current classify() signature, which also expects
                # img_num_list and invalid_chars and returns a (bool, results) tuple
                tag = classify(total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list)
                o['is_text_pdf'] = tag
                print(json.dumps(o, ensure_ascii=False))
    except Exception as e:
        print("ERROR: ", e, file=sys.stderr)


if __name__ == "__main__":
    main()
    # false = False
    # true = True
    # null = None
+ # o = {"pdf_path":"s3://llm-raw-snew/llm-raw-the-eye/raw/World%20Tracker%20Library/worldtracker.org/media/library/Science/Computer%20Science/Shreiner%20-%20OpenGL%20Programming%20Guide%206e%20%5BThe%20Redbook%5D%20%28AW%2C%202008%29.pdf","is_needs_password":false,"is_encrypted":false,"total_page":978,"page_width_pts":368,"page_height_pts":513,"image_info_per_page":[[[0,0,368,513,10037]],[[0,0,368,513,4]],[[0,0,368,513,7]],[[0,0,368,513,10]],[[0,0,368,513,13]],[[0,0,368,513,16]],[[0,0,368,513,19]],[[0,0,368,513,22]],[[0,0,368,513,25]],[[0,0,368,513,28]],[[0,0,368,513,31]],[[0,0,368,513,34]],[[0,0,368,513,37]],[[0,0,368,513,40]],[[0,0,368,513,43]],[[0,0,368,513,46]],[[0,0,368,513,49]],[[0,0,368,513,52]],[[0,0,368,513,55]],[[0,0,368,513,58]],[[0,0,368,513,61]],[[0,0,368,513,64]],[[0,0,368,513,67]],[[0,0,368,513,70]],[[0,0,368,513,73]],[[0,0,368,516,76]],[[0,0,368,516,79]],[[0,0,368,513,82]],[[0,0,368,513,85]],[[0,0,368,513,88]],[[0,0,368,513,91]],[[0,0,368,513,94]],[[0,0,368,513,97]],[[0,0,368,513,100]],[[0,0,368,513,103]],[[0,0,368,513,106]],[[0,0,368,513,109]],[[0,0,368,513,112]],[[0,0,368,513,115]],[[0,0,368,513,118]],[[0,0,368,513,121]],[[0,0,368,513,124]],[[0,0,368,513,127]],[[0,0,368,513,130]],[[0,0,368,513,133]],[[0,0,368,513,136]],[[0,0,368,513,139]],[[0,0,368,513,142]],[[0,0,368,513,145]],[[0,0,368,513,148]],[[0,0,368,513,151]],[[0,0,368,513,154]],[[0,0,368,513,157]],[[0,0,368,513,160]],[[0,0,368,513,163]],[[0,0,368,513,166]],[[0,0,368,513,169]],[[0,0,368,513,172]],[[0,0,368,513,175]],[[0,0,368,513,178]],[[0,0,368,513,181]],[[0,0,368,513,184]],[[0,0,368,513,187]],[[0,0,368,513,190]],[[0,0,368,513,193]],[[0,0,368,513,196]],[[0,0,368,513,199]],[[0,0,368,513,202]],[[0,0,368,513,205]],[[0,0,368,513,208]],[[0,0,368,513,211]],[[0,0,368,513,214]],[[0,0,368,513,217]],[[0,0,368,513,220]],[[0,0,368,513,223]],[[0,0,368,513,226]],[[0,0,368,513,229]],[[0,0,368,513,232]],[[0,0,368,513,235]],[[0,0,368,513,238]],[[0,0,368,513,241]],[[0,0,368,513,244]],[[0,0,368,513,247]],[[0,0,368,513,250]],[[0,0,368,513,253]],[[0,0,368,513,256]],[[0,0,368,513,259]],[[0,0,368,513,262]],[[0,0,368,513,265]],[[0,0,368,513,268]],[[0,0,368,513,271]],[[0,0,368,513,274]],[[0,0,368,513,277]],[[0,0,368,513,280]],[[0,0,368,513,283]],[[0,0,368,513,286]],[[0,0,368,513,289]],[[0,0,368,513,292]],[[0,0,368,513,295]],[[0,0,368,513,298]],[[0,0,368,513,301]],[[0,0,368,513,304]],[[0,0,368,513,307]],[[0,0,368,513,310]],[[0,0,368,513,313]],[[0,0,368,513,316]],[[0,0,368,513,319]],[[0,0,368,513,322]],[[0,0,368,513,325]],[[0,0,368,513,328]],[[0,0,368,513,331]],[[0,0,368,513,334]],[[0,0,368,513,337]],[[0,0,368,513,340]],[[0,0,368,513,343]],[[0,0,368,513,346]],[[0,0,368,513,349]],[[0,0,368,513,352]],[[0,0,368,513,355]],[[0,0,368,513,358]],[[0,0,368,513,361]],[[0,0,368,513,364]],[[0,0,368,513,367]],[[0,0,368,513,370]],[[0,0,368,513,373]],[[0,0,368,513,376]],[[0,0,368,513,379]],[[0,0,368,513,382]],[[0,0,368,513,385]],[[0,0,368,513,388]],[[0,0,368,513,391]],[[0,0,368,513,394]],[[0,0,368,513,397]],[[0,0,368,513,400]],[[0,0,368,513,403]],[[0,0,368,513,406]],[[0,0,368,513,409]],[[0,0,368,513,412]],[[0,0,368,513,415]],[[0,0,368,513,418]],[[0,0,368,513,421]],[[0,0,368,513,424]],[[0,0,368,513,427]],[[0,0,368,513,430]],[[0,0,368,513,433]],[[0,0,368,513,436]],[[0,0,368,513,439]],[[0,0,368,513,442]],[[0,0,368,513,445]],[[0,0,368,513,448]],[[0,0,368,513,451]],[[0,0,368,513,454]],[[0,0,368,513,457]],[[0,0,368,513,460]],[[0,0,368,513,463]],[[0,0,368,513,466]],[[0,0,368,513,469]],[[0,0,368,513,472]],[[0,0,368,513,475]],[[0,0,368,513,478]],[[0,0,368,513,481
]],[[0,0,368,513,484]],[[0,0,368,513,487]],[[0,0,368,513,490]],[[0,0,368,513,493]],[[0,0,368,513,496]],[[0,0,368,513,499]],[[0,0,368,513,502]],[[0,0,368,513,505]],[[0,0,368,513,508]],[[0,0,368,513,511]],[[0,0,368,513,514]],[[0,0,368,513,517]],[[0,0,368,513,520]],[[0,0,368,513,523]],[[0,0,368,513,526]],[[0,0,368,513,529]],[[0,0,368,513,532]],[[0,0,368,513,535]],[[0,0,368,513,538]],[[0,0,368,513,541]],[[0,0,368,513,544]],[[0,0,368,513,547]],[[0,0,368,513,550]],[[0,0,368,513,553]],[[0,0,368,513,556]],[[0,0,368,513,559]],[[0,0,368,513,562]],[[0,0,368,513,565]],[[0,0,368,513,568]],[[0,0,368,513,571]],[[0,0,368,513,574]],[[0,0,368,513,577]],[[0,0,368,513,580]],[[0,0,368,513,583]],[[0,0,368,513,586]],[[0,0,368,513,589]],[[0,0,368,513,592]],[[0,0,368,513,595]],[[0,0,368,513,598]],[[0,0,368,513,601]],[[0,0,368,513,604]],[[0,0,368,513,607]],[[0,0,368,513,610]],[[0,0,368,513,613]],[[0,0,368,513,616]],[[0,0,368,513,619]],[[0,0,368,513,622]],[[0,0,368,513,625]],[[0,0,368,513,628]],[[0,0,368,513,631]],[[0,0,368,513,634]],[[0,0,368,513,637]],[[0,0,368,513,640]],[[0,0,368,513,643]],[[0,0,368,513,646]],[[0,0,368,513,649]],[[0,0,368,513,652]],[[0,0,368,513,655]],[[0,0,368,513,658]],[[0,0,368,513,661]],[[0,0,368,513,664]],[[0,0,368,513,667]],[[0,0,368,513,670]],[[0,0,368,513,673]],[[0,0,368,513,676]],[[0,0,368,513,679]],[[0,0,368,513,682]],[[0,0,368,513,685]],[[0,0,368,513,688]],[[0,0,368,513,691]],[[0,0,368,513,694]],[[0,0,368,513,697]],[[0,0,368,513,700]],[[0,0,368,513,703]],[[0,0,368,513,706]],[[0,0,368,513,709]],[[0,0,368,513,712]],[[0,0,368,513,715]],[[0,0,368,513,718]],[[0,0,368,513,721]],[[0,0,368,513,724]],[[0,0,368,513,727]],[[0,0,368,513,730]],[[0,0,368,513,733]],[[0,0,368,513,736]],[[0,0,368,513,739]],[[0,0,368,513,742]],[[0,0,368,513,745]],[[0,0,368,513,748]],[[0,0,368,513,751]],[[0,0,368,513,754]],[[0,0,368,513,757]],[[0,0,368,513,760]],[[0,0,368,513,763]],[[0,0,368,513,766]],[[0,0,368,513,769]],[[0,0,368,513,772]],[[0,0,368,513,775]],[[0,0,368,513,778]],[[0,0,368,513,781]],[[0,0,368,513,784]],[[0,0,368,513,787]],[[0,0,368,513,790]],[[0,0,368,513,793]],[[0,0,368,513,796]],[[0,0,368,513,799]],[[0,0,368,513,802]],[[0,0,368,513,805]],[[0,0,368,513,808]],[[0,0,368,513,811]],[[0,0,368,513,814]],[[0,0,368,513,817]],[[0,0,368,513,820]],[[0,0,368,513,823]],[[0,0,368,513,826]],[[0,0,368,513,829]],[[0,0,368,513,832]],[[0,0,368,513,835]],[[0,0,368,513,838]],[[0,0,368,513,841]],[[0,0,368,513,844]],[[0,0,368,513,847]],[[0,0,368,513,850]],[[0,0,368,513,853]],[[0,0,368,513,856]],[[0,0,368,513,859]],[[0,0,368,513,862]],[[0,0,368,513,865]],[[0,0,368,513,868]],[[0,0,368,513,871]],[[0,0,368,513,874]],[[0,0,368,513,877]],[[0,0,368,513,880]],[[0,0,368,513,883]],[[0,0,368,513,886]],[[0,0,368,513,889]],[[0,0,368,513,892]],[[0,0,368,513,895]],[[0,0,368,513,898]],[[0,0,368,513,901]],[[0,0,368,513,904]],[[0,0,368,513,907]],[[0,0,368,513,910]],[[0,0,368,513,913]],[[0,0,368,513,916]],[[0,0,368,513,919]],[[0,0,368,513,922]],[[0,0,368,513,925]],[[0,0,368,513,928]],[[0,0,368,513,931]],[[0,0,368,513,934]],[[0,0,368,513,937]],[[0,0,368,513,940]],[[0,0,368,513,943]],[[0,0,368,513,946]],[[0,0,368,513,949]],[[0,0,368,513,952]],[[0,0,368,513,955]],[[0,0,368,513,958]],[[0,0,368,513,961]],[[0,0,368,513,964]],[[0,0,368,513,967]],[[0,0,368,513,970]],[[0,0,368,513,973]],[[0,0,368,513,976]],[[0,0,368,513,979]],[[0,0,368,513,982]],[[0,0,368,513,985]],[[0,0,368,513,988]],[[0,0,368,513,991]],[[0,0,368,513,994]],[[0,0,368,513,997]],[[0,0,368,513,1000]],[[0,0,368,513,1003]],[[0,0,368,513,1006]],[[0,0,368,513,1009]],[[0,0,368,513,1012]],[[0,0,3
68,513,1015]],[[0,0,368,513,1018]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,1293]],[[0,0,368,513,1296]],[[0,0,368,513,1299]],[[0,0,368,513,1302]],[[0,0,368,513,1305]],[[0,0,368,513,1308]],[[0,0,368,513,1311]],[[0,0,368,513,1314]],[[0,0,368,513,1317]],[[0,0,368,513,1320]],[[0,0,368,513,1323]],[[0,0,368,513,1326]],[[0,0,368,513,1329]],[[0,0,368,513,1332]],[[0,0,368,513,1335]],[[0,0,368,513,1338]],[[0,0,368,513,1341]],[[0,0,368,513,1344]],[[0,0,368,513,1347]],[[0,0,368,513,1350]],[[0,0,368,513,1353]],[[0,0,368,513,1356]],[[0,0,368,513,1359]],[[0,0,368,513,1362]],[[0,0,368,513,1365]],[[0,0,368,513,1368]],[[0,0,368,513,1371]],[[0,0,368,513,1374]],[[0,0,368,513,1377]],[[0,0,368,513,1380]],[[0,0,368,513,1383]],[[0,0,368,513
,1386]],[[0,0,368,513,1389]],[[0,0,368,513,1392]],[[0,0,368,513,1395]],[[0,0,368,513,1398]],[[0,0,368,513,1401]],[[0,0,368,513,1404]],[[0,0,368,513,1407]],[[0,0,368,513,1410]],[[0,0,368,513,1413]],[[0,0,368,513,1416]],[[0,0,368,513,1419]],[[0,0,368,513,1422]],[[0,0,368,513,1425]],[[0,0,368,513,1428]],[[0,0,368,513,1431]],[[0,0,368,513,1434]],[[0,0,368,513,1437]],[[0,0,368,513,1440]],[[0,0,368,513,1443]],[[0,0,368,513,1446]],[[0,0,368,513,1449]],[[0,0,368,513,1452]],[[0,0,368,513,1455]],[[0,0,368,513,1458]],[[0,0,368,513,1461]],[[0,0,368,513,1464]],[[0,0,368,513,1467]],[[0,0,368,513,1470]],[[0,0,368,513,1473]],[[0,0,368,513,1476]],[[0,0,368,513,1479]],[[0,0,368,513,1482]],[[0,0,368,513,1485]],[[0,0,368,513,1488]],[[0,0,368,513,1491]],[[0,0,368,513,1494]],[[0,0,368,513,1497]],[[0,0,368,513,1500]],[[0,0,368,513,1503]],[[0,0,368,513,1506]],[[0,0,368,513,1509]],[[0,0,368,513,1512]],[[0,0,368,513,1515]],[[0,0,368,513,1518]],[[0,0,368,513,1521]],[[0,0,368,513,1524]],[[0,0,368,513,1527]],[[0,0,368,513,1530]],[[0,0,368,513,1533]],[[0,0,368,513,1536]],[[0,0,368,513,1539]],[[0,0,368,513,1542]],[[0,0,368,513,1545]],[[0,0,368,513,1548]],[[0,0,368,513,1551]],[[0,0,368,513,1554]],[[0,0,368,513,1557]],[[0,0,368,513,1560]],[[0,0,368,513,1563]],[[0,0,368,513,1566]],[[0,0,368,513,1569]],[[0,0,368,513,1572]],[[0,0,368,513,1575]],[[0,0,368,513,1578]],[[0,0,368,513,1581]],[[0,0,368,513,1584]],[[0,0,368,513,1587]],[[0,0,368,513,1590]],[[0,0,368,513,1593]],[[0,0,368,513,1596]],[[0,0,368,513,1599]],[[0,0,368,513,1602]],[[0,0,368,513,1605]],[[0,0,368,513,1608]],[[0,0,368,513,1611]],[[0,0,368,513,1614]],[[0,0,368,513,1617]],[[0,0,368,513,1620]],[[0,0,368,513,1623]],[[0,0,368,513,1626]],[[0,0,368,513,1629]],[[0,0,368,513,1632]],[[0,0,368,513,1635]],[[0,0,368,513,1638]],[[0,0,368,513,1641]],[[0,0,368,513,1644]],[[0,0,368,513,1647]],[[0,0,368,513,1650]],[[0,0,368,513,1653]],[[0,0,368,513,1656]],[[0,0,368,513,1659]],[[0,0,368,513,1662]],[[0,0,368,513,1665]],[[0,0,368,513,1668]],[[0,0,368,513,1671]],[[0,0,368,513,1674]],[[0,0,368,513,1677]],[[0,0,368,513,1680]],[[0,0,368,513,1683]],[[0,0,368,513,1686]],[[0,0,368,513,1689]],[[0,0,368,513,1692]],[[0,0,368,513,1695]],[[0,0,368,513,1698]],[[0,0,368,513,1701]],[[0,0,368,513,1704]],[[0,0,368,513,1707]],[[0,0,368,513,1710]],[[0,0,368,513,1713]],[[0,0,368,513,1716]],[[0,0,368,513,1719]],[[0,0,368,513,1722]],[[0,0,368,513,1725]],[[0,0,368,513,1728]],[[0,0,368,513,1731]],[[0,0,368,513,1734]],[[0,0,368,513,1737]],[[0,0,368,513,1740]],[[0,0,368,513,1743]],[[0,0,368,513,1746]],[[0,0,368,513,1749]],[[0,0,368,513,1752]],[[0,0,368,513,1755]],[[0,0,368,513,1758]],[[0,0,368,513,1761]],[[0,0,368,513,1764]],[[0,0,368,513,1767]],[[0,0,368,513,1770]],[[0,0,368,513,1773]],[[0,0,368,513,1776]],[[0,0,368,513,1779]],[[0,0,368,513,1782]],[[0,0,368,513,1785]],[[0,0,368,513,1788]],[[0,0,368,513,1791]],[[0,0,368,513,1794]],[[0,0,368,513,1797]],[[0,0,368,513,1800]],[[0,0,368,513,1803]],[[0,0,368,513,1806]],[[0,0,368,513,1809]],[[0,0,368,513,1812]],[[0,0,368,513,1815]],[[0,0,368,513,1818]],[[0,0,368,513,1821]],[[0,0,368,513,1824]],[[0,0,368,513,1827]],[[0,0,368,513,1830]],[[0,0,368,513,1833]],[[0,0,368,513,1836]],[[0,0,368,513,1839]],[[0,0,368,513,1842]],[[0,0,368,513,1845]],[[0,0,368,513,1848]],[[0,0,368,513,1851]],[[0,0,368,513,1854]],[[0,0,368,513,1857]],[[0,0,368,513,1860]],[[0,0,368,513,1863]],[[0,0,368,513,1866]],[[0,0,368,513,1869]],[[0,0,368,513,1872]],[[0,0,368,513,1875]],[[0,0,368,513,1878]],[[0,0,368,513,1881]],[[0,0,368,513,1884]],[[0,0,368,513,1887]],[[0,0,368,513,1890]],[[0,0,368,513,1893]
],[[0,0,368,513,1896]],[[0,0,368,513,1899]],[[0,0,368,513,1902]],[[0,0,368,513,1905]],[[0,0,368,513,1908]],[[0,0,368,513,1911]],[[0,0,368,513,1914]],[[0,0,368,513,1917]],[[0,0,368,513,1920]],[[0,0,368,513,1923]],[[0,0,368,513,1926]],[[0,0,368,513,1929]],[[0,0,368,513,1932]],[[0,0,368,513,1935]],[[0,0,368,513,1938]],[[0,0,368,513,1941]],[[0,0,368,513,1944]],[[0,0,368,513,1947]],[[0,0,368,513,1950]],[[0,0,368,513,1953]],[[0,0,368,513,1956]],[[0,0,368,513,1959]],[[0,0,368,513,1962]],[[0,0,368,513,1965]],[[0,0,368,513,1968]],[[0,0,368,513,1971]],[[0,0,368,513,1974]],[[0,0,368,513,1977]],[[0,0,368,513,1980]],[[0,0,368,513,1983]],[[0,0,368,513,1986]],[[0,0,368,513,1989]],[[0,0,368,513,1992]],[[0,0,368,513,1995]],[[0,0,368,513,1998]],[[0,0,368,513,2001]],[[0,0,368,513,2004]],[[0,0,368,513,2007]],[[0,0,368,513,2010]],[[0,0,368,513,2013]],[[0,0,368,513,2016]],[[0,0,368,513,2019]],[[0,0,368,513,2022]],[[0,0,368,513,2025]],[[0,0,368,513,2028]],[[0,0,368,513,2031]],[[0,0,368,513,2034]],[[0,0,368,513,2037]],[[0,0,368,513,2040]],[[0,0,368,513,2043]],[[0,0,368,513,2046]],[[0,0,368,513,2049]],[[0,0,368,513,2052]],[[0,0,368,513,2055]],[[0,0,368,513,2058]],[[0,0,368,513,2061]],[[0,0,368,513,2064]],[[0,0,368,513,2067]],[[0,0,368,513,2070]],[[0,0,368,513,2073]],[[0,0,368,513,2076]],[[0,0,368,513,2079]],[[0,0,368,513,2082]],[[0,0,368,513,2085]],[[0,0,368,513,2088]],[[0,0,368,513,2091]],[[0,0,368,513,2094]],[[0,0,368,513,2097]],[[0,0,368,513,2100]],[[0,0,368,513,2103]],[[0,0,368,513,2106]],[[0,0,368,513,2109]],[[0,0,368,513,2112]],[[0,0,368,513,2115]],[[0,0,368,513,2118]],[[0,0,368,513,2121]],[[0,0,368,513,2124]],[[0,0,368,513,2127]],[[0,0,368,513,2130]],[[0,0,368,513,2133]],[[0,0,368,513,2136]],[[0,0,368,513,2139]],[[0,0,368,513,2142]],[[0,0,368,513,2145]],[[0,0,368,513,2148]],[[0,0,368,513,2151]],[[0,0,368,513,2154]],[[0,0,368,513,2157]],[[0,0,368,513,2160]],[[0,0,368,513,2163]],[[0,0,368,513,2166]],[[0,0,368,513,2169]],[[0,0,368,513,2172]],[[0,0,368,513,2175]],[[0,0,368,513,2178]],[[0,0,368,513,2181]],[[0,0,368,513,2184]],[[0,0,368,513,2187]],[[0,0,368,513,2190]],[[0,0,368,513,2193]],[[0,0,368,513,2196]],[[0,0,368,513,2199]],[[0,0,368,513,2202]],[[0,0,368,513,2205]],[[0,0,368,513,2208]],[[0,0,368,513,2211]],[[0,0,368,513,2214]],[[0,0,368,513,2217]],[[0,0,368,513,2220]],[[0,0,368,513,2223]],[[0,0,368,513,2226]],[[0,0,368,513,2229]],[[0,0,368,513,2232]],[[0,0,368,513,2235]],[[0,0,368,513,2238]],[[0,0,368,513,2241]],[[0,0,368,513,2244]],[[0,0,368,513,2247]],[[0,0,368,513,2250]],[[0,0,368,513,2253]],[[0,0,368,513,2256]],[[0,0,368,513,2259]],[[0,0,368,513,2262]],[[0,0,368,513,2265]],[[0,0,368,513,2268]],[[0,0,368,513,2271]],[[0,0,368,513,2274]],[[0,0,368,513,2277]],[[0,0,368,513,2280]],[[0,0,368,513,2283]],[[0,0,368,513,2286]],[[0,0,368,513,2289]],[[0,0,368,513,2292]],[[0,0,368,513,2295]],[[0,0,368,513,2298]],[[0,0,368,513,2301]],[[0,0,368,513,2304]],[[0,0,368,513,2307]],[[0,0,368,513,2310]],[[0,0,368,513,2313]],[[0,0,368,513,2316]],[[0,0,368,513,2319]],[[0,0,368,513,2322]],[[0,0,368,513,2325]],[[0,0,368,513,2328]],[[0,0,368,513,2331]],[[0,0,368,513,2334]],[[0,0,368,513,2337]],[[0,0,368,513,2340]],[[0,0,368,513,2343]],[[0,0,368,513,2346]],[[0,0,368,513,2349]],[[0,0,368,513,2352]],[[0,0,368,513,2355]],[[0,0,368,513,2358]],[[0,0,368,513,2361]],[[0,0,368,513,2364]],[[0,0,368,513,2367]],[[0,0,368,513,2370]],[[0,0,368,513,2373]],[[0,0,368,513,2376]],[[0,0,368,513,2379]],[[0,0,368,513,2382]],[[0,0,368,513,2385]],[[0,0,368,513,2388]],[[0,0,368,513,2391]],[[0,0,368,513,2394]],[[0,0,368,513,2397]],[[0,0,368,513,2400]],[[0,
0,368,513,2403]],[[0,0,368,513,2406]],[[0,0,368,513,2409]],[[0,0,368,513,2412]],[[0,0,368,513,2415]],[[0,0,368,513,2418]],[[0,0,368,513,2421]],[[0,0,368,513,2424]],[[0,0,368,513,2427]],[[0,0,368,513,2430]],[[0,0,368,513,2433]],[[0,0,368,513,2436]],[[0,0,368,513,2439]],[[0,0,368,513,2442]],[[0,0,368,513,2445]],[[0,0,368,513,2448]],[[0,0,368,513,2451]],[[0,0,368,513,2454]],[[0,0,368,513,2457]],[[0,0,368,513,2460]],[[0,0,368,513,2463]],[[0,0,368,513,2466]],[[0,0,368,513,2469]],[[0,0,368,513,2472]],[[0,0,368,513,2475]],[[0,0,368,513,2478]],[[0,0,368,513,2481]],[[0,0,368,513,2484]],[[0,0,368,513,2487]],[[0,0,368,513,2490]],[[0,0,368,513,2493]],[[0,0,368,513,2496]],[[0,0,368,513,2499]],[[0,0,368,513,2502]],[[0,0,368,513,2505]],[[0,0,368,513,2508]],[[0,0,368,513,2511]],[[0,0,368,513,2514]],[[0,0,368,513,2517]],[[0,0,368,513,2520]],[[0,0,368,513,2523]],[[0,0,368,513,2526]],[[0,0,368,513,2529]],[[0,0,368,513,2532]],[[0,0,368,513,2535]],[[0,0,368,513,2538]],[[0,0,368,513,2541]],[[0,0,368,513,2544]],[[0,0,368,513,2547]],[[0,0,368,513,2550]],[[0,0,368,513,2553]],[[0,0,368,513,2556]],[[0,0,368,513,2559]],[[0,0,368,513,2562]],[[0,0,368,513,2565]],[[0,0,368,513,2568]],[[0,0,368,513,2571]],[[0,0,368,513,2574]],[[0,0,368,513,2577]],[[0,0,368,513,2580]],[[0,0,368,513,2583]],[[0,0,368,513,2586]],[[0,0,368,513,2589]],[[0,0,368,513,2592]],[[0,0,368,513,2595]],[[0,0,368,513,2598]],[[0,0,368,513,2601]],[[0,0,368,513,2604]],[[0,0,368,513,2607]],[[0,0,368,513,2610]],[[0,0,368,513,2613]],[[0,0,368,513,2616]],[[0,0,368,513,2619]],[[0,0,368,513,2622]],[[0,0,368,513,2625]],[[0,0,368,513,2628]],[[0,0,368,513,2631]],[[0,0,368,513,2634]],[[0,0,368,513,2637]],[[0,0,368,513,2640]],[[0,0,368,513,2643]],[[0,0,368,513,2646]],[[0,0,368,513,2649]],[[0,0,368,513,2652]],[[0,0,368,513,2655]],[[0,0,368,513,2658]],[[0,0,368,513,2661]],[[0,0,368,513,2664]],[[0,0,368,513,2667]],[[0,0,368,513,2670]],[[0,0,368,513,2673]],[[0,0,368,513,2676]],[[0,0,368,513,2679]],[[0,0,368,513,2682]],[[0,0,368,513,2685]],[[0,0,368,513,2688]],[[0,0,368,513,2691]],[[0,0,368,513,2694]],[[0,0,368,513,2697]],[[0,0,368,513,2700]],[[0,0,368,513,2703]],[[0,0,368,513,2706]],[[0,0,368,513,2709]],[[0,0,368,513,2712]],[[0,0,368,513,2715]],[[0,0,368,513,2718]],[[0,0,368,513,2721]],[[0,0,368,513,2724]],[[0,0,368,513,2727]],[[0,0,368,513,2730]],[[0,0,368,513,2733]],[[0,0,368,513,2736]],[[0,0,368,513,2739]],[[0,0,368,513,2742]],[[0,0,368,513,2745]],[[0,0,368,513,2748]],[[0,0,368,513,2751]],[[0,0,368,513,2754]],[[0,0,368,513,2757]],[[0,0,368,513,2760]],[[0,0,368,513,2763]],[[0,0,368,513,2766]],[[0,0,368,513,2769]],[[0,0,368,513,2772]],[[0,0,368,513,2775]],[[0,0,368,513,2778]],[[0,0,368,513,2781]],[[0,0,368,513,2784]],[[0,0,368,513,2787]],[[0,0,368,513,2790]],[[0,0,368,513,2793]],[[0,0,368,513,2796]]],"text_len_per_page":[53,53,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,
54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54],"metadata":{"format":"PDF 1.6","title":"","author":"","subject":"","keywords":"","creator":"Adobe Acrobat 7.0","producer":"Adobe Acrobat 7.0 Image Conversion Plug-in","creationDate":"D:20080404141457+01'00'","modDate":"D:20080404144821+01'00'","trapped":"","encryption":null}}
    # o = json.loads(json.dumps(o))
    # total_page = o["total_page"]
    # page_width = o["page_width_pts"]
    # page_height = o["page_height_pts"]
    # img_sz_list = o["image_info_per_page"]
    # text_len_list = o['text_len_per_page']
    # pdf_path = o['pdf_path']
    # is_encrypted = o['is_encrypted']
    # is_needs_password = o['is_needs_password']
    # if is_encrypted or total_page == 0 or is_needs_password:  # skip encrypted, password-protected, and zero-page pdfs
    #     print("encrypted")
    #     exit(0)
    # tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list)
    # o['is_text_pdf'] = tag
    # print(json.dumps(o, ensure_ascii=False))
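A toy invocation of classify with fabricated per-page statistics; real inputs come from pdf_meta_scan's output:

    from magic_pdf.filter.pdf_classify_by_type import classify

    total_page = 3
    page_w, page_h = 595, 842                    # A4 in pts
    img_sz = [[], [], [[0, 0, 100, 80, 7]]]      # (x0, y0, x1, y1, objid) per page
    text_len = [1200, 900, 1100]                 # characters per page
    img_num = [0, 0, 1]                          # raw image counts per page
    layout = ['horizontal'] * 3
    is_text_pdf, votes = classify(total_page, page_w, page_h, img_sz, text_len,
                                  img_num, layout, invalid_chars=True)
    print(is_text_pdf)  # -> True: every rule votes text-based for these numbers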
magic_pdf/filter/pdf_meta_scan.py ADDED
@@ -0,0 +1,388 @@
1
+ """
2
+ 输入: s3路径,每行一个
3
+ 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置
4
+ """
5
+ import sys
6
+ import click
7
+
8
+ from magic_pdf.libs.commons import read_file, mymax, get_top_percent_list
9
+ from magic_pdf.libs.commons import fitz
10
+ from loguru import logger
11
+ from collections import Counter
12
+
13
+ from magic_pdf.libs.drop_reason import DropReason
14
+ from magic_pdf.libs.language import detect_lang
15
+ from magic_pdf.libs.pdf_check import detect_invalid_chars
16
+
17
+ scan_max_page = 50
18
+ junk_limit_min = 10
19
+
20
+
21
+ def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
22
    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
                               result]
    page_area = int(page_width_pts) * int(page_height_pts)
    max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
    max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
    return max_image_area_per_page


def process_image(page, junk_img_bojids=[]):
    page_result = []  # collect the bbox quadruples of every image on this page
    items = page.get_images()
    dedup = set()
    for img in items:
        # get_image_rects returns the size at which the image is actually displayed on the page
        img_bojid = img[0]  # globally unique within the pdf; an image repeated across pages is likely junk, e.g. a watermark or a header/footer
        if img_bojid in junk_img_bojids:  # skip junk images
            continue
        recs = page.get_image_rects(img, transform=True)
        if recs:
            rec = recs[0][0]
            x0, y0, x1, y1 = map(int, rec)
            width = x1 - x0
            height = y1 - y0
            if (x0, y0, x1, y1, img_bojid) in dedup:  # duplicate bboxes occur here; keep only one
                continue
            if not all([width, height]):  # neither width nor height may be 0, otherwise the image is invisible and meaningless
                continue
            dedup.add((x0, y0, x1, y1, img_bojid))
            page_result.append([x0, y0, x1, y1, img_bojid])
    return page_result


def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
    """
    Return the image bbox quadruples of every page; a page may hold several images.
    :param doc:
    :return:
    """
    # count the occurrences of each img_bojid with a Counter
    img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
    # find the img_bojids that appear on more than half of the pages

    junk_limit = max(len(doc) * 0.5, junk_limit_min)  # exemption for documents with very few pages

    junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]

    # TODO add a check using only the first ten pages; junk images must both appear often enough and cover a large share of the page, with all of them roughly the same size
    # There are two kinds of scanned pdfs and one kind of text pdf, so misclassification is possible here:
    # scanned pdf 1: every page embeds all scanned page images; large coverage, one image displayed per page
    # scanned pdf 2: the number of stored scanned images grows page by page; large coverage, one image displayed per page; the junk list must be cleared and the first 50 pages rescanned for classification
    # text pdf 1: every page stores all images; images cover little of the page, and a page may display zero or more than one image. Such pdfs need a sample of the first 10 pages to check image size and count, and the junk list is cleared if they match
    imgs_len_list = [len(page.get_images()) for page in doc]

    special_limit_pages = 10

    # use the first ten pages for all the checks below
    result = []
    break_loop = False
    for i, page in enumerate(doc):
        if break_loop:
            break
        if i >= special_limit_pages:
            break
        page_result = process_image(page)  # junk_img_bojids is deliberately not passed; every image of the first ten pages is needed for the analysis below
        result.append(page_result)
    for item in result:
        if not any(item):  # some page has no image, so this is a text pdf; check whether it is the special text-pdf case
            if max(imgs_len_list) == min(imgs_len_list) and max(
                    imgs_len_list) >= junk_limit_min:  # the special text pdf: clear the junk list and break
                junk_img_bojids = []
            else:  # an ordinary text pdf that does contain junk images; keep the junk list
                pass
            break_loop = True
            break
    if not break_loop:
        # take the first 80% of the elements
        top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8)
        # check whether that first 80% are all equal
        if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:

            # # if all of the first 10 pages carry images, decide from the per-page image counts whether the junk list must be cleared
            # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:

            # the first 10 pages all carry images with equal counts, so check how much of the page the images cover to decide whether to clear the junk list
            max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
            if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # not all of the first 10 pages are dominated by large images, so this is probably a text pdf; clear the junk list
                junk_img_bojids = []
            else:  # the first 10 pages all carry images, 80% of them are large, and the per-page counts are equal and high: scanned pdf 1, keep the junk list
                pass
        else:  # per-page image counts differ; clear the junk list and rescan the first 50 pages in full
            junk_img_bojids = []

    # now collect the image info of the first 50 pages for real
    result = []
    for i, page in enumerate(doc):
        if i >= scan_max_page:
            break
        page_result = process_image(page, junk_img_bojids)
        # logger.info(f"page {i} img_len: {len(page_result)}")
        result.append(page_result)

    return result, junk_img_bojids

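
# Illustrative sketch of the junk-image rule above (not part of the pipeline; the xref
# numbers and the per-page lists are made up). An image xref counts as junk once it
# appears on at least max(0.5 * pages, junk_limit_min) pages:
def _example_junk_image_threshold():
    from collections import Counter
    junk_limit_min_demo = 10  # stand-in for the module-level junk_limit_min
    pages_imgs = [[7, 8]] * 12 + [[9]] * 3  # xrefs 7 and 8 appear on 12 of 15 pages
    counter = Counter(xref for page_imgs in pages_imgs for xref in page_imgs)
    junk_limit = max(len(pages_imgs) * 0.5, junk_limit_min_demo)
    junk = [xref for xref, cnt in counter.items() if cnt >= junk_limit]
    assert junk == [7, 8]  # 12 >= max(7.5, 10)
    return junk
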

def get_pdf_page_size_pts(doc: fitz.Document):
    page_cnt = len(doc)
    l: int = min(page_cnt, 50)
    # push all widths and heights into two lists and take the median of each
    # (one pdf mixed landscape pages into a portrait document, swapping width and height)
    page_width_list = []
    page_height_list = []
    for i in range(l):
        page = doc[i]
        page_rect = page.rect
        page_width_list.append(page_rect.width)
        page_height_list.append(page_rect.height)

    page_width_list.sort()
    page_height_list.sort()

    median_width = page_width_list[len(page_width_list) // 2]
    median_height = page_height_list[len(page_height_list) // 2]

    return median_width, median_height

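
# A minimal equivalence sketch for the median above (illustrative only): sorting and
# indexing at len // 2 picks the same value statistics.median_high returns, for both
# even and odd list lengths.
def _example_median_page_width():
    import statistics
    widths = [595.0, 595.0, 842.0, 595.0]  # one landscape page inside a portrait pdf
    widths.sort()
    assert widths[len(widths) // 2] == statistics.median_high(widths) == 595.0
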

def get_pdf_textlen_per_page(doc: fitz.Document):
    text_len_lst = []
    for page in doc:
        # all blocks, including img and text:
        # text_block = page.get_text("blocks")
        # text-only blocks:
        # text_block = page.get_text("words")
        # text_block_len = sum([len(t[4]) for t in text_block])
        # the full text as one str:
        text_block = page.get_text("text")
        text_block_len = len(text_block)
        # logger.info(f"page {page.number} text_block_len: {text_block_len}")
        text_len_lst.append(text_block_len)

    return text_len_lst


def get_pdf_text_layout_per_page(doc: fitz.Document):
    """
    Classify the text layout of every page of a PDF document as horizontal, vertical or unknown.

    Args:
        doc (fitz.Document): the PDF document object.

    Returns:
        List[str]: the text layout of each page ("horizontal", "vertical", "unknow").

    """
    text_layout_list = []

    for page_id, page in enumerate(doc):
        if page_id >= scan_max_page:
            break
        # per-page counters for vertical and horizontal text lines
        vertical_count = 0
        horizontal_count = 0
        text_dict = page.get_text("dict")
        if "blocks" in text_dict:
            for block in text_dict["blocks"]:
                if 'lines' in block:
                    for line in block["lines"]:
                        # corner coordinates of the line's bbox
                        x0, y0, x1, y1 = line['bbox']
                        # width and height of the bbox
                        width = x1 - x0
                        height = y1 - y0
                        # area of the bbox
                        area = width * height
                        font_sizes = []
                        for span in line['spans']:
                            if 'size' in span:
                                font_sizes.append(span['size'])
                        if len(font_sizes) > 0:
                            average_font_size = sum(font_sizes) / len(font_sizes)
                        else:
                            average_font_size = 10  # some lines carry no font_size; fall back to a default of 10
                        if area <= average_font_size ** 2:  # a bbox no larger than the square of the average font size is a single character, whose direction cannot be determined
                            continue
                        else:
                            if 'wmode' in line:  # infer the text direction from wmode
                                if line['wmode'] == 1:  # vertical text
                                    vertical_count += 1
                                elif line['wmode'] == 0:  # horizontal text
                                    horizontal_count += 1
                            # if 'dir' in line:  # infer the direction from the rotation angle instead
                            #     # read the "dir" value of the line
                            #     dir_value = line['dir']
                            #     cosine, sine = dir_value
                            #     # compute the angle
                            #     angle = math.degrees(math.acos(cosine))
                            #
                            #     # horizontal text
                            #     if abs(angle - 0) < 0.01 or abs(angle - 180) < 0.01:
                            #         # line_text = ' '.join(span['text'] for span in line['spans'])
                            #         # print('This line is horizontal:', line_text)
                            #         horizontal_count += 1
                            #     # vertical text
                            #     elif abs(angle - 90) < 0.01 or abs(angle - 270) < 0.01:
                            #         # line_text = ' '.join(span['text'] for span in line['spans'])
                            #         # print('This line is vertical:', line_text)
                            #         vertical_count += 1
        # print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
        # classify the layout of this page
        if vertical_count == 0 and horizontal_count == 0:  # no text on the page, nothing to classify
            text_layout_list.append("unknow")
            continue
        else:
            if vertical_count > horizontal_count:  # more vertical lines than horizontal ones
                text_layout_list.append("vertical")
            else:  # more horizontal lines than vertical ones
                text_layout_list.append("horizontal")
        # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
    return text_layout_list

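
# Illustrative sketch of the wmode vote above on a hand-built page dict (the structure
# mirrors what page.get_text("dict") returns in PyMuPDF; the values are made up):
def _example_wmode_vote():
    text_dict = {"blocks": [{"lines": [
        {"bbox": (0, 0, 200, 20), "wmode": 0, "spans": [{"size": 10}]},
        {"bbox": (0, 30, 200, 50), "wmode": 0, "spans": [{"size": 10}]},
        {"bbox": (300, 0, 320, 200), "wmode": 1, "spans": [{"size": 10}]},
    ]}]}
    vertical = sum(1 for b in text_dict["blocks"] for ln in b["lines"] if ln["wmode"] == 1)
    horizontal = sum(1 for b in text_dict["blocks"] for ln in b["lines"] if ln["wmode"] == 0)
    return "vertical" if vertical > horizontal else "horizontal"  # -> "horizontal"
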

'''a custom exception raised for pdfs with too many svgs on a single page'''


class PageSvgsTooManyError(Exception):
    def __init__(self, message="Page SVGs are too many"):
        self.message = message
        super().__init__(self.message)


def get_svgs_per_page(doc: fitz.Document):
    svgs_len_list = []
    for page_id, page in enumerate(doc):
        # svgs = page.get_drawings()
        svgs = page.get_cdrawings()  # switched to get_cdrawings, which is faster
        len_svgs = len(svgs)
        if len_svgs >= 3000:
            raise PageSvgsTooManyError()
        else:
            svgs_len_list.append(len_svgs)
        # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
    return svgs_len_list

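
def _example_svg_guard(doc: fitz.Document):
    # Usage sketch (illustrative, not called by the pipeline): catch the custom
    # exception above so a drawing-heavy pdf becomes a classification signal
    # rather than a crash.
    try:
        return get_svgs_per_page(doc)
    except PageSvgsTooManyError:
        return None  # the caller may then drop or reroute the pdf
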

def get_imgs_per_page(doc: fitz.Document):
    imgs_len_list = []
    for page_id, page in enumerate(doc):
        imgs = page.get_images()
        imgs_len_list.append(len(imgs))
        # logger.info(f"page_id: {page}, imgs_len: {len(imgs)}")

    return imgs_len_list


def get_language(doc: fitz.Document):
    """
    Detect the language of a PDF document.
    Args:
        doc (fitz.Document): the PDF document object.
    Returns:
        str: the document language, e.g. "en-US".
    """
    language_lst = []
    for page_id, page in enumerate(doc):
        if page_id >= scan_max_page:
            break
        # the full text of the page as one str
        text_block = page.get_text("text")
        page_language = detect_lang(text_block)
        language_lst.append(page_language)

        # logger.info(f"page_id: {page_id}, page_language: {page_language}")

    # count how often each language occurs in language_lst
    count_dict = Counter(language_lst)
    # return the most frequent language in language_lst
    language = max(count_dict, key=count_dict.get)
    return language


def check_invalid_chars(pdf_bytes):
    """
    Detect garbled characters.
    """
    return detect_invalid_chars(pdf_bytes)


def pdf_meta_scan(pdf_bytes: bytes):
    """
    :param pdf_bytes: the raw bytes of the pdf file
    Evaluated dimensions: encryption, password protection, paper size, total page count, and whether the text is extractable.
    """
    doc = fitz.open("pdf", pdf_bytes)
    is_needs_password = doc.needs_pass
    is_encrypted = doc.is_encrypted
    total_page = len(doc)
    if total_page == 0:
        logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}")
        result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF}
        return result
    else:
        page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
        # logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")

        # svgs_per_page = get_svgs_per_page(doc)
        # logger.info(f"svgs_per_page: {svgs_per_page}")
        imgs_per_page = get_imgs_per_page(doc)
        # logger.info(f"imgs_per_page: {imgs_per_page}")

        image_info_per_page, junk_img_bojids = get_image_info(doc, page_width_pts, page_height_pts)
        # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
        text_len_per_page = get_pdf_textlen_per_page(doc)
        # logger.info(f"text_len_per_page: {text_len_per_page}")
        text_layout_per_page = get_pdf_text_layout_per_page(doc)
        # logger.info(f"text_layout_per_page: {text_layout_per_page}")
        text_language = get_language(doc)
        # logger.info(f"text_language: {text_language}")
        invalid_chars = check_invalid_chars(pdf_bytes)
        # logger.info(f"invalid_chars: {invalid_chars}")

        # finally emit one json record
        res = {
            "is_needs_password": is_needs_password,
            "is_encrypted": is_encrypted,
            "total_page": total_page,
            "page_width_pts": int(page_width_pts),
            "page_height_pts": int(page_height_pts),
            "image_info_per_page": image_info_per_page,
            "text_len_per_page": text_len_per_page,
            "text_layout_per_page": text_layout_per_page,
            "text_language": text_language,
            # "svgs_per_page": svgs_per_page,
            "imgs_per_page": imgs_per_page,  # per-page image-count list
            "junk_img_bojids": junk_img_bojids,  # list of junk-image bojids
            "invalid_chars": invalid_chars,
            "metadata": doc.metadata
        }
        # logger.info(json.dumps(res, ensure_ascii=False))
        return res

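
def _example_local_meta_scan():
    # Minimal local usage sketch for pdf_meta_scan (illustrative; "example.pdf" is a
    # hypothetical path, and only a few of the returned keys are printed):
    from pathlib import Path
    pdf_bytes = Path("example.pdf").read_bytes()
    meta = pdf_meta_scan(pdf_bytes)
    if not meta.get("_need_drop"):
        print(meta["total_page"], meta["text_language"], meta["text_layout_per_page"][:5])
    return meta
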

@click.command()
@click.option('--s3-pdf-path', help='path of the pdf file on s3')
@click.option('--s3-profile', help='s3 profile to use')
def main(s3_pdf_path: str, s3_profile: str):
    """
    Read a pdf from s3 and run the meta scan on it.
    """
    try:
        file_content = read_file(s3_pdf_path, s3_profile)
        pdf_meta_scan(file_content)
    except Exception as e:
        print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
        logger.exception(e)


if __name__ == '__main__':
    main()
    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
    # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
    # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
    # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")
    # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
    # doc = fitz.open("pdf", file_content)
    # text_layout_lst = get_pdf_text_layout_per_page(doc)
    # print(text_layout_lst)
magic_pdf/layout/__init__.py ADDED
File without changes
magic_pdf/layout/bbox_sort.py ADDED
@@ -0,0 +1,681 @@
# A bbox here is a list [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1]; idx_x and idx_y start out as None.
# x0, y0 is the top-left corner and x1, y1 the bottom-right corner; the origin is at the top left.


from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
from magic_pdf.libs.commons import mymax

X0_IDX = 0
Y0_IDX = 1
X1_IDX = 2
Y1_IDX = 3
CONTENT_IDX = 4
IDX_X = 5
IDX_Y = 6
CONTENT_TYPE_IDX = 7

X0_EXT_IDX = 8
Y0_EXT_IDX = 9
X1_EXT_IDX = 10
Y1_EXT_IDX = 11

+ def prepare_bboxes_for_layout_split(image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info, text_raw_blocks: dict, page_boundry, page):
26
+ """
27
+ text_raw_blocks:结构参考test/assets/papre/pymu_textblocks.json
28
+ 把bbox重新组装成一个list,每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None. 对于图片、公式来说,block_content是图片的地址, 对于段落来说,block_content是pymupdf里的block结构
29
+ """
30
+ all_bboxes = []
31
+
32
+ for image in image_info:
33
+ box = image['bbox']
34
+ # 由于没有实现横向的栏切分,因此在这里先过滤掉一些小的图片。这些图片有可能影响layout,造成没有横向栏切分的情况下,layout切分不准确。例如 scihub_76500000/libgen.scimag76570000-76570999.zip_10.1186/s13287-019-1355-1
35
+ # 把长宽都小于50的去掉
36
+ if abs(box[0]-box[2]) < 50 and abs(box[1]-box[3]) < 50:
37
+ continue
38
+ all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'image', None, None, None, None])
39
+
40
+ for table in table_info:
41
+ box = table['bbox']
42
+ all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'table', None, None, None, None])
43
+
44
+ """由于公式与段落混合,因此公式不再参与layout划分,无需加入all_bboxes"""
45
+ # 加入文本block
46
+ text_block_temp = []
47
+ for block in text_raw_blocks:
48
+ bbox = block['bbox']
49
+ text_block_temp.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
50
+
51
+ text_block_new = resolve_bbox_overlap_for_layout_det(text_block_temp)
52
+ text_block_new = filter_lines_bbox(text_block_new) # 去掉线条bbox,有可能让layout探测陷入无限循环
53
+
54
+
55
+ """找出会影响layout的色块、横向分割线"""
56
+ spilter_bboxes = get_spilter_of_page(page, [b['bbox'] for b in image_info]+[b['bbox'] for b in image_backup_info], [b['bbox'] for b in table_info], )
57
+ # 还要去掉存在于spilter_bboxes里的text_block
58
+ if len(spilter_bboxes) > 0:
59
+ text_block_new = [box for box in text_block_new if not any([_is_in_or_part_overlap(box[:4], spilter_bbox) for spilter_bbox in spilter_bboxes])]
60
+
61
+ for bbox in text_block_new:
62
+ all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
63
+
64
+ for bbox in spilter_bboxes:
65
+ all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'spilter', None, None, None, None])
66
+
67
+
68
+ return all_bboxes
69
+
70
+ def resolve_bbox_overlap_for_layout_det(bboxes:list):
71
+ """
72
+ 1. 去掉bbox互相包含的,去掉被包含的
73
+ 2. 上下方向上如果有重叠,就扩大大box范围,直到覆盖小box
74
+ """
75
+ def _is_in_other_bbox(i:int):
76
+ """
77
+ 判断i个box是否被其他box有所包含
78
+ """
79
+ for j in range(0, len(bboxes)):
80
+ if j!=i and _is_in(bboxes[i][:4], bboxes[j][:4]):
81
+ return True
82
+ # elif j!=i and _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
83
+ # return True
84
+
85
+ return False
86
+
87
+ # 首先去掉被包含的bbox
88
+ new_bbox_1 = []
89
+ for i in range(0, len(bboxes)):
90
+ if not _is_in_other_bbox(i):
91
+ new_bbox_1.append(bboxes[i])
92
+
93
+ # 其次扩展大的box
94
+ new_box = []
95
+ new_bbox_2 = []
96
+ len_1 = len(new_bbox_2)
97
+ while True:
98
+ merged_idx = []
99
+ for i in range(0, len(new_bbox_1)):
100
+ if i in merged_idx:
101
+ continue
102
+ for j in range(i+1, len(new_bbox_1)):
103
+ if j in merged_idx:
104
+ continue
105
+ bx1 = new_bbox_1[i]
106
+ bx2 = new_bbox_1[j]
107
+ if i!=j and _is_vertical_full_overlap(bx1[:4], bx2[:4]):
108
+ merged_box = min([bx1[0], bx2[0]]), min([bx1[1], bx2[1]]), max([bx1[2], bx2[2]]), max([bx1[3], bx2[3]])
109
+ new_bbox_2.append(merged_box)
110
+ merged_idx.append(i)
111
+ merged_idx.append(j)
112
+
113
+ for i in range(0, len(new_bbox_1)): # 没有合并的加入进来
114
+ if i not in merged_idx:
115
+ new_bbox_2.append(new_bbox_1[i])
116
+
117
+ if len(new_bbox_2)==0 or len_1==len(new_bbox_2):
118
+ break
119
+ else:
120
+ len_1 = len(new_bbox_2)
121
+ new_box = new_bbox_2
122
+ new_bbox_1, new_bbox_2 = new_bbox_2, []
123
+
124
+ return new_box
125
+
126
+
127
+ def filter_lines_bbox(bboxes: list):
128
+ """
129
+ 过滤掉bbox为空的行
130
+ """
131
+ new_box = []
132
+ for box in bboxes:
133
+ x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
134
+ if abs(x0-x1)<=1 or abs(y0-y1)<=1:
135
+ continue
136
+ else:
137
+ new_box.append(box)
138
+ return new_box
139
+
140
+
141
+ ################################################################################
142
+ # 第一种排序算法
143
+ # 以下是基于延长线遮挡做的一个算法
144
+ #
145
+ ################################################################################
146
+ def find_all_left_bbox(this_bbox, all_bboxes) -> list:
147
+ """
148
+ 寻找this_bbox左边的所有bbox
149
+ """
150
+ left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
151
+ return left_boxes
152
+
153
+
154
+ def find_all_top_bbox(this_bbox, all_bboxes) -> list:
155
+ """
156
+ 寻找this_bbox上面的所有bbox
157
+ """
158
+ top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX]]
159
+ return top_boxes
160
+
161
+
162
+ def get_and_set_idx_x(this_bbox, all_bboxes) -> int:
163
+ """
164
+ 寻找this_bbox在all_bboxes中的遮挡深度 idx_x
165
+ """
166
+ if this_bbox[IDX_X] is not None:
167
+ return this_bbox[IDX_X]
168
+ else:
169
+ all_left_bboxes = find_all_left_bbox(this_bbox, all_bboxes)
170
+ if len(all_left_bboxes) == 0:
171
+ this_bbox[IDX_X] = 0
172
+ else:
173
+ all_left_bboxes_idx = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_left_bboxes]
174
+ max_idx_x = mymax(all_left_bboxes_idx)
175
+ this_bbox[IDX_X] = max_idx_x + 1
176
+ return this_bbox[IDX_X]
177
+
178
+
179
+ def get_and_set_idx_y(this_bbox, all_bboxes) -> int:
180
+ """
181
+ 寻找this_bbox在all_bboxes中y方向的遮挡深度 idx_y
182
+ """
183
+ if this_bbox[IDX_Y] is not None:
184
+ return this_bbox[IDX_Y]
185
+ else:
186
+ all_top_bboxes = find_all_top_bbox(this_bbox, all_bboxes)
187
+ if len(all_top_bboxes) == 0:
188
+ this_bbox[IDX_Y] = 0
189
+ else:
190
+ all_top_bboxes_idx = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_top_bboxes]
191
+ max_idx_y = mymax(all_top_bboxes_idx)
192
+ this_bbox[IDX_Y] = max_idx_y + 1
193
+ return this_bbox[IDX_Y]
194
+
195
+
196
+ def bbox_sort(all_bboxes: list):
197
+ """
198
+ 排序
199
+ """
200
+ all_bboxes_idx_x = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_bboxes]
201
+ all_bboxes_idx_y = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_bboxes]
202
+ all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
203
+
204
+ all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序
205
+ all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
206
+ all_bboxes_idx.sort(key=lambda x: x[0])
207
+ sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
208
+ return sorted_bboxes
209
+
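
# Illustrative sketch of the composite key used by bbox_sort above (made-up indices):
# multiplying idx_x by 100000 makes (idx_x, idx_y) pairs sort column-first, assuming
# idx_y never reaches 100000.
def _example_composite_sort_key():
    idx_pairs = [(1, 0), (0, 2), (0, 1), (1, 1)]
    ordered = sorted(idx_pairs, key=lambda p: p[0] * 100000 + p[1])
    assert ordered == [(0, 1), (0, 2), (1, 0), (1, 1)]
    return ordered
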

################################################################################
# The second sorting algorithm.
# When computing idx_x and idx_y it ignores extension lines and only counts
# occlusion by the actual width or height of a box.
################################################################################

def find_left_nearest_bbox(this_bbox, all_bboxes) -> list:
    """
    find the bboxes in all_bboxes whose right side overlaps this_bbox in height
    """
    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
        box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]

    # then keep only the one horizontally closest to this_bbox
    if len(left_boxes) > 0:
        left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
        left_boxes = [left_boxes[0]]
    else:
        left_boxes = []
    return left_boxes


def get_and_set_idx_x_2(this_bbox, all_bboxes):
    """
    compute the direct-occlusion depth idx_x of this_bbox within all_bboxes
    this depth ignores extension lines; only occlusion by the actual width or height counts
    """
    if this_bbox[IDX_X] is not None:
        return this_bbox[IDX_X]
    else:
        left_nearest_bbox = find_left_nearest_bbox(this_bbox, all_bboxes)
        if len(left_nearest_bbox) == 0:
            this_bbox[IDX_X] = 0
        else:
            left_idx_x = get_and_set_idx_x_2(left_nearest_bbox[0], all_bboxes)
            this_bbox[IDX_X] = left_idx_x + 1
        return this_bbox[IDX_X]


def find_top_nearest_bbox(this_bbox, all_bboxes) -> list:
    """
    find the bboxes in all_bboxes whose bottom side overlaps this_bbox in width
    """
    top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
    # then keep only the one vertically closest to this_bbox
    if len(top_boxes) > 0:
        top_boxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
        top_boxes = [top_boxes[0]]
    else:
        top_boxes = []
    return top_boxes


def get_and_set_idx_y_2(this_bbox, all_bboxes):
    """
    compute the direct-occlusion depth idx_y of this_bbox within all_bboxes
    this depth ignores extension lines; only occlusion by the actual width or height counts
    """
    if this_bbox[IDX_Y] is not None:
        return this_bbox[IDX_Y]
    else:
        top_nearest_bbox = find_top_nearest_bbox(this_bbox, all_bboxes)
        if len(top_nearest_bbox) == 0:
            this_bbox[IDX_Y] = 0
        else:
            top_idx_y = get_and_set_idx_y_2(top_nearest_bbox[0], all_bboxes)
            this_bbox[IDX_Y] = top_idx_y + 1
        return this_bbox[IDX_Y]


def paper_bbox_sort(all_bboxes: list, page_width, page_height):
    all_bboxes_idx_x = [get_and_set_idx_x_2(bbox, all_bboxes) for bbox in all_bboxes]
    all_bboxes_idx_y = [get_and_set_idx_y_2(bbox, all_bboxes) for bbox in all_bboxes]
    all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]

    all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx]  # collapse into one key so boxes sort by X first and by Y when X is equal
    all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
    all_bboxes_idx.sort(key=lambda x: x[0])
    sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
    return sorted_bboxes

################################################################################
"""
The third sorting algorithm. Let X0 be the leftmost edge of the page, X1 the rightmost, Y0 the top and Y1 the bottom.
It adds a bbox preprocessing step on top of the second algorithm. The preprocessing idea:
1. First extend the bboxes horizontally:
    - For each bbox, find the nearest bbox on its left (overlapping in y) and extend its left edge to that neighbour's right edge (x1+1); the +1 avoids overlap. Without a left neighbour, extend the left edge to the page's left edge X0.
    - For each bbox, find the nearest bbox on its right (overlapping in y) and extend its right edge to that neighbour's left edge (x0-1); the -1 avoids overlap. Without a right neighbour, extend the right edge to the page's right edge X1.
    - After these two steps every bbox spans its maximal horizontal range [left neighbour.x1+1, right neighbour.x0-1].

2. Merge all consecutive horizontal bboxes:
    - Sort the bboxes by y, traverse them top to bottom, and merge the current bbox with the next one whenever both have x0, x1 equal to X0, X1.

3. Then extend the bboxes vertically:
    - First cut the merged horizontal bboxes out of the page, producing several new blocks.
    For each block:
    - x0: find all bboxes left of the extension line x=x0, take the largest x1 and set x0=x1+1. Without such a bbox, x0=X0.
    - x1: find all bboxes right of the extension line x=x1, take the smallest x0 and set x1=x0-1. Without such a bbox, x1=X1.
    Then merge all consecutive blocks vertically:
    - Sort the blocks by x, traverse them left to right, and merge the current block with the next one whenever their x0, x1 are equal.
    If the vertical split assigns every small bbox to a block, the split is complete and the merged blocks are labeled 'GOOD_LAYOUT'.
    If some vertical strip cannot be fully split into one block, that block is labeled 'BAD_LAYOUT'.
    This finishes the preprocessing of a page; every natural block is either 'GOOD_LAYOUT' or 'BAD_LAYOUT'. Pages containing 'BAD_LAYOUT' can either be naturally ordered top-to-bottom, left-to-right, or such books can simply be filtered out.
    (To strengthen later: split horizontally as well, so the chaotic layout parts are cut away as far as possible.)
"""
################################################################################
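
# Numeric sketch of preprocessing step 1 above (made-up coordinates): a box whose left
# neighbour ends at x1=100 and whose right neighbour starts at x0=300 is extended to
# [101, 299]; without neighbours it would span the page bounds [X0, X1] instead.
def _example_horizontal_extension(left_neighbor_x1=100, right_neighbor_x0=300):
    ext_x0 = left_neighbor_x1 + 1   # +1 keeps the boxes from overlapping
    ext_x1 = right_neighbor_x0 - 1  # -1 keeps the boxes from overlapping
    assert (ext_x0, ext_x1) == (101, 299)
    return ext_x0, ext_x1
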
def find_left_neighbor_bboxes(this_bbox, all_bboxes) -> list:
    """
    find the bboxes in all_bboxes whose right side overlaps this_bbox in height
    the extended bboxes are used here
    """
    left_boxes = [box for box in all_bboxes if box[X1_EXT_IDX] <= this_bbox[X0_EXT_IDX] and any([
        box[Y0_EXT_IDX] < this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX], box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX] < box[Y1_EXT_IDX],
        this_bbox[Y0_EXT_IDX] < box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX], this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX] < this_bbox[Y1_EXT_IDX],
        box[Y0_EXT_IDX]==this_bbox[Y0_EXT_IDX] and box[Y1_EXT_IDX]==this_bbox[Y1_EXT_IDX]])]

    # sort them so the horizontally closest one comes first; all neighbours are returned
    left_boxes.sort(key=lambda x: x[X1_EXT_IDX], reverse=True)
    return left_boxes

def find_top_neighbor_bboxes(this_bbox, all_bboxes) -> list:
    """
    find the bboxes in all_bboxes whose bottom side overlaps this_bbox in width
    the extended bboxes are used here
    """
    top_boxes = [box for box in all_bboxes if box[Y1_EXT_IDX] <= this_bbox[Y0_EXT_IDX] and any([
        box[X0_EXT_IDX] < this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX], box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX] < box[X1_EXT_IDX],
        this_bbox[X0_EXT_IDX] < box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX], this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX] < this_bbox[X1_EXT_IDX],
        box[X0_EXT_IDX]==this_bbox[X0_EXT_IDX] and box[X1_EXT_IDX]==this_bbox[X1_EXT_IDX]])]
    # sort them so the vertically closest one comes first; all neighbours are returned
    top_boxes.sort(key=lambda x: x[Y1_EXT_IDX], reverse=True)
    return top_boxes

def get_and_set_idx_x_2_ext(this_bbox, all_bboxes):
    """
    compute the direct-occlusion depth idx_x of this_bbox within all_bboxes
    this depth ignores extension lines; only occlusion by the actual width or height counts
    """
    if this_bbox[IDX_X] is not None:
        return this_bbox[IDX_X]
    else:
        left_nearest_bbox = find_left_neighbor_bboxes(this_bbox, all_bboxes)
        if len(left_nearest_bbox) == 0:
            this_bbox[IDX_X] = 0
        else:
            left_idx_x = [get_and_set_idx_x_2_ext(b, all_bboxes) for b in left_nearest_bbox]
            this_bbox[IDX_X] = mymax(left_idx_x) + 1
        return this_bbox[IDX_X]

def get_and_set_idx_y_2_ext(this_bbox, all_bboxes):
    """
    compute the direct-occlusion depth idx_y of this_bbox within all_bboxes
    this depth ignores extension lines; only occlusion by the actual width or height counts
    """
    if this_bbox[IDX_Y] is not None:
        return this_bbox[IDX_Y]
    else:
        top_nearest_bbox = find_top_neighbor_bboxes(this_bbox, all_bboxes)
        if len(top_nearest_bbox) == 0:
            this_bbox[IDX_Y] = 0
        else:
            top_idx_y = [get_and_set_idx_y_2_ext(b, all_bboxes) for b in top_nearest_bbox]
            this_bbox[IDX_Y] = mymax(top_idx_y) + 1
        return this_bbox[IDX_Y]

def _paper_bbox_sort_ext(all_bboxes: list):
    all_bboxes_idx_x = [get_and_set_idx_x_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
    all_bboxes_idx_y = [get_and_set_idx_y_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
    all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]

    all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx]  # collapse into one key so boxes sort by X first and by Y when X is equal
    all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
    all_bboxes_idx.sort(key=lambda x: x[0])
    sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
    return sorted_bboxes

# ===============================================================================================
def find_left_bbox_ext_line(this_bbox, all_bboxes) -> list:
    """
    find the nearest bbox left of this_bbox, using extension lines
    """
    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
    if len(left_boxes):
        left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
        left_boxes = left_boxes[0]
    else:
        left_boxes = None

    return left_boxes

def find_right_bbox_ext_line(this_bbox, all_bboxes) -> list:
    """
    find the nearest bbox right of this_bbox, using extension lines
    """
    right_boxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]]
    if len(right_boxes):
        right_boxes.sort(key=lambda x: x[X0_IDX])
        right_boxes = right_boxes[0]
    else:
        right_boxes = None
    return right_boxes

# =============================================================================================

def find_left_nearest_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    find the bbox in all_bboxes whose right side overlaps this_bbox in height, without using extension lines
    """
    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
        box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]

    # then keep only the horizontally closest one, i.e. the one with the largest x1
    if len(left_boxes) > 0:
        left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True)
        left_boxes = left_boxes[0]
    else:
        left_boxes = None
    return left_boxes

def find_right_nearst_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    find the bbox closest to the right of this_bbox; it must be a direct occluder
    """
    right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] and any([
        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
        box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]

    if len(right_bboxes)>0:
        right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX])
        right_bboxes = right_bboxes[0]
    else:
        right_bboxes = None
    return right_bboxes

def reset_idx_x_y(all_boxes: list) -> list:
    for box in all_boxes:
        box[IDX_X] = None
        box[IDX_Y] = None

    return all_boxes

# ===================================================================================================
def find_top_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
    """
    find the bbox closest above this_bbox; it must be a direct occluder
    """
    top_bboxes = [box for box in bboxes_collection if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
    # then keep only the one closest above this_bbox
    if len(top_bboxes) > 0:
        top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
        top_bboxes = top_bboxes[0]
    else:
        top_bboxes = None
    return top_bboxes

def find_bottom_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
    """
    find the bbox closest below this_bbox; it must be a direct occluder
    """
    bottom_bboxes = [box for box in bboxes_collection if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
    # then keep only the one closest below this_bbox
    if len(bottom_bboxes) > 0:
        bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
        bottom_bboxes = bottom_bboxes[0]
    else:
        bottom_bboxes = None
    return bottom_bboxes

def find_boundry_bboxes(bboxes: list) -> tuple:
    """
    find the boundary of the bboxes: the smallest (x0, y0) and the largest (x1, y1) over all of them
    """
    x0, y0, x1, y1 = bboxes[0][X0_IDX], bboxes[0][Y0_IDX], bboxes[0][X1_IDX], bboxes[0][Y1_IDX]
    for box in bboxes:
        x0 = min(box[X0_IDX], x0)
        y0 = min(box[Y0_IDX], y0)
        x1 = max(box[X1_IDX], x1)
        y1 = max(box[Y1_IDX], y1)

    return x0, y0, x1, y1


def extend_bbox_vertical(bboxes: list, boundry_x0, boundry_y0, boundry_x1, boundry_y1) -> list:
    """
    vertically extend the bboxes that can be cut straight through, i.e. those with no other box above or below
    """
    for box in bboxes:
        top_nearest_bbox = find_top_nearest_bbox_direct(box, bboxes)
        bottom_nearest_bbox = find_bottom_nearest_bbox_direct(box, bboxes)
        if top_nearest_bbox is None and bottom_nearest_bbox is None:  # the box owns a full column
            box[X0_EXT_IDX] = box[X0_IDX]
            box[Y0_EXT_IDX] = boundry_y0
            box[X1_EXT_IDX] = box[X1_IDX]
            box[Y1_EXT_IDX] = boundry_y1
        # else:
        #     if top_nearest_bbox is None:
        #         box[Y0_EXT_IDX] = boundry_y0
        #     else:
        #         box[Y0_EXT_IDX] = top_nearest_bbox[Y1_IDX] + 1
        #     if bottom_nearest_bbox is None:
        #         box[Y1_EXT_IDX] = boundry_y1
        #     else:
        #         box[Y1_EXT_IDX] = bottom_nearest_bbox[Y0_IDX] - 1
        #     box[X0_EXT_IDX] = box[X0_IDX]
        #     box[X1_EXT_IDX] = box[X1_IDX]
    return bboxes


# ===================================================================================================

def paper_bbox_sort_v2(all_bboxes: list, page_width: int, page_height: int):
    """
    Sorting with an added preprocessing step.
    return:
        [
            {
                "layout_bbox": [x0, y0, x1, y1],
                "layout_label": "GOOD_LAYOUT/BAD_LAYOUT",
                "content_bboxes": []  # each element is [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], already in reading order
            }
        ]
    """
    sorted_layouts = []  # the final return value
    page_x0, page_y0, page_x1, page_y1 = 1, 1, page_width-1, page_height-1

    all_bboxes = paper_bbox_sort(all_bboxes, page_width, page_height)  # rough initial ordering
    # first, horizontally extend the bboxes that own a full row
    for bbox in all_bboxes:
        left_nearest_bbox = find_left_nearest_bbox_direct(bbox, all_bboxes)  # no extension lines
        right_nearest_bbox = find_right_nearst_bbox_direct(bbox, all_bboxes)
        if left_nearest_bbox is None and right_nearest_bbox is None:  # the bbox owns a full row
            bbox[X0_EXT_IDX] = page_x0
            bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
            bbox[X1_EXT_IDX] = page_x1
            bbox[Y1_EXT_IDX] = bbox[Y1_IDX]

    # the row-exclusive bboxes now reach the page boundary; use that boundary condition to merge consecutive bboxes into a group
    if len(all_bboxes) == 1:
        return [{"layout_bbox": [page_x0, page_y0, page_x1, page_y1], "layout_label": "GOOD_LAYOUT", "content_bboxes": all_bboxes}]
    if len(all_bboxes) == 0:
        return []

    """
    then merge all consecutive horizontal bboxes

    """
    all_bboxes.sort(key=lambda x: x[Y0_IDX])
    h_bboxes = []
    h_bbox_group = []
    v_boxes = []

    for bbox in all_bboxes:
        if bbox[X0_EXT_IDX] == page_x0 and bbox[X1_EXT_IDX] == page_x1:
            h_bbox_group.append(bbox)
        else:
            if len(h_bbox_group) > 0:
                h_bboxes.append(h_bbox_group)
                h_bbox_group = []
    # the last group
    if len(h_bbox_group) > 0:
        h_bboxes.append(h_bbox_group)

    """
    h_bboxes now holds all the groups; each group is a list
    compute each group in h_bboxes and put it back into sorted_layouts
    """
    for gp in h_bboxes:
        gp.sort(key=lambda x: x[Y0_IDX])
        block_info = {"layout_label": "GOOD_LAYOUT", "content_bboxes": gp}
        # compute the layout_bbox of this group: the smallest x0, y0 and the largest x1, y1
        x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
        block_info["layout_bbox"] = [x0, y0, x1, y1]
        sorted_layouts.append(block_info)

    # next, use the y0, y1 of these consecutive horizontal groups to split the remaining boxes into several horizontal bands
    h_split_lines = [page_y0]
    for gp in h_bboxes:  # each gp is a list of bboxes, so read its vertical extent from the first and last box
        y0, y1 = gp[0][Y0_IDX], gp[-1][Y1_IDX]
        h_split_lines.append(y0)
        h_split_lines.append(y1)
    h_split_lines.append(page_y1)

    unsplited_bboxes = []
    for i in range(0, len(h_split_lines), 2):
        start_y0, start_y1 = h_split_lines[i:i+2]
        # collect the other bboxes between [start_y0, start_y1]; they form one unsplit block
        bboxes_in_block = [bbox for bbox in all_bboxes if bbox[Y0_IDX] >= start_y0 and bbox[Y1_IDX] <= start_y1]
        unsplited_bboxes.append(bboxes_in_block)
    # ================== the horizontal direction is now split and sorted ====================================
    """
    Next split every non-horizontal part vertically.
    Only the bboxes that cannot be cut through horizontally remain. Extend them vertically first, then split vertically, in these steps:
    1. isolate the boxes that can be cut straight through vertically as their own layout
    2. split the rest vertically first
    3. try to split the vertically split parts horizontally again
    4. whatever still cannot be split becomes one layout each
    """
    # split each part vertically
    for bboxes_in_block in unsplited_bboxes:
        if len(bboxes_in_block) == 0:
            continue
        # first extend the bboxes of this block vertically
        boundry_x0, boundry_y0, boundry_x1, boundry_y1 = find_boundry_bboxes(bboxes_in_block)
        # do the vertical extension
        extended_vertical_bboxes = extend_bbox_vertical(bboxes_in_block, boundry_x0, boundry_y0, boundry_x1, boundry_y1)
        # then split this block vertically
        extended_vertical_bboxes.sort(key=lambda x: x[X0_IDX])  # ascending x, i.e. reading from left to right
        v_boxes_group = []
        for bbox in extended_vertical_bboxes:
            if bbox[Y0_EXT_IDX] == boundry_y0 and bbox[Y1_EXT_IDX] == boundry_y1:
                v_boxes_group.append(bbox)
            else:
                if len(v_boxes_group) > 0:
                    v_boxes.append(v_boxes_group)
                    v_boxes_group = []

        if len(v_boxes_group) > 0:
            v_boxes.append(v_boxes_group)

    # put the consecutive vertical parts into sorted_layouts; at this point they are already consecutive because of the step above
    for gp in v_boxes:
        gp.sort(key=lambda x: x[X0_IDX])
        block_info = {"layout_label": "GOOD_LAYOUT", "content_bboxes": gp}
        # compute the layout_bbox of this group: the smallest x0, y0 and the largest x1, y1
        x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
        block_info["layout_bbox"] = [x0, y0, x1, y1]
        sorted_layouts.append(block_info)

    # split sub-blocks vertically, i.e. cut along the through-going vertical lines. The resulting blocks can very likely be split vertically; if not fully, try horizontally; if neither works, treat each as one layout
    v_split_lines = [boundry_x0]
    for gp in v_boxes:  # each gp is a list of bboxes, so read its horizontal extent from the first and last box
        x0, x1 = gp[0][X0_IDX], gp[-1][X1_IDX]
        v_split_lines.append(x0)
        v_split_lines.append(x1)
    v_split_lines.append(boundry_x1)

    reset_idx_x_y(all_bboxes)
    all_boxes = _paper_bbox_sort_ext(all_bboxes)
    return all_boxes

magic_pdf/layout/layout_det_utils.py ADDED
@@ -0,0 +1,182 @@
from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect


def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    Find the bbox in all_bboxes whose right side overlaps this_bbox vertically, without extension lines.
    Boxes that intersect this_bbox on the left are also considered: an intersecting box does not count as the leftmost one.
    """
    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]
                  and any([
        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
        box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _left_intersect(box[:4], this_bbox[:4])]

    # then keep only the horizontally closest one, i.e. the one with the largest x1
    if len(left_boxes) > 0:
        left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True)
        left_boxes = left_boxes[0]
    else:
        left_boxes = None
    return left_boxes

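def _y_overlap(a, b):
    """
    A minimal sketch (hypothetical helper, not used by this module) of an interval test
    equivalent to the repeated y-overlap predicate above for non-degenerate boxes:
    a and b are [x0, y0, x1, y1]; they overlap vertically iff neither one ends before
    the other starts.
    """
    return not (a[3] <= b[1] or b[3] <= a[1])
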
def find_all_right_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    find the bbox closest to the right of this_bbox; it must be a direct occluder
    """
    right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]
                    and any([
        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
        box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _right_intersect(this_bbox[:4], box[:4])]

    if len(right_bboxes) > 0:
        right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX])
        right_bboxes = right_bboxes[0]
    else:
        right_bboxes = None
    return right_bboxes

def find_all_top_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    find the bbox closest above this_bbox; it must be a direct occluder
    """
    top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]

    if len(top_bboxes) > 0:
        top_bboxes.sort(key=lambda x: x[Y1_EXT_IDX] if x[Y1_EXT_IDX] else x[Y1_IDX], reverse=True)
        top_bboxes = top_bboxes[0]
    else:
        top_bboxes = None
    return top_bboxes

def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    find the bbox closest below this_bbox; it must be a direct occluder
    """
    bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]

    if len(bottom_bboxes) > 0:
        bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
        bottom_bboxes = bottom_bboxes[0]
    else:
        bottom_bboxes = None
    return bottom_bboxes

# ===================================================================================================================
def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
    """
    find the bbox closest below this_bbox; it must be a direct occluder
    """
    bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]

    if len(bottom_bboxes) > 0:
        # the one with the smallest y0 and the largest x1: its top edge is closest to this_bbox and it is the rightmost
        bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
        bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]]
        # then, among equal y0, take the one with the largest x1
        bottom_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True)
        bottom_bboxes = bottom_bboxes[0]
    else:
        bottom_bboxes = None
    return bottom_bboxes

def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
    """
    find the bbox closest below this_bbox; it must be a direct occluder
    """
    bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]

    if len(bottom_bboxes) > 0:
        # the one with the smallest y0 and the smallest x0
        bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
        bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]]
        # then, among equal y0, take the one with the smallest x0
        bottom_bboxes.sort(key=lambda x: x[X0_IDX])
        bottom_bboxes = bottom_bboxes[0]
    else:
        bottom_bboxes = None
    return bottom_bboxes

def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
    """
    find the bbox closest above this_bbox; it must be a direct occluder
    """
    top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]

    if len(top_bboxes) > 0:
        # the one with the largest y1 and the smallest x0
        top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
        top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]]
        # then, among equal y1, take the one with the smallest x0
        top_bboxes.sort(key=lambda x: x[X0_IDX])
        top_bboxes = top_bboxes[0]
    else:
        top_bboxes = None
    return top_bboxes

def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
    """
    find the bbox closest above this_bbox; it must be a direct occluder
    """
    top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]

    if len(top_bboxes) > 0:
        # the one with the largest y1 and the largest x1
        top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
        top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]]
        # then, among equal y1, take the one with the largest x1
        top_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True)
        top_bboxes = top_bboxes[0]
    else:
        top_bboxes = None
    return top_bboxes

# ===================================================================================================================

def get_left_edge_bboxes(all_bboxes) -> list:
    """
    return the bboxes on the left edge
    """
    left_bboxes = [box for box in all_bboxes if find_all_left_bbox_direct(box, all_bboxes) is None]
    return left_bboxes

def get_right_edge_bboxes(all_bboxes) -> list:
    """
    return the bboxes on the right edge
    """
    right_bboxes = [box for box in all_bboxes if find_all_right_bbox_direct(box, all_bboxes) is None]
    return right_bboxes

def fix_vertical_bbox_pos(bboxes: list):
    """
    Check whether these bboxes overlap slightly in the vertical direction and, if so, push the overlapped bbox down a little.
    In the x direction one box must contain the other, or they must overlap completely; partial overlap is not allowed.
    """
    bboxes.sort(key=lambda x: x[Y0_IDX])  # arrange from top to bottom
    for i in range(0, len(bboxes)):
        for j in range(i+1, len(bboxes)):
            if _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
                # the two bboxes overlap, so push the lower one down a little
                bboxes[j][Y0_IDX] = bboxes[i][Y1_IDX] + 2  # 2 is an empirical value
                break
    return bboxes
magic_pdf/layout/layout_sort.py ADDED
@@ -0,0 +1,732 @@
"""
Detect the layout of the boxes on a pdf page and sort the boxes inside each layout.
"""

from loguru import logger
from magic_pdf.layout.bbox_sort import CONTENT_IDX, CONTENT_TYPE_IDX, X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX, paper_bbox_sort
from magic_pdf.layout.layout_det_utils import find_all_left_bbox_direct, find_all_right_bbox_direct, find_bottom_bbox_direct_from_left_edge, find_bottom_bbox_direct_from_right_edge, find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge, find_all_top_bbox_direct, find_all_bottom_bbox_direct, get_left_edge_bboxes, get_right_edge_bboxes
from magic_pdf.libs.boxbase import get_bbox_in_boundry


LAYOUT_V = "V"
LAYOUT_H = "H"
LAYOUT_UNPROC = "U"
LAYOUT_BAD = "B"

def _is_single_line_text(bbox):
    """
    check whether the text inside the bbox is a single line
    """
    return True  # TODO the code below is currently unreachable
    box_type = bbox[CONTENT_TYPE_IDX]
    if box_type != 'text':
        return False
    paras = bbox[CONTENT_IDX]["paras"]
    text_content = ""
    for para_id, para in paras.items():  # assemble the paragraph text inside the box
        is_title = para['is_title']
        if is_title != 0:
            text_content += f"## {para['text']}"
        else:
            text_content += para["text"]
        text_content += "\n\n"

    return bbox[CONTENT_TYPE_IDX] == 'text' and len(text_content.split("\n\n")) <= 1


def _horizontal_split(bboxes: list, boundry: tuple, avg_font_size=20) -> list:
    """
    Split the bboxes horizontally.
    Method: find the boxes with no direct occluder on either the left or the right, extend them, then cut.
    return:
        several large layout regions [[x0, y0, x1, y1, "h|u|v"], ]; h means horizontal, u undetected, v vertical layout
    """
    sorted_layout_blocks = []  # the value returned at the end

    bound_x0, bound_y0, bound_x1, bound_y1 = boundry
    all_bboxes = get_bbox_in_boundry(bboxes, boundry)
    #all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # rough initial ordering, based on direct occlusion
    """
    first, horizontally extend the bboxes that own a full row

    """
    last_h_split_line_y1 = bound_y0  # remember the previous horizontal split line
    for i, bbox in enumerate(all_bboxes):
        left_nearest_bbox = find_all_left_bbox_direct(bbox, all_bboxes)  # no extension lines
        right_nearest_bbox = find_all_right_bbox_direct(bbox, all_bboxes)
        if left_nearest_bbox is None and right_nearest_bbox is None:  # owns a full row
            """
            However, an isolated single line of text only qualifies if it also satisfies one of:
            1. the bbox crosses the center line, or
            2. another row-exclusive horizontal bbox of the same kind exists above or below it, or
            3. TODO stronger condition: if the boxes above and below this bbox belong to the same column, it must not count as row-exclusive
            """
            # first check whether the bbox holds only a single line of text
            is_single_line = _is_single_line_text(bbox)
            """
            One caveat: when the page content is not centered, the first call passes the page boundry, and mid_x is then not the center line.
            So compute the tightest boundry here first and derive mid_x from it.
            """
            boundry_real_x0, boundry_real_x1 = min([bbox[X0_IDX] for bbox in all_bboxes]), max([bbox[X1_IDX] for bbox in all_bboxes])
            mid_x = (boundry_real_x0+boundry_real_x1)/2
            # check whether the content of this box crosses the center line
            # it must reach at least two character widths past it
            is_cross_boundry_mid_line = min(mid_x-bbox[X0_IDX], bbox[X1_IDX]-mid_x) > avg_font_size*2
            """
            check condition 2
            """
            is_belong_to_col = False
            """
            Check whether the column above can absorb this bbox. Method:
            1. the region above is non-empty and not row-exclusive, and
            2. from the previous horizontal split (max y=y1) down to this bbox, the [min_x0, max_x1] of the leftmost bboxes covers this box's [x0, x1]
            """
            """
            search upward iteratively within [bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]
            """
            # first fix the y range above
            b_y0, b_y1 = last_h_split_line_y1, bbox[Y0_IDX]
            # then, starting from this box, walk upward collecting every box that intersects it in x
            box_to_check = [bound_x0, b_y0, bound_x1, b_y1]
            bbox_in_bound_check = get_bbox_in_boundry(all_bboxes, box_to_check)

            bboxes_on_top = []
            virtual_box = bbox
            while True:
                b_on_top = find_all_top_bbox_direct(virtual_box, bbox_in_bound_check)
                if b_on_top is not None:
                    bboxes_on_top.append(b_on_top)
                    virtual_box = [min([virtual_box[X0_IDX], b_on_top[X0_IDX]]), min(virtual_box[Y0_IDX], b_on_top[Y0_IDX]), max([virtual_box[X1_IDX], b_on_top[X1_IDX]]), b_y1]
                else:
                    break

            # then take the min x0 and max x1 of those boxes
            if len(bboxes_on_top) > 0 and len(bboxes_on_top) != len(bbox_in_bound_check):  # virtual_box may have grown to fill the whole region, in which case it can no longer belong to a single column
                min_x0, max_x1 = virtual_box[X0_IDX], virtual_box[X1_IDX]
                # then, somewhat coarsely, check whether min_x0/max_x1 intersect any box inside [bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]

                if not any([b[X0_IDX] <= min_x0-1 <= b[X1_IDX] or b[X0_IDX] <= max_x1+1 <= b[X1_IDX] for b in bbox_in_bound_check]):
                    # neither side can be extended into a row; for now only the top is checked TODO
                    top_nearest_bbox = find_all_top_bbox_direct(bbox, bboxes)
                    bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, bboxes)
                    if not any([
                        top_nearest_bbox is not None and (find_all_left_bbox_direct(top_nearest_bbox, bboxes) is None and find_all_right_bbox_direct(top_nearest_bbox, bboxes) is None),
                        bottom_nearest_bbox is not None and (find_all_left_bbox_direct(bottom_nearest_bbox, bboxes) is None and find_all_right_bbox_direct(bottom_nearest_bbox, bboxes) is None),
                        top_nearest_bbox is None or bottom_nearest_bbox is None
                    ]):
                        is_belong_to_col = True

            # check whether the column below can absorb it TODO

            """
            Why is there no is_cross_boundry_mid_line condition here?
            Some journals really do have two columns of unequal width.
            """
            if not is_belong_to_col or is_cross_boundry_mid_line:
                bbox[X0_EXT_IDX] = bound_x0
                bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
                bbox[X1_EXT_IDX] = bound_x1
                bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
                last_h_split_line_y1 = bbox[Y1_IDX]  # move the split line down
            else:
                continue
    """
    The row-exclusive bboxes now reach the given boundary; use that boundary condition to merge consecutive bboxes into a group,
    then merge all consecutive horizontal bboxes.
    """
    all_bboxes.sort(key=lambda x: x[Y0_IDX])
    h_bboxes = []
    h_bbox_group = []

    for bbox in all_bboxes:
        if bbox[X0_EXT_IDX] == bound_x0 and bbox[X1_EXT_IDX] == bound_x1:
            h_bbox_group.append(bbox)
        else:
            if len(h_bbox_group) > 0:
                h_bboxes.append(h_bbox_group)
                h_bbox_group = []
    # the last group
    if len(h_bbox_group) > 0:
        h_bboxes.append(h_bbox_group)

    """
    h_bboxes now holds all the groups; each group is a list
    compute each group in h_bboxes and put it back into sorted_layouts
    """
    h_layouts = []
    for gp in h_bboxes:
        gp.sort(key=lambda x: x[Y0_IDX])
        # compute the layout_bbox of this group: the smallest x0, y0 and the largest x1, y1
        x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
        h_layouts.append([x0, y0, x1, y1, LAYOUT_H])  # a horizontal layout

    """
    next, use the y0, y1 of these consecutive horizontal layout_bboxes to split the remaining boxes into several horizontal bands
    """
    h_split_lines = [bound_y0]
    for gp in h_bboxes:  # gp is a list[bbox_list]
        y0, y1 = gp[0][1], gp[-1][3]
        h_split_lines.append(y0)
        h_split_lines.append(y1)
    h_split_lines.append(bound_y1)

    unsplited_bboxes = []
    for i in range(0, len(h_split_lines), 2):
        start_y0, start_y1 = h_split_lines[i:i+2]
        # collect the other bboxes between [start_y0, start_y1]; they form one unsplit block
        bboxes_in_block = [bbox for bbox in all_bboxes if bbox[Y0_IDX] >= start_y0 and bbox[Y1_IDX] <= start_y1]
        unsplited_bboxes.append(bboxes_in_block)
    # then add the unprocessed parts to h_layouts
    for bboxes_in_block in unsplited_bboxes:
        if len(bboxes_in_block) == 0:
            continue
        x0, y0, x1, y1 = bound_x0, min([bbox[Y0_IDX] for bbox in bboxes_in_block]), bound_x1, max([bbox[Y1_IDX] for bbox in bboxes_in_block])
        h_layouts.append([x0, y0, x1, y1, LAYOUT_UNPROC])

    h_layouts.sort(key=lambda x: x[1])  # sort by y0, i.e. from top to bottom

    """
    convert into the following format and return
    """
    for layout in h_layouts:
        sorted_layout_blocks.append({
            "layout_bbox": layout[:4],
            "layout_label": layout[4],
            "sub_layout": [],
        })
    return sorted_layout_blocks

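# Illustrative sketch of the structure _horizontal_split returns (made-up numbers):
# one horizontal band followed by a region left for further, vertical, processing.
_EXAMPLE_SPLIT_RESULT = [
    {"layout_bbox": [0, 0, 612, 80], "layout_label": LAYOUT_H, "sub_layout": []},
    {"layout_bbox": [0, 80, 612, 792], "layout_label": LAYOUT_UNPROC, "sub_layout": []},
]
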
###############################################################################################
#
# processing in the vertical direction
#
#
###############################################################################################
def _vertical_align_split_v1(bboxes: list, boundry: tuple) -> list:
    """
    Compute vertical alignment and split the bboxes into layouts; handles the column-wise split of a multi-row column.
    Whatever cannot be fully split is returned as a layout with layout_label "u".
    -----------------------
    |        |       |
    |        |       |
    |        |       |
    |        |       |
    -------------------------
    this function splits the layout above into 2 columns
    """
    sorted_layout_blocks = []  # the value returned at the end
    new_boundry = [boundry[0], boundry[1], boundry[2], boundry[3]]

    v_blocks = []
    """
    first cut from left to right
    """
    while True:
        all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
        left_edge_bboxes = get_left_edge_bboxes(all_bboxes)
        if len(left_edge_bboxes) == 0:
            break
        right_split_line_x1 = max([bbox[X1_IDX] for bbox in left_edge_bboxes])+1
        # then check whether this line intersects or touches the left edge of any other bbox
        if any([bbox[X0_IDX] <= right_split_line_x1 <= bbox[X1_IDX] for bbox in all_bboxes]):
            # the vertical split line cuts through some boxes, so a full vertical split is impossible
            break
        else:  # one column split off successfully
            # take the leftmost bbox edge as the layout's x0
            layout_x0 = min([bbox[X0_IDX] for bbox in left_edge_bboxes])  # mainly so the drawing keeps some spacing
            v_blocks.append([layout_x0, new_boundry[1], right_split_line_x1, new_boundry[3], LAYOUT_V])
            new_boundry[0] = right_split_line_x1  # move the boundary

    """
    then cut from right to left; whatever still cannot be fully split is returned as a layout with layout_label "u"
    """
    unsplited_block = []
    while True:
        all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
        right_edge_bboxes = get_right_edge_bboxes(all_bboxes)
        if len(right_edge_bboxes) == 0:
            break
        left_split_line_x0 = min([bbox[X0_IDX] for bbox in right_edge_bboxes])-1
        # then check whether this line intersects or touches the left edge of any other bbox
        if any([bbox[X0_IDX] <= left_split_line_x0 <= bbox[X1_IDX] for bbox in all_bboxes]):
            # this is what remains
            unsplited_block.append([new_boundry[0], new_boundry[1], new_boundry[2], new_boundry[3], LAYOUT_UNPROC])
            break
        else:
            # take the rightmost bbox edge as the layout's x1
            layout_x1 = max([bbox[X1_IDX] for bbox in right_edge_bboxes])
            v_blocks.append([left_split_line_x0, new_boundry[1], layout_x1, new_boundry[3], LAYOUT_V])
            new_boundry[2] = left_split_line_x0  # move the right boundary

    """
    finally assemble everything into the layout format and return
    """
    for block in v_blocks:
        sorted_layout_blocks.append({
            "layout_bbox": block[:4],
            "layout_label": block[4],
            "sub_layout": [],
        })
    for block in unsplited_block:
        sorted_layout_blocks.append({
            "layout_bbox": block[:4],
            "layout_label": block[4],
            "sub_layout": [],
        })

    # sort by x0
    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
    return sorted_layout_blocks

+ 
+ def _vertical_align_split_v2(bboxes: list, boundry: tuple) -> list:
+     """
+     Improved version of _vertical_align_split. The original algorithm treats a box
+     in the second column as part of the left column whenever nothing blocks it on
+     the left, so an entire multi-column layout can be recognized as one column.
+     This version starts from the top-left box and looks downwards, expanding
+     w_x0/w_x1 until the window can grow no further or the bottom boundary is reached.
+     """
+     sorted_layout_blocks = []  # the final return value
+     new_boundry = [boundry[0], boundry[1], boundry[2], boundry[3]]
+     bad_boxes = []  # boxes that a split line cut through
+     v_blocks = []
+     while True:
+         all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
+         if len(all_bboxes) == 0:
+             break
+         left_top_box = min(all_bboxes, key=lambda x: (x[X0_IDX], x[Y0_IDX]))  # TODO: should be hardened to verify the box really sits in the first column
+         start_box = [left_top_box[X0_IDX], left_top_box[Y0_IDX], left_top_box[X1_IDX], left_top_box[Y1_IDX]]
+         w_x0, w_x1 = left_top_box[X0_IDX], left_top_box[X1_IDX]
+         """
+         Walk down from this box to the nearest box below it, expanding w_x0, w_x1.
+         After an expansion the window gets wider; the line x = w_x1 is then used to
+         test whether any box inside the boundary intersects it. If one does, the
+         window cannot grow any further. Once it stops growing, check whether the
+         bottom boundary was reached:
+         1. reached: update the left boundary and split off the next column;
+         2. not reached: start splitting from the right side in the loop below.
+         """
+         while left_top_box is not None:  # search downwards
+             virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
+             left_top_box = find_bottom_bbox_direct_from_left_edge(virtual_box, all_bboxes)
+             if left_top_box:
+                 w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max([virtual_box[X1_IDX], left_top_box[X1_IDX]])
+         # In case the initial box sits in the middle of the column, also look upwards.
+         start_box = [w_x0, start_box[Y0_IDX], w_x1, start_box[Y1_IDX]]  # widen it first for robustness
+         left_top_box = find_top_bbox_direct_from_left_edge(start_box, all_bboxes)
+         while left_top_box is not None:  # search upwards
+             virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
+             left_top_box = find_top_bbox_direct_from_left_edge(virtual_box, all_bboxes)
+             if left_top_box:
+                 w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max([virtual_box[X1_IDX], left_top_box[X1_IDX]])
+ 
+         # check for intersections
+         if any([bbox[X0_IDX] <= w_x1 + 1 <= bbox[X1_IDX] for bbox in all_bboxes]):
+             for b in all_bboxes:
+                 if b[X0_IDX] <= w_x1 + 1 <= b[X1_IDX]:
+                     bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
+             break
+         else:  # one column was split off successfully
+             v_blocks.append([w_x0, new_boundry[1], w_x1, new_boundry[3], LAYOUT_V])
+             new_boundry[0] = w_x1  # update the boundary
+ 
+     """
+     Next, scan starting from the top-right box.
+     """
+     w_x0, w_x1 = 0, 0
+     unsplited_block = []
+     while True:
+         all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
+         if len(all_bboxes) == 0:
+             break
+         # First find the boxes with the largest x1,
+         bbox_list_sorted = sorted(all_bboxes, key=lambda bbox: bbox[X1_IDX], reverse=True)
+         # then among those pick the one with the smallest y0.
+         bigest_x1 = bbox_list_sorted[0][X1_IDX]
+         boxes_with_bigest_x1 = [bbox for bbox in bbox_list_sorted if bbox[X1_IDX] == bigest_x1]  # i.e. the rightmost ones
+         right_top_box = min(boxes_with_bigest_x1, key=lambda bbox: bbox[Y0_IDX])  # the one with the smallest y0
+         start_box = [right_top_box[X0_IDX], right_top_box[Y0_IDX], right_top_box[X1_IDX], right_top_box[Y1_IDX]]
+         w_x0, w_x1 = right_top_box[X0_IDX], right_top_box[X1_IDX]
+ 
+         while right_top_box is not None:
+             virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
+             right_top_box = find_bottom_bbox_direct_from_right_edge(virtual_box, all_bboxes)
+             if right_top_box:
+                 w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max([w_x1, right_top_box[X1_IDX]])
+         # then scan upwards
+         start_box = [w_x0, start_box[Y0_IDX], w_x1, start_box[Y1_IDX]]  # widen it first for robustness
+         right_top_box = find_top_bbox_direct_from_right_edge(start_box, all_bboxes)
+         while right_top_box is not None:
+             virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
+             right_top_box = find_top_bbox_direct_from_right_edge(virtual_box, all_bboxes)
+             if right_top_box:
+                 w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max([w_x1, right_top_box[X1_IDX]])
+ 
+         # Check for intersections with other boxes: if the vertical split line crosses
+         # some boxes, a clean vertical split is impossible.
+         if any([bbox[X0_IDX] <= w_x0 - 1 <= bbox[X1_IDX] for bbox in all_bboxes]):
+             unsplited_block.append([new_boundry[0], new_boundry[1], new_boundry[2], new_boundry[3], LAYOUT_UNPROC])
+             for b in all_bboxes:
+                 if b[X0_IDX] <= w_x0 - 1 <= b[X1_IDX]:
+                     bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
+             break
+         else:  # one column was split off successfully
+             v_blocks.append([w_x0, new_boundry[1], w_x1, new_boundry[3], LAYOUT_V])
+             new_boundry[2] = w_x0
+ 
+     """Convert the data structure."""
+     for block in v_blocks:
+         sorted_layout_blocks.append({
+             "layout_bbox": block[:4],
+             "layout_label": block[4],
+             "sub_layout": [],
+         })
+ 
+     for block in unsplited_block:
+         sorted_layout_blocks.append({
+             "layout_bbox": block[:4],
+             "layout_label": block[4],
+             "sub_layout": [],
+             "bad_boxes": bad_boxes  # record the boxes a split line cut through
+         })
+ 
+     # sort by x0
+     sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
+     return sorted_layout_blocks
+ 
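
The core of the v2 algorithm is growing a virtual column window [w_x0, w_x1] while walking down box by box. A self-contained toy sketch of that loop, where the boxes and the next_below() helper are hypothetical stand-ins for the module's find_bottom_bbox_direct_from_left_edge():

    boxes = [(50, 0, 180, 20), (40, 30, 170, 50), (60, 60, 200, 80)]  # x0, y0, x1, y1

    def next_below(virtual, candidates):
        """Return the topmost box strictly below `virtual` that overlaps it in x."""
        below = [b for b in candidates
                 if b[1] >= virtual[3] and b[0] < virtual[2] and b[2] > virtual[0]]
        return min(below, key=lambda b: b[1]) if below else None

    cur = boxes[0]
    w_x0, w_x1 = cur[0], cur[2]
    while cur is not None:
        virtual = (w_x0, cur[1], w_x1, cur[3])
        cur = next_below(virtual, boxes)
        if cur:
            w_x0, w_x1 = min(w_x0, cur[0]), max(w_x1, cur[2])

    print(w_x0, w_x1)  # 40 200 -- the column window now covers all three boxes
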
+ 
+ def _try_horizontal_mult_column_split(bboxes: list, boundry: tuple) -> list:
+     """
+     Try a horizontal split; if nothing can be split off, return the area as a BAD_LAYOUT.
+     ------------------
+     |        |       |
+     ------------------
+     |    |      |    |   <- the scenario this function should split
+     ------------------
+     |        |       |
+     |        |       |
+     """
+     pass  # TODO: not implemented yet
+ 
+ 
+ def _vertical_split(bboxes: list, boundry: tuple) -> list:
+     """
+     Split into blocks along the vertical direction.
+     In this version, if a vertical split is impossible the area is returned as a BAD_LAYOUT.
+ 
+     --------------------------
+     |     |    |       |
+     |     |    |       |
+     |     |            |
+     |     |            |   <- a column like this is what this function splits off
+     |     |            |
+     |     |    |       |
+     |     |    |       |
+     -------------------------
+     """
+     sorted_layout_blocks = []  # the final return value
+ 
+     bound_x0, bound_y0, bound_x1, bound_y1 = boundry
+     all_bboxes = get_bbox_in_boundry(bboxes, boundry)
+     """
+     all_bboxes = fix_vertical_bbox_pos(all_bboxes)  # resolve vertical overlaps
+     all_bboxes = fix_hor_bbox_pos(all_bboxes)       # resolve horizontal overlaps
+ 
+     These two lines are disabled for now: formula and table detection is not yet
+     mature, so far too many text blocks would take part in the computation and the
+     time cost would be too high. What they do: when bboxes overlap each other, the
+     smaller box is shrunk to remove the overlap, which gives positive feedback to
+     layout splitting.
+     """
+ 
+     # all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1 - bound_x0), abs(bound_y1 - bound_y0))  # rough pre-sort, based on direct occlusion
+     """
+     First extend, in the vertical direction, every bbox that occupies a column exclusively.
+     """
+     for bbox in all_bboxes:
+         top_nearest_bbox = find_all_top_bbox_direct(bbox, all_bboxes)  # non-extended line
+         bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, all_bboxes)
+         if top_nearest_bbox is None and bottom_nearest_bbox is None and not any([b[X0_IDX] < bbox[X1_IDX] < b[X1_IDX] or b[X0_IDX] < bbox[X0_IDX] < b[X1_IDX] for b in all_bboxes]):  # occupies a column exclusively and overlaps nothing else
+             bbox[X0_EXT_IDX] = bbox[X0_IDX]
+             bbox[Y0_EXT_IDX] = bound_y0
+             bbox[X1_EXT_IDX] = bbox[X1_IDX]
+             bbox[Y1_EXT_IDX] = bound_y1
+ 
+     """
+     The exclusive-column boxes have now been extended to the boundary. Use that
+     boundary condition to merge consecutive bboxes into groups, then merge all
+     vertically consecutive bboxes.
+     """
+     all_bboxes.sort(key=lambda x: x[X0_IDX])
+     # fix: do not merge the vertical columns into one row here, because the minimal
+     # blocks handed downstream must always be readable top-to-bottom without extra logic.
+     v_bboxes = []
+     for box in all_bboxes:
+         if box[Y0_EXT_IDX] == bound_y0 and box[Y1_EXT_IDX] == bound_y1:
+             v_bboxes.append(box)
+ 
+     """
+     Now v_bboxes holds all the candidates; compute a layout for each one and put
+     it back into sorted_layouts.
+     """
+     v_layouts = []
+     for vbox in v_bboxes:
+         # The layout_bbox is the smallest x0, y0 and the largest x1, y1.
+         x0, y0, x1, y1 = vbox[X0_EXT_IDX], vbox[Y0_EXT_IDX], vbox[X1_EXT_IDX], vbox[Y1_EXT_IDX]
+         v_layouts.append([x0, y0, x1, y1, LAYOUT_V])  # vertical layout
+ 
+     """
+     Next, use the x0/x1 of these consecutive vertical layout_bboxes to cut the
+     remaining area vertically into several parts.
+     """
+     v_split_lines = [bound_x0]
+     for gp in v_bboxes:
+         x0, x1 = gp[X0_IDX], gp[X1_IDX]
+         v_split_lines.append(x0)
+         v_split_lines.append(x1)
+     v_split_lines.append(bound_x1)
+ 
+     unsplited_bboxes = []
+     for i in range(0, len(v_split_lines), 2):
+         start_x0, start_x1 = v_split_lines[i:i + 2]
+         # Find the other bboxes inside [start_x0, start_x1]; together they form one unsplit block.
+         bboxes_in_block = [bbox for bbox in all_bboxes if bbox[X0_IDX] >= start_x0 and bbox[X1_IDX] <= start_x1]
+         unsplited_bboxes.append(bboxes_in_block)
+     # Then add the unprocessed parts to v_layouts.
+     for bboxes_in_block in unsplited_bboxes:
+         if len(bboxes_in_block) == 0:
+             continue
+         x0, y0, x1, y1 = min([bbox[X0_IDX] for bbox in bboxes_in_block]), bound_y0, max([bbox[X1_IDX] for bbox in bboxes_in_block]), bound_y1
+         v_layouts.append([x0, y0, x1, y1, LAYOUT_UNPROC])  # no reliable layout could be derived for this area
+ 
+     v_layouts.sort(key=lambda x: x[0])  # sort by x0, i.e. left-to-right order
+ 
+     for layout in v_layouts:
+         sorted_layout_blocks.append({
+             "layout_bbox": layout[:4],
+             "layout_label": layout[4],
+             "sub_layout": [],
+         })
+ 
+     """
+     At this point the vertical pass produced two kinds of layouts: exclusive single
+     columns and unprocessed areas. Now split the unprocessed ones vertically; this
+     split should yield stacked layouts shaped like the character "吕".
+     """
+     for i, layout in enumerate(sorted_layout_blocks):
+         if layout['layout_label'] == LAYOUT_UNPROC:
+             x0, y0, x1, y1 = layout['layout_bbox']
+             v_split_layouts = _vertical_align_split_v2(bboxes, [x0, y0, x1, y1])
+             sorted_layout_blocks[i] = {
+                 "layout_bbox": [x0, y0, x1, y1],
+                 "layout_label": LAYOUT_H,
+                 "sub_layout": v_split_layouts
+             }
+             layout['layout_label'] = LAYOUT_H  # split by vertical lines into a horizontal layout
+ 
+     return sorted_layout_blocks
+ 
+ 
+ def split_layout(bboxes: list, boundry: tuple, page_num: int) -> list:
+     """
+     Split the bboxes into layouts.
+     return:
+     [
+         {
+             "layout_bbox": [x0, y0, x1, y1],
+             "layout_label": "u|v|h|b",  # unprocessed | vertical | horizontal | BAD_LAYOUT
+             "sub_layout": []  # each element is [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], in reading order
+         }
+     ]
+     example:
+     [
+         {
+             "layout_bbox": [0, 0, 100, 100],
+             "layout_label": "u|v|h|b",
+             "sub_layout": []
+         },
+         {
+             "layout_bbox": [0, 0, 100, 100],
+             "layout_label": "u|v|h|b",
+             "sub_layout": [
+                 {
+                     "layout_bbox": [0, 0, 100, 100],
+                     "layout_label": "u|v|h|b",
+                     "content_bboxes": [
+                         [],
+                         [],
+                         []
+                     ]
+                 },
+                 {
+                     "layout_bbox": [0, 0, 100, 100],
+                     "layout_label": "u|v|h|b",
+                     "sub_layout": []
+                 }
+             ]
+         }
+     ]
+     """
+     sorted_layouts = []  # the final result
+ 
+     boundry_x0, boundry_y0, boundry_x1, boundry_y1 = boundry
+     if len(bboxes) <= 1:
+         return [
+             {
+                 "layout_bbox": [boundry_x0, boundry_y0, boundry_x1, boundry_y1],
+                 "layout_label": LAYOUT_V,
+                 "sub_layout": []
+             }
+         ]
+ 
+     """
+     Split horizontally first, then vertically.
+     """
+     bboxes = paper_bbox_sort(bboxes, boundry_x1 - boundry_x0, boundry_y1 - boundry_y0)
+     sorted_layouts = _horizontal_split(bboxes, boundry)  # layouts produced by the horizontal split
+     for i, layout in enumerate(sorted_layouts):
+         x0, y0, x1, y1 = layout['layout_bbox']
+         layout_type = layout['layout_label']
+         if layout_type == LAYOUT_UNPROC:  # not an exclusive single row, so a vertical split is needed
+             v_split_layouts = _vertical_split(bboxes, [x0, y0, x1, y1])
+ 
+             """
+             One logical caveat: if this call produced only a single column layout, the
+             split is beyond what the algorithm can handle, because we assume the incoming
+             boxes already had all full rows stripped off, so more than one column must
+             come back here. If only one layout is returned and it holds multiple boxes,
+             that layout cannot be split and is marked LAYOUT_UNPROC.
+             """
+             layout_label = LAYOUT_V
+             if len(v_split_layouts) == 1:
+                 if len(v_split_layouts[0]['sub_layout']) == 0:
+                     layout_label = LAYOUT_UNPROC
+                     # logger.warning(f"WARNING: pageno={page_num}, unsplittable layout: ", v_split_layouts)
+ 
+             """
+             Assemble the final layout.
+             """
+             sorted_layouts[i] = {
+                 "layout_bbox": [x0, y0, x1, y1],
+                 "layout_label": layout_label,
+                 "sub_layout": v_split_layouts
+             }
+             layout['layout_label'] = LAYOUT_H
+ 
+     """
+     The horizontal and vertical passes are both done, yet some areas may remain
+     unprocessed because neither direction could split them. As a last step they
+     should go through _try_horizontal_mult_column_split, a joint horizontal split
+     across multiple blocks; if that cannot split them either, they are finally
+     returned as BAD_LAYOUT.
+     """
+     # TODO
+ 
+     return sorted_layouts
+ 
+ 
+ def get_bboxes_layout(all_boxes: list, boundry: tuple, page_id: int):
+     """
+     Sort the boxes according to the layout ordering.
+     return:
+     [
+         {
+             "layout_bbox": [x0, y0, x1, y1],
+             "layout_label": "u|v|h|b",  # unprocessed | vertical | horizontal | BAD_LAYOUT
+         },
+     ]
+     """
+     def _preorder_traversal(layout):
+         """
+         Collect the leaf nodes of sorted_layouts (the nodes with len(sub_layout) == 0)
+         in preorder, i.e. top-to-bottom, left-to-right.
+         """
+         sorted_layout_blocks = []
+         for node in layout:
+             sub_layout = node['sub_layout']
+             if len(sub_layout) == 0:
+                 sorted_layout_blocks.append(node)
+             else:
+                 s = _preorder_traversal(sub_layout)
+                 sorted_layout_blocks.extend(s)
+         return sorted_layout_blocks
+     # -------------------------------------------------------------------------------------------------------------------------
+     sorted_layouts = split_layout(all_boxes, boundry, page_id)  # first split into layouts, producing a tree
+     total_sorted_layout_blocks = _preorder_traversal(sorted_layouts)
+     return total_sorted_layout_blocks, sorted_layouts
+ 
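
The tree returned by split_layout is flattened by a plain preorder walk, which is what puts the leaf layouts into reading order. A minimal sketch on a hand-built tree (labels follow the "u|v|h|b" convention from the docstrings above; the bbox values are made up):

    tree = [
        {"layout_bbox": [0, 0, 100, 30], "layout_label": "v", "sub_layout": []},
        {"layout_bbox": [0, 30, 100, 90], "layout_label": "h", "sub_layout": [
            {"layout_bbox": [0, 30, 50, 90], "layout_label": "v", "sub_layout": []},
            {"layout_bbox": [50, 30, 100, 90], "layout_label": "v", "sub_layout": []},
        ]},
    ]

    def preorder(nodes):
        leaves = []
        for node in nodes:
            if node["sub_layout"]:
                leaves.extend(preorder(node["sub_layout"]))
            else:
                leaves.append(node)
        return leaves

    print([leaf["layout_bbox"] for leaf in preorder(tree)])
    # [[0, 0, 100, 30], [0, 30, 50, 90], [50, 30, 100, 90]]
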
+ 
+ def get_columns_cnt_of_layout(layout_tree):
+     """
+     Get the column count of a layout.
+     """
+     max_width_list = [0]  # seed with one element so max()/min() cannot fail on an empty list
+ 
+     for items in layout_tree:  # count columns per (horizontal) band; a horizontal band counts as one column
+         layout_type = items['layout_label']
+         sub_layouts = items['sub_layout']
+         if len(sub_layouts) == 0:
+             max_width_list.append(1)
+         else:
+             if layout_type == LAYOUT_H:
+                 max_width_list.append(1)
+             else:
+                 width = 0
+                 for l in sub_layouts:
+                     if len(l['sub_layout']) == 0:
+                         width += 1
+                     else:
+                         for lay in l['sub_layout']:
+                             width += get_columns_cnt_of_layout([lay])
+                 max_width_list.append(width)
+ 
+     return max(max_width_list)
+ 
+ 
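
A quick hedged check of the column counter on a two-column tree. The import path comes from this commit; the bare "v" string stands in for the module's LAYOUT_V constant (the docstrings above use the "u|v|h|b" convention), and the package plus its dependencies are assumed to be importable:

    from magic_pdf.layout.layout_sort import get_columns_cnt_of_layout

    two_col_tree = [
        {"layout_label": "v", "sub_layout": [
            {"layout_label": "v", "sub_layout": []},
            {"layout_label": "v", "sub_layout": []},
        ]},
    ]
    print(get_columns_cnt_of_layout(two_col_tree))  # expected: 2
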
+ def sort_with_layout(bboxes: list, page_width, page_height) -> tuple:
+     """
+     Input: a list of bboxes.
+     First split the page into layouts, then sort the bboxes within them.
+     Returns the sorted bboxes.
+     """
+ 
+     new_bboxes = []
+     for box in bboxes:
+         # new_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'text', None, None, None, None])
+         new_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'text', None, None, None, None, box[4]])
+ 
+     layout_bboxes, _ = get_bboxes_layout(new_bboxes, [0, 0, page_width, page_height], 0)
+     if any([lay['layout_label'] == LAYOUT_UNPROC for lay in layout_bboxes]):
+         logger.warning("drop this pdf, reason: complex layout")
+         return None, None
+ 
+     sorted_bboxes = []
+     # Use each layout bbox to frame a set of boxes, then sort them.
+     for layout in layout_bboxes:
+         lbox = layout['layout_bbox']
+         bbox_in_layout = get_bbox_in_boundry(new_bboxes, lbox)
+         sorted_bbox = paper_bbox_sort(bbox_in_layout, lbox[2] - lbox[0], lbox[3] - lbox[1])
+         sorted_bboxes.extend(sorted_bbox)
+ 
+     return sorted_bboxes, layout_bboxes
+ 
+ 
+ def sort_text_block(text_block, layout_bboxes):
+     """
+     Sort the text blocks of one page.
+     """
+     sorted_text_bbox = []
+     all_text_bbox = []
+     # build a box => text mapping
+     box_to_text = {}
+     for blk in text_block:
+         box = blk['bbox']
+         box_to_text[(box[0], box[1], box[2], box[3])] = blk
+         all_text_bbox.append(box)
+ 
+     # text_blocks_to_sort = []
+     # for box in box_to_text.keys():
+     #     text_blocks_to_sort.append([box[0], box[1], box[2], box[3], None, None, None, 'text', None, None, None, None])
+ 
+     # Sort the text blocks following the order of layout_bboxes.
+     for layout in layout_bboxes:
+         layout_box = layout['layout_bbox']
+         text_bbox_in_layout = get_bbox_in_boundry(all_text_bbox, [layout_box[0] - 1, layout_box[1] - 1, layout_box[2] + 1, layout_box[3] + 1])
+         # sorted_bbox = paper_bbox_sort(text_bbox_in_layout, layout_box[2] - layout_box[0], layout_box[3] - layout_box[1])
+         text_bbox_in_layout.sort(key=lambda x: x[1])  # boxes inside one layout are sorted top-to-bottom by y0
+         # sorted_bbox = [[b] for b in text_blocks_to_sort]
+         for sb in text_bbox_in_layout:
+             sorted_text_bbox.append(box_to_text[(sb[0], sb[1], sb[2], sb[3])])
+ 
+     return sorted_text_bbox
magic_pdf/layout/layout_spiler_recog.py ADDED
@@ -0,0 +1,101 @@
+ """
+ Find the horizontal rules and color blocks that can split a layout.
+ """
+ 
+ import os
+ from magic_pdf.libs.commons import fitz
+ from magic_pdf.libs.boxbase import _is_in_or_part_overlap
+ 
+ 
+ def __rect_filter_by_width(rect, page_w, page_h):
+     # keep only rectangles that straddle the vertical midline of the page
+     mid_x = page_w / 2
+     if rect[0] < mid_x < rect[2]:
+         return True
+     return False
+ 
+ 
+ def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
+     """
+     The rectangle must not sit on a table or an image.
+     """
+     for box in image_bboxes:
+         if _is_in_or_part_overlap(rect, box):
+             return False
+ 
+     for box in table_bboxes:
+         if _is_in_or_part_overlap(rect, box):
+             return False
+ 
+     return True
+ 
+ 
+ def __debug_show_page(page, bboxes1: list, bboxes2: list, bboxes3: list):
+     save_path = "./tmp/debug.pdf"
+     if os.path.exists(save_path):
+         # delete the existing file
+         os.remove(save_path)
+     # create a new, empty PDF file
+     doc = fitz.open('')
+ 
+     width = page.rect.width
+     height = page.rect.height
+     new_page = doc.new_page(width=width, height=height)
+ 
+     for bbox in bboxes1:
+         # draw the original box
+         rect = fitz.Rect(*bbox[0:4])
+         shape = new_page.new_shape()
+         shape.draw_rect(rect)
+         shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
+         shape.commit()
+ 
+     for bbox in bboxes2:
+         # draw the original box
+         rect = fitz.Rect(*bbox[0:4])
+         shape = new_page.new_shape()
+         shape.draw_rect(rect)
+         shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
+         shape.commit()
+ 
+     for bbox in bboxes3:
+         # draw the original box
+         rect = fitz.Rect(*bbox[0:4])
+         shape = new_page.new_shape()
+         shape.draw_rect(rect)
+         shape.finish(color=fitz.pdfcolor['red'], fill=None)
+         shape.commit()
+ 
+     parent_dir = os.path.dirname(save_path)
+     if not os.path.exists(parent_dir):
+         os.makedirs(parent_dir)
+ 
+     doc.save(save_path)
+     doc.close()
+ 
+ 
+ def get_spilter_of_page(page, image_bboxes, table_bboxes):
+     """
+     Collect the color blocks and horizontal rules of a page.
+     """
+     cdrawings = page.get_cdrawings()
+ 
+     spilter_bbox = []
+     for block in cdrawings:
+         fill = block.get('fill')
+         if fill and fill != (1.0, 1.0, 1.0):  # ignore missing and pure-white fills
+             rect = block['rect']
+             if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
+                 spilter_bbox.append(list(rect))
+ 
+     """Filter and repair these boxes: some rectangles have zero or negative height,
+     which sends the layout computation into an infinite loop. Any non-positive
+     height is normalized to 1."""
+     for box in spilter_bbox:
+         if box[3] - box[1] <= 0:
+             box[3] = box[1] + 1
+ 
+     # __debug_show_page(page, spilter_bbox, [], [])
+ 
+     return spilter_bbox
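
The width filter only keeps fills that straddle the page's vertical midline, which is what a full-width divider does. A tiny hedged check with made-up rectangles:

    page_w = 600
    mid_x = page_w / 2
    rects = [
        (50, 100, 550, 102),   # full-width rule: crosses the midline -> kept
        (40, 200, 200, 220),   # sidebar block: entirely left of midline -> dropped
    ]
    kept = [r for r in rects if r[0] < mid_x < r[2]]
    print(kept)  # [(50, 100, 550, 102)]
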
magic_pdf/layout/mcol_sort.py ADDED
@@ -0,0 +1,336 @@
+ """
+ This is an advanced PyMuPDF utility for detecting multi-column pages.
+ It can be used in a shell script, or its main function can be imported and
+ invoked as described below.
+ 
+ Features
+ ---------
+ - Identify text belonging to (a variable number of) columns on the page.
+ - Text with different background color is handled separately, allowing for
+   easier treatment of side remarks, comment boxes, etc.
+ - Uses text block detection capability to identify text blocks and
+   uses the block bboxes as primary structuring principle.
+ - Supports ignoring footers via a footer margin parameter.
+ - Returns re-created text boundary boxes (integer coordinates), sorted ascending
+   by the top, then by the left coordinates.
+ 
+ Restrictions
+ -------------
+ - Only supporting horizontal, left-to-right text
+ - Returns a list of text boundary boxes - not the text itself. The caller is
+   expected to extract text from within the returned boxes.
+ - Text written above images is ignored altogether (option).
+ - This utility works as expected in most cases. The following situations cannot
+   be handled correctly:
+     * overlapping (non-disjoint) text blocks
+     * image captions are not recognized and are handled like normal text
+ 
+ Usage
+ ------
+ - As a CLI shell command use
+ 
+     python multi_column.py input.pdf footer_margin
+ 
+   Where footer_margin is the height of the bottom stripe to ignore on each page.
+   This code is intended to be modified according to your need.
+ 
+ - Use in a Python script as follows:
+ 
+   ----------------------------------------------------------------------------------
+   from multi_column import column_boxes
+ 
+   # for each page execute
+   bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
+ 
+   # bboxes is a list of fitz.IRect objects, sorted ascending by their y0,
+   # then x0 coordinates. Their text content can be extracted by all PyMuPDF
+   # get_text() variants, for instance the following:
+   for rect in bboxes:
+       print(page.get_text(clip=rect, sort=True))
+   ----------------------------------------------------------------------------------
+ """
+ import sys
+ from magic_pdf.libs.commons import fitz
+ 
+ 
+ def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
+     """Determine bboxes which wrap a column."""
+     paths = page.get_drawings()
+     bboxes = []
+ 
+     # path rectangles
+     path_rects = []
+ 
+     # image bboxes
+     img_bboxes = []
+ 
+     # bboxes of non-horizontal text
+     # avoid when expanding horizontal text boxes
+     vert_bboxes = []
+ 
+     # compute relevant page area
+     clip = +page.rect
+     clip.y1 -= footer_margin  # Remove footer area
+     clip.y0 += header_margin  # Remove header area
+ 
+     def can_extend(temp, bb, bboxlist):
+         """Determines whether rectangle 'temp' can be extended by 'bb'
+         without intersecting any of the rectangles contained in 'bboxlist'.
+ 
+         Items of bboxlist may be None if they have been removed.
+ 
+         Returns:
+             True if 'temp' has no intersections with items of 'bboxlist'.
+         """
+         for b in bboxlist:
+             if not intersects_bboxes(temp, vert_bboxes) and (
+                 b is None or b == bb or (temp & b).is_empty
+             ):
+                 continue
+             return False
+ 
+         return True
+ 
+     def in_bbox(bb, bboxes):
+         """Return 1-based number if a bbox contains bb, else return 0."""
+         for i, bbox in enumerate(bboxes):
+             if bb in bbox:
+                 return i + 1
+         return 0
+ 
+     def intersects_bboxes(bb, bboxes):
+         """Return True if a bbox intersects bb, else return False."""
+         for bbox in bboxes:
+             if not (bb & bbox).is_empty:
+                 return True
+         return False
+ 
+     def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
+         """Extend a bbox to the right page border.
+ 
+         Whenever there is no text to the right of a bbox, enlarge it up
+         to the right page border.
+ 
+         Args:
+             bboxes: (list[IRect]) bboxes to check
+             width: (int) page width
+             path_bboxes: (list[IRect]) bboxes with a background color
+             vert_bboxes: (list[IRect]) bboxes with vertical text
+             img_bboxes: (list[IRect]) bboxes of images
+         Returns:
+             Potentially modified bboxes.
+         """
+         for i, bb in enumerate(bboxes):
+             # do not extend text with background color
+             if in_bbox(bb, path_bboxes):
+                 continue
+ 
+             # do not extend text in images
+             if in_bbox(bb, img_bboxes):
+                 continue
+ 
+             # temp extends bb to the right page border
+             temp = +bb
+             temp.x1 = width
+ 
+             # do not cut through colored background or images
+             if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
+                 continue
+ 
+             # also, do not intersect other text bboxes
+             check = can_extend(temp, bb, bboxes)
+             if check:
+                 bboxes[i] = temp  # replace with enlarged bbox
+ 
+         return [b for b in bboxes if b is not None]
+ 
+     def clean_nblocks(nblocks):
+         """Do some elementary cleaning."""
+ 
+         # 1. remove any duplicate blocks.
+         blen = len(nblocks)
+         if blen < 2:
+             return nblocks
+         start = blen - 1
+         for i in range(start, -1, -1):
+             bb1 = nblocks[i]
+             bb0 = nblocks[i - 1]
+             if bb0 == bb1:
+                 del nblocks[i]
+ 
+         # 2. repair sequence in special cases:
+         # consecutive bboxes with almost same bottom value are sorted ascending
+         # by x-coordinate.
+         y1 = nblocks[0].y1  # first bottom coordinate
+         i0 = 0  # its index
+         i1 = -1  # index of last bbox with same bottom
+ 
+         # Iterate over bboxes, identifying segments with approx. same bottom value.
+         # Replace every segment by its sorted version.
+         for i in range(1, len(nblocks)):
+             b1 = nblocks[i]
+             if abs(b1.y1 - y1) > 10:  # different bottom
+                 if i1 > i0:  # segment length > 1? Sort it!
+                     nblocks[i0 : i1 + 1] = sorted(
+                         nblocks[i0 : i1 + 1], key=lambda b: b.x0
+                     )
+                 y1 = b1.y1  # store new bottom value
+                 i0 = i  # store its start index
+             i1 = i  # store current index
+         if i1 > i0:  # segment waiting to be sorted
+             nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
+         return nblocks
+ 
+     # extract vector graphics
+     for p in paths:
+         path_rects.append(p["rect"].irect)
+     path_bboxes = path_rects
+ 
+     # sort path bboxes by ascending top, then left coordinates
+     path_bboxes.sort(key=lambda b: (b.y0, b.x0))
+ 
+     # bboxes of images on page, no need to sort them
+     for item in page.get_images():
+         img_bboxes.extend(page.get_image_rects(item[0]))
+ 
+     # blocks of text on page
+     blocks = page.get_text(
+         "dict",
+         flags=fitz.TEXTFLAGS_TEXT,
+         clip=clip,
+     )["blocks"]
+ 
+     # Make block rectangles, ignoring non-horizontal text
+     for b in blocks:
+         bbox = fitz.IRect(b["bbox"])  # bbox of the block
+ 
+         # ignore text written upon images
+         if no_image_text and in_bbox(bbox, img_bboxes):
+             continue
+ 
+         # confirm first line to be horizontal
+         line0 = b["lines"][0]  # get first line
+         if line0["dir"] != (1, 0):  # only accept horizontal text
+             vert_bboxes.append(bbox)
+             continue
+ 
+         srect = fitz.EMPTY_IRECT()
+         for line in b["lines"]:
+             lbbox = fitz.IRect(line["bbox"])
+             text = "".join([s["text"].strip() for s in line["spans"]])
+             if len(text) > 1:
+                 srect |= lbbox
+         bbox = +srect
+ 
+         if not bbox.is_empty:
+             bboxes.append(bbox)
+ 
+     # Sort text bboxes by ascending background, top, then left coordinates
+     bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))
+ 
+     # Extend bboxes to the right where possible
+     bboxes = extend_right(
+         bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
+     )
+ 
+     # immediately return if no text found
+     if bboxes == []:
+         return []
+ 
+     # --------------------------------------------------------------------
+     # Join bboxes to establish some column structure
+     # --------------------------------------------------------------------
+     # the final block bboxes on page
+     nblocks = [bboxes[0]]  # pre-fill with first bbox
+     bboxes = bboxes[1:]  # remaining old bboxes
+ 
+     for i, bb in enumerate(bboxes):  # iterate old bboxes
+         check = False  # indicates unwanted joins
+ 
+         # check if bb can extend one of the new blocks
+         for j in range(len(nblocks)):
+             nbb = nblocks[j]  # a new block
+ 
+             # never join across columns
+             if bb is None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
+                 continue
+ 
+             # never join across different background colors
+             if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
+                 continue
+ 
+             temp = bb | nbb  # temporary extension of new block
+             check = can_extend(temp, nbb, nblocks)
+             if check:
+                 break
+ 
+         if not check:  # bb cannot be used to extend any of the new bboxes
+             nblocks.append(bb)  # so add it to the list
+             j = len(nblocks) - 1  # index of it
+             temp = nblocks[j]  # new bbox added
+ 
+         # check if some remaining bbox is contained in temp
+         check = can_extend(temp, bb, bboxes)
+         if not check:
+             nblocks.append(bb)
+         else:
+             nblocks[j] = temp
+         bboxes[i] = None
+ 
+     # do some elementary cleaning
+     nblocks = clean_nblocks(nblocks)
+ 
+     # return identified text bboxes
+     return nblocks
+ 
+ if __name__ == "__main__":
288
+ """Only for debugging purposes, currently.
289
+
290
+ Draw red borders around the returned text bboxes and insert
291
+ the bbox number.
292
+ Then save the file under the name "input-blocks.pdf".
293
+ """
294
+
295
+ # get the file name
296
+ filename = sys.argv[1]
297
+
298
+ # check if footer margin is given
299
+ if len(sys.argv) > 2:
300
+ footer_margin = int(sys.argv[2])
301
+ else: # use default vaue
302
+ footer_margin = 50
303
+
304
+ # check if header margin is given
305
+ if len(sys.argv) > 3:
306
+ header_margin = int(sys.argv[3])
307
+ else: # use default vaue
308
+ header_margin = 50
309
+
310
+ # open document
311
+ doc = fitz.open(filename)
312
+
313
+ # iterate over the pages
314
+ for page in doc:
315
+ # remove any geometry issues
316
+ page.wrap_contents()
317
+
318
+ # get the text bboxes
319
+ bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)
320
+
321
+ # prepare a canvas to draw rectangles and text
322
+ shape = page.new_shape()
323
+
324
+ # iterate over the bboxes
325
+ for i, rect in enumerate(bboxes):
326
+ shape.draw_rect(rect) # draw a border
327
+
328
+ # write sequence number
329
+ shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"])
330
+
331
+ # finish drawing / text with color red
332
+ shape.finish(color=fitz.pdfcolor["red"])
333
+ shape.commit() # store to the page
334
+
335
+ # save document with text bboxes
336
+ doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))
magic_pdf/libs/Constants.py ADDED
@@ -0,0 +1,11 @@
+ """
+ Custom fields at the span level.
+ """
+ # whether the span is merged across pages
+ CROSS_PAGE = "cross_page"
+ 
+ """
+ Custom fields at the block level.
+ """
+ # whether lines in the block were deleted
+ LINES_DELETED = "lines_deleted"
magic_pdf/libs/MakeContentConfig.py ADDED
@@ -0,0 +1,10 @@
+ class MakeMode:
+     MM_MD = "mm_markdown"
+     NLP_MD = "nlp_markdown"
+     STANDARD_FORMAT = "standard_format"
+ 
+ 
+ class DropMode:
+     WHOLE_PDF = "whole_pdf"
+     SINGLE_PAGE = "single_page"
+     NONE = "none"
magic_pdf/libs/ModelBlockTypeEnum.py ADDED
@@ -0,0 +1,9 @@
+ from enum import Enum
+ 
+ 
+ class ModelBlockTypeEnum(Enum):
+     TITLE = 0
+     PLAIN_TEXT = 1
+     ABANDON = 2
+     ISOLATE_FORMULA = 8
+     EMBEDDING = 13
+     ISOLATED = 14
magic_pdf/libs/__init__.py ADDED
File without changes
magic_pdf/libs/boxbase.py ADDED
@@ -0,0 +1,408 @@
+ import math
+ 
+ 
+ def _is_in_or_part_overlap(box1, box2) -> bool:
+     """
+     Whether two bboxes partially overlap or one contains the other.
+     """
+     if box1 is None or box2 is None:
+         return False
+ 
+     x0_1, y0_1, x1_1, y1_1 = box1
+     x0_2, y0_2, x1_2, y1_2 = box2
+ 
+     return not (x1_1 < x0_2 or  # box1 is left of box2
+                 x0_1 > x1_2 or  # box1 is right of box2
+                 y1_1 < y0_2 or  # box1 is above box2
+                 y0_1 > y1_2)    # box1 is below box2
+ 
+ 
+ def _is_in_or_part_overlap_with_area_ratio(box1, box2, area_ratio_threshold=0.6):
+     """
+     Whether box1 is inside box2, or box1 and box2 partially overlap with the
+     overlap covering more than area_ratio_threshold of box1's area.
+     """
+     if box1 is None or box2 is None:
+         return False
+ 
+     x0_1, y0_1, x1_1, y1_1 = box1
+     x0_2, y0_2, x1_2, y1_2 = box2
+ 
+     if not _is_in_or_part_overlap(box1, box2):
+         return False
+ 
+     # compute the overlap area
+     x_left = max(x0_1, x0_2)
+     y_top = max(y0_1, y0_2)
+     x_right = min(x1_1, x1_2)
+     y_bottom = min(y1_1, y1_2)
+     overlap_area = (x_right - x_left) * (y_bottom - y_top)
+ 
+     # compute box1's area
+     box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
+ 
+     return overlap_area / box1_area > area_ratio_threshold
+ 
+ 
+ def _is_in(box1, box2) -> bool:
+     """
+     Whether box1 is fully inside box2.
+     """
+     x0_1, y0_1, x1_1, y1_1 = box1
+     x0_2, y0_2, x1_2, y1_2 = box2
+ 
+     return (x0_1 >= x0_2 and  # box1's left edge is not outside box2's left edge
+             y0_1 >= y0_2 and  # box1's top edge is not outside box2's top edge
+             x1_1 <= x1_2 and  # box1's right edge is not outside box2's right edge
+             y1_1 <= y1_2)     # box1's bottom edge is not outside box2's bottom edge
+ 
+ 
+ def _is_part_overlap(box1, box2) -> bool:
+     """
+     Whether two bboxes partially overlap without full containment.
+     """
+     if box1 is None or box2 is None:
+         return False
+ 
+     return _is_in_or_part_overlap(box1, box2) and not _is_in(box1, box2)
+ 
+ 
+ def _left_intersect(left_box, right_box):
+     """Check whether the boxes intersect at the left edge, i.e. whether left_box's
+     right edge falls inside right_box."""
+     if left_box is None or right_box is None:
+         return False
+ 
+     x0_1, y0_1, x1_1, y1_1 = left_box
+     x0_2, y0_2, x1_2, y1_2 = right_box
+ 
+     return x1_1 > x0_2 and x0_1 < x0_2 and (y0_1 <= y0_2 <= y1_1 or y0_1 <= y1_2 <= y1_1)
+ 
+ 
+ def _right_intersect(left_box, right_box):
+     """
+     Check whether the boxes intersect at the right edge, i.e. whether left_box's
+     left edge falls inside right_box.
+     """
+     if left_box is None or right_box is None:
+         return False
+ 
+     x0_1, y0_1, x1_1, y1_1 = left_box
+     x0_2, y0_2, x1_2, y1_2 = right_box
+ 
+     return x0_1 < x1_2 and x1_1 > x1_2 and (y0_1 <= y0_2 <= y1_1 or y0_1 <= y1_2 <= y1_1)
+ 
+ 
+ def _is_vertical_full_overlap(box1, box2, x_torlence=2):
+     """
+     In the x direction: either box1 contains box2 or box2 contains box1; partial containment does not count.
+     In the y direction: box1 and box2 overlap.
+     """
+     # unpack box coordinates
+     x11, y11, x12, y12 = box1  # top-left and bottom-right corners (x1, y1, x2, y2)
+     x21, y21, x22, y22 = box2
+ 
+     # along the x axis: does box1 contain box2, or box2 contain box1?
+     contains_in_x = (x11 - x_torlence <= x21 and x12 + x_torlence >= x22) or (x21 - x_torlence <= x11 and x22 + x_torlence >= x12)
+ 
+     # along the y axis: do box1 and box2 overlap?
+     overlap_in_y = not (y12 < y21 or y11 > y22)
+ 
+     return contains_in_x and overlap_in_y
+ 
+ 
+ def _is_bottom_full_overlap(box1, box2, y_tolerance=2):
+     """
+     Check whether box1's bottom slightly overlaps box2's top, with the amount of
+     overlap bounded by y_tolerance.
+     Unlike _is_vertical_full_overlap, this function allows box1 and box2 to overlap
+     only loosely in the x direction, i.e. it tolerates some fuzziness.
+     """
+     if box1 is None or box2 is None:
+         return False
+ 
+     x0_1, y0_1, x1_1, y1_1 = box1
+     x0_2, y0_2, x1_2, y1_2 = box2
+     tolerance_margin = 2
+     is_xdir_full_overlap = ((x0_1 - tolerance_margin <= x0_2 <= x1_1 + tolerance_margin and x0_1 - tolerance_margin <= x1_2 <= x1_1 + tolerance_margin) or (x0_2 - tolerance_margin <= x0_1 <= x1_2 + tolerance_margin and x0_2 - tolerance_margin <= x1_1 <= x1_2 + tolerance_margin))
+ 
+     return y0_2 < y1_1 and 0 < (y1_1 - y0_2) < y_tolerance and is_xdir_full_overlap
+ 
+ 
+ def _is_left_overlap(box1, box2):
+     """
+     Check whether box1's left side overlaps box2.
+     In the y direction the overlap may be partial or complete, regardless of which
+     box sits above the other. In the x direction, box2's left edge must fall inside box1.
+     """
+     def __overlap_y(Ay1, Ay2, By1, By2):
+         return max(0, min(Ay2, By2) - max(Ay1, By1))
+ 
+     if box1 is None or box2 is None:
+         return False
+ 
+     x0_1, y0_1, x1_1, y1_1 = box1
+     x0_2, y0_2, x1_2, y1_2 = box2
+ 
+     y_overlap_len = __overlap_y(y0_1, y1_1, y0_2, y1_2)
+     ratio_1 = 1.0 * y_overlap_len / (y1_1 - y0_1) if y1_1 - y0_1 != 0 else 0
+     ratio_2 = 1.0 * y_overlap_len / (y1_2 - y0_2) if y1_2 - y0_2 != 0 else 0
+     vertical_overlap_cond = ratio_1 >= 0.5 or ratio_2 >= 0.5
+ 
+     # vertical_overlap_cond = y0_1 <= y0_2 <= y1_1 or y0_1 <= y1_2 <= y1_1 or y0_2 <= y0_1 <= y1_2 or y0_2 <= y1_1 <= y1_2
+     return x0_1 <= x0_2 <= x1_1 and vertical_overlap_cond
+ 
+ 
+ def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
+     """Check whether two bboxes overlap on the y axis and the overlap height exceeds
+     overlap_ratio_threshold (80% by default) of the shorter bbox's height."""
+     _, y0_1, _, y1_1 = bbox1
+     _, y0_2, _, y1_2 = bbox2
+ 
+     overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
+     height1, height2 = y1_1 - y0_1, y1_2 - y0_2
+     min_height = min(height1, height2)
+ 
+     return (overlap / min_height) > overlap_ratio_threshold
+ 
+ 
+ def calculate_iou(bbox1, bbox2):
+     """
+     Compute the intersection over union (IoU) of two bounding boxes.
+ 
+     Args:
+         bbox1 (list[float]): coordinates of the first box as [x1, y1, x2, y2],
+             where (x1, y1) is the top-left and (x2, y2) the bottom-right corner.
+         bbox2 (list[float]): coordinates of the second box, same format as `bbox1`.
+ 
+     Returns:
+         float: the IoU of the two boxes, in the range [0, 1].
+     """
+     # Determine the coordinates of the intersection rectangle
+     x_left = max(bbox1[0], bbox2[0])
+     y_top = max(bbox1[1], bbox2[1])
+     x_right = min(bbox1[2], bbox2[2])
+     y_bottom = min(bbox1[3], bbox2[3])
+ 
+     if x_right < x_left or y_bottom < y_top:
+         return 0.0
+ 
+     # The area of overlap area
+     intersection_area = (x_right - x_left) * (y_bottom - y_top)
+ 
+     # The area of both rectangles
+     bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+     bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
+ 
+     # Compute the intersection over union by taking the intersection area
+     # and dividing it by the sum of both areas minus the intersection area
+     iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area)
+     return iou
+ 
+ 
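
A quick numeric sanity check of calculate_iou; the import path comes from this commit, and the box values are made up:

    from magic_pdf.libs.boxbase import calculate_iou

    # Two 2x2 squares overlapping on a 1x1 patch:
    # intersection = 1, union = 4 + 4 - 1 = 7, so IoU = 1/7.
    print(calculate_iou([0, 0, 2, 2], [1, 1, 3, 3]))  # ~0.142857
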
+ def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
+     """
+     Compute the ratio of the overlap area of box1 and box2 to the area of the smaller box.
+     """
+     # Determine the coordinates of the intersection rectangle
+     x_left = max(bbox1[0], bbox2[0])
+     y_top = max(bbox1[1], bbox2[1])
+     x_right = min(bbox1[2], bbox2[2])
+     y_bottom = min(bbox1[3], bbox2[3])
+ 
+     if x_right < x_left or y_bottom < y_top:
+         return 0.0
+ 
+     # The area of overlap area
+     intersection_area = (x_right - x_left) * (y_bottom - y_top)
+     min_box_area = min([(bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]), (bbox2[3] - bbox2[1]) * (bbox2[2] - bbox2[0])])
+     if min_box_area == 0:
+         return 0
+     else:
+         return intersection_area / min_box_area
+ 
+ 
+ def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
+     """
+     Compute the ratio of the overlap area of box1 and box2 to bbox1's area.
+     """
+     # Determine the coordinates of the intersection rectangle
+     x_left = max(bbox1[0], bbox2[0])
+     y_top = max(bbox1[1], bbox2[1])
+     x_right = min(bbox1[2], bbox2[2])
+     y_bottom = min(bbox1[3], bbox2[3])
+ 
+     if x_right < x_left or y_bottom < y_top:
+         return 0.0
+ 
+     # The area of overlap area
+     intersection_area = (x_right - x_left) * (y_bottom - y_top)
+     bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+     if bbox1_area == 0:
+         return 0
+     else:
+         return intersection_area / bbox1_area
+ 
+ 
+ def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
+     """
+     Use calculate_overlap_area_2_minbox_area_ratio to compute the share of the
+     smaller box covered by the overlap of the two bboxes.
+     If the share exceeds ratio, return the smaller bbox, otherwise return None.
+     """
+     x1_min, y1_min, x1_max, y1_max = bbox1
+     x2_min, y2_min, x2_max, y2_max = bbox2
+     area1 = (x1_max - x1_min) * (y1_max - y1_min)
+     area2 = (x2_max - x2_min) * (y2_max - y2_min)
+     overlap_ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
+     if overlap_ratio > ratio:
+         if area1 <= area2:
+             return bbox1
+         else:
+             return bbox2
+     else:
+         return None
+ 
+ 
+ def get_bbox_in_boundry(bboxes: list, boundry: tuple) -> list:
+     x0, y0, x1, y1 = boundry
+     new_boxes = [box for box in bboxes if box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1]
+     return new_boxes
+ 
+ 
+ def is_vbox_on_side(bbox, width, height, side_threshold=0.2):
+     """
+     Whether a bbox sits at the edge of the pdf page.
+     """
+     x0, x1 = bbox[0], bbox[2]
+     if x1 <= width * side_threshold or x0 >= width * (1 - side_threshold):
+         return True
+     return False
+ 
+ 
+ def find_top_nearest_text_bbox(pymu_blocks, obj_bbox):
+     tolerance_margin = 4
+     top_boxes = [box for box in pymu_blocks if obj_bbox[1] - box['bbox'][3] >= -tolerance_margin and not _is_in(box['bbox'], obj_bbox)]
+     # keep only the ones overlapping in the x direction
+     top_boxes = [box for box in top_boxes if any([obj_bbox[0] - tolerance_margin <= box['bbox'][0] <= obj_bbox[2] + tolerance_margin,
+                                                   obj_bbox[0] - tolerance_margin <= box['bbox'][2] <= obj_bbox[2] + tolerance_margin,
+                                                   box['bbox'][0] - tolerance_margin <= obj_bbox[0] <= box['bbox'][2] + tolerance_margin,
+                                                   box['bbox'][0] - tolerance_margin <= obj_bbox[2] <= box['bbox'][2] + tolerance_margin
+                                                   ])]
+ 
+     # then pick the one with the largest y1
+     if len(top_boxes) > 0:
+         top_boxes.sort(key=lambda x: x['bbox'][3], reverse=True)
+         return top_boxes[0]
+     else:
+         return None
+ 
+ 
+ def find_bottom_nearest_text_bbox(pymu_blocks, obj_bbox):
+     bottom_boxes = [box for box in pymu_blocks if box['bbox'][1] - obj_bbox[3] >= -2 and not _is_in(box['bbox'], obj_bbox)]
+     # keep only the ones overlapping in the x direction
+     bottom_boxes = [box for box in bottom_boxes if any([obj_bbox[0] - 2 <= box['bbox'][0] <= obj_bbox[2] + 2,
+                                                         obj_bbox[0] - 2 <= box['bbox'][2] <= obj_bbox[2] + 2,
+                                                         box['bbox'][0] - 2 <= obj_bbox[0] <= box['bbox'][2] + 2,
+                                                         box['bbox'][0] - 2 <= obj_bbox[2] <= box['bbox'][2] + 2
+                                                         ])]
+ 
+     # then pick the one with the smallest y0
+     if len(bottom_boxes) > 0:
+         bottom_boxes.sort(key=lambda x: x['bbox'][1], reverse=False)
+         return bottom_boxes[0]
+     else:
+         return None
+ 
+ 
+ def find_left_nearest_text_bbox(pymu_blocks, obj_bbox):
+     """
+     Find the nearest text block on the left.
+     """
+     left_boxes = [box for box in pymu_blocks if obj_bbox[0] - box['bbox'][2] >= -2 and not _is_in(box['bbox'], obj_bbox)]
+     # keep only the ones overlapping in the y direction
+     left_boxes = [box for box in left_boxes if any([obj_bbox[1] - 2 <= box['bbox'][1] <= obj_bbox[3] + 2,
+                                                     obj_bbox[1] - 2 <= box['bbox'][3] <= obj_bbox[3] + 2,
+                                                     box['bbox'][1] - 2 <= obj_bbox[1] <= box['bbox'][3] + 2,
+                                                     box['bbox'][1] - 2 <= obj_bbox[3] <= box['bbox'][3] + 2
+                                                     ])]
+ 
+     # then pick the one with the largest x1
+     if len(left_boxes) > 0:
+         left_boxes.sort(key=lambda x: x['bbox'][2], reverse=True)
+         return left_boxes[0]
+     else:
+         return None
+ 
+ 
+ def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
+     """
+     Find the nearest text block on the right.
+     """
+     right_boxes = [box for box in pymu_blocks if box['bbox'][0] - obj_bbox[2] >= -2 and not _is_in(box['bbox'], obj_bbox)]
+     # keep only the ones overlapping in the y direction
+     right_boxes = [box for box in right_boxes if any([obj_bbox[1] - 2 <= box['bbox'][1] <= obj_bbox[3] + 2,
+                                                       obj_bbox[1] - 2 <= box['bbox'][3] <= obj_bbox[3] + 2,
+                                                       box['bbox'][1] - 2 <= obj_bbox[1] <= box['bbox'][3] + 2,
+                                                       box['bbox'][1] - 2 <= obj_bbox[3] <= box['bbox'][3] + 2
+                                                       ])]
+ 
+     # then pick the one with the smallest x0
+     if len(right_boxes) > 0:
+         right_boxes.sort(key=lambda x: x['bbox'][0], reverse=False)
+         return right_boxes[0]
+     else:
+         return None
+ 
+ 
+ def bbox_relative_pos(bbox1, bbox2):
+     """
+     Determine the relative position of two rectangles.
+ 
+     Args:
+         bbox1: a 4-tuple (x1, y1, x1b, y1b) with the top-left and bottom-right corners of the first rectangle
+         bbox2: a 4-tuple (x2, y2, x2b, y2b) with the top-left and bottom-right corners of the second rectangle
+ 
+     Returns:
+         A 4-tuple of booleans (left, right, bottom, top), where (in top-origin
+         coordinates) left means bbox2's right edge lies left of bbox1's left edge,
+         right means bbox1's right edge lies left of bbox2's left edge, bottom means
+         bbox2's bottom edge lies above bbox1's top edge, and top means bbox1's
+         bottom edge lies above bbox2's top edge.
+     """
+     x1, y1, x1b, y1b = bbox1
+     x2, y2, x2b, y2b = bbox2
+ 
+     left = x2b < x1
+     right = x1b < x2
+     bottom = y2b < y1
+     top = y1b < y2
+     return left, right, bottom, top
+ 
+ 
+ def bbox_distance(bbox1, bbox2):
+     """
+     Compute the distance between two rectangles.
+ 
+     Args:
+         bbox1 (tuple): coordinates of the first rectangle as (x1, y1, x2, y2), top-left and bottom-right corners.
+         bbox2 (tuple): coordinates of the second rectangle, same format.
+ 
+     Returns:
+         float: the distance between the rectangles.
+     """
+     def dist(point1, point2):
+         return math.sqrt((point1[0] - point2[0]) ** 2 + (point1[1] - point2[1]) ** 2)
+ 
+     x1, y1, x1b, y1b = bbox1
+     x2, y2, x2b, y2b = bbox2
+ 
+     left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
+ 
+     if top and left:
+         return dist((x1, y1b), (x2b, y2))
+     elif left and bottom:
+         return dist((x1, y1), (x2b, y2b))
+     elif bottom and right:
+         return dist((x1b, y1), (x2, y2b))
+     elif right and top:
+         return dist((x1b, y1b), (x2, y2))
+     elif left:
+         return x1 - x2b
+     elif right:
+         return x2 - x1b
+     elif bottom:
+         return y1 - y2b
+     elif top:
+         return y2 - y1b
+     else:  # rectangles intersect
+         return 0
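
A quick hedged check of bbox_distance on two horizontally disjoint boxes; the import path comes from this commit and the values are made up:

    from magic_pdf.libs.boxbase import bbox_distance

    # bbox2 starts 2 units to the right of bbox1's right edge, same y-range,
    # so the distance is the pure horizontal gap.
    print(bbox_distance((0, 0, 1, 1), (3, 0, 4, 1)))  # 2
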
magic_pdf/libs/calc_span_stats.py ADDED
@@ -0,0 +1,239 @@
+ import os
+ import csv
+ import json
+ import pandas as pd
+ from matplotlib import pyplot as plt
+ from termcolor import cprint
+ 
+ """
+ Execute this script in the following way:
+ 
+ 1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:
+ 
+    code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
+ 
+ 2. Under the directory code-clean, execute the following command:
+ 
+    $ python -m libs.calc_span_stats
+ 
+ """
+ 
+ 
+ def print_green_on_red(text):
+     cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
+ 
+ 
+ def print_green(text):
+     print()
+     cprint(text, "green", attrs=["bold"], end="\n\n")
+ 
+ 
+ def print_red(text):
+     print()
+     cprint(text, "red", attrs=["bold"], end="\n\n")
+ 
+ 
+ def safe_get(dict_obj, key, default):
+     val = dict_obj.get(key)
+     if val is None:
+         return default
+     else:
+         return val
+ 
+ 
+ class SpanStatsCalc:
+     """Calculate statistics of spans."""
+ 
+     def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
+         """Draw multiple figures in one figure."""
+         # make a canvas
+         fig = plt.figure(fig_num, figsize=(20, 20))
+ 
+         pass  # TODO: not implemented yet
+ 
+     def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
+         """Calculate statistics per pdf_dict."""
+         span_stats = []
+         span_id = 0
+         for page_id, blocks in pdf_dict.items():
+             if page_id.startswith("page_"):
+                 if "para_blocks" in blocks.keys():
+                     for para_block in blocks["para_blocks"]:
+                         for line in para_block["lines"]:
+                             for span in line["spans"]:
+                                 span_text = safe_get(span, "text", "")
+                                 span_font_name = safe_get(span, "font", "")
+                                 span_font_size = safe_get(span, "size", 0)
+                                 span_font_color = safe_get(span, "color", "")
+                                 span_font_flags = safe_get(span, "flags", 0)
+ 
+                                 span_font_flags_decoded = safe_get(span, "decomposed_flags", {})
+                                 span_is_super_script = safe_get(span_font_flags_decoded, "is_superscript", False)
+                                 span_is_italic = safe_get(span_font_flags_decoded, "is_italic", False)
+                                 span_is_serifed = safe_get(span_font_flags_decoded, "is_serifed", False)
+                                 span_is_sans_serifed = safe_get(span_font_flags_decoded, "is_sans_serifed", False)
+                                 span_is_monospaced = safe_get(span_font_flags_decoded, "is_monospaced", False)
+                                 span_is_proportional = safe_get(span_font_flags_decoded, "is_proportional", False)
+                                 span_is_bold = safe_get(span_font_flags_decoded, "is_bold", False)
+ 
+                                 span_stats.append(
+                                     {
+                                         "span_id": span_id,  # id of span
+                                         "page_id": page_id,  # page number of pdf
+                                         "span_text": span_text,  # text of span
+                                         "span_font_name": span_font_name,  # font name of span
+                                         "span_font_size": span_font_size,  # font size of span
+                                         "span_font_color": span_font_color,  # font color of span
+                                         "span_font_flags": span_font_flags,  # font flags of span
+                                         "span_is_superscript": int(span_is_super_script),  # whether the span is superscript
+                                         "span_is_italic": int(span_is_italic),  # whether the span is italic
+                                         "span_is_serifed": int(span_is_serifed),  # whether the span is serifed
+                                         "span_is_sans_serifed": int(span_is_sans_serifed),  # whether the span is sans-serifed
+                                         "span_is_monospaced": int(span_is_monospaced),  # whether the span is monospaced
+                                         "span_is_proportional": int(span_is_proportional),  # whether the span is proportional
+                                         "span_is_bold": int(span_is_bold),  # whether the span is bold
+                                     }
+                                 )
+ 
+                                 span_id += 1
+ 
+         span_stats = pd.DataFrame(span_stats)
+         # print(span_stats)
+ 
+         return span_stats
+ 
+ 
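
A hedged sketch of the pdf_dict shape that calc_stats_per_dict expects, reconstructed from the keys it reads; all field values below are hypothetical:

    from magic_pdf.libs.calc_span_stats import SpanStatsCalc

    pdf_dict = {
        "page_0": {
            "para_blocks": [
                {"lines": [
                    {"spans": [
                        {"text": "Hello", "font": "Times", "size": 10.5,
                         "color": "#000000", "flags": 4,
                         "decomposed_flags": {"is_superscript": False, "is_bold": True}},
                    ]},
                ]},
            ],
        },
    }
    df = SpanStatsCalc().calc_stats_per_dict(pdf_dict)
    print(df[["span_id", "span_text", "span_is_bold"]])
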
+ def __find_pdf_dic_files(
+     jf_name="pdf_dic.json",
+     base_code_name="code-clean",
+     tgt_base_dir_name="tmp",
+     unittest_dir_name="unittest",
+     md_dir_name="md",
+     book_names=[
+         "scihub",
+     ],  # other possible values: "zlib", "arxiv" and so on
+ ):
+     pdf_dict_files = []
+ 
+     curr_dir = os.path.dirname(__file__)
+ 
+     for i in range(len(curr_dir)):
+         if curr_dir[i : i + len(base_code_name)] == base_code_name:
+             base_code_dir_name = curr_dir[: i + len(base_code_name)]
+             for book_name in book_names:
+                 search_dir_relative_name = os.path.join(tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name)
+                 if os.path.exists(base_code_dir_name):
+                     search_dir_name = os.path.join(base_code_dir_name, search_dir_relative_name)
+                     for root, dirs, files in os.walk(search_dir_name):
+                         for file in files:
+                             if file == jf_name:
+                                 pdf_dict_files.append(os.path.join(root, file))
+             break
+ 
+     return pdf_dict_files
+ 
+ 
+ def combine_span_texts(group_df, span_stats):
+     combined_span_texts = []
+     for _, row in group_df.iterrows():
+         curr_span_id = row.name
+         curr_span_text = row["span_text"]
+ 
+         pre_span_id = curr_span_id - 1
+         pre_span_text = span_stats.at[pre_span_id, "span_text"] if pre_span_id in span_stats.index else ""
+ 
+         next_span_id = curr_span_id + 1
+         next_span_text = span_stats.at[next_span_id, "span_text"] if next_span_id in span_stats.index else ""
+ 
+         # pointer_sign marks the previous, current and next span text of each match
+         pointer_sign = "→ → → "
+         combined_text = "\n".join([pointer_sign + pre_span_text, pointer_sign + curr_span_text, pointer_sign + next_span_text])
+         combined_span_texts.append(combined_text)
+ 
+     return "\n\n".join(combined_span_texts)
+ 
+ 
+ # pd.set_option("display.max_colwidth", None)  # set to None to show the full text
+ pd.set_option("display.max_rows", None)  # set to None to show more rows
+ 
+ 
+ def main():
+     pdf_dict_files = __find_pdf_dic_files()
+     # print(pdf_dict_files)
+ 
+     span_stats_calc = SpanStatsCalc()
+ 
+     for pdf_dict_file in pdf_dict_files:
+         print("-" * 100)
+         print_green_on_red(f"Processing {pdf_dict_file}")
+ 
+         with open(pdf_dict_file, "r", encoding="utf-8") as f:
+             pdf_dict = json.load(f)
+ 
+         raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
+         save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
+         raw_df.to_csv(save_path, index=False)
+ 
+         filtered_df = raw_df[raw_df["span_is_superscript"] == 1]
+         if filtered_df.empty:
+             print("No superscript span found!")
+             continue
+ 
+         filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
+ 
+         combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df)  # type: ignore
+ 
+         final_df = filtered_grouped_df.size().reset_index(name="count")
+         final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)
+ 
+         print(final_df)
+ 
+         final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))
+ 
+         save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
+         # Use UTF-8 with BOM and quote every field.
+         final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)
+ 
+         # create a 2x2 chart layout
+         fig, axs = plt.subplots(2, 2, figsize=(15, 10))
+ 
+         # plot grouped by span_font_name
+         final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")
+ 
+         # plot grouped by span_font_size
+         final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")
+ 
+         # plot grouped by span_font_color
+         final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")
+ 
+         # plot grouped by span_font_name, span_font_size and span_font_color together
+         grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
+         grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")
+ 
+         # adjust the layout
+         plt.tight_layout()
+ 
+         # show the charts
+         # plt.show()
+ 
+         # save the charts to a PNG file
+         save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
+         plt.savefig(save_path)
+ 
+         # clear the canvas
+         plt.clf()
+ 
+ 
+ if __name__ == "__main__":
+     main()
magic_pdf/libs/commons.py ADDED
@@ -0,0 +1,204 @@
+ import datetime
+ import json
+ import os, re, configparser
+ import subprocess
+ import time
+ 
+ import boto3
+ from loguru import logger
+ from boto3.s3.transfer import TransferConfig
+ from botocore.config import Config
+ 
+ import fitz  # switched to the rebased implementation as of 1.23.9
+ # import fitz_old as fitz  # use the pymupdf library from before 1.23.9
+ 
+ 
+ def get_delta_time(input_time):
+     return round(time.time() - input_time, 2)
+ 
+ 
+ def join_path(*args):
+     return '/'.join(str(s).rstrip('/') for s in args)
+ 
+ 
+ # Configure a global errlog_path so the demos can reference it consistently.
+ error_log_path = "s3://llm-pdf-text/err_logs/"
+ # json_dump_path = "s3://pdf_books_temp/json_dump/"  # only for temporary local testing; must not be committed to main
+ json_dump_path = "s3://llm-pdf-text/json_dump/"
+ 
+ # s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"  # a base library should not hard-code such paths; define them in business code
+ 
+ 
+ def get_top_percent_list(num_list, percent):
+     """
+     Get the top `percent` share of elements of a list.
+     :param num_list:
+     :param percent:
+     :return:
+     """
+     if len(num_list) == 0:
+         top_percent_list = []
+     else:
+         # sort num_list in descending order
+         sorted_imgs_len_list = sorted(num_list, reverse=True)
+         # compute the cut-off index for `percent`
+         top_percent_index = int(len(sorted_imgs_len_list) * percent)
+         # take the top `percent` share of elements
+         top_percent_list = sorted_imgs_len_list[:top_percent_index]
+     return top_percent_list
+ 
+ 
+ def formatted_time(time_stamp):
+     dt_object = datetime.datetime.fromtimestamp(time_stamp)
+     output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
+     return output_time
+ 
+ 
+ def mymax(alist: list):
+     if len(alist) == 0:
+         return 0  # an empty list counts as size 0
+     else:
+         return max(alist)
+ 
+ 
+ def parse_aws_param(profile):
+     if isinstance(profile, str):
+         # parse the config files
+         config_file = join_path(os.path.expanduser("~"), ".aws", "config")
+         credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
+         config = configparser.ConfigParser()
+         config.read(credentials_file)
+         config.read(config_file)
+         # read the AWS account information
+         ak = config.get(profile, "aws_access_key_id")
+         sk = config.get(profile, "aws_secret_access_key")
+         if profile == "default":
+             s3_str = config.get(f"{profile}", "s3")
+         else:
+             s3_str = config.get(f"profile {profile}", "s3")
+         end_match = re.search(r"endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
+         if end_match:
+             endpoint = end_match.group(1)
+         else:
+             raise ValueError("endpoint_url not found in the aws config file")
+         style_match = re.search(r"addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
+         if style_match:
+             addressing_style = style_match.group(1)
+         else:
+             addressing_style = "path"
+     elif isinstance(profile, dict):
+         ak = profile["ak"]
+         sk = profile["sk"]
+         endpoint = profile["endpoint"]
+         addressing_style = "auto"
+ 
+     return ak, sk, endpoint, addressing_style
+ 
+ 
+ def parse_bucket_key(s3_full_path: str):
+     """
+     input:  s3://bucket/path/to/my/file.txt
+     output: bucket, path/to/my/file.txt
+     """
+     s3_full_path = s3_full_path.strip()
+     if s3_full_path.startswith("s3://"):
+         s3_full_path = s3_full_path[5:]
+     if s3_full_path.startswith("/"):
+         s3_full_path = s3_full_path[1:]
+     bucket, key = s3_full_path.split("/", 1)
+     return bucket, key
+ 
+ 
111
+ def read_file(pdf_path: str, s3_profile):
112
+ if pdf_path.startswith("s3://"):
113
+ ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
114
+ cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
115
+ config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
116
+ bucket_name, bucket_key = parse_bucket_key(pdf_path)
117
+ res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
118
+ file_content = res["Body"].read()
119
+ return file_content
120
+ else:
121
+ with open(pdf_path, "rb") as f:
122
+ return f.read()
123
+
124
+
125
+ def get_docx_model_output(pdf_model_output, page_id):
126
+
127
+ model_output_json = pdf_model_output[page_id]
128
+
129
+ return model_output_json
130
+
131
+
132
+ def list_dir(dir_path:str, s3_profile:str):
133
+ """
134
+ 列出dir_path下的所有文件
135
+ """
136
+ ret = []
137
+
138
+ if dir_path.startswith("s3"):
139
+ ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
140
+ s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
141
+ bucket, path = s3info[0][0], s3info[0][1]
142
+ try:
143
+ cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
144
+ config=Config(s3={'addressing_style': addressing_style}))
145
+ def list_obj_scluster():
146
+ marker = None
147
+ while True:
148
+ list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
149
+ if marker:
150
+ list_kwargs['Marker'] = marker
151
+ response = cli.list_objects(**list_kwargs)
152
+ contents = response.get("Contents", [])
153
+ yield from contents
154
+ if not response.get("IsTruncated") or len(contents)==0:
155
+ break
156
+ marker = contents[-1]['Key']
157
+
158
+
159
+ for info in list_obj_scluster():
160
+ file_path = info['Key']
161
+ #size = info['Size']
162
+
163
+ if path!="":
164
+ afile = file_path[len(path):]
165
+ if afile.endswith(".json"):
166
+ ret.append(f"s3://{bucket}/{file_path}")
167
+
168
+ return ret
169
+
170
+ except Exception as e:
171
+ logger.exception(e)
172
+ exit(-1)
173
+ else: #本地的目录,那么扫描本地目录并返会这个目录里的所有jsonl文件
174
+
175
+ for root, dirs, files in os.walk(dir_path):
176
+ for file in files:
177
+ if file.endswith(".json"):
178
+ ret.append(join_path(root, file))
179
+ ret.sort()
180
+ return ret
181
+
182
+ def get_img_s3_client(save_path:str, image_s3_config:str):
183
+ """
184
+ """
185
+ if save_path.startswith("s3://"): # 放这里是为了最少创建一个s3 client
186
+ ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
187
+ img_s3_client = boto3.client(
188
+ service_name="s3",
189
+ aws_access_key_id=ak,
190
+ aws_secret_access_key=sk,
191
+ endpoint_url=end_point,
192
+ config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
193
+ )
194
+ else:
195
+ img_s3_client = None
196
+
197
+ return img_s3_client
198
+
199
+ if __name__=="__main__":
200
+ s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
201
+ s3_profile = "langchao"
202
+ ret = list_dir(s3_path, s3_profile)
203
+ print(ret)
204
+
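A quick usage sketch for the pure-Python helpers above, assuming the package is installed so `magic_pdf.libs.commons` is importable (the paths are placeholders; the expected outputs follow directly from the implementations):

```python
from magic_pdf.libs.commons import join_path, parse_bucket_key, get_top_percent_list

print(join_path("s3://llm-pdf-text/", "err_logs/", "run1.log"))
# -> s3://llm-pdf-text/err_logs/run1.log  (rstrip('/') prevents doubled slashes)

print(parse_bucket_key("s3://llm-pdf-text/err_logs/run1.log"))
# -> ('llm-pdf-text', 'err_logs/run1.log')

print(get_top_percent_list([5, 1, 9, 3, 7], 0.4))
# -> [9, 7]  (the largest 40% of the values)
```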
magic_pdf/libs/config_reader.py ADDED
@@ -0,0 +1,73 @@
+ """
+ Return the S3 (AK, SK, endpoint) triple for a given bucket name.
+ """
+
+ import json
+ import os
+
+ from loguru import logger
+
+ from magic_pdf.libs.commons import parse_bucket_key
+
+
+ def read_config():
+     home_dir = os.path.expanduser("~")
+
+     config_file = os.path.join(home_dir, "magic-pdf.json")
+
+     if not os.path.exists(config_file):
+         raise Exception(f"{config_file} not found")
+
+     with open(config_file, "r") as f:
+         config = json.load(f)
+     return config
+
+
+ def get_s3_config(bucket_name: str):
+     """
+     Read the S3 credentials for bucket_name from ~/magic-pdf.json.
+     """
+     config = read_config()
+
+     bucket_info = config.get("bucket_info")
+     if bucket_name not in bucket_info:
+         access_key, secret_key, storage_endpoint = bucket_info["[default]"]
+     else:
+         access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
+
+     if access_key is None or secret_key is None or storage_endpoint is None:
+         raise Exception("ak, sk or endpoint not found in magic-pdf.json")
+
+     # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
+
+     return access_key, secret_key, storage_endpoint
+
+
+ def get_s3_config_dict(path: str):
+     access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
+     return {"ak": access_key, "sk": secret_key, "endpoint": storage_endpoint}
+
+
+ def get_bucket_name(path):
+     bucket, key = parse_bucket_key(path)
+     return bucket
+
+
+ def get_local_dir():
+     config = read_config()
+     return config.get("temp-output-dir", "/tmp")
+
+
+ def get_local_models_dir():
+     config = read_config()
+     return config.get("models-dir", "/tmp/models")
+
+
+ def get_device():
+     config = read_config()
+     return config.get("device-mode", "cpu")
+
+
+ if __name__ == "__main__":
+     ak, sk, endpoint = get_s3_config("llm-raw")
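All of these getters read a single `~/magic-pdf.json`. Pieced together from the keys accessed above, a minimal config could be generated like this; the credentials and endpoint are placeholders, not real values, and the shipped `magic-pdf.template.json` remains the authoritative reference:

```python
import json
import os

# Hypothetical minimal config matching the keys read by config_reader above.
config = {
    "bucket_info": {
        # each bucket maps to an [access_key, secret_key, endpoint] triple;
        # "[default]" is the fallback used for unknown bucket names
        "[default]": ["<access_key>", "<secret_key>", "<endpoint_url>"],
        "llm-raw": ["<access_key>", "<secret_key>", "<endpoint_url>"],
    },
    "temp-output-dir": "/tmp",
    "models-dir": "/tmp/models",
    "device-mode": "cpu",
}

with open(os.path.join(os.path.expanduser("~"), "magic-pdf.json"), "w") as f:
    json.dump(config, f, indent=4)
```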
magic_pdf/libs/convert_utils.py ADDED
@@ -0,0 +1,5 @@
+ def dict_to_list(input_dict):
+     items_list = []
+     for _, item in input_dict.items():
+         items_list.append(item)
+     return items_list
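For reference, `dict_to_list` collects the values in insertion order, so on Python 3.7+ it behaves like `list(input_dict.values())`:

```python
# Tiny sanity check of the equivalence.
assert dict_to_list({"a": 1, "b": 2}) == list({"a": 1, "b": 2}.values()) == [1, 2]
```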