KevinHuSh
committed on
Commit
·
5db8a67
1
Parent(s):
be5bfb7
remove PyMuPDF (#618)
Browse files

### What problem does this PR solve?
#613
### Type of change
- [x] Other (please describe):
- api/utils/file_utils.py +3 -5
- deepdoc/parser/pdf_parser.py +2 -21
- deepdoc/vision/__init__.py +7 -8
- rag/utils/minio_conn.py +1 -1
- requirements.txt +0 -2
api/utils/file_utils.py
CHANGED
@@ -19,7 +19,7 @@ import os
|
|
19 |
import re
|
20 |
from io import BytesIO
|
21 |
|
22 |
-
import
|
23 |
from PIL import Image
|
24 |
from cachetools import LRUCache, cached
|
25 |
from ruamel.yaml import YAML
|
@@ -172,11 +172,9 @@ def filename_type(filename):
|
|
172 |
def thumbnail(filename, blob):
|
173 |
filename = filename.lower()
|
174 |
if re.match(r".*\.pdf$", filename):
|
175 |
-
pdf =
|
176 |
-
pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
|
177 |
buffered = BytesIO()
|
178 |
-
|
179 |
-
pix.samples).save(buffered, format="png")
|
180 |
return "data:image/png;base64," + \
|
181 |
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
182 |
|
|
|
19 |
import re
|
20 |
from io import BytesIO
|
21 |
|
22 |
+
import pdfplumber
|
23 |
from PIL import Image
|
24 |
from cachetools import LRUCache, cached
|
25 |
from ruamel.yaml import YAML
|
|
|
172 |
def thumbnail(filename, blob):
|
173 |
filename = filename.lower()
|
174 |
if re.match(r".*\.pdf$", filename):
|
175 |
+
pdf = pdfplumber.open(BytesIO(blob))
|
|
|
176 |
buffered = BytesIO()
|
177 |
+
pdf.pages[0].to_image().annotated.save(buffered, format="png")
|
|
|
178 |
return "data:image/png;base64," + \
|
179 |
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
180 |
|
deepdoc/parser/pdf_parser.py
CHANGED
@@ -2,7 +2,6 @@
|
|
2 |
import os
|
3 |
import random
|
4 |
|
5 |
-
import fitz
|
6 |
import xgboost as xgb
|
7 |
from io import BytesIO
|
8 |
import torch
|
@@ -922,9 +921,7 @@ class RAGFlowPdfParser:
|
|
922 |
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
923 |
return len(pdf.pages)
|
924 |
except Exception as e:
|
925 |
-
|
926 |
-
stream=fnm, filetype="pdf")
|
927 |
-
return len(pdf)
|
928 |
|
929 |
def __images__(self, fnm, zoomin=3, page_from=0,
|
930 |
page_to=299, callback=None):
|
@@ -946,23 +943,7 @@ class RAGFlowPdfParser:
|
|
946 |
self.pdf.pages[page_from:page_to]]
|
947 |
self.total_page = len(self.pdf.pages)
|
948 |
except Exception as e:
|
949 |
-
|
950 |
-
fnm, str) else fitz.open(
|
951 |
-
stream=fnm, filetype="pdf")
|
952 |
-
self.page_images = []
|
953 |
-
self.page_chars = []
|
954 |
-
mat = fitz.Matrix(zoomin, zoomin)
|
955 |
-
self.total_page = len(self.pdf)
|
956 |
-
for i, page in enumerate(self.pdf):
|
957 |
-
if i < page_from:
|
958 |
-
continue
|
959 |
-
if i >= page_to:
|
960 |
-
break
|
961 |
-
pix = page.get_pixmap(matrix=mat)
|
962 |
-
img = Image.frombytes("RGB", [pix.width, pix.height],
|
963 |
-
pix.samples)
|
964 |
-
self.page_images.append(img)
|
965 |
-
self.page_chars.append([])
|
966 |
|
967 |
self.outlines = []
|
968 |
try:
|
|
|
2 |
import os
|
3 |
import random
|
4 |
|
|
|
5 |
import xgboost as xgb
|
6 |
from io import BytesIO
|
7 |
import torch
|
|
|
921 |
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
922 |
return len(pdf.pages)
|
923 |
except Exception as e:
|
924 |
+
logging.error(str(e))
|
|
|
|
|
925 |
|
926 |
def __images__(self, fnm, zoomin=3, page_from=0,
|
927 |
page_to=299, callback=None):
|
|
|
943 |
self.pdf.pages[page_from:page_to]]
|
944 |
self.total_page = len(self.pdf.pages)
|
945 |
except Exception as e:
|
946 |
+
logging.error(str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
947 |
|
948 |
self.outlines = []
|
949 |
try:
|
deepdoc/vision/__init__.py
CHANGED
@@ -1,12 +1,13 @@
|
|
|
|
1 |
|
2 |
from .ocr import OCR
|
3 |
from .recognizer import Recognizer
|
4 |
from .layout_recognizer import LayoutRecognizer
|
5 |
from .table_structure_recognizer import TableStructureRecognizer
|
6 |
|
|
|
7 |
def init_in_out(args):
|
8 |
from PIL import Image
|
9 |
-
import fitz
|
10 |
import os
|
11 |
import traceback
|
12 |
from api.utils.file_utils import traversal_files
|
@@ -18,13 +19,11 @@ def init_in_out(args):
|
|
18 |
|
19 |
def pdf_pages(fnm, zoomin=3):
|
20 |
nonlocal outputs, images
|
21 |
-
pdf =
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
pix.samples)
|
27 |
-
images.append(img)
|
28 |
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
|
29 |
|
30 |
def images_and_outputs(fnm):
|
|
|
1 |
+
import pdfplumber
|
2 |
|
3 |
from .ocr import OCR
|
4 |
from .recognizer import Recognizer
|
5 |
from .layout_recognizer import LayoutRecognizer
|
6 |
from .table_structure_recognizer import TableStructureRecognizer
|
7 |
|
8 |
+
|
9 |
def init_in_out(args):
|
10 |
from PIL import Image
|
|
|
11 |
import os
|
12 |
import traceback
|
13 |
from api.utils.file_utils import traversal_files
|
|
|
19 |
|
20 |
def pdf_pages(fnm, zoomin=3):
|
21 |
nonlocal outputs, images
|
22 |
+
pdf = pdfplumber.open(fnm)
|
23 |
+
images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
24 |
+
enumerate(pdf.pages)]
|
25 |
+
|
26 |
+
for i, page in enumerate(images):
|
|
|
|
|
27 |
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
|
28 |
|
29 |
def images_and_outputs(fnm):
|
rag/utils/minio_conn.py
CHANGED
@@ -35,7 +35,7 @@ class RAGFlowMinio(object):
|
|
35 |
self.conn = None
|
36 |
|
37 |
def put(self, bucket, fnm, binary):
|
38 |
-
for _ in range(
|
39 |
try:
|
40 |
if not self.conn.bucket_exists(bucket):
|
41 |
self.conn.make_bucket(bucket)
|
|
|
35 |
self.conn = None
|
36 |
|
37 |
def put(self, bucket, fnm, binary):
|
38 |
+
for _ in range(3):
|
39 |
try:
|
40 |
if not self.conn.bucket_exists(bucket):
|
41 |
self.conn.make_bucket(bucket)
|
requirements.txt
CHANGED
@@ -91,8 +91,6 @@ pycryptodomex==3.20.0
|
|
91 |
pydantic==2.6.2
|
92 |
pydantic_core==2.16.3
|
93 |
PyJWT==2.8.0
|
94 |
-
PyMuPDF==1.23.25
|
95 |
-
PyMuPDFb==1.23.22
|
96 |
PyMySQL==1.1.0
|
97 |
PyPDF2==3.0.1
|
98 |
pypdfium2==4.27.0
|
|
|
91 |
pydantic==2.6.2
|
92 |
pydantic_core==2.16.3
|
93 |
PyJWT==2.8.0
|
|
|
|
|
94 |
PyMySQL==1.1.0
|
95 |
PyPDF2==3.0.1
|
96 |
pypdfium2==4.27.0
|