KevinHuSh committed on
Commit
5db8a67
·
1 Parent(s): be5bfb7

remove PyMuPDF (#618)

Browse files

### What problem does this PR solve?
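Addresses #613 by removing the PyMuPDF (`fitz`) dependency; PDF pages are now opened and rendered through pdfplumber.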

### Type of change


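- [x] Other (please describe): dependency removal (PyMuPDF / PyMuPDFb dropped from requirements.txt; `fitz` usages replaced with pdfplumber)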

api/utils/file_utils.py CHANGED
@@ -19,7 +19,7 @@ import os
19
  import re
20
  from io import BytesIO
21
 
22
- import fitz
23
  from PIL import Image
24
  from cachetools import LRUCache, cached
25
  from ruamel.yaml import YAML
@@ -172,11 +172,9 @@ def filename_type(filename):
172
  def thumbnail(filename, blob):
173
  filename = filename.lower()
174
  if re.match(r".*\.pdf$", filename):
175
- pdf = fitz.open(stream=blob, filetype="pdf")
176
- pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
177
  buffered = BytesIO()
178
- Image.frombytes("RGB", [pix.width, pix.height],
179
- pix.samples).save(buffered, format="png")
180
  return "data:image/png;base64," + \
181
  base64.b64encode(buffered.getvalue()).decode("utf-8")
182
 
 
19
  import re
20
  from io import BytesIO
21
 
22
+ import pdfplumber
23
  from PIL import Image
24
  from cachetools import LRUCache, cached
25
  from ruamel.yaml import YAML
 
172
  def thumbnail(filename, blob):
173
  filename = filename.lower()
174
  if re.match(r".*\.pdf$", filename):
175
+ pdf = pdfplumber.open(BytesIO(blob))
 
176
  buffered = BytesIO()
177
+ pdf.pages[0].to_image().annotated.save(buffered, format="png")
 
178
  return "data:image/png;base64," + \
179
  base64.b64encode(buffered.getvalue()).decode("utf-8")
180
 
deepdoc/parser/pdf_parser.py CHANGED
@@ -2,7 +2,6 @@
2
  import os
3
  import random
4
 
5
- import fitz
6
  import xgboost as xgb
7
  from io import BytesIO
8
  import torch
@@ -922,9 +921,7 @@ class RAGFlowPdfParser:
922
  fnm) if not binary else pdfplumber.open(BytesIO(binary))
923
  return len(pdf.pages)
924
  except Exception as e:
925
- pdf = fitz.open(fnm) if not binary else fitz.open(
926
- stream=fnm, filetype="pdf")
927
- return len(pdf)
928
 
929
  def __images__(self, fnm, zoomin=3, page_from=0,
930
  page_to=299, callback=None):
@@ -946,23 +943,7 @@ class RAGFlowPdfParser:
946
  self.pdf.pages[page_from:page_to]]
947
  self.total_page = len(self.pdf.pages)
948
  except Exception as e:
949
- self.pdf = fitz.open(fnm) if isinstance(
950
- fnm, str) else fitz.open(
951
- stream=fnm, filetype="pdf")
952
- self.page_images = []
953
- self.page_chars = []
954
- mat = fitz.Matrix(zoomin, zoomin)
955
- self.total_page = len(self.pdf)
956
- for i, page in enumerate(self.pdf):
957
- if i < page_from:
958
- continue
959
- if i >= page_to:
960
- break
961
- pix = page.get_pixmap(matrix=mat)
962
- img = Image.frombytes("RGB", [pix.width, pix.height],
963
- pix.samples)
964
- self.page_images.append(img)
965
- self.page_chars.append([])
966
 
967
  self.outlines = []
968
  try:
 
2
  import os
3
  import random
4
 
 
5
  import xgboost as xgb
6
  from io import BytesIO
7
  import torch
 
921
  fnm) if not binary else pdfplumber.open(BytesIO(binary))
922
  return len(pdf.pages)
923
  except Exception as e:
924
+ logging.error(str(e))
 
 
925
 
926
  def __images__(self, fnm, zoomin=3, page_from=0,
927
  page_to=299, callback=None):
 
943
  self.pdf.pages[page_from:page_to]]
944
  self.total_page = len(self.pdf.pages)
945
  except Exception as e:
946
+ logging.error(str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
947
 
948
  self.outlines = []
949
  try:
deepdoc/vision/__init__.py CHANGED
@@ -1,12 +1,13 @@
 
1
 
2
  from .ocr import OCR
3
  from .recognizer import Recognizer
4
  from .layout_recognizer import LayoutRecognizer
5
  from .table_structure_recognizer import TableStructureRecognizer
6
 
 
7
  def init_in_out(args):
8
  from PIL import Image
9
- import fitz
10
  import os
11
  import traceback
12
  from api.utils.file_utils import traversal_files
@@ -18,13 +19,11 @@ def init_in_out(args):
18
 
19
  def pdf_pages(fnm, zoomin=3):
20
  nonlocal outputs, images
21
- pdf = fitz.open(fnm)
22
- mat = fitz.Matrix(zoomin, zoomin)
23
- for i, page in enumerate(pdf):
24
- pix = page.get_pixmap(matrix=mat)
25
- img = Image.frombytes("RGB", [pix.width, pix.height],
26
- pix.samples)
27
- images.append(img)
28
  outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
29
 
30
  def images_and_outputs(fnm):
 
1
+ import pdfplumber
2
 
3
  from .ocr import OCR
4
  from .recognizer import Recognizer
5
  from .layout_recognizer import LayoutRecognizer
6
  from .table_structure_recognizer import TableStructureRecognizer
7
 
8
+
9
  def init_in_out(args):
10
  from PIL import Image
 
11
  import os
12
  import traceback
13
  from api.utils.file_utils import traversal_files
 
19
 
20
  def pdf_pages(fnm, zoomin=3):
21
  nonlocal outputs, images
22
+ pdf = pdfplumber.open(fnm)
23
+ images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
24
+ enumerate(pdf.pages)]
25
+
26
+ for i, page in enumerate(images):
 
 
27
  outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
28
 
29
  def images_and_outputs(fnm):
rag/utils/minio_conn.py CHANGED
@@ -35,7 +35,7 @@ class RAGFlowMinio(object):
35
  self.conn = None
36
 
37
  def put(self, bucket, fnm, binary):
38
- for _ in range(10):
39
  try:
40
  if not self.conn.bucket_exists(bucket):
41
  self.conn.make_bucket(bucket)
 
35
  self.conn = None
36
 
37
  def put(self, bucket, fnm, binary):
38
+ for _ in range(3):
39
  try:
40
  if not self.conn.bucket_exists(bucket):
41
  self.conn.make_bucket(bucket)
requirements.txt CHANGED
@@ -91,8 +91,6 @@ pycryptodomex==3.20.0
91
  pydantic==2.6.2
92
  pydantic_core==2.16.3
93
  PyJWT==2.8.0
94
- PyMuPDF==1.23.25
95
- PyMuPDFb==1.23.22
96
  PyMySQL==1.1.0
97
  PyPDF2==3.0.1
98
  pypdfium2==4.27.0
 
91
  pydantic==2.6.2
92
  pydantic_core==2.16.3
93
  PyJWT==2.8.0
 
 
94
  PyMySQL==1.1.0
95
  PyPDF2==3.0.1
96
  pypdfium2==4.27.0