KevinHuSh committed on
Commit
e34cb81
·
1 Parent(s): 657bc8a

refine pdf parser, add time zone to userinfo (#112)

Browse files
api/db/db_models.py CHANGED
@@ -354,6 +354,7 @@ class User(DataBaseModel, UserMixin):
354
  avatar = TextField(null=True, help_text="avatar base64 string")
355
  language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese")
356
  color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark")
 
357
  last_login_time = DateTimeField(null=True)
358
  is_authenticated = CharField(max_length=1, null=False, default="1")
359
  is_active = CharField(max_length=1, null=False, default="1")
 
354
  avatar = TextField(null=True, help_text="avatar base64 string")
355
  language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese")
356
  color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark")
357
+ timezone = CharField(max_length=64, null=True, help_text="Timezone", default="UTC+8\tAsia/Shanghai")
358
  last_login_time = DateTimeField(null=True)
359
  is_authenticated = CharField(max_length=1, null=False, default="1")
360
  is_active = CharField(max_length=1, null=False, default="1")
deepdoc/parser/pdf_parser.py CHANGED
@@ -313,9 +313,19 @@ class HuParser:
313
  while i < len(bxs) - 1:
314
  b = bxs[i]
315
  b_ = bxs[i + 1]
316
- if b.get("layoutno", "0") != b_.get("layoutno", "1"):
317
  i += 1
318
  continue
 
 
 
 
 
 
 
 
 
 
319
 
320
  dis_thr = 1
321
  dis = b["x1"] - b_["x0"]
@@ -642,9 +652,9 @@ class HuParser:
642
 
643
  tk, tv = nearest(tables)
644
  fk, fv = nearest(figures)
645
- if min(tv, fv) > 2000:
646
- i += 1
647
- continue
648
  if tv < fv:
649
  tables[tk].insert(0, c)
650
  logging.debug(
@@ -711,12 +721,7 @@ class HuParser:
711
 
712
  # crop figure out and add caption
713
  for k, bxs in figures.items():
714
- txt = "\n".join(
715
- [b["text"] for b in bxs
716
- if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
717
- and len(b["text"].strip()) >= 4
718
- ]
719
- )
720
  if not txt:
721
  continue
722
 
 
313
  while i < len(bxs) - 1:
314
  b = bxs[i]
315
  b_ = bxs[i + 1]
316
+ if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
317
  i += 1
318
  continue
319
+ if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
320
+ # merge
321
+ bxs[i]["x1"] = b_["x1"]
322
+ bxs[i]["top"] = (b["top"] + b_["top"]) / 2
323
+ bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
324
+ bxs[i]["text"] += b_["text"]
325
+ bxs.pop(i + 1)
326
+ continue
327
+ i += 1
328
+ continue
329
 
330
  dis_thr = 1
331
  dis = b["x1"] - b_["x0"]
 
652
 
653
  tk, tv = nearest(tables)
654
  fk, fv = nearest(figures)
655
+ #if min(tv, fv) > 2000:
656
+ # i += 1
657
+ # continue
658
  if tv < fv:
659
  tables[tk].insert(0, c)
660
  logging.debug(
 
721
 
722
  # crop figure out and add caption
723
  for k, bxs in figures.items():
724
+ txt = "\n".join([b["text"] for b in bxs])
 
 
 
 
 
725
  if not txt:
726
  continue
727
 
deepdoc/vision/layout_recognizer.py CHANGED
@@ -96,7 +96,7 @@ class LayoutRecognizer(Recognizer):
96
  continue
97
 
98
  bxs[i]["layoutno"] = f"{ty}-{ii}"
99
- bxs[i]["layout_type"] = lts_[ii]["type"]
100
  i += 1
101
 
102
  for lt in ["footer", "header", "reference", "figure caption",
@@ -105,7 +105,7 @@ class LayoutRecognizer(Recognizer):
105
 
106
  # add box to figure layouts which has not text box
107
  for i, lt in enumerate(
108
- [lt for lt in lts if lt["type"] == "figure"]):
109
  if lt.get("visited"):
110
  continue
111
  lt = deepcopy(lt)
 
96
  continue
97
 
98
  bxs[i]["layoutno"] = f"{ty}-{ii}"
99
+ bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure"
100
  i += 1
101
 
102
  for lt in ["footer", "header", "reference", "figure caption",
 
105
 
106
  # add box to figure layouts which has not text box
107
  for i, lt in enumerate(
108
+ [lt for lt in lts if lt["type"] in ["figure","equation"]]):
109
  if lt.get("visited"):
110
  continue
111
  lt = deepcopy(lt)
deepdoc/vision/ocr.py CHANGED
@@ -21,7 +21,6 @@ from .operators import *
21
  import numpy as np
22
  import onnxruntime as ort
23
 
24
- from api.utils.file_utils import get_project_base_directory
25
  from .postprocess import build_post_process
26
  from rag.settings import cron_logger
27
 
 
21
  import numpy as np
22
  import onnxruntime as ort
23
 
 
24
  from .postprocess import build_post_process
25
  from rag.settings import cron_logger
26
 
deepdoc/vision/recognizer.py CHANGED
@@ -276,18 +276,18 @@ class Recognizer(object):
276
  def find_overlapped_with_threashold(box, boxes, thr=0.3):
277
  if not boxes:
278
  return
279
- max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
280
  s, e = 0, len(boxes)
281
  for i in range(s, e):
282
  ov = Recognizer.overlapped_area(box, boxes[i])
283
  _ov = Recognizer.overlapped_area(boxes[i], box)
284
- if (ov, _ov) < (max_overlaped, _max_overlaped):
285
  continue
286
- max_overlaped_i = i
287
- max_overlaped = ov
288
- _max_overlaped = _ov
289
 
290
- return max_overlaped_i
291
 
292
  def preprocess(self, image_list):
293
  inputs = []
 
276
  def find_overlapped_with_threashold(box, boxes, thr=0.3):
277
  if not boxes:
278
  return
279
+ max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
280
  s, e = 0, len(boxes)
281
  for i in range(s, e):
282
  ov = Recognizer.overlapped_area(box, boxes[i])
283
  _ov = Recognizer.overlapped_area(boxes[i], box)
284
+ if (ov, _ov) < (max_overlapped, _max_overlapped):
285
  continue
286
+ max_overlapped_i = i
287
+ max_overlapped = ov
288
+ _max_overlapped = _ov
289
 
290
+ return max_overlapped_i
291
 
292
  def preprocess(self, image_list):
293
  inputs = []
rag/app/naive.py CHANGED
@@ -101,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
101
  d = copy.deepcopy(doc)
102
  if pdf_parser:
103
  d["image"], poss = pdf_parser.crop(ck, need_position=True)
104
- add_positions(d, poss)
105
  ck = pdf_parser.remove_tag(ck)
106
  tokenize(d, ck, eng)
107
  res.append(d)
@@ -112,7 +112,7 @@ if __name__ == "__main__":
112
  import sys
113
 
114
 
115
- def dummy(a, b):
116
  pass
117
 
118
 
 
101
  d = copy.deepcopy(doc)
102
  if pdf_parser:
103
  d["image"], poss = pdf_parser.crop(ck, need_position=True)
104
+ add_positions(d, poss, from_page)
105
  ck = pdf_parser.remove_tag(ck)
106
  tokenize(d, ck, eng)
107
  res.append(d)
 
112
  import sys
113
 
114
 
115
+ def dummy(prog=None, msg=""):
116
  pass
117
 
118
 
rag/nlp/search.py CHANGED
@@ -82,8 +82,8 @@ class Dealer:
82
  )
83
  else:
84
  s = s.sort(
85
- {"page_num_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
86
- {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
87
  {"create_time": {"order": "desc", "unmapped_type": "date"}},
88
  {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
89
  )
 
82
  )
83
  else:
84
  s = s.sort(
85
+ {"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
86
+ {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
87
  {"create_time": {"order": "desc", "unmapped_type": "date"}},
88
  {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
89
  )