KevinHuSh committed on
Commit
08bab63
·
1 Parent(s): e55650e

refine for English corpus (#135)

Browse files
deepdoc/parser/excel_parser.py CHANGED
@@ -5,6 +5,27 @@ from io import BytesIO
5
 
6
 
7
  class HuExcelParser:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def __call__(self, fnm):
9
  if isinstance(fnm, str):
10
  wb = load_workbook(fnm)
 
5
 
6
 
7
  class HuExcelParser:
8
def html(self, fnm):
    """Render every non-empty worksheet of a workbook as an HTML table.

    Args:
        fnm: A path string, passed to ``load_workbook`` as-is; any other
            value is wrapped in ``BytesIO`` first (presumably the raw file
            bytes -- confirm against callers).

    Returns:
        str: One ``<table>`` element per non-empty sheet, each followed by
        a newline. The sheet name is the ``<caption>``, the first row is
        emitted as ``<th>`` cells and every later row as ``<td>`` cells.
        Body cells whose value is ``None`` become an empty ``<td></td>``.
        Returns ``""`` when no sheet has any rows.
    """
    if isinstance(fnm, str):
        wb = load_workbook(fnm)
    else:
        wb = load_workbook(BytesIO(fnm))

    # Collect fragments and join once instead of growing a string with +=.
    parts = []
    for sheetname in wb.sheetnames:
        rows = list(wb[sheetname].rows)
        if not rows:
            # A sheet without any cells has no header row; indexing rows[0]
            # would raise IndexError, so the sheet is skipped.
            continue
        parts.append(f"<table><caption>{sheetname}</caption><tr>")
        parts.extend(f"<th>{cell.value}</th>" for cell in rows[0])
        parts.append("</tr>")
        for row in rows[1:]:
            parts.append("<tr>")
            for cell in row:
                if cell.value is None:
                    parts.append("<td></td>")
                else:
                    parts.append(f"<td>{cell.value}</td>")
            parts.append("</tr>")
        parts.append("</table>\n")
    # NOTE(review): cell values and sheet names are interpolated without
    # HTML escaping, exactly as before; confirm whether consumers need it.
    return "".join(parts)
28
+
29
  def __call__(self, fnm):
30
  if isinstance(fnm, str):
31
  wb = load_workbook(fnm)
deepdoc/parser/pdf_parser.py CHANGED
@@ -17,7 +17,6 @@ from rag.nlp import huqie
17
  from copy import deepcopy
18
  from huggingface_hub import hf_hub_download
19
 
20
-
21
  logging.getLogger("pdfminer").setLevel(logging.WARNING)
22
 
23
 
@@ -25,7 +24,7 @@ class HuParser:
25
  def __init__(self):
26
  self.ocr = OCR()
27
  if hasattr(self, "model_speciess"):
28
- self.layouter = LayoutRecognizer("layout."+self.model_speciess)
29
  else:
30
  self.layouter = LayoutRecognizer("layout")
31
  self.tbl_det = TableStructureRecognizer()
@@ -141,7 +140,7 @@ class HuParser:
141
  for j in range(i, -1, -1):
142
  # restore the order using th
143
  if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
144
- and arr[j + 1]["top"] < arr[j]["top"]\
145
  and arr[j + 1]["page_number"] == arr[j]["page_number"]:
146
  tmp = arr[j]
147
  arr[j] = arr[j + 1]
@@ -278,8 +277,10 @@ class HuParser:
278
 
279
  for b in bxs:
280
  if not b["text"]:
281
- left, right, top, bott = b["x0"]*ZM, b["x1"]*ZM, b["top"]*ZM, b["bottom"]*ZM
282
- b["text"] = self.ocr.recognize(np.array(img), np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32))
 
 
283
  del b["txt"]
284
  bxs = [b for b in bxs if b["text"]]
285
  if self.mean_height[-1] == 0:
@@ -315,7 +316,8 @@ class HuParser:
315
  while i < len(bxs) - 1:
316
  b = bxs[i]
317
  b_ = bxs[i + 1]
318
- if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
 
319
  i += 1
320
  continue
321
  if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
@@ -376,9 +378,13 @@ class HuParser:
376
  b["page_number"] == b_["page_number"] and b_["top"] - \
377
  b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
378
  b["page_number"] < b_["page_number"] and abs(
379
- b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
380
  ]
381
- if any(feats) and not any(concatting_feats):
 
 
 
 
382
  i += 1
383
  continue
384
  # merge up and down
@@ -503,18 +509,21 @@ class HuParser:
503
  findit = False
504
  i = 0
505
  while i < len(self.boxes):
506
- if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
 
507
  i += 1
508
  continue
509
  findit = True
510
  eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
511
  self.boxes.pop(i)
512
  if i >= len(self.boxes): break
513
- prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
 
514
  while not prefix:
515
  self.boxes.pop(i)
516
  if i >= len(self.boxes): break
517
- prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
 
518
  self.boxes.pop(i)
519
  if i >= len(self.boxes) or not prefix: break
520
  for j in range(i, min(i + 128, len(self.boxes))):
@@ -522,13 +531,13 @@ class HuParser:
522
  continue
523
  for k in range(i, j): self.boxes.pop(i)
524
  break
525
- if findit:return
526
 
527
  page_dirty = [0] * len(self.page_images)
528
  for b in self.boxes:
529
  if re.search(r"(··|··|··)", b["text"]):
530
- page_dirty[b["page_number"]-1] += 1
531
- page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3])
532
  if not page_dirty: return
533
  i = 0
534
  while i < len(self.boxes):
@@ -546,7 +555,7 @@ class HuParser:
546
  self.boxes.pop(i)
547
  continue
548
  if not b_["text"].strip():
549
- self.boxes.pop(i+1)
550
  continue
551
 
552
  if b["text"].strip()[0] != b_["text"].strip()[0] \
@@ -574,8 +583,10 @@ class HuParser:
574
  continue
575
  lout_no = str(self.boxes[i]["page_number"]) + \
576
  "-" + str(self.boxes[i]["layoutno"])
577
- if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
578
- "figure caption", "reference"]:
 
 
579
  nomerge_lout_no.append(lst_lout_no)
580
  if self.boxes[i]["layout_type"] == "table":
581
  if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
@@ -654,7 +665,7 @@ class HuParser:
654
 
655
  tk, tv = nearest(tables)
656
  fk, fv = nearest(figures)
657
- #if min(tv, fv) > 2000:
658
  # i += 1
659
  # continue
660
  if tv < fv and tk:
@@ -699,7 +710,7 @@ class HuParser:
699
  "layoutno", "")))
700
 
701
  left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
702
- poss.append((pn+self.page_from, left, right, top, bott))
703
  return self.page_images[pn] \
704
  .crop((left * ZM, top * ZM,
705
  right * ZM, bott * ZM))
@@ -738,7 +749,7 @@ class HuParser:
738
  for k, bxs in tables.items():
739
  if not bxs:
740
  continue
741
- bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"]-b["top"])/2 for b in bxs]))
742
  poss = []
743
  res.append((cropout(bxs, "table", poss),
744
  self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
@@ -879,7 +890,8 @@ class HuParser:
879
  self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
880
  self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
881
  enumerate(self.pdf.pages[page_from:page_to])]
882
- self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
 
883
  self.total_page = len(self.pdf.pages)
884
  except Exception as e:
885
  self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
@@ -888,8 +900,8 @@ class HuParser:
888
  mat = fitz.Matrix(zoomin, zoomin)
889
  self.total_page = len(self.pdf)
890
  for i, page in enumerate(self.pdf):
891
- if i < page_from:continue
892
- if i >= page_to:break
893
  pix = page.get_pixmap(matrix=mat)
894
  img = Image.frombytes("RGB", [pix.width, pix.height],
895
  pix.samples)
@@ -897,7 +909,9 @@ class HuParser:
897
  self.page_chars.append([])
898
 
899
  logging.info("Images converted.")
900
- self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))]
 
 
901
  if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
902
  self.is_english = True
903
  else:
@@ -927,11 +941,12 @@ class HuParser:
927
  # self.page_cum_height.append(
928
  # np.max([c["bottom"] for c in chars]))
929
  self.__ocr(i + 1, img, chars, zoomin)
930
- if callback: callback(prog=(i+1)*0.6/len(self.page_images), msg="")
931
 
932
  if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
933
  bxes = [b for bxs in self.boxes for b in bxs]
934
- self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
 
935
 
936
  logging.info("Is it English:", self.is_english)
937
 
@@ -964,12 +979,13 @@ class HuParser:
964
  if need_position: return None, None
965
  return
966
 
967
- max_width = np.max([right-left for (_, left, right, _, _) in poss])
968
  GAP = 6
969
  pos = poss[0]
970
- poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3]-120), max(pos[3]-GAP, 0)))
971
  pos = poss[-1]
972
- poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+GAP), min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+120)))
 
973
 
974
  positions = []
975
  for ii, (pns, left, right, top, bottom) in enumerate(poss):
@@ -984,9 +1000,9 @@ class HuParser:
984
  bottom, self.page_images[pns[0]].size[1])
985
  ))
986
  )
987
- if 0 < ii < len(poss)-1:
988
- positions.append((pns[0]+self.page_from, left, right, top, min(
989
- bottom, self.page_images[pns[0]].size[1])/ZM))
990
  bottom -= self.page_images[pns[0]].size[1]
991
  for pn in pns[1:]:
992
  imgs.append(
@@ -997,7 +1013,7 @@ class HuParser:
997
  ))
998
  )
999
  if 0 < ii < len(poss) - 1:
1000
- positions.append((pn+self.page_from, left, right, 0, min(
1001
  bottom, self.page_images[pn].size[1]) / ZM))
1002
  bottom -= self.page_images[pn].size[1]
1003
 
@@ -1026,6 +1042,19 @@ class HuParser:
1026
  return pic, positions
1027
  return pic
1028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1029
 
1030
  if __name__ == "__main__":
1031
  pass
 
17
  from copy import deepcopy
18
  from huggingface_hub import hf_hub_download
19
 
 
20
  logging.getLogger("pdfminer").setLevel(logging.WARNING)
21
 
22
 
 
24
  def __init__(self):
25
  self.ocr = OCR()
26
  if hasattr(self, "model_speciess"):
27
+ self.layouter = LayoutRecognizer("layout." + self.model_speciess)
28
  else:
29
  self.layouter = LayoutRecognizer("layout")
30
  self.tbl_det = TableStructureRecognizer()
 
140
  for j in range(i, -1, -1):
141
  # restore the order using th
142
  if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
143
+ and arr[j + 1]["top"] < arr[j]["top"] \
144
  and arr[j + 1]["page_number"] == arr[j]["page_number"]:
145
  tmp = arr[j]
146
  arr[j] = arr[j + 1]
 
277
 
278
  for b in bxs:
279
  if not b["text"]:
280
+ left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM
281
+ b["text"] = self.ocr.recognize(np.array(img),
282
+ np.array([[left, top], [right, top], [right, bott], [left, bott]],
283
+ dtype=np.float32))
284
  del b["txt"]
285
  bxs = [b for b in bxs if b["text"]]
286
  if self.mean_height[-1] == 0:
 
316
  while i < len(bxs) - 1:
317
  b = bxs[i]
318
  b_ = bxs[i + 1]
319
+ if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure",
320
+ "equation"]:
321
  i += 1
322
  continue
323
  if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
 
378
  b["page_number"] == b_["page_number"] and b_["top"] - \
379
  b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
380
  b["page_number"] < b_["page_number"] and abs(
381
+ b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
382
  ]
383
+ # split features
384
+ detach_feats = [b["x1"] < b_["x0"],
385
+ b["x0"] > b_["x1"]]
386
+ if (any(feats) and not any(concatting_feats)) or any(detach_feats):
387
+ print(b["text"], b_["text"], any(feats), any(concatting_feats), any(detach_feats))
388
  i += 1
389
  continue
390
  # merge up and down
 
509
  findit = False
510
  i = 0
511
  while i < len(self.boxes):
512
+ if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
513
+ re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
514
  i += 1
515
  continue
516
  findit = True
517
  eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
518
  self.boxes.pop(i)
519
  if i >= len(self.boxes): break
520
+ prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
521
+ self.boxes[i]["text"].strip().split(" ")[:2])
522
  while not prefix:
523
  self.boxes.pop(i)
524
  if i >= len(self.boxes): break
525
+ prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
526
+ self.boxes[i]["text"].strip().split(" ")[:2])
527
  self.boxes.pop(i)
528
  if i >= len(self.boxes) or not prefix: break
529
  for j in range(i, min(i + 128, len(self.boxes))):
 
531
  continue
532
  for k in range(i, j): self.boxes.pop(i)
533
  break
534
+ if findit: return
535
 
536
  page_dirty = [0] * len(self.page_images)
537
  for b in self.boxes:
538
  if re.search(r"(··|··|··)", b["text"]):
539
+ page_dirty[b["page_number"] - 1] += 1
540
+ page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
541
  if not page_dirty: return
542
  i = 0
543
  while i < len(self.boxes):
 
555
  self.boxes.pop(i)
556
  continue
557
  if not b_["text"].strip():
558
+ self.boxes.pop(i + 1)
559
  continue
560
 
561
  if b["text"].strip()[0] != b_["text"].strip()[0] \
 
583
  continue
584
  lout_no = str(self.boxes[i]["page_number"]) + \
585
  "-" + str(self.boxes[i]["layoutno"])
586
+ if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
587
+ "title",
588
+ "figure caption",
589
+ "reference"]:
590
  nomerge_lout_no.append(lst_lout_no)
591
  if self.boxes[i]["layout_type"] == "table":
592
  if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
 
665
 
666
  tk, tv = nearest(tables)
667
  fk, fv = nearest(figures)
668
+ # if min(tv, fv) > 2000:
669
  # i += 1
670
  # continue
671
  if tv < fv and tk:
 
710
  "layoutno", "")))
711
 
712
  left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
713
+ poss.append((pn + self.page_from, left, right, top, bott))
714
  return self.page_images[pn] \
715
  .crop((left * ZM, top * ZM,
716
  right * ZM, bott * ZM))
 
749
  for k, bxs in tables.items():
750
  if not bxs:
751
  continue
752
+ bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"] - b["top"]) / 2 for b in bxs]))
753
  poss = []
754
  res.append((cropout(bxs, "table", poss),
755
  self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
 
890
  self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
891
  self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
892
  enumerate(self.pdf.pages[page_from:page_to])]
893
+ self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in
894
+ self.pdf.pages[page_from:page_to]]
895
  self.total_page = len(self.pdf.pages)
896
  except Exception as e:
897
  self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
 
900
  mat = fitz.Matrix(zoomin, zoomin)
901
  self.total_page = len(self.pdf)
902
  for i, page in enumerate(self.pdf):
903
+ if i < page_from: continue
904
+ if i >= page_to: break
905
  pix = page.get_pixmap(matrix=mat)
906
  img = Image.frombytes("RGB", [pix.width, pix.height],
907
  pix.samples)
 
909
  self.page_chars.append([])
910
 
911
  logging.info("Images converted.")
912
+ self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
913
+ random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
914
+ range(len(self.page_chars))]
915
  if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
916
  self.is_english = True
917
  else:
 
941
  # self.page_cum_height.append(
942
  # np.max([c["bottom"] for c in chars]))
943
  self.__ocr(i + 1, img, chars, zoomin)
944
+ if callback: callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
945
 
946
  if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
947
  bxes = [b for bxs in self.boxes for b in bxs]
948
+ self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
949
+ "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
950
 
951
  logging.info("Is it English:", self.is_english)
952
 
 
979
  if need_position: return None, None
980
  return
981
 
982
+ max_width = np.max([right - left for (_, left, right, _, _) in poss])
983
  GAP = 6
984
  pos = poss[0]
985
+ poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
986
  pos = poss[-1]
987
+ poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
988
+ min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))
989
 
990
  positions = []
991
  for ii, (pns, left, right, top, bottom) in enumerate(poss):
 
1000
  bottom, self.page_images[pns[0]].size[1])
1001
  ))
1002
  )
1003
+ if 0 < ii < len(poss) - 1:
1004
+ positions.append((pns[0] + self.page_from, left, right, top, min(
1005
+ bottom, self.page_images[pns[0]].size[1]) / ZM))
1006
  bottom -= self.page_images[pns[0]].size[1]
1007
  for pn in pns[1:]:
1008
  imgs.append(
 
1013
  ))
1014
  )
1015
  if 0 < ii < len(poss) - 1:
1016
+ positions.append((pn + self.page_from, left, right, 0, min(
1017
  bottom, self.page_images[pn].size[1]) / ZM))
1018
  bottom -= self.page_images[pn].size[1]
1019
 
 
1042
  return pic, positions
1043
  return pic
1044
 
1045
def get_position(self, bx, ZM):
    """Split a box into per-page position tuples.

    Args:
        bx: Mapping with the keys ``"page_number"`` (1-based index into
            ``self.page_images`` / ``self.page_cum_height``), ``"top"``,
            ``"bottom"``, ``"x0"`` and ``"x1"``.
        ZM: Factor relating box coordinates to page-image pixels; image
            heights are divided by it (presumably the zoom used when the
            pages were rendered -- confirm against callers).

    Returns:
        list[tuple]: ``(page_number, x0, x1, top, bottom)`` entries. The
        first entry is for the box's own page, with ``bottom`` clipped to
        that page's height. While the remaining bottom still exceeds the
        current page, one more entry (``top == 0``) is appended for the
        next page.
    """
    pn = bx["page_number"]
    # page_cum_height[pn - 1] is subtracted to make the vertical
    # coordinates relative to page ``pn``.
    offset = self.page_cum_height[pn - 1]
    top = bx["top"] - offset
    bott = bx["bottom"] - offset
    x0, x1 = bx["x0"], bx["x1"]

    poss = [(pn, x0, x1, top, min(bott, self.page_images[pn - 1].size[1] / ZM))]
    # Spill the overflow onto following pages. The ``pn < len(...)`` bound
    # stops at the last available page; without it a box that overruns the
    # final page would index past ``self.page_images`` and raise IndexError.
    while bott * ZM > self.page_images[pn - 1].size[1] and pn < len(self.page_images):
        bott -= self.page_images[pn - 1].size[1] / ZM
        top = 0
        pn += 1
        poss.append((pn, x0, x1, top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
    return poss
1057
+
1058
 
1059
  if __name__ == "__main__":
1060
  pass
rag/app/manual.py CHANGED
@@ -30,19 +30,6 @@ class Pdf(PdfParser):
30
  # print(b)
31
  print("OCR:", timer()-start)
32
 
33
- def get_position(bx):
34
- poss = []
35
- pn = bx["page_number"]
36
- top = bx["top"] - self.page_cum_height[pn - 1]
37
- bott = bx["bottom"] - self.page_cum_height[pn - 1]
38
- poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn-1].size[1]/zoomin)))
39
- while bott * zoomin > self.page_images[pn - 1].size[1]:
40
- bott -= self.page_images[pn- 1].size[1] / zoomin
41
- top = 0
42
- pn += 1
43
- poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / zoomin)))
44
- return poss
45
-
46
  def tag(pn, left, right, top, bottom):
47
  return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
48
  .format(pn, left, right, top, bottom)
@@ -54,7 +41,7 @@ class Pdf(PdfParser):
54
  callback(0.67, "Table analysis finished.")
55
  self._text_merge()
56
  tbls = self._extract_table_figure(True, zoomin, True, True)
57
- self._naive_vertical_merge()
58
  self._filter_forpages()
59
  callback(0.68, "Text merging finished")
60
 
@@ -74,7 +61,7 @@ class Pdf(PdfParser):
74
  sec_ids.append(sid)
75
  #print(lvl, self.boxes[i]["text"], most_level)
76
 
77
- sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
78
  for (img, rows), poss in tbls:
79
  sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
80
 
 
30
  # print(b)
31
  print("OCR:", timer()-start)
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
def tag(pn, left, right, top, bottom):
    """Format a position marker string.

    The result is ``@@`` + ``pn`` + the four coordinates (each with one
    decimal place), tab-separated, + ``##``.
    """
    template = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##"
    return template.format(pn, left, right, top, bottom)
 
41
  callback(0.67, "Table analysis finished.")
42
  self._text_merge()
43
  tbls = self._extract_table_figure(True, zoomin, True, True)
44
+ self._concat_downward()
45
  self._filter_forpages()
46
  callback(0.68, "Text merging finished")
47
 
 
61
  sec_ids.append(sid)
62
  #print(lvl, self.boxes[i]["text"], most_level)
63
 
64
+ sections = [(b["text"], sec_ids[i], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
65
  for (img, rows), poss in tbls:
66
  sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
67
 
rag/app/naive.py CHANGED
@@ -14,7 +14,7 @@ import copy
14
  import re
15
  from rag.app import laws
16
  from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
17
- from deepdoc.parser import PdfParser
18
  from rag.settings import cron_logger
19
 
20
 
@@ -74,6 +74,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
74
  sections, tbls = pdf_parser(filename if not binary else binary,
75
  from_page=from_page, to_page=to_page, callback=callback)
76
  res = tokenize_table(tbls, doc, eng)
 
 
 
 
77
  elif re.search(r"\.txt$", filename, re.IGNORECASE):
78
  callback(0.1, "Start to parse.")
79
  txt = ""
 
14
  import re
15
  from rag.app import laws
16
  from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
17
+ from deepdoc.parser import PdfParser, ExcelParser
18
  from rag.settings import cron_logger
19
 
20
 
 
74
  sections, tbls = pdf_parser(filename if not binary else binary,
75
  from_page=from_page, to_page=to_page, callback=callback)
76
  res = tokenize_table(tbls, doc, eng)
77
+ elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
78
+ callback(0.1, "Start to parse.")
79
+ excel_parser = ExcelParser()
80
+ sections = [(excel_parser.html(binary), "")]
81
  elif re.search(r"\.txt$", filename, re.IGNORECASE):
82
  callback(0.1, "Start to parse.")
83
  txt = ""
rag/app/paper.py CHANGED
@@ -15,7 +15,7 @@ import re
15
  from collections import Counter
16
 
17
  from api.db import ParserType
18
- from rag.nlp import huqie, tokenize, tokenize_table, add_positions
19
  from deepdoc.parser import PdfParser
20
  import numpy as np
21
  from rag.utils import num_tokens_from_string
@@ -46,11 +46,11 @@ class Pdf(PdfParser):
46
  self._table_transformer_job(zoomin)
47
  callback(0.68, "Table analysis finished")
48
  self._text_merge()
 
49
  column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
50
- self._concat_downward(concat_between_pages=False)
51
  self._filter_forpages()
52
  callback(0.75, "Text merging finished.")
53
- tbls = self._extract_table_figure(True, zoomin, True, True)
54
 
55
  # clean mess
56
  if column_width < self.page_images[0].size[0] / zoomin / 2:
@@ -59,24 +59,24 @@ class Pdf(PdfParser):
59
  self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
60
  for b in self.boxes:
61
  b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
62
- freq = Counter([b["text"] for b in self.boxes])
63
- garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
64
- i = 0
65
- while i < len(self.boxes):
66
- if self.boxes[i]["text"] in garbage \
67
- or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
68
- or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
69
- self.boxes.pop(i)
70
- elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno",
71
- '1'):
72
- # merge within same layouts
73
- self.boxes[i + 1]["top"] = self.boxes[i]["top"]
74
- self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
75
- self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
76
- self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
77
- self.boxes.pop(i)
78
- else:
79
- i += 1
80
 
81
  def _begin(txt):
82
  return re.match(
@@ -88,7 +88,7 @@ class Pdf(PdfParser):
88
  "title":"",
89
  "authors": "",
90
  "abstract": "",
91
- "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
92
  re.match(r"(text|title)", b.get("layoutno", "text"))],
93
  "tables": tbls
94
  }
@@ -119,11 +119,10 @@ class Pdf(PdfParser):
119
  if re.match("(abstract|摘要)", txt):
120
  if len(txt.split(" ")) > 32 or len(txt) > 64:
121
  abstr = txt + self._line_tag(b, zoomin)
122
- i += 1
123
  break
124
- txt = self.boxes[i + 1]["text"].lower().strip()
125
  if len(txt.split(" ")) > 32 or len(txt) > 64:
126
- abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
127
  i += 1
128
  break
129
  if not abstr: i = 0
@@ -136,7 +135,7 @@ class Pdf(PdfParser):
136
  "title": title if title else filename,
137
  "authors": " ".join(authors),
138
  "abstract": abstr,
139
- "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
140
  re.match(r"(text|title)", b.get("layoutno", "text"))],
141
  "tables": tbls
142
  }
@@ -153,7 +152,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
153
  paper = pdf_parser(filename if not binary else binary,
154
  from_page=from_page, to_page=to_page, callback=callback)
155
  else: raise NotImplementedError("file type not supported yet(pdf supported)")
156
- doc = {"docnm_kwd": filename, "authors_tks": paper["authors"],
 
157
  "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
158
  doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
159
  doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
@@ -173,6 +173,38 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
173
  tokenize(d, txt, eng)
174
  res.append(d)
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  readed = [0] * len(paper["lines"])
177
  # find colon firstly
178
  i = 0
@@ -252,6 +284,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
252
 
253
  if __name__ == "__main__":
254
  import sys
255
- def dummy(a, b):
256
  pass
257
  chunk(sys.argv[1], callback=dummy)
 
15
  from collections import Counter
16
 
17
  from api.db import ParserType
18
+ from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency
19
  from deepdoc.parser import PdfParser
20
  import numpy as np
21
  from rag.utils import num_tokens_from_string
 
46
  self._table_transformer_job(zoomin)
47
  callback(0.68, "Table analysis finished")
48
  self._text_merge()
49
+ tbls = self._extract_table_figure(True, zoomin, True, True)
50
  column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
51
+ self._concat_downward()
52
  self._filter_forpages()
53
  callback(0.75, "Text merging finished.")
 
54
 
55
  # clean mess
56
  if column_width < self.page_images[0].size[0] / zoomin / 2:
 
59
  self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
60
  for b in self.boxes:
61
  b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
62
+ # freq = Counter([b["text"] for b in self.boxes])
63
+ # garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
64
+ # i = 0
65
+ # while i < len(self.boxes):
66
+ # if self.boxes[i]["text"] in garbage \
67
+ # or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
68
+ # or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
69
+ # self.boxes.pop(i)
70
+ # elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno",
71
+ # '1'):
72
+ # # merge within same layouts
73
+ # self.boxes[i + 1]["top"] = self.boxes[i]["top"]
74
+ # self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
75
+ # self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
76
+ # self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
77
+ # self.boxes.pop(i)
78
+ # else:
79
+ # i += 1
80
 
81
  def _begin(txt):
82
  return re.match(
 
88
  "title":"",
89
  "authors": "",
90
  "abstract": "",
91
+ "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
92
  re.match(r"(text|title)", b.get("layoutno", "text"))],
93
  "tables": tbls
94
  }
 
119
  if re.match("(abstract|摘要)", txt):
120
  if len(txt.split(" ")) > 32 or len(txt) > 64:
121
  abstr = txt + self._line_tag(b, zoomin)
 
122
  break
123
+ txt = self.boxes[i]["text"].lower().strip()
124
  if len(txt.split(" ")) > 32 or len(txt) > 64:
125
+ abstr = txt + self._line_tag(self.boxes[i], zoomin)
126
  i += 1
127
  break
128
  if not abstr: i = 0
 
135
  "title": title if title else filename,
136
  "authors": " ".join(authors),
137
  "abstract": abstr,
138
+ "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
139
  re.match(r"(text|title)", b.get("layoutno", "text"))],
140
  "tables": tbls
141
  }
 
152
  paper = pdf_parser(filename if not binary else binary,
153
  from_page=from_page, to_page=to_page, callback=callback)
154
  else: raise NotImplementedError("file type not supported yet(pdf supported)")
155
+
156
+ doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
157
  "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
158
  doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
159
  doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
 
173
  tokenize(d, txt, eng)
174
  res.append(d)
175
 
176
+ sorted_sections = paper["sections"]
177
+ # set pivot using the most frequent type of title,
178
+ # then merge between 2 pivot
179
+ bull = bullets_category([txt for txt, _ in sorted_sections])
180
+ most_level, levels = title_frequency(bull, sorted_sections)
181
+ assert len(sorted_sections) == len(levels)
182
+ sec_ids = []
183
+ sid = 0
184
+ for i, lvl in enumerate(levels):
185
+ if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
186
+ sec_ids.append(sid)
187
+ print(lvl, sorted_sections[i][0], most_level, sid)
188
+
189
+ chunks = []
190
+ last_sid = -2
191
+ for (txt, _), sec_id in zip(sorted_sections, sec_ids):
192
+ if sec_id == last_sid:
193
+ if chunks:
194
+ chunks[-1] += "\n" + txt
195
+ continue
196
+ chunks.append(txt)
197
+ last_sid = sec_id
198
+ for txt in chunks:
199
+ d = copy.deepcopy(doc)
200
+ d["image"], poss = pdf_parser.crop(txt, need_position=True)
201
+ add_positions(d, poss)
202
+ tokenize(d, pdf_parser.remove_tag(txt), eng)
203
+ res.append(d)
204
+ print("----------------------\n", pdf_parser.remove_tag(txt))
205
+
206
+ return res
207
+
208
  readed = [0] * len(paper["lines"])
209
  # find colon firstly
210
  i = 0
 
284
 
285
  if __name__ == "__main__":
286
  import sys
287
+ def dummy(prog=None, msg=""):
288
  pass
289
  chunk(sys.argv[1], callback=dummy)
rag/app/qa.py CHANGED
@@ -16,7 +16,7 @@ from io import BytesIO
16
  from nltk import word_tokenize
17
  from openpyxl import load_workbook
18
  from rag.nlp import is_english, random_choices
19
- from rag.nlp import huqie, stemmer
20
  from deepdoc.parser import ExcelParser
21
 
22
 
@@ -73,12 +73,8 @@ def beAdoc(d, q, a, eng):
73
  aprefix = "Answer: " if eng else "回答:"
74
  d["content_with_weight"] = "\t".join(
75
  [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
76
- if eng:
77
- d["content_ltks"] = " ".join([stemmer.stem(w)
78
- for w in word_tokenize(q)])
79
- else:
80
- d["content_ltks"] = huqie.qie(q)
81
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
82
  return d
83
 
84
 
 
16
  from nltk import word_tokenize
17
  from openpyxl import load_workbook
18
  from rag.nlp import is_english, random_choices
19
+ from rag.nlp import huqie
20
  from deepdoc.parser import ExcelParser
21
 
22
 
 
73
  aprefix = "Answer: " if eng else "回答:"
74
  d["content_with_weight"] = "\t".join(
75
  [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
76
+ d["content_ltks"] = huqie.qie(q)
77
+ d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
 
 
 
 
78
  return d
79
 
80
 
rag/app/table.py CHANGED
@@ -74,9 +74,9 @@ def trans_datatime(s):
74
 
75
  def trans_bool(s):
76
  if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
77
- return ["yes", "是"]
78
  if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
79
- return ["no", "否"]
80
 
81
 
82
  def column_data_type(arr):
@@ -92,7 +92,7 @@ def column_data_type(arr):
92
  counts["int"] += 1
93
  elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")):
94
  counts["float"] += 1
95
- elif re.match(r"(true|false|yes|no|是|否)$", str(a), flags=re.IGNORECASE):
96
  counts["bool"] += 1
97
  elif trans_datatime(str(a)):
98
  counts["datetime"] += 1
 
74
 
75
  def trans_bool(s):
76
  if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
77
+ return "yes"
78
  if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
79
+ return "no"
80
 
81
 
82
  def column_data_type(arr):
 
92
  counts["int"] += 1
93
  elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")):
94
  counts["float"] += 1
95
+ elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", str(a), flags=re.IGNORECASE):
96
  counts["bool"] += 1
97
  elif trans_datatime(str(a)):
98
  counts["datetime"] += 1
rag/nlp/__init__.py CHANGED
@@ -3,14 +3,9 @@ from collections import Counter
3
 
4
  from rag.utils import num_tokens_from_string
5
  from . import huqie
6
- from nltk import word_tokenize
7
  import re
8
  import copy
9
 
10
- from nltk.stem import PorterStemmer
11
-
12
- stemmer = PorterStemmer()
13
-
14
 
15
  BULLET_PATTERN = [[
16
  r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
@@ -77,13 +72,8 @@ def is_english(texts):
77
  def tokenize(d, t, eng):
78
  d["content_with_weight"] = t
79
  t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
80
- if eng:
81
- t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
82
- d["content_ltks"] = " ".join([stemmer.stem(w)
83
- for w in word_tokenize(t)])
84
- else:
85
- d["content_ltks"] = huqie.qie(t)
86
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
87
 
88
 
89
  def tokenize_table(tbls, doc, eng, batch_size=10):
@@ -94,8 +84,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
94
  continue
95
  if isinstance(rows, str):
96
  d = copy.deepcopy(doc)
97
- r = re.sub(r"<[^<>]{,12}>", "", rows)
98
- tokenize(d, r, eng)
99
  d["content_with_weight"] = rows
100
  d["image"] = img
101
  add_positions(d, poss)
 
3
 
4
  from rag.utils import num_tokens_from_string
5
  from . import huqie
 
6
  import re
7
  import copy
8
 
 
 
 
 
9
 
10
  BULLET_PATTERN = [[
11
  r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
 
72
  def tokenize(d, t, eng):
73
  d["content_with_weight"] = t
74
  t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
75
+ d["content_ltks"] = huqie.qie(t)
76
+ d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
 
 
 
 
 
77
 
78
 
79
  def tokenize_table(tbls, doc, eng, batch_size=10):
 
84
  continue
85
  if isinstance(rows, str):
86
  d = copy.deepcopy(doc)
87
+ tokenize(d, rows, eng)
 
88
  d["content_with_weight"] = rows
89
  d["image"] = img
90
  add_positions(d, poss)
rag/nlp/huqie.py CHANGED
@@ -8,7 +8,8 @@ import re
8
  import string
9
  import sys
10
  from hanziconv import HanziConv
11
-
 
12
  from api.utils.file_utils import get_project_base_directory
13
 
14
 
@@ -45,6 +46,9 @@ class Huqie:
45
  self.trie_ = datrie.Trie(string.printable)
46
  self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
47
 
 
 
 
48
  self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
49
  try:
50
  self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
@@ -239,6 +243,10 @@ class Huqie:
239
  def qie(self, line):
240
  line = self._strQ2B(line).lower()
241
  line = self._tradi2simp(line)
 
 
 
 
242
  arr = re.split(self.SPLIT_CHAR, line)
243
  res = []
244
  for L in arr:
@@ -290,8 +298,12 @@ class Huqie:
290
  return self.merge_(res)
291
 
292
  def qieqie(self, tks):
 
 
 
 
293
  res = []
294
- for tk in tks.split(" "):
295
  if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
296
  res.append(tk)
297
  continue
 
8
  import string
9
  import sys
10
  from hanziconv import HanziConv
11
+ from nltk import word_tokenize
12
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
13
  from api.utils.file_utils import get_project_base_directory
14
 
15
 
 
46
  self.trie_ = datrie.Trie(string.printable)
47
  self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
48
 
49
+ self.stemmer = PorterStemmer()
50
+ self.lemmatizer = WordNetLemmatizer()
51
+
52
  self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
53
  try:
54
  self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
 
243
  def qie(self, line):
244
  line = self._strQ2B(line).lower()
245
  line = self._tradi2simp(line)
246
+ zh_num = len([1 for c in line if is_chinese(c)])
247
+ if zh_num < len(line) * 0.2:
248
+ return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)])
249
+
250
  arr = re.split(self.SPLIT_CHAR, line)
251
  res = []
252
  for L in arr:
 
298
  return self.merge_(res)
299
 
300
  def qieqie(self, tks):
301
+ tks = tks.split(" ")
302
+ zh_num = len([1 for c in tks if c and is_chinese(c[0])])
303
+ if zh_num < len(tks) * 0.2:return " ".join(tks)
304
+
305
  res = []
306
+ for tk in tks:
307
  if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
308
  res.append(tk)
309
  continue
rag/nlp/query.py CHANGED
@@ -4,8 +4,8 @@ import json
4
  import re
5
  import logging
6
  import copy
7
- import math
8
- from elasticsearch_dsl import Q, Search
9
  from rag.nlp import huqie, term_weight, synonym
10
 
11
 
@@ -33,12 +33,14 @@ class EsQueryer:
33
 
34
  @staticmethod
35
  def rmWWW(txt):
36
- txt = re.sub(
37
- r"是*(什么样的|哪家|那家|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀)是*",
38
- "",
39
- txt)
40
- return re.sub(
41
- r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was|to)*", "", txt, re.IGNORECASE)
 
 
42
 
43
  def question(self, txt, tbl="qa", min_match="60%"):
44
  txt = re.sub(
@@ -50,7 +52,7 @@ class EsQueryer:
50
  txt = EsQueryer.rmWWW(txt)
51
 
52
  if not self.isChinese(txt):
53
- tks = [t for t in txt.split(" ") if t.strip()]
54
  q = tks
55
  for i in range(1, len(tks)):
56
  q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
@@ -58,9 +60,9 @@ class EsQueryer:
58
  q.append(txt)
59
  return Q("bool",
60
  must=Q("query_string", fields=self.flds,
61
- type="best_fields", query=" OR ".join(q),
62
  boost=1, minimum_should_match=min_match)
63
- ), txt.split(" ")
64
 
65
  def needQieqie(tk):
66
  if len(tk) < 4:
@@ -160,8 +162,8 @@ class EsQueryer:
160
  s += v# * dtwt[k]
161
  q = 1e-9
162
  for k, v in qtwt.items():
163
- q += v * v
164
- d = 1e-9
165
- for k, v in dtwt.items():
166
- d += v * v
167
- return s / q#math.sqrt(q) / math.sqrt(d)
 
4
  import re
5
  import logging
6
  import copy
7
+ from elasticsearch_dsl import Q
8
+
9
  from rag.nlp import huqie, term_weight, synonym
10
 
11
 
 
33
 
34
  @staticmethod
35
  def rmWWW(txt):
36
+ patts = [
37
+ (r"是*(什么样的|哪家|那家|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀)是*", ""),
38
+ (r"(^| )(what|who|how|which|where|why)('re|'s)? ", " "),
39
+ (r"(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down)", " ")
40
+ ]
41
+ for r, p in patts:
42
+ txt = re.sub(r, p, txt, flags=re.IGNORECASE)
43
+ return txt
44
 
45
  def question(self, txt, tbl="qa", min_match="60%"):
46
  txt = re.sub(
 
52
  txt = EsQueryer.rmWWW(txt)
53
 
54
  if not self.isChinese(txt):
55
+ tks = huqie.qie(txt).split(" ")
56
  q = tks
57
  for i in range(1, len(tks)):
58
  q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
 
60
  q.append(txt)
61
  return Q("bool",
62
  must=Q("query_string", fields=self.flds,
63
+ type="best_fields", query=" ".join(q),
64
  boost=1, minimum_should_match=min_match)
65
+ ), tks
66
 
67
  def needQieqie(tk):
68
  if len(tk) < 4:
 
162
  s += v# * dtwt[k]
163
  q = 1e-9
164
  for k, v in qtwt.items():
165
+ q += v #* v
166
+ #d = 1e-9
167
+ #for k, v in dtwt.items():
168
+ # d += v * v
169
+ return s / q #math.sqrt(q) / math.sqrt(d)
rag/nlp/search.py CHANGED
@@ -196,7 +196,24 @@ class Dealer:
196
  def insert_citations(self, answer, chunks, chunk_v,
197
  embd_mdl, tkweight=0.7, vtweight=0.3):
198
  assert len(chunks) == len(chunk_v)
199
- pieces = re.split(r"([;。?!!\n]|[a-z][.?;!][ \n])", answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  for i in range(1, len(pieces)):
201
  if re.match(r"[a-z][.?;!][ \n]", pieces[i]):
202
  pieces[i - 1] += pieces[i][0]
@@ -226,7 +243,7 @@ class Dealer:
226
  chunks_tks,
227
  tkweight, vtweight)
228
  mx = np.max(sim) * 0.99
229
- if mx < 0.66:
230
  continue
231
  cites[idx[i]] = list(
232
  set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
@@ -249,6 +266,7 @@ class Dealer:
249
 
250
  def rerank(self, sres, query, tkweight=0.3,
251
  vtweight=0.7, cfield="content_ltks"):
 
252
  ins_embd = [
253
  Dealer.trans2floats(
254
  sres.field[i].get("q_%d_vec" % len(sres.query_vector), "\t".join(["0"] * len(sres.query_vector)))) for i in sres.ids]
@@ -258,8 +276,7 @@ class Dealer:
258
  for i in sres.ids]
259
  sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
260
  ins_embd,
261
- huqie.qie(
262
- query).split(" "),
263
  ins_tw, tkweight, vtweight)
264
  return sim, tksim, vtsim
265
 
 
196
  def insert_citations(self, answer, chunks, chunk_v,
197
  embd_mdl, tkweight=0.7, vtweight=0.3):
198
  assert len(chunks) == len(chunk_v)
199
+ pieces = re.split(r"(```)", answer)
200
+ if len(pieces) >= 3:
201
+ i = 0
202
+ pieces_ = []
203
+ while i < len(pieces):
204
+ if pieces[i] == "```":
205
+ st = i
206
+ i += 1
207
+ while i<len(pieces) and pieces[i] != "```":
208
+ i += 1
209
+ if i < len(pieces): i += 1
210
+ pieces_.append("".join(pieces[st: i])+"\n")
211
+ else:
212
+ pieces_.extend(re.split(r"([^\|][;。?!!\n]|[a-z][.?;!][ \n])", pieces[i]))
213
+ i += 1
214
+ pieces = pieces_
215
+ else:
216
+ pieces = re.split(r"([^\|][;。?!!\n]|[a-z][.?;!][ \n])", answer)
217
  for i in range(1, len(pieces)):
218
  if re.match(r"[a-z][.?;!][ \n]", pieces[i]):
219
  pieces[i - 1] += pieces[i][0]
 
243
  chunks_tks,
244
  tkweight, vtweight)
245
  mx = np.max(sim) * 0.99
246
+ if mx < 0.7:
247
  continue
248
  cites[idx[i]] = list(
249
  set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
 
266
 
267
  def rerank(self, sres, query, tkweight=0.3,
268
  vtweight=0.7, cfield="content_ltks"):
269
+ _, keywords = self.qryr.question(query)
270
  ins_embd = [
271
  Dealer.trans2floats(
272
  sres.field[i].get("q_%d_vec" % len(sres.query_vector), "\t".join(["0"] * len(sres.query_vector)))) for i in sres.ids]
 
276
  for i in sres.ids]
277
  sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
278
  ins_embd,
279
+ keywords,
 
280
  ins_tw, tkweight, vtweight)
281
  return sim, tksim, vtsim
282
 
rag/svr/task_broker.py CHANGED
@@ -82,12 +82,14 @@ def dispatch():
82
  tsks = []
83
  if r["type"] == FileType.PDF.value:
84
  pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
 
 
85
  for s,e in r["parser_config"].get("pages", [(0,100000)]):
86
  e = min(e, pages)
87
- for p in range(s, e, 5):
88
  task = new_task()
89
  task["from_page"] = p
90
- task["to_page"] = min(p + 5, e)
91
  tsks.append(task)
92
  elif r["parser_id"] == "table":
93
  rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
 
82
  tsks = []
83
  if r["type"] == FileType.PDF.value:
84
  pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
85
+ page_size = 5
86
+ if r["parser_id"] == "paper": page_size = 12
87
  for s,e in r["parser_config"].get("pages", [(0,100000)]):
88
  e = min(e, pages)
89
+ for p in range(s, e, page_size):
90
  task = new_task()
91
  task["from_page"] = p
92
+ task["to_page"] = min(p + page_size, e)
93
  tsks.append(task)
94
  elif r["parser_id"] == "table":
95
  rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))