KevinHuSh committed
Commit b085dec · 1 Parent(s): 0cf2c67

add use layout or not option (#145)

* add use layout or not option

* trivial
api/apps/conversation_app.py CHANGED
@@ -196,7 +196,10 @@ def chat(dialog, messages, **kwargs):
 
     for _ in range(len(questions)//2):
         questions.append(questions[-1])
-    kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
+    if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]:
+        kbinfos = {"total":0, "chunks":[],"doc_aggs":[]}
+    else:
+        kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
                         dialog.similarity_threshold,
                         dialog.vector_similarity_weight, top=1024, aggs=False)
     knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
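
In chat(), retrieval is now skipped entirely when the dialog's prompt declares no "knowledge" parameter, so prompts that never interpolate retrieved chunks do not pay for a search. A small standalone restatement of the guard (the prompt_config values below are hypothetical):

    def needs_retrieval(prompt_config):
        # mirrors the check added above
        return "knowledge" in [p["key"] for p in prompt_config["parameters"]]

    with_kb = {"parameters": [{"key": "knowledge", "optional": False}]}
    without_kb = {"parameters": [{"key": "history", "optional": True}]}

    assert needs_retrieval(with_kb)
    assert not needs_retrieval(without_kb)   # kbinfos stays {"total": 0, "chunks": [], "doc_aggs": []}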
api/apps/document_app.py CHANGED
@@ -310,7 +310,10 @@ def change_parser():
     if not e:
         return get_data_error_result(retmsg="Document not found!")
     if doc.parser_id.lower() == req["parser_id"].lower():
-        return get_json_result(data=True)
+        if "parser_config" in req:
+            if req["parser_config"] == doc.parser_config:
+                return get_json_result(data=True)
+        else: return get_json_result(data=True)
 
     if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
         return get_data_error_result(retmsg="Not supported yet!")
@@ -319,6 +322,8 @@ def change_parser():
         {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"})
     if not e:
         return get_data_error_result(retmsg="Document not found!")
+    if "parser_config" in req:
+        DocumentService.update_parser_config(doc.id, req["parser_config"])
     if doc.token_num > 0:
         e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
                                                 doc.process_duation * -1)
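
change_parser now accepts an optional parser_config alongside parser_id: an unchanged parser_id only short-circuits if the supplied config is also unchanged (or absent), and any supplied config is merged into the document before re-parsing. A hypothetical request body illustrating that path (only parser_id and parser_config appear in the diff; the doc_id field is assumed from the handler's use of doc):

    req = {
        "doc_id": "<document id>",                     # assumed field
        "parser_id": "naive",                          # same as doc.parser_id
        "parser_config": {"layout_recognize": False},  # differs from doc.parser_config
    }
    # -> falls through, then DocumentService.update_parser_config(doc.id, req["parser_config"])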
api/db/init_data.py CHANGED
@@ -276,7 +276,7 @@ def init_llm_factory():
     drop table llm_factories;
     update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
     update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
-    update tenant set parser_ids='naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture';
+    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
     alter table knowledgebase modify avatar longtext;
     alter table user modify avatar longtext;
     alter table dialog modify icon longtext;
@@ -297,5 +297,4 @@ def init_web_data():
 
 if __name__ == '__main__':
     init_web_db()
-    init_web_data()
-    add_tenant_llm()
+    init_web_data()

api/db/services/document_service.py CHANGED
@@ -118,9 +118,25 @@ class DocumentService(CommonService):
         if not docs:return
         return docs[0]["tenant_id"]
 
-
     @classmethod
     @DB.connection_context()
     def get_thumbnails(cls, docids):
         fields = [cls.model.id, cls.model.thumbnail]
         return list(cls.model.select(*fields).where(cls.model.id.in_(docids)).dicts())
+
+    @classmethod
+    @DB.connection_context()
+    def update_parser_config(cls, id, config):
+        e, d = cls.get_by_id(id)
+        if not e:raise LookupError(f"Document({id}) not found.")
+        def dfs_update(old, new):
+            for k,v in new.items():
+                if k not in old:
+                    old[k] = v
+                    continue
+                if isinstance(v, dict):
+                    assert isinstance(old[k], dict)
+                    dfs_update(old[k], v)
+                else: old[k] = v
+        dfs_update(d.parser_config, config)
+        cls.update_by_id(id, {"parser_config": d.parser_config})
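
update_parser_config merges the incoming config into the stored one key by key instead of overwriting it, so partial updates keep existing settings. A standalone restatement of the dfs_update merge on plain dicts (keys are illustrative; the assert on mismatched types is omitted here):

    def dfs_update(old, new):
        for k, v in new.items():
            if k not in old:
                old[k] = v
            elif isinstance(v, dict):
                dfs_update(old[k], v)
            else:
                old[k] = v

    stored = {"chunk_token_num": 128, "layout_recognize": True}
    incoming = {"layout_recognize": False, "pages": [[1, 12]]}
    dfs_update(stored, incoming)
    # stored == {"chunk_token_num": 128, "layout_recognize": False, "pages": [[1, 12]]}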
api/settings.py CHANGED
@@ -94,7 +94,7 @@ ASR_MDL = default_llm[LLM_FACTORY]["asr_model"]
 IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 
 API_KEY = LLM.get("api_key", "")
-PARSERS = LLM.get("parsers", "naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture")
+PARSERS = LLM.get("parsers", "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One")
 
 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
deepdoc/parser/__init__.py CHANGED
@@ -1,6 +1,6 @@
 
 
-from .pdf_parser import HuParser as PdfParser
+from .pdf_parser import HuParser as PdfParser, PlainParser
 from .docx_parser import HuDocxParser as DocxParser
 from .excel_parser import HuExcelParser as ExcelParser
 from .ppt_parser import HuPptParser as PptParser
deepdoc/parser/pdf_parser.py CHANGED
@@ -1073,5 +1073,37 @@ class HuParser:
         return poss
 
 
+class PlainParser(object):
+    def __call__(self, filename, **kwargs):
+        self.outlines = []
+        lines = []
+        try:
+            self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
+            outlines = self.pdf.outline
+            for page in self.pdf.pages:
+                lines.extend([t for t in page.extract_text().split("\n")])
+
+            def dfs(arr, depth):
+                for a in arr:
+                    if isinstance(a, dict):
+                        self.outlines.append((a["/Title"], depth))
+                        continue
+                    dfs(a, depth + 1)
+
+            dfs(outlines, 0)
+        except Exception as e:
+            logging.warning(f"Outlines exception: {e}")
+        if not self.outlines:
+            logging.warning(f"Miss outlines")
+
+        return [(l, "") for l in lines], []
+
+    def crop(self, ck, need_position):
+        raise NotImplementedError
+
+    @staticmethod
+    def remove_tag(txt):
+        raise NotImplementedError
+
 if __name__ == "__main__":
     pass
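
PlainParser is the no-layout fallback: it pulls raw text (and the outline, when present) straight out of the PDF with PyPDF2 and returns lines with empty position tags, which is why its crop and remove_tag deliberately raise NotImplementedError. A hedged usage sketch, assuming pdf2_read, BytesIO and logging are imported at the top of pdf_parser.py as the class body implies, and that a local file exists:

    from deepdoc.parser import PlainParser

    lines, tables = PlainParser()("demo.pdf")   # "demo.pdf" is a placeholder
    assert tables == []                         # the plain path never extracts tables
    for text, tag in lines[:5]:
        print(repr(text), repr(tag))            # tag is always ""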
rag/app/book.py CHANGED
@@ -12,10 +12,12 @@
 #
 import copy
 import re
+from io import BytesIO
+
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, tokenize_chunks
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser, DocxParser
+from deepdoc.parser import PdfParser, DocxParser, PlainParser
 
 
 class Pdf(PdfParser):
@@ -69,10 +71,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
         remove_contents_table(sections, eng=is_english(random_choices([t for t,_ in sections], k=200)))
         callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
         sections, tbls = pdf_parser(filename if not binary else binary,
                                     from_page=from_page, to_page=to_page, callback=callback)
+
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -87,31 +91,24 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         sections = [(l,"") for l in sections if l]
         remove_contents_table(sections, eng = is_english(random_choices([t for t,_ in sections], k=200)))
         callback(0.8, "Finish parsing.")
+
     else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
 
     make_colon_as_title(sections)
     bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
-    if bull >= 0: cks = hierarchical_merge(bull, sections, 3)
+    if bull >= 0:
+        chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 3)]
     else:
         sections = [s.split("@") for s,_ in sections]
         sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
-        cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
+        chunks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
 
     # is it English
     eng = lang.lower() == "english"#is_english(random_choices([t for t, _ in sections], k=218))
 
     res = tokenize_table(tbls, doc, eng)
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
 
-    # wrap up to es documents
-    for ck in cks:
-        d = copy.deepcopy(doc)
-        ck = "\n".join(ck)
-        if pdf_parser:
-            d["image"], poss = pdf_parser.crop(ck, need_position=True)
-            add_positions(d, poss)
-            ck = pdf_parser.remove_tag(ck)
-        tokenize(d, ck, eng)
-        res.append(d)
     return res
 

rag/app/laws.py CHANGED
@@ -15,9 +15,9 @@ import re
 from io import BytesIO
 from docx import Document
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
-    make_colon_as_title, add_positions
+    make_colon_as_title, add_positions, tokenize_chunks
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser, DocxParser
+from deepdoc.parser import PdfParser, DocxParser, PlainParser
 from rag.settings import cron_logger
 
 
@@ -68,7 +68,7 @@ class Pdf(PdfParser):
 
         callback(0.8, "Text extraction finished")
 
-        return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes]
+        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
 
 
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -87,11 +87,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         for txt in Docx()(filename, binary):
             sections.append(txt)
         callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        for txt in pdf_parser(filename if not binary else binary,
-                              from_page=from_page, to_page=to_page, callback=callback):
-            sections.append(txt)
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
+        for txt, poss in pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback):
+            sections.append(txt + poss)
+
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -114,22 +116,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
 
     make_colon_as_title(sections)
     bull = bullets_category(sections)
-    cks = hierarchical_merge(bull, sections, 3)
-    if not cks: callback(0.99, "No chunk parsed out.")
-
-    res = []
-    # wrap up to es documents
-    for ck in cks:
-        print("\n-".join(ck))
-        ck = "\n".join(ck)
-        d = copy.deepcopy(doc)
-        if pdf_parser:
-            d["image"], poss = pdf_parser.crop(ck, need_position=True)
-            add_positions(d, poss)
-            ck = pdf_parser.remove_tag(ck)
-        tokenize(d, ck, eng)
-        res.append(d)
-    return res
+    chunks = hierarchical_merge(bull, sections, 3)
+    if not chunks: callback(0.99, "No chunk parsed out.")
+
+    return tokenize_chunks(["\n".join(ck) for ck in chunks], doc, eng, pdf_parser)
 
 
 if __name__ == "__main__":
rag/app/manual.py CHANGED
@@ -2,8 +2,8 @@ import copy
 import re
 
 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency
-from deepdoc.parser import PdfParser
+from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from deepdoc.parser import PdfParser, PlainParser
 from rag.utils import num_tokens_from_string
 
 
@@ -30,9 +30,7 @@ class Pdf(PdfParser):
             # print(b)
         print("OCR:", timer()-start)
 
-        def tag(pn, left, right, top, bottom):
-            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
-                .format(pn, left, right, top, bottom)
+
 
         self._layouts_rec(zoomin)
         callback(0.65, "Layout analysis finished.")
@@ -49,6 +47,8 @@ class Pdf(PdfParser):
         for b in self.boxes:
             b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
 
+        return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
+
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
         if len(self.boxes)>0 and len(self.outlines)/len(self.boxes) > 0.1:
@@ -103,9 +103,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     pdf_parser = None
 
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        cks, tbls = pdf_parser(filename if not binary else binary,
-                               from_page=from_page, to_page=to_page, callback=callback)
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
+        sections, tbls = pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)
+        if sections and len(sections[0])<3: cks = [(t, l, [0]*5) for t, l in sections]
     else: raise NotImplementedError("file type not supported yet(pdf supported)")
     doc = {
         "docnm_kwd": filename
@@ -115,13 +116,60 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     # is it English
     eng = lang.lower() == "english"#pdf_parser.is_english
 
+    # set pivot using the most frequent type of title,
+    # then merge between 2 pivot
+    if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+        max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
+        most_level = max(0, max_lvl - 1)
+        levels = []
+        for txt, _, _ in sections:
+            for t, lvl in pdf_parser.outlines:
+                tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
+                tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
+                if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
+                    levels.append(lvl)
+                    break
+            else:
+                levels.append(max_lvl + 1)
+    else:
+        bull = bullets_category([txt for txt,_,_ in sections])
+        most_level, levels = title_frequency(bull, [(txt, l) for txt, l, poss in sections])
+
+    assert len(sections) == len(levels)
+    sec_ids = []
+    sid = 0
+    for i, lvl in enumerate(levels):
+        if lvl <= most_level and i > 0 and lvl != levels[i - 1]: sid += 1
+        sec_ids.append(sid)
+        # print(lvl, self.boxes[i]["text"], most_level, sid)
+
+    sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
+    for (img, rows), poss in tbls:
+        sections.append((rows if isinstance(rows, str) else rows[0], -1,
+                         [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
+
+    def tag(pn, left, right, top, bottom):
+        if pn+left+right+top+bottom == 0:
+            return ""
+        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
+            .format(pn, left, right, top, bottom)
+
+    chunks = []
+    last_sid = -2
+    tk_cnt = 0
+    for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
+        poss = "\t".join([tag(*pos) for pos in poss])
+        if tk_cnt < 2048 and (sec_id == last_sid or sec_id == -1):
+            if chunks:
+                chunks[-1] += "\n" + txt + poss
+                tk_cnt += num_tokens_from_string(txt)
+                continue
+        chunks.append(txt + poss)
+        tk_cnt = num_tokens_from_string(txt)
+        if sec_id > -1: last_sid = sec_id
+
     res = tokenize_table(tbls, doc, eng)
-    for ck in cks:
-        d = copy.deepcopy(doc)
-        d["image"], poss = pdf_parser.crop(ck, need_position=True)
-        add_positions(d, poss)
-        tokenize(d, pdf_parser.remove_tag(ck), eng)
-        res.append(d)
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
     return res
 
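The manual chunker now groups sections between title pivots itself and carries positions as inline tags in the @@page\tx0\tx1\ttop\tbottom## format that crop()/remove_tag() later parse; an all-zero position (as produced for the plain parser path) maps to an empty tag. A tiny illustration of the tag format with invented coordinates:

    def tag(pn, left, right, top, bottom):
        if pn + left + right + top + bottom == 0:
            return ""                        # no position info (plain-text path)
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, left, right, top, bottom)

    print(tag(3, 36.0, 540.0, 72.5, 90.2))   # -> "@@3\t36.0\t540.0\t72.5\t90.2##"
    print(tag(0, 0, 0, 0, 0))                # -> ""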
 
rag/app/naive.py CHANGED
@@ -12,8 +12,9 @@
 #
 import copy
 import re
+from deepdoc.parser.pdf_parser import PlainParser
 from rag.app import laws
-from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
+from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
 from deepdoc.parser import PdfParser, ExcelParser
 from rag.settings import cron_logger
 
@@ -56,6 +57,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
 
     eng = lang.lower() == "english"#is_english(cks)
+    parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -69,15 +71,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         for txt in laws.Docx()(filename, binary):
             sections.append((txt, ""))
         callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
+        pdf_parser = Pdf() if parser_config["layout_recognize"] else PlainParser()
         sections, tbls = pdf_parser(filename if not binary else binary,
                                     from_page=from_page, to_page=to_page, callback=callback)
         res = tokenize_table(tbls, doc, eng)
+
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
         sections = [(excel_parser.html(binary), "")]
+
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -92,26 +97,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         sections = txt.split("\n")
         sections = [(l, "") for l in sections if l]
         callback(0.8, "Finish parsing.")
+
     else:
         raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
 
-    parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
-    cks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;!?"))
-
-    # wrap up to es documents
-    for ck in cks:
-        if len(ck.strip()) == 0:continue
-        print("--", ck)
-        d = copy.deepcopy(doc)
-        if pdf_parser:
-            try:
-                d["image"], poss = pdf_parser.crop(ck, need_position=True)
-            except Exception as e:
-                continue
-            add_positions(d, poss)
-            ck = pdf_parser.remove_tag(ck)
-        tokenize(d, ck, eng)
-        res.append(d)
+    chunks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;!?"))
+
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
     return res
 

rag/app/one.py CHANGED
@@ -13,7 +13,7 @@
 import re
 from rag.app import laws
 from rag.nlp import huqie, tokenize
-from deepdoc.parser import PdfParser, ExcelParser
+from deepdoc.parser import PdfParser, ExcelParser, PlainParser
 
 
 class Pdf(PdfParser):
@@ -45,7 +45,7 @@ class Pdf(PdfParser):
         for (img, rows), poss in tbls:
             sections.append((rows if isinstance(rows, str) else rows[0],
                              [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
-        return [txt for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))]
+        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))]
 
 
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -59,16 +59,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     sections = []
     if re.search(r"\.docx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        for txt in laws.Docx()(filename, binary):
-            sections.append(txt)
+        sections = [txt for txt in laws.Docx()(filename, binary) if txt]
         callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
         sections = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)
+        sections = [s for s, _ in sections if s]
+
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
         sections = [excel_parser.html(binary)]
+
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -81,8 +84,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
             if not l: break
             txt += l
         sections = txt.split("\n")
-        sections = [(l, "") for l in sections if l]
+        sections = [s for s in sections if s]
        callback(0.8, "Finish parsing.")
+
     else:
         raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
 
rag/app/paper.py CHANGED
@@ -15,8 +15,8 @@ import re
 from collections import Counter
 
 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency
-from deepdoc.parser import PdfParser
+from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from deepdoc.parser import PdfParser, PlainParser
 import numpy as np
 from rag.utils import num_tokens_from_string
 
@@ -59,24 +59,6 @@ class Pdf(PdfParser):
         self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
         for b in self.boxes:
             b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
-        # freq = Counter([b["text"] for b in self.boxes])
-        # garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
-        # i = 0
-        # while i < len(self.boxes):
-        #     if self.boxes[i]["text"] in garbage \
-        #             or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
-        #             or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
-        #         self.boxes.pop(i)
-        #     elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno",
-        #                                                                                                 '1'):
-        #         # merge within same layouts
-        #         self.boxes[i + 1]["top"] = self.boxes[i]["top"]
-        #         self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
-        #         self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
-        #         self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
-        #         self.boxes.pop(i)
-        #     else:
-        #         i += 1
 
     def _begin(txt):
         return re.match(
@@ -148,9 +130,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
     pdf_parser = None
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        paper = pdf_parser(filename if not binary else binary,
-                           from_page=from_page, to_page=to_page, callback=callback)
+        if not kwargs.get("parser_config",{}).get("layout_recognize", True):
+            pdf_parser = PlainParser()
+            paper = {
+                "title": filename,
+                "authors": " ",
+                "abstract": "",
+                "sections": pdf_parser(filename if not binary else binary),
+                "tables": []
+            }
+        else:
+            pdf_parser = Pdf()
+            paper = pdf_parser(filename if not binary else binary,
+                               from_page=from_page, to_page=to_page, callback=callback)
     else: raise NotImplementedError("file type not supported yet(pdf supported)")
 
     doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
@@ -195,16 +187,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
             continue
         chunks.append(txt)
         last_sid = sec_id
-    for txt in chunks:
-        d = copy.deepcopy(doc)
-        d["image"], poss = pdf_parser.crop(txt, need_position=True)
-        add_positions(d, poss)
-        tokenize(d, pdf_parser.remove_tag(txt), eng)
-        res.append(d)
-        print("----------------------\n", pdf_parser.remove_tag(txt))
-
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
     return res
 
+    """
     readed = [0] * len(paper["lines"])
     # find colon firstly
     i = 0
@@ -280,7 +266,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         print(d)
         # d["image"].save(f"./logs/{i}.jpg")
     return res
-
+    """
 
 if __name__ == "__main__":
     import sys
rag/app/presentation.py CHANGED
@@ -18,7 +18,8 @@ from PIL import Image
 
 from rag.nlp import tokenize, is_english
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser, PptParser
+from deepdoc.parser import PdfParser, PptParser, PlainParser
+from PyPDF2 import PdfReader as pdf2_read
 
 
 class Ppt(PptParser):
@@ -56,19 +57,6 @@ class Pdf(PdfParser):
         callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
         res = []
-        #################### More precisely ###################
-        # self._layouts_rec(zoomin)
-        # self._text_merge()
-        # pages = {}
-        # for b in self.boxes:
-        #     if self.__garbage(b["text"]):continue
-        #     if b["page_number"] not in pages: pages[b["page_number"]] = []
-        #     pages[b["page_number"]].append(b["text"])
-        # for i, lines in pages.items():
-        #     res.append(("\n".join(lines), self.page_images[i-1]))
-        # return res
-        ########################################
-
         for i in range(len(self.boxes)):
             lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
             res.append((lines, self.page_images[i]))
@@ -76,6 +64,16 @@ class Pdf(PdfParser):
         return res
 
 
+class PlainPdf(PlainParser):
+    def __call__(self, filename, binary=None, callback=None, **kwargs):
+        self.pdf = pdf2_read(filename if not binary else BytesIO(filename))
+        page_txt = []
+        for page in self.pdf.pages:
+            page_txt.append(page.extract_text())
+        callback(0.9, "Parsing finished")
+        return [(txt, None) for txt in page_txt]
+
+
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
     The supported file formats are pdf, pptx.
@@ -102,14 +100,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
             res.append(d)
         return res
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainPdf()
+        for pn, (txt,img) in enumerate(pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)):
             d = copy.deepcopy(doc)
             pn += from_page
-            d["image"] = img
+            if img: d["image"] = img
             d["page_num_int"] = [pn+1]
             d["top_int"] = [0]
-            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
             tokenize(d, txt, eng)
             res.append(d)
         return res
rag/nlp/__init__.py CHANGED
@@ -76,6 +76,25 @@ def tokenize(d, t, eng):
     d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
 
 
+def tokenize_chunks(chunks, doc, eng, pdf_parser):
+    res = []
+    # wrap up as es documents
+    for ck in chunks:
+        if len(ck.strip()) == 0:continue
+        print("--", ck)
+        d = copy.deepcopy(doc)
+        if pdf_parser:
+            try:
+                d["image"], poss = pdf_parser.crop(ck, need_position=True)
+                add_positions(d, poss)
+                ck = pdf_parser.remove_tag(ck)
+            except NotImplementedError as e:
+                pass
+        tokenize(d, ck, eng)
+        res.append(d)
+    return res
+
+
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
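
tokenize_chunks centralizes the per-app "wrap up to ES documents" loops removed above: chunks carrying position tags get cropped images and positions from the layout parser, while PlainParser's NotImplementedError is swallowed so plain-text chunks are still indexed. A minimal sketch of the call shape (assumes the rag package and its huqie dictionaries are available; pdf_parser=None skips the image/position step; the doc fields are reduced and hypothetical):

    from rag.nlp import tokenize_chunks

    doc = {"docnm_kwd": "demo.txt", "title_tks": "demo"}
    chunks = ["first merged section", "second merged section"]
    es_docs = tokenize_chunks(chunks, doc, True, None)   # (chunks, doc, eng, pdf_parser)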
rag/nlp/huqie.py CHANGED
@@ -300,7 +300,11 @@ class Huqie:
     def qieqie(self, tks):
         tks = tks.split(" ")
         zh_num = len([1 for c in tks if c and is_chinese(c[0])])
-        if zh_num < len(tks) * 0.2:return " ".join(tks)
+        if zh_num < len(tks) * 0.2:
+            res = []
+            for tk in tks:
+                res.extend(tk.split("/"))
+            return " ".join(res)
 
         res = []
         for tk in tks:
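
qieqie's early return for mostly non-Chinese token strings now also splits each token on "/", so identifiers like "TCP/IP" become two fine-grained tokens instead of one. A standalone illustration of just that split:

    tks = "TCP/IP html/css api".split(" ")   # zh_num == 0, so the new branch applies
    res = []
    for tk in tks:
        res.extend(tk.split("/"))
    print(" ".join(res))                     # -> "TCP IP html css api"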
rag/nlp/search.py CHANGED
@@ -68,6 +68,7 @@ class Dealer:
         s = Search()
         pg = int(req.get("page", 1)) - 1
         ps = int(req.get("size", 1000))
+        topk = int(req.get("topk", 1024))
         src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id",
                                  "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
                                  "q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])
@@ -103,7 +104,7 @@ class Dealer:
             assert emb_mdl, "No embedding model selected"
             s["knn"] = self._vector(
                 qst, emb_mdl, req.get(
-                    "similarity", 0.1), ps)
+                    "similarity", 0.1), topk)
             s["knn"]["filter"] = bqry.to_dict()
             if "highlight" in s:
                 del s["highlight"]
@@ -292,8 +293,8 @@ class Dealer:
         ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
         if not question:
             return ranks
-        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": top,
-               "question": question, "vector": True,
+        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": page_size,
+               "question": question, "vector": True, "topk": top,
               "similarity": similarity_threshold}
         sres = self.search(req, index_name(tenant_id), embd_mdl)
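
Dealer.retrieval now separates the number of chunks returned to the caller (size, i.e. page_size) from the size of the KNN candidate pool (topk), which search() reads back and feeds to the vector query. A hedged sketch of the request dict it builds, with invented values:

    req = {"kb_ids": ["kb-1"], "doc_ids": [], "size": 30,   # chunks per page for the caller
           "question": "what changed in this commit?",
           "vector": True, "topk": 1024,                    # KNN candidates for _vector()
           "similarity": 0.2}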
 
rag/svr/task_broker.py CHANGED
@@ -81,11 +81,15 @@ def dispatch():
 
         tsks = []
         if r["type"] == FileType.PDF.value:
+            if not r["parser_config"].get("layout_recognize", True):
+                tsks.append(new_task())
+                continue
             pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
-            page_size = 12
-            if r["parser_id"] == "paper": page_size = 22
+            page_size = r["parser_config"].get("task_page_size", 12)
+            if r["parser_id"] == "paper": page_size = r["parser_config"].get("task_page_size", 22)
             if r["parser_id"] == "one": page_size = 1000000000
-            for s,e in r["parser_config"].get("pages", [(0,100000)]):
+            for s,e in r["parser_config"].get("pages", [(1, 100000)]):
+                s -= 1
                 e = min(e, pages)
                 for p in range(s, e, page_size):
                     task = new_task()
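
dispatch() now reads its splitting knobs from parser_config as well: layout_recognize=False collapses a PDF into a single task, task_page_size overrides the per-task page count, and page ranges are treated as 1-based (s is decremented before use). A hypothetical parser_config exercising all three:

    parser_config = {
        "layout_recognize": False,     # -> one task, no page splitting
        "task_page_size": 12,          # pages per task when layout recognition is on
        "pages": [(1, 100000)],        # 1-based start, converted to 0-based internally
    }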