KevinHuSh committed
Commit e6acaf6 · 1 Parent(s): 6224edc

Add Q&A and Book, fix task running bugs (#50)

api/apps/chunk_app.py CHANGED
@@ -18,10 +18,12 @@ import datetime
 from flask import request
 from flask_login import login_required, current_user
 from elasticsearch_dsl import Q
+
+from rag.app.qa import rmPrefix, beAdoc
 from rag.nlp import search, huqie, retrievaler
 from rag.utils import ELASTICSEARCH, rmSpace
-from api.db import LLMType
-from api.db.services.kb_service import KnowledgebaseService
+from api.db import LLMType, ParserType
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import TenantLLMService
 from api.db.services.user_service import UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
@@ -89,10 +91,8 @@ def get():
         res["chunk_id"] = id
         k = []
         for n in res.keys():
-            if re.search(r"(_vec$|_sm_)", n):
+            if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
                 k.append(n)
-            if re.search(r"(_tks|_ltks)", n):
-                res[n] = rmSpace(res[n])
         for n in k:
             del res[n]

@@ -106,12 +106,12 @@ def get():

 @manager.route('/set', methods=['POST'])
 @login_required
-@validate_request("doc_id", "chunk_id", "content_ltks",
+@validate_request("doc_id", "chunk_id", "content_with_weight",
                   "important_kwd")
 def set():
     req = request.json
     d = {"id": req["chunk_id"]}
-    d["content_ltks"] = huqie.qie(req["content_ltks"])
+    d["content_ltks"] = huqie.qie(req["content_with_weight"])
     d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
     d["important_kwd"] = req["important_kwd"]
     d["important_tks"] = huqie.qie(" ".join(req["important_kwd"]))
@@ -127,8 +127,15 @@ def set():
         e, doc = DocumentService.get_by_id(req["doc_id"])
         if not e:
             return get_data_error_result(retmsg="Document not found!")
+
+        if doc.parser_id == ParserType.QA:
+            arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t)>1]
+            if len(arr) != 2: return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.")
+            q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
+            d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q+a]))
+
         v, c = embd_mdl.encode([doc.name, req["content_ltks"]])
-        v = 0.1 * v[0] + 0.9 * v[1]
+        v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
         d["q_%d_vec" % len(v)] = v.tolist()
         ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
         return get_json_result(data=True)
api/apps/dialog_app.py CHANGED
@@ -18,7 +18,7 @@ from flask import request
 from flask_login import login_required, current_user
 from api.db.services.dialog_service import DialogService
 from api.db import StatusEnum
-from api.db.services.kb_service import KnowledgebaseService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.user_service import TenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid
api/apps/document_app.py CHANGED
@@ -27,10 +27,10 @@ from api.db.services.task_service import TaskService
 from rag.nlp import search
 from rag.utils import ELASTICSEARCH
 from api.db.services import duplicate_name
-from api.db.services.kb_service import KnowledgebaseService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid
-from api.db import FileType
+from api.db import FileType, TaskStatus
 from api.db.services.document_service import DocumentService
 from api.settings import RetCode
 from api.utils.api_utils import get_json_result
@@ -210,13 +210,12 @@ def rm():
 @manager.route('/run', methods=['POST'])
 @login_required
 @validate_request("doc_ids", "run")
-def rm():
+def run():
     req = request.json
     try:
         for id in req["doc_ids"]:
-            DocumentService.update_by_id(id, {"run": str(req["run"])})
-            if req["run"] == "2":
-                TaskService.filter_delete([Task.doc_id == id])
+            DocumentService.update_by_id(id, {"run": str(req["run"]), "progress": 0})
+            if str(req["run"]) == TaskStatus.CANCEL.value:
             tenant_id = DocumentService.get_tenant_id(id)
             if not tenant_id:
                 return get_data_error_result(retmsg="Tenant not found!")
@@ -284,12 +283,13 @@ def change_parser():
         if doc.parser_id.lower() == req["parser_id"].lower():
             return get_json_result(data=True)

-        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress":0, "progress_msg": "", "run": 1})
-        if not e:
-            return get_data_error_result(retmsg="Document not found!")
-        e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, doc.process_duation*-1)
+        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress":0, "progress_msg": ""})
         if not e:
             return get_data_error_result(retmsg="Document not found!")
+        if doc.token_num>0:
+            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, doc.process_duation*-1)
+            if not e:
+                return get_data_error_result(retmsg="Document not found!")

         return get_json_result(data=True)
     except Exception as e:
api/apps/kb_app.py CHANGED
@@ -21,7 +21,7 @@ from api.db.services.user_service import TenantService, UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid, get_format_time
 from api.db import StatusEnum, UserTenantRole
-from api.db.services.kb_service import KnowledgebaseService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.db_models import Knowledgebase
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result
api/apps/llm_app.py CHANGED
@@ -22,7 +22,7 @@ from api.db.services.user_service import TenantService, UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid, get_format_time
 from api.db import StatusEnum, UserTenantRole
-from api.db.services.kb_service import KnowledgebaseService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.db_models import Knowledgebase, TenantLLM
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result
api/db/__init__.py CHANGED
@@ -61,12 +61,19 @@ class ChatStyle(StrEnum):
     CUSTOM = 'Custom'


+class TaskStatus(StrEnum):
+    RUNNING = "1"
+    CANCEL = "2"
+    DONE = "3"
+    FAIL = "4"
+
+
 class ParserType(StrEnum):
     GENERAL = "general"
     PRESENTATION = "presentation"
     LAWS = "laws"
     MANUAL = "manual"
     PAPER = "paper"
-    RESUME = ""
-    BOOK = ""
-    QA = ""
+    RESUME = "resume"
+    BOOK = "book"
+    QA = "qa"
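The new TaskStatus values are what the rest of this commit compares a document's run field against (document_app.run, DocumentService.get_newly_uploaded, TaskService.do_cancel). A minimal sketch of that pattern, using a hypothetical in-memory record instead of the real peewee model:

from enum import Enum

# Stand-in for api.db.TaskStatus; plain str + Enum behaves like StrEnum for comparisons.
class TaskStatus(str, Enum):
    RUNNING = "1"
    CANCEL = "2"
    DONE = "3"
    FAIL = "4"

# Hypothetical document record; the real code reads Document.run through peewee.
doc = {"run": TaskStatus.CANCEL.value}

def is_cancelled(doc) -> bool:
    # Mirrors the idea in TaskService.do_cancel: a task counts as cancelled
    # when the owning document's run flag holds the CANCEL status.
    return doc["run"] == TaskStatus.CANCEL.value

assert is_cancelled(doc)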
api/db/db_utils.py CHANGED
@@ -33,8 +33,8 @@ def bulk_insert_into_db(model, data_source, replace_on_conflict=False):
     DB.create_tables([model])


-    for data in data_source:
-        current_time = current_timestamp()
+    for i,data in enumerate(data_source):
+        current_time = current_timestamp() + i
         current_date = timestamp_to_date(current_time)
         if 'create_time' not in data:
             data['create_time'] = current_time
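The "+ i" offset gives every row in a single bulk insert a distinct, increasing create_time, so later ordering by creation time (for example TaskService.query(..., order_by=Task.create_time)) preserves insertion order. A toy illustration with a stand-in timestamp helper, not the project's real current_timestamp:

import time

def current_timestamp() -> int:
    # Stand-in for the utils helper; milliseconds since the epoch.
    return int(time.time() * 1000)

data_source = [{"doc_id": "a"}, {"doc_id": "b"}, {"doc_id": "c"}]
for i, data in enumerate(data_source):
    # Each row gets a later create_time than the previous one in the same batch.
    data["create_time"] = current_timestamp() + i

assert [d["create_time"] for d in data_source] == sorted(d["create_time"] for d in data_source)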
api/db/services/document_service.py CHANGED
@@ -15,11 +15,11 @@
 #
 from peewee import Expression

-from api.db import TenantPermission, FileType
+from api.db import TenantPermission, FileType, TaskStatus
 from api.db.db_models import DB, Knowledgebase, Tenant
 from api.db.db_models import Document
 from api.db.services.common_service import CommonService
-from api.db.services.kb_service import KnowledgebaseService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db import StatusEnum

@@ -71,6 +71,7 @@ class DocumentService(CommonService):
                     ~(cls.model.type == FileType.VIRTUAL.value),
                     cls.model.progress == 0,
                     cls.model.update_time >= tm,
+                    cls.model.run == TaskStatus.RUNNING.value,
                     (Expression(cls.model.create_time, "%%", comm) == mod))\
                 .order_by(cls.model.update_time.asc())\
                 .paginate(1, items_per_page)
api/db/services/knowledgebase_service.py CHANGED
@@ -13,13 +13,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from api.db.db_models import Knowledgebase, Document
+from api.db import StatusEnum, TenantPermission
+from api.db.db_models import Knowledgebase, DB, Tenant
 from api.db.services.common_service import CommonService


 class KnowledgebaseService(CommonService):
     model = Knowledgebase

-
-class DocumentService(CommonService):
-    model = Document
+    @classmethod
+    @DB.connection_context()
+    def get_by_tenant_ids(cls, joined_tenant_ids, user_id,
+                          page_number, items_per_page, orderby, desc):
+        kbs = cls.model.select().where(
+            ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
+                                                            TenantPermission.TEAM.value)) | (cls.model.tenant_id == user_id))
+            & (cls.model.status == StatusEnum.VALID.value)
+        )
+        if desc:
+            kbs = kbs.order_by(cls.model.getter_by(orderby).desc())
+        else:
+            kbs = kbs.order_by(cls.model.getter_by(orderby).asc())
+
+        kbs = kbs.paginate(page_number, items_per_page)
+
+        return list(kbs.dicts())
+
+    @classmethod
+    @DB.connection_context()
+    def get_detail(cls, kb_id):
+        fields = [
+            cls.model.id,
+            Tenant.embd_id,
+            cls.model.avatar,
+            cls.model.name,
+            cls.model.description,
+            cls.model.permission,
+            cls.model.doc_num,
+            cls.model.token_num,
+            cls.model.chunk_num,
+            cls.model.parser_id]
+        kbs = cls.model.select(*fields).join(Tenant, on=((Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
+            (cls.model.id == kb_id),
+            (cls.model.status == StatusEnum.VALID.value)
+        )
+        if not kbs:
+            return
+        d = kbs[0].to_dict()
+        d["embd_id"] = kbs[0].tenant.embd_id
+        return d
api/db/services/task_service.py CHANGED
@@ -15,6 +15,7 @@
 #
 from peewee import Expression
 from api.db.db_models import DB
-from api.db import StatusEnum, FileType
+from api.db import StatusEnum, FileType, TaskStatus
 from api.db.db_models import Task, Document, Knowledgebase, Tenant
 from api.db.services.common_service import CommonService
+from api.db.services.document_service import DocumentService
@@ -46,8 +47,9 @@ class TaskService(CommonService):
     @classmethod
     @DB.connection_context()
     def do_cancel(cls, id):
         try:
-            cls.model.get_by_id(id)
-            return False
+            task = cls.model.get_by_id(id)
+            _, doc = DocumentService.get_by_id(task.doc_id)
+            return doc.run == TaskStatus.CANCEL.value
         except Exception as e:
             pass
         return True
api/utils/file_utils.py CHANGED
@@ -143,7 +143,7 @@ def filename_type(filename):
     if re.match(r".*\.pdf$", filename):
         return FileType.PDF.value

-    if re.match(r".*\.(docx|doc|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key|md)$", filename):
+    if re.match(r".*\.(docx|doc|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
         return FileType.DOC.value

     if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
rag/app/__init__.py CHANGED
@@ -4,14 +4,8 @@ from nltk import word_tokenize

 from rag.nlp import stemmer, huqie

-
-def callback__(progress, msg, func):
-    if not func :return
-    func(progress, msg)
-
-
 BULLET_PATTERN = [[
-    r"第[零一二三四五六七八九十百]+编",
+    r"第[零一二三四五六七八九十百]+(编|部分)",
     r"第[零一二三四五六七八九十百]+章",
     r"第[零一二三四五六七八九十百]+节",
     r"第[零一二三四五六七八九十百]+条",
@@ -22,6 +16,8 @@ BULLET_PATTERN = [[
     r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
     r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
 ], [
+    r"第[零一二三四五六七八九十百]+章",
+    r"第[零一二三四五六七八九十百]+节",
     r"[零一二三四五六七八九十百]+[ 、]",
     r"[\((][零一二三四五六七八九十百]+[\))]",
     r"[\((][0-9]{,2}[\))]",
@@ -54,7 +50,7 @@ def bullets_category(sections):
 def is_english(texts):
     eng = 0
     for t in texts:
-        if re.match(r"[a-zA-Z]", t.strip()):
+        if re.match(r"[a-zA-Z]{2,}", t.strip()):
             eng += 1
     if eng / len(texts) > 0.8:
         return True
@@ -70,3 +66,26 @@ def tokenize(d, t, eng):
     d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])


+def remove_contents_table(sections, eng=False):
+    i = 0
+    while i < len(sections):
+        def get(i):
+            nonlocal sections
+            return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
+        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], flags=re.IGNORECASE)):
+            i += 1
+            continue
+        sections.pop(i)
+        if i >= len(sections): break
+        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
+        while not prefix:
+            sections.pop(i)
+            if i >= len(sections): break
+            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
+        sections.pop(i)
+        if i >= len(sections) or not prefix: break
+        for j in range(i, min(i+128, len(sections))):
+            if not re.match(prefix, get(j)):
+                continue
+            for _ in range(i, j): sections.pop(i)
+            break
rag/app/book.py ADDED
@@ -0,0 +1,156 @@
+import copy
+import random
+import re
+from io import BytesIO
+from docx import Document
+import numpy as np
+from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table
+from rag.nlp import huqie
+from rag.parser.docx_parser import HuDocxParser
+from rag.parser.pdf_parser import HuParser
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback(0.1, "OCR finished")
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback(0.47, "Layout analysis finished")
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        callback(0.68, "Table analysis finished")
+        self._text_merge()
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward(concat_between_pages=False)
+        self._filter_forpages()
+        self._merge_with_same_bullet()
+        callback(0.75, "Text merging finished.")
+        tbls = self._extract_table_figure(True, zoomin, False)
+
+        callback(0.8, "Text extraction finished")
+
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    pdf_parser = None
+    sections, tbls = [], []
+    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        doc_parser = HuDocxParser()
+        # TODO: table of contents need to be removed
+        sections, tbls = doc_parser(binary if binary else filename)
+        remove_contents_table(sections, eng=is_english(random.choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        sections, tbls = pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)
+    elif re.search(r"\.txt$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = ""
+        if binary: txt = binary.decode("utf-8")
+        else:
+            with open(filename, "r") as f:
+                while True:
+                    l = f.readline()
+                    if not l: break
+                    txt += l
+        sections = txt.split("\n")
+        sections = [(l, "") for l in sections if l]
+        remove_contents_table(sections, eng=is_english(random.choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+
+    bull = bullets_category([t for t, _ in random.choices(sections, k=100)])
+    projs = [len(BULLET_PATTERN[bull]) + 1] * len(sections)
+    levels = [[] for _ in range(len(BULLET_PATTERN[bull]) + 2)]
+    for i, (txt, layout) in enumerate(sections):
+        for j, p in enumerate(BULLET_PATTERN[bull]):
+            if re.match(p, txt.strip()):
+                projs[i] = j
+                levels[j].append(i)
+                break
+        else:
+            if re.search(r"(title|head)", layout):
+                projs[i] = len(BULLET_PATTERN[bull])
+                levels[len(BULLET_PATTERN[bull])].append(i)
+            else:
+                levels[len(BULLET_PATTERN[bull]) + 1].append(i)
+    sections = [t for t, _ in sections]
+
+    def binary_search(arr, target):
+        if not arr: return -1
+        if target > arr[-1]: return len(arr) - 1
+        if target < arr[0]: return -1
+        s, e = 0, len(arr)
+        while e - s > 1:
+            i = (e + s) // 2
+            if target > arr[i]:
+                s = i
+                continue
+            elif target < arr[i]:
+                e = i
+                continue
+            else:
+                assert False
+        return s
+
+    cks = []
+    readed = [False] * len(sections)
+    levels = levels[::-1]
+    for i, arr in enumerate(levels):
+        for j in arr:
+            if readed[j]: continue
+            readed[j] = True
+            cks.append([j])
+            if i + 1 == len(levels) - 1: continue
+            for ii in range(i + 1, len(levels)):
+                jj = binary_search(levels[ii], j)
+                if jj < 0: break
+                if jj > cks[-1][-1]: cks[-1].pop(-1)
+                cks[-1].append(levels[ii][jj])
+
+    # is it English
+    eng = is_english(random.choices(sections, k=218))
+
+    res = []
+    # add tables
+    for img, rows in tbls:
+        bs = 10
+        de = ";" if eng else "；"
+        for i in range(0, len(rows), bs):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + bs])
+            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+    # wrap up to es documents
+    for ck in cks:
+        ck = [sections[i] for i in ck[::-1]]
+        print("\n-".join(ck))
+        ck = "\n".join(ck)
+        d = copy.deepcopy(doc)
+        if pdf_parser:
+            d["image"] = pdf_parser.crop(ck)
+            ck = pdf_parser.remove_tag(ck)
+        tokenize(d, ck, eng)
+        res.append(d)
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+    chunk(sys.argv[1])
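Like the other modules in rag/app, book.chunk is resolved through the FACTORY map in rag/svr/task_executor.py and driven with a progress callback. A rough sketch of that call pattern; the file name and the callback body here are illustrative, not taken from the commit:

from rag.app import book

def progress(prog=None, msg="Processing..."):
    # The executor's real callback also persists progress on the Task row;
    # here the values are only printed.
    print(prog, msg)

# chunk() accepts .docx, .pdf or .txt input and returns a list of dicts
# ready to be embedded and indexed into Elasticsearch.
chunks = book.chunk("sample_book.txt", callback=progress)
print(len(chunks))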
rag/app/laws.py CHANGED
@@ -3,7 +3,7 @@ import re
 from io import BytesIO
 from docx import Document
 import numpy as np
-from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize
 from rag.nlp import huqie
 from rag.parser.docx_parser import HuDocxParser
 from rag.parser.pdf_parser import HuParser
@@ -32,12 +32,12 @@ class Pdf(HuParser):
             zoomin,
             from_page,
             to_page)
-        callback__(0.1, "OCR finished", callback)
+        callback(0.1, "OCR finished")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_paddle(zoomin)
-        callback__(0.77, "Layout analysis finished", callback)
+        callback(0.77, "Layout analysis finished")
         print("paddle layouts:", timer()-start)
         bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
         # is it English
@@ -75,7 +75,7 @@ class Pdf(HuParser):
                 b["x1"] = max(b["x1"], b_["x1"])
                 bxs.pop(i + 1)

-        callback__(0.8, "Text extraction finished", callback)
+        callback(0.8, "Text extraction finished")

         return [b["text"] + self._line_tag(b, zoomin) for b in bxs]

@@ -89,17 +89,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     pdf_parser = None
     sections = []
     if re.search(r"\.docx?$", filename, re.IGNORECASE):
-        callback__(0.1, "Start to parse.", callback)
+        callback(0.1, "Start to parse.")
         for txt in Docx()(filename, binary):
             sections.append(txt)
-        callback__(0.8, "Finish parsing.", callback)
+        callback(0.8, "Finish parsing.")
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
         for txt in pdf_parser(filename if not binary else binary,
                               from_page=from_page, to_page=to_page, callback=callback):
             sections.append(txt)
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
-        callback__(0.1, "Start to parse.", callback)
+        callback(0.1, "Start to parse.")
         txt = ""
         if binary:txt = binary.decode("utf-8")
         else:
@@ -110,7 +110,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
                     txt += l
         sections = txt.split("\n")
         sections = [l for l in sections if l]
-        callback__(0.8, "Finish parsing.", callback)
+        callback(0.8, "Finish parsing.")
     else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

     # is it English
@@ -118,7 +118,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     # Remove 'Contents' part
     i = 0
     while i < len(sections):
-        if not re.match(r"(Contents|目录|目次)$", re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0])):
+        if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0], flags=re.IGNORECASE)):
             i += 1
             continue
         sections.pop(i)
@@ -133,7 +133,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
         for j in range(i, min(i+128, len(sections))):
             if not re.match(prefix, sections[j]):
                 continue
-            for k in range(i, j):sections.pop(i)
+            for _ in range(i, j):sections.pop(i)
             break

     bull = bullets_category(sections)
rag/app/manual.py CHANGED
@@ -1,6 +1,6 @@
 import copy
 import re
-from rag.app import callback__, tokenize
+from rag.app import tokenize
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 from rag.utils import num_tokens_from_string
@@ -14,19 +14,19 @@ class Pdf(HuParser):
             zoomin,
             from_page,
             to_page)
-        callback__(0.2, "OCR finished.", callback)
+        callback(0.2, "OCR finished.")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_paddle(zoomin)
-        callback__(0.5, "Layout analysis finished.", callback)
+        callback(0.5, "Layout analysis finished.")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback__(0.7, "Table analysis finished.", callback)
+        callback(0.7, "Table analysis finished.")
         self._text_merge()
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
-        callback__(0.77, "Text merging finished", callback)
+        callback(0.77, "Text merging finished")
         tbls = self._extract_table_figure(True, zoomin, False)

         # clean mess
@@ -34,20 +34,8 @@ class Pdf(HuParser):
             b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())

         # merge chunks with the same bullets
-        i = 0
-        while i + 1 < len(self.boxes):
-            b = self.boxes[i]
-            b_ = self.boxes[i + 1]
-            if b["text"].strip()[0] != b_["text"].strip()[0] \
-                    or b["page_number"]!=b_["page_number"] \
-                    or b["top"] > b_["bottom"]:
-                i += 1
-                continue
-            b_["text"] = b["text"] + "\n" + b_["text"]
-            b_["x0"] = min(b["x0"], b_["x0"])
-            b_["x1"] = max(b["x1"], b_["x1"])
-            b_["top"] = b["top"]
-            self.boxes.pop(i)
+        self._merge_with_same_bullet()
+
         # merge title with decent chunk
         i = 0
         while i + 1 < len(self.boxes):
@@ -62,7 +50,7 @@ class Pdf(HuParser):
                 b_["top"] = b["top"]
                 self.boxes.pop(i)

-        callback__(0.8, "Parsing finished", callback)
+        callback(0.8, "Parsing finished")
         for b in self.boxes: print(b["text"], b.get("layoutno"))

         print(tbls)
rag/app/paper.py CHANGED
@@ -1,11 +1,9 @@
 import copy
 import re
 from collections import Counter
-from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
-from rag.nlp import huqie, stemmer
-from rag.parser.docx_parser import HuDocxParser
+from rag.app import tokenize
+from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
-from nltk.tokenize import word_tokenize
 import numpy as np
 from rag.utils import num_tokens_from_string

@@ -18,20 +16,20 @@ class Pdf(HuParser):
             zoomin,
             from_page,
             to_page)
-        callback__(0.2, "OCR finished.", callback)
+        callback(0.2, "OCR finished.")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_paddle(zoomin)
-        callback__(0.47, "Layout analysis finished", callback)
+        callback(0.47, "Layout analysis finished")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback__(0.68, "Table analysis finished", callback)
+        callback(0.68, "Table analysis finished")
         self._text_merge()
         column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
-        callback__(0.75, "Text merging finished.", callback)
+        callback(0.75, "Text merging finished.")
         tbls = self._extract_table_figure(True, zoomin, False)

         # clean mess
@@ -101,7 +99,7 @@ class Pdf(HuParser):
                 break
         if not abstr: i = 0

-        callback__(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
         for b in self.boxes: print(b["text"], b.get("layoutno"))
         print(tbls)
rag/app/presentation.py CHANGED
@@ -3,7 +3,7 @@ import re
 from io import BytesIO
 from pptx import Presentation

-from rag.app import callback__, tokenize, is_english
+from rag.app import tokenize, is_english
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser

@@ -43,7 +43,7 @@ class Ppt(object):
                 if txt: texts.append(txt)
             txts.append("\n".join(texts))

-        callback__(0.5, "Text extraction finished.", callback)
+        callback(0.5, "Text extraction finished.")
         import aspose.slides as slides
         import aspose.pydrawing as drawing
         imgs = []
@@ -53,7 +53,7 @@ class Ppt(object):
                 slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
                 imgs.append(buffered.getvalue())
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
-        callback__(0.9, "Image extraction finished", callback)
+        callback(0.9, "Image extraction finished")
         self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]

@@ -70,7 +70,7 @@ class Pdf(HuParser):

     def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
         self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
-        callback__(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
         res = []
         #################### More precisely ###################
@@ -89,7 +89,7 @@ class Pdf(HuParser):
         for i in range(len(self.boxes)):
             lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
             res.append((lines, self.page_images[i]))
-        callback__(0.9, "Page {}~{}: Parsing finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback(0.9, "Page {}~{}: Parsing finished".format(from_page, min(to_page, self.total_page)))
         return res

rag/app/qa.py ADDED
@@ -0,0 +1,104 @@
+import random
+import re
+from io import BytesIO
+from nltk import word_tokenize
+from openpyxl import load_workbook
+from rag.app import is_english
+from rag.nlp import huqie, stemmer
+
+
+class Excel(object):
+    def __call__(self, fnm, binary=None, callback=None):
+        if not binary:
+            wb = load_workbook(fnm)
+        else:
+            wb = load_workbook(BytesIO(binary))
+        total = 0
+        for sheetname in wb.sheetnames:
+            total += len(list(wb[sheetname].rows))
+
+        res, fails = [], []
+        for sheetname in wb.sheetnames:
+            ws = wb[sheetname]
+            rows = list(ws.rows)
+            for i, r in enumerate(rows):
+                q, a = "", ""
+                for cell in r:
+                    if not cell.value: continue
+                    if not q: q = str(cell.value)
+                    elif not a: a = str(cell.value)
+                    else: break
+                if q and a: res.append((q, a))
+                else: fails.append(str(i+1))
+                if len(res) % 999 == 0:
+                    callback(len(res)*0.6/total, ("Extract Q&A: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..."%(",".join(fails[:3])) if fails else "")))
+
+        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
+            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        self.is_english = is_english([rmPrefix(q) for q, _ in random.choices(res, k=30) if len(q) > 1])
+        return res
+
+
+def rmPrefix(txt):
+    return re.sub(r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:： ]+", "", txt.strip(), flags=re.IGNORECASE)
+
+
+def beAdoc(d, q, a, eng):
+    qprefix = "Question: " if eng else "问题："
+    aprefix = "Answer: " if eng else "回答："
+    d["content_with_weight"] = "\t".join([qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
+    if eng:
+        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(q)])
+    else:
+        d["content_ltks"] = huqie.qie(q)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    return d
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+
+    res = []
+    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = Excel()
+        for q, a in excel_parser(filename, binary, callback):
+            res.append(beAdoc({}, q, a, excel_parser.is_english))
+        return res
+    elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = ""
+        if binary:
+            txt = binary.decode("utf-8")
+        else:
+            with open(filename, "r") as f:
+                while True:
+                    l = f.readline()
+                    if not l: break
+                    txt += l
+        lines = txt.split("\n")
+        eng = is_english([rmPrefix(l) for l in lines[:100]])
+        fails = []
+        for i, line in enumerate(lines):
+            arr = [l for l in line.split("\t") if len(l) > 1]
+            if len(arr) != 2:
+                fails.append(str(i))
+                continue
+            res.append(beAdoc({}, arr[0], arr[1], eng))
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
+                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+
+        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
+            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+
+        return res
+
+    raise NotImplementedError("file type not supported yet(xlsx, txt, csv supported)")
+
+
+if __name__ == "__main__":
+    import sys
+    def kk(rat, ss):
+        pass
+    print(chunk(sys.argv[1], callback=kk))
+
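The Q&A chunker expects one question/answer pair per record: two cells per spreadsheet row, or a question and an answer separated by a TAB on each line of a txt/csv file (the same rule chunk_app.set enforces with "Q&A must be separated by TAB/ENTER key."). A small illustrative input, not taken from the commit:

# Hypothetical TAB-separated lines as they would appear in an uploaded .txt/.csv.
lines = [
    "Q: What does the qa parser produce?\tA: One chunk per question/answer pair.",
    "问:这种文件里问答用什么分隔?\t答:同一行内用TAB分隔。",
]

for line in lines:
    q, a = line.split("\t")
    # beAdoc() above strips the "Q:/A:/问:/答:" prefixes and stores the pair
    # as "Question: ...\tAnswer: ..." (or the Chinese equivalents) in content_with_weight.
    print(q, "->", a)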
rag/parser/pdf_parser.py CHANGED
@@ -763,7 +763,7 @@ class HuParser:
             return
         i = 0
         while i < len(self.boxes):
-            if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
+            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                 i += 1
                 continue
             eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
@@ -782,6 +782,22 @@ class HuParser:
                 for k in range(i, j): self.boxes.pop(i)
                 break

+    def _merge_with_same_bullet(self):
+        i = 0
+        while i + 1 < len(self.boxes):
+            b = self.boxes[i]
+            b_ = self.boxes[i + 1]
+            if b["text"].strip()[0] != b_["text"].strip()[0] \
+                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
+                    or b["top"] > b_["bottom"]:
+                i += 1
+                continue
+            b_["text"] = b["text"] + "\n" + b_["text"]
+            b_["x0"] = min(b["x0"], b_["x0"])
+            b_["x1"] = max(b["x1"], b_["x1"])
+            b_["top"] = b["top"]
+            self.boxes.pop(i)
+
     def _blockType(self, b):
         patt = [
             ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
rag/svr/task_broker.py CHANGED
@@ -17,4 +17,4 @@
 import os
 import time
 import random
-from timeit import default_timer as timer
+from datetime import datetime
@@ -26,4 +26,4 @@
 from rag.utils import MINIO
 from rag.utils import findMaxTm
 import pandas as pd
-from api.db import FileType
+from api.db import FileType, TaskStatus
@@ -104,15 +104,23 @@ def update_progress():
             msg = []
             prg = 0
             finished = True
             bad = 0
+            status = TaskStatus.RUNNING.value
             for t in tsks:
                 if 0 <= t.progress < 1: finished = False
                 prg += t.progress if t.progress >= 0 else 0
                 msg.append(t.progress_msg)
                 if t.progress == -1: bad += 1
             prg /= len(tsks)
-            if finished and bad: prg = -1
+            if finished and bad:
+                prg = -1
+                status = TaskStatus.FAIL.value
+            elif finished: status = TaskStatus.DONE.value
+
             msg = "\n".join(msg)
-            DocumentService.update_by_id(d["id"], {"progress": prg, "progress_msg": msg, "process_duation": timer()-d["process_begin_at"].timestamp()})
+            info = {"process_duation": datetime.timestamp(datetime.now())-d["process_begin_at"].timestamp(), "run": status}
+            if prg !=0 : info["progress"] = prg
+            if msg: info["progress_msg"] = msg
+            DocumentService.update_by_id(d["id"], info)
         except Exception as e:
             cron_logger.error("fetch task exception:" + str(e))
rag/svr/task_executor.py CHANGED
@@ -24,8 +24,9 @@ import sys
 from functools import partial
 from timeit import default_timer as timer

+from elasticsearch_dsl import Q
+
 from api.db.services.task_service import TaskService
-from rag.llm import EmbeddingModel, CvModel
 from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
 from rag.utils import ELASTICSEARCH
 from rag.utils import MINIO
@@ -35,7 +36,7 @@ from rag.nlp import search
 from io import BytesIO
 import pandas as pd

-from rag.app import laws, paper, presentation, manual
+from rag.app import laws, paper, presentation, manual, qa

 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -51,13 +52,14 @@ FACTORY = {
     ParserType.PRESENTATION.value: presentation,
     ParserType.MANUAL.value: manual,
     ParserType.LAWS.value: laws,
+    ParserType.QA.value: qa,
 }


 def set_progress(task_id, from_page, to_page, prog=None, msg="Processing..."):
     cancel = TaskService.do_cancel(task_id)
     if cancel:
-        msg = "Canceled."
+        msg += " [Canceled]"
         prog = -1

     if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg
@@ -166,13 +168,16 @@ def init_kb(row):


 def embedding(docs, mdl):
-    tts, cnts = [d["docnm_kwd"] for d in docs], [d["content_with_weight"] for d in docs]
+    tts, cnts = [d["docnm_kwd"] for d in docs if d.get("docnm_kwd")], [d["content_with_weight"] for d in docs]
     tk_count = 0
-    tts, c = mdl.encode(tts)
-    tk_count += c
+    if len(tts) == len(cnts):
+        tts, c = mdl.encode(tts)
+        tk_count += c
+
     cnts, c = mdl.encode(cnts)
     tk_count += c
-    vects = 0.1 * tts + 0.9 * cnts
+    vects = (0.1 * tts + 0.9 * cnts) if len(tts) == len(cnts) else cnts
+
     assert len(vects) == len(docs)
     for i, d in enumerate(docs):
         v = vects[i].tolist()
@@ -215,12 +220,14 @@ def main(comm, mod):
         callback(msg="Finished embedding! Start to build index!")
         init_kb(r)
         chunk_count = len(set([c["_id"] for c in cks]))
-        callback(1., "Done!")
         es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
         if es_r:
            callback(-1, "Index failure!")
            cron_logger.error(str(es_r))
         else:
+            if TaskService.do_cancel(r["id"]):
+                ELASTICSEARCH.deleteByQuery(Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
+            callback(1., "Done!")
             DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
             cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks)))