Kevin Hu committed
Commit 77dc93a · 1 Parent(s): d2db126

fix mind map bug (#1934)
### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- api/apps/document_app.py +12 -6
- api/db/services/api_service.py +1 -0
- graphrag/mind_map_extractor.py +1 -1
- rag/app/naive.py +1 -2
api/apps/document_app.py
CHANGED
@@ -452,7 +452,7 @@ def get_image(image_id):
 @login_required
 @validate_request("conversation_id")
 def upload_and_parse():
-
+    from rag.app import presentation, picture, naive, audio, email
     if 'file' not in request.files:
         return get_json_result(
             data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
@@ -463,7 +463,7 @@ def upload_and_parse():
         return get_json_result(
             data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)

-    e, conv = ConversationService.get_by_id(
+    e, conv = ConversationService.get_by_id(request.form.get("conversation_id"))
     if not e:
         return get_data_error_result(retmsg="Conversation not found!")
     e, dia = DialogService.get_by_id(conv.dialog_id)
@@ -487,6 +487,12 @@ def upload_and_parse():
     def dummy(prog=None, msg=""):
         pass

+    FACTORY = {
+        ParserType.PRESENTATION.value: presentation,
+        ParserType.PICTURE.value: picture,
+        ParserType.AUDIO.value: audio,
+        ParserType.EMAIL.value: email
+    }
     parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
     exe = ThreadPoolExecutor(max_workers=12)
     threads = []
@@ -497,7 +503,7 @@ def upload_and_parse():
             "from_page": 0,
             "to_page": 100000
         }
-        threads.append(exe.submit(naive.chunk, d["name"], blob, **kwargs))
+        threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs))

     for (docinfo,_), th in zip(files, threads):
         docs = []
@@ -550,7 +556,7 @@ def upload_and_parse():
     for doc_id in docids:
         cks = [c for c in docs if c["doc_id"] == doc_id]

-        if parser_ids[doc_id] != ParserType.PICTURE.value:
+        if False and parser_ids[doc_id] != ParserType.PICTURE.value:
            mindmap = MindMapExtractor(llm_bdl)
            try:
                mind_map = json.dumps(mindmap([c["content_with_weight"] for c in docs if c["doc_id"] == doc_id]).output, ensure_ascii=False, indent=2)
@@ -564,7 +570,7 @@ def upload_and_parse():
            except Exception as e:
                stat_logger.error("Mind map generation error:", traceback.format_exc())

-        vects = embedding(doc_id, cks)
+        vects = embedding(doc_id, [c["content_with_weight"] for c in cks])
         assert len(cks) == len(vects)
         for i, d in enumerate(cks):
             v = vects[i]
@@ -575,4 +581,4 @@ def upload_and_parse():
         DocumentService.increment_chunk_num(
             doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)

-    return get_json_result(data=[d["id"] for d in files])
+    return get_json_result(data=[d["id"] for d,_ in files])
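The main behavioural change in this file is that `upload_and_parse()` no longer funnels every file through `naive.chunk`: a `FACTORY` dict maps a document's `parser_id` to the matching parser module (presentation, picture, audio, email), with `naive` as the fallback, and the mind-map branch is temporarily short-circuited with `if False and ...`. The embedding call also now passes the chunk texts (`content_with_weight`) rather than the chunk dicts. A minimal sketch of the dispatch-with-fallback pattern, with placeholder chunkers and a hypothetical `submit_all` helper rather than RAGFlow's actual modules:

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Dict, List, Tuple

def naive_chunk(name: str, blob: bytes, **kwargs) -> List[str]:
    # default chunker: naive split on blank lines (placeholder logic)
    return [p for p in blob.decode(errors="ignore").split("\n\n") if p.strip()]

def picture_chunk(name: str, blob: bytes, **kwargs) -> List[str]:
    # placeholder for an image-specific chunker
    return [f"<image:{name}>"]

# dispatch table keyed by parser id, mirroring the FACTORY dict in the diff
FACTORY: Dict[str, Callable[..., List[str]]] = {
    "picture": picture_chunk,
}

def submit_all(files: List[Tuple[dict, bytes]]) -> List[List[str]]:
    # files: list of ({"name": ..., "parser_id": ...}, blob) pairs
    with ThreadPoolExecutor(max_workers=4) as exe:
        threads = []
        for d, blob in files:
            # pick the chunker for this parser id, falling back to the default
            chunker = FACTORY.get(d["parser_id"], naive_chunk)
            threads.append(exe.submit(chunker, d["name"], blob))
        return [t.result() for t in threads]

if __name__ == "__main__":
    out = submit_all([({"name": "a.txt", "parser_id": "naive"}, b"hello\n\nworld"),
                      ({"name": "b.png", "parser_id": "picture"}, b"\x89PNG")])
    print(out)  # [['hello', 'world'], ['<image:b.png>']]
```
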
api/db/services/api_service.py
CHANGED
@@ -46,6 +46,7 @@ class API4ConversationService(CommonService):
     @classmethod
     @DB.connection_context()
     def stats(cls, tenant_id, from_date, to_date, source=None):
+        if len(to_date) == 10: to_date += " 23:59:59"
         return cls.model.select(
             cls.model.create_date.truncate("day").alias("dt"),
             peewee.fn.COUNT(
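The one-line change in `stats()` widens a date-only `to_date` (length 10, e.g. `2024-08-01`) to the end of that day, so records created after midnight on the last day of the range are no longer excluded by the upper-bound filter. A sketch of the same normalisation in isolation (the helper name is illustrative, not part of the service):

```python
def normalize_to_date(to_date: str) -> str:
    # a bare "YYYY-MM-DD" upper bound compares as midnight, which would drop
    # everything created later that day; extend it to the end of the day
    if len(to_date) == 10:
        to_date += " 23:59:59"
    return to_date

assert normalize_to_date("2024-08-01") == "2024-08-01 23:59:59"
assert normalize_to_date("2024-08-01 12:00:00") == "2024-08-01 12:00:00"
```
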
graphrag/mind_map_extractor.py
CHANGED
@@ -113,7 +113,7 @@ class MindMapExtractor:
                               "children": [{"id": self._key(k), "children": self._be_children(v, keyset)} for k, v in
                                            merge_json.items() if isinstance(v, dict) and self._key(k)]}
             else:
-                k = self._key(list(
+                k = self._key(list(merge_json.keys())[0])
                 merge_json = {"id": k, "children": self._be_children(list(merge_json.items())[0][1], set([k]))}

         except Exception as e:
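This hunk changes how the root node is chosen when the merged mind map has exactly one top-level key: the new code takes that first key of `merge_json`, cleans it with `_key`, and expands its value into the children of the root. A simplified, self-contained sketch of that single-root collapse, using stand-ins for `_key` and `_be_children` rather than the extractor's real helpers:

```python
import re

def _key(k: str) -> str:
    # simplified stand-in for MindMapExtractor._key: strip markdown asterisks
    return re.sub(r"\*+", "", k).strip()

def _be_children(v, keyset) -> list:
    # simplified stand-in for MindMapExtractor._be_children
    if isinstance(v, dict):
        return [{"id": _key(k), "children": _be_children(c, keyset)} for k, c in v.items()]
    return []

merge_json = {"**Root topic**": {"A": {}, "B": {"B1": {}}}}
# single top-level key: it becomes the root id, its value becomes the children
k = _key(list(merge_json.keys())[0])
tree = {"id": k, "children": _be_children(list(merge_json.items())[0][1], {k})}
print(tree["id"])                           # Root topic
print([c["id"] for c in tree["children"]])  # ['A', 'B']
```
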
rag/app/naive.py
CHANGED
@@ -61,9 +61,8 @@ class Docx(DocxParser):
             if pn > to_page:
                 break
             if from_page <= pn < to_page:
-                current_image = None
                 if p.text.strip():
-                    if p.style.name == 'Caption':
+                    if p.style and p.style.name == 'Caption':
                         former_image = None
                         if lines and lines[-1][1] and lines[-1][2] != 'Caption':
                             former_image = lines[-1][1].pop()
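The Docx change guards the caption check so that `p.style.name` is only read when a style object is actually present, apparently to avoid an `AttributeError` when a paragraph's style cannot be resolved; it also drops the `current_image = None` reset at the top of the page-range branch. A small sketch of the guarded check, with a hypothetical `is_caption` helper and fake paragraph objects standing in for python-docx ones:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Style:
    name: str

@dataclass
class Para:
    # minimal stand-in for a python-docx Paragraph
    text: str
    style: Optional[Style]

def is_caption(p: Para) -> bool:
    # mirrors the guarded check from the diff: only touch style.name
    # when the paragraph has text and a resolvable style object
    return bool(p.text.strip()) and bool(p.style) and p.style.name == 'Caption'

assert is_caption(Para("Figure 1. Results", Style("Caption")))
assert not is_caption(Para("Body text", Style("Normal")))
assert not is_caption(Para("Figure 2", None))  # unguarded code raised AttributeError here
```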