Kevin Hu committed
Commit 77dc93a · 1 Parent(s): d2db126

fix mind map bug (#1934)
### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- api/apps/document_app.py +12 -6
- api/db/services/api_service.py +1 -0
- graphrag/mind_map_extractor.py +1 -1
- rag/app/naive.py +1 -2
api/apps/document_app.py
CHANGED
@@ -452,7 +452,7 @@ def get_image(image_id):
 @login_required
 @validate_request("conversation_id")
 def upload_and_parse():
-
+    from rag.app import presentation, picture, naive, audio, email
     if 'file' not in request.files:
         return get_json_result(
             data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
@@ -463,7 +463,7 @@ def upload_and_parse():
         return get_json_result(
             data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)

-    e, conv = ConversationService.get_by_id(
+    e, conv = ConversationService.get_by_id(request.form.get("conversation_id"))
     if not e:
         return get_data_error_result(retmsg="Conversation not found!")
     e, dia = DialogService.get_by_id(conv.dialog_id)
@@ -487,6 +487,12 @@ def upload_and_parse():
     def dummy(prog=None, msg=""):
         pass

+    FACTORY = {
+        ParserType.PRESENTATION.value: presentation,
+        ParserType.PICTURE.value: picture,
+        ParserType.AUDIO.value: audio,
+        ParserType.EMAIL.value: email
+    }
     parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
     exe = ThreadPoolExecutor(max_workers=12)
     threads = []
@@ -497,7 +503,7 @@ def upload_and_parse():
             "from_page": 0,
             "to_page": 100000
         }
-        threads.append(exe.submit(naive.chunk, d["name"], blob, **kwargs))
+        threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs))

     for (docinfo,_), th in zip(files, threads):
         docs = []
@@ -550,7 +556,7 @@ def upload_and_parse():
     for doc_id in docids:
         cks = [c for c in docs if c["doc_id"] == doc_id]

-        if parser_ids[doc_id] != ParserType.PICTURE.value:
+        if False and parser_ids[doc_id] != ParserType.PICTURE.value:
            mindmap = MindMapExtractor(llm_bdl)
            try:
                mind_map = json.dumps(mindmap([c["content_with_weight"] for c in docs if c["doc_id"] == doc_id]).output, ensure_ascii=False, indent=2)
@@ -564,7 +570,7 @@ def upload_and_parse():
            except Exception as e:
                stat_logger.error("Mind map generation error:", traceback.format_exc())

-        vects = embedding(doc_id, cks)
+        vects = embedding(doc_id, [c["content_with_weight"] for c in cks])
         assert len(cks) == len(vects)
         for i, d in enumerate(cks):
             v = vects[i]
@@ -575,4 +581,4 @@ def upload_and_parse():
         DocumentService.increment_chunk_num(
             doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)

-    return get_json_result(data=[d["id"] for d in files])
+    return get_json_result(data=[d["id"] for d,_ in files])
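The main behavioural change in this file is that `upload_and_parse()` no longer funnels every file through `naive.chunk`: a `FACTORY` dict maps a document's `parser_id` to the matching parser module (presentation, picture, audio, email), with `naive` as the fallback, and the mind-map branch is temporarily short-circuited with `if False and ...`. The embedding call also now passes the chunk texts (`content_with_weight`) rather than the chunk dicts. A minimal sketch of the dispatch-with-fallback pattern, with placeholder chunkers and a hypothetical `submit_all` helper rather than RAGFlow's actual modules:

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Dict, List, Tuple

def naive_chunk(name: str, blob: bytes, **kwargs) -> List[str]:
    # default chunker: naive split on blank lines (placeholder logic)
    return [p for p in blob.decode(errors="ignore").split("\n\n") if p.strip()]

def picture_chunk(name: str, blob: bytes, **kwargs) -> List[str]:
    # placeholder for an image-specific chunker
    return [f"<image:{name}>"]

# dispatch table keyed by parser id, mirroring the FACTORY dict in the diff
FACTORY: Dict[str, Callable[..., List[str]]] = {
    "picture": picture_chunk,
}

def submit_all(files: List[Tuple[dict, bytes]]) -> List[List[str]]:
    # files: list of ({"name": ..., "parser_id": ...}, blob) pairs
    with ThreadPoolExecutor(max_workers=4) as exe:
        threads = []
        for d, blob in files:
            # pick the chunker for this parser id, falling back to the default
            chunker = FACTORY.get(d["parser_id"], naive_chunk)
            threads.append(exe.submit(chunker, d["name"], blob))
        return [t.result() for t in threads]

if __name__ == "__main__":
    out = submit_all([({"name": "a.txt", "parser_id": "naive"}, b"hello\n\nworld"),
                      ({"name": "b.png", "parser_id": "picture"}, b"\x89PNG")])
    print(out)  # [['hello', 'world'], ['<image:b.png>']]
```
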
api/db/services/api_service.py
CHANGED
@@ -46,6 +46,7 @@ class API4ConversationService(CommonService):
     @classmethod
     @DB.connection_context()
     def stats(cls, tenant_id, from_date, to_date, source=None):
+        if len(to_date) == 10: to_date += " 23:59:59"
         return cls.model.select(
             cls.model.create_date.truncate("day").alias("dt"),
             peewee.fn.COUNT(
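The one-line change in `stats()` widens a date-only `to_date` (length 10, e.g. `2024-08-01`) to the end of that day, so records created after midnight on the last day of the range are no longer excluded by the upper-bound filter. A sketch of the same normalisation in isolation (the helper name is illustrative, not part of the service):

```python
def normalize_to_date(to_date: str) -> str:
    # a bare "YYYY-MM-DD" upper bound compares as midnight, which would drop
    # everything created later that day; extend it to the end of the day
    if len(to_date) == 10:
        to_date += " 23:59:59"
    return to_date

assert normalize_to_date("2024-08-01") == "2024-08-01 23:59:59"
assert normalize_to_date("2024-08-01 12:00:00") == "2024-08-01 12:00:00"
```
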
graphrag/mind_map_extractor.py
CHANGED
@@ -113,7 +113,7 @@ class MindMapExtractor:
                               "children": [{"id": self._key(k), "children": self._be_children(v, keyset)} for k, v in
                                            merge_json.items() if isinstance(v, dict) and self._key(k)]}
             else:
-                k = self._key(list(
+                k = self._key(list(merge_json.keys())[0])
                 merge_json = {"id": k, "children": self._be_children(list(merge_json.items())[0][1], set([k]))}

         except Exception as e:
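This hunk changes how the root node is chosen when the merged mind map has exactly one top-level key: the new code takes that first key of `merge_json`, cleans it with `_key`, and expands its value into the children of the root. A simplified, self-contained sketch of that single-root collapse, using stand-ins for `_key` and `_be_children` rather than the extractor's real helpers:

```python
import re

def _key(k: str) -> str:
    # simplified stand-in for MindMapExtractor._key: strip markdown asterisks
    return re.sub(r"\*+", "", k).strip()

def _be_children(v, keyset) -> list:
    # simplified stand-in for MindMapExtractor._be_children
    if isinstance(v, dict):
        return [{"id": _key(k), "children": _be_children(c, keyset)} for k, c in v.items()]
    return []

merge_json = {"**Root topic**": {"A": {}, "B": {"B1": {}}}}
# single top-level key: it becomes the root id, its value becomes the children
k = _key(list(merge_json.keys())[0])
tree = {"id": k, "children": _be_children(list(merge_json.items())[0][1], {k})}
print(tree["id"])                           # Root topic
print([c["id"] for c in tree["children"]])  # ['A', 'B']
```
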
rag/app/naive.py
CHANGED
@@ -61,9 +61,8 @@ class Docx(DocxParser):
             if pn > to_page:
                 break
             if from_page <= pn < to_page:
-                current_image = None
                 if p.text.strip():
-                    if p.style.name == 'Caption':
+                    if p.style and p.style.name == 'Caption':
                         former_image = None
                         if lines and lines[-1][1] and lines[-1][2] != 'Caption':
                             former_image = lines[-1][1].pop()
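The Docx change guards the caption check so that `p.style.name` is only read when a style object is actually present, apparently to avoid an `AttributeError` when a paragraph's style cannot be resolved; it also drops the `current_image = None` reset at the top of the page-range branch. A small sketch of the guarded check, with a hypothetical `is_caption` helper and fake paragraph objects standing in for python-docx ones:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Style:
    name: str

@dataclass
class Para:
    # minimal stand-in for a python-docx Paragraph
    text: str
    style: Optional[Style]

def is_caption(p: Para) -> bool:
    # mirrors the guarded check from the diff: only touch style.name
    # when the paragraph has text and a resolvable style object
    return bool(p.text.strip()) and bool(p.style) and p.style.name == 'Caption'

assert is_caption(Para("Figure 1. Results", Style("Caption")))
assert not is_caption(Para("Body text", Style("Normal")))
assert not is_caption(Para("Figure 2", None))  # unguarded code raised AttributeError here
```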