Kevin Hu
committed on
Commit
·
b6ce919
1
Parent(s):
c530565
refine mindmap (#1817)
Browse files

### What problem does this PR solve?
#1594
### Type of change
- [x] Refactoring
- graphrag/index.py +8 -1
- graphrag/mind_map_extractor.py +20 -4
- graphrag/mind_map_prompt.py +11 -19
graphrag/index.py
CHANGED
@@ -21,6 +21,7 @@ from typing import List
|
|
21 |
import networkx as nx
|
22 |
from api.db import LLMType
|
23 |
from api.db.services.llm_service import LLMBundle
|
|
|
24 |
from graphrag.community_reports_extractor import CommunityReportsExtractor
|
25 |
from graphrag.entity_resolution import EntityResolution
|
26 |
from graphrag.graph_extractor import GraphExtractor
|
@@ -30,6 +31,11 @@ from rag.utils import num_tokens_from_string
|
|
30 |
|
31 |
|
32 |
def be_children(obj: dict, keyset:set):
|
|
|
|
|
|
|
|
|
|
|
33 |
arr = []
|
34 |
for k,v in obj.items():
|
35 |
k = re.sub(r"\*+", "", k)
|
@@ -65,7 +71,8 @@ def graph_merge(g1, g2):
|
|
65 |
|
66 |
|
67 |
def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]):
|
68 |
-
|
|
|
69 |
ext = GraphExtractor(llm_bdl)
|
70 |
left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
|
71 |
left_token_count = max(llm_bdl.max_length * 0.8, left_token_count)
|
|
|
21 |
import networkx as nx
|
22 |
from api.db import LLMType
|
23 |
from api.db.services.llm_service import LLMBundle
|
24 |
+
from api.db.services.user_service import TenantService
|
25 |
from graphrag.community_reports_extractor import CommunityReportsExtractor
|
26 |
from graphrag.entity_resolution import EntityResolution
|
27 |
from graphrag.graph_extractor import GraphExtractor
|
|
|
31 |
|
32 |
|
33 |
def be_children(obj: dict, keyset:set):
|
34 |
+
if isinstance(obj, str):
|
35 |
+
obj = [obj]
|
36 |
+
if isinstance(obj, list):
|
37 |
+
for i in obj: keyset.add(i)
|
38 |
+
return [{"id": i, "children":[]} for i in obj]
|
39 |
arr = []
|
40 |
for k,v in obj.items():
|
41 |
k = re.sub(r"\*+", "", k)
|
|
|
71 |
|
72 |
|
73 |
def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]):
|
74 |
+
_, tenant = TenantService.get_by_id(tenant_id)
|
75 |
+
llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id)
|
76 |
ext = GraphExtractor(llm_bdl)
|
77 |
left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
|
78 |
left_token_count = max(llm_bdl.max_length * 0.8, left_token_count)
|
graphrag/mind_map_extractor.py
CHANGED
@@ -13,7 +13,9 @@
|
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License.
|
15 |
#
|
16 |
-
|
|
|
|
|
17 |
import logging
|
18 |
import traceback
|
19 |
from concurrent.futures import ThreadPoolExecutor
|
@@ -65,7 +67,7 @@ class MindMapExtractor:
|
|
65 |
try:
|
66 |
exe = ThreadPoolExecutor(max_workers=12)
|
67 |
threads = []
|
68 |
-
token_count = self._llm.max_length * 0.
|
69 |
texts = []
|
70 |
res = []
|
71 |
cnt = 0
|
@@ -122,6 +124,19 @@ class MindMapExtractor:
|
|
122 |
continue
|
123 |
return data
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
def _process_document(
|
126 |
self, text: str, prompt_variables: dict[str, str]
|
127 |
) -> str:
|
@@ -132,6 +147,7 @@ class MindMapExtractor:
|
|
132 |
text = perform_variable_replacements(self._mind_map_prompt, variables=variables)
|
133 |
gen_conf = {"temperature": 0.5}
|
134 |
response = self._llm.chat(text, [], gen_conf)
|
|
|
135 |
print(response)
|
136 |
-
print("---------------------------------------------------\n", markdown_to_json.dictify(response))
|
137 |
-
return
|
|
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License.
|
15 |
#
|
16 |
+
import collections
|
17 |
+
import logging
|
18 |
+
import re
|
19 |
import logging
|
20 |
import traceback
|
21 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
67 |
try:
|
68 |
exe = ThreadPoolExecutor(max_workers=12)
|
69 |
threads = []
|
70 |
+
token_count = max(self._llm.max_length * 0.8, self._llm.max_length-512)
|
71 |
texts = []
|
72 |
res = []
|
73 |
cnt = 0
|
|
|
124 |
continue
|
125 |
return data
|
126 |
|
127 |
+
def _todict(self, layer:collections.OrderedDict):
|
128 |
+
to_ret = layer
|
129 |
+
if isinstance(layer, collections.OrderedDict):
|
130 |
+
to_ret = dict(layer)
|
131 |
+
|
132 |
+
try:
|
133 |
+
for key, value in to_ret.items():
|
134 |
+
to_ret[key] = self._todict(value)
|
135 |
+
except AttributeError:
|
136 |
+
pass
|
137 |
+
|
138 |
+
return self._list_to_kv(to_ret)
|
139 |
+
|
140 |
def _process_document(
|
141 |
self, text: str, prompt_variables: dict[str, str]
|
142 |
) -> str:
|
|
|
147 |
text = perform_variable_replacements(self._mind_map_prompt, variables=variables)
|
148 |
gen_conf = {"temperature": 0.5}
|
149 |
response = self._llm.chat(text, [], gen_conf)
|
150 |
+
response = re.sub(r"```[^\n]*", "", response)
|
151 |
print(response)
|
152 |
+
print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response)))
|
153 |
+
return self._todict(markdown_to_json.dictify(response))
|
graphrag/mind_map_prompt.py
CHANGED
@@ -14,28 +14,20 @@
|
|
14 |
# limitations under the License.
|
15 |
#
|
16 |
MIND_MAP_EXTRACTION_PROMPT = """
|
17 |
-
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
- Output requirement:
|
25 |
-
- In language of
|
26 |
-
- MUST IN FORMAT OF MARKDOWN
|
27 |
-
|
28 |
-
Output:
|
29 |
-
## <Title>
|
30 |
-
<Section Name>
|
31 |
-
<Section Name>
|
32 |
-
<Subsection Name>
|
33 |
-
<Subsection Name>
|
34 |
-
<Section Name>
|
35 |
-
<Subsection Name>
|
36 |
-
|
37 |
-TEXT-
|
38 |
{input_text}
|
39 |
|
40 |
-
Output:
|
41 |
"""
|
|
|
14 |
# limitations under the License.
|
15 |
#
|
16 |
MIND_MAP_EXTRACTION_PROMPT = """
|
17 |
+
- Role: You're a talent text processor to summarize a piece of text into a mind map.
|
18 |
|
19 |
+
- Step of task:
|
20 |
+
1. Generate a title for user's 'TEXT'。
|
21 |
+
2. Classify the 'TEXT' into sections of a mind map.
|
22 |
+
3. If the subject matter is really complex, split them into sub-sections and sub-subsections.
|
23 |
+
4. Add a shot content summary of the bottom level section.
|
24 |
+
|
25 |
+
- Output requirement:
|
26 |
+
- Always try to maximize the number of sub-sections.
|
27 |
+
- In language of 'Text'
|
28 |
+
- MUST IN FORMAT OF MARKDOWN
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
-TEXT-
|
31 |
{input_text}
|
32 |
|
|
|
33 |
"""
|