Kevin Hu committed
Commit b6ce919 · Parent: c530565

refine mindmap (#1817)


### What problem does this PR solve?

#1594
### Type of change

- [x] Refactoring

graphrag/index.py CHANGED
```diff
@@ -21,6 +21,7 @@ from typing import List
 import networkx as nx
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
+from api.db.services.user_service import TenantService
 from graphrag.community_reports_extractor import CommunityReportsExtractor
 from graphrag.entity_resolution import EntityResolution
 from graphrag.graph_extractor import GraphExtractor
@@ -30,6 +31,11 @@ from rag.utils import num_tokens_from_string
 
 
 def be_children(obj: dict, keyset:set):
+    if isinstance(obj, str):
+        obj = [obj]
+    if isinstance(obj, list):
+        for i in obj: keyset.add(i)
+        return [{"id": i, "children":[]} for i in obj]
     arr = []
     for k,v in obj.items():
         k = re.sub(r"\*+", "", k)
@@ -65,7 +71,8 @@ def graph_merge(g1, g2):
 
 
 def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]):
-    llm_bdl = LLMBundle(tenant_id, LLMType.CHAT)
+    _, tenant = TenantService.get_by_id(tenant_id)
+    llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id)
     ext = GraphExtractor(llm_bdl)
     left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
     left_token_count = max(llm_bdl.max_length * 0.8, left_token_count)
```
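
For readers following the `be_children` change: the function turns the nested dict parsed from the model's markdown into mind-map nodes, and the added branch covers leaves that arrive as a bare string or a list of strings rather than a dict. Below is a minimal standalone sketch of that behavior; the recursive body past the hunk is reconstructed from context and may differ from the repository's actual code:

```python
import re

def be_children(obj, keyset: set):
    # New branch: a leaf may be a bare string or a list of strings;
    # normalize both into child nodes with empty children lists.
    if isinstance(obj, str):
        obj = [obj]
    if isinstance(obj, list):
        for i in obj:
            keyset.add(i)
        return [{"id": i, "children": []} for i in obj]
    # Dict case: each key becomes a node whose children are built recursively
    # (reconstructed; only the first lines of this branch appear in the hunk).
    arr = []
    for k, v in obj.items():
        k = re.sub(r"\*+", "", k)  # drop markdown bold markers from keys
        keyset.add(k)
        arr.append({"id": k, "children": be_children(v, keyset)})
    return arr

seen = set()
print(be_children({"Title": {"Section A": ["point 1", "point 2"]}}, seen))
# -> [{'id': 'Title', 'children': [{'id': 'Section A', 'children':
#      [{'id': 'point 1', 'children': []}, {'id': 'point 2', 'children': []}]}]}]
```

The other change here makes the graph builder look up the tenant record via `TenantService` so that `LLMBundle` is constructed with the tenant's configured chat model (`tenant.llm_id`) instead of an unspecified default.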
graphrag/mind_map_extractor.py CHANGED
```diff
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+import collections
+import logging
+import re
 import logging
 import traceback
 from concurrent.futures import ThreadPoolExecutor
@@ -65,7 +67,7 @@ class MindMapExtractor:
         try:
             exe = ThreadPoolExecutor(max_workers=12)
             threads = []
-            token_count = self._llm.max_length * 0.7
+            token_count = max(self._llm.max_length * 0.8, self._llm.max_length-512)
             texts = []
             res = []
             cnt = 0
@@ -122,6 +124,19 @@
                 continue
         return data
 
+    def _todict(self, layer:collections.OrderedDict):
+        to_ret = layer
+        if isinstance(layer, collections.OrderedDict):
+            to_ret = dict(layer)
+
+        try:
+            for key, value in to_ret.items():
+                to_ret[key] = self._todict(value)
+        except AttributeError:
+            pass
+
+        return self._list_to_kv(to_ret)
+
     def _process_document(
         self, text: str, prompt_variables: dict[str, str]
     ) -> str:
@@ -132,6 +147,7 @@
         text = perform_variable_replacements(self._mind_map_prompt, variables=variables)
         gen_conf = {"temperature": 0.5}
         response = self._llm.chat(text, [], gen_conf)
+        response = re.sub(r"```[^\n]*", "", response)
         print(response)
-        print("---------------------------------------------------\n", markdown_to_json.dictify(response))
-        return dict(markdown_to_json.dictify(response))
+        print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response)))
+        return self._todict(markdown_to_json.dictify(response))
```
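
Two details in this diff are worth spelling out. First, the token budget: for a model with `max_length = 8192`, the old cap was `8192 * 0.7 ≈ 5734` tokens per batch, while the new `max(8192 * 0.8, 8192 - 512) = 7680` leaves a fixed 512-token headroom on large-context models. Second, `markdown_to_json.dictify` returns nested `collections.OrderedDict` objects, which the new `_todict` converts into plain dicts before handing off to `_list_to_kv` (not shown in this hunk). A standalone sketch of the conversion, with `_list_to_kv` replaced by an identity pass-through:

```python
import collections

def todict(layer):
    # Convert an OrderedDict (as produced by markdown_to_json.dictify)
    # into a plain dict, recursing into mapping values.
    to_ret = layer
    if isinstance(layer, collections.OrderedDict):
        to_ret = dict(layer)
    try:
        for key, value in to_ret.items():
            to_ret[key] = todict(value)
    except AttributeError:
        # Strings, lists, and numbers have no .items(); leave them as-is.
        pass
    return to_ret  # the real method returns self._list_to_kv(to_ret) here

md = collections.OrderedDict(
    [("Title", collections.OrderedDict([("Section", ["a", "b"])]))]
)
print(todict(md))  # {'Title': {'Section': ['a', 'b']}}
```

The added `re.sub` call strips markdown code-fence markers from the model's reply before parsing, since chat models often wrap their markdown output in fences that `markdown_to_json` would otherwise treat as content.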
graphrag/mind_map_prompt.py CHANGED
```diff
@@ -14,28 +14,20 @@
 # limitations under the License.
 #
 MIND_MAP_EXTRACTION_PROMPT = """
-Role: You're a talent text processor.
+- Role: You're a talent text processor to summarize a piece of text into a mind map.
 
-Step of task:
-1. Generate a title for user's 'TEXT'。
-2. Classify the 'TEXT' into sections as you see fit.
-3. If the subject matter is really complex, split them into sub-sections.
+- Step of task:
+  1. Generate a title for user's 'TEXT'。
+  2. Classify the 'TEXT' into sections of a mind map.
+  3. If the subject matter is really complex, split them into sub-sections and sub-subsections.
+  4. Add a shot content summary of the bottom level section.
+
+- Output requirement:
+  - Always try to maximize the number of sub-sections.
+  - In language of 'Text'
+  - MUST IN FORMAT OF MARKDOWN
 
-Output requirement:
-In language of
-MUST IN FORMAT OF MARKDOWN
-
-Output:
-## <Title>
-<Section Name>
-<Section Name>
-<Subsection Name>
-<Subsection Name>
-<Section Name>
-<Subsection Name>
-
 -TEXT-
 {input_text}
 
-Output:
 """
```