Kevin Hu committed
Commit 9cfd69b · 1 Parent(s): 2f2501f

Code refactor. (#4291)


### What problem does this PR solve?

### Type of change

- [x] Refactoring

agent/component/answer.py CHANGED
@@ -16,6 +16,7 @@
 import random
 from abc import ABC
 from functools import partial
+from typing import Tuple, Union
 
 import pandas as pd
 
@@ -76,4 +77,13 @@ class Answer(ComponentBase, ABC):
     def set_exception(self, e):
         self.exception = e
 
+    def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]:
+        if allow_partial:
+            return super.output()
+
+        for r, c in self._canvas.history[::-1]:
+            if r == "user":
+                return self._param.output_var_name, pd.DataFrame([{"content": c}])
+
+        self._param.output_var_name, pd.DataFrame([])
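For context, a minimal sketch of the fallback behaviour the new `output` override adds: when partial output is not allowed, the last user turn in the canvas history is surfaced as a one-row DataFrame. The helper and sample history below are illustrative, not part of the component API:

```python
import pandas as pd

def last_user_turn(history, output_var_name="output"):
    # Walk the history backwards and return the most recent user message
    # as a one-row DataFrame, mirroring the non-partial branch of Answer.output().
    for role, content in reversed(history):
        if role == "user":
            return output_var_name, pd.DataFrame([{"content": content}])
    # No user turn yet: fall back to an empty frame.
    return output_var_name, pd.DataFrame([])

history = [("user", "hi"), ("assistant", "hello"), ("user", "summarize the report")]
name, df = last_user_turn(history)
print(name, df.iloc[0]["content"])  # output summarize the report
```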
api/apps/canvas_app.py CHANGED
@@ -146,12 +146,16 @@ def run():
 
             canvas.messages.append({"role": "assistant", "content": final_ans["content"], "id": message_id})
             canvas.history.append(("assistant", final_ans["content"]))
+            if not canvas.path[-1]:
+                canvas.path.pop(-1)
             if final_ans.get("reference"):
                 canvas.reference.append(final_ans["reference"])
             cvs.dsl = json.loads(str(canvas))
             UserCanvasService.update_by_id(req["id"], cvs.to_dict())
         except Exception as e:
             cvs.dsl = json.loads(str(canvas))
+            if not canvas.path[-1]:
+                canvas.path.pop(-1)
             UserCanvasService.update_by_id(req["id"], cvs.to_dict())
             traceback.print_exc()
             yield "data:" + json.dumps({"code": 500, "message": str(e),
api/apps/dialog_app.py CHANGED
@@ -103,10 +103,7 @@ def set_dialog():
            }
            if not DialogService.save(**dia):
                return get_data_error_result(message="Fail to new a dialog!")
-           e, dia = DialogService.get_by_id(dia["id"])
-           if not e:
-               return get_data_error_result(message="Fail to new a dialog!")
-           return get_json_result(data=dia.to_json())
+           return get_json_result(data=dia)
        else:
            del req["dialog_id"]
            if "kb_names" in req:
@@ -117,6 +114,7 @@ def set_dialog():
            if not e:
                return get_data_error_result(message="Fail to update a dialog!")
            dia = dia.to_dict()
+           dia.update(req)
            dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"])
            return get_json_result(data=dia)
    except Exception as e:
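The added `dia.update(req)` simply folds the request payload back into the dict that gets returned, so edited fields show up in the response. A quick illustration with throwaway data:

```python
# Stored record (illustrative) and the incoming update request.
dia = {"id": "d1", "name": "old name", "kb_ids": ["kb1"]}
req = {"name": "new name", "top_n": 6}

dia.update(req)  # request fields overwrite or extend the stored record
print(dia)       # {'id': 'd1', 'name': 'new name', 'kb_ids': ['kb1'], 'top_n': 6}
```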
api/apps/kb_app.py CHANGED
@@ -185,7 +185,8 @@ def rm():
                return get_data_error_result(
                    message="Database error (Document removal)!")
            f2d = File2DocumentService.get_by_document_id(doc.id)
-           FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+           if f2d:
+               FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
            File2DocumentService.delete_by_document_id(doc.id)
        FileService.filter_delete(
            [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
api/utils/api_utils.py CHANGED
@@ -120,6 +120,10 @@ def server_error_response(e):
     if len(e.args) > 1:
         return get_json_result(
             code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
+    if repr(e).find("index_not_found_exception") >= 0:
+        return get_json_result(code=settings.RetCode.EXCEPTION_ERROR,
+                               message="No chunk found, please upload file and parse it.")
+
     return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))
 
 
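A small sketch of what the new branch does: any exception whose repr mentions `index_not_found_exception` is mapped to a friendlier message instead of the raw error. The exception class below is a stand-in, not the real search-backend error:

```python
class FakeSearchError(Exception):
    """Stand-in for the backend exception raised when the index is missing."""

def friendly_message(e: Exception) -> str:
    # Same substring test as server_error_response.
    if repr(e).find("index_not_found_exception") >= 0:
        return "No chunk found, please upload file and parse it."
    return repr(e)

err = FakeSearchError("index_not_found_exception")
print(friendly_message(err))  # No chunk found, please upload file and parse it.
```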
graphrag/graph_prompt.py CHANGED
@@ -11,20 +11,20 @@ Given a text document that is potentially relevant to this activity and a list o
 
 -Steps-
 1. Identify all entities. For each identified entity, extract the following information:
-- entity_name: Name of the entity, capitalized
+- entity_name: Name of the entity, capitalized, in language of 'Text'
 - entity_type: One of the following types: [{entity_types}]
-- entity_description: Comprehensive description of the entity's attributes and activities
+- entity_description: Comprehensive description of the entity's attributes and activities in language of 'Text'
 Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
 
 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
 For each pair of related entities, extract the following information:
 - source_entity: name of the source entity, as identified in step 1
 - target_entity: name of the target entity, as identified in step 1
-- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other in language of 'Text'
 - relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
 Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)
 
-3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+3. Return output as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
 
 4. When finished, output {completion_delimiter}
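Downstream code splits the model output on the two delimiters named in the prompt. A rough parsing sketch, assuming `{tuple_delimiter}` renders as `<|>` and `{record_delimiter}` as `##`; those values and the helper are illustrative, not taken from this diff:

```python
def parse_records(text, tuple_delimiter="<|>", record_delimiter="##"):
    # Split the output into records, then each record into delimited fields.
    entities, relations = [], []
    for rec in text.split(record_delimiter):
        fields = [f.strip().strip('"') for f in rec.strip().strip("()").split(tuple_delimiter)]
        if fields[0] == "entity" and len(fields) >= 4:
            entities.append({"name": fields[1], "type": fields[2], "description": fields[3]})
        elif fields[0] == "relationship" and len(fields) >= 5:
            relations.append({"source": fields[1], "target": fields[2],
                              "description": fields[3], "strength": fields[4]})
    return entities, relations

sample = ('("entity"<|>ACME<|>organization<|>A fictional company)##'
          '("relationship"<|>ACME<|>BOB<|>Bob works at ACME<|>8)')
ents, rels = parse_records(sample)
print(ents[0]["name"], rels[0]["strength"])  # ACME 8
```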
graphrag/utils.py CHANGED
@@ -81,7 +81,7 @@ def get_llm_cache(llmnm, txt, history, genconf):
     return bin
 
 
-def set_llm_cache(llmnm, txt, v: str, history, genconf):
+def set_llm_cache(llmnm, txt, v, history, genconf):
     hasher = xxhash.xxh64()
     hasher.update(str(llmnm).encode("utf-8"))
     hasher.update(str(txt).encode("utf-8"))
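For reference, a minimal sketch of the xxhash-keyed caching scheme `set_llm_cache` belongs to, with the storage simplified to a dict (an assumption; the real backend is not shown in this diff). Dropping the `v: str` annotation lets structured values be cached as well:

```python
import xxhash

_cache = {}  # stand-in for the real storage backend

def _cache_key(llmnm, txt, history, genconf) -> str:
    # Hash every ingredient of the request so different prompts, histories
    # or generation settings never share a cache slot.
    hasher = xxhash.xxh64()
    for part in (llmnm, txt, history, genconf):
        hasher.update(str(part).encode("utf-8"))
    return hasher.hexdigest()

def set_llm_cache(llmnm, txt, v, history, genconf):
    _cache[_cache_key(llmnm, txt, history, genconf)] = v

def get_llm_cache(llmnm, txt, history, genconf):
    return _cache.get(_cache_key(llmnm, txt, history, genconf))

set_llm_cache("some-model", "hello", {"answer": "hi"}, [], {"temperature": 0.1})
print(get_llm_cache("some-model", "hello", [], {"temperature": 0.1}))  # {'answer': 'hi'}
```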
rag/app/laws.py CHANGED
@@ -153,11 +153,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
-       for txt in Docx()(filename, binary):
-           sections.append(txt)
-       callback(0.8, "Finish parsing.")
-       chunks = sections
-       return tokenize_chunks(chunks, doc, eng, pdf_parser)
+       chunks = Docx()(filename, binary)
+       callback(0.7, "Finish parsing.")
+       return tokenize_chunks(chunks, doc, eng, None)
 
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf() if kwargs.get(
rag/app/manual.py CHANGED
@@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
        # set pivot using the most frequent type of title,
        # then merge between 2 pivot
-       if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+       if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
            max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
            most_level = max(0, max_lvl - 1)
            levels = []
@@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
        return res
 
-   if re.search(r"\.docx$", filename, re.IGNORECASE):
+   elif re.search(r"\.docx$", filename, re.IGNORECASE):
        docx_parser = Docx()
        ti_list, tbls = docx_parser(filename, binary,
                                    from_page=0, to_page=10000, callback=callback)
rag/app/table.py CHANGED
@@ -185,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
              "datetime": "_dt",
              "bool": "_kwd"}
    for df in dfs:
-       for n in ["id", "index", "idx"]:
+       for n in ["id", "_id", "index", "idx"]:
            if n in df.columns:
                del df[n]
        clmns = df.columns.values