JobSmithManipulation · Kevin Hu committed
Commit 74bda08 · 1 Parent(s): 016dc40

update document sdk (#2445)

### What problem does this PR solve?

Extends the document SDK: the HTTP API gains `/doc/chunk/set` (update a chunk) and `/doc/retrieval_test` (retrieval across knowledgebases), list/info responses get SDK-friendly key names (`chunk_num` → `chunk_count`, `kb_id` → `knowledgebase_id`, `token_num` → `token_count`, `important_kwd` → `important_keywords`), and the Python SDK gains `Chunk.save()` and `RAGFlow.retrieval()` plus matching tests.
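Taken together, the new SDK surface supports the flow below. A minimal sketch, assuming a running RAGFlow server, placeholder `API_KEY`/`HOST_ADDRESS`, and an already-parsed `story.txt` (the file name mirrors the tests):

```python
from ragflow import RAGFlow

rag = RAGFlow(API_KEY, HOST_ADDRESS)  # placeholder credentials

# Add a chunk, edit it in place, and persist via the new /doc/chunk/set endpoint
doc = rag.get_document(name="story.txt")
chunk = doc.add_chunk(content="RAGFlow is an open-source RAG engine.")
chunk.content = "RAGFlow pairs retrieval with LLM generation."
chunk.save()

# Query chunks across a dataset via the new /doc/retrieval_test endpoint
for c in rag.retrieval(question="What is RAGFlow?", datasets=[doc.knowledgebase_id]):
    print(c.content)
```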
### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>

api/apps/sdk/doc.py CHANGED
```diff
@@ -84,15 +84,28 @@ def upload(dataset_id, tenant_id):
 @token_required
 def docinfos(tenant_id):
     req = request.args
+    if "id" not in req and "name" not in req:
+        return get_data_error_result(
+            retmsg="Id or name should be provided")
+    doc_id = None
     if "id" in req:
         doc_id = req["id"]
-        e, doc = DocumentService.get_by_id(doc_id)
-        return get_json_result(data=doc.to_json())
     if "name" in req:
         doc_name = req["name"]
         doc_id = DocumentService.get_doc_id_by_doc_name(doc_name)
-        e, doc = DocumentService.get_by_id(doc_id)
-        return get_json_result(data=doc.to_json())
+    e, doc = DocumentService.get_by_id(doc_id)
+    # rename keys
+    key_mapping = {
+        "chunk_num": "chunk_count",
+        "kb_id": "knowledgebase_id",
+        "token_num": "token_count",
+    }
+    renamed_doc = {}
+    for key, value in doc.to_dict().items():
+        new_key = key_mapping.get(key, key)
+        renamed_doc[new_key] = value
+
+    return get_json_result(data=renamed_doc)
 
 
 @manager.route('/save', methods=['POST'])
@@ -246,7 +259,7 @@ def rename():
             req["doc_id"], {"name": req["name"]}):
         return get_data_error_result(
             retmsg="Database error (Document rename)!")
 
     informs = File2DocumentService.get_by_document_id(req["doc_id"])
     if informs:
         e, file = FileService.get_by_id(informs[0].file_id)
@@ -259,7 +272,7 @@ def rename():
 
 @manager.route("/<document_id>", methods=["GET"])
 @token_required
-def download_document(dataset_id, document_id):
+def download_document(dataset_id, document_id, tenant_id):
     try:
         # Check whether there is this document
         exist, document = DocumentService.get_by_id(document_id)
@@ -313,7 +326,21 @@ def list_docs(dataset_id, tenant_id):
     try:
         docs, tol = DocumentService.get_by_kb_id(
             kb_id, page_number, items_per_page, orderby, desc, keywords)
-        return get_json_result(data={"total": tol, "docs": docs})
+
+        # rename keys
+        renamed_doc_list = []
+        for doc in docs:
+            key_mapping = {
+                "chunk_num": "chunk_count",
+                "kb_id": "knowledgebase_id",
+                "token_num": "token_count",
+            }
+            renamed_doc = {}
+            for key, value in doc.items():
+                new_key = key_mapping.get(key, key)
+                renamed_doc[new_key] = value
+            renamed_doc_list.append(renamed_doc)
+        return get_json_result(data={"total": tol, "docs": renamed_doc_list})
     except Exception as e:
         return server_error_response(e)
 
@@ -436,6 +463,8 @@ def list_chunk(tenant_id):
             query["available_int"] = int(req["available_int"])
         sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
         res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
+
+        origin_chunks = []
         for id in sres.ids:
             d = {
                 "chunk_id": id,
@@ -455,7 +484,21 @@ def list_chunk(tenant_id):
                 poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
                              float(d["positions"][i + 3]), float(d["positions"][i + 4])])
                 d["positions"] = poss
-            res["chunks"].append(d)
+
+            origin_chunks.append(d)
+        # rename keys
+        for chunk in origin_chunks:
+            key_mapping = {
+                "chunk_id": "id",
+                "content_with_weight": "content",
+                "doc_id": "document_id",
+                "important_kwd": "important_keywords",
+            }
+            renamed_chunk = {}
+            for key, value in chunk.items():
+                new_key = key_mapping.get(key, key)
+                renamed_chunk[new_key] = value
+            res["chunks"].append(renamed_chunk)
         return get_json_result(data=res)
     except Exception as e:
         if str(e).find("not_found") > 0:
@@ -471,8 +514,9 @@ def create(tenant_id):
     req = request.json
     md5 = hashlib.md5()
     md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
-    chunck_id = md5.hexdigest()
-    d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
+
+    chunk_id = md5.hexdigest()
+    d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
          "content_with_weight": req["content_with_weight"]}
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req.get("important_kwd", [])
@@ -503,20 +547,33 @@ def create(tenant_id):
 
         DocumentService.increment_chunk_num(
             doc.id, doc.kb_id, c, 1, 0)
-        return get_json_result(data={"chunk": d})
-        # return get_json_result(data={"chunk_id": chunck_id})
+        d["chunk_id"] = chunk_id
+        # rename keys
+        key_mapping = {
+            "chunk_id": "id",
+            "content_with_weight": "content",
+            "doc_id": "document_id",
+            "important_kwd": "important_keywords",
+            "kb_id": "knowledge_base_id",
+        }
+        renamed_chunk = {}
+        for key, value in d.items():
+            new_key = key_mapping.get(key, key)
+            renamed_chunk[new_key] = value
+
+        return get_json_result(data={"chunk": renamed_chunk})
+        # return get_json_result(data={"chunk_id": chunk_id})
     except Exception as e:
         return server_error_response(e)
 
-
 @manager.route('/chunk/rm', methods=['POST'])
 @token_required
 @validate_request("chunk_ids", "doc_id")
-def rm_chunk():
+def rm_chunk(tenant_id):
     req = request.json
     try:
         if not ELASTICSEARCH.deleteByQuery(
-                Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)):
+                Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
             return get_data_error_result(retmsg="Index updating failure")
         e, doc = DocumentService.get_by_id(req["doc_id"])
         if not e:
@@ -526,4 +583,126 @@ def rm_chunk():
         DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
         return get_json_result(data=True)
     except Exception as e:
+        return server_error_response(e)
+
+@manager.route('/chunk/set', methods=['POST'])
+@token_required
+@validate_request("doc_id", "chunk_id", "content_with_weight",
+                  "important_kwd")
+def set(tenant_id):
+    req = request.json
+    d = {
+        "id": req["chunk_id"],
+        "content_with_weight": req["content_with_weight"]}
+    d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
+    d["important_kwd"] = req["important_kwd"]
+    d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
+    if "available_int" in req:
+        d["available_int"] = req["available_int"]
+
+    try:
+        tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+        if not tenant_id:
+            return get_data_error_result(retmsg="Tenant not found!")
+
+        embd_id = DocumentService.get_embd_id(req["doc_id"])
+        embd_mdl = TenantLLMService.model_instance(
+            tenant_id, LLMType.EMBEDDING.value, embd_id)
+
+        e, doc = DocumentService.get_by_id(req["doc_id"])
+        if not e:
+            return get_data_error_result(retmsg="Document not found!")
+
+        if doc.parser_id == ParserType.QA:
+            arr = [
+                t for t in re.split(
+                    r"[\n\t]",
+                    req["content_with_weight"]) if len(t) > 1]
+            if len(arr) != 2:
+                return get_data_error_result(
+                    retmsg="Q&A must be separated by TAB/ENTER key.")
+            q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
+            d = beAdoc(d, arr[0], arr[1], not any(
+                [rag_tokenizer.is_chinese(t) for t in q + a]))
+
+        v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+        v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
+        d["q_%d_vec" % len(v)] = v.tolist()
+        ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
+        return get_json_result(data=True)
+    except Exception as e:
+        return server_error_response(e)
+
+@manager.route('/retrieval_test', methods=['POST'])
+@token_required
+@validate_request("kb_id", "question")
+def retrieval_test(tenant_id):
+    req = request.json
+    page = int(req.get("page", 1))
+    size = int(req.get("size", 30))
+    question = req["question"]
+    kb_id = req["kb_id"]
+    if isinstance(kb_id, str): kb_id = [kb_id]
+    doc_ids = req.get("doc_ids", [])
+    similarity_threshold = float(req.get("similarity_threshold", 0.2))
+    vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
+    top = int(req.get("top_k", 1024))
+
+    try:
+        tenants = UserTenantService.query(user_id=tenant_id)
+        for kid in kb_id:
+            for tenant in tenants:
+                if KnowledgebaseService.query(
+                        tenant_id=tenant.tenant_id, id=kid):
+                    break
+            else:
+                return get_json_result(
+                    data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.',
+                    retcode=RetCode.OPERATING_ERROR)
+
+        e, kb = KnowledgebaseService.get_by_id(kb_id[0])
+        if not e:
+            return get_data_error_result(retmsg="Knowledgebase not found!")
+
+        embd_mdl = TenantLLMService.model_instance(
+            kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
+
+        rerank_mdl = None
+        if req.get("rerank_id"):
+            rerank_mdl = TenantLLMService.model_instance(
+                kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"])
+
+        if req.get("keyword", False):
+            chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
+            question += keyword_extraction(chat_mdl, question)
+
+        retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
+        ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_id, page, size,
+                               similarity_threshold, vector_similarity_weight, top,
+                               doc_ids, rerank_mdl=rerank_mdl, highlight=req.get("highlight"))
+        for c in ranks["chunks"]:
+            if "vector" in c:
+                del c["vector"]
+
+        # rename keys
+        renamed_chunks = []
+        for chunk in ranks["chunks"]:
+            key_mapping = {
+                "chunk_id": "id",
+                "content_with_weight": "content",
+                "doc_id": "document_id",
+                "important_kwd": "important_keywords",
+            }
+            rename_chunk = {}
+            for key, value in chunk.items():
+                new_key = key_mapping.get(key, key)
+                rename_chunk[new_key] = value
+            renamed_chunks.append(rename_chunk)
+        ranks["chunks"] = renamed_chunks
+        return get_json_result(data=ranks)
+    except Exception as e:
+        if str(e).find("not_found") > 0:
+            return get_json_result(data=False, retmsg=f'No chunk found! Check the chunk status please!',
+                                   retcode=RetCode.DATA_ERROR)
     return server_error_response(e)
```
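The new routes can also be exercised directly over HTTP. A sketch with `requests`; the `/api/v1` prefix and bearer-token header are assumptions based on the SDK's `version='v1'` default, not confirmed by this diff:

```python
import requests

BASE = "http://<host>/api/v1"                       # assumed URL prefix
HEADERS = {"Authorization": "Bearer <API_KEY>"}     # assumed auth for @token_required

# Update a chunk via the new /doc/chunk/set route
payload = {
    "doc_id": "<doc_id>",
    "chunk_id": "<chunk_id>",
    "content_with_weight": "updated chunk text",
    "important_kwd": ["ragflow"],
}
res = requests.post(f"{BASE}/doc/chunk/set", json=payload, headers=HEADERS)
print(res.json())  # the SDK treats retmsg == "success" as OK
```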
sdk/python/ragflow/modules/chunk.py CHANGED
```diff
@@ -3,32 +3,48 @@ from .base import Base
 
 class Chunk(Base):
     def __init__(self, rag, res_dict):
-        # initialize class attributes
         self.id = ""
-        self.content_with_weight = ""
-        self.content_ltks = []
-        self.content_sm_ltks = []
-        self.important_kwd = []
-        self.important_tks = []
+        self.content = ""
+        self.important_keywords = []
         self.create_time = ""
         self.create_timestamp_flt = 0.0
-        self.kb_id = None
-        self.docnm_kwd = ""
-        self.doc_id = ""
-        self.q_vec = []
+        self.knowledgebase_id = None
+        self.document_name = ""
+        self.document_id = ""
         self.status = "1"
-        for k, v in res_dict.items():
-            if hasattr(self, k):
-                setattr(self, k, v)
-
+        for k in list(res_dict.keys()):
+            if k not in self.__dict__:
+                res_dict.pop(k)
         super().__init__(rag, res_dict)
+
     def delete(self) -> bool:
         """
         Delete the chunk in the document.
         """
-        res = self.rm('/doc/chunk/rm',
-                      {"doc_id": [self.id],""})
+        res = self.post('/doc/chunk/rm',
+                        {"doc_id": self.document_id, 'chunk_ids': [self.id]})
         res = res.json()
         if res.get("retmsg") == "success":
             return True
         raise Exception(res["retmsg"])
+
+    def save(self) -> bool:
+        """
+        Save the document details to the server.
+        """
+        res = self.post('/doc/chunk/set',
+                        {"chunk_id": self.id,
+                         "kb_id": self.knowledgebase_id,
+                         "name": self.document_name,
+                         "content_with_weight": self.content,
+                         "important_kwd": self.important_keywords,
+                         "create_time": self.create_time,
+                         "create_timestamp_flt": self.create_timestamp_flt,
+                         "doc_id": self.document_id,
+                         "status": self.status,
+                         })
+        res = res.json()
+        if res.get("retmsg") == "success":
+            return True
+        raise Exception(res["retmsg"])
```
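Note the rewritten `__init__`: keys in `res_dict` that are not declared attributes are dropped before `Base.__init__` copies the rest, so server-side extras (e.g. vectors) never reach the object. A quick illustration with a hypothetical payload, assuming `Base.__init__` only assigns the surviving keys:

```python
payload = {
    "id": "abc123",
    "content": "some text",
    "important_keywords": ["rag"],
    "q_1024_vec": [0.1, 0.2],        # not a declared attribute -> filtered out
}
chunk = Chunk(rag, payload)          # 'rag' is an existing RAGFlow client
print(chunk.content)                 # "some text"
print(hasattr(chunk, "q_1024_vec"))  # False
```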
sdk/python/ragflow/modules/document.py CHANGED
```diff
@@ -9,15 +9,15 @@ class Document(Base):
         self.id = ""
         self.name = ""
         self.thumbnail = None
-        self.kb_id = None
+        self.knowledgebase_id = None
         self.parser_method = ""
         self.parser_config = {"pages": [[1, 1000000]]}
         self.source_type = "local"
         self.type = ""
         self.created_by = ""
         self.size = 0
-        self.token_num = 0
-        self.chunk_num = 0
+        self.token_count = 0
+        self.chunk_count = 0
         self.progress = 0.0
         self.progress_msg = ""
         self.process_begin_at = None
@@ -34,10 +34,10 @@ class Document(Base):
         Save the document details to the server.
         """
         res = self.post('/doc/save',
-                        {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "kb_id": self.kb_id,
+                        {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "kb_id": self.knowledgebase_id,
                          "parser_id": self.parser_method, "parser_config": self.parser_config.to_json(),
                          "source_type": self.source_type, "type": self.type, "created_by": self.created_by,
-                         "size": self.size, "token_num": self.token_num, "chunk_num": self.chunk_num,
+                         "size": self.size, "token_num": self.token_count, "chunk_num": self.chunk_count,
                          "progress": self.progress, "progress_msg": self.progress_msg,
                          "process_begin_at": self.process_begin_at, "process_duation": self.process_duration
                          })
@@ -177,8 +177,10 @@ class Document(Base):
         if res.status_code == 200:
             res_data = res.json()
             if res_data.get("retmsg") == "success":
-                chunks = res_data["data"]["chunks"]
-                self.chunks = chunks  # Store the chunks in the document instance
+                chunks = []
+                for chunk_data in res_data["data"].get("chunks", []):
+                    chunk = Chunk(self.rag, chunk_data)
+                    chunks.append(chunk)
                 return chunks
             else:
                 raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
@@ -187,10 +189,9 @@ class Document(Base):
 
     def add_chunk(self, content: str):
         res = self.post('/doc/chunk/create', {"doc_id": self.id, "content_with_weight": content})
-
-        # assume the returned response contains the chunk info
         if res.status_code == 200:
-            chunk_data = res.json()
-            return Chunk(self.rag, chunk_data)  # assume a Chunk class exists to handle the chunk object
+            res_data = res.json().get("data")
+            chunk_data = res_data.get("chunk")
+            return Chunk(self.rag, chunk_data)
         else:
             raise Exception(f"Failed to add chunk: {res.status_code} {res.text}")
```
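On the `Document` side, the renamed attributes and the `Chunk`-wrapping are what callers now see. A short sketch, using the same placeholder client as above and a document assumed to be parsed:

```python
doc = rag.get_document(name="story.txt")
print(doc.chunk_count, doc.token_count)   # renamed from chunk_num / token_num

new_chunk = doc.add_chunk(content="hello world")  # built from data["chunk"] in the response
print(new_chunk.id, new_chunk.content)
```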
sdk/python/ragflow/ragflow.py CHANGED
```diff
@@ -20,6 +20,8 @@ import requests
 from .modules.assistant import Assistant
 from .modules.dataset import DataSet
 from .modules.document import Document
+from .modules.chunk import Chunk
+
 
 class RAGFlow:
     def __init__(self, user_key, base_url, version='v1'):
@@ -143,7 +145,7 @@ class RAGFlow:
             return result_list
         raise Exception(res["retmsg"])
 
-    def create_document(self, ds:DataSet, name: str, blob: bytes) -> bool:
+    def create_document(self, ds: DataSet, name: str, blob: bytes) -> bool:
         url = f"/doc/dataset/{ds.id}/documents/upload"
         files = {
             'file': (name, blob)
@@ -164,6 +166,7 @@ class RAGFlow:
             raise Exception(f"Upload failed: {response.json().get('retmsg')}")
 
         return False
+
     def get_document(self, id: str = None, name: str = None) -> Document:
         res = self.get("/doc/infos", {"id": id, "name": name})
         res = res.json()
@@ -204,8 +207,6 @@ class RAGFlow:
         if not doc_ids or not isinstance(doc_ids, list):
             raise ValueError("doc_ids must be a non-empty list of document IDs")
         data = {"doc_ids": doc_ids, "run": 2}
-
-
         res = self.post(f'/doc/run', data)
 
         if res.status_code != 200:
@@ -217,4 +218,61 @@ class RAGFlow:
             print(f"Error occurred during canceling parsing for documents: {str(e)}")
             raise
 
+    def retrieval(self,
+                  question,
+                  datasets=None,
+                  documents=None,
+                  offset=0,
+                  limit=6,
+                  similarity_threshold=0.1,
+                  vector_similarity_weight=0.3,
+                  top_k=1024):
+        """
+        Perform document retrieval based on the given parameters.
+
+        :param question: The query question.
+        :param datasets: A list of datasets (optional, as documents may be provided directly).
+        :param documents: A list of documents (if specific documents are provided).
+        :param offset: Offset for the retrieval results.
+        :param limit: Maximum number of retrieval results.
+        :param similarity_threshold: Similarity threshold.
+        :param vector_similarity_weight: Weight of vector similarity.
+        :param top_k: Number of top most similar documents to consider (for pre-filtering or ranking).
+
+        Note: This is a hypothetical implementation and may need adjustments based on the actual backend service API.
+        """
+        try:
+            data = {
+                "question": question,
+                "datasets": datasets if datasets is not None else [],
+                "documents": [doc.id if hasattr(doc, 'id') else doc for doc in
+                              documents] if documents is not None else [],
+                "offset": offset,
+                "limit": limit,
+                "similarity_threshold": similarity_threshold,
+                "vector_similarity_weight": vector_similarity_weight,
+                "top_k": top_k,
+                "kb_id": datasets,
+            }
+
+            # Send a POST request to the backend service
+            res = self.post(f'/doc/retrieval_test', data)
+
+            # Check the response status code
+            if res.status_code == 200:
+                res_data = res.json()
+                if res_data.get("retmsg") == "success":
+                    chunks = []
+                    for chunk_data in res_data["data"].get("chunks", []):
+                        chunk = Chunk(self, chunk_data)
+                        chunks.append(chunk)
+                    return chunks
+                else:
+                    raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
+            else:
+                raise Exception(f"API request failed with status code {res.status_code}")
+
+        except Exception as e:
+            print(f"An error occurred during retrieval: {e}")
+            raise
 
```
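One wrinkle worth noting when calling `retrieval()`: the client sends `offset`/`limit`, while the `retrieval_test` handler above reads `page`/`size`, so the server-side defaults (page 1, size 30) appear to apply unless the backend maps the names. Typical usage, with `Document` objects reduced to ids automatically:

```python
rag = RAGFlow(API_KEY, HOST_ADDRESS)
chunks = rag.retrieval(question="What's ragflow?",
                       datasets=[ds.id],          # forwarded as both "datasets" and "kb_id"
                       documents=[doc],           # Document objects or raw id strings
                       similarity_threshold=0.1,
                       vector_similarity_weight=0.3,
                       top_k=1024)
for c in chunks:                                  # Chunk objects, vectors already stripped
    print(c.document_id, c.content)
```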
sdk/python/test/t_document.py CHANGED
```diff
@@ -41,6 +41,7 @@ class TestDocument(TestSdk):
     def test_update_document_with_success(self):
         """
         Test updating a document with success.
+        Updating name or parser_method is supported.
         """
         rag = RAGFlow(API_KEY, HOST_ADDRESS)
         doc = rag.get_document(name="TestDocument.txt")
@@ -60,7 +61,7 @@ class TestDocument(TestSdk):
         rag = RAGFlow(API_KEY, HOST_ADDRESS)
 
         # Retrieve a document
-        doc = rag.get_document(name="TestDocument.txt")
+        doc = rag.get_document(name="manual.txt")
 
         # Check if the retrieved document is of type Document
         if isinstance(doc, Document):
@@ -147,14 +148,16 @@ class TestDocument(TestSdk):
         ds = rag.create_dataset(name="God4")
 
         # Define the document name and path
-        name3 = 'ai.pdf'
-        path = 'test_data/ai.pdf'
+        name3 = 'westworld.pdf'
+        path = 'test_data/westworld.pdf'
+
 
         # Create a document in the dataset using the file path
         rag.create_document(ds, name=name3, blob=open(path, "rb").read())
 
         # Retrieve the document by name
-        doc = rag.get_document(name="ai.pdf")
+        doc = rag.get_document(name="westworld.pdf")
+
 
         # Initiate asynchronous parsing
         doc.async_parse()
@@ -185,9 +188,9 @@ class TestDocument(TestSdk):
 
         # Prepare a list of file names and paths
         documents = [
-            {'name': 'ai1.pdf', 'path': 'test_data/ai1.pdf'},
-            {'name': 'ai2.pdf', 'path': 'test_data/ai2.pdf'},
-            {'name': 'ai3.pdf', 'path': 'test_data/ai3.pdf'}
+            {'name': 'test1.txt', 'path': 'test_data/test1.txt'},
+            {'name': 'test2.txt', 'path': 'test_data/test2.txt'},
+            {'name': 'test3.txt', 'path': 'test_data/test3.txt'}
         ]
 
         # Create documents in bulk
@@ -248,6 +251,7 @@ class TestDocument(TestSdk):
             print(c)
             assert c is not None, "Chunk is None"
             assert "rag" in c['content_with_weight'].lower(), f"Keyword 'rag' not found in chunk content: {c.content}"
+
     def test_add_chunk_to_chunk_list(self):
         rag = RAGFlow(API_KEY, HOST_ADDRESS)
         doc = rag.get_document(name='story.txt')
@@ -258,12 +262,44 @@ class TestDocument(TestSdk):
     def test_delete_chunk_of_chunk_list(self):
         rag = RAGFlow(API_KEY, HOST_ADDRESS)
         doc = rag.get_document(name='story.txt')
-
         chunk = doc.add_chunk(content="assss")
         assert chunk is not None, "Chunk is None"
         assert isinstance(chunk, Chunk), "Chunk was not added to chunk list"
-        chunk_num_before = doc.chunk_num
+        doc = rag.get_document(name='story.txt')
+        chunk_count_before = doc.chunk_count
         chunk.delete()
-        assert doc.chunk_num == chunk_num_before - 1, "Chunk was not deleted"
-
-
+        doc = rag.get_document(name='story.txt')
+        assert doc.chunk_count == chunk_count_before - 1, "Chunk was not deleted"
+
+    def test_update_chunk_content(self):
+        rag = RAGFlow(API_KEY, HOST_ADDRESS)
+        doc = rag.get_document(name='story.txt')
+        chunk = doc.add_chunk(content="assssd")
+        assert chunk is not None, "Chunk is None"
+        assert isinstance(chunk, Chunk), "Chunk was not added to chunk list"
+        chunk.content = "ragflow123"
+        res = chunk.save()
+        assert res is True, f"Failed to update chunk, error: {res}"
+
+    def test_retrieval_chunks(self):
+        rag = RAGFlow(API_KEY, HOST_ADDRESS)
+        ds = rag.create_dataset(name="God8")
+        name = 'ragflow_test.txt'
+        path = 'test_data/ragflow_test.txt'
+        rag.create_document(ds, name=name, blob=open(path, "rb").read())
+        doc = rag.get_document(name=name)
+        doc.async_parse()
+        # Wait for parsing to complete and get progress updates using join
+        for progress, msg in doc.join(interval=5, timeout=30):
+            print(progress, msg)
+            assert 0 <= progress <= 100, f"Invalid progress: {progress}"
+            assert msg, "Message should not be empty"
+        for c in rag.retrieval(question="What's ragflow?",
+                               datasets=[ds.id], documents=[doc],
+                               offset=0, limit=6, similarity_threshold=0.1,
+                               vector_similarity_weight=0.3,
+                               top_k=1024
+                               ):
+            print(c)
+            assert c is not None, "Chunk is None"
+            assert "ragflow" in c.content.lower(), f"Keyword 'ragflow' not found in chunk content: {c.content}"
```