cecilia-uu committed
Commit · 6674a75
Parent(s): 970e973

API: start parsing (#1377)
### What problem does this PR solve?

Make the document start parsing.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- api/apps/dataset_api.py +169 -18
- sdk/python/ragflow/ragflow.py +12 -0
- sdk/python/test/test_data/lol.txt +3 -0
- sdk/python/test/test_document.py +254 -0
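
For context, here is roughly how the endpoints added by this PR are driven from the Python SDK (a minimal sketch based on the SDK methods and tests in this diff; the API key, host, and import path are placeholders):

```python
from ragflow.ragflow import RAGFlow  # import path may vary with your install layout

# Placeholder credentials and host.
ragflow = RAGFlow("YOUR_API_KEY", "http://127.0.0.1:9380")

# Pre-existing SDK calls: create a dataset and upload a file into it.
created_res = ragflow.create_dataset("parsing_demo")
dataset_id = created_res["data"]["dataset_id"]
uploading_res = ragflow.upload_local_file(dataset_id, ["test_data/lol.txt"])
doc_id = uploading_res["data"][0]["id"]

# New in this PR: start parsing a single document ...
res = ragflow.start_parsing_document(dataset_id, doc_id)
print(res["code"], res["message"])

# ... or a batch; omitting doc_ids makes the server parse every document in the dataset.
res = ragflow.start_parsing_documents(dataset_id, doc_ids=[doc_id])
print(res["code"], res["message"])
```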
api/apps/dataset_api.py
CHANGED
```diff
@@ -18,30 +18,35 @@ import re
 import warnings
 from io import BytesIO
 
+from elasticsearch_dsl import Q
 from flask import request, send_file
 from flask_login import login_required, current_user
 from httpx import HTTPError
-from minio import S3Error
 
 from api.contants import NAME_LENGTH_LIMIT
-from api.db import FileType, ParserType, FileSource
+from api.db import FileType, ParserType, FileSource, TaskStatus
 from api.db import StatusEnum
-from api.db.db_models import File
+from api.db.db_models import File, Task
 from api.db.services import duplicate_name
 from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.db.services.task_service import TaskService
 from api.db.services.user_service import TenantService
 from api.settings import RetCode
 from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture
+from rag.nlp import search
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 
 MAXIMUM_OF_UPLOADING_FILES = 256
 
+
 # ------------------------------ create a dataset ---------------------------------------
 
 @manager.route("/", methods=["POST"])
@@ -116,6 +121,7 @@ def create_dataset():
     except Exception as e:
         return construct_error_response(e)
 
+
 # -----------------------------list datasets-------------------------------------------------------
 
 @manager.route("/", methods=["GET"])
@@ -135,6 +141,7 @@ def list_datasets():
     except HTTPError as http_err:
         return construct_json_result(http_err)
 
+
 # ---------------------------------delete a dataset ----------------------------
 
 @manager.route("/<dataset_id>", methods=["DELETE"])
@@ -162,13 +169,15 @@ def remove_dataset(dataset_id):
 
         # delete the dataset
         if not KnowledgebaseService.delete_by_id(dataset_id):
-            return construct_json_result(code=RetCode.DATA_ERROR,
-
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message="There was an error during the dataset removal process. "
+                                                 "Please check the status of the RAGFlow server and try the removal again.")
         # success
         return construct_json_result(code=RetCode.SUCCESS, message=f"Remove dataset: {dataset_id} successfully")
     except Exception as e:
         return construct_error_response(e)
 
+
 # ------------------------------ get details of a dataset ----------------------------------------
 
 @manager.route("/<dataset_id>", methods=["GET"])
@@ -182,6 +191,7 @@ def get_dataset(dataset_id):
     except Exception as e:
         return construct_json_result(e)
 
+
 # ------------------------------ update a dataset --------------------------------------------
 
 @manager.route("/<dataset_id>", methods=["PUT"])
@@ -209,8 +219,9 @@ def update_dataset(dataset_id):
         if name.lower() != dataset.name.lower() \
                 and len(KnowledgebaseService.query(name=name, tenant_id=current_user.id,
                                                    status=StatusEnum.VALID.value)) > 1:
-            return construct_json_result(code=RetCode.DATA_ERROR,
-
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message=f"The name: {name.lower()} is already used by other "
+                                                 f"datasets. Please choose a different name.")
 
         dataset_updating_data = {}
         chunk_num = req.get("chunk_num")
@@ -222,17 +233,21 @@ def update_dataset(dataset_id):
             if chunk_num == 0:
                 dataset_updating_data["embd_id"] = req["embedding_model_id"]
             else:
-                construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document in this "
+                return construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document in this "
                                                                               "dataset, so you cannot change the embedding "
                                                                               "model.")
         # only if chunk_num is 0, the user can update the chunk_method
-        if
-
-
-
+        if "chunk_method" in req:
+            type_value = req["chunk_method"]
+            if is_illegal_value_for_enum(type_value, ParserType):
+                return construct_json_result(message=f"Illegal value {type_value} for 'chunk_method' field.",
+                                             code=RetCode.DATA_ERROR)
+            if chunk_num != 0:
                 construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document "
                                                                        "in this dataset, so you cannot "
                                                                        "change the chunk method.")
+            dataset_updating_data["parser_id"] = req["template_type"]
+
         # convert the photo parameter to avatar
         if req.get("photo"):
             dataset_updating_data["avatar"] = req["photo"]
@@ -265,6 +280,7 @@ def update_dataset(dataset_id):
     except Exception as e:
         return construct_error_response(e)
 
+
 # --------------------------------content management ----------------------------------------------
 
 # ----------------------------upload files-----------------------------------------------------
@@ -339,9 +355,10 @@ def upload_documents(dataset_id):
                 location += "_"
 
             blob = file.read()
+
             # the content is empty, raising a warning
             if blob == b'':
-                warnings.warn(f"[WARNING]: The file {filename} is empty.")
+                warnings.warn(f"[WARNING]: The content of the file {filename} is empty.")
 
             MINIO.put(dataset_id, location, blob)
 
@@ -453,6 +470,7 @@ def list_documents(dataset_id):
     except Exception as e:
         return construct_error_response(e)
 
+
 # ----------------------------update: enable rename-----------------------------------------------------
 @manager.route("/<dataset_id>/documents/<document_id>", methods=["PUT"])
 @login_required
@@ -555,6 +573,7 @@ def update_document(dataset_id, document_id):
 def is_illegal_value_for_enum(value, enum_class):
     return value not in enum_class.__members__.values()
 
+
 # ----------------------------download a file-----------------------------------------------------
 @manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"])
 @login_required
@@ -563,7 +582,8 @@ def download_document(dataset_id, document_id):
         # Check whether there is this dataset
         exist, _ = KnowledgebaseService.get_by_id(dataset_id)
         if not exist:
-            return construct_json_result(code=RetCode.DATA_ERROR,
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message=f"This dataset '{dataset_id}' cannot be found!")
 
         # Check whether there is this document
         exist, document = DocumentService.get_by_id(document_id)
@@ -591,8 +611,142 @@ def download_document(dataset_id, document_id):
     except Exception as e:
         return construct_error_response(e)
 
-# ----------------------------start parsing-----------------------------------------------------
 
+# ----------------------------start parsing a document-----------------------------------------------------
+# helper method for parsing
+def dummy(prog=None, msg=""):
+    pass
+
+
+def doc_parse(binary, doc_name, parser_name, tenant_id):
+    match parser_name:
+        case "book":
+            book.chunk(doc_name, binary=binary, callback=dummy)
+        case "laws":
+            laws.chunk(doc_name, binary=binary, callback=dummy)
+        case "manual":
+            manual.chunk(doc_name, binary=binary, callback=dummy)
+        case "naive":
+            # It's the mode by default, which is general in the front-end
+            naive.chunk(doc_name, binary=binary, callback=dummy)
+        case "one":
+            one.chunk(doc_name, binary=binary, callback=dummy)
+        case "paper":
+            paper.chunk(doc_name, binary=binary, callback=dummy)
+        case "picture":
+            picture.chunk(doc_name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy)
+        case "presentation":
+            presentation.chunk(doc_name, binary=binary, callback=dummy)
+        case "qa":
+            qa.chunk(doc_name, binary=binary, callback=dummy)
+        case "resume":
+            resume.chunk(doc_name, binary=binary, callback=dummy)
+        case "table":
+            table.chunk(doc_name, binary=binary, callback=dummy)
+        case _:
+            return False
+
+    return True
+
+
+@manager.route("/<dataset_id>/documents/<document_id>/status", methods=["POST"])
+@login_required
+def parse_document(dataset_id, document_id):
+    try:
+        # valid dataset
+        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
+        if not exist:
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message=f"This dataset '{dataset_id}' cannot be found!")
+        message = ""
+        res = get_message_during_parsing_document(document_id, message)
+        if isinstance(res, str):
+            message += res
+            return construct_json_result(code=RetCode.SUCCESS, message=message)
+        else:
+            return res
+
+    except Exception as e:
+        return construct_error_response(e)
+
+
+# ----------------------------start parsing documents-----------------------------------------------------
+@manager.route("/<dataset_id>/documents/status", methods=["POST"])
+@login_required
+def parse_documents(dataset_id):
+    doc_ids = request.json["doc_ids"]
+    try:
+        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
+        if not exist:
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message=f"This dataset '{dataset_id}' cannot be found!")
+
+        def process(doc_ids):
+            message = ""
+            # for loop
+            for id in doc_ids:
+                res = get_message_during_parsing_document(id, message)
+                if isinstance(res, str):
+                    message += res
+                else:
+                    return res
+            return construct_json_result(data=True, code=RetCode.SUCCESS, message=message)
+
+        # two conditions
+        if doc_ids:
+            return process(doc_ids)
+        else:
+            # documents inside the dataset
+            docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time",
+                                                                    True, "")
+            doc_ids = [doc["id"] for doc in docs]
+            return process(doc_ids)
+
+    except Exception as e:
+        return construct_error_response(e)
+
+
+# helper method for getting message or response when parsing the document
+def get_message_during_parsing_document(id, message):
+    try:
+        # Check whether there is this document
+        exist, document = DocumentService.get_by_id(id)
+        if not exist:
+            return construct_json_result(message=f"This document '{id}' cannot be found!",
+                                         code=RetCode.ARGUMENT_ERROR)
+
+        tenant_id = DocumentService.get_tenant_id(id)
+        if not tenant_id:
+            return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR)
+
+        info = {"run": "1", "progress": 0}
+        info["progress_msg"] = ""
+        info["chunk_num"] = 0
+        info["token_num"] = 0
+
+        DocumentService.update_by_id(id, info)
+
+        ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+
+        _, doc_attributes = DocumentService.get_by_id(id)
+        doc_attributes = doc_attributes.to_dict()
+        doc_id = doc_attributes["id"]
+
+        bucket, doc_name = File2DocumentService.get_minio_address(doc_id=doc_id)
+        binary = MINIO.get(bucket, doc_name)
+        parser_name = doc_attributes["parser_id"]
+        if binary:
+            res = doc_parse(binary, doc_name, parser_name, tenant_id)
+            if res is False:
+                message += f"The parser id: {parser_name} of the document {doc_id} is not supported; "
+        else:
+            message += f"Empty data in the document: {doc_name}; "
+        # failed in parsing
+        if doc_attributes["status"] == TaskStatus.FAIL.value:
+            message += f"Failed in parsing the document: {doc_id}; "
+        return message
+    except Exception as e:
+        return construct_error_response(e)
 # ----------------------------stop parsing-----------------------------------------------------
 
 # ----------------------------show the status of the file-----------------------------------------------------
@@ -610,6 +764,3 @@ def download_document(dataset_id, document_id):
 # ----------------------------get a specific chunk-----------------------------------------------------
 
 # ----------------------------retrieval test-----------------------------------------------------
-
-
-
```
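
Seen from a plain HTTP client, the two new routes behave as follows (a sketch using requests; the `/api/v1/dataset` prefix and the header layout are assumptions inferred from the SDK's `dataset_url` and `authorization_header`, which this diff does not show):

```python
import requests

base = "http://127.0.0.1:9380/api/v1/dataset"  # assumed mount point of this blueprint
headers = {"Authorization": "YOUR_API_KEY"}    # assumed authorization_header shape
dataset_id, document_id = "DATASET_ID", "DOCUMENT_ID"  # placeholders

# POST /<dataset_id>/documents/<document_id>/status: parse one document.
single = requests.post(f"{base}/{dataset_id}/documents/{document_id}/status",
                       headers=headers)

# POST /<dataset_id>/documents/status: parse a batch. The handler reads
# request.json["doc_ids"], so the key must be present; an empty or null
# value makes the server parse every document in the dataset.
batch = requests.post(f"{base}/{dataset_id}/documents/status",
                      headers=headers, json={"doc_ids": [document_id]})

print(single.json(), batch.json())
```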
sdk/python/ragflow/ragflow.py
CHANGED
```diff
@@ -142,7 +142,19 @@ class RAGFlow:
         with open(file_path, "wb") as file:
             file.write(content)
         return {"code": RetCode.SUCCESS, "data": content}
+
     # ----------------------------start parsing-----------------------------------------------------
+    def start_parsing_document(self, dataset_id, document_id):
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}/status"
+        res = requests.post(endpoint, headers=self.authorization_header)
+
+        return res.json()
+
+    def start_parsing_documents(self, dataset_id, doc_ids=None):
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/status"
+        res = requests.post(endpoint, headers=self.authorization_header, json={"doc_ids": doc_ids})
+
+        return res.json()
 
     # ----------------------------stop parsing-----------------------------------------------------
 
```
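
Note that `start_parsing_documents` always sends a `{"doc_ids": ...}` JSON body, so the server-side `request.json["doc_ids"]` lookup succeeds even when the caller omits `doc_ids`; the value is then `null`, and `parse_documents` falls back to listing every document in the dataset via `DocumentService.list_documents_in_dataset` and parsing them all.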
sdk/python/test/test_data/lol.txt
ADDED
```diff
@@ -0,0 +1,3 @@
+llll
+ooooo
+llll
```
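
This three-line fixture gives `test_start_parsing_document_with_success` a small non-empty file to upload and parse.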
sdk/python/test/test_document.py
CHANGED
```diff
@@ -695,7 +695,261 @@ class TestFile(TestSdk):
         assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This file is empty."
 
     # ----------------------------start parsing-----------------------------------------------------
+    def test_start_parsing_document_with_success(self):
+        """
+        Test the parsing of a document with success.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_start_parsing_document_with_success")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/lol.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"][0]
+        doc_id = data["id"]
+        # parse file
+        res = ragflow.start_parsing_document(created_res_id, doc_id)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+
+    def test_start_parsing_nonexistent_document(self):
+        """
+        Test the parsing a document which does not exist.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_start_parsing_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
+        res = ragflow.start_parsing_document(created_res_id, "imagination")
+        assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == "This document 'imagination' cannot be found!"
+
+    def test_start_parsing_document_in_nonexistent_dataset(self):
+        """
+        Test the parsing a document whose dataset is nonexistent.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_download_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"][0]
+        doc_id = data["id"]
+        # parse
+        res = ragflow.start_parsing_document("imagination", doc_id)
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!"
+
+    def test_start_parsing_an_empty_document(self):
+        """
+        Test the parsing of an empty document.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_download_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/empty.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"][0]
+        doc_id = data["id"]
+        res = ragflow.start_parsing_document(created_res_id, doc_id)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; "
+
+    # ------------------------parsing multiple documents----------------------------
+    def test_start_parsing_documents_in_nonexistent_dataset(self):
+        """
+        Test the parsing documents whose dataset is nonexistent.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_download_nonexistent_document")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # parse
+        res = ragflow.start_parsing_documents("imagination")
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!"
+
+    def test_start_parsing_multiple_documents(self):
+        """
+        Test the parsing documents with a success.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        res = ragflow.start_parsing_documents(created_res_id)
+        assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == ""
+
+    def test_start_parsing_multiple_documents_with_one_empty_file(self):
+        """
+        Test the parsing documents, one of which is empty.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        res = ragflow.start_parsing_documents(created_res_id)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; "
+
+    def test_start_parsing_multiple_specific_documents(self):
+        """
+        Test the parsing documents whose document ids are specified.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+
+    def test_start_re_parsing_multiple_specific_documents(self):
+        """
+        Test the re-parsing documents.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+        # re-parse
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+
+    def test_start_re_parsing_multiple_specific_documents_with_changing_parser_id(self):
+        """
+        Test the re-parsing documents after changing the parser id.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+        # general -> laws
+        params = {
+            "template_type": "laws"
+        }
+        ragflow.update_file(created_res_id, doc_ids[0], **params)
+        # re-parse
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+
+    def test_start_re_parsing_multiple_specific_documents_with_changing_illegal_parser_id(self):
+        """
+        Test the re-parsing documents after changing an illegal parser id.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+        # general -> illegal
+        params = {
+            "template_type": "illegal"
+        }
+        res = ragflow.update_file(created_res_id, doc_ids[0], **params)
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'template_type' field."
+        # re-parse
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
+
+    def test_start_parsing_multiple_specific_documents_with_changing_illegal_parser_id(self):
+        """
+        Test the parsing documents after changing an illegal parser id.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents")
+        created_res_id = created_res["data"]["dataset_id"]
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        # general -> illegal
+        params = {
+            "template_type": "illegal"
+        }
+        res = ragflow.update_file(created_res_id, doc_ids[0], **params)
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'template_type' field."
+        # re-parse
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
 
+    def test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal(self):
+        """
+        Test the parsing documents whose dataset's parser id is illegal.
+        """
+        # create a dataset
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal")
+        created_res_id = created_res["data"]["dataset_id"]
+        # update the parser id
+        params = {
+            "chunk_method": "illegal"
+        }
+        res = ragflow.update_dataset("test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal", **params)
+        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'chunk_method' field."
+        # upload files
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
+        # get the doc_id
+        data = uploading_res["data"]
+        doc_ids = []
+        for d in data:
+            doc_ids.append(d["id"])
+        # parse
+        res = ragflow.start_parsing_documents(created_res_id, doc_ids)
+        assert res["code"] == RetCode.SUCCESS and res["message"] == ""
     # ----------------------------stop parsing-----------------------------------------------------
 
     # ----------------------------show the status of the file-----------------------------------------------------
```
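
These tests follow the existing `TestSdk` pattern: they assume a running RAGFlow server reachable via `HOST_ADDRESS` with a valid `API_KEY`, and the pre-existing fixtures `test_data/test.txt`, `test_data/test1.txt`, and `test_data/empty.txt` alongside the new `test_data/lol.txt`. Together they cover single-document parsing, batch parsing (explicit ids and whole-dataset), empty files, nonexistent ids, and re-parsing after a parser-id change.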
|