LiuHua
root
committed on
Commit
·
7889c99
1
Parent(s):
788c296
create dataset (#2074)
Browse files### What problem does this PR solve?
You can use sdk to create a dataset
### Type of change
- [x] New Feature
---------
Co-authored-by: root <root@xwg>
- sdk/python/ragflow/modules/__init__.py +0 -0
- sdk/python/ragflow/modules/base.py +30 -0
- sdk/python/ragflow/modules/dataset.py +33 -0
- sdk/python/ragflow/ragflow.py +36 -25
- sdk/python/setup.py +3 -1
- sdk/python/test/common.py +1 -1
- sdk/python/test/t_dataset.py +23 -0
sdk/python/ragflow/modules/__init__.py
ADDED
File without changes
|
sdk/python/ragflow/modules/base.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Base:
    """Lightweight wrapper that maps an API-response dict onto attributes.

    Nested dicts become nested ``Base`` instances, so ``obj.a.b`` mirrors
    ``res_dict["a"]["b"]``.  The owning client is kept in ``self.rag`` so
    subclasses can issue follow-up API calls via ``post()``/``get()``.
    """

    def __init__(self, rag, res_dict):
        # rag: the API client used by post()/get(); excluded from to_json().
        self.rag = rag
        for k, v in res_dict.items():
            if isinstance(v, dict):
                self.__dict__[k] = Base(rag, v)
            else:
                self.__dict__[k] = v

    def to_json(self):
        """Return a plain-dict snapshot of the public, non-callable attributes.

        Nested ``Base`` values are serialized recursively; dunders, methods,
        and the client reference (``rag``) are skipped.
        """
        pr = {}
        for name in dir(self):
            value = getattr(self, name)
            if not name.startswith('__') and not callable(value) and name != "rag":
                if isinstance(value, Base):
                    pr[name] = value.to_json()
                else:
                    pr[name] = value
        return pr

    def post(self, path, param):
        """Delegate a POST request to the owning client and return its response."""
        return self.rag.post(path, param)

    def get(self, path, params=''):
        """Delegate a GET request to the owning client and return its response."""
        return self.rag.get(path, params)
sdk/python/ragflow/modules/dataset.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from .base import Base


class DataSet(Base):
    """Client-side handle for a knowledge-base ("dataset") resource.

    Attribute defaults below are placeholders; ``Base.__init__`` overwrites
    them with whatever keys the server response actually contains.
    """

    class ParseConfig(Base):
        """Parser settings attached to a dataset.

        Defaults presumably mirror the server-side parser configuration —
        TODO(review): confirm against the backend defaults.
        """

        def __init__(self, rag, res_dict):
            self.chunk_token_count = 128
            self.layout_recognize = True
            self.delimiter = '\n!?。;!?'
            self.task_page_size = 12
            super().__init__(rag, res_dict)

    def __init__(self, rag, res_dict):
        self.id = ""
        self.name = ""
        self.avatar = ""
        self.tenant_id = None
        self.description = ""
        self.language = "English"
        self.embedding_model = ""
        self.permission = "me"
        self.document_count = 0
        self.chunk_count = 0
        self.parse_method = 0
        self.parser_config = None
        super().__init__(rag, res_dict)

    def delete(self):
        """Delete this dataset on the server.

        Returns True on success, False on any failure.  ``requests`` does not
        raise on HTTP error status codes, so the response is checked
        explicitly — otherwise a failed deletion would be reported as True.
        """
        try:
            res = self.post("/rm", {"kb_id": self.id})
            res.raise_for_status()
            return True
        except Exception:
            return False
|
sdk/python/ragflow/ragflow.py
CHANGED
@@ -12,35 +12,56 @@
|
|
12 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License.
|
15 |
-
import json
|
16 |
-
import os
|
17 |
|
18 |
import requests
|
19 |
|
20 |
-
from
|
21 |
-
from api.settings import RetCode
|
22 |
|
23 |
|
24 |
class RAGFlow:
|
25 |
def __init__(self, user_key, base_url, version='v1'):
|
26 |
"""
|
27 |
-
api_url: http://<host_address>/
|
28 |
-
dataset_url: http://<host_address>/
|
29 |
-
document_url: http://<host_address>/
|
30 |
"""
|
31 |
self.user_key = user_key
|
32 |
-
self.api_url = f"{base_url}/
|
33 |
-
self.dataset_url = f"{self.api_url}/
|
34 |
self.authorization_header = {"Authorization": "{}".format(self.user_key)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
def create_dataset(self, dataset_name):
|
37 |
"""
|
38 |
name: dataset name
|
39 |
"""
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
def delete_dataset(self, dataset_name):
|
45 |
dataset_id = self.find_dataset_id_by_name(dataset_name)
|
46 |
|
@@ -55,16 +76,6 @@ class RAGFlow:
|
|
55 |
return dataset["id"]
|
56 |
return None
|
57 |
|
58 |
-
def list_dataset(self, offset=0, count=-1, orderby="create_time", desc=True):
|
59 |
-
params = {
|
60 |
-
"offset": offset,
|
61 |
-
"count": count,
|
62 |
-
"orderby": orderby,
|
63 |
-
"desc": desc
|
64 |
-
}
|
65 |
-
response = requests.get(url=self.dataset_url, params=params, headers=self.authorization_header)
|
66 |
-
return response.json()
|
67 |
-
|
68 |
def get_dataset(self, dataset_name):
|
69 |
dataset_id = self.find_dataset_id_by_name(dataset_name)
|
70 |
endpoint = f"{self.dataset_url}/{dataset_id}"
|
@@ -78,7 +89,7 @@ class RAGFlow:
|
|
78 |
response = requests.put(endpoint, json=params, headers=self.authorization_header)
|
79 |
return response.json()
|
80 |
|
81 |
-
# ------------------------------- CONTENT MANAGEMENT -----------------------------------------------------
|
82 |
|
83 |
# ----------------------------upload local files-----------------------------------------------------
|
84 |
def upload_local_file(self, dataset_id, file_paths):
|
@@ -186,4 +197,4 @@ class RAGFlow:
|
|
186 |
# ----------------------------get a specific chunk-----------------------------------------------------
|
187 |
|
188 |
# ----------------------------retrieval test-----------------------------------------------------
|
189 |
-
|
|
|
12 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License.
|
|
|
|
|
15 |
|
16 |
import requests
|
17 |
|
18 |
+
from .modules.dataset import DataSet
|
|
|
19 |
|
20 |
|
21 |
class RAGFlow:
    """Entry point of the Python SDK: holds credentials and endpoint URLs.

    api_url: http://<host_address>/v1
    dataset_url: http://<host_address>/v1/kb
    """

    def __init__(self, user_key, base_url, version='v1'):
        """
        user_key: API key sent verbatim in the Authorization header.
        base_url: server root, e.g. http://127.0.0.1:9380
        version: API version segment appended to base_url.
        """
        self.user_key = user_key
        self.api_url = f"{base_url}/{version}"
        self.dataset_url = f"{self.api_url}/kb"
        self.authorization_header = {"Authorization": "{}".format(self.user_key)}
        self.base_url = base_url

    def post(self, path, param):
        """POST ``param`` as JSON to ``dataset_url + path``; return the raw response."""
        return requests.post(url=self.dataset_url + path, json=param,
                             headers=self.authorization_header)

    def get(self, path, params=''):
        """GET ``dataset_url + path`` with query ``params``; return the raw response."""
        return requests.get(self.dataset_url + path, params=params,
                            headers=self.authorization_header)

    def create_dataset(self, dataset_name):
        """Create a dataset named ``dataset_name`` and return a DataSet handle.

        Issues a create call, then fetches the detail record for the new
        dataset.  Raises KeyError if either response lacks the expected
        'data' payload (e.g. on a server-side failure).
        """
        res_create = self.post("/create", {"name": dataset_name})
        res_create_data = res_create.json()['data']
        res_detail = self.get("/detail", {"kb_id": res_create_data["kb_id"]})
        res_detail_data = res_detail.json()['data']
        # Map the server's field names onto the SDK's attribute names in one
        # place instead of ten hand-written assignments.
        field_map = {
            'id': 'id',
            'name': 'name',
            'avatar': 'avatar',
            'description': 'description',
            'language': 'language',
            'embedding_model': 'embd_id',
            'permission': 'permission',
            'document_count': 'doc_num',
            'chunk_count': 'chunk_num',
            'parser_config': 'parser_config',
        }
        result = {sdk_key: res_detail_data[srv_key]
                  for sdk_key, srv_key in field_map.items()}
        return DataSet(self, result)
|
63 |
+
|
64 |
+
"""
|
65 |
def delete_dataset(self, dataset_name):
|
66 |
dataset_id = self.find_dataset_id_by_name(dataset_name)
|
67 |
|
|
|
76 |
return dataset["id"]
|
77 |
return None
|
78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
def get_dataset(self, dataset_name):
|
80 |
dataset_id = self.find_dataset_id_by_name(dataset_name)
|
81 |
endpoint = f"{self.dataset_url}/{dataset_id}"
|
|
|
89 |
response = requests.put(endpoint, json=params, headers=self.authorization_header)
|
90 |
return response.json()
|
91 |
|
92 |
+
# ------------------------------- CONTENT MANAGEMENT -----------------------------------------------------
|
93 |
|
94 |
# ----------------------------upload local files-----------------------------------------------------
|
95 |
def upload_local_file(self, dataset_id, file_paths):
|
|
|
197 |
# ----------------------------get a specific chunk-----------------------------------------------------
|
198 |
|
199 |
# ----------------------------retrieval test-----------------------------------------------------
|
200 |
+
"""
|
sdk/python/setup.py
CHANGED
@@ -15,5 +15,7 @@
|
|
15 |
import setuptools
|
16 |
|
17 |
if __name__ == "__main__":
|
18 |
-
setuptools.setup(
|
|
|
|
|
19 |
|
|
|
import setuptools

if __name__ == "__main__":
    # Packaging metadata for the ragflow Python SDK; packages are discovered
    # automatically rather than listed by hand.
    setuptools.setup(
        name='ragflow',
        version="0.1",
        packages=setuptools.find_packages(),
    )
21 |
|
sdk/python/test/common.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
|
2 |
|
3 |
-
API_KEY = '
|
4 |
HOST_ADDRESS = 'http://127.0.0.1:9380'
|
|
|
import os

# Connection settings for the SDK integration tests.  Values are read from
# the environment first so real credentials never need to be committed; the
# literals are fallbacks for a local development server.
# SECURITY NOTE(review): the fallback API key below was committed with this
# file — it should be rotated, and CI should supply RAGFLOW_API_KEY instead.
API_KEY = os.environ.get(
    'RAGFLOW_API_KEY',
    'IjUxNGM0MmM4NWY5MzExZWY5MDhhMDI0MmFjMTIwMDA2Ig.ZsWebA.mV1NKdSPPllgowiH-7vz36tMWyI')
HOST_ADDRESS = os.environ.get('RAGFLOW_HOST_ADDRESS', 'http://127.0.0.1:9380')
|
sdk/python/test/t_dataset.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ragflow import RAGFlow
|
2 |
+
|
3 |
+
from common import API_KEY, HOST_ADDRESS
|
4 |
+
from test_sdkbase import TestSdk
|
5 |
+
|
6 |
+
|
7 |
+
class TestDataset(TestSdk):
    """Integration tests for dataset creation and deletion.

    NOTE(review): these tests hit a live server at HOST_ADDRESS with API_KEY
    and create real datasets ("God", "ABC"); they are integration tests, not
    unit tests, and leave the "God" dataset behind.
    """

    def test_create_dataset_with_success(self):
        """Creating a dataset should return a handle carrying the given name."""
        rag = RAGFlow(API_KEY, HOST_ADDRESS)
        ds = rag.create_dataset("God")
        assert ds is not None, "The dataset creation failed, returned None."
        assert ds.name == "God", "Dataset name does not match."

    def test_delete_one_file(self):
        """
        Create a dataset, then delete it; delete() should report success.
        (Despite the method name, this deletes a dataset, not a file.)
        """
        rag = RAGFlow(API_KEY, HOST_ADDRESS)
        ds = rag.create_dataset("ABC")
        assert ds is not None, "Failed to create dataset"
        assert ds.name == "ABC", "Dataset name mismatch"
        delete_result = ds.delete()
        assert delete_result is True, "Failed to delete dataset"
|