Matej Horník committed on
Commit 19e6d59 · 1 Parent(s): 3090a99

feat: docs for api endpoints to generate openapi specification (#3109)


### What problem does this PR solve?

**Added an OpenAPI specification for the API routes. This generates a Swagger UI,
similar to FastAPI's, that makes the API easier to explore and use.**
Implemented with the Python package `flasgger`.

### Type of change
- [x] New Feature (non-breaking change which adds functionality)

Not all routes are included since this is a work in progress.

Docs can be accessed at: `{host}:{port}/apidocs`
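
The raw specification is also served as JSON at `/apispec.json` (the route registered in `swagger_config` below), which is handy for generating clients or diffing the API contract. A minimal sketch of fetching it; the host and port are placeholders for your own deployment:

```python
# Minimal sketch: fetch the generated OpenAPI/Swagger spec as JSON.
# The /apispec.json route comes from swagger_config in api/apps/__init__.py;
# "localhost:9380" is an assumed host:port for illustration only.
import requests

resp = requests.get("http://localhost:9380/apispec.json")
spec = resp.json()
print(spec["info"]["title"])                 # "RAGFlow API", from the template
print(sorted(spec.get("paths", {}).keys()))  # the documented endpoints
```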

api/apps/__init__.py CHANGED
```diff
@@ -21,6 +21,7 @@ from pathlib import Path
 from flask import Blueprint, Flask
 from werkzeug.wrappers.request import Request
 from flask_cors import CORS
+from flasgger import Swagger
 
 from api.db import StatusEnum
 from api.db.db_models import close_connection
@@ -34,27 +35,62 @@ from api.settings import API_VERSION, access_logger
 from api.utils.api_utils import server_error_response
 from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer
 
-__all__ = ['app']
+__all__ = ["app"]
 
 
-logger = logging.getLogger('flask.app')
+logger = logging.getLogger("flask.app")
 for h in access_logger.handlers:
     logger.addHandler(h)
 
 Request.json = property(lambda self: self.get_json(force=True, silent=True))
 
 app = Flask(__name__)
-CORS(app, supports_credentials=True,max_age=2592000)
+
+# Add this at the beginning of your file to configure Swagger UI
+swagger_config = {
+    "headers": [],
+    "specs": [
+        {
+            "endpoint": "apispec",
+            "route": "/apispec.json",
+            "rule_filter": lambda rule: True,  # Include all endpoints
+            "model_filter": lambda tag: True,  # Include all models
+        }
+    ],
+    "static_url_path": "/flasgger_static",
+    "swagger_ui": True,
+    "specs_route": "/apidocs/",
+}
+
+swagger = Swagger(
+    app,
+    config=swagger_config,
+    template={
+        "swagger": "2.0",
+        "info": {
+            "title": "RAGFlow API",
+            "description": "",
+            "version": "1.0.0",
+        },
+        "securityDefinitions": {
+            "ApiKeyAuth": {"type": "apiKey", "name": "Authorization", "in": "header"}
+        },
+    },
+)
+
+CORS(app, supports_credentials=True, max_age=2592000)
 app.url_map.strict_slashes = False
 app.json_encoder = CustomJSONEncoder
 app.errorhandler(Exception)(server_error_response)
 
 
 ## convince for dev and debug
-#app.config["LOGIN_DISABLED"] = True
+# app.config["LOGIN_DISABLED"] = True
 app.config["SESSION_PERMANENT"] = False
 app.config["SESSION_TYPE"] = "filesystem"
-app.config['MAX_CONTENT_LENGTH'] = int(os.environ.get("MAX_CONTENT_LENGTH", 128 * 1024 * 1024))
+app.config["MAX_CONTENT_LENGTH"] = int(
+    os.environ.get("MAX_CONTENT_LENGTH", 128 * 1024 * 1024)
+)
 
 Session(app)
 login_manager = LoginManager()
@@ -64,17 +100,23 @@ commands.register_commands(app)
 
 
 def search_pages_path(pages_dir):
-    app_path_list = [path for path in pages_dir.glob('*_app.py') if not path.name.startswith('.')]
-    api_path_list = [path for path in pages_dir.glob('*sdk/*.py') if not path.name.startswith('.')]
+    app_path_list = [
+        path for path in pages_dir.glob("*_app.py") if not path.name.startswith(".")
+    ]
+    api_path_list = [
+        path for path in pages_dir.glob("*sdk/*.py") if not path.name.startswith(".")
+    ]
     app_path_list.extend(api_path_list)
     return app_path_list
 
 
 def register_page(page_path):
-    path = f'{page_path}'
+    path = f"{page_path}"
 
-    page_name = page_path.stem.rstrip('_app')
-    module_name = '.'.join(page_path.parts[page_path.parts.index('api'):-1] + (page_name,))
+    page_name = page_path.stem.rstrip("_app")
+    module_name = ".".join(
+        page_path.parts[page_path.parts.index("api") : -1] + (page_name,)
+    )
 
     spec = spec_from_file_location(module_name, page_path)
     page = module_from_spec(spec)
@@ -82,8 +124,10 @@ def register_page(page_path):
     page.manager = Blueprint(page_name, module_name)
     sys.modules[module_name] = page
     spec.loader.exec_module(page)
-    page_name = getattr(page, 'page_name', page_name)
-    url_prefix = f'/api/{API_VERSION}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'
+    page_name = getattr(page, "page_name", page_name)
+    url_prefix = (
+        f"/api/{API_VERSION}" if "/sdk/" in path else f"/{API_VERSION}/{page_name}"
+    )
 
     app.register_blueprint(page.manager, url_prefix=url_prefix)
     return url_prefix
@@ -91,14 +135,12 @@ def register_page(page_path):
 
 pages_dir = [
     Path(__file__).parent,
-    Path(__file__).parent.parent / 'api' / 'apps',
-    Path(__file__).parent.parent / 'api' / 'apps' / 'sdk',
+    Path(__file__).parent.parent / "api" / "apps",
+    Path(__file__).parent.parent / "api" / "apps" / "sdk",
 ]
 
 client_urls_prefix = [
-    register_page(path)
-    for dir in pages_dir
-    for path in search_pages_path(dir)
+    register_page(path) for dir in pages_dir for path in search_pages_path(dir)
 ]
 
 
@@ -109,7 +151,9 @@ def load_user(web_request):
     if authorization:
         try:
             access_token = str(jwt.loads(authorization))
-            user = UserService.query(access_token=access_token, status=StatusEnum.VALID.value)
+            user = UserService.query(
+                access_token=access_token, status=StatusEnum.VALID.value
+            )
             if user:
                 return user[0]
             else:
@@ -123,4 +167,4 @@ def load_user(web_request):
 
 @app.teardown_request
 def _db_close(exc):
-    close_connection()
+    close_connection()
```
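
With this wiring in place, flasgger needs no per-route registration: once `Swagger(app, config=..., template=...)` is constructed, it scans every registered view's docstring, treats the YAML after the `---` marker as that operation's OpenAPI description, and merges it into the shared template (which is where the `ApiKeyAuth` security definition comes from). A minimal standalone sketch of the same pattern; this toy app is illustrative only, not part of the PR:

```python
# Minimal standalone sketch of the flasgger pattern used in this PR:
# the YAML after the `---` line in a view docstring becomes its OpenAPI entry.
from flask import Flask, jsonify
from flasgger import Swagger

app = Flask(__name__)
swagger = Swagger(app)  # serves Swagger UI at /apidocs/ by default


@app.route("/ping")
def ping():
    """
    Health check.
    ---
    tags:
      - Demo
    responses:
      200:
        description: Service is up.
    """
    return jsonify(status="ok")


if __name__ == "__main__":
    app.run()
```

Running this and opening `/apidocs/` shows the same kind of UI the PR enables for RAGFlow.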
api/apps/sdk/dataset.py CHANGED
```diff
@@ -21,16 +21,72 @@ from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.llm_service import TenantLLMService,LLMService
+from api.db.services.llm_service import TenantLLMService, LLMService
 from api.db.services.user_service import TenantService
 from api.settings import RetCode
 from api.utils import get_uuid
-from api.utils.api_utils import get_result, token_required, get_error_data_result, valid,get_parser_config
+from api.utils.api_utils import (
+    get_result,
+    token_required,
+    get_error_data_result,
+    valid,
+    get_parser_config,
+)
 
 
-@manager.route('/datasets', methods=['POST'])
+@manager.route("/datasets", methods=["POST"])
 @token_required
 def create(tenant_id):
+    """
+    Create a new dataset.
+    ---
+    tags:
+      - Datasets
+    security:
+      - ApiKeyAuth: []
+    parameters:
+      - in: header
+        name: Authorization
+        type: string
+        required: true
+        description: Bearer token for authentication.
+      - in: body
+        name: body
+        description: Dataset creation parameters.
+        required: true
+        schema:
+          type: object
+          required:
+            - name
+          properties:
+            name:
+              type: string
+              description: Name of the dataset.
+            permission:
+              type: string
+              enum: ['me', 'team']
+              description: Dataset permission.
+            language:
+              type: string
+              enum: ['Chinese', 'English']
+              description: Language of the dataset.
+            chunk_method:
+              type: string
+              enum: ["naive", "manual", "qa", "table", "paper", "book", "laws",
+                     "presentation", "picture", "one", "knowledge_graph", "email"]
+              description: Chunking method.
+            parser_config:
+              type: object
+              description: Parser configuration.
+    responses:
+      200:
+        description: Successful operation.
+        schema:
+          type: object
+          properties:
+            data:
+              type: object
+    """
     req = request.json
     e, t = TenantService.get_by_id(tenant_id)
     permission = req.get("permission")
@@ -38,49 +94,97 @@ def create(tenant_id):
     chunk_method = req.get("chunk_method")
     parser_config = req.get("parser_config")
     valid_permission = ["me", "team"]
-    valid_language =["Chinese", "English"]
-    valid_chunk_method = ["naive","manual","qa","table","paper","book","laws","presentation","picture","one","knowledge_graph","email"]
-    check_validation=valid(permission,valid_permission,language,valid_language,chunk_method,valid_chunk_method)
+    valid_language = ["Chinese", "English"]
+    valid_chunk_method = [
+        "naive",
+        "manual",
+        "qa",
+        "table",
+        "paper",
+        "book",
+        "laws",
+        "presentation",
+        "picture",
+        "one",
+        "knowledge_graph",
+        "email",
+    ]
+    check_validation = valid(
+        permission,
+        valid_permission,
+        language,
+        valid_language,
+        chunk_method,
+        valid_chunk_method,
+    )
     if check_validation:
         return check_validation
-    req["parser_config"]=get_parser_config(chunk_method,parser_config)
+    req["parser_config"] = get_parser_config(chunk_method, parser_config)
     if "tenant_id" in req:
-        return get_error_data_result(
-            retmsg="`tenant_id` must not be provided")
+        return get_error_data_result(retmsg="`tenant_id` must not be provided")
     if "chunk_count" in req or "document_count" in req:
-        return get_error_data_result(retmsg="`chunk_count` or `document_count` must not be provided")
-    if "name" not in req:
         return get_error_data_result(
-            retmsg="`name` is not empty!")
-    req['id'] = get_uuid()
+            retmsg="`chunk_count` or `document_count` must not be provided"
+        )
+    if "name" not in req:
+        return get_error_data_result(retmsg="`name` is not empty!")
+    req["id"] = get_uuid()
     req["name"] = req["name"].strip()
     if req["name"] == "":
-        return get_error_data_result(
-            retmsg="`name` is not empty string!")
-    if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
-        return get_error_data_result(
-            retmsg="Duplicated dataset name in creating dataset.")
-    req["tenant_id"] = req['created_by'] = tenant_id
+        return get_error_data_result(retmsg="`name` is not empty string!")
+    if KnowledgebaseService.query(
+        name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value
+    ):
+        return get_error_data_result(
+            retmsg="Duplicated dataset name in creating dataset."
+        )
+    req["tenant_id"] = req["created_by"] = tenant_id
     if not req.get("embedding_model"):
-        req['embedding_model'] = t.embd_id
+        req["embedding_model"] = t.embd_id
     else:
-        valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
-                                "BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
-                                "nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
-                                "text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
-        embd_model=LLMService.query(llm_name=req["embedding_model"],model_type="embedding")
+        valid_embedding_models = [
+            "BAAI/bge-large-zh-v1.5",
+            "BAAI/bge-base-en-v1.5",
+            "BAAI/bge-large-en-v1.5",
+            "BAAI/bge-small-en-v1.5",
+            "BAAI/bge-small-zh-v1.5",
+            "jinaai/jina-embeddings-v2-base-en",
+            "jinaai/jina-embeddings-v2-small-en",
+            "nomic-ai/nomic-embed-text-v1.5",
+            "sentence-transformers/all-MiniLM-L6-v2",
+            "text-embedding-v2",
+            "text-embedding-v3",
+            "maidalun1020/bce-embedding-base_v1",
+        ]
+        embd_model = LLMService.query(
+            llm_name=req["embedding_model"], model_type="embedding"
+        )
         if not embd_model:
-            return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
+            return get_error_data_result(
+                f"`embedding_model` {req.get('embedding_model')} doesn't exist"
+            )
         if embd_model:
-            if req["embedding_model"] not in valid_embedding_models and not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
-                return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
+            if req[
+                "embedding_model"
+            ] not in valid_embedding_models and not TenantLLMService.query(
+                tenant_id=tenant_id,
+                model_type="embedding",
+                llm_name=req.get("embedding_model"),
+            ):
+                return get_error_data_result(
+                    f"`embedding_model` {req.get('embedding_model')} doesn't exist"
+                )
     key_mapping = {
         "chunk_num": "chunk_count",
         "doc_num": "document_count",
         "parser_id": "chunk_method",
-        "embd_id": "embedding_model"
+        "embd_id": "embedding_model",
+    }
+    mapped_keys = {
+        new_key: req[old_key]
+        for new_key, old_key in key_mapping.items()
+        if old_key in req
     }
-    mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
     req.update(mapped_keys)
     if not KnowledgebaseService.save(**req):
         return get_error_data_result(retmsg="Create dataset error.(Database error)")
@@ -91,21 +195,53 @@ def create(tenant_id):
         renamed_data[new_key] = value
     return get_result(data=renamed_data)
 
-@manager.route('/datasets', methods=['DELETE'])
+
+@manager.route("/datasets", methods=["DELETE"])
 @token_required
 def delete(tenant_id):
+    """
+    Delete datasets.
+    ---
+    tags:
+      - Datasets
+    security:
+      - ApiKeyAuth: []
+    parameters:
+      - in: header
+        name: Authorization
+        type: string
+        required: true
+        description: Bearer token for authentication.
+      - in: body
+        name: body
+        description: Dataset deletion parameters.
+        required: true
+        schema:
+          type: object
+          properties:
+            ids:
+              type: array
+              items:
+                type: string
+              description: List of dataset IDs to delete.
+    responses:
+      200:
+        description: Successful operation.
+        schema:
+          type: object
+    """
     req = request.json
     if not req:
-        ids=None
+        ids = None
     else:
-        ids=req.get("ids")
+        ids = req.get("ids")
     if not ids:
         id_list = []
-        kbs=KnowledgebaseService.query(tenant_id=tenant_id)
+        kbs = KnowledgebaseService.query(tenant_id=tenant_id)
        for kb in kbs:
            id_list.append(kb.id)
    else:
-        id_list=ids
+        id_list = ids
    for id in id_list:
        kbs = KnowledgebaseService.query(id=id, tenant_id=tenant_id)
        if not kbs:
@@ -113,19 +249,75 @@ def delete(tenant_id):
         for doc in DocumentService.query(kb_id=id):
             if not DocumentService.remove_document(doc, tenant_id):
                 return get_error_data_result(
-                    retmsg="Remove document error.(Database error)")
+                    retmsg="Remove document error.(Database error)"
+                )
             f2d = File2DocumentService.get_by_document_id(doc.id)
-            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+            FileService.filter_delete(
+                [
+                    File.source_type == FileSource.KNOWLEDGEBASE,
+                    File.id == f2d[0].file_id,
+                ]
+            )
             File2DocumentService.delete_by_document_id(doc.id)
         if not KnowledgebaseService.delete_by_id(id):
-            return get_error_data_result(
-                retmsg="Delete dataset error.(Database error)")
+            return get_error_data_result(retmsg="Delete dataset error.(Database error)")
     return get_result(retcode=RetCode.SUCCESS)
 
-@manager.route('/datasets/<dataset_id>', methods=['PUT'])
+
+@manager.route("/datasets/<dataset_id>", methods=["PUT"])
 @token_required
-def update(tenant_id,dataset_id):
-    if not KnowledgebaseService.query(id=dataset_id,tenant_id=tenant_id):
+def update(tenant_id, dataset_id):
+    """
+    Update a dataset.
+    ---
+    tags:
+      - Datasets
+    security:
+      - ApiKeyAuth: []
+    parameters:
+      - in: path
+        name: dataset_id
+        type: string
+        required: true
+        description: ID of the dataset to update.
+      - in: header
+        name: Authorization
+        type: string
+        required: true
+        description: Bearer token for authentication.
+      - in: body
+        name: body
+        description: Dataset update parameters.
+        required: true
+        schema:
+          type: object
+          properties:
+            name:
+              type: string
+              description: New name of the dataset.
+            permission:
+              type: string
+              enum: ['me', 'team']
+              description: Updated permission.
+            language:
+              type: string
+              enum: ['Chinese', 'English']
+              description: Updated language.
+            chunk_method:
+              type: string
+              enum: ["naive", "manual", "qa", "table", "paper", "book", "laws",
+                     "presentation", "picture", "one", "knowledge_graph", "email"]
+              description: Updated chunking method.
+            parser_config:
+              type: object
+              description: Updated parser configuration.
+    responses:
+      200:
+        description: Successful operation.
+        schema:
+          type: object
+    """
+    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
         return get_error_data_result(retmsg="You don't own the dataset")
     req = request.json
     e, t = TenantService.get_by_id(tenant_id)
@@ -138,91 +330,202 @@ def update(tenant_id,dataset_id):
     parser_config = req.get("parser_config")
     valid_permission = ["me", "team"]
     valid_language = ["Chinese", "English"]
-    valid_chunk_method = ["naive", "manual", "qa", "table", "paper", "book", "laws", "presentation", "picture", "one",
-                          "knowledge_graph", "email"]
-    check_validation = valid(permission, valid_permission, language, valid_language, chunk_method, valid_chunk_method)
+    valid_chunk_method = [
+        "naive",
+        "manual",
+        "qa",
+        "table",
+        "paper",
+        "book",
+        "laws",
+        "presentation",
+        "picture",
+        "one",
+        "knowledge_graph",
+        "email",
+    ]
+    check_validation = valid(
+        permission,
+        valid_permission,
+        language,
+        valid_language,
+        chunk_method,
+        valid_chunk_method,
+    )
     if check_validation:
         return check_validation
     if "tenant_id" in req:
         if req["tenant_id"] != tenant_id:
-            return get_error_data_result(
-                retmsg="Can't change `tenant_id`.")
+            return get_error_data_result(retmsg="Can't change `tenant_id`.")
     e, kb = KnowledgebaseService.get_by_id(dataset_id)
     if "parser_config" in req:
-        temp_dict=kb.parser_config
+        temp_dict = kb.parser_config
         temp_dict.update(req["parser_config"])
         req["parser_config"] = temp_dict
     if "chunk_count" in req:
         if req["chunk_count"] != kb.chunk_num:
-            return get_error_data_result(
-                retmsg="Can't change `chunk_count`.")
+            return get_error_data_result(retmsg="Can't change `chunk_count`.")
         req.pop("chunk_count")
     if "document_count" in req:
-        if req['document_count'] != kb.doc_num:
-            return get_error_data_result(
-                retmsg="Can't change `document_count`.")
+        if req["document_count"] != kb.doc_num:
+            return get_error_data_result(retmsg="Can't change `document_count`.")
         req.pop("document_count")
     if "chunk_method" in req:
-        if kb.chunk_num != 0 and req['chunk_method'] != kb.parser_id:
+        if kb.chunk_num != 0 and req["chunk_method"] != kb.parser_id:
             return get_error_data_result(
-                retmsg="If `chunk_count` is not 0, `chunk_method` is not changeable.")
-        req['parser_id'] = req.pop('chunk_method')
-        if req['parser_id'] != kb.parser_id:
+                retmsg="If `chunk_count` is not 0, `chunk_method` is not changeable."
+            )
+        req["parser_id"] = req.pop("chunk_method")
+        if req["parser_id"] != kb.parser_id:
             if not req.get("parser_config"):
                 req["parser_config"] = get_parser_config(chunk_method, parser_config)
     if "embedding_model" in req:
-        if kb.chunk_num != 0 and req['embedding_model'] != kb.embd_id:
+        if kb.chunk_num != 0 and req["embedding_model"] != kb.embd_id:
             return get_error_data_result(
-                retmsg="If `chunk_count` is not 0, `embedding_model` is not changeable.")
+                retmsg="If `chunk_count` is not 0, `embedding_model` is not changeable."
+            )
         if not req.get("embedding_model"):
             return get_error_data_result("`embedding_model` can't be empty")
-        valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
-                                "BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
-                                "nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
-                                "text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
-        embd_model=LLMService.query(llm_name=req["embedding_model"],model_type="embedding")
+        valid_embedding_models = [
+            "BAAI/bge-large-zh-v1.5",
+            "BAAI/bge-base-en-v1.5",
+            "BAAI/bge-large-en-v1.5",
+            "BAAI/bge-small-en-v1.5",
+            "BAAI/bge-small-zh-v1.5",
+            "jinaai/jina-embeddings-v2-base-en",
+            "jinaai/jina-embeddings-v2-small-en",
+            "nomic-ai/nomic-embed-text-v1.5",
+            "sentence-transformers/all-MiniLM-L6-v2",
+            "text-embedding-v2",
+            "text-embedding-v3",
+            "maidalun1020/bce-embedding-base_v1",
+        ]
+        embd_model = LLMService.query(
+            llm_name=req["embedding_model"], model_type="embedding"
+        )
         if not embd_model:
-            return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
+            return get_error_data_result(
+                f"`embedding_model` {req.get('embedding_model')} doesn't exist"
+            )
         if embd_model:
-            if req["embedding_model"] not in valid_embedding_models and not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
-                return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
-        req['embd_id'] = req.pop('embedding_model')
+            if req[
+                "embedding_model"
+            ] not in valid_embedding_models and not TenantLLMService.query(
+                tenant_id=tenant_id,
+                model_type="embedding",
+                llm_name=req.get("embedding_model"),
+            ):
+                return get_error_data_result(
+                    f"`embedding_model` {req.get('embedding_model')} doesn't exist"
+                )
+        req["embd_id"] = req.pop("embedding_model")
     if "name" in req:
         req["name"] = req["name"].strip()
-        if req["name"].lower() != kb.name.lower() \
-                and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
-                                                   status=StatusEnum.VALID.value)) > 0:
+        if (
+            req["name"].lower() != kb.name.lower()
+            and len(
+                KnowledgebaseService.query(
+                    name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value
+                )
+            )
+            > 0
+        ):
             return get_error_data_result(
-                retmsg="Duplicated dataset name in updating dataset.")
+                retmsg="Duplicated dataset name in updating dataset."
+            )
     if not KnowledgebaseService.update_by_id(kb.id, req):
         return get_error_data_result(retmsg="Update dataset error.(Database error)")
     return get_result(retcode=RetCode.SUCCESS)
 
-@manager.route('/datasets', methods=['GET'])
+
+@manager.route("/datasets", methods=["GET"])
 @token_required
 def list(tenant_id):
+    """
+    List datasets.
+    ---
+    tags:
+      - Datasets
+    security:
+      - ApiKeyAuth: []
+    parameters:
+      - in: query
+        name: id
+        type: string
+        required: false
+        description: Dataset ID to filter.
+      - in: query
+        name: name
+        type: string
+        required: false
+        description: Dataset name to filter.
+      - in: query
+        name: page
+        type: integer
+        required: false
+        default: 1
+        description: Page number.
+      - in: query
+        name: page_size
+        type: integer
+        required: false
+        default: 1024
+        description: Number of items per page.
+      - in: query
+        name: orderby
+        type: string
+        required: false
+        default: "create_time"
+        description: Field to order by.
+      - in: query
+        name: desc
+        type: boolean
+        required: false
+        default: true
+        description: Order in descending.
+      - in: header
+        name: Authorization
+        type: string
+        required: true
+        description: Bearer token for authentication.
+    responses:
+      200:
+        description: Successful operation.
+        schema:
+          type: array
+          items:
+            type: object
+    """
     id = request.args.get("id")
     name = request.args.get("name")
-    kbs = KnowledgebaseService.query(id=id,name=name,status=1)
+    kbs = KnowledgebaseService.query(id=id, name=name, status=1)
     if not kbs:
         return get_error_data_result(retmsg="The dataset doesn't exist")
     page_number = int(request.args.get("page", 1))
     items_per_page = int(request.args.get("page_size", 1024))
     orderby = request.args.get("orderby", "create_time")
-    if request.args.get("desc") == "False" or request.args.get("desc") == "false" :
+    if request.args.get("desc") == "False" or request.args.get("desc") == "false":
         desc = False
     else:
         desc = True
     tenants = TenantService.get_joined_tenants_by_user_id(tenant_id)
     kbs = KnowledgebaseService.get_list(
-        [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc, id, name)
+        [m["tenant_id"] for m in tenants],
+        tenant_id,
+        page_number,
+        items_per_page,
+        orderby,
+        desc,
+        id,
+        name,
+    )
     renamed_list = []
     for kb in kbs:
         key_mapping = {
             "chunk_num": "chunk_count",
             "doc_num": "document_count",
             "parser_id": "chunk_method",
-            "embd_id": "embedding_model"
+            "embd_id": "embedding_model",
         }
         renamed_data = {}
         for key, value in kb.items():
```
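
Since these docstrings double as the API contract, the documented routes can be exercised directly. A hedged example against the create endpoint above; it assumes `API_VERSION` is `v1` (so the sdk blueprint mounts at `/api/v1`), and the host, port, and token are placeholders:

```python
# Illustrative call to the documented POST /datasets endpoint.
# Assumptions: API_VERSION == "v1" and YOUR_API_KEY is a valid token
# for the `Authorization` header declared in the ApiKeyAuth definition.
import requests

resp = requests.post(
    "http://localhost:9380/api/v1/datasets",           # assumed host:port
    headers={"Authorization": "Bearer YOUR_API_KEY"},  # Bearer token, per docstring
    json={
        "name": "demo-dataset",   # required by the schema above
        "permission": "me",       # one of: me, team
        "language": "English",    # one of: Chinese, English
        "chunk_method": "naive",  # one of the documented enum values
    },
)
print(resp.json())
```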
api/apps/sdk/doc.py CHANGED
@@ -39,7 +39,7 @@ from api.db.services.file2document_service import File2DocumentService
39
  from api.db.services.file_service import FileService
40
  from api.db.services.knowledgebase_service import KnowledgebaseService
41
  from api.settings import RetCode, retrievaler
42
- from api.utils.api_utils import construct_json_result,get_parser_config
43
  from rag.nlp import search
44
  from rag.utils import rmSpace
45
  from rag.utils.es_conn import ELASTICSEARCH
@@ -49,36 +49,93 @@ import os
49
  MAXIMUM_OF_UPLOADING_FILES = 256
50
 
51
 
52
-
53
- @manager.route('/datasets/<dataset_id>/documents', methods=['POST'])
54
  @token_required
55
  def upload(dataset_id, tenant_id):
56
- if 'file' not in request.files:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  return get_error_data_result(
58
- retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
59
- file_objs = request.files.getlist('file')
 
60
  for file_obj in file_objs:
61
- if file_obj.filename == '':
62
  return get_result(
63
- retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
 
64
  # total size
65
  total_size = 0
66
  for file_obj in file_objs:
67
  file_obj.seek(0, os.SEEK_END)
68
  total_size += file_obj.tell()
69
  file_obj.seek(0)
70
- MAX_TOTAL_FILE_SIZE=10*1024*1024
71
  if total_size > MAX_TOTAL_FILE_SIZE:
72
  return get_result(
73
- retmsg=f'Total file size exceeds 10MB limit! ({total_size / (1024 * 1024):.2f} MB)',
74
- retcode=RetCode.ARGUMENT_ERROR)
 
75
  e, kb = KnowledgebaseService.get_by_id(dataset_id)
76
  if not e:
77
  raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
78
- err, files= FileService.upload_document(kb, file_objs, tenant_id)
79
  if err:
80
- return get_result(
81
- retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
82
  # rename key's name
83
  renamed_doc_list = []
84
  for file in files:
@@ -87,7 +144,7 @@ def upload(dataset_id, tenant_id):
87
  "chunk_num": "chunk_count",
88
  "kb_id": "dataset_id",
89
  "token_num": "token_count",
90
- "parser_id": "chunk_method"
91
  }
92
  renamed_doc = {}
93
  for key, value in doc.items():
@@ -98,9 +155,54 @@ def upload(dataset_id, tenant_id):
98
  return get_result(data=renamed_doc_list)
99
 
100
 
101
- @manager.route('/datasets/<dataset_id>/documents/<document_id>', methods=['PUT'])
102
  @token_required
103
  def update_doc(tenant_id, dataset_id, document_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  req = request.json
105
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
106
  return get_error_data_result(retmsg="You don't own the dataset.")
@@ -115,20 +217,25 @@ def update_doc(tenant_id, dataset_id, document_id):
115
  if req["token_count"] != doc.token_num:
116
  return get_error_data_result(retmsg="Can't change `token_count`.")
117
  if "progress" in req:
118
- if req['progress'] != doc.progress:
119
  return get_error_data_result(retmsg="Can't change `progress`.")
120
 
121
  if "name" in req and req["name"] != doc.name:
122
- if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
123
- return get_result(retmsg="The extension of file can't be changed", retcode=RetCode.ARGUMENT_ERROR)
 
 
 
 
 
 
124
  for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
125
  if d.name == req["name"]:
126
  return get_error_data_result(
127
- retmsg="Duplicated document name in the same dataset.")
128
- if not DocumentService.update_by_id(
129
- document_id, {"name": req["name"]}):
130
- return get_error_data_result(
131
- retmsg="Database error (Document rename)!")
132
 
133
  informs = File2DocumentService.get_by_document_id(document_id)
134
  if informs:
@@ -137,77 +244,231 @@ def update_doc(tenant_id, dataset_id, document_id):
137
  if "parser_config" in req:
138
  DocumentService.update_parser_config(doc.id, req["parser_config"])
139
  if "chunk_method" in req:
140
- valid_chunk_method = {"naive","manual","qa","table","paper","book","laws","presentation","picture","one","knowledge_graph","email"}
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  if req.get("chunk_method") not in valid_chunk_method:
142
- return get_error_data_result(f"`chunk_method` {req['chunk_method']} doesn't exist")
 
 
143
  if doc.parser_id.lower() == req["chunk_method"].lower():
144
- return get_result()
145
 
146
- if doc.type == FileType.VISUAL or re.search(
147
- r"\.(ppt|pptx|pages)$", doc.name):
148
  return get_error_data_result(retmsg="Not supported yet!")
149
 
150
- e = DocumentService.update_by_id(doc.id,
151
- {"parser_id": req["chunk_method"], "progress": 0, "progress_msg": "",
152
- "run": TaskStatus.UNSTART.value})
 
 
 
 
 
 
153
  if not e:
154
  return get_error_data_result(retmsg="Document not found!")
155
- req["parser_config"] = get_parser_config(req["chunk_method"], req.get("parser_config"))
 
 
156
  DocumentService.update_parser_config(doc.id, req["parser_config"])
157
  if doc.token_num > 0:
158
- e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
159
- doc.process_duation * -1)
 
 
 
 
 
160
  if not e:
161
  return get_error_data_result(retmsg="Document not found!")
162
  ELASTICSEARCH.deleteByQuery(
163
- Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
 
164
 
165
  return get_result()
166
 
167
 
168
- @manager.route('/datasets/<dataset_id>/documents/<document_id>', methods=['GET'])
169
  @token_required
170
  def download(tenant_id, dataset_id, document_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
172
- return get_error_data_result(retmsg=f'You do not own the dataset {dataset_id}.')
173
  doc = DocumentService.query(kb_id=dataset_id, id=document_id)
174
  if not doc:
175
- return get_error_data_result(retmsg=f'The dataset not own the document {document_id}.')
 
 
176
  # The process of downloading
177
- doc_id, doc_location = File2DocumentService.get_storage_address(doc_id=document_id) # minio address
 
 
178
  file_stream = STORAGE_IMPL.get(doc_id, doc_location)
179
  if not file_stream:
180
- return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)
 
 
181
  file = BytesIO(file_stream)
182
  # Use send_file with a proper filename and MIME type
183
  return send_file(
184
  file,
185
  as_attachment=True,
186
  download_name=doc[0].name,
187
- mimetype='application/octet-stream' # Set a default MIME type
188
  )
189
 
190
 
191
- @manager.route('/datasets/<dataset_id>/documents', methods=['GET'])
192
  @token_required
193
  def list_docs(dataset_id, tenant_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
195
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
196
  id = request.args.get("id")
197
  name = request.args.get("name")
198
- if not DocumentService.query(id=id,kb_id=dataset_id):
199
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
200
- if not DocumentService.query(name=name,kb_id=dataset_id):
201
  return get_error_data_result(retmsg=f"You don't own the document {name}.")
202
  offset = int(request.args.get("offset", 1))
203
- keywords = request.args.get("keywords","")
204
  limit = int(request.args.get("limit", 1024))
205
  orderby = request.args.get("orderby", "create_time")
206
  if request.args.get("desc") == "False":
207
  desc = False
208
  else:
209
  desc = True
210
- docs, tol = DocumentService.get_list(dataset_id, offset, limit, orderby, desc, keywords, id,name)
 
 
211
 
212
  # rename key's name
213
  renamed_doc_list = []
@@ -216,42 +477,80 @@ def list_docs(dataset_id, tenant_id):
216
  "chunk_num": "chunk_count",
217
  "kb_id": "dataset_id",
218
  "token_num": "token_count",
219
- "parser_id": "chunk_method"
220
  }
221
  run_mapping = {
222
- "0" :"UNSTART",
223
- "1":"RUNNING",
224
- "2":"CANCEL",
225
- "3":"DONE",
226
- "4":"FAIL"
227
  }
228
  renamed_doc = {}
229
  for key, value in doc.items():
 
 
230
  new_key = key_mapping.get(key, key)
231
  renamed_doc[new_key] = value
232
- if key =="run":
233
- renamed_doc["run"]=run_mapping.get(value)
234
  renamed_doc_list.append(renamed_doc)
235
  return get_result(data={"total": tol, "docs": renamed_doc_list})
236
 
237
 
238
- @manager.route('/datasets/<dataset_id>/documents', methods=['DELETE'])
239
  @token_required
240
- def delete(tenant_id,dataset_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
242
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
243
  req = request.json
244
  if not req:
245
- doc_ids=None
246
  else:
247
- doc_ids=req.get("ids")
248
  if not doc_ids:
249
  doc_list = []
250
- docs=DocumentService.query(kb_id=dataset_id)
251
  for doc in docs:
252
  doc_list.append(doc.id)
253
  else:
254
- doc_list=doc_ids
255
  root_folder = FileService.get_root_folder(tenant_id)
256
  pf_id = root_folder["id"]
257
  FileService.init_knowledgebase_docs(pf_id, tenant_id)
@@ -269,10 +568,16 @@ def delete(tenant_id,dataset_id):
269
 
270
  if not DocumentService.remove_document(doc, tenant_id):
271
  return get_error_data_result(
272
- retmsg="Database error (Document removal)!")
 
273
 
274
  f2d = File2DocumentService.get_by_document_id(doc_id)
275
- FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
 
 
 
 
 
276
  File2DocumentService.delete_by_document_id(doc_id)
277
 
278
  STORAGE_IMPL.rm(b, n)
@@ -285,25 +590,66 @@ def delete(tenant_id,dataset_id):
285
  return get_result()
286
 
287
 
288
- @manager.route('/datasets/<dataset_id>/chunks', methods=['POST'])
289
  @token_required
290
- def parse(tenant_id,dataset_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
292
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
293
  req = request.json
294
  if not req.get("document_ids"):
295
  return get_error_data_result("`document_ids` is required")
296
  for id in req["document_ids"]:
297
- doc = DocumentService.query(id=id,kb_id=dataset_id)
298
  if not doc:
299
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
 
 
 
 
300
  info = {"run": "1", "progress": 0}
301
  info["progress_msg"] = ""
302
  info["chunk_num"] = 0
303
  info["token_num"] = 0
304
  DocumentService.update_by_id(id, info)
305
  ELASTICSEARCH.deleteByQuery(
306
- Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
 
307
  TaskService.filter_delete([Task.doc_id == id])
308
  e, doc = DocumentService.get_by_id(id)
309
  doc = doc.to_dict()
@@ -312,9 +658,46 @@ def parse(tenant_id,dataset_id):
312
  queue_tasks(doc, bucket, name)
313
  return get_result()
314
 
315
- @manager.route('/datasets/<dataset_id>/chunks', methods=['DELETE'])
 
316
  @token_required
317
- def stop_parsing(tenant_id,dataset_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
319
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
320
  req = request.json
@@ -325,46 +708,125 @@ def stop_parsing(tenant_id,dataset_id):
325
  if not doc:
326
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
327
  if int(doc[0].progress) == 1 or int(doc[0].progress) == 0:
328
- return get_error_data_result("Can't stop parsing document with progress at 0 or 1")
329
- info = {"run": "2", "progress": 0,"chunk_num":0}
 
 
330
  DocumentService.update_by_id(id, info)
331
  ELASTICSEARCH.deleteByQuery(
332
- Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
 
333
  return get_result()
334
 
335
 
336
- @manager.route('/datasets/<dataset_id>/documents/<document_id>/chunks', methods=['GET'])
337
  @token_required
338
- def list_chunks(tenant_id,dataset_id,document_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
340
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
341
- doc=DocumentService.query(id=document_id, kb_id=dataset_id)
342
  if not doc:
343
- return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
344
- doc=doc[0]
 
 
345
  req = request.args
346
  doc_id = document_id
347
  page = int(req.get("offset", 1))
348
  size = int(req.get("limit", 30))
349
  question = req.get("keywords", "")
350
  query = {
351
- "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
 
 
 
 
352
  }
353
  sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
354
  key_mapping = {
355
  "chunk_num": "chunk_count",
356
  "kb_id": "dataset_id",
357
  "token_num": "token_count",
358
- "parser_id": "chunk_method"
359
  }
360
  run_mapping = {
361
  "0": "UNSTART",
362
  "1": "RUNNING",
363
  "2": "CANCEL",
364
  "3": "DONE",
365
- "4": "FAIL"
366
  }
367
- doc=doc.to_dict()
368
  renamed_doc = {}
369
  for key, value in doc.items():
370
  new_key = key_mapping.get(key, key)
@@ -377,21 +839,30 @@ def list_chunks(tenant_id,dataset_id,document_id):
377
  for id in sres.ids:
378
  d = {
379
  "chunk_id": id,
380
- "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
381
- id].get(
382
- "content_with_weight", ""),
 
 
383
  "doc_id": sres.field[id]["doc_id"],
384
  "docnm_kwd": sres.field[id]["docnm_kwd"],
385
  "important_kwd": sres.field[id].get("important_kwd", []),
386
  "img_id": sres.field[id].get("img_id", ""),
387
  "available_int": sres.field[id].get("available_int", 1),
388
- "positions": sres.field[id].get("position_int", "").split("\t")
389
  }
390
  if len(d["positions"]) % 5 == 0:
391
  poss = []
392
  for i in range(0, len(d["positions"]), 5):
393
- poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
394
- float(d["positions"][i + 3]), float(d["positions"][i + 4])])
 
 
 
 
 
 
 
395
  d["positions"] = poss
396
 
397
  origin_chunks.append(d)
@@ -411,7 +882,7 @@ def list_chunks(tenant_id,dataset_id,document_id):
411
  "doc_id": "document_id",
412
  "important_kwd": "important_keywords",
413
  "img_id": "image_id",
414
- "available_int":"available"
415
  }
416
  renamed_chunk = {}
417
  for key, value in chunk.items():
@@ -425,31 +896,104 @@ def list_chunks(tenant_id,dataset_id,document_id):
425
  return get_result(data=res)
426
 
427
 
428
-
429
- @manager.route('/datasets/<dataset_id>/documents/<document_id>/chunks', methods=['POST'])
 
430
  @token_required
431
- def add_chunk(tenant_id,dataset_id,document_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
433
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
434
  doc = DocumentService.query(id=document_id, kb_id=dataset_id)
435
  if not doc:
436
- return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
 
 
437
  doc = doc[0]
438
  req = request.json
439
  if not req.get("content"):
440
  return get_error_data_result(retmsg="`content` is required")
441
  if "important_keywords" in req:
442
  if type(req["important_keywords"]) != list:
443
- return get_error_data_result("`important_keywords` is required to be a list")
 
 
444
  md5 = hashlib.md5()
445
  md5.update((req["content"] + document_id).encode("utf-8"))
446
 
447
  chunk_id = md5.hexdigest()
448
- d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
449
- "content_with_weight": req["content"]}
 
 
 
450
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
451
  d["important_kwd"] = req.get("important_keywords", [])
452
- d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_keywords", [])))
 
 
453
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
454
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
455
  d["kb_id"] = [doc.kb_id]
@@ -457,17 +1001,17 @@ def add_chunk(tenant_id,dataset_id,document_id):
457
  d["doc_id"] = doc.id
458
  embd_id = DocumentService.get_embd_id(document_id)
459
  embd_mdl = TenantLLMService.model_instance(
460
- tenant_id, LLMType.EMBEDDING.value, embd_id)
461
- print(embd_mdl,flush=True)
 
462
  v, c = embd_mdl.encode([doc.name, req["content"]])
463
  v = 0.1 * v[0] + 0.9 * v[1]
464
  d["q_%d_vec" % len(v)] = v.tolist()
465
  ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
466
 
467
- DocumentService.increment_chunk_num(
468
- doc.id, doc.kb_id, c, 1, 0)
469
  d["chunk_id"] = chunk_id
470
- d["kb_id"]=doc.kb_id
471
  # rename keys
472
  key_mapping = {
473
  "chunk_id": "id",
@@ -477,7 +1021,7 @@ def add_chunk(tenant_id,dataset_id,document_id):
477
  "kb_id": "dataset_id",
478
  "create_timestamp_flt": "create_timestamp",
479
  "create_time": "create_time",
480
- "document_keyword": "document"
481
  }
482
  renamed_chunk = {}
483
  for key, value in d.items():
@@ -488,32 +1032,79 @@ def add_chunk(tenant_id,dataset_id,document_id):
488
  # return get_result(data={"chunk_id": chunk_id})
489
 
490
 
491
- @manager.route('datasets/<dataset_id>/documents/<document_id>/chunks', methods=['DELETE'])
 
 
492
  @token_required
493
- def rm_chunk(tenant_id,dataset_id,document_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
495
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
496
  doc = DocumentService.query(id=document_id, kb_id=dataset_id)
497
  if not doc:
498
- return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
 
 
499
  doc = doc[0]
500
  req = request.json
501
- query = {
502
- "doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
 
503
  sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
504
  if not req:
505
- chunk_ids=None
506
  else:
507
- chunk_ids=req.get("chunk_ids")
508
  if not chunk_ids:
509
- chunk_list=sres.ids
510
  else:
511
- chunk_list=chunk_ids
512
  for chunk_id in chunk_list:
513
  if chunk_id not in sres.ids:
514
  return get_error_data_result(f"Chunk {chunk_id} not found")
515
  if not ELASTICSEARCH.deleteByQuery(
516
- Q("ids", values=chunk_list), search.index_name(tenant_id)):
 
517
  return get_error_data_result(retmsg="Index updating failure")
518
  deleted_chunk_ids = chunk_list
519
  chunk_number = len(deleted_chunk_ids)
@@ -521,37 +1112,92 @@ def rm_chunk(tenant_id,dataset_id,document_id):
521
  return get_result()
522
 
523
 
524
-
525
- @manager.route('/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>', methods=['PUT'])
 
526
  @token_required
527
- def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  try:
529
- res = ELASTICSEARCH.get(
530
- chunk_id, search.index_name(
531
- tenant_id))
532
  except Exception as e:
533
  return get_error_data_result(f"Can't find this chunk {chunk_id}")
534
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
535
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
536
  doc = DocumentService.query(id=document_id, kb_id=dataset_id)
537
  if not doc:
538
- return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
 
 
539
  doc = doc[0]
540
  query = {
541
- "doc_ids": [document_id], "page": 1, "size": 1024, "question": "", "sort": True
 
 
 
 
542
  }
543
  sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
544
  if chunk_id not in sres.ids:
545
  return get_error_data_result(f"You don't own the chunk {chunk_id}")
546
  req = request.json
547
- content=res["_source"].get("content_with_weight")
548
- d = {
549
- "id": chunk_id,
550
- "content_with_weight": req.get("content",content)}
551
  d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
552
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
553
  if "important_keywords" in req:
554
- if not isinstance(req["important_keywords"],list):
555
  return get_error_data_result("`important_keywords` should be a list")
556
  d["important_kwd"] = req.get("important_keywords")
557
  d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
@@ -559,18 +1205,18 @@ def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
559
  d["available_int"] = int(req["available"])
560
  embd_id = DocumentService.get_embd_id(document_id)
561
  embd_mdl = TenantLLMService.model_instance(
562
- tenant_id, LLMType.EMBEDDING.value, embd_id)
 
563
  if doc.parser_id == ParserType.QA:
564
- arr = [
565
- t for t in re.split(
566
- r"[\n\t]",
567
- d["content_with_weight"]) if len(t) > 1]
568
  if len(arr) != 2:
569
  return get_error_data_result(
570
- retmsg="Q&A must be separated by TAB/ENTER key.")
 
571
  q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
572
- d = beAdoc(d, arr[0], arr[1], not any(
573
- [rag_tokenizer.is_chinese(t) for t in q + a]))
 
574
 
575
  v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
576
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
@@ -579,41 +1225,120 @@ def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
579
  return get_result()
580
 
581
 
582
-
583
- @manager.route('/retrieval', methods=['POST'])
584
  @token_required
585
  def retrieval_test(tenant_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
  req = request.json
587
  if not req.get("dataset_ids"):
588
  return get_error_data_result("`dataset_ids` is required.")
589
  kb_ids = req["dataset_ids"]
590
- if not isinstance(kb_ids,list):
591
  return get_error_data_result("`dataset_ids` should be a list")
592
  kbs = KnowledgebaseService.get_by_ids(kb_ids)
593
  for id in kb_ids:
594
- if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
595
  return get_error_data_result(f"You don't own the dataset {id}.")
596
  embd_nms = list(set([kb.embd_id for kb in kbs]))
597
  if len(embd_nms) != 1:
598
  return get_result(
599
  retmsg='Datasets use different embedding models."',
600
- retcode=RetCode.AUTHENTICATION_ERROR)
 
601
  if "question" not in req:
602
  return get_error_data_result("`question` is required.")
603
  page = int(req.get("offset", 1))
604
  size = int(req.get("limit", 1024))
605
  question = req["question"]
606
  doc_ids = req.get("document_ids", [])
607
- if not isinstance(doc_ids,list):
608
  return get_error_data_result("`documents` should be a list")
609
- doc_ids_list=KnowledgebaseService.list_documents_by_ids(kb_ids)
610
  for doc_id in doc_ids:
611
  if doc_id not in doc_ids_list:
612
- return get_error_data_result(f"The datasets don't own the document {doc_id}")
 
 
613
  similarity_threshold = float(req.get("similarity_threshold", 0.2))
614
  vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
615
  top = int(req.get("top_k", 1024))
616
- if req.get("highlight")=="False" or req.get("highlight")=="false":
617
  highlight = False
618
  else:
619
  highlight = True
@@ -622,21 +1347,34 @@ def retrieval_test(tenant_id):
622
  if not e:
623
  return get_error_data_result(retmsg="Dataset not found!")
624
  embd_mdl = TenantLLMService.model_instance(
625
- kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
 
626
 
627
  rerank_mdl = None
628
  if req.get("rerank_id"):
629
  rerank_mdl = TenantLLMService.model_instance(
630
- kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"])
 
631
 
632
  if req.get("keyword", False):
633
  chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
634
  question += keyword_extraction(chat_mdl, question)
635
 
636
  retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
637
- ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, page, size,
638
- similarity_threshold, vector_similarity_weight, top,
639
- doc_ids, rerank_mdl=rerank_mdl, highlight=highlight)
640
  for c in ranks["chunks"]:
641
  if "vector" in c:
642
  del c["vector"]
@@ -649,7 +1387,7 @@ def retrieval_test(tenant_id):
649
  "content_with_weight": "content",
650
  "doc_id": "document_id",
651
  "important_kwd": "important_keywords",
652
- "docnm_kwd": "document_keyword"
653
  }
654
  rename_chunk = {}
655
  for key, value in chunk.items():
@@ -660,6 +1398,8 @@ def retrieval_test(tenant_id):
660
  return get_result(data=ranks)
661
  except Exception as e:
662
  if str(e).find("not_found") > 0:
663
- return get_result(retmsg=f'No chunk found! Check the chunk status please!',
664
- retcode=RetCode.DATA_ERROR)
665
- return server_error_response(e)
 
 
 
39
  from api.db.services.file_service import FileService
40
  from api.db.services.knowledgebase_service import KnowledgebaseService
41
  from api.settings import RetCode, retrievaler
42
+ from api.utils.api_utils import construct_json_result, get_parser_config
43
  from rag.nlp import search
44
  from rag.utils import rmSpace
45
  from rag.utils.es_conn import ELASTICSEARCH
 
49
  MAXIMUM_OF_UPLOADING_FILES = 256
50
 
51
 
52
+ @manager.route("/datasets/<dataset_id>/documents", methods=["POST"])
 
53
  @token_required
54
  def upload(dataset_id, tenant_id):
55
+ """
56
+ Upload documents to a dataset.
57
+ ---
58
+ tags:
59
+ - Documents
60
+ security:
61
+ - ApiKeyAuth: []
62
+ parameters:
63
+ - in: path
64
+ name: dataset_id
65
+ type: string
66
+ required: true
67
+ description: ID of the dataset.
68
+ - in: header
69
+ name: Authorization
70
+ type: string
71
+ required: true
72
+ description: Bearer token for authentication.
73
+ - in: formData
74
+ name: file
75
+ type: file
76
+ required: true
77
+ description: Document files to upload.
78
+ responses:
79
+ 200:
80
+ description: Successfully uploaded documents.
81
+ schema:
82
+ type: object
83
+ properties:
84
+ data:
85
+ type: array
86
+ items:
87
+ type: object
88
+ properties:
89
+ id:
90
+ type: string
91
+ description: Document ID.
92
+ name:
93
+ type: string
94
+ description: Document name.
95
+ chunk_count:
96
+ type: integer
97
+ description: Number of chunks.
98
+ token_count:
99
+ type: integer
100
+ description: Number of tokens.
101
+ dataset_id:
102
+ type: string
103
+ description: ID of the dataset.
104
+ chunk_method:
105
+ type: string
106
+ description: Chunking method used.
107
+ run:
108
+ type: string
109
+ description: Processing status.
110
+ """
111
+ if "file" not in request.files:
112
  return get_error_data_result(
113
+ retmsg="No file part!", retcode=RetCode.ARGUMENT_ERROR
114
+ )
115
+ file_objs = request.files.getlist("file")
116
  for file_obj in file_objs:
117
+ if file_obj.filename == "":
118
  return get_result(
119
+ retmsg="No file selected!", retcode=RetCode.ARGUMENT_ERROR
120
+ )
121
  # total size
122
  total_size = 0
123
  for file_obj in file_objs:
124
  file_obj.seek(0, os.SEEK_END)
125
  total_size += file_obj.tell()
126
  file_obj.seek(0)
127
+ MAX_TOTAL_FILE_SIZE = 10 * 1024 * 1024
128
  if total_size > MAX_TOTAL_FILE_SIZE:
129
  return get_result(
130
+ retmsg=f"Total file size exceeds 10MB limit! ({total_size / (1024 * 1024):.2f} MB)",
131
+ retcode=RetCode.ARGUMENT_ERROR,
132
+ )
133
  e, kb = KnowledgebaseService.get_by_id(dataset_id)
134
  if not e:
135
  raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
136
+ err, files = FileService.upload_document(kb, file_objs, tenant_id)
137
  if err:
138
+ return get_result(retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
 
139
  # rename key's name
140
  renamed_doc_list = []
141
  for file in files:
 
144
  "chunk_num": "chunk_count",
145
  "kb_id": "dataset_id",
146
  "token_num": "token_count",
147
+ "parser_id": "chunk_method",
148
  }
149
  renamed_doc = {}
150
  for key, value in doc.items():
 
155
  return get_result(data=renamed_doc_list)
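
For illustration, a minimal client-side call to this upload route using requests. The host/port, <API_KEY>, and <dataset_id> values are placeholders assumed for a local deployment, not part of this diff:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed base URL for the SDK routes
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # token obtained from /new_token

    # Upload a file as multipart/form-data under the "file" field.
    with open("report.pdf", "rb") as fh:
        resp = requests.post(
            f"{BASE}/datasets/<dataset_id>/documents",
            headers=HEADERS,
            files={"file": ("report.pdf", fh)},
        )
    print(resp.json())  # data: id, name, chunk_count, token_count, ...
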
156
 
157
 
158
+ @manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PUT"])
159
  @token_required
160
  def update_doc(tenant_id, dataset_id, document_id):
161
+ """
162
+ Update a document within a dataset.
163
+ ---
164
+ tags:
165
+ - Documents
166
+ security:
167
+ - ApiKeyAuth: []
168
+ parameters:
169
+ - in: path
170
+ name: dataset_id
171
+ type: string
172
+ required: true
173
+ description: ID of the dataset.
174
+ - in: path
175
+ name: document_id
176
+ type: string
177
+ required: true
178
+ description: ID of the document to update.
179
+ - in: header
180
+ name: Authorization
181
+ type: string
182
+ required: true
183
+ description: Bearer token for authentication.
184
+ - in: body
185
+ name: body
186
+ description: Document update parameters.
187
+ required: true
188
+ schema:
189
+ type: object
190
+ properties:
191
+ name:
192
+ type: string
193
+ description: New name of the document.
194
+ parser_config:
195
+ type: object
196
+ description: Parser configuration.
197
+ chunk_method:
198
+ type: string
199
+ description: Chunking method.
200
+ responses:
201
+ 200:
202
+ description: Document updated successfully.
203
+ schema:
204
+ type: object
205
+ """
206
  req = request.json
207
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
208
  return get_error_data_result(retmsg="You don't own the dataset.")
 
217
  if req["token_count"] != doc.token_num:
218
  return get_error_data_result(retmsg="Can't change `token_count`.")
219
  if "progress" in req:
220
+ if req["progress"] != doc.progress:
221
  return get_error_data_result(retmsg="Can't change `progress`.")
222
 
223
  if "name" in req and req["name"] != doc.name:
224
+ if (
225
+ pathlib.Path(req["name"].lower()).suffix
226
+ != pathlib.Path(doc.name.lower()).suffix
227
+ ):
228
+ return get_result(
229
+ retmsg="The extension of file can't be changed",
230
+ retcode=RetCode.ARGUMENT_ERROR,
231
+ )
232
  for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
233
  if d.name == req["name"]:
234
  return get_error_data_result(
235
+ retmsg="Duplicated document name in the same dataset."
236
+ )
237
+ if not DocumentService.update_by_id(document_id, {"name": req["name"]}):
238
+ return get_error_data_result(retmsg="Database error (Document rename)!")
 
239
 
240
  informs = File2DocumentService.get_by_document_id(document_id)
241
  if informs:
 
244
  if "parser_config" in req:
245
  DocumentService.update_parser_config(doc.id, req["parser_config"])
246
  if "chunk_method" in req:
247
+ valid_chunk_method = {
248
+ "naive",
249
+ "manual",
250
+ "qa",
251
+ "table",
252
+ "paper",
253
+ "book",
254
+ "laws",
255
+ "presentation",
256
+ "picture",
257
+ "one",
258
+ "knowledge_graph",
259
+ "email",
260
+ }
261
  if req.get("chunk_method") not in valid_chunk_method:
262
+ return get_error_data_result(
263
+ f"`chunk_method` {req['chunk_method']} doesn't exist"
264
+ )
265
  if doc.parser_id.lower() == req["chunk_method"].lower():
266
+ return get_result()
267
 
268
+ if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
 
269
  return get_error_data_result(retmsg="Not supported yet!")
270
 
271
+ e = DocumentService.update_by_id(
272
+ doc.id,
273
+ {
274
+ "parser_id": req["chunk_method"],
275
+ "progress": 0,
276
+ "progress_msg": "",
277
+ "run": TaskStatus.UNSTART.value,
278
+ },
279
+ )
280
  if not e:
281
  return get_error_data_result(retmsg="Document not found!")
282
+ req["parser_config"] = get_parser_config(
283
+ req["chunk_method"], req.get("parser_config")
284
+ )
285
  DocumentService.update_parser_config(doc.id, req["parser_config"])
286
  if doc.token_num > 0:
287
+ e = DocumentService.increment_chunk_num(
288
+ doc.id,
289
+ doc.kb_id,
290
+ doc.token_num * -1,
291
+ doc.chunk_num * -1,
292
+ doc.process_duation * -1,
293
+ )
294
  if not e:
295
  return get_error_data_result(retmsg="Document not found!")
296
  ELASTICSEARCH.deleteByQuery(
297
+ Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)
298
+ )
299
 
300
  return get_result()
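
A sketch of the rename/re-chunk call above, under the same assumed deployment and placeholder IDs:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    # Rename a document (the extension must stay the same) and switch
    # its chunking method; parser_config is derived via get_parser_config().
    resp = requests.put(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>",
        headers=HEADERS,
        json={"name": "renamed.pdf", "chunk_method": "naive"},
    )
    print(resp.json())
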
301
 
302
 
303
+ @manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"])
304
  @token_required
305
  def download(tenant_id, dataset_id, document_id):
306
+ """
307
+ Download a document from a dataset.
308
+ ---
309
+ tags:
310
+ - Documents
311
+ security:
312
+ - ApiKeyAuth: []
313
+ produces:
314
+ - application/octet-stream
315
+ parameters:
316
+ - in: path
317
+ name: dataset_id
318
+ type: string
319
+ required: true
320
+ description: ID of the dataset.
321
+ - in: path
322
+ name: document_id
323
+ type: string
324
+ required: true
325
+ description: ID of the document to download.
326
+ - in: header
327
+ name: Authorization
328
+ type: string
329
+ required: true
330
+ description: Bearer token for authentication.
331
+ responses:
332
+ 200:
333
+ description: Document file stream.
334
+ schema:
335
+ type: file
336
+ 400:
337
+ description: Error message.
338
+ schema:
339
+ type: object
340
+ """
341
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
342
+ return get_error_data_result(retmsg=f"You do not own the dataset {dataset_id}.")
343
  doc = DocumentService.query(kb_id=dataset_id, id=document_id)
344
  if not doc:
345
+ return get_error_data_result(
346
+ retmsg=f"The dataset doesn't own the document {document_id}."
347
+ )
348
  # The process of downloading
349
+ doc_id, doc_location = File2DocumentService.get_storage_address(
350
+ doc_id=document_id
351
+ ) # minio address
352
  file_stream = STORAGE_IMPL.get(doc_id, doc_location)
353
  if not file_stream:
354
+ return construct_json_result(
355
+ message="This file is empty.", code=RetCode.DATA_ERROR
356
+ )
357
  file = BytesIO(file_stream)
358
  # Use send_file with a proper filename and MIME type
359
  return send_file(
360
  file,
361
  as_attachment=True,
362
  download_name=doc[0].name,
363
+ mimetype="application/octet-stream", # Set a default MIME type
364
  )
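
Download returns the raw byte stream (application/octet-stream); a minimal sketch, same assumptions as above:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    resp = requests.get(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>",
        headers=HEADERS,
    )
    with open("downloaded.pdf", "wb") as out:
        out.write(resp.content)  # raw file bytes
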
365
 
366
 
367
+ @manager.route("/datasets/<dataset_id>/documents", methods=["GET"])
368
  @token_required
369
  def list_docs(dataset_id, tenant_id):
370
+ """
371
+ List documents in a dataset.
372
+ ---
373
+ tags:
374
+ - Documents
375
+ security:
376
+ - ApiKeyAuth: []
377
+ parameters:
378
+ - in: path
379
+ name: dataset_id
380
+ type: string
381
+ required: true
382
+ description: ID of the dataset.
383
+ - in: query
384
+ name: id
385
+ type: string
386
+ required: false
387
+ description: Filter by document ID.
388
+ - in: query
389
+ name: offset
390
+ type: integer
391
+ required: false
392
+ default: 1
393
+ description: Page number.
394
+ - in: query
395
+ name: limit
396
+ type: integer
397
+ required: false
398
+ default: 1024
399
+ description: Number of items per page.
400
+ - in: query
401
+ name: orderby
402
+ type: string
403
+ required: false
404
+ default: "create_time"
405
+ description: Field to order by.
406
+ - in: query
407
+ name: desc
408
+ type: boolean
409
+ required: false
410
+ default: true
411
+ description: Order in descending.
412
+ - in: header
413
+ name: Authorization
414
+ type: string
415
+ required: true
416
+ description: Bearer token for authentication.
417
+ responses:
418
+ 200:
419
+ description: List of documents.
420
+ schema:
421
+ type: object
422
+ properties:
423
+ total:
424
+ type: integer
425
+ description: Total number of documents.
426
+ docs:
427
+ type: array
428
+ items:
429
+ type: object
430
+ properties:
431
+ id:
432
+ type: string
433
+ description: Document ID.
434
+ name:
435
+ type: string
436
+ description: Document name.
437
+ chunk_count:
438
+ type: integer
439
+ description: Number of chunks.
440
+ token_count:
441
+ type: integer
442
+ description: Number of tokens.
443
+ dataset_id:
444
+ type: string
445
+ description: ID of the dataset.
446
+ chunk_method:
447
+ type: string
448
+ description: Chunking method used.
449
+ run:
450
+ type: string
451
+ description: Processing status.
452
+ """
453
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
454
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
455
  id = request.args.get("id")
456
  name = request.args.get("name")
457
+ if id and not DocumentService.query(id=id, kb_id=dataset_id):
458
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
459
+ if name and not DocumentService.query(name=name, kb_id=dataset_id):
460
  return get_error_data_result(retmsg=f"You don't own the document {name}.")
461
  offset = int(request.args.get("offset", 1))
462
+ keywords = request.args.get("keywords", "")
463
  limit = int(request.args.get("limit", 1024))
464
  orderby = request.args.get("orderby", "create_time")
465
  if request.args.get("desc") == "False":
466
  desc = False
467
  else:
468
  desc = True
469
+ docs, tol = DocumentService.get_list(
470
+ dataset_id, offset, limit, orderby, desc, keywords, id, name
471
+ )
472
 
473
  # rename key's name
474
  renamed_doc_list = []
 
477
  "chunk_num": "chunk_count",
478
  "kb_id": "dataset_id",
479
  "token_num": "token_count",
480
+ "parser_id": "chunk_method",
481
  }
482
  run_mapping = {
483
+ "0": "UNSTART",
484
+ "1": "RUNNING",
485
+ "2": "CANCEL",
486
+ "3": "DONE",
487
+ "4": "FAIL",
488
  }
489
  renamed_doc = {}
490
  for key, value in doc.items():
491
  new_key = key_mapping.get(key, key)
493
  renamed_doc[new_key] = value
494
+ if key == "run":
495
+ renamed_doc["run"] = run_mapping.get(str(value))
497
  renamed_doc_list.append(renamed_doc)
498
  return get_result(data={"total": tol, "docs": renamed_doc_list})
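
Listing accepts the query parameters documented above; for example, assuming the same local deployment:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    resp = requests.get(
        f"{BASE}/datasets/<dataset_id>/documents",
        headers=HEADERS,
        params={"offset": 1, "limit": 10, "orderby": "create_time", "desc": "True"},
    )
    data = resp.json()["data"]
    print(data["total"], [d["name"] for d in data["docs"]])
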
499
 
500
 
501
+ @manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"])
502
  @token_required
503
+ def delete(tenant_id, dataset_id):
504
+ """
505
+ Delete documents from a dataset.
506
+ ---
507
+ tags:
508
+ - Documents
509
+ security:
510
+ - ApiKeyAuth: []
511
+ parameters:
512
+ - in: path
513
+ name: dataset_id
514
+ type: string
515
+ required: true
516
+ description: ID of the dataset.
517
+ - in: body
518
+ name: body
519
+ description: Document deletion parameters.
520
+ required: true
521
+ schema:
522
+ type: object
523
+ properties:
524
+ ids:
525
+ type: array
526
+ items:
527
+ type: string
528
+ description: List of document IDs to delete.
529
+ - in: header
530
+ name: Authorization
531
+ type: string
532
+ required: true
533
+ description: Bearer token for authentication.
534
+ responses:
535
+ 200:
536
+ description: Documents deleted successfully.
537
+ schema:
538
+ type: object
539
+ """
540
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
541
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
542
  req = request.json
543
  if not req:
544
+ doc_ids = None
545
  else:
546
+ doc_ids = req.get("ids")
547
  if not doc_ids:
548
  doc_list = []
549
+ docs = DocumentService.query(kb_id=dataset_id)
550
  for doc in docs:
551
  doc_list.append(doc.id)
552
  else:
553
+ doc_list = doc_ids
554
  root_folder = FileService.get_root_folder(tenant_id)
555
  pf_id = root_folder["id"]
556
  FileService.init_knowledgebase_docs(pf_id, tenant_id)
 
568
 
569
  if not DocumentService.remove_document(doc, tenant_id):
570
  return get_error_data_result(
571
+ retmsg="Database error (Document removal)!"
572
+ )
573
 
574
  f2d = File2DocumentService.get_by_document_id(doc_id)
575
+ FileService.filter_delete(
576
+ [
577
+ File.source_type == FileSource.KNOWLEDGEBASE,
578
+ File.id == f2d[0].file_id,
579
+ ]
580
+ )
581
  File2DocumentService.delete_by_document_id(doc_id)
582
 
583
  STORAGE_IMPL.rm(b, n)
 
590
  return get_result()
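
Deletion takes a JSON body of document IDs; omitting "ids" (or sending no body) removes every document in the dataset. A hedged sketch, placeholders illustrative:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    resp = requests.delete(
        f"{BASE}/datasets/<dataset_id>/documents",
        headers=HEADERS,
        json={"ids": ["<document_id_1>", "<document_id_2>"]},
    )
    print(resp.json())
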
591
 
592
 
593
+ @manager.route("/datasets/<dataset_id>/chunks", methods=["POST"])
594
  @token_required
595
+ def parse(tenant_id, dataset_id):
596
+ """
597
+ Start parsing documents into chunks.
598
+ ---
599
+ tags:
600
+ - Chunks
601
+ security:
602
+ - ApiKeyAuth: []
603
+ parameters:
604
+ - in: path
605
+ name: dataset_id
606
+ type: string
607
+ required: true
608
+ description: ID of the dataset.
609
+ - in: body
610
+ name: body
611
+ description: Parsing parameters.
612
+ required: true
613
+ schema:
614
+ type: object
615
+ properties:
616
+ document_ids:
617
+ type: array
618
+ items:
619
+ type: string
620
+ description: List of document IDs to parse.
621
+ - in: header
622
+ name: Authorization
623
+ type: string
624
+ required: true
625
+ description: Bearer token for authentication.
626
+ responses:
627
+ 200:
628
+ description: Parsing started successfully.
629
+ schema:
630
+ type: object
631
+ """
632
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
633
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
634
  req = request.json
635
  if not req.get("document_ids"):
636
  return get_error_data_result("`document_ids` is required")
637
  for id in req["document_ids"]:
638
+ doc = DocumentService.query(id=id, kb_id=dataset_id)
639
  if not doc:
640
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
641
+ if doc[0].progress != 0.0:
642
+ return get_error_data_result(
643
+ "Can't parse a document whose progress is not 0"
644
+ )
645
  info = {"run": "1", "progress": 0}
646
  info["progress_msg"] = ""
647
  info["chunk_num"] = 0
648
  info["token_num"] = 0
649
  DocumentService.update_by_id(id, info)
650
  ELASTICSEARCH.deleteByQuery(
651
+ Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
652
+ )
653
  TaskService.filter_delete([Task.doc_id == id])
654
  e, doc = DocumentService.get_by_id(id)
655
  doc = doc.to_dict()
 
658
  queue_tasks(doc, bucket, name)
659
  return get_result()
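
Parsing is queued with a POST of document IDs; progress is reset and any previous chunks are dropped from the index. A minimal sketch under the same assumptions:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    resp = requests.post(
        f"{BASE}/datasets/<dataset_id>/chunks",
        headers=HEADERS,
        json={"document_ids": ["<document_id>"]},
    )
    print(resp.json())
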
660
 
661
+
662
+ @manager.route("/datasets/<dataset_id>/chunks", methods=["DELETE"])
663
  @token_required
664
+ def stop_parsing(tenant_id, dataset_id):
665
+ """
666
+ Stop parsing documents into chunks.
667
+ ---
668
+ tags:
669
+ - Chunks
670
+ security:
671
+ - ApiKeyAuth: []
672
+ parameters:
673
+ - in: path
674
+ name: dataset_id
675
+ type: string
676
+ required: true
677
+ description: ID of the dataset.
678
+ - in: body
679
+ name: body
680
+ description: Stop parsing parameters.
681
+ required: true
682
+ schema:
683
+ type: object
684
+ properties:
685
+ document_ids:
686
+ type: array
687
+ items:
688
+ type: string
689
+ description: List of document IDs to stop parsing.
690
+ - in: header
691
+ name: Authorization
692
+ type: string
693
+ required: true
694
+ description: Bearer token for authentication.
695
+ responses:
696
+ 200:
697
+ description: Parsing stopped successfully.
698
+ schema:
699
+ type: object
700
+ """
701
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
702
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
703
  req = request.json
 
708
  if not doc:
709
  return get_error_data_result(retmsg=f"You don't own the document {id}.")
710
  if doc[0].progress == 1.0 or doc[0].progress == 0.0:
711
+ return get_error_data_result(
712
+ "Can't stop parsing a document whose progress is at 0 or 1"
713
+ )
714
+ info = {"run": "2", "progress": 0, "chunk_num": 0}
715
  DocumentService.update_by_id(id, info)
716
  ELASTICSEARCH.deleteByQuery(
717
+ Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
718
+ )
719
  return get_result()
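
Stopping parsing mirrors the POST above with a DELETE; it only succeeds for documents whose progress is strictly between 0 and 1:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    resp = requests.delete(
        f"{BASE}/datasets/<dataset_id>/chunks",
        headers=HEADERS,
        json={"document_ids": ["<document_id>"]},
    )
    print(resp.json())
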
720
 
721
 
722
+ @manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["GET"])
723
  @token_required
724
+ def list_chunks(tenant_id, dataset_id, document_id):
725
+ """
726
+ List chunks of a document.
727
+ ---
728
+ tags:
729
+ - Chunks
730
+ security:
731
+ - ApiKeyAuth: []
732
+ parameters:
733
+ - in: path
734
+ name: dataset_id
735
+ type: string
736
+ required: true
737
+ description: ID of the dataset.
738
+ - in: path
739
+ name: document_id
740
+ type: string
741
+ required: true
742
+ description: ID of the document.
743
+ - in: query
744
+ name: offset
745
+ type: integer
746
+ required: false
747
+ default: 1
748
+ description: Page number.
749
+ - in: query
750
+ name: limit
751
+ type: integer
752
+ required: false
753
+ default: 30
754
+ description: Number of items per page.
755
+ - in: header
756
+ name: Authorization
757
+ type: string
758
+ required: true
759
+ description: Bearer token for authentication.
760
+ responses:
761
+ 200:
762
+ description: List of chunks.
763
+ schema:
764
+ type: object
765
+ properties:
766
+ total:
767
+ type: integer
768
+ description: Total number of chunks.
769
+ chunks:
770
+ type: array
771
+ items:
772
+ type: object
773
+ properties:
774
+ id:
775
+ type: string
776
+ description: Chunk ID.
777
+ content:
778
+ type: string
779
+ description: Chunk content.
780
+ document_id:
781
+ type: string
782
+ description: ID of the document.
783
+ important_keywords:
784
+ type: array
785
+ items:
786
+ type: string
787
+ description: Important keywords.
788
+ image_id:
789
+ type: string
790
+ description: Image ID associated with the chunk.
791
+ doc:
792
+ type: object
793
+ description: Document details.
794
+ """
795
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
796
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
797
+ doc = DocumentService.query(id=document_id, kb_id=dataset_id)
798
  if not doc:
799
+ return get_error_data_result(
800
+ retmsg=f"You don't own the document {document_id}."
801
+ )
802
+ doc = doc[0]
803
  req = request.args
804
  doc_id = document_id
805
  page = int(req.get("offset", 1))
806
  size = int(req.get("limit", 30))
807
  question = req.get("keywords", "")
808
  query = {
809
+ "doc_ids": [doc_id],
810
+ "page": page,
811
+ "size": size,
812
+ "question": question,
813
+ "sort": True,
814
  }
815
  sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
816
  key_mapping = {
817
  "chunk_num": "chunk_count",
818
  "kb_id": "dataset_id",
819
  "token_num": "token_count",
820
+ "parser_id": "chunk_method",
821
  }
822
  run_mapping = {
823
  "0": "UNSTART",
824
  "1": "RUNNING",
825
  "2": "CANCEL",
826
  "3": "DONE",
827
+ "4": "FAIL",
828
  }
829
+ doc = doc.to_dict()
830
  renamed_doc = {}
831
  for key, value in doc.items():
832
  new_key = key_mapping.get(key, key)
 
839
  for id in sres.ids:
840
  d = {
841
  "chunk_id": id,
842
+ "content_with_weight": (
843
+ rmSpace(sres.highlight[id])
844
+ if question and id in sres.highlight
845
+ else sres.field[id].get("content_with_weight", "")
846
+ ),
847
  "doc_id": sres.field[id]["doc_id"],
848
  "docnm_kwd": sres.field[id]["docnm_kwd"],
849
  "important_kwd": sres.field[id].get("important_kwd", []),
850
  "img_id": sres.field[id].get("img_id", ""),
851
  "available_int": sres.field[id].get("available_int", 1),
852
+ "positions": sres.field[id].get("position_int", "").split("\t"),
853
  }
854
  if len(d["positions"]) % 5 == 0:
855
  poss = []
856
  for i in range(0, len(d["positions"]), 5):
857
+ poss.append(
858
+ [
859
+ float(d["positions"][i]),
860
+ float(d["positions"][i + 1]),
861
+ float(d["positions"][i + 2]),
862
+ float(d["positions"][i + 3]),
863
+ float(d["positions"][i + 4]),
864
+ ]
865
+ )
866
  d["positions"] = poss
867
 
868
  origin_chunks.append(d)
 
882
  "doc_id": "document_id",
883
  "important_kwd": "important_keywords",
884
  "img_id": "image_id",
885
+ "available_int": "available",
886
  }
887
  renamed_chunk = {}
888
  for key, value in chunk.items():
 
896
  return get_result(data=res)
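
A minimal chunk-listing call; the response shape follows the schema in the docstring above:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    resp = requests.get(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>/chunks",
        headers=HEADERS,
        params={"offset": 1, "limit": 30, "keywords": "invoice"},
    )
    for chunk in resp.json()["data"]["chunks"]:
        print(chunk["id"], chunk["content"][:60])
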
897
 
898
 
899
+ @manager.route(
900
+ "/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["POST"]
901
+ )
902
  @token_required
903
+ def add_chunk(tenant_id, dataset_id, document_id):
904
+ """
905
+ Add a chunk to a document.
906
+ ---
907
+ tags:
908
+ - Chunks
909
+ security:
910
+ - ApiKeyAuth: []
911
+ parameters:
912
+ - in: path
913
+ name: dataset_id
914
+ type: string
915
+ required: true
916
+ description: ID of the dataset.
917
+ - in: path
918
+ name: document_id
919
+ type: string
920
+ required: true
921
+ description: ID of the document.
922
+ - in: body
923
+ name: body
924
+ description: Chunk data.
925
+ required: true
926
+ schema:
927
+ type: object
928
+ properties:
929
+ content:
930
+ type: string
931
+ required: true
932
+ description: Content of the chunk.
933
+ important_keywords:
934
+ type: array
935
+ items:
936
+ type: string
937
+ description: Important keywords.
938
+ - in: header
939
+ name: Authorization
940
+ type: string
941
+ required: true
942
+ description: Bearer token for authentication.
943
+ responses:
944
+ 200:
945
+ description: Chunk added successfully.
946
+ schema:
947
+ type: object
948
+ properties:
949
+ chunk:
950
+ type: object
951
+ properties:
952
+ id:
953
+ type: string
954
+ description: Chunk ID.
955
+ content:
956
+ type: string
957
+ description: Chunk content.
958
+ document_id:
959
+ type: string
960
+ description: ID of the document.
961
+ important_keywords:
962
+ type: array
963
+ items:
964
+ type: string
965
+ description: Important keywords.
966
+ """
967
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
968
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
969
  doc = DocumentService.query(id=document_id, kb_id=dataset_id)
970
  if not doc:
971
+ return get_error_data_result(
972
+ retmsg=f"You don't own the document {document_id}."
973
+ )
974
  doc = doc[0]
975
  req = request.json
976
  if not req.get("content"):
977
  return get_error_data_result(retmsg="`content` is required")
978
  if "important_keywords" in req:
979
  if type(req["important_keywords"]) != list:
980
+ return get_error_data_result(
981
+ "`important_keywords` is required to be a list"
982
+ )
983
  md5 = hashlib.md5()
984
  md5.update((req["content"] + document_id).encode("utf-8"))
985
 
986
  chunk_id = md5.hexdigest()
987
+ d = {
988
+ "id": chunk_id,
989
+ "content_ltks": rag_tokenizer.tokenize(req["content"]),
990
+ "content_with_weight": req["content"],
991
+ }
992
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
993
  d["important_kwd"] = req.get("important_keywords", [])
994
+ d["important_tks"] = rag_tokenizer.tokenize(
995
+ " ".join(req.get("important_keywords", []))
996
+ )
997
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
998
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
999
  d["kb_id"] = [doc.kb_id]
 
1001
  d["doc_id"] = doc.id
1002
  embd_id = DocumentService.get_embd_id(document_id)
1003
  embd_mdl = TenantLLMService.model_instance(
1004
+ tenant_id, LLMType.EMBEDDING.value, embd_id
1005
+ )
1007
  v, c = embd_mdl.encode([doc.name, req["content"]])
1008
  v = 0.1 * v[0] + 0.9 * v[1]
1009
  d["q_%d_vec" % len(v)] = v.tolist()
1010
  ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
1011
 
1012
+ DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0)
 
1013
  d["chunk_id"] = chunk_id
1014
+ d["kb_id"] = doc.kb_id
1015
  # rename keys
1016
  key_mapping = {
1017
  "chunk_id": "id",
 
1021
  "kb_id": "dataset_id",
1022
  "create_timestamp_flt": "create_timestamp",
1023
  "create_time": "create_time",
1024
+ "document_keyword": "document",
1025
  }
1026
  renamed_chunk = {}
1027
  for key, value in d.items():
 
1032
  # return get_result(data={"chunk_id": chunk_id})
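
Adding a chunk posts its content plus optional keywords; an illustrative sketch:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    resp = requests.post(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>/chunks",
        headers=HEADERS,
        json={
            "content": "RAGFlow is an open-source RAG engine.",
            "important_keywords": ["RAGFlow", "RAG"],
        },
    )
    print(resp.json()["data"]["chunk"])
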
1033
 
1034
 
1035
+ @manager.route(
1036
+ "/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["DELETE"]
1037
+ )
1038
  @token_required
1039
+ def rm_chunk(tenant_id, dataset_id, document_id):
1040
+ """
1041
+ Remove chunks from a document.
1042
+ ---
1043
+ tags:
1044
+ - Chunks
1045
+ security:
1046
+ - ApiKeyAuth: []
1047
+ parameters:
1048
+ - in: path
1049
+ name: dataset_id
1050
+ type: string
1051
+ required: true
1052
+ description: ID of the dataset.
1053
+ - in: path
1054
+ name: document_id
1055
+ type: string
1056
+ required: true
1057
+ description: ID of the document.
1058
+ - in: body
1059
+ name: body
1060
+ description: Chunk removal parameters.
1061
+ required: true
1062
+ schema:
1063
+ type: object
1064
+ properties:
1065
+ chunk_ids:
1066
+ type: array
1067
+ items:
1068
+ type: string
1069
+ description: List of chunk IDs to remove.
1070
+ - in: header
1071
+ name: Authorization
1072
+ type: string
1073
+ required: true
1074
+ description: Bearer token for authentication.
1075
+ responses:
1076
+ 200:
1077
+ description: Chunks removed successfully.
1078
+ schema:
1079
+ type: object
1080
+ """
1081
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
1082
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
1083
  doc = DocumentService.query(id=document_id, kb_id=dataset_id)
1084
  if not doc:
1085
+ return get_error_data_result(
1086
+ retmsg=f"You don't own the document {document_id}."
1087
+ )
1088
  doc = doc[0]
1089
  req = request.json
1090
+ if not req.get("chunk_ids"):
1091
+ return get_error_data_result("`chunk_ids` is required")
1092
+ query = {"doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
1093
  sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
1094
+ chunk_list = req.get("chunk_ids")
1102
  for chunk_id in chunk_list:
1103
  if chunk_id not in sres.ids:
1104
  return get_error_data_result(f"Chunk {chunk_id} not found")
1105
  if not ELASTICSEARCH.deleteByQuery(
1106
+ Q("ids", values=chunk_list), search.index_name(tenant_id)
1107
+ ):
1108
  return get_error_data_result(retmsg="Index updating failure")
1109
  deleted_chunk_ids = chunk_list
1110
  chunk_number = len(deleted_chunk_ids)
 
1112
  return get_result()
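
Chunk removal sends the chunk IDs in the body; sketch under the same assumptions:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    resp = requests.delete(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>/chunks",
        headers=HEADERS,
        json={"chunk_ids": ["<chunk_id_1>", "<chunk_id_2>"]},
    )
    print(resp.json())
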
1113
 
1114
 
1115
+ @manager.route(
1116
+ "/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>", methods=["PUT"]
1117
+ )
1118
  @token_required
1119
+ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
1120
+ """
1121
+ Update a chunk within a document.
1122
+ ---
1123
+ tags:
1124
+ - Chunks
1125
+ security:
1126
+ - ApiKeyAuth: []
1127
+ parameters:
1128
+ - in: path
1129
+ name: dataset_id
1130
+ type: string
1131
+ required: true
1132
+ description: ID of the dataset.
1133
+ - in: path
1134
+ name: document_id
1135
+ type: string
1136
+ required: true
1137
+ description: ID of the document.
1138
+ - in: path
1139
+ name: chunk_id
1140
+ type: string
1141
+ required: true
1142
+ description: ID of the chunk to update.
1143
+ - in: body
1144
+ name: body
1145
+ description: Chunk update parameters.
1146
+ required: true
1147
+ schema:
1148
+ type: object
1149
+ properties:
1150
+ content:
1151
+ type: string
1152
+ description: Updated content of the chunk.
1153
+ important_keywords:
1154
+ type: array
1155
+ items:
1156
+ type: string
1157
+ description: Updated important keywords.
1158
+ available:
1159
+ type: boolean
1160
+ description: Availability status of the chunk.
1161
+ - in: header
1162
+ name: Authorization
1163
+ type: string
1164
+ required: true
1165
+ description: Bearer token for authentication.
1166
+ responses:
1167
+ 200:
1168
+ description: Chunk updated successfully.
1169
+ schema:
1170
+ type: object
1171
+ """
1172
  try:
1173
+ res = ELASTICSEARCH.get(chunk_id, search.index_name(tenant_id))
1174
  except Exception as e:
1175
  return get_error_data_result(f"Can't find this chunk {chunk_id}")
1176
  if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
1177
  return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
1178
  doc = DocumentService.query(id=document_id, kb_id=dataset_id)
1179
  if not doc:
1180
+ return get_error_data_result(
1181
+ retmsg=f"You don't own the document {document_id}."
1182
+ )
1183
  doc = doc[0]
1184
  query = {
1185
+ "doc_ids": [document_id],
1186
+ "page": 1,
1187
+ "size": 1024,
1188
+ "question": "",
1189
+ "sort": True,
1190
  }
1191
  sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
1192
  if chunk_id not in sres.ids:
1193
  return get_error_data_result(f"You don't own the chunk {chunk_id}")
1194
  req = request.json
1195
+ content = res["_source"].get("content_with_weight")
1196
+ d = {"id": chunk_id, "content_with_weight": req.get("content", content)}
1197
  d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
1198
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
1199
  if "important_keywords" in req:
1200
+ if not isinstance(req["important_keywords"], list):
1201
  return get_error_data_result("`important_keywords` should be a list")
1202
  d["important_kwd"] = req.get("important_keywords")
1203
  d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
 
1205
  d["available_int"] = int(req["available"])
1206
  embd_id = DocumentService.get_embd_id(document_id)
1207
  embd_mdl = TenantLLMService.model_instance(
1208
+ tenant_id, LLMType.EMBEDDING.value, embd_id
1209
+ )
1210
  if doc.parser_id == ParserType.QA:
1211
+ arr = [t for t in re.split(r"[\n\t]", d["content_with_weight"]) if len(t) > 1]
1212
  if len(arr) != 2:
1213
  return get_error_data_result(
1214
+ retmsg="Q&A must be separated by TAB/ENTER key."
1215
+ )
1216
  q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
1217
+ d = beAdoc(
1218
+ d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])
1219
+ )
1220
 
1221
  v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
1222
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
 
1225
  return get_result()
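
Updating a chunk; "available" is coerced with int(), so a JSON boolean works. Sketch:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    resp = requests.put(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>",
        headers=HEADERS,
        json={
            "content": "Updated chunk text.",
            "important_keywords": ["updated"],
            "available": True,
        },
    )
    print(resp.json())
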
1226
 
1227
 
1228
+ @manager.route("/retrieval", methods=["POST"])
 
1229
  @token_required
1230
  def retrieval_test(tenant_id):
1231
+ """
1232
+ Retrieve chunks based on a query.
1233
+ ---
1234
+ tags:
1235
+ - Retrieval
1236
+ security:
1237
+ - ApiKeyAuth: []
1238
+ parameters:
1239
+ - in: body
1240
+ name: body
1241
+ description: Retrieval parameters.
1242
+ required: true
1243
+ schema:
1244
+ type: object
1245
+ properties:
1246
+ dataset_ids:
1247
+ type: array
1248
+ items:
1249
+ type: string
1250
+ required: true
1251
+ description: List of dataset IDs to search in.
1252
+ question:
1253
+ type: string
1254
+ required: true
1255
+ description: Query string.
1256
+ document_ids:
1257
+ type: array
1258
+ items:
1259
+ type: string
1260
+ description: List of document IDs to filter.
1261
+ similarity_threshold:
1262
+ type: number
1263
+ format: float
1264
+ description: Similarity threshold.
1265
+ vector_similarity_weight:
1266
+ type: number
1267
+ format: float
1268
+ description: Vector similarity weight.
1269
+ top_k:
1270
+ type: integer
1271
+ description: Maximum number of chunks to return.
1272
+ highlight:
1273
+ type: boolean
1274
+ description: Whether to highlight matched content.
1275
+ - in: header
1276
+ name: Authorization
1277
+ type: string
1278
+ required: true
1279
+ description: Bearer token for authentication.
1280
+ responses:
1281
+ 200:
1282
+ description: Retrieval results.
1283
+ schema:
1284
+ type: object
1285
+ properties:
1286
+ chunks:
1287
+ type: array
1288
+ items:
1289
+ type: object
1290
+ properties:
1291
+ id:
1292
+ type: string
1293
+ description: Chunk ID.
1294
+ content:
1295
+ type: string
1296
+ description: Chunk content.
1297
+ document_id:
1298
+ type: string
1299
+ description: ID of the document.
1300
+ dataset_id:
1301
+ type: string
1302
+ description: ID of the dataset.
1303
+ similarity:
1304
+ type: number
1305
+ format: float
1306
+ description: Similarity score.
1307
+ """
1308
  req = request.json
1309
  if not req.get("dataset_ids"):
1310
  return get_error_data_result("`dataset_ids` is required.")
1311
  kb_ids = req["dataset_ids"]
1312
+ if not isinstance(kb_ids, list):
1313
  return get_error_data_result("`dataset_ids` should be a list")
1314
  kbs = KnowledgebaseService.get_by_ids(kb_ids)
1315
  for id in kb_ids:
1316
+ if not KnowledgebaseService.query(id=id, tenant_id=tenant_id):
1317
  return get_error_data_result(f"You don't own the dataset {id}.")
1318
  embd_nms = list(set([kb.embd_id for kb in kbs]))
1319
  if len(embd_nms) != 1:
1320
  return get_result(
1321
  retmsg="Datasets use different embedding models.",
1322
+ retcode=RetCode.AUTHENTICATION_ERROR,
1323
+ )
1324
  if "question" not in req:
1325
  return get_error_data_result("`question` is required.")
1326
  page = int(req.get("offset", 1))
1327
  size = int(req.get("limit", 1024))
1328
  question = req["question"]
1329
  doc_ids = req.get("document_ids", [])
1330
+ if not isinstance(doc_ids, list):
1331
  return get_error_data_result("`document_ids` should be a list")
1332
+ doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
1333
  for doc_id in doc_ids:
1334
  if doc_id not in doc_ids_list:
1335
+ return get_error_data_result(
1336
+ f"The datasets don't own the document {doc_id}"
1337
+ )
1338
  similarity_threshold = float(req.get("similarity_threshold", 0.2))
1339
  vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
1340
  top = int(req.get("top_k", 1024))
1341
+ if req.get("highlight") == "False" or req.get("highlight") == "false":
1342
  highlight = False
1343
  else:
1344
  highlight = True
 
1347
  if not e:
1348
  return get_error_data_result(retmsg="Dataset not found!")
1349
  embd_mdl = TenantLLMService.model_instance(
1350
+ kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id
1351
+ )
1352
 
1353
  rerank_mdl = None
1354
  if req.get("rerank_id"):
1355
  rerank_mdl = TenantLLMService.model_instance(
1356
+ kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"]
1357
+ )
1358
 
1359
  if req.get("keyword", False):
1360
  chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
1361
  question += keyword_extraction(chat_mdl, question)
1362
 
1363
  retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
1364
+ ranks = retr.retrieval(
1365
+ question,
1366
+ embd_mdl,
1367
+ kb.tenant_id,
1368
+ kb_ids,
1369
+ page,
1370
+ size,
1371
+ similarity_threshold,
1372
+ vector_similarity_weight,
1373
+ top,
1374
+ doc_ids,
1375
+ rerank_mdl=rerank_mdl,
1376
+ highlight=highlight,
1377
+ )
1378
  for c in ranks["chunks"]:
1379
  if "vector" in c:
1380
  del c["vector"]
 
1387
  "content_with_weight": "content",
1388
  "doc_id": "document_id",
1389
  "important_kwd": "important_keywords",
1390
+ "docnm_kwd": "document_keyword",
1391
  }
1392
  rename_chunk = {}
1393
  for key, value in chunk.items():
 
1398
  return get_result(data=ranks)
1399
  except Exception as e:
1400
  if str(e).find("not_found") > 0:
1401
+ return get_result(
1402
+ retmsg="No chunk found! Please check the chunk status.",
1403
+ retcode=RetCode.DATA_ERROR,
1404
+ )
1405
+ return server_error_response(e)
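
Finally, a retrieval sketch; the body mirrors the parameters handled above, and every ID is a placeholder:

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed

    resp = requests.post(
        f"{BASE}/retrieval",
        headers=HEADERS,
        json={
            "dataset_ids": ["<dataset_id>"],
            "question": "How do I configure the embedding model?",
            "similarity_threshold": 0.2,
            "vector_similarity_weight": 0.3,
            "highlight": "false",
        },
    )
    for chunk in resp.json()["data"]["chunks"]:
        print(chunk["similarity"], chunk["content"][:60])
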
api/apps/system_app.py CHANGED
@@ -24,8 +24,14 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
24
  from api.db.services.user_service import UserTenantService
25
  from api.settings import DATABASE_TYPE
26
  from api.utils import current_timestamp, datetime_format
27
- from api.utils.api_utils import get_json_result, get_data_error_result, server_error_response, \
28
- generate_confirmation_token, request, validate_request
 
29
  from api.versions import get_rag_version
30
  from rag.utils.es_conn import ELASTICSEARCH
31
  from rag.utils.storage_factory import STORAGE_IMPL, STORAGE_IMPL_TYPE
@@ -34,44 +40,121 @@ from timeit import default_timer as timer
34
  from rag.utils.redis_conn import REDIS_CONN
35
 
36
 
37
- @manager.route('/version', methods=['GET'])
38
  @login_required
39
  def version():
40
  return get_json_result(data=get_rag_version())
41
 
42
 
43
- @manager.route('/status', methods=['GET'])
44
  @login_required
45
  def status():
46
  res = {}
47
  st = timer()
48
  try:
49
  res["es"] = ELASTICSEARCH.health()
50
- res["es"]["elapsed"] = "{:.1f}".format((timer() - st)*1000.)
51
  except Exception as e:
52
- res["es"] = {"status": "red", "elapsed": "{:.1f}".format((timer() - st)*1000.), "error": str(e)}
53
 
54
  st = timer()
55
  try:
56
  STORAGE_IMPL.health()
57
- res["storage"] = {"storage": STORAGE_IMPL_TYPE.lower(), "status": "green", "elapsed": "{:.1f}".format((timer() - st)*1000.)}
58
  except Exception as e:
59
- res["storage"] = {"storage": STORAGE_IMPL_TYPE.lower(), "status": "red", "elapsed": "{:.1f}".format((timer() - st)*1000.), "error": str(e)}
60
 
61
  st = timer()
62
  try:
63
  KnowledgebaseService.get_by_id("x")
64
- res["database"] = {"database": DATABASE_TYPE.lower(), "status": "green", "elapsed": "{:.1f}".format((timer() - st)*1000.)}
65
  except Exception as e:
66
- res["database"] = {"database": DATABASE_TYPE.lower(), "status": "red", "elapsed": "{:.1f}".format((timer() - st)*1000.), "error": str(e)}
67
 
68
  st = timer()
69
  try:
70
  if not REDIS_CONN.health():
71
  raise Exception("Lost connection!")
72
- res["redis"] = {"status": "green", "elapsed": "{:.1f}".format((timer() - st)*1000.)}
73
  except Exception as e:
74
- res["redis"] = {"status": "red", "elapsed": "{:.1f}".format((timer() - st)*1000.), "error": str(e)}
75
 
76
  try:
77
  v = REDIS_CONN.get("TASKEXE")
@@ -84,10 +167,12 @@ def status():
84
  if len(arr) == 1:
85
  obj[id] = [0]
86
  else:
87
- obj[id] = [arr[i+1]-arr[i] for i in range(len(arr)-1)]
88
  elapsed = max(obj[id])
89
- if elapsed > 50: color = "yellow"
90
- if elapsed > 120: color = "red"
91
  res["task_executor"] = {"status": color, "elapsed": obj}
92
  except Exception as e:
93
  res["task_executor"] = {"status": "red", "error": str(e)}
@@ -95,21 +180,46 @@ def status():
95
  return get_json_result(data=res)
96
 
97
 
98
- @manager.route('/new_token', methods=['POST'])
99
  @login_required
100
  def new_token():
101
  try:
102
  tenants = UserTenantService.query(user_id=current_user.id)
103
  if not tenants:
104
  return get_data_error_result(retmsg="Tenant not found!")
105
 
106
  tenant_id = tenants[0].tenant_id
107
- obj = {"tenant_id": tenant_id, "token": generate_confirmation_token(tenant_id),
108
- "create_time": current_timestamp(),
109
- "create_date": datetime_format(datetime.now()),
110
- "update_time": None,
111
- "update_date": None
112
- }
113
 
114
  if not APITokenService.save(**obj):
115
  return get_data_error_result(retmsg="Failed to create a new token!")
@@ -119,9 +229,37 @@ def new_token():
119
  return server_error_response(e)
120
 
121
 
122
- @manager.route('/token_list', methods=['GET'])
123
  @login_required
124
  def token_list():
125
  try:
126
  tenants = UserTenantService.query(user_id=current_user.id)
127
  if not tenants:
@@ -133,9 +271,33 @@ def token_list():
133
  return server_error_response(e)
134
 
135
 
136
- @manager.route('/token/<token>', methods=['DELETE'])
137
  @login_required
138
  def rm(token):
139
  APITokenService.filter_delete(
140
- [APIToken.tenant_id == current_user.id, APIToken.token == token])
141
- return get_json_result(data=True)
 
 
24
  from api.db.services.user_service import UserTenantService
25
  from api.settings import DATABASE_TYPE
26
  from api.utils import current_timestamp, datetime_format
27
+ from api.utils.api_utils import (
28
+ get_json_result,
29
+ get_data_error_result,
30
+ server_error_response,
31
+ generate_confirmation_token,
32
+ request,
33
+ validate_request,
34
+ )
35
  from api.versions import get_rag_version
36
  from rag.utils.es_conn import ELASTICSEARCH
37
  from rag.utils.storage_factory import STORAGE_IMPL, STORAGE_IMPL_TYPE
 
40
  from rag.utils.redis_conn import REDIS_CONN
41
 
42
 
43
+ @manager.route("/version", methods=["GET"])
44
  @login_required
45
  def version():
46
+ """
47
+ Get the current version of the application.
48
+ ---
49
+ tags:
50
+ - System
51
+ security:
52
+ - ApiKeyAuth: []
53
+ responses:
54
+ 200:
55
+ description: Version retrieved successfully.
56
+ schema:
57
+ type: object
58
+ properties:
59
+ version:
60
+ type: string
61
+ description: Version number.
62
+ """
63
  return get_json_result(data=get_rag_version())
64
 
65
 
66
+ @manager.route("/status", methods=["GET"])
67
  @login_required
68
  def status():
69
+ """
70
+ Get the system status.
71
+ ---
72
+ tags:
73
+ - System
74
+ security:
75
+ - ApiKeyAuth: []
76
+ responses:
77
+ 200:
78
+ description: System is operational.
79
+ schema:
80
+ type: object
81
+ properties:
82
+ es:
83
+ type: object
84
+ description: Elasticsearch status.
85
+ storage:
86
+ type: object
87
+ description: Storage status.
88
+ database:
89
+ type: object
90
+ description: Database status.
91
+ 503:
92
+ description: Service unavailable.
93
+ schema:
94
+ type: object
95
+ properties:
96
+ error:
97
+ type: string
98
+ description: Error message.
99
+ """
100
  res = {}
101
  st = timer()
102
  try:
103
  res["es"] = ELASTICSEARCH.health()
104
+ res["es"]["elapsed"] = "{:.1f}".format((timer() - st) * 1000.0)
105
  except Exception as e:
106
+ res["es"] = {
107
+ "status": "red",
108
+ "elapsed": "{:.1f}".format((timer() - st) * 1000.0),
109
+ "error": str(e),
110
+ }
111
 
112
  st = timer()
113
  try:
114
  STORAGE_IMPL.health()
115
+ res["storage"] = {
116
+ "storage": STORAGE_IMPL_TYPE.lower(),
117
+ "status": "green",
118
+ "elapsed": "{:.1f}".format((timer() - st) * 1000.0),
119
+ }
120
  except Exception as e:
121
+ res["storage"] = {
122
+ "storage": STORAGE_IMPL_TYPE.lower(),
123
+ "status": "red",
124
+ "elapsed": "{:.1f}".format((timer() - st) * 1000.0),
125
+ "error": str(e),
126
+ }
127
 
128
  st = timer()
129
  try:
130
  KnowledgebaseService.get_by_id("x")
131
+ res["database"] = {
132
+ "database": DATABASE_TYPE.lower(),
133
+ "status": "green",
134
+ "elapsed": "{:.1f}".format((timer() - st) * 1000.0),
135
+ }
136
  except Exception as e:
137
+ res["database"] = {
138
+ "database": DATABASE_TYPE.lower(),
139
+ "status": "red",
140
+ "elapsed": "{:.1f}".format((timer() - st) * 1000.0),
141
+ "error": str(e),
142
+ }
143
 
144
  st = timer()
145
  try:
146
  if not REDIS_CONN.health():
147
  raise Exception("Lost connection!")
148
+ res["redis"] = {
149
+ "status": "green",
150
+ "elapsed": "{:.1f}".format((timer() - st) * 1000.0),
151
+ }
152
  except Exception as e:
153
+ res["redis"] = {
154
+ "status": "red",
155
+ "elapsed": "{:.1f}".format((timer() - st) * 1000.0),
156
+ "error": str(e),
157
+ }
158
 
159
  try:
160
  v = REDIS_CONN.get("TASKEXE")
 
167
  if len(arr) == 1:
168
  obj[id] = [0]
169
  else:
170
+ obj[id] = [arr[i + 1] - arr[i] for i in range(len(arr) - 1)]
171
  elapsed = max(obj[id])
172
+ if elapsed > 50:
173
+ color = "yellow"
174
+ if elapsed > 120:
175
+ color = "red"
176
  res["task_executor"] = {"status": color, "elapsed": obj}
177
  except Exception as e:
178
  res["task_executor"] = {"status": "red", "error": str(e)}
 
180
  return get_json_result(data=res)
181
 
182
 
183
+ @manager.route("/new_token", methods=["POST"])
184
  @login_required
185
  def new_token():
186
+ """
187
+ Generate a new API token.
188
+ ---
189
+ tags:
190
+ - API Tokens
191
+ security:
192
+ - ApiKeyAuth: []
193
+ parameters:
194
+ - in: query
195
+ name: name
196
+ type: string
197
+ required: false
198
+ description: Name of the token.
199
+ responses:
200
+ 200:
201
+ description: Token generated successfully.
202
+ schema:
203
+ type: object
204
+ properties:
205
+ token:
206
+ type: string
207
+ description: The generated API token.
208
+ """
209
  try:
210
  tenants = UserTenantService.query(user_id=current_user.id)
211
  if not tenants:
212
  return get_data_error_result(retmsg="Tenant not found!")
213
 
214
  tenant_id = tenants[0].tenant_id
215
+ obj = {
216
+ "tenant_id": tenant_id,
217
+ "token": generate_confirmation_token(tenant_id),
218
+ "create_time": current_timestamp(),
219
+ "create_date": datetime_format(datetime.now()),
220
+ "update_time": None,
221
+ "update_date": None,
222
+ }
223
 
224
  if not APITokenService.save(**obj):
225
  return get_data_error_result(retmsg="Failed to create a new token!")
 
229
  return server_error_response(e)
230
 
231
 
232
+ @manager.route("/token_list", methods=["GET"])
233
  @login_required
234
  def token_list():
235
+ """
236
+ List all API tokens for the current user.
237
+ ---
238
+ tags:
239
+ - API Tokens
240
+ security:
241
+ - ApiKeyAuth: []
242
+ responses:
243
+ 200:
244
+ description: List of API tokens.
245
+ schema:
246
+ type: object
247
+ properties:
248
+ tokens:
249
+ type: array
250
+ items:
251
+ type: object
252
+ properties:
253
+ token:
254
+ type: string
255
+ description: The API token.
256
+ name:
257
+ type: string
258
+ description: Name of the token.
259
+ create_time:
260
+ type: string
261
+ description: Token creation time.
262
+ """
263
  try:
264
  tenants = UserTenantService.query(user_id=current_user.id)
265
  if not tenants:
 
271
  return server_error_response(e)
272
 
273
 
274
+ @manager.route("/token/<token>", methods=["DELETE"])
275
  @login_required
276
  def rm(token):
277
+ """
278
+ Remove an API token.
279
+ ---
280
+ tags:
281
+ - API Tokens
282
+ security:
283
+ - ApiKeyAuth: []
284
+ parameters:
285
+ - in: path
286
+ name: token
287
+ type: string
288
+ required: true
289
+ description: The API token to remove.
290
+ responses:
291
+ 200:
292
+ description: Token removed successfully.
293
+ schema:
294
+ type: object
295
+ properties:
296
+ success:
297
+ type: boolean
298
+ description: Deletion status.
299
+ """
300
  APITokenService.filter_delete(
301
+ [APIToken.tenant_id == current_user.id, APIToken.token == token]
302
+ )
303
+ return get_json_result(data=True)
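
The system routes above are session-authenticated (@login_required) rather than token_required; a sketch assuming the default /v1/system mount and reuse of the Authorization header returned at login:

    import requests

    BASE = "http://localhost:9380/v1/system"        # assumed mount point
    AUTH = {"Authorization": "<token from login>"}  # web session token, not an API key

    print(requests.get(f"{BASE}/version", headers=AUTH).json())
    print(requests.get(f"{BASE}/status", headers=AUTH).json())
    print(requests.post(f"{BASE}/new_token", headers=AUTH).json())
    print(requests.get(f"{BASE}/token_list", headers=AUTH).json())
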
api/apps/user_app.py CHANGED
@@ -23,65 +23,141 @@ from flask_login import login_required, current_user, login_user, logout_user
23
 
24
  from api.db.db_models import TenantLLM
25
  from api.db.services.llm_service import TenantLLMService, LLMService
26
- from api.utils.api_utils import server_error_response, validate_request, get_data_error_result
27
- from api.utils import get_uuid, get_format_time, decrypt, download_img, current_timestamp, datetime_format
28
  from api.db import UserTenantRole, LLMType, FileType
29
- from api.settings import RetCode, GITHUB_OAUTH, FEISHU_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, \
30
- API_KEY, \
31
- LLM_FACTORY, LLM_BASE_URL, RERANK_MDL
  from api.db.services.user_service import UserService, TenantService, UserTenantService
33
  from api.db.services.file_service import FileService
34
  from api.settings import stat_logger
35
  from api.utils.api_utils import get_json_result, construct_response
36
 
37
 
38
- @manager.route('/login', methods=['POST', 'GET'])
39
  def login():
40
  if not request.json:
41
- return get_json_result(data=False,
42
- retcode=RetCode.AUTHENTICATION_ERROR,
43
- retmsg='Unauthorized!')
44
 
45
- email = request.json.get('email', "")
46
  users = UserService.query(email=email)
47
  if not users:
48
- return get_json_result(data=False,
49
- retcode=RetCode.AUTHENTICATION_ERROR,
50
- retmsg=f'Email: {email} is not registered!')
51
 
52
- password = request.json.get('password')
53
  try:
54
  password = decrypt(password)
55
  except BaseException:
56
- return get_json_result(data=False,
57
- retcode=RetCode.SERVER_ERROR,
58
- retmsg='Fail to crypt password')
59
 
60
  user = UserService.query_user(email, password)
61
  if user:
62
  response_data = user.to_json()
63
  user.access_token = get_uuid()
64
  login_user(user)
65
- user.update_time = current_timestamp(),
66
- user.update_date = datetime_format(datetime.now()),
67
  user.save()
68
  msg = "Welcome back!"
69
  return construct_response(data=response_data, auth=user.get_id(), retmsg=msg)
70
  else:
71
- return get_json_result(data=False,
72
- retcode=RetCode.AUTHENTICATION_ERROR,
73
- retmsg='Email and password do not match!')
74
 
75
 
76
- @manager.route('/github_callback', methods=['GET'])
77
  def github_callback():
78
  import requests
79
- res = requests.post(GITHUB_OAUTH.get("url"),
80
- data={
81
- "client_id": GITHUB_OAUTH.get("client_id"),
82
- "client_secret": GITHUB_OAUTH.get("secret_key"),
83
- "code": request.args.get('code')},
84
- headers={"Accept": "application/json"})
85
  res = res.json()
86
  if "error" in res:
87
  return redirect("/?error=%s" % res["error_description"])
@@ -103,19 +179,22 @@ def github_callback():
103
  except Exception as e:
104
  stat_logger.exception(e)
105
  avatar = ""
106
- users = user_register(user_id, {
107
- "access_token": session["access_token"],
108
- "email": email_address,
109
- "avatar": avatar,
110
- "nickname": user_info["login"],
111
- "login_channel": "github",
112
- "last_login_time": get_format_time(),
113
- "is_superuser": False,
114
- })
115
  if not users:
116
- raise Exception(f'Fail to register {email_address}.')
117
  if len(users) > 1:
118
- raise Exception(f'Same email: {email_address} exists!')
119
 
120
  # Try to log in
121
  user = users[0]
@@ -134,30 +213,56 @@ def github_callback():
134
  return redirect("/?auth=%s" % user.get_id())
135
 
136
 
137
- @manager.route('/feishu_callback', methods=['GET'])
138
  def feishu_callback():
139
  import requests
140
- app_access_token_res = requests.post(FEISHU_OAUTH.get("app_access_token_url"),
141
- data=json.dumps({
142
- "app_id": FEISHU_OAUTH.get("app_id"),
143
- "app_secret": FEISHU_OAUTH.get("app_secret")
144
- }),
145
- headers={"Content-Type": "application/json; charset=utf-8"})
146
  app_access_token_res = app_access_token_res.json()
147
- if app_access_token_res['code'] != 0:
148
  return redirect("/?error=%s" % app_access_token_res)
149
 
150
- res = requests.post(FEISHU_OAUTH.get("user_access_token_url"),
151
- data=json.dumps({
152
- "grant_type": FEISHU_OAUTH.get("grant_type"),
153
- "code": request.args.get('code')
154
- }),
155
- headers={
156
- "Content-Type": "application/json; charset=utf-8",
157
- 'Authorization': f"Bearer {app_access_token_res['app_access_token']}"
158
- })
159
  res = res.json()
160
- if res['code'] != 0:
161
  return redirect("/?error=%s" % res["message"])
162
 
163
  if "contact:user.email:readonly" not in res["data"]["scope"].split(" "):
@@ -176,19 +281,22 @@ def feishu_callback():
176
  except Exception as e:
177
  stat_logger.exception(e)
178
  avatar = ""
179
- users = user_register(user_id, {
180
- "access_token": session["access_token"],
181
- "email": email_address,
182
- "avatar": avatar,
183
- "nickname": user_info["en_name"],
184
- "login_channel": "feishu",
185
- "last_login_time": get_format_time(),
186
- "is_superuser": False,
187
- })
188
  if not users:
189
- raise Exception(f'Fail to register {email_address}.')
190
  if len(users) > 1:
191
- raise Exception(f'Same email: {email_address} exists!')
192
 
193
  # Try to log in
194
  user = users[0]
@@ -209,11 +317,14 @@ def feishu_callback():
209
 
210
  def user_info_from_feishu(access_token):
211
  import requests
212
- headers = {"Content-Type": "application/json; charset=utf-8",
213
- 'Authorization': f"Bearer {access_token}"}
 
  res = requests.get(
215
- f"https://open.feishu.cn/open-apis/authen/v1/user_info",
216
- headers=headers)
217
  user_info = res.json()["data"]
218
  user_info["email"] = None if user_info.get("email") == "" else user_info["email"]
219
  return user_info
@@ -221,24 +332,38 @@ def user_info_from_feishu(access_token):
221
 
222
  def user_info_from_github(access_token):
223
  import requests
224
- headers = {"Accept": "application/json",
225
- 'Authorization': f"token {access_token}"}
226
  res = requests.get(
227
- f"https://api.github.com/user?access_token={access_token}",
228
- headers=headers)
229
  user_info = res.json()
230
  email_info = requests.get(
231
  f"https://api.github.com/user/emails?access_token={access_token}",
232
- headers=headers).json()
233
  user_info["email"] = next(
234
- (email for email in email_info if email['primary'] == True),
235
- None)["email"]
236
  return user_info
237
 
238
 
239
- @manager.route("/logout", methods=['GET'])
240
  @login_required
241
  def log_out():
242
  current_user.access_token = ""
243
  current_user.save()
244
  logout_user()
@@ -248,20 +373,62 @@ def log_out():
248
  @manager.route("/setting", methods=["POST"])
249
  @login_required
250
  def setting_user():
251
  update_dict = {}
252
  request_data = request.json
253
  if request_data.get("password"):
254
  new_password = request_data.get("new_password")
255
  if not check_password_hash(
256
- current_user.password, decrypt(request_data["password"])):
257
- return get_json_result(data=False, retcode=RetCode.AUTHENTICATION_ERROR, retmsg='Password error!')
258
 
259
  if new_password:
260
  update_dict["password"] = generate_password_hash(decrypt(new_password))
261
 
262
  for k in request_data.keys():
263
- if k in ["password", "new_password", "email", "status", "is_superuser", "login_channel", "is_anonymous",
264
- "is_active", "is_authenticated", "last_login_time"]:
 
 
 
 
 
 
 
 
 
 
265
  continue
266
  update_dict[k] = request_data[k]
267
 
@@ -270,12 +437,37 @@ def setting_user():
270
  return get_json_result(data=True)
271
  except Exception as e:
272
  stat_logger.exception(e)
273
- return get_json_result(data=False, retmsg='Update failure!', retcode=RetCode.EXCEPTION_ERROR)
274
 
275
 
276
  @manager.route("/info", methods=["GET"])
277
  @login_required
278
  def user_profile():
279
  return get_json_result(data=current_user.to_dict())
280
 
281
 
@@ -310,13 +502,13 @@ def user_register(user_id, user):
310
  "asr_id": ASR_MDL,
311
  "parser_ids": PARSERS,
312
  "img2txt_id": IMAGE2TEXT_MDL,
313
- "rerank_id": RERANK_MDL
314
  }
315
  usr_tenant = {
316
  "tenant_id": user_id,
317
  "user_id": user_id,
318
  "invited_by": user_id,
319
- "role": UserTenantRole.OWNER
320
  }
321
  file_id = get_uuid()
322
  file = {
@@ -331,13 +523,16 @@ def user_register(user_id, user):
331
  }
332
  tenant_llm = []
333
  for llm in LLMService.query(fid=LLM_FACTORY):
334
- tenant_llm.append({"tenant_id": user_id,
335
- "llm_factory": LLM_FACTORY,
336
- "llm_name": llm.llm_name,
337
- "model_type": llm.model_type,
338
- "api_key": API_KEY,
339
- "api_base": LLM_BASE_URL
340
- })
341
 
342
  if not UserService.save(**user):
343
  return
@@ -351,21 +546,52 @@ def user_register(user_id, user):
351
  @manager.route("/register", methods=["POST"])
352
  @validate_request("nickname", "email", "password")
353
  def user_add():
354
  req = request.json
355
  email_address = req["email"]
356
 
357
  # Validate the email address
358
  if not re.match(r"^[\w\._-]+@([\w_-]+\.)+[\w-]{2,5}$", email_address):
359
- return get_json_result(data=False,
360
- retmsg=f'Invalid email address: {email_address}!',
361
- retcode=RetCode.OPERATING_ERROR)
 
363
  # Check if the email address is already used
364
  if UserService.query(email=email_address):
365
  return get_json_result(
366
  data=False,
367
- retmsg=f'Email: {email_address} has already registered!',
368
- retcode=RetCode.OPERATING_ERROR)
 
369
 
370
  # Construct user info data
371
  nickname = req["nickname"]
@@ -383,25 +609,55 @@ def user_add():
383
  try:
384
  users = user_register(user_id, user_dict)
385
  if not users:
386
- raise Exception(f'Fail to register {email_address}.')
387
  if len(users) > 1:
388
- raise Exception(f'Same email: {email_address} exists!')
389
  user = users[0]
390
  login_user(user)
391
- return construct_response(data=user.to_json(),
392
- auth=user.get_id(),
393
- retmsg=f"{nickname}, welcome aboard!")
 
 
394
  except Exception as e:
395
  rollback_user_registration(user_id)
396
  stat_logger.exception(e)
397
- return get_json_result(data=False,
398
- retmsg=f'User registration failure, error: {str(e)}',
399
- retcode=RetCode.EXCEPTION_ERROR)
 
 
400
 
401
 
402
  @manager.route("/tenant_info", methods=["GET"])
403
  @login_required
404
  def tenant_info():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  try:
406
  tenants = TenantService.get_info_by(current_user.id)
407
  if not tenants:
@@ -415,6 +671,42 @@ def tenant_info():
415
  @login_required
416
  @validate_request("tenant_id", "asr_id", "embd_id", "img2txt_id", "llm_id")
417
  def set_tenant_info():
418
  req = request.json
419
  try:
420
  tid = req["tenant_id"]
 
23
 
24
  from api.db.db_models import TenantLLM
25
  from api.db.services.llm_service import TenantLLMService, LLMService
26
+ from api.utils.api_utils import (
27
+ server_error_response,
28
+ validate_request,
29
+ get_data_error_result,
30
+ )
31
+ from api.utils import (
32
+ get_uuid,
33
+ get_format_time,
34
+ decrypt,
35
+ download_img,
36
+ current_timestamp,
37
+ datetime_format,
38
+ )
39
  from api.db import UserTenantRole, LLMType, FileType
40
+ from api.settings import (
41
+ RetCode,
42
+ GITHUB_OAUTH,
43
+ FEISHU_OAUTH,
44
+ CHAT_MDL,
45
+ EMBEDDING_MDL,
46
+ ASR_MDL,
47
+ IMAGE2TEXT_MDL,
48
+ PARSERS,
49
+ API_KEY,
50
+ LLM_FACTORY,
51
+ LLM_BASE_URL,
52
+ RERANK_MDL,
53
+ )
54
  from api.db.services.user_service import UserService, TenantService, UserTenantService
55
  from api.db.services.file_service import FileService
56
  from api.settings import stat_logger
57
  from api.utils.api_utils import get_json_result, construct_response
58
 
59
 
60
+ @manager.route("/login", methods=["POST", "GET"])
61
  def login():
62
+ """
63
+ User login endpoint.
64
+ ---
65
+ tags:
66
+ - User
67
+ parameters:
68
+ - in: body
69
+ name: body
70
+ description: Login credentials.
71
+ required: true
72
+ schema:
73
+ type: object
74
+ properties:
75
+ email:
76
+ type: string
77
+ description: User email.
78
+ password:
79
+ type: string
80
+ description: User password.
81
+ responses:
82
+ 200:
83
+ description: Login successful.
84
+ schema:
85
+ type: object
86
+ 401:
87
+ description: Authentication failed.
88
+ schema:
89
+ type: object
90
+ """
91
  if not request.json:
92
+ return get_json_result(
93
+ data=False, retcode=RetCode.AUTHENTICATION_ERROR, retmsg="Unauthorized!"
94
+ )
95
 
96
+ email = request.json.get("email", "")
97
  users = UserService.query(email=email)
98
  if not users:
99
+ return get_json_result(
100
+ data=False,
101
+ retcode=RetCode.AUTHENTICATION_ERROR,
102
+ retmsg=f"Email: {email} is not registered!",
103
+ )
104
 
105
+ password = request.json.get("password")
106
  try:
107
  password = decrypt(password)
108
  except BaseException:
109
+ return get_json_result(
110
+ data=False, retcode=RetCode.SERVER_ERROR, retmsg="Failed to decrypt password"
111
+ )
112
 
113
  user = UserService.query_user(email, password)
114
  if user:
115
  response_data = user.to_json()
116
  user.access_token = get_uuid()
117
  login_user(user)
118
+ user.update_time = current_timestamp()
119
+ user.update_date = datetime_format(datetime.now())
120
  user.save()
121
  msg = "Welcome back!"
122
  return construct_response(data=response_data, auth=user.get_id(), retmsg=msg)
123
  else:
124
+ return get_json_result(
125
+ data=False,
126
+ retcode=RetCode.AUTHENTICATION_ERROR,
127
+ retmsg="Email and password do not match!",
128
+ )
129
 
130
 
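The YAML between the `---` separator and the closing quotes of each docstring above is what flasgger parses into the OpenAPI spec; the Python body below it is untouched. A minimal, self-contained sketch of the same pattern (the /ping route, its tag, and its schema are invented for illustration and are not part of this codebase):

from flasgger import Swagger
from flask import Flask, jsonify

app = Flask(__name__)
swagger = Swagger(app)  # flasgger serves the interactive UI at /apidocs by default

@app.route("/ping", methods=["GET"])
def ping():
    """
    Health-check endpoint (illustrative only).
    ---
    tags:
      - Demo
    responses:
      200:
        description: Service is up.
        schema:
          type: object
    """
    return jsonify({"status": "ok"})

if __name__ == "__main__":
    app.run()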
131
+ @manager.route("/github_callback", methods=["GET"])
132
  def github_callback():
133
+ """
134
+ GitHub OAuth callback endpoint.
135
+ ---
136
+ tags:
137
+ - OAuth
138
+ parameters:
139
+ - in: query
140
+ name: code
141
+ type: string
142
+ required: true
143
+ description: Authorization code from GitHub.
144
+ responses:
145
+ 200:
146
+ description: Authentication successful.
147
+ schema:
148
+ type: object
149
+ """
150
  import requests
151
+
152
+ res = requests.post(
153
+ GITHUB_OAUTH.get("url"),
154
+ data={
155
+ "client_id": GITHUB_OAUTH.get("client_id"),
156
+ "client_secret": GITHUB_OAUTH.get("secret_key"),
157
+ "code": request.args.get("code"),
158
+ },
159
+ headers={"Accept": "application/json"},
160
+ )
161
  res = res.json()
162
  if "error" in res:
163
  return redirect("/?error=%s" % res["error_description"])
 
179
  except Exception as e:
180
  stat_logger.exception(e)
181
  avatar = ""
182
+ users = user_register(
183
+ user_id,
184
+ {
185
+ "access_token": session["access_token"],
186
+ "email": email_address,
187
+ "avatar": avatar,
188
+ "nickname": user_info["login"],
189
+ "login_channel": "github",
190
+ "last_login_time": get_format_time(),
191
+ "is_superuser": False,
192
+ },
193
+ )
194
  if not users:
195
+ raise Exception(f"Fail to register {email_address}.")
196
  if len(users) > 1:
197
+ raise Exception(f"Same email: {email_address} exists!")
198
 
199
  # Try to log in
200
  user = users[0]
 
213
  return redirect("/?auth=%s" % user.get_id())
214
 
215
 
216
+ @manager.route("/feishu_callback", methods=["GET"])
217
  def feishu_callback():
218
+ """
219
+ Feishu OAuth callback endpoint.
220
+ ---
221
+ tags:
222
+ - OAuth
223
+ parameters:
224
+ - in: query
225
+ name: code
226
+ type: string
227
+ required: true
228
+ description: Authorization code from Feishu.
229
+ responses:
230
+ 200:
231
+ description: Authentication successful.
232
+ schema:
233
+ type: object
234
+ """
235
  import requests
236
+
237
+ app_access_token_res = requests.post(
238
+ FEISHU_OAUTH.get("app_access_token_url"),
239
+ data=json.dumps(
240
+ {
241
+ "app_id": FEISHU_OAUTH.get("app_id"),
242
+ "app_secret": FEISHU_OAUTH.get("app_secret"),
243
+ }
244
+ ),
245
+ headers={"Content-Type": "application/json; charset=utf-8"},
246
+ )
247
  app_access_token_res = app_access_token_res.json()
248
+ if app_access_token_res["code"] != 0:
249
  return redirect("/?error=%s" % app_access_token_res)
250
 
251
+ res = requests.post(
252
+ FEISHU_OAUTH.get("user_access_token_url"),
253
+ data=json.dumps(
254
+ {
255
+ "grant_type": FEISHU_OAUTH.get("grant_type"),
256
+ "code": request.args.get("code"),
257
+ }
258
+ ),
259
+ headers={
260
+ "Content-Type": "application/json; charset=utf-8",
261
+ "Authorization": f"Bearer {app_access_token_res['app_access_token']}",
262
+ },
263
+ )
264
  res = res.json()
265
+ if res["code"] != 0:
266
  return redirect("/?error=%s" % res["message"])
267
 
268
  if "contact:user.email:readonly" not in res["data"]["scope"].split(" "):
 
281
  except Exception as e:
282
  stat_logger.exception(e)
283
  avatar = ""
284
+ users = user_register(
285
+ user_id,
286
+ {
287
+ "access_token": session["access_token"],
288
+ "email": email_address,
289
+ "avatar": avatar,
290
+ "nickname": user_info["en_name"],
291
+ "login_channel": "feishu",
292
+ "last_login_time": get_format_time(),
293
+ "is_superuser": False,
294
+ },
295
+ )
296
  if not users:
297
+ raise Exception(f"Fail to register {email_address}.")
298
  if len(users) > 1:
299
+ raise Exception(f"Same email: {email_address} exists!")
300
 
301
  # Try to log in
302
  user = users[0]
 
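The Feishu flow above is a two-hop exchange: an app_access_token fetched with the app credentials, then a user access token fetched with the authorization code. A hedged sketch of wrapping the first hop so HTTP failures and non-zero "code" responses raise instead of flowing into the redirect (the helper name is invented; the config keys mirror the ones used in this hunk):

import json
import requests

def feishu_app_access_token(oauth_conf: dict) -> str:
    # First hop: trade app_id/app_secret for a tenant-wide app_access_token.
    res = requests.post(
        oauth_conf["app_access_token_url"],
        data=json.dumps(
            {"app_id": oauth_conf["app_id"], "app_secret": oauth_conf["app_secret"]}
        ),
        headers={"Content-Type": "application/json; charset=utf-8"},
        timeout=10,
    )
    res.raise_for_status()
    payload = res.json()
    if payload.get("code") != 0:
        raise RuntimeError(f"Feishu app_access_token request failed: {payload}")
    return payload["app_access_token"]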
317
 
318
  def user_info_from_feishu(access_token):
319
  import requests
320
+
321
+ headers = {
322
+ "Content-Type": "application/json; charset=utf-8",
323
+ "Authorization": f"Bearer {access_token}",
324
+ }
325
  res = requests.get(
326
+ f"https://open.feishu.cn/open-apis/authen/v1/user_info", headers=headers
327
+ )
328
  user_info = res.json()["data"]
329
  user_info["email"] = None if user_info.get("email") == "" else user_info["email"]
330
  return user_info
 
332
 
333
  def user_info_from_github(access_token):
334
  import requests
335
+
336
+ headers = {"Accept": "application/json", "Authorization": f"token {access_token}"}
337
  res = requests.get(
338
+ f"https://api.github.com/user?access_token={access_token}", headers=headers
339
+ )
340
  user_info = res.json()
341
  email_info = requests.get(
342
  f"https://api.github.com/user/emails?access_token={access_token}",
343
+ headers=headers,
344
+ ).json()
345
  user_info["email"] = next(
346
+ (email for email in email_info if email["primary"]), None
347
+ )["email"]
348
  return user_info
349
 
350
 
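Two caveats about user_info_from_github as reformatted above: GitHub has deprecated passing the token as an access_token query parameter (the Authorization header alone is sufficient), and next(..., None)["email"] raises TypeError for accounts without a primary email. A hedged variant that guards both (the function name is invented):

import requests

def fetch_github_user(access_token: str) -> dict:
    headers = {
        "Accept": "application/json",
        "Authorization": f"token {access_token}",
    }
    user_info = requests.get("https://api.github.com/user", headers=headers).json()
    email_info = requests.get(
        "https://api.github.com/user/emails", headers=headers
    ).json()
    # Fall back to None instead of crashing when no primary email is set.
    primary = next((e for e in email_info if e.get("primary")), None)
    user_info["email"] = primary["email"] if primary else None
    return user_info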
351
+ @manager.route("/logout", methods=["GET"])
352
  @login_required
353
  def log_out():
354
+ """
355
+ User logout endpoint.
356
+ ---
357
+ tags:
358
+ - User
359
+ security:
360
+ - ApiKeyAuth: []
361
+ responses:
362
+ 200:
363
+ description: Logout successful.
364
+ schema:
365
+ type: object
366
+ """
367
  current_user.access_token = ""
368
  current_user.save()
369
  logout_user()
 
373
  @manager.route("/setting", methods=["POST"])
374
  @login_required
375
  def setting_user():
376
+ """
377
+ Update user settings.
378
+ ---
379
+ tags:
380
+ - User
381
+ security:
382
+ - ApiKeyAuth: []
383
+ parameters:
384
+ - in: body
385
+ name: body
386
+ description: User settings to update.
387
+ required: true
388
+ schema:
389
+ type: object
390
+ properties:
391
+ nickname:
392
+ type: string
393
+ description: New nickname.
394
+ email:
395
+ type: string
396
+ description: New email.
397
+ responses:
398
+ 200:
399
+ description: Settings updated successfully.
400
+ schema:
401
+ type: object
402
+ """
403
  update_dict = {}
404
  request_data = request.json
405
  if request_data.get("password"):
406
  new_password = request_data.get("new_password")
407
  if not check_password_hash(
408
+ current_user.password, decrypt(request_data["password"])
409
+ ):
410
+ return get_json_result(
411
+ data=False,
412
+ retcode=RetCode.AUTHENTICATION_ERROR,
413
+ retmsg="Password error!",
414
+ )
415
 
416
  if new_password:
417
  update_dict["password"] = generate_password_hash(decrypt(new_password))
418
 
419
  for k in request_data.keys():
420
+ if k in [
421
+ "password",
422
+ "new_password",
423
+ "email",
424
+ "status",
425
+ "is_superuser",
426
+ "login_channel",
427
+ "is_anonymous",
428
+ "is_active",
429
+ "is_authenticated",
430
+ "last_login_time",
431
+ ]:
432
  continue
433
  update_dict[k] = request_data[k]
434
 
 
437
  return get_json_result(data=True)
438
  except Exception as e:
439
  stat_logger.exception(e)
440
+ return get_json_result(
441
+ data=False, retmsg="Update failure!", retcode=RetCode.EXCEPTION_ERROR
442
+ )
443
 
444
 
445
  @manager.route("/info", methods=["GET"])
446
  @login_required
447
  def user_profile():
448
+ """
449
+ Get user profile information.
450
+ ---
451
+ tags:
452
+ - User
453
+ security:
454
+ - ApiKeyAuth: []
455
+ responses:
456
+ 200:
457
+ description: User profile retrieved successfully.
458
+ schema:
459
+ type: object
460
+ properties:
461
+ id:
462
+ type: string
463
+ description: User ID.
464
+ nickname:
465
+ type: string
466
+ description: User nickname.
467
+ email:
468
+ type: string
469
+ description: User email.
470
+ """
471
  return get_json_result(data=current_user.to_dict())
472
 
473
 
 
502
  "asr_id": ASR_MDL,
503
  "parser_ids": PARSERS,
504
  "img2txt_id": IMAGE2TEXT_MDL,
505
+ "rerank_id": RERANK_MDL,
506
  }
507
  usr_tenant = {
508
  "tenant_id": user_id,
509
  "user_id": user_id,
510
  "invited_by": user_id,
511
+ "role": UserTenantRole.OWNER,
512
  }
513
  file_id = get_uuid()
514
  file = {
 
523
  }
524
  tenant_llm = []
525
  for llm in LLMService.query(fid=LLM_FACTORY):
526
+ tenant_llm.append(
527
+ {
528
+ "tenant_id": user_id,
529
+ "llm_factory": LLM_FACTORY,
530
+ "llm_name": llm.llm_name,
531
+ "model_type": llm.model_type,
532
+ "api_key": API_KEY,
533
+ "api_base": LLM_BASE_URL,
534
+ }
535
+ )
536
 
537
  if not UserService.save(**user):
538
  return
 
546
  @manager.route("/register", methods=["POST"])
547
  @validate_request("nickname", "email", "password")
548
  def user_add():
549
+ """
550
+ Register a new user.
551
+ ---
552
+ tags:
553
+ - User
554
+ parameters:
555
+ - in: body
556
+ name: body
557
+ description: Registration details.
558
+ required: true
559
+ schema:
560
+ type: object
561
+ properties:
562
+ nickname:
563
+ type: string
564
+ description: User nickname.
565
+ email:
566
+ type: string
567
+ description: User email.
568
+ password:
569
+ type: string
570
+ description: User password.
571
+ responses:
572
+ 200:
573
+ description: Registration successful.
574
+ schema:
575
+ type: object
576
+ """
577
  req = request.json
578
  email_address = req["email"]
579
 
580
  # Validate the email address
581
  if not re.match(r"^[\w\._-]+@([\w_-]+\.)+[\w-]{2,5}$", email_address):
582
+ return get_json_result(
583
+ data=False,
584
+ retmsg=f"Invalid email address: {email_address}!",
585
+ retcode=RetCode.OPERATING_ERROR,
586
+ )
587
 
588
  # Check if the email address is already used
589
  if UserService.query(email=email_address):
590
  return get_json_result(
591
  data=False,
592
+ retmsg=f"Email: {email_address} has already registered!",
593
+ retcode=RetCode.OPERATING_ERROR,
594
+ )
595
 
596
  # Construct user info data
597
  nickname = req["nickname"]
 
609
  try:
610
  users = user_register(user_id, user_dict)
611
  if not users:
612
+ raise Exception(f"Fail to register {email_address}.")
613
  if len(users) > 1:
614
+ raise Exception(f"Same email: {email_address} exists!")
615
  user = users[0]
616
  login_user(user)
617
+ return construct_response(
618
+ data=user.to_json(),
619
+ auth=user.get_id(),
620
+ retmsg=f"{nickname}, welcome aboard!",
621
+ )
622
  except Exception as e:
623
  rollback_user_registration(user_id)
624
  stat_logger.exception(e)
625
+ return get_json_result(
626
+ data=False,
627
+ retmsg=f"User registration failure, error: {str(e)}",
628
+ retcode=RetCode.EXCEPTION_ERROR,
629
+ )
630
 
631
 
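A hedged usage sketch for the /register endpoint documented above. The host, port, and URL prefix are assumptions (the blueprint prefix depends on API_VERSION and the page name), and the server calls decrypt() on the submitted password, so it must be encrypted the way the web client encrypts it; a plain string will be rejected:

import requests

resp = requests.post(
    "http://127.0.0.1:9380/v1/user/register",  # assumed host, port, and prefix
    json={
        "nickname": "demo",
        "email": "demo@example.com",
        "password": "<password encrypted the way the web client does>",
    },
)
print(resp.json())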
632
  @manager.route("/tenant_info", methods=["GET"])
633
  @login_required
634
  def tenant_info():
635
+ """
636
+ Get tenant information.
637
+ ---
638
+ tags:
639
+ - Tenant
640
+ security:
641
+ - ApiKeyAuth: []
642
+ responses:
643
+ 200:
644
+ description: Tenant information retrieved successfully.
645
+ schema:
646
+ type: object
647
+ properties:
648
+ tenant_id:
649
+ type: string
650
+ description: Tenant ID.
651
+ name:
652
+ type: string
653
+ description: Tenant name.
654
+ llm_id:
655
+ type: string
656
+ description: LLM ID.
657
+ embd_id:
658
+ type: string
659
+ description: Embedding model ID.
660
+ """
661
  try:
662
  tenants = TenantService.get_info_by(current_user.id)
663
  if not tenants:
 
671
  @login_required
672
  @validate_request("tenant_id", "asr_id", "embd_id", "img2txt_id", "llm_id")
673
  def set_tenant_info():
674
+ """
675
+ Update tenant information.
676
+ ---
677
+ tags:
678
+ - Tenant
679
+ security:
680
+ - ApiKeyAuth: []
681
+ parameters:
682
+ - in: body
683
+ name: body
684
+ description: Tenant information to update.
685
+ required: true
686
+ schema:
687
+ type: object
688
+ properties:
689
+ tenant_id:
690
+ type: string
691
+ description: Tenant ID.
692
+ llm_id:
693
+ type: string
694
+ description: LLM ID.
695
+ embd_id:
696
+ type: string
697
+ description: Embedding model ID.
698
+ asr_id:
699
+ type: string
700
+ description: ASR model ID.
701
+ img2txt_id:
702
+ type: string
703
+ description: Image to Text model ID.
704
+ responses:
705
+ 200:
706
+ description: Tenant information updated successfully.
707
+ schema:
708
+ type: object
709
+ """
710
  req = request.json
711
  try:
712
  tid = req["tenant_id"]
api/ragflow_server.py CHANGED
@@ -27,7 +27,11 @@ from api.apps import app
27
  from api.db.runtime_config import RuntimeConfig
28
  from api.db.services.document_service import DocumentService
29
  from api.settings import (
30
- HOST, HTTP_PORT, access_logger, database_logger, stat_logger,
31
  )
32
  from api import utils
33
 
@@ -45,27 +49,33 @@ def update_progress():
45
  stat_logger.error("update_progress exception:" + str(e))
46
 
47
 
48
- if __name__ == '__main__':
49
- print(r"""
 
50
  ____ ___ ______ ______ __
51
  / __ \ / | / ____// ____// /____ _ __
52
  / /_/ // /| | / / __ / /_ / // __ \| | /| / /
53
  / _, _// ___ |/ /_/ // __/ / // /_/ /| |/ |/ /
54
  /_/ |_|/_/ |_|\____//_/ /_/ \____/ |__/|__/
55
 
56
- """, flush=True)
57
- stat_logger.info(
58
- f'project base: {utils.file_utils.get_project_base_directory()}'
59
  )
60
 
61
  # init db
62
  init_web_db()
63
  init_web_data()
64
  # init runtime config
65
  import argparse
 
66
  parser = argparse.ArgumentParser()
67
- parser.add_argument('--version', default=False, help="rag flow version", action='store_true')
68
- parser.add_argument('--debug', default=False, help="debug mode", action='store_true')
69
  args = parser.parse_args()
70
  if args.version:
71
  print(get_versions())
@@ -78,7 +88,7 @@ if __name__ == '__main__':
78
  RuntimeConfig.init_env()
79
  RuntimeConfig.init_config(JOB_SERVER_HOST=HOST, HTTP_PORT=HTTP_PORT)
80
 
81
- peewee_logger = logging.getLogger('peewee')
82
  peewee_logger.propagate = False
83
  # rag_arch.common.log.ROpenHandler
84
  peewee_logger.addHandler(database_logger.handlers[0])
@@ -93,7 +103,14 @@ if __name__ == '__main__':
93
  werkzeug_logger = logging.getLogger("werkzeug")
94
  for h in access_logger.handlers:
95
  werkzeug_logger.addHandler(h)
96
- run_simple(hostname=HOST, port=HTTP_PORT, application=app, threaded=True, use_reloader=RuntimeConfig.DEBUG, use_debugger=RuntimeConfig.DEBUG)
97
  except Exception:
98
  traceback.print_exc()
99
- os.kill(os.getpid(), signal.SIGKILL)
 
27
  from api.db.runtime_config import RuntimeConfig
28
  from api.db.services.document_service import DocumentService
29
  from api.settings import (
30
+ HOST,
31
+ HTTP_PORT,
32
+ access_logger,
33
+ database_logger,
34
+ stat_logger,
35
  )
36
  from api import utils
37
 
 
49
  stat_logger.error("update_progress exception:" + str(e))
50
 
51
 
52
+ if __name__ == "__main__":
53
+ print(
54
+ r"""
55
  ____ ___ ______ ______ __
56
  / __ \ / | / ____// ____// /____ _ __
57
  / /_/ // /| | / / __ / /_ / // __ \| | /| / /
58
  / _, _// ___ |/ /_/ // __/ / // /_/ /| |/ |/ /
59
  /_/ |_|/_/ |_|\____//_/ /_/ \____/ |__/|__/
60
 
61
+ """,
62
+ flush=True,
63
  )
64
+ stat_logger.info(f"project base: {utils.file_utils.get_project_base_directory()}")
65
 
66
  # init db
67
  init_web_db()
68
  init_web_data()
69
  # init runtime config
70
  import argparse
71
+
72
  parser = argparse.ArgumentParser()
73
+ parser.add_argument(
74
+ "--version", default=False, help="rag flow version", action="store_true"
75
+ )
76
+ parser.add_argument(
77
+ "--debug", default=False, help="debug mode", action="store_true"
78
+ )
79
  args = parser.parse_args()
80
  if args.version:
81
  print(get_versions())
 
88
  RuntimeConfig.init_env()
89
  RuntimeConfig.init_config(JOB_SERVER_HOST=HOST, HTTP_PORT=HTTP_PORT)
90
 
91
+ peewee_logger = logging.getLogger("peewee")
92
  peewee_logger.propagate = False
93
  # rag_arch.common.log.ROpenHandler
94
  peewee_logger.addHandler(database_logger.handlers[0])
 
103
  werkzeug_logger = logging.getLogger("werkzeug")
104
  for h in access_logger.handlers:
105
  werkzeug_logger.addHandler(h)
106
+ run_simple(
107
+ hostname=HOST,
108
+ port=HTTP_PORT,
109
+ application=app,
110
+ threaded=True,
111
+ use_reloader=RuntimeConfig.DEBUG,
112
+ use_debugger=RuntimeConfig.DEBUG,
113
+ )
114
  except Exception:
115
  traceback.print_exc()
116
+ os.kill(os.getpid(), signal.SIGKILL)
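With the entry point otherwise unchanged, the generated documentation can be smoke-tested as soon as the server is up: flasgger serves the raw spec as JSON at /apispec_1.json by default (host and port below are assumptions):

import requests

spec = requests.get("http://127.0.0.1:9380/apispec_1.json").json()
print(spec["info"]["title"], "-", len(spec["paths"]), "documented paths")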
poetry.lock CHANGED
@@ -435,6 +435,17 @@ files = [
435
  {file = "Aspose.Slides-24.10.0-py3-none-win_amd64.whl", hash = "sha256:8980015fbc32c1e70e80444c70a642597511300ead6b352183bf74ba3da67f2d"},
436
  ]
437
 
438
  [[package]]
439
  name = "attrs"
440
  version = "24.2.0"
@@ -1912,7 +1923,10 @@ files = [
1912
  huggingface-hub = ">=0.20,<1.0"
1913
  loguru = ">=0.7.2,<0.8.0"
1914
  mmh3 = ">=4.0,<5.0"
1915
- numpy = {version = ">=1.26,<2", markers = "python_version >= \"3.12\""}
 
 
 
1916
  onnx = ">=1.15.0,<2.0.0"
1917
  onnxruntime = ">=1.17.0,<2.0.0"
1918
  pillow = ">=10.3.0,<11.0.0"
@@ -2037,6 +2051,24 @@ sentence_transformers = "*"
2037
  torch = ">=1.6.0"
2038
  transformers = ">=4.33.0"
2039
 
2040
  [[package]]
2041
  name = "flask"
2042
  version = "3.0.3"
@@ -4381,6 +4413,17 @@ httpx = ">=0.25,<1"
4381
  orjson = ">=3.9.10,<3.11"
4382
  pydantic = ">=2.5.2,<3"
4383
 
4384
  [[package]]
4385
  name = "mkl"
4386
  version = "2021.4.0"
@@ -5149,7 +5192,10 @@ files = [
5149
  ]
5150
 
5151
  [package.dependencies]
5152
- numpy = {version = ">=1.26.0", markers = "python_version >= \"3.12\""}
 
 
 
5153
 
5154
  [[package]]
5155
  name = "opencv-python-headless"
@@ -5168,7 +5214,10 @@ files = [
5168
  ]
5169
 
5170
  [package.dependencies]
5171
- numpy = {version = ">=1.26.0", markers = "python_version >= \"3.12\""}
 
 
 
5172
 
5173
  [[package]]
5174
  name = "openpyxl"
@@ -5350,7 +5399,10 @@ files = [
5350
  ]
5351
 
5352
  [package.dependencies]
5353
- numpy = {version = ">=1.26.0", markers = "python_version >= \"3.12\""}
 
 
 
5354
  python-dateutil = ">=2.8.2"
5355
  pytz = ">=2020.1"
5356
  tzdata = ">=2022.7"
@@ -7009,6 +7061,24 @@ lxml = "*"
7009
  [package.extras]
7010
  test = ["timeout-decorator"]
7011
 
7012
  [[package]]
7013
  name = "referencing"
7014
  version = "0.35.1"
@@ -8468,6 +8538,7 @@ nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"
8468
  nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
8469
  nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
8470
  sympy = "*"
 
8471
  typing-extensions = ">=4.8.0"
8472
 
8473
  [package.extras]
@@ -8611,6 +8682,29 @@ files = [
8611
  trio = ">=0.11"
8612
  wsproto = ">=0.14"
8613
 
8614
  [[package]]
8615
  name = "typer"
8616
  version = "0.12.5"
@@ -9446,5 +9540,5 @@ files = [
9446
 
9447
  [metadata]
9448
  lock-version = "2.0"
9449
- python-versions = ">=3.12,<3.13"
9450
- content-hash = "9c488418342dcd2a1ff625db0da677d086e309c9e4285b46c622f1099af4850f"
 
435
  {file = "Aspose.Slides-24.10.0-py3-none-win_amd64.whl", hash = "sha256:8980015fbc32c1e70e80444c70a642597511300ead6b352183bf74ba3da67f2d"},
436
  ]
437
 
438
+ [[package]]
439
+ name = "async-timeout"
440
+ version = "4.0.3"
441
+ description = "Timeout context manager for asyncio programs"
442
+ optional = false
443
+ python-versions = ">=3.7"
444
+ files = [
445
+ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"},
446
+ {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
447
+ ]
448
+
449
  [[package]]
450
  name = "attrs"
451
  version = "24.2.0"
 
1923
  huggingface-hub = ">=0.20,<1.0"
1924
  loguru = ">=0.7.2,<0.8.0"
1925
  mmh3 = ">=4.0,<5.0"
1926
+ numpy = [
1927
+ {version = ">=1.21,<2", markers = "python_version < \"3.12\""},
1928
+ {version = ">=1.26,<2", markers = "python_version >= \"3.12\""},
1929
+ ]
1930
  onnx = ">=1.15.0,<2.0.0"
1931
  onnxruntime = ">=1.17.0,<2.0.0"
1932
  pillow = ">=10.3.0,<11.0.0"
 
2051
  torch = ">=1.6.0"
2052
  transformers = ">=4.33.0"
2053
 
2054
+ [[package]]
2055
+ name = "flasgger"
2056
+ version = "0.9.7.1"
2057
+ description = "Extract swagger specs from your flask project"
2058
+ optional = false
2059
+ python-versions = "*"
2060
+ files = [
2061
+ {file = "flasgger-0.9.7.1.tar.gz", hash = "sha256:ca098e10bfbb12f047acc6299cc70a33851943a746e550d86e65e60d4df245fb"},
2062
+ ]
2063
+
2064
+ [package.dependencies]
2065
+ Flask = ">=0.10"
2066
+ jsonschema = ">=3.0.1"
2067
+ mistune = "*"
2068
+ packaging = "*"
2069
+ PyYAML = ">=3.0"
2070
+ six = ">=1.10.0"
2071
+
2072
  [[package]]
2073
  name = "flask"
2074
  version = "3.0.3"
 
4413
  orjson = ">=3.9.10,<3.11"
4414
  pydantic = ">=2.5.2,<3"
4415
 
4416
+ [[package]]
4417
+ name = "mistune"
4418
+ version = "3.0.2"
4419
+ description = "A sane and fast Markdown parser with useful plugins and renderers"
4420
+ optional = false
4421
+ python-versions = ">=3.7"
4422
+ files = [
4423
+ {file = "mistune-3.0.2-py3-none-any.whl", hash = "sha256:71481854c30fdbc938963d3605b72501f5c10a9320ecd412c121c163a1c7d205"},
4424
+ {file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"},
4425
+ ]
4426
+
4427
  [[package]]
4428
  name = "mkl"
4429
  version = "2021.4.0"
 
5192
  ]
5193
 
5194
  [package.dependencies]
5195
+ numpy = [
5196
+ {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
5197
+ {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
5198
+ ]
5199
 
5200
  [[package]]
5201
  name = "opencv-python-headless"
 
5214
  ]
5215
 
5216
  [package.dependencies]
5217
+ numpy = [
5218
+ {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
5219
+ {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
5220
+ ]
5221
 
5222
  [[package]]
5223
  name = "openpyxl"
 
5399
  ]
5400
 
5401
  [package.dependencies]
5402
+ numpy = [
5403
+ {version = ">=1.23.2", markers = "python_version == \"3.11\""},
5404
+ {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
5405
+ ]
5406
  python-dateutil = ">=2.8.2"
5407
  pytz = ">=2020.1"
5408
  tzdata = ">=2022.7"
 
7061
  [package.extras]
7062
  test = ["timeout-decorator"]
7063
 
7064
+ [[package]]
7065
+ name = "redis"
7066
+ version = "5.0.3"
7067
+ description = "Python client for Redis database and key-value store"
7068
+ optional = false
7069
+ python-versions = ">=3.7"
7070
+ files = [
7071
+ {file = "redis-5.0.3-py3-none-any.whl", hash = "sha256:5da9b8fe9e1254293756c16c008e8620b3d15fcc6dde6babde9541850e72a32d"},
7072
+ {file = "redis-5.0.3.tar.gz", hash = "sha256:4973bae7444c0fbed64a06b87446f79361cb7e4ec1538c022d696ed7a5015580"},
7073
+ ]
7074
+
7075
+ [package.dependencies]
7076
+ async-timeout = {version = ">=4.0.3", markers = "python_full_version < \"3.11.3\""}
7077
+
7078
+ [package.extras]
7079
+ hiredis = ["hiredis (>=1.0.0)"]
7080
+ ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"]
7081
+
7082
  [[package]]
7083
  name = "referencing"
7084
  version = "0.35.1"
 
8538
  nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
8539
  nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
8540
  sympy = "*"
8541
+ triton = {version = "2.3.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.12\""}
8542
  typing-extensions = ">=4.8.0"
8543
 
8544
  [package.extras]
 
8682
  trio = ">=0.11"
8683
  wsproto = ">=0.14"
8684
 
8685
+ [[package]]
8686
+ name = "triton"
8687
+ version = "2.3.0"
8688
+ description = "A language and compiler for custom Deep Learning operations"
8689
+ optional = false
8690
+ python-versions = "*"
8691
+ files = [
8692
+ {file = "triton-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ce4b8ff70c48e47274c66f269cce8861cf1dc347ceeb7a67414ca151b1822d8"},
8693
+ {file = "triton-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c3d9607f85103afdb279938fc1dd2a66e4f5999a58eb48a346bd42738f986dd"},
8694
+ {file = "triton-2.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:218d742e67480d9581bafb73ed598416cc8a56f6316152e5562ee65e33de01c0"},
8695
+ {file = "triton-2.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:381ec6b3dac06922d3e4099cfc943ef032893b25415de295e82b1a82b0359d2c"},
8696
+ {file = "triton-2.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:038e06a09c06a164fef9c48de3af1e13a63dc1ba3c792871e61a8e79720ea440"},
8697
+ {file = "triton-2.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d8f636e0341ac348899a47a057c3daea99ea7db31528a225a3ba4ded28ccc65"},
8698
+ ]
8699
+
8700
+ [package.dependencies]
8701
+ filelock = "*"
8702
+
8703
+ [package.extras]
8704
+ build = ["cmake (>=3.20)", "lit"]
8705
+ tests = ["autopep8", "flake8", "isort", "numpy", "pytest", "scipy (>=1.7.1)", "torch"]
8706
+ tutorials = ["matplotlib", "pandas", "tabulate", "torch"]
8707
+
8708
  [[package]]
8709
  name = "typer"
8710
  version = "0.12.5"
 
9540
 
9541
  [metadata]
9542
  lock-version = "2.0"
9543
+ python-versions = ">=3.11,<3.13"
9544
+ content-hash = "74a9b4afef47cc36d638b43fd918ece27d65259af1ca9e5b17f6b239774e8bf9"
pyproject.toml CHANGED
@@ -8,7 +8,7 @@ readme = "README.md"
8
  package-mode = false
9
 
10
  [tool.poetry.dependencies]
11
- python = ">=3.12,<3.13"
12
  datrie = "0.8.2"
13
  akshare = "^1.14.81"
14
  azure-storage-blob = "12.22.0"
@@ -114,6 +114,7 @@ graspologic = "^3.4.1"
114
  pymysql = "^1.1.1"
115
  mini-racer = "^0.12.4"
116
  pyicu = "^2.13.1"
 
117
 
118
 
119
  [tool.poetry.group.full]
 
8
  package-mode = false
9
 
10
  [tool.poetry.dependencies]
11
+ python = ">=3.11,<3.13"
12
  datrie = "0.8.2"
13
  akshare = "^1.14.81"
14
  azure-storage-blob = "12.22.0"
 
114
  pymysql = "^1.1.1"
115
  mini-racer = "^0.12.4"
116
  pyicu = "^2.13.1"
117
+ flasgger = "^0.9.7.1"
118
 
119
 
120
  [tool.poetry.group.full]