chrysanthemum-boy FannC committed on
Commit
caa112b
·
1 Parent(s): c712315

Add upload file by knowledge base name API. (#539)

Browse files

### What problem does this PR solve?
Add upload file by knowledge base name API.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Documentation Update

---------

Co-authored-by: chrysanthemum-boy <fannc@qq.com>

api/apps/api_app.py CHANGED
@@ -13,18 +13,28 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
 
 
16
  from datetime import datetime, timedelta
17
  from flask import request
18
  from flask_login import login_required, current_user
 
 
19
  from api.db.db_models import APIToken, API4Conversation
 
20
  from api.db.services.api_service import APITokenService, API4ConversationService
21
  from api.db.services.dialog_service import DialogService, chat
 
 
22
  from api.db.services.user_service import UserTenantService
23
  from api.settings import RetCode
24
  from api.utils import get_uuid, current_timestamp, datetime_format
25
  from api.utils.api_utils import server_error_response, get_data_error_result, get_json_result, validate_request
26
  from itsdangerous import URLSafeTimedSerializer
27
 
 
 
 
28
 
29
  def generate_confirmation_token(tenent_id):
30
  serializer = URLSafeTimedSerializer(tenent_id)
@@ -191,4 +201,74 @@ def get(conversation_id):
191
 
192
  return get_json_result(data=conv.to_dict())
193
  except Exception as e:
194
- return server_error_response(e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
16
+ import os
17
+ import re
18
  from datetime import datetime, timedelta
19
  from flask import request
20
  from flask_login import login_required, current_user
21
+
22
+ from api.db import FileType, ParserType
23
  from api.db.db_models import APIToken, API4Conversation
24
+ from api.db.services import duplicate_name
25
  from api.db.services.api_service import APITokenService, API4ConversationService
26
  from api.db.services.dialog_service import DialogService, chat
27
+ from api.db.services.document_service import DocumentService
28
+ from api.db.services.knowledgebase_service import KnowledgebaseService
29
  from api.db.services.user_service import UserTenantService
30
  from api.settings import RetCode
31
  from api.utils import get_uuid, current_timestamp, datetime_format
32
  from api.utils.api_utils import server_error_response, get_data_error_result, get_json_result, validate_request
33
  from itsdangerous import URLSafeTimedSerializer
34
 
35
+ from api.utils.file_utils import filename_type, thumbnail
36
+ from rag.utils import MINIO
37
+
38
 
39
  def generate_confirmation_token(tenent_id):
40
  serializer = URLSafeTimedSerializer(tenent_id)
 
201
 
202
  return get_json_result(data=conv.to_dict())
203
  except Exception as e:
204
+ return server_error_response(e)
205
+
206
+
207
@manager.route('/document/upload', methods=['POST'])
@validate_request("kb_name")
def upload():
    """Upload one file into the knowledge base named by the ``kb_name`` form field.

    The caller authenticates with an API token carried as the second
    whitespace-separated part of the ``Authorization`` header
    (``Authorization: <scheme> <token>``). The tenant attached to that token is
    used to resolve ``kb_name`` through ``KnowledgebaseService.get_by_name``.
    The upload itself is read from the multipart field ``file``, written with
    ``MINIO.put`` and registered with ``DocumentService.insert``.

    Returns:
        A JSON response. On success its ``data`` is ``doc.to_json()`` of the
        inserted document. Otherwise it is an error result for: an invalid or
        missing token, an unknown knowledge base, a missing/empty file part,
        the per-tenant file limit (``MAX_FILE_NUM_PER_USER``, default 8192)
        being reached, or an unsupported file type. Unexpected exceptions are
        converted with ``server_error_response``.
    """
    # A missing or malformed Authorization header must yield an authentication
    # error instead of raising AttributeError/IndexError outside any handler.
    auth_parts = (request.headers.get('Authorization') or '').split()
    if len(auth_parts) < 2:
        return get_json_result(
            data=False, retmsg='Token is not valid!', retcode=RetCode.AUTHENTICATION_ERROR)

    objs = APIToken.query(token=auth_parts[1])
    if not objs:
        return get_json_result(
            data=False, retmsg='Token is not valid!', retcode=RetCode.AUTHENTICATION_ERROR)

    # Tolerate an absent form field: the lookup below then simply finds nothing.
    kb_name = (request.form.get("kb_name") or "").strip()
    tenant_id = objs[0].tenant_id

    try:
        found, kb = KnowledgebaseService.get_by_name(kb_name, tenant_id)
        if not found:
            return get_data_error_result(
                retmsg="Can't find this knowledgebase!")
        kb_id = kb.id
    except Exception as exc:
        return server_error_response(exc)

    if 'file' not in request.files:
        return get_json_result(
            data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)

    file = request.files['file']
    if file.filename == '':
        return get_json_result(
            data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)

    try:
        max_files = int(os.environ.get('MAX_FILE_NUM_PER_USER', 8192))
        if DocumentService.get_doc_count(kb.tenant_id) >= max_files:
            return get_data_error_result(
                retmsg="Exceed the maximum file number of a free user!")

        # Pick a document name that does not clash with existing documents of
        # this knowledge base.
        filename = duplicate_name(
            DocumentService.query,
            name=file.filename,
            kb_id=kb_id)
        filetype = filename_type(filename)
        if not filetype:
            return get_data_error_result(
                retmsg="This type of file has not been supported yet!")

        # Append "_" until MINIO reports no existing object at this location.
        location = filename
        while MINIO.obj_exist(kb_id, location):
            location += "_"

        blob = file.read()
        MINIO.put(kb_id, location, blob)

        doc = {
            "id": get_uuid(),
            "kb_id": kb.id,
            "parser_id": kb.parser_id,
            "parser_config": kb.parser_config,
            "created_by": kb.tenant_id,
            "type": filetype,
            "name": filename,
            "location": location,
            "size": len(blob),
            "thumbnail": thumbnail(filename, blob),
        }
        # Override the knowledge base's default parser for images and slides.
        if doc["type"] == FileType.VISUAL:
            doc["parser_id"] = ParserType.PICTURE.value
        # NOTE(review): this pattern is case-sensitive, so an upper-case
        # extension such as ".PPTX" is not matched - confirm that is intended.
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            doc["parser_id"] = ParserType.PRESENTATION.value

        doc = DocumentService.insert(doc)
        return get_json_result(data=doc.to_json())
    except Exception as exc:
        return server_error_response(exc)
api/db/services/knowledgebase_service.py CHANGED
@@ -27,7 +27,8 @@ class KnowledgebaseService(CommonService):
27
  page_number, items_per_page, orderby, desc):
28
  kbs = cls.model.select().where(
29
  ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
30
- TenantPermission.TEAM.value)) | (cls.model.tenant_id == user_id))
 
31
  & (cls.model.status == StatusEnum.VALID.value)
32
  )
33
  if desc:
@@ -56,7 +57,8 @@ class KnowledgebaseService(CommonService):
56
  cls.model.chunk_num,
57
  cls.model.parser_id,
58
  cls.model.parser_config]
59
- kbs = cls.model.select(*fields).join(Tenant, on=((Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
 
60
  (cls.model.id == kb_id),
61
  (cls.model.status == StatusEnum.VALID.value)
62
  )
@@ -86,6 +88,7 @@ class KnowledgebaseService(CommonService):
86
  old[k] = list(set(old[k] + v))
87
  else:
88
  old[k] = v
 
89
  dfs_update(m.parser_config, config)
90
  cls.update_by_id(id, {"parser_config": m.parser_config})
91
 
@@ -97,3 +100,15 @@ class KnowledgebaseService(CommonService):
97
  if k.parser_config and "field_map" in k.parser_config:
98
  conf.update(k.parser_config["field_map"])
99
  return conf
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  page_number, items_per_page, orderby, desc):
28
  kbs = cls.model.select().where(
29
  ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
30
+ TenantPermission.TEAM.value)) | (
31
+ cls.model.tenant_id == user_id))
32
  & (cls.model.status == StatusEnum.VALID.value)
33
  )
34
  if desc:
 
57
  cls.model.chunk_num,
58
  cls.model.parser_id,
59
  cls.model.parser_config]
60
+ kbs = cls.model.select(*fields).join(Tenant, on=(
61
+ (Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
62
  (cls.model.id == kb_id),
63
  (cls.model.status == StatusEnum.VALID.value)
64
  )
 
88
  old[k] = list(set(old[k] + v))
89
  else:
90
  old[k] = v
91
+
92
  dfs_update(m.parser_config, config)
93
  cls.update_by_id(id, {"parser_config": m.parser_config})
94
 
 
100
  if k.parser_config and "field_map" in k.parser_config:
101
  conf.update(k.parser_config["field_map"])
102
  return conf
103
+
104
@classmethod
@DB.connection_context()
def get_by_name(cls, kb_name, tenant_id):
    """Look up a valid knowledge base by name within one tenant.

    Args:
        kb_name: Value compared against the model's ``name`` column.
        tenant_id: Value compared against the model's ``tenant_id`` column.

    Returns:
        ``(True, record)`` with the first matching row whose status equals
        ``StatusEnum.VALID.value``, or ``(False, None)`` when nothing matches.
    """
    model = cls.model
    matches = model.select().where(
        (model.name == kb_name)
        & (model.tenant_id == tenant_id)
        & (model.status == StatusEnum.VALID.value)
    )
    # Guard clause: report "not found" first, then hand back the first row.
    if not matches:
        return False, None
    return True, matches[0]
docs/conversation_api.md CHANGED
@@ -303,5 +303,61 @@ This will be called to get the answer to users' questions.
303
  ## Get document content or image
304
 
305
  This is usually used to display the content of a citation.
306
- ### Path: /document/get/\<id\>
307
  ### Method: GET
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  ## Get document content or image
304
 
305
  This is usually used to display the content of a citation.
306
+ ### Path: /api/document/get/\<id\>
307
  ### Method: GET
308
+
309
+ ## Upload file
310
+
311
+ This is usually used to upload a file to a knowledge base.
312
+ ### Path: /api/document/upload
313
+ ### Method: POST
314
+
315
+ ### Parameter:
316
+
317
+ | name | type | optional | description |
318
+ |---------|--------|----------|----------------------------------------|
319
+ | file | file | No | Upload file. |
320
+ | kb_name | string | No | Name of the knowledge base to upload the file to. |
321
+
322
+ ### Response
323
+ ```json
324
+ {
325
+ "data": {
326
+ "chunk_num": 0,
327
+ "create_date": "Thu, 25 Apr 2024 14:30:06 GMT",
328
+ "create_time": 1714026606921,
329
+ "created_by": "553ec818fd5711ee8ea63043d7ed348e",
330
+ "id": "41e9324602cd11ef9f5f3043d7ed348e",
331
+ "kb_id": "06802686c0a311ee85d6246e9694c130",
332
+ "location": "readme.txt",
333
+ "name": "readme.txt",
334
+ "parser_config": {
335
+ "field_map": {
336
+ },
337
+ "pages": [
338
+ [
339
+ 0,
340
+ 1000000
341
+ ]
342
+ ]
343
+ },
344
+ "parser_id": "general",
345
+ "process_begin_at": null,
346
+ "process_duation": 0.0,
347
+ "progress": 0.0,
348
+ "progress_msg": "",
349
+ "run": "0",
350
+ "size": 929,
351
+ "source_type": "local",
352
+ "status": "1",
353
+ "thumbnail": null,
354
+ "token_num": 0,
355
+ "type": "doc",
356
+ "update_date": "Thu, 25 Apr 2024 14:30:06 GMT",
357
+ "update_time": 1714026606921
358
+ },
359
+ "retcode": 0,
360
+ "retmsg": "success"
361
+ }
362
+
363
+ ```