gzdaniel committed on
Commit
e2283ef
·
1 Parent(s): 8acdc4e

feat(postgres): Implement text_chunks upsert for PGKVStorage

Browse files
lightrag/kg/postgres_impl.py CHANGED
@@ -520,7 +520,21 @@ class PGKVStorage(BaseKVStorage):
520
  return
521
 
522
  if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
523
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  elif is_namespace(self.namespace, NameSpace.KV_STORE_FULL_DOCS):
525
  for k, v in data.items():
526
  upsert_sql = SQL_TEMPLATES["upsert_doc_full"]
@@ -2409,7 +2423,7 @@ class PGGraphStorage(BaseGraphStorage):
2409
  NAMESPACE_TABLE_MAP = {
2410
  NameSpace.KV_STORE_FULL_DOCS: "LIGHTRAG_DOC_FULL",
2411
  NameSpace.KV_STORE_TEXT_CHUNKS: "LIGHTRAG_DOC_CHUNKS",
2412
- NameSpace.VECTOR_STORE_CHUNKS: "LIGHTRAG_DOC_CHUNKS",
2413
  NameSpace.VECTOR_STORE_ENTITIES: "LIGHTRAG_VDB_ENTITY",
2414
  NameSpace.VECTOR_STORE_RELATIONSHIPS: "LIGHTRAG_VDB_RELATION",
2415
  NameSpace.DOC_STATUS: "LIGHTRAG_DOC_STATUS",
@@ -2444,13 +2458,27 @@ TABLES = {
2444
  chunk_order_index INTEGER,
2445
  tokens INTEGER,
2446
  content TEXT,
2447
- content_vector VECTOR,
2448
  file_path VARCHAR(256),
2449
  create_time TIMESTAMP(0) WITH TIME ZONE,
2450
  update_time TIMESTAMP(0) WITH TIME ZONE,
2451
  CONSTRAINT LIGHTRAG_DOC_CHUNKS_PK PRIMARY KEY (workspace, id)
2452
  )"""
2453
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2454
  "LIGHTRAG_VDB_ENTITY": {
2455
  "ddl": """CREATE TABLE LIGHTRAG_VDB_ENTITY (
2456
  id VARCHAR(255),
@@ -2551,7 +2579,20 @@ SQL_TEMPLATES = {
2551
  chunk_id=EXCLUDED.chunk_id,
2552
  update_time = CURRENT_TIMESTAMP
2553
  """,
2554
- "upsert_chunk": """INSERT INTO LIGHTRAG_DOC_CHUNKS (workspace, id, tokens,
 
 
 
 
 
 
 
 
 
 
 
 
 
2555
  chunk_order_index, full_doc_id, content, content_vector, file_path,
2556
  create_time, update_time)
2557
  VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
@@ -2564,7 +2605,6 @@ SQL_TEMPLATES = {
2564
  file_path=EXCLUDED.file_path,
2565
  update_time = EXCLUDED.update_time
2566
  """,
2567
- # SQL for VectorStorage
2568
  "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
2569
  content_vector, chunk_ids, file_path, create_time, update_time)
2570
  VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7, $8, $9)
@@ -2625,7 +2665,7 @@ SQL_TEMPLATES = {
2625
  "chunks": """
2626
  WITH relevant_chunks AS (
2627
  SELECT id as chunk_id
2628
- FROM LIGHTRAG_DOC_CHUNKS
2629
  WHERE $2::varchar[] IS NULL OR full_doc_id = ANY($2::varchar[])
2630
  )
2631
  SELECT id, content, file_path, EXTRACT(EPOCH FROM create_time)::BIGINT as created_at FROM
 
520
  return
521
 
522
  if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
523
+ current_time = datetime.datetime.now(timezone.utc)
524
+ for k, v in data.items():
525
+ upsert_sql = SQL_TEMPLATES["upsert_text_chunk"]
526
+ _data = {
527
+ "workspace": self.db.workspace,
528
+ "id": k,
529
+ "tokens": v["tokens"],
530
+ "chunk_order_index": v["chunk_order_index"],
531
+ "full_doc_id": v["full_doc_id"],
532
+ "content": v["content"],
533
+ "file_path": v["file_path"],
534
+ "create_time": current_time,
535
+ "update_time": current_time,
536
+ }
537
+ await self.db.execute(upsert_sql, _data)
538
  elif is_namespace(self.namespace, NameSpace.KV_STORE_FULL_DOCS):
539
  for k, v in data.items():
540
  upsert_sql = SQL_TEMPLATES["upsert_doc_full"]
 
2423
  NAMESPACE_TABLE_MAP = {
2424
  NameSpace.KV_STORE_FULL_DOCS: "LIGHTRAG_DOC_FULL",
2425
  NameSpace.KV_STORE_TEXT_CHUNKS: "LIGHTRAG_DOC_CHUNKS",
2426
+ NameSpace.VECTOR_STORE_CHUNKS: "LIGHTRAG_VDB_CHUNKS",
2427
  NameSpace.VECTOR_STORE_ENTITIES: "LIGHTRAG_VDB_ENTITY",
2428
  NameSpace.VECTOR_STORE_RELATIONSHIPS: "LIGHTRAG_VDB_RELATION",
2429
  NameSpace.DOC_STATUS: "LIGHTRAG_DOC_STATUS",
 
2458
  chunk_order_index INTEGER,
2459
  tokens INTEGER,
2460
  content TEXT,
 
2461
  file_path VARCHAR(256),
2462
  create_time TIMESTAMP(0) WITH TIME ZONE,
2463
  update_time TIMESTAMP(0) WITH TIME ZONE,
2464
  CONSTRAINT LIGHTRAG_DOC_CHUNKS_PK PRIMARY KEY (workspace, id)
2465
  )"""
2466
  },
2467
+ "LIGHTRAG_VDB_CHUNKS": {
2468
+ "ddl": """CREATE TABLE LIGHTRAG_VDB_CHUNKS (
2469
+ id VARCHAR(255),
2470
+ workspace VARCHAR(255),
2471
+ full_doc_id VARCHAR(256),
2472
+ chunk_order_index INTEGER,
2473
+ tokens INTEGER,
2474
+ content TEXT,
2475
+ content_vector VECTOR,
2476
+ file_path VARCHAR(256),
2477
+ create_time TIMESTAMP(0) WITH TIME ZONE,
2478
+ update_time TIMESTAMP(0) WITH TIME ZONE,
2479
+ CONSTRAINT LIGHTRAG_VDB_CHUNKS_PK PRIMARY KEY (workspace, id)
2480
+ )"""
2481
+ },
2482
  "LIGHTRAG_VDB_ENTITY": {
2483
  "ddl": """CREATE TABLE LIGHTRAG_VDB_ENTITY (
2484
  id VARCHAR(255),
 
2579
  chunk_id=EXCLUDED.chunk_id,
2580
  update_time = CURRENT_TIMESTAMP
2581
  """,
2582
+ "upsert_text_chunk": """INSERT INTO LIGHTRAG_DOC_CHUNKS (workspace, id, tokens,
2583
+ chunk_order_index, full_doc_id, content, file_path,
2584
+ create_time, update_time)
2585
+ VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
2586
+ ON CONFLICT (workspace,id) DO UPDATE
2587
+ SET tokens=EXCLUDED.tokens,
2588
+ chunk_order_index=EXCLUDED.chunk_order_index,
2589
+ full_doc_id=EXCLUDED.full_doc_id,
2590
+ content = EXCLUDED.content,
2591
+ file_path=EXCLUDED.file_path,
2592
+ update_time = EXCLUDED.update_time
2593
+ """,
2594
+ # SQL for VectorStorage
2595
+ "upsert_chunk": """INSERT INTO LIGHTRAG_VDB_CHUNKS (workspace, id, tokens,
2596
  chunk_order_index, full_doc_id, content, content_vector, file_path,
2597
  create_time, update_time)
2598
  VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
 
2605
  file_path=EXCLUDED.file_path,
2606
  update_time = EXCLUDED.update_time
2607
  """,
 
2608
  "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
2609
  content_vector, chunk_ids, file_path, create_time, update_time)
2610
  VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7, $8, $9)
 
2665
  "chunks": """
2666
  WITH relevant_chunks AS (
2667
  SELECT id as chunk_id
2668
+ FROM LIGHTRAG_VDB_CHUNKS
2669
  WHERE $2::varchar[] IS NULL OR full_doc_id = ANY($2::varchar[])
2670
  )
2671
  SELECT id, content, file_path, EXTRACT(EPOCH FROM create_time)::BIGINT as created_at FROM
lightrag/lightrag.py CHANGED
@@ -394,13 +394,13 @@ class LightRAG:
394
  embedding_func=self.embedding_func,
395
  )
396
 
397
- # TODO: deprecating, text_chunks is redundant with chunks_vdb
398
  self.text_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
399
  namespace=make_namespace(
400
  self.namespace_prefix, NameSpace.KV_STORE_TEXT_CHUNKS
401
  ),
402
  embedding_func=self.embedding_func,
403
  )
 
404
  self.chunk_entity_relation_graph: BaseGraphStorage = self.graph_storage_cls( # type: ignore
405
  namespace=make_namespace(
406
  self.namespace_prefix, NameSpace.GRAPH_STORE_CHUNK_ENTITY_RELATION
 
394
  embedding_func=self.embedding_func,
395
  )
396
 
 
397
  self.text_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
398
  namespace=make_namespace(
399
  self.namespace_prefix, NameSpace.KV_STORE_TEXT_CHUNKS
400
  ),
401
  embedding_func=self.embedding_func,
402
  )
403
+
404
  self.chunk_entity_relation_graph: BaseGraphStorage = self.graph_storage_cls( # type: ignore
405
  namespace=make_namespace(
406
  self.namespace_prefix, NameSpace.GRAPH_STORE_CHUNK_ENTITY_RELATION