feat(postgres): Implement text_chunks upsert for PGKVStorage
Browse files
- lightrag/kg/postgres_impl.py +46 -6
- lightrag/lightrag.py +1 -1
lightrag/kg/postgres_impl.py
CHANGED
@@ -520,7 +520,21 @@ class PGKVStorage(BaseKVStorage):
|
|
520 |
return
|
521 |
|
522 |
if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
|
523 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
524 |
elif is_namespace(self.namespace, NameSpace.KV_STORE_FULL_DOCS):
|
525 |
for k, v in data.items():
|
526 |
upsert_sql = SQL_TEMPLATES["upsert_doc_full"]
|
@@ -2409,7 +2423,7 @@ class PGGraphStorage(BaseGraphStorage):
|
|
2409 |
NAMESPACE_TABLE_MAP = {
|
2410 |
NameSpace.KV_STORE_FULL_DOCS: "LIGHTRAG_DOC_FULL",
|
2411 |
NameSpace.KV_STORE_TEXT_CHUNKS: "LIGHTRAG_DOC_CHUNKS",
|
2412 |
-
NameSpace.VECTOR_STORE_CHUNKS: "LIGHTRAG_DOC_CHUNKS",
|
2413 |
NameSpace.VECTOR_STORE_ENTITIES: "LIGHTRAG_VDB_ENTITY",
|
2414 |
NameSpace.VECTOR_STORE_RELATIONSHIPS: "LIGHTRAG_VDB_RELATION",
|
2415 |
NameSpace.DOC_STATUS: "LIGHTRAG_DOC_STATUS",
|
@@ -2444,13 +2458,27 @@ TABLES = {
|
|
2444 |
chunk_order_index INTEGER,
|
2445 |
tokens INTEGER,
|
2446 |
content TEXT,
|
2447 |
-
content_vector VECTOR,
|
2448 |
file_path VARCHAR(256),
|
2449 |
create_time TIMESTAMP(0) WITH TIME ZONE,
|
2450 |
update_time TIMESTAMP(0) WITH TIME ZONE,
|
2451 |
CONSTRAINT LIGHTRAG_DOC_CHUNKS_PK PRIMARY KEY (workspace, id)
|
2452 |
)"""
|
2453 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2454 |
"LIGHTRAG_VDB_ENTITY": {
|
2455 |
"ddl": """CREATE TABLE LIGHTRAG_VDB_ENTITY (
|
2456 |
id VARCHAR(255),
|
@@ -2551,7 +2579,20 @@ SQL_TEMPLATES = {
|
|
2551 |
chunk_id=EXCLUDED.chunk_id,
|
2552 |
update_time = CURRENT_TIMESTAMP
|
2553 |
""",
|
2554 |
-
"upsert_chunk": """INSERT INTO LIGHTRAG_DOC_CHUNKS (workspace, id, tokens,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2555 |
chunk_order_index, full_doc_id, content, content_vector, file_path,
|
2556 |
create_time, update_time)
|
2557 |
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
|
@@ -2564,7 +2605,6 @@ SQL_TEMPLATES = {
|
|
2564 |
file_path=EXCLUDED.file_path,
|
2565 |
update_time = EXCLUDED.update_time
|
2566 |
""",
|
2567 |
-
# SQL for VectorStorage
|
2568 |
"upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
|
2569 |
content_vector, chunk_ids, file_path, create_time, update_time)
|
2570 |
VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7, $8, $9)
|
@@ -2625,7 +2665,7 @@ SQL_TEMPLATES = {
|
|
2625 |
"chunks": """
|
2626 |
WITH relevant_chunks AS (
|
2627 |
SELECT id as chunk_id
|
2628 |
-
FROM LIGHTRAG_DOC_CHUNKS
|
2629 |
WHERE $2::varchar[] IS NULL OR full_doc_id = ANY($2::varchar[])
|
2630 |
)
|
2631 |
SELECT id, content, file_path, EXTRACT(EPOCH FROM create_time)::BIGINT as created_at FROM
|
|
|
520 |
return
|
521 |
|
522 |
if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
|
523 |
+
current_time = datetime.datetime.now(timezone.utc)
|
524 |
+
for k, v in data.items():
|
525 |
+
upsert_sql = SQL_TEMPLATES["upsert_text_chunk"]
|
526 |
+
_data = {
|
527 |
+
"workspace": self.db.workspace,
|
528 |
+
"id": k,
|
529 |
+
"tokens": v["tokens"],
|
530 |
+
"chunk_order_index": v["chunk_order_index"],
|
531 |
+
"full_doc_id": v["full_doc_id"],
|
532 |
+
"content": v["content"],
|
533 |
+
"file_path": v["file_path"],
|
534 |
+
"create_time": current_time,
|
535 |
+
"update_time": current_time,
|
536 |
+
}
|
537 |
+
await self.db.execute(upsert_sql, _data)
|
538 |
elif is_namespace(self.namespace, NameSpace.KV_STORE_FULL_DOCS):
|
539 |
for k, v in data.items():
|
540 |
upsert_sql = SQL_TEMPLATES["upsert_doc_full"]
|
|
|
2423 |
NAMESPACE_TABLE_MAP = {
|
2424 |
NameSpace.KV_STORE_FULL_DOCS: "LIGHTRAG_DOC_FULL",
|
2425 |
NameSpace.KV_STORE_TEXT_CHUNKS: "LIGHTRAG_DOC_CHUNKS",
|
2426 |
+
NameSpace.VECTOR_STORE_CHUNKS: "LIGHTRAG_VDB_CHUNKS",
|
2427 |
NameSpace.VECTOR_STORE_ENTITIES: "LIGHTRAG_VDB_ENTITY",
|
2428 |
NameSpace.VECTOR_STORE_RELATIONSHIPS: "LIGHTRAG_VDB_RELATION",
|
2429 |
NameSpace.DOC_STATUS: "LIGHTRAG_DOC_STATUS",
|
|
|
2458 |
chunk_order_index INTEGER,
|
2459 |
tokens INTEGER,
|
2460 |
content TEXT,
|
|
|
2461 |
file_path VARCHAR(256),
|
2462 |
create_time TIMESTAMP(0) WITH TIME ZONE,
|
2463 |
update_time TIMESTAMP(0) WITH TIME ZONE,
|
2464 |
CONSTRAINT LIGHTRAG_DOC_CHUNKS_PK PRIMARY KEY (workspace, id)
|
2465 |
)"""
|
2466 |
},
|
2467 |
+
"LIGHTRAG_VDB_CHUNKS": {
|
2468 |
+
"ddl": """CREATE TABLE LIGHTRAG_VDB_CHUNKS (
|
2469 |
+
id VARCHAR(255),
|
2470 |
+
workspace VARCHAR(255),
|
2471 |
+
full_doc_id VARCHAR(256),
|
2472 |
+
chunk_order_index INTEGER,
|
2473 |
+
tokens INTEGER,
|
2474 |
+
content TEXT,
|
2475 |
+
content_vector VECTOR,
|
2476 |
+
file_path VARCHAR(256),
|
2477 |
+
create_time TIMESTAMP(0) WITH TIME ZONE,
|
2478 |
+
update_time TIMESTAMP(0) WITH TIME ZONE,
|
2479 |
+
CONSTRAINT LIGHTRAG_VDB_CHUNKS_PK PRIMARY KEY (workspace, id)
|
2480 |
+
)"""
|
2481 |
+
},
|
2482 |
"LIGHTRAG_VDB_ENTITY": {
|
2483 |
"ddl": """CREATE TABLE LIGHTRAG_VDB_ENTITY (
|
2484 |
id VARCHAR(255),
|
|
|
2579 |
chunk_id=EXCLUDED.chunk_id,
|
2580 |
update_time = CURRENT_TIMESTAMP
|
2581 |
""",
|
2582 |
+
"upsert_text_chunk": """INSERT INTO LIGHTRAG_DOC_CHUNKS (workspace, id, tokens,
|
2583 |
+
chunk_order_index, full_doc_id, content, file_path,
|
2584 |
+
create_time, update_time)
|
2585 |
+
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
|
2586 |
+
ON CONFLICT (workspace,id) DO UPDATE
|
2587 |
+
SET tokens=EXCLUDED.tokens,
|
2588 |
+
chunk_order_index=EXCLUDED.chunk_order_index,
|
2589 |
+
full_doc_id=EXCLUDED.full_doc_id,
|
2590 |
+
content = EXCLUDED.content,
|
2591 |
+
file_path=EXCLUDED.file_path,
|
2592 |
+
update_time = EXCLUDED.update_time
|
2593 |
+
""",
|
2594 |
+
# SQL for VectorStorage
|
2595 |
+
"upsert_chunk": """INSERT INTO LIGHTRAG_VDB_CHUNKS (workspace, id, tokens,
|
2596 |
chunk_order_index, full_doc_id, content, content_vector, file_path,
|
2597 |
create_time, update_time)
|
2598 |
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
|
|
|
2605 |
file_path=EXCLUDED.file_path,
|
2606 |
update_time = EXCLUDED.update_time
|
2607 |
""",
|
|
|
2608 |
"upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
|
2609 |
content_vector, chunk_ids, file_path, create_time, update_time)
|
2610 |
VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7, $8, $9)
|
|
|
2665 |
"chunks": """
|
2666 |
WITH relevant_chunks AS (
|
2667 |
SELECT id as chunk_id
|
2668 |
+
FROM LIGHTRAG_VDB_CHUNKS
|
2669 |
WHERE $2::varchar[] IS NULL OR full_doc_id = ANY($2::varchar[])
|
2670 |
)
|
2671 |
SELECT id, content, file_path, EXTRACT(EPOCH FROM create_time)::BIGINT as created_at FROM
|
lightrag/lightrag.py
CHANGED
@@ -394,13 +394,13 @@ class LightRAG:
|
|
394 |
embedding_func=self.embedding_func,
|
395 |
)
|
396 |
|
397 |
-
# TODO: deprecating, text_chunks is redundant with chunks_vdb
|
398 |
self.text_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
|
399 |
namespace=make_namespace(
|
400 |
self.namespace_prefix, NameSpace.KV_STORE_TEXT_CHUNKS
|
401 |
),
|
402 |
embedding_func=self.embedding_func,
|
403 |
)
|
|
|
404 |
self.chunk_entity_relation_graph: BaseGraphStorage = self.graph_storage_cls( # type: ignore
|
405 |
namespace=make_namespace(
|
406 |
self.namespace_prefix, NameSpace.GRAPH_STORE_CHUNK_ENTITY_RELATION
|
|
|
394 |
embedding_func=self.embedding_func,
|
395 |
)
|
396 |
|
|
|
397 |
self.text_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
|
398 |
namespace=make_namespace(
|
399 |
self.namespace_prefix, NameSpace.KV_STORE_TEXT_CHUNKS
|
400 |
),
|
401 |
embedding_func=self.embedding_func,
|
402 |
)
|
403 |
+
|
404 |
self.chunk_entity_relation_graph: BaseGraphStorage = self.graph_storage_cls( # type: ignore
|
405 |
namespace=make_namespace(
|
406 |
self.namespace_prefix, NameSpace.GRAPH_STORE_CHUNK_ENTITY_RELATION
|