Merge branch 'main' into rerank
Browse files- README-zh.md +0 -1
- README.md +0 -1
- env.example +0 -2
- lightrag/api/README.md +3 -2
- lightrag/api/utils_api.py +0 -4
- lightrag/constants.py +1 -2
- lightrag/kg/memgraph_impl.py +61 -64
- lightrag/lightrag.py +0 -5
- lightrag/operate.py +0 -1
README-zh.md
CHANGED
|
@@ -242,7 +242,6 @@ if __name__ == "__main__":
|
|
| 242 |
| **tokenizer** | `Tokenizer` | 用于将文本转换为 tokens(数字)以及使用遵循 TokenizerInterface 协议的 .encode() 和 .decode() 函数将 tokens 转换回文本的函数。 如果您不指定,它将使用默认的 Tiktoken tokenizer。 | `TiktokenTokenizer` |
|
| 243 |
| **tiktoken_model_name** | `str` | 如果您使用的是默认的 Tiktoken tokenizer,那么这是要使用的特定 Tiktoken 模型的名称。如果您提供自己的 tokenizer,则忽略此设置。 | `gpt-4o-mini` |
|
| 244 |
| **entity_extract_max_gleaning** | `int` | 实体提取过程中的循环次数,附加历史消息 | `1` |
|
| 245 |
-
| **entity_summary_to_max_tokens** | `int` | 每个实体摘要的最大令牌大小 | `500` |
|
| 246 |
| **node_embedding_algorithm** | `str` | 节点嵌入算法(当前未使用) | `node2vec` |
|
| 247 |
| **node2vec_params** | `dict` | 节点嵌入的参数 | `{"dimensions": 1536,"num_walks": 10,"walk_length": 40,"window_size": 2,"iterations": 3,"random_seed": 3,}` |
|
| 248 |
| **embedding_func** | `EmbeddingFunc` | 从文本生成嵌入向量的函数 | `openai_embed` |
|
|
|
|
| 242 |
| **tokenizer** | `Tokenizer` | 用于将文本转换为 tokens(数字)以及使用遵循 TokenizerInterface 协议的 .encode() 和 .decode() 函数将 tokens 转换回文本的函数。 如果您不指定,它将使用默认的 Tiktoken tokenizer。 | `TiktokenTokenizer` |
|
| 243 |
| **tiktoken_model_name** | `str` | 如果您使用的是默认的 Tiktoken tokenizer,那么这是要使用的特定 Tiktoken 模型的名称。如果您提供自己的 tokenizer,则忽略此设置。 | `gpt-4o-mini` |
|
| 244 |
| **entity_extract_max_gleaning** | `int` | 实体提取过程中的循环次数,附加历史消息 | `1` |
|
|
|
|
| 245 |
| **node_embedding_algorithm** | `str` | 节点嵌入算法(当前未使用) | `node2vec` |
|
| 246 |
| **node2vec_params** | `dict` | 节点嵌入的参数 | `{"dimensions": 1536,"num_walks": 10,"walk_length": 40,"window_size": 2,"iterations": 3,"random_seed": 3,}` |
|
| 247 |
| **embedding_func** | `EmbeddingFunc` | 从文本生成嵌入向量的函数 | `openai_embed` |
|
README.md
CHANGED
|
@@ -249,7 +249,6 @@ A full list of LightRAG init parameters:
|
|
| 249 |
| **tokenizer** | `Tokenizer` | The function used to convert text into tokens (numbers) and back using .encode() and .decode() functions following `TokenizerInterface` protocol. If you don't specify one, it will use the default Tiktoken tokenizer. | `TiktokenTokenizer` |
|
| 250 |
| **tiktoken_model_name** | `str` | If you're using the default Tiktoken tokenizer, this is the name of the specific Tiktoken model to use. This setting is ignored if you provide your own tokenizer. | `gpt-4o-mini` |
|
| 251 |
| **entity_extract_max_gleaning** | `int` | Number of loops in the entity extraction process, appending history messages | `1` |
|
| 252 |
-
| **entity_summary_to_max_tokens** | `int` | Maximum token size for each entity summary | `500` |
|
| 253 |
| **node_embedding_algorithm** | `str` | Algorithm for node embedding (currently not used) | `node2vec` |
|
| 254 |
| **node2vec_params** | `dict` | Parameters for node embedding | `{"dimensions": 1536,"num_walks": 10,"walk_length": 40,"window_size": 2,"iterations": 3,"random_seed": 3,}` |
|
| 255 |
| **embedding_func** | `EmbeddingFunc` | Function to generate embedding vectors from text | `openai_embed` |
|
|
|
|
| 249 |
| **tokenizer** | `Tokenizer` | The function used to convert text into tokens (numbers) and back using .encode() and .decode() functions following `TokenizerInterface` protocol. If you don't specify one, it will use the default Tiktoken tokenizer. | `TiktokenTokenizer` |
|
| 250 |
| **tiktoken_model_name** | `str` | If you're using the default Tiktoken tokenizer, this is the name of the specific Tiktoken model to use. This setting is ignored if you provide your own tokenizer. | `gpt-4o-mini` |
|
| 251 |
| **entity_extract_max_gleaning** | `int` | Number of loops in the entity extraction process, appending history messages | `1` |
|
|
|
|
| 252 |
| **node_embedding_algorithm** | `str` | Algorithm for node embedding (currently not used) | `node2vec` |
|
| 253 |
| **node2vec_params** | `dict` | Parameters for node embedding | `{"dimensions": 1536,"num_walks": 10,"walk_length": 40,"window_size": 2,"iterations": 3,"random_seed": 3,}` |
|
| 254 |
| **embedding_func** | `EmbeddingFunc` | Function to generate embedding vectors from text | `openai_embed` |
|
env.example
CHANGED
|
@@ -75,8 +75,6 @@ OLLAMA_EMULATING_MODEL_TAG=latest
|
|
| 75 |
SUMMARY_LANGUAGE=English
|
| 76 |
### Number of duplicated entities/edges to trigger LLM re-summary on merge (at least 3 is recommended)
|
| 77 |
# FORCE_LLM_SUMMARY_ON_MERGE=6
|
| 78 |
-
### Max tokens for entity/relations description after merge
|
| 79 |
-
# MAX_TOKEN_SUMMARY=500
|
| 80 |
### Maximum number of entity extraction attempts for ambiguous content
|
| 81 |
# MAX_GLEANING=1
|
| 82 |
|
|
|
|
| 75 |
SUMMARY_LANGUAGE=English
|
| 76 |
### Number of duplicated entities/edges to trigger LLM re-summary on merge (at least 3 is recommended)
|
| 77 |
# FORCE_LLM_SUMMARY_ON_MERGE=6
|
|
|
|
|
|
|
| 78 |
### Maximum number of entity extraction attempts for ambiguous content
|
| 79 |
# MAX_GLEANING=1
|
| 80 |
|
lightrag/api/README.md
CHANGED
|
@@ -181,9 +181,9 @@ The command-line `workspace` argument and the `WORKSPACE` environment variable i
|
|
| 181 |
- **For local file-based databases, data isolation is achieved through workspace subdirectories:** `JsonKVStorage`, `JsonDocStatusStorage`, `NetworkXStorage`, `NanoVectorDBStorage`, `FaissVectorDBStorage`.
|
| 182 |
- **For databases that store data in collections, it's done by adding a workspace prefix to the collection name:** `RedisKVStorage`, `RedisDocStatusStorage`, `MilvusVectorDBStorage`, `QdrantVectorDBStorage`, `MongoKVStorage`, `MongoDocStatusStorage`, `MongoVectorDBStorage`, `MongoGraphStorage`, `PGGraphStorage`.
|
| 183 |
- **For relational databases, data isolation is achieved by adding a `workspace` field to the tables for logical data separation:** `PGKVStorage`, `PGVectorStorage`, `PGDocStatusStorage`.
|
| 184 |
-
- **For
|
| 185 |
|
| 186 |
-
To maintain compatibility with legacy data, the default workspace for PostgreSQL is `default` and for Neo4j is `base` when no workspace is configured. For all external storages, the system provides dedicated workspace environment variables to override the common `WORKSPACE` environment variable configuration. These storage-specific workspace environment variables are: `REDIS_WORKSPACE`, `MILVUS_WORKSPACE`, `QDRANT_WORKSPACE`, `MONGODB_WORKSPACE`, `POSTGRES_WORKSPACE`, `NEO4J_WORKSPACE`.
|
| 187 |
|
| 188 |
### Multiple workers for Gunicorn + Uvicorn
|
| 189 |
|
|
@@ -396,6 +396,7 @@ MongoKVStorage MongoDB
|
|
| 396 |
NetworkXStorage NetworkX (default)
|
| 397 |
Neo4JStorage Neo4J
|
| 398 |
PGGraphStorage PostgreSQL with AGE plugin
|
|
|
|
| 399 |
```
|
| 400 |
|
| 401 |
> Testing has shown that Neo4J delivers superior performance in production environments compared to PostgreSQL with AGE plugin.
|
|
|
|
| 181 |
- **For local file-based databases, data isolation is achieved through workspace subdirectories:** `JsonKVStorage`, `JsonDocStatusStorage`, `NetworkXStorage`, `NanoVectorDBStorage`, `FaissVectorDBStorage`.
|
| 182 |
- **For databases that store data in collections, it's done by adding a workspace prefix to the collection name:** `RedisKVStorage`, `RedisDocStatusStorage`, `MilvusVectorDBStorage`, `QdrantVectorDBStorage`, `MongoKVStorage`, `MongoDocStatusStorage`, `MongoVectorDBStorage`, `MongoGraphStorage`, `PGGraphStorage`.
|
| 183 |
- **For relational databases, data isolation is achieved by adding a `workspace` field to the tables for logical data separation:** `PGKVStorage`, `PGVectorStorage`, `PGDocStatusStorage`.
|
| 184 |
+
- **For graph databases, logical data isolation is achieved through labels:** `Neo4JStorage`, `MemgraphStorage`
|
| 185 |
|
| 186 |
+
To maintain compatibility with legacy data, the default workspace for PostgreSQL is `default` and for Neo4j is `base` when no workspace is configured. For all external storages, the system provides dedicated workspace environment variables to override the common `WORKSPACE` environment variable configuration. These storage-specific workspace environment variables are: `REDIS_WORKSPACE`, `MILVUS_WORKSPACE`, `QDRANT_WORKSPACE`, `MONGODB_WORKSPACE`, `POSTGRES_WORKSPACE`, `NEO4J_WORKSPACE`, `MEMGRAPH_WORKSPACE`.
|
| 187 |
|
| 188 |
### Multiple workers for Gunicorn + Uvicorn
|
| 189 |
|
|
|
|
| 396 |
NetworkXStorage NetworkX (default)
|
| 397 |
Neo4JStorage Neo4J
|
| 398 |
PGGraphStorage PostgreSQL with AGE plugin
|
| 399 |
+
MemgraphStorage Memgraph
|
| 400 |
```
|
| 401 |
|
| 402 |
> Testing has shown that Neo4J delivers superior performance in production environments compared to PostgreSQL with AGE plugin.
|
lightrag/api/utils_api.py
CHANGED
|
@@ -10,7 +10,6 @@ from ascii_colors import ASCIIColors
|
|
| 10 |
from lightrag.api import __api_version__ as api_version
|
| 11 |
from lightrag import __version__ as core_version
|
| 12 |
from lightrag.constants import (
|
| 13 |
-
DEFAULT_MAX_TOKEN_SUMMARY,
|
| 14 |
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
|
| 15 |
)
|
| 16 |
from fastapi import HTTPException, Security, Request, status
|
|
@@ -280,9 +279,6 @@ def display_splash_screen(args: argparse.Namespace) -> None:
|
|
| 280 |
ASCIIColors.white(" ├─ Top-K: ", end="")
|
| 281 |
ASCIIColors.yellow(f"{args.top_k}")
|
| 282 |
ASCIIColors.white(" ├─ Max Token Summary: ", end="")
|
| 283 |
-
ASCIIColors.yellow(
|
| 284 |
-
f"{get_env_value('MAX_TOKEN_SUMMARY', DEFAULT_MAX_TOKEN_SUMMARY, int)}"
|
| 285 |
-
)
|
| 286 |
ASCIIColors.white(" └─ Force LLM Summary on Merge: ", end="")
|
| 287 |
ASCIIColors.yellow(
|
| 288 |
f"{get_env_value('FORCE_LLM_SUMMARY_ON_MERGE', DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE, int)}"
|
|
|
|
| 10 |
from lightrag.api import __api_version__ as api_version
|
| 11 |
from lightrag import __version__ as core_version
|
| 12 |
from lightrag.constants import (
|
|
|
|
| 13 |
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
|
| 14 |
)
|
| 15 |
from fastapi import HTTPException, Security, Request, status
|
|
|
|
| 279 |
ASCIIColors.white(" ├─ Top-K: ", end="")
|
| 280 |
ASCIIColors.yellow(f"{args.top_k}")
|
| 281 |
ASCIIColors.white(" ├─ Max Token Summary: ", end="")
|
|
|
|
|
|
|
|
|
|
| 282 |
ASCIIColors.white(" └─ Force LLM Summary on Merge: ", end="")
|
| 283 |
ASCIIColors.yellow(
|
| 284 |
f"{get_env_value('FORCE_LLM_SUMMARY_ON_MERGE', DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE, int)}"
|
lightrag/constants.py
CHANGED
|
@@ -8,8 +8,7 @@ consistency and makes maintenance easier.
|
|
| 8 |
|
| 9 |
# Default values for environment variables
|
| 10 |
DEFAULT_MAX_GLEANING = 1
|
| 11 |
-
|
| 12 |
-
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 6
|
| 13 |
DEFAULT_WOKERS = 2
|
| 14 |
DEFAULT_TIMEOUT = 150
|
| 15 |
|
|
|
|
| 8 |
|
| 9 |
# Default values for environment variables
|
| 10 |
DEFAULT_MAX_GLEANING = 1
|
| 11 |
+
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 4
|
|
|
|
| 12 |
DEFAULT_WOKERS = 2
|
| 13 |
DEFAULT_TIMEOUT = 150
|
| 14 |
|
lightrag/kg/memgraph_impl.py
CHANGED
|
@@ -435,7 +435,7 @@ class MemgraphStorage(BaseGraphStorage):
|
|
| 435 |
|
| 436 |
async def upsert_node(self, node_id: str, node_data: dict[str, str]) -> None:
|
| 437 |
"""
|
| 438 |
-
Upsert a node in the
|
| 439 |
|
| 440 |
Args:
|
| 441 |
node_id: The unique identifier for the node (used as label)
|
|
@@ -448,7 +448,9 @@ class MemgraphStorage(BaseGraphStorage):
|
|
| 448 |
properties = node_data
|
| 449 |
entity_type = properties["entity_type"]
|
| 450 |
if "entity_id" not in properties:
|
| 451 |
-
raise ValueError(
|
|
|
|
|
|
|
| 452 |
|
| 453 |
try:
|
| 454 |
async with self._driver.session(database=self._DATABASE) as session:
|
|
@@ -732,7 +734,7 @@ class MemgraphStorage(BaseGraphStorage):
|
|
| 732 |
self,
|
| 733 |
node_label: str,
|
| 734 |
max_depth: int = 3,
|
| 735 |
-
max_nodes: int =
|
| 736 |
) -> KnowledgeGraph:
|
| 737 |
"""
|
| 738 |
Retrieve a connected subgraph of nodes where the label includes the specified `node_label`.
|
|
@@ -740,120 +742,118 @@ class MemgraphStorage(BaseGraphStorage):
|
|
| 740 |
Args:
|
| 741 |
node_label: Label of the starting node, * means all nodes
|
| 742 |
max_depth: Maximum depth of the subgraph, Defaults to 3
|
| 743 |
-
max_nodes:
|
| 744 |
|
| 745 |
Returns:
|
| 746 |
KnowledgeGraph object containing nodes and edges, with an is_truncated flag
|
| 747 |
indicating whether the graph was truncated due to max_nodes limit
|
| 748 |
-
|
| 749 |
-
Raises:
|
| 750 |
-
Exception: If there is an error executing the query
|
| 751 |
"""
|
| 752 |
-
if
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
|
|
|
|
|
|
| 756 |
|
|
|
|
| 757 |
result = KnowledgeGraph()
|
| 758 |
seen_nodes = set()
|
| 759 |
seen_edges = set()
|
| 760 |
-
|
| 761 |
async with self._driver.session(
|
| 762 |
database=self._DATABASE, default_access_mode="READ"
|
| 763 |
) as session:
|
| 764 |
try:
|
| 765 |
if node_label == "*":
|
| 766 |
-
# First check if
|
| 767 |
-
count_query =
|
|
|
|
|
|
|
| 768 |
count_result = None
|
| 769 |
-
total_count = 0
|
| 770 |
try:
|
| 771 |
count_result = await session.run(count_query)
|
| 772 |
count_record = await count_result.single()
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
result.is_truncated = True
|
| 780 |
-
logger.info(
|
| 781 |
-
f"Graph truncated: {total_count} nodes found, limited to {max_nodes}"
|
| 782 |
-
)
|
| 783 |
finally:
|
| 784 |
if count_result:
|
| 785 |
await count_result.consume()
|
| 786 |
|
| 787 |
-
# Run
|
| 788 |
main_query = f"""
|
| 789 |
MATCH (n:`{workspace_label}`)
|
| 790 |
OPTIONAL MATCH (n)-[r]-()
|
| 791 |
WITH n, COALESCE(count(r), 0) AS degree
|
| 792 |
ORDER BY degree DESC
|
| 793 |
LIMIT $max_nodes
|
| 794 |
-
WITH collect(n) AS
|
| 795 |
-
|
|
|
|
|
|
|
| 796 |
WHERE a IN kept_nodes AND b IN kept_nodes
|
| 797 |
-
RETURN
|
| 798 |
collect(DISTINCT r) AS relationships
|
| 799 |
"""
|
| 800 |
result_set = None
|
| 801 |
try:
|
| 802 |
result_set = await session.run(
|
| 803 |
-
main_query,
|
|
|
|
| 804 |
)
|
| 805 |
record = await result_set.single()
|
| 806 |
-
if not record:
|
| 807 |
-
logger.debug("No record returned from main query")
|
| 808 |
-
return result
|
| 809 |
finally:
|
| 810 |
if result_set:
|
| 811 |
await result_set.consume()
|
| 812 |
|
| 813 |
else:
|
| 814 |
-
|
|
|
|
| 815 |
MATCH (start:`{workspace_label}`)
|
| 816 |
WHERE start.entity_id = $entity_id
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
WITH nodes(path) AS path_nodes, relationships(path) AS path_rels
|
| 822 |
-
UNWIND path_nodes AS n
|
| 823 |
-
WITH collect(DISTINCT n) AS all_nodes, collect(DISTINCT path_rels) AS all_rel_lists
|
| 824 |
-
WITH all_nodes, reduce(r = [], x IN all_rel_lists | r + x) AS all_rels
|
| 825 |
-
RETURN all_nodes, all_rels
|
| 826 |
-
}}
|
| 827 |
-
WITH all_nodes AS nodes, all_rels AS relationships, size(all_nodes) AS total_nodes
|
| 828 |
WITH
|
| 829 |
CASE
|
| 830 |
-
WHEN
|
| 831 |
-
ELSE
|
| 832 |
END AS limited_nodes,
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 836 |
RETURN
|
| 837 |
[node IN limited_nodes | {{node: node}}] AS node_info,
|
| 838 |
relationships,
|
| 839 |
-
total_nodes,
|
| 840 |
is_truncated
|
| 841 |
"""
|
|
|
|
| 842 |
result_set = None
|
| 843 |
try:
|
| 844 |
result_set = await session.run(
|
| 845 |
-
|
| 846 |
{
|
| 847 |
"entity_id": node_label,
|
|
|
|
| 848 |
},
|
| 849 |
)
|
| 850 |
record = await result_set.single()
|
|
|
|
|
|
|
| 851 |
if not record:
|
| 852 |
logger.debug(f"No nodes found for entity_id: {node_label}")
|
| 853 |
return result
|
| 854 |
|
| 855 |
-
# Check if the
|
| 856 |
-
if
|
| 857 |
result.is_truncated = True
|
| 858 |
logger.info(
|
| 859 |
f"Graph truncated: breadth-first search limited to {max_nodes} nodes"
|
|
@@ -863,13 +863,11 @@ class MemgraphStorage(BaseGraphStorage):
|
|
| 863 |
if result_set:
|
| 864 |
await result_set.consume()
|
| 865 |
|
| 866 |
-
|
| 867 |
-
if record and record["node_info"]:
|
| 868 |
for node_info in record["node_info"]:
|
| 869 |
node = node_info["node"]
|
| 870 |
node_id = node.id
|
| 871 |
if node_id not in seen_nodes:
|
| 872 |
-
seen_nodes.add(node_id)
|
| 873 |
result.nodes.append(
|
| 874 |
KnowledgeGraphNode(
|
| 875 |
id=f"{node_id}",
|
|
@@ -877,11 +875,11 @@ class MemgraphStorage(BaseGraphStorage):
|
|
| 877 |
properties=dict(node),
|
| 878 |
)
|
| 879 |
)
|
|
|
|
| 880 |
|
| 881 |
for rel in record["relationships"]:
|
| 882 |
edge_id = rel.id
|
| 883 |
if edge_id not in seen_edges:
|
| 884 |
-
seen_edges.add(edge_id)
|
| 885 |
start = rel.start_node
|
| 886 |
end = rel.end_node
|
| 887 |
result.edges.append(
|
|
@@ -893,14 +891,13 @@ class MemgraphStorage(BaseGraphStorage):
|
|
| 893 |
properties=dict(rel),
|
| 894 |
)
|
| 895 |
)
|
|
|
|
| 896 |
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
|
| 901 |
except Exception as e:
|
| 902 |
-
logger.
|
| 903 |
-
# Return empty but properly initialized KnowledgeGraph on error
|
| 904 |
-
return KnowledgeGraph()
|
| 905 |
|
| 906 |
return result
|
|
|
|
| 435 |
|
| 436 |
async def upsert_node(self, node_id: str, node_data: dict[str, str]) -> None:
|
| 437 |
"""
|
| 438 |
+
Upsert a node in the Memgraph database.
|
| 439 |
|
| 440 |
Args:
|
| 441 |
node_id: The unique identifier for the node (used as label)
|
|
|
|
| 448 |
properties = node_data
|
| 449 |
entity_type = properties["entity_type"]
|
| 450 |
if "entity_id" not in properties:
|
| 451 |
+
raise ValueError(
|
| 452 |
+
"Memgraph: node properties must contain an 'entity_id' field"
|
| 453 |
+
)
|
| 454 |
|
| 455 |
try:
|
| 456 |
async with self._driver.session(database=self._DATABASE) as session:
|
|
|
|
| 734 |
self,
|
| 735 |
node_label: str,
|
| 736 |
max_depth: int = 3,
|
| 737 |
+
max_nodes: int = None,
|
| 738 |
) -> KnowledgeGraph:
|
| 739 |
"""
|
| 740 |
Retrieve a connected subgraph of nodes where the label includes the specified `node_label`.
|
|
|
|
| 742 |
Args:
|
| 743 |
node_label: Label of the starting node, * means all nodes
|
| 744 |
max_depth: Maximum depth of the subgraph, Defaults to 3
|
| 745 |
+
max_nodes: Maximum nodes to return by BFS, Defaults to 1000
|
| 746 |
|
| 747 |
Returns:
|
| 748 |
KnowledgeGraph object containing nodes and edges, with an is_truncated flag
|
| 749 |
indicating whether the graph was truncated due to max_nodes limit
|
|
|
|
|
|
|
|
|
|
| 750 |
"""
|
| 751 |
+
# Get max_nodes from global_config if not provided
|
| 752 |
+
if max_nodes is None:
|
| 753 |
+
max_nodes = self.global_config.get("max_graph_nodes", 1000)
|
| 754 |
+
else:
|
| 755 |
+
# Limit max_nodes to not exceed global_config max_graph_nodes
|
| 756 |
+
max_nodes = min(max_nodes, self.global_config.get("max_graph_nodes", 1000))
|
| 757 |
|
| 758 |
+
workspace_label = self._get_workspace_label()
|
| 759 |
result = KnowledgeGraph()
|
| 760 |
seen_nodes = set()
|
| 761 |
seen_edges = set()
|
| 762 |
+
|
| 763 |
async with self._driver.session(
|
| 764 |
database=self._DATABASE, default_access_mode="READ"
|
| 765 |
) as session:
|
| 766 |
try:
|
| 767 |
if node_label == "*":
|
| 768 |
+
# First check total node count to determine if graph is truncated
|
| 769 |
+
count_query = (
|
| 770 |
+
f"MATCH (n:`{workspace_label}`) RETURN count(n) as total"
|
| 771 |
+
)
|
| 772 |
count_result = None
|
|
|
|
| 773 |
try:
|
| 774 |
count_result = await session.run(count_query)
|
| 775 |
count_record = await count_result.single()
|
| 776 |
+
|
| 777 |
+
if count_record and count_record["total"] > max_nodes:
|
| 778 |
+
result.is_truncated = True
|
| 779 |
+
logger.info(
|
| 780 |
+
f"Graph truncated: {count_record['total']} nodes found, limited to {max_nodes}"
|
| 781 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 782 |
finally:
|
| 783 |
if count_result:
|
| 784 |
await count_result.consume()
|
| 785 |
|
| 786 |
+
# Run main query to get nodes with highest degree
|
| 787 |
main_query = f"""
|
| 788 |
MATCH (n:`{workspace_label}`)
|
| 789 |
OPTIONAL MATCH (n)-[r]-()
|
| 790 |
WITH n, COALESCE(count(r), 0) AS degree
|
| 791 |
ORDER BY degree DESC
|
| 792 |
LIMIT $max_nodes
|
| 793 |
+
WITH collect({{node: n}}) AS filtered_nodes
|
| 794 |
+
UNWIND filtered_nodes AS node_info
|
| 795 |
+
WITH collect(node_info.node) AS kept_nodes, filtered_nodes
|
| 796 |
+
OPTIONAL MATCH (a)-[r]-(b)
|
| 797 |
WHERE a IN kept_nodes AND b IN kept_nodes
|
| 798 |
+
RETURN filtered_nodes AS node_info,
|
| 799 |
collect(DISTINCT r) AS relationships
|
| 800 |
"""
|
| 801 |
result_set = None
|
| 802 |
try:
|
| 803 |
result_set = await session.run(
|
| 804 |
+
main_query,
|
| 805 |
+
{"max_nodes": max_nodes},
|
| 806 |
)
|
| 807 |
record = await result_set.single()
|
|
|
|
|
|
|
|
|
|
| 808 |
finally:
|
| 809 |
if result_set:
|
| 810 |
await result_set.consume()
|
| 811 |
|
| 812 |
else:
|
| 813 |
+
# Run subgraph query for specific node_label
|
| 814 |
+
subgraph_query = f"""
|
| 815 |
MATCH (start:`{workspace_label}`)
|
| 816 |
WHERE start.entity_id = $entity_id
|
| 817 |
+
|
| 818 |
+
MATCH path = (start)-[*BFS 0..{max_depth}]-(end:`{workspace_label}`)
|
| 819 |
+
WHERE ALL(n IN nodes(path) WHERE '{workspace_label}' IN labels(n))
|
| 820 |
+
WITH collect(DISTINCT end) + start AS all_nodes_unlimited
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 821 |
WITH
|
| 822 |
CASE
|
| 823 |
+
WHEN size(all_nodes_unlimited) <= $max_nodes THEN all_nodes_unlimited
|
| 824 |
+
ELSE all_nodes_unlimited[0..$max_nodes]
|
| 825 |
END AS limited_nodes,
|
| 826 |
+
size(all_nodes_unlimited) > $max_nodes AS is_truncated
|
| 827 |
+
|
| 828 |
+
UNWIND limited_nodes AS n
|
| 829 |
+
MATCH (n)-[r]-(m)
|
| 830 |
+
WHERE m IN limited_nodes
|
| 831 |
+
WITH collect(DISTINCT n) AS limited_nodes, collect(DISTINCT r) AS relationships, is_truncated
|
| 832 |
+
|
| 833 |
RETURN
|
| 834 |
[node IN limited_nodes | {{node: node}}] AS node_info,
|
| 835 |
relationships,
|
|
|
|
| 836 |
is_truncated
|
| 837 |
"""
|
| 838 |
+
|
| 839 |
result_set = None
|
| 840 |
try:
|
| 841 |
result_set = await session.run(
|
| 842 |
+
subgraph_query,
|
| 843 |
{
|
| 844 |
"entity_id": node_label,
|
| 845 |
+
"max_nodes": max_nodes,
|
| 846 |
},
|
| 847 |
)
|
| 848 |
record = await result_set.single()
|
| 849 |
+
|
| 850 |
+
# If no record found, return empty KnowledgeGraph
|
| 851 |
if not record:
|
| 852 |
logger.debug(f"No nodes found for entity_id: {node_label}")
|
| 853 |
return result
|
| 854 |
|
| 855 |
+
# Check if the result was truncated
|
| 856 |
+
if record.get("is_truncated"):
|
| 857 |
result.is_truncated = True
|
| 858 |
logger.info(
|
| 859 |
f"Graph truncated: breadth-first search limited to {max_nodes} nodes"
|
|
|
|
| 863 |
if result_set:
|
| 864 |
await result_set.consume()
|
| 865 |
|
| 866 |
+
if record:
|
|
|
|
| 867 |
for node_info in record["node_info"]:
|
| 868 |
node = node_info["node"]
|
| 869 |
node_id = node.id
|
| 870 |
if node_id not in seen_nodes:
|
|
|
|
| 871 |
result.nodes.append(
|
| 872 |
KnowledgeGraphNode(
|
| 873 |
id=f"{node_id}",
|
|
|
|
| 875 |
properties=dict(node),
|
| 876 |
)
|
| 877 |
)
|
| 878 |
+
seen_nodes.add(node_id)
|
| 879 |
|
| 880 |
for rel in record["relationships"]:
|
| 881 |
edge_id = rel.id
|
| 882 |
if edge_id not in seen_edges:
|
|
|
|
| 883 |
start = rel.start_node
|
| 884 |
end = rel.end_node
|
| 885 |
result.edges.append(
|
|
|
|
| 891 |
properties=dict(rel),
|
| 892 |
)
|
| 893 |
)
|
| 894 |
+
seen_edges.add(edge_id)
|
| 895 |
|
| 896 |
+
logger.info(
|
| 897 |
+
f"Subgraph query successful | Node count: {len(result.nodes)} | Edge count: {len(result.edges)}"
|
| 898 |
+
)
|
| 899 |
|
| 900 |
except Exception as e:
|
| 901 |
+
logger.warning(f"Memgraph error during subgraph query: {str(e)}")
|
|
|
|
|
|
|
| 902 |
|
| 903 |
return result
|
lightrag/lightrag.py
CHANGED
|
@@ -23,7 +23,6 @@ from typing import (
|
|
| 23 |
)
|
| 24 |
from lightrag.constants import (
|
| 25 |
DEFAULT_MAX_GLEANING,
|
| 26 |
-
DEFAULT_MAX_TOKEN_SUMMARY,
|
| 27 |
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
|
| 28 |
)
|
| 29 |
from lightrag.utils import get_env_value
|
|
@@ -134,10 +133,6 @@ class LightRAG:
|
|
| 134 |
)
|
| 135 |
"""Maximum number of entity extraction attempts for ambiguous content."""
|
| 136 |
|
| 137 |
-
summary_to_max_tokens: int = field(
|
| 138 |
-
default=get_env_value("MAX_TOKEN_SUMMARY", DEFAULT_MAX_TOKEN_SUMMARY, int)
|
| 139 |
-
)
|
| 140 |
-
|
| 141 |
force_llm_summary_on_merge: int = field(
|
| 142 |
default=get_env_value(
|
| 143 |
"FORCE_LLM_SUMMARY_ON_MERGE", DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE, int
|
|
|
|
| 23 |
)
|
| 24 |
from lightrag.constants import (
|
| 25 |
DEFAULT_MAX_GLEANING,
|
|
|
|
| 26 |
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
|
| 27 |
)
|
| 28 |
from lightrag.utils import get_env_value
|
|
|
|
| 133 |
)
|
| 134 |
"""Maximum number of entity extraction attempts for ambiguous content."""
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
force_llm_summary_on_merge: int = field(
|
| 137 |
default=get_env_value(
|
| 138 |
"FORCE_LLM_SUMMARY_ON_MERGE", DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE, int
|
lightrag/operate.py
CHANGED
|
@@ -123,7 +123,6 @@ async def _handle_entity_relation_summary(
|
|
| 123 |
|
| 124 |
tokenizer: Tokenizer = global_config["tokenizer"]
|
| 125 |
llm_max_tokens = global_config["llm_model_max_token_size"]
|
| 126 |
-
# summary_max_tokens = global_config["summary_to_max_tokens"]
|
| 127 |
|
| 128 |
language = global_config["addon_params"].get(
|
| 129 |
"language", PROMPTS["DEFAULT_LANGUAGE"]
|
|
|
|
| 123 |
|
| 124 |
tokenizer: Tokenizer = global_config["tokenizer"]
|
| 125 |
llm_max_tokens = global_config["llm_model_max_token_size"]
|
|
|
|
| 126 |
|
| 127 |
language = global_config["addon_params"].get(
|
| 128 |
"language", PROMPTS["DEFAULT_LANGUAGE"]
|