tackhwa
committed on
Commit
·
db3c974
1
Parent(s):
cfb0884
remove regex
Browse files
- lightrag/operate.py +2 -5
- lightrag/utils.py +0 -27
lightrag/operate.py
CHANGED
@@ -18,7 +18,6 @@ from .utils import (
|
|
18 |
normalize_extracted_info,
|
19 |
pack_user_ass_to_openai_messages,
|
20 |
split_string_by_multi_markers,
|
21 |
-
extract_fixed_parenthesized_content,
|
22 |
truncate_list_by_token_size,
|
23 |
process_combine_contexts,
|
24 |
compute_args_hash,
|
@@ -153,7 +152,7 @@ async def _handle_single_entity_extraction(
|
|
153 |
chunk_key: str,
|
154 |
file_path: str = "unknown_source",
|
155 |
):
|
156 |
-
if len(record_attributes) < 4 or
|
157 |
return None
|
158 |
|
159 |
# Clean and validate entity name
|
@@ -199,7 +198,7 @@ async def _handle_single_relationship_extraction(
|
|
199 |
chunk_key: str,
|
200 |
file_path: str = "unknown_source",
|
201 |
):
|
202 |
-
if len(record_attributes) < 5 or
|
203 |
return None
|
204 |
# add this record as edge
|
205 |
source = clean_str(record_attributes[1])
|
@@ -550,8 +549,6 @@ async def extract_entities(
|
|
550 |
[context_base["record_delimiter"], context_base["completion_delimiter"]],
|
551 |
)
|
552 |
|
553 |
-
records = extract_fixed_parenthesized_content(records)
|
554 |
-
|
555 |
for record in records:
|
556 |
record = re.search(r"\((.*)\)", record)
|
557 |
if record is None:
|
|
|
18 |
normalize_extracted_info,
|
19 |
pack_user_ass_to_openai_messages,
|
20 |
split_string_by_multi_markers,
|
|
|
21 |
truncate_list_by_token_size,
|
22 |
process_combine_contexts,
|
23 |
compute_args_hash,
|
|
|
152 |
chunk_key: str,
|
153 |
file_path: str = "unknown_source",
|
154 |
):
|
155 |
+
if len(record_attributes) < 4 or '"entity"' not in record_attributes[0]:
|
156 |
return None
|
157 |
|
158 |
# Clean and validate entity name
|
|
|
198 |
chunk_key: str,
|
199 |
file_path: str = "unknown_source",
|
200 |
):
|
201 |
+
if len(record_attributes) < 5 or '"relationship"' not in record_attributes[0]:
|
202 |
return None
|
203 |
# add this record as edge
|
204 |
source = clean_str(record_attributes[1])
|
|
|
549 |
[context_base["record_delimiter"], context_base["completion_delimiter"]],
|
550 |
)
|
551 |
|
|
|
|
|
552 |
for record in records:
|
553 |
record = re.search(r"\((.*)\)", record)
|
554 |
if record is None:
|
lightrag/utils.py
CHANGED
@@ -408,33 +408,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]
|
|
408 |
return [r.strip() for r in results if r.strip()]
|
409 |
|
410 |
|
411 |
-
def extract_fixed_parenthesized_content(records: list[str]) -> list[str]:
|
412 |
-
"""
|
413 |
-
Extract content that should be in parentheses from each record.
|
414 |
-
Ensures each extracted item has both opening and closing parentheses.
|
415 |
-
"""
|
416 |
-
result = []
|
417 |
-
|
418 |
-
for record in records:
|
419 |
-
# First, extract properly matched pairs
|
420 |
-
balanced_matches = re.findall(r"\((.*?)\)", record)
|
421 |
-
for match in balanced_matches:
|
422 |
-
result.append(f"({match})")
|
423 |
-
|
424 |
-
# Process string to handle unbalanced parentheses
|
425 |
-
# For opening without closing
|
426 |
-
open_matches = re.findall(r"\(([^()]*?)$", record)
|
427 |
-
for match in open_matches:
|
428 |
-
result.append(f"({match})")
|
429 |
-
|
430 |
-
# For closing without opening
|
431 |
-
close_matches = re.findall(r"^([^()]*?)\)", record)
|
432 |
-
for match in close_matches:
|
433 |
-
result.append(f"({match})")
|
434 |
-
|
435 |
-
return result
|
436 |
-
|
437 |
-
|
438 |
# Refer the utils functions of the official GraphRAG implementation:
|
439 |
# https://github.com/microsoft/graphrag
|
440 |
def clean_str(input: Any) -> str:
|
|
|
408 |
return [r.strip() for r in results if r.strip()]
|
409 |
|
410 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
411 |
# Refer the utils functions of the official GraphRAG implementation:
|
412 |
# https://github.com/microsoft/graphrag
|
413 |
def clean_str(input: Any) -> str:
|