tackhwa committed on
Commit
db3c974
·
1 Parent(s): cfb0884

remove regex

Browse files
Files changed (2) hide show
  1. lightrag/operate.py +2 -5
  2. lightrag/utils.py +0 -27
lightrag/operate.py CHANGED
@@ -18,7 +18,6 @@ from .utils import (
18
  normalize_extracted_info,
19
  pack_user_ass_to_openai_messages,
20
  split_string_by_multi_markers,
21
- extract_fixed_parenthesized_content,
22
  truncate_list_by_token_size,
23
  process_combine_contexts,
24
  compute_args_hash,
@@ -153,7 +152,7 @@ async def _handle_single_entity_extraction(
153
  chunk_key: str,
154
  file_path: str = "unknown_source",
155
  ):
156
- if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
157
  return None
158
 
159
  # Clean and validate entity name
@@ -199,7 +198,7 @@ async def _handle_single_relationship_extraction(
199
  chunk_key: str,
200
  file_path: str = "unknown_source",
201
  ):
202
- if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
203
  return None
204
  # add this record as edge
205
  source = clean_str(record_attributes[1])
@@ -550,8 +549,6 @@ async def extract_entities(
550
  [context_base["record_delimiter"], context_base["completion_delimiter"]],
551
  )
552
 
553
- records = extract_fixed_parenthesized_content(records)
554
-
555
  for record in records:
556
  record = re.search(r"\((.*)\)", record)
557
  if record is None:
 
18
  normalize_extracted_info,
19
  pack_user_ass_to_openai_messages,
20
  split_string_by_multi_markers,
 
21
  truncate_list_by_token_size,
22
  process_combine_contexts,
23
  compute_args_hash,
 
152
  chunk_key: str,
153
  file_path: str = "unknown_source",
154
  ):
155
+ if len(record_attributes) < 4 or '"entity"' not in record_attributes[0]:
156
  return None
157
 
158
  # Clean and validate entity name
 
198
  chunk_key: str,
199
  file_path: str = "unknown_source",
200
  ):
201
+ if len(record_attributes) < 5 or '"relationship"' not in record_attributes[0]:
202
  return None
203
  # add this record as edge
204
  source = clean_str(record_attributes[1])
 
549
  [context_base["record_delimiter"], context_base["completion_delimiter"]],
550
  )
551
 
 
 
552
  for record in records:
553
  record = re.search(r"\((.*)\)", record)
554
  if record is None:
lightrag/utils.py CHANGED
@@ -408,33 +408,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]
408
  return [r.strip() for r in results if r.strip()]
409
 
410
 
411
- def extract_fixed_parenthesized_content(records: list[str]) -> list[str]:
412
- """
413
- Extract content that should be in parentheses from each record.
414
- Ensures each extracted item has both opening and closing parentheses.
415
- """
416
- result = []
417
-
418
- for record in records:
419
- # First, extract properly matched pairs
420
- balanced_matches = re.findall(r"\((.*?)\)", record)
421
- for match in balanced_matches:
422
- result.append(f"({match})")
423
-
424
- # Process string to handle unbalanced parentheses
425
- # For opening without closing
426
- open_matches = re.findall(r"\(([^()]*?)$", record)
427
- for match in open_matches:
428
- result.append(f"({match})")
429
-
430
- # For closing without opening
431
- close_matches = re.findall(r"^([^()]*?)\)", record)
432
- for match in close_matches:
433
- result.append(f"({match})")
434
-
435
- return result
436
-
437
-
438
  # Refer the utils functions of the official GraphRAG implementation:
439
  # https://github.com/microsoft/graphrag
440
  def clean_str(input: Any) -> str:
 
408
  return [r.strip() for r in results if r.strip()]
409
 
410
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  # Refer the utils functions of the official GraphRAG implementation:
412
  # https://github.com/microsoft/graphrag
413
  def clean_str(input: Any) -> str: