inebrahim99 committed
Commit 5d304cf · 1 Parent(s): d9be087

Initial push

.gitattributes ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
PathRAG/PathRAG.py CHANGED
@@ -7,10 +7,8 @@ from functools import partial
 from typing import Type, cast
 
 
-from .llm import (
-    gpt_4o_mini_complete,
-    openai_embedding,
-)
+from .llm import hf_model_complete, sentence_transformer_embedding
+
 from .operate import (
     chunking_by_token_size,
     extract_entities,
@@ -104,6 +102,7 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
 
 @dataclass
 class PathRAG:
+
     working_dir: str = field(
         default_factory=lambda: f"./PathRAG_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}"
     )
@@ -116,7 +115,7 @@ class PathRAG:
         }
     )
     kv_storage: str = field(default="JsonKVStorage")
-    vector_storage: str = field(default="NanoVectorDBStorage")
+    vector_storage: str = field(default="ChromaVectorDBStorage")
     graph_storage: str = field(default="NetworkXStorage")
 
     current_log_level = logger.level
@@ -145,13 +144,13 @@ class PathRAG:
     )
 
 
-    embedding_func: EmbeddingFunc = field(default_factory=lambda: openai_embedding)
+    embedding_func: EmbeddingFunc = field(default_factory=lambda: sentence_transformer_embedding)
    embedding_batch_num: int = 32
    embedding_func_max_async: int = 16
 
 
-    llm_model_func: callable = gpt_4o_mini_complete
-    llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct"
+    llm_model_func: callable = hf_model_complete
+    llm_model_name: str = "mistralai/Mistral-7B-Instruct-v0.3"
     llm_model_max_token_size: int = 32768
     llm_model_max_async: int = 16
     llm_model_kwargs: dict = field(default_factory=dict)
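
Taken together, the PathRAG.py changes make the pipeline local-first: the OpenAI defaults (`gpt_4o_mini_complete`, `openai_embedding`) give way to `hf_model_complete` and `sentence_transformer_embedding`, and the vector store moves from NanoVectorDB to Chroma. A minimal usage sketch, assuming the upstream `insert`/`query` entry points are untouched by this commit:

```python
from PathRAG import PathRAG, QueryParam

# Defaults now resolve to hf_model_complete + sentence_transformer_embedding,
# so no OpenAI key is required; insert()/query() are the assumed upstream API.
rag = PathRAG(working_dir="./PathRAG_cache_demo")
rag.insert("The RTE shall support mode communication. [SWS_Rte_02512]")
print(rag.query("Which requirement covers mode communication?", param=QueryParam(mode="hybrid")))
```

Be aware that Mistral-7B-Instruct-v0.3 in half precision needs roughly 15 GB of memory; main.py below pins it to CPU, where generation will be slow.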
PathRAG/__pycache__/PathRAG.cpython-312.pyc ADDED
Binary file (23.8 kB)

PathRAG/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (199 Bytes)

PathRAG/__pycache__/base.cpython-312.pyc ADDED
Binary file (7.39 kB)

PathRAG/__pycache__/config.cpython-312.pyc ADDED
Binary file (876 Bytes)

PathRAG/__pycache__/llm.cpython-312.pyc ADDED
Binary file (44.5 kB)

PathRAG/__pycache__/main.cpython-312.pyc ADDED
Binary file (6.94 kB)

PathRAG/__pycache__/operate.cpython-312.pyc ADDED
Binary file (50.6 kB)

PathRAG/__pycache__/prompt.cpython-312.pyc ADDED
Binary file (2.79 kB)

PathRAG/__pycache__/storage.cpython-312.pyc ADDED
Binary file (22.7 kB)

PathRAG/__pycache__/utils.cpython-312.pyc ADDED
Binary file (24.2 kB)
 
PathRAG/base.py CHANGED
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from typing import TypedDict, Union, Literal, Generic, TypeVar
-
+from typing import Optional
 import numpy as np
 
 from .utils import EmbeddingFunc
@@ -15,7 +15,7 @@ T = TypeVar("T")
 
 @dataclass
 class QueryParam:
-    mode: Literal["hybrid"] = "global"
+    mode: Literal["hybrid"] = "hybrid"
     only_need_context: bool = False
     only_need_prompt: bool = False
     response_type: str = "Multiple Paragraphs"
@@ -81,7 +81,7 @@ class BaseKVStorage(Generic[T], StorageNameSpace):
 
 @dataclass
 class BaseGraphStorage(StorageNameSpace):
-    embedding_func: EmbeddingFunc = None
+    embedding_func: Optional[EmbeddingFunc] = None
 
     async def has_node(self, node_id: str) -> bool:
         raise NotImplementedError
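
Two of these base.py fixes are worth calling out: the old `mode` default `"global"` was not a member of the declared `Literal["hybrid"]` type, and `embedding_func` now carries an honest `Optional[...]` annotation instead of a bare `None` default. A self-contained sketch of the corrected dataclass behavior (`top_k` is an assumed field, mirroring `QueryParam(top_k=5)` in main.py below):

```python
from dataclasses import dataclass
from typing import Literal

@dataclass
class QueryParam:
    mode: Literal["hybrid"] = "hybrid"  # old default "global" violated the Literal
    only_need_context: bool = False
    top_k: int = 5  # assumed field, as used by main.py

params = QueryParam()
assert params.mode == "hybrid"  # type checkers now accept the default
```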
PathRAG/config.py ADDED
@@ -0,0 +1,13 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+
+tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
+model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+async def qwen_generate(prompt, **kwargs):
+    # max_new_tokens bounds only the generated tokens; max_length would also count the prompt
+    output = pipe(prompt, max_new_tokens=kwargs.get("max_tokens", 512))
+    return output[0]["generated_text"]
+
+# define global_config before assigning into it; main.py imports it from this module
+global_config = {"llm_model_func": qwen_generate}
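
A usage sketch for the registered completion function. Note that importing `PathRAG.config` eagerly downloads and loads the full 7B model, and that a `text-generation` pipeline returns the prompt plus the continuation in `generated_text`:

```python
import asyncio
from PathRAG.config import global_config

generate = global_config["llm_model_func"]  # qwen_generate; despite the name it wraps Mistral
text = asyncio.run(generate("Summarize the RTE in one sentence.", max_tokens=64))
print(text)  # includes the prompt prefix; slice it off if you need only the completion
```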
PathRAG/llm.py CHANGED
@@ -12,6 +12,8 @@ import numpy as np
 import ollama
 import torch
 import time
+from sentence_transformers import SentenceTransformer
+
 from openai import (
     AsyncOpenAI,
     APIConnectionError,
@@ -222,7 +224,7 @@ async def bedrock_complete_if_cache(
 
 
 @lru_cache(maxsize=1)
-def initialize_hf_model(model_name):
+def initialize_hf_model(model_name="mistralai/Mistral-7B-Instruct-v0.3"):
     hf_tokenizer = AutoTokenizer.from_pretrained(
         model_name, device_map="auto", trust_remote_code=True
     )
@@ -391,11 +393,11 @@ async def lmdeploy_model_if_cache(
             "lmdeploy/llama2-chat-70b-4bit", etc.
         - iii) The model_id of a model hosted inside a model repo
             on huggingface.co, such as "internlm/internlm-chat-7b",
-            "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat"
+            "mistralai/Mistral-7B-Instruct-v0.3", "baichuan-inc/Baichuan2-7B-Chat"
             and so on.
     chat_template (str): needed when model is a pytorch model on
         huggingface.co, such as "internlm-chat-7b",
-        "Qwen-7B-Chat", "Baichuan2-7B-Chat" and so on,
+        "mistralai/Mistral-7B-Instruct-v0.3", "Baichuan2-7B-Chat" and so on,
         and when the model name of local path did not match the original model name in HF.
     tp (int): tensor parallel
     prompt (Union[str, List[str]]): input texts to be completed.
@@ -567,8 +569,7 @@ async def bedrock_complete(
 async def hf_model_complete(
     prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
 ) -> str:
-    keyword_extraction = kwargs.pop("keyword_extraction", None)
-    model_name = kwargs["hashing_kv"].global_config["llm_model_name"]
+    model_name = "mistralai/Mistral-7B-Instruct-v0.3"
     result = await hf_model_if_cache(
         model_name,
         prompt,
@@ -576,8 +577,6 @@ async def hf_model_complete(
         history_messages=history_messages,
         **kwargs,
     )
-    if keyword_extraction:  # TODO: use JSON API
-        return locate_json_string_body_from_string(result)
     return result
 
 
@@ -1093,6 +1092,15 @@ class MultiModel:
 
         return await next_model.gen_func(**args)
 
+@wrap_embedding_func_with_attrs(embedding_dim=384, max_token_size=512)
+async def sentence_transformer_embedding(
+    texts: list[str],
+    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+) -> np.ndarray:
+    if isinstance(texts, str):
+        texts = [texts]
+    st_model = SentenceTransformer(model_name)
+    return st_model.encode(texts, convert_to_numpy=True)
 
 if __name__ == "__main__":
     import asyncio
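
A quick sanity check for the new embedding path; the decorator advertises `embedding_dim=384`, which matches all-MiniLM-L6-v2:

```python
import asyncio
from PathRAG.llm import sentence_transformer_embedding

vecs = asyncio.run(sentence_transformer_embedding(["The RTE schedules runnables."]))
print(vecs.shape)  # expected (1, 384) for all-MiniLM-L6-v2
```

One design caveat: instantiating `SentenceTransformer` inside the function reloads the model on every call; caching it at module level (the way `initialize_hf_model` uses `lru_cache`) would be considerably cheaper.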
PathRAG/main.py ADDED
@@ -0,0 +1,121 @@
+import os
+from dotenv import load_dotenv
+import pymupdf
+# import pdfplumber
+# from PIL import Image
+import re
+import networkx as nx
+import matplotlib.pyplot as plt
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor
+from transformers.generation import GenerationConfig
+import torch
+torch.manual_seed(1234)
+from sentence_transformers import SentenceTransformer
+from .config import qwen_generate, global_config
+from .llm import hf_model_complete, sentence_transformer_embedding
+
+
+# ---- PathRAG imports ----
+from PathRAG import PathRAG, QueryParam
+
+load_dotenv()
+
+# --- Your models ---
+LLM_ID = "mistralai/Mistral-7B-Instruct-v0.3"
+EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+PDF_PATH = "AUTOSAR_SWS_RTE.pdf"
+
+tokenizer = AutoTokenizer.from_pretrained(LLM_ID, trust_remote_code=True)
+llm_model = AutoModelForCausalLM.from_pretrained(LLM_ID, device_map="cpu", trust_remote_code=True).eval()
+processor = AutoProcessor.from_pretrained(LLM_ID, trust_remote_code=True)
+
+# ---- PDF Processing ----
+def get_toc_tables(pdf_path: str):
+    doc = pymupdf.open(pdf_path)
+    toc = doc.get_toc()
+    if not toc:
+        raise ValueError("TOC not found")
+    start, end = None, None
+    for i, (level, title, page_num) in enumerate(toc):
+        if title.strip().lower() == "requirements":
+            start = page_num - 1
+            end = toc[i + 1][2] - 1 if (i + 1) < len(toc) else len(doc)
+            break
+    if start is None:
+        raise ValueError("Requirements section missing")
+    all_tables = []
+    for page_index in range(start, end):
+        page = doc.load_page(page_index)
+        # PyMuPDF exposes tables via find_tables(); Page.extract_tables() does not exist
+        all_tables.extend(t.extract() for t in page.find_tables().tables)
+    return all_tables
+
+def build_dependency_graph(all_tables):
+    req_pattern = re.compile(r'\[(SRS_[A-Za-z0-9_]+)\]')
+    sat_pattern = re.compile(r'\[(SWS_[A-Za-z0-9_]+)\]')
+    G = nx.DiGraph()
+    current_req = None
+    for table in all_tables:
+        for row in table:
+            row_text = " ".join(cell for cell in row if cell)
+            srs_matches = req_pattern.findall(row_text)
+            if srs_matches:
+                current_req = srs_matches[0]
+                G.add_node(current_req, type="SRS")
+            sws_matches = sat_pattern.findall(row_text)
+            if current_req and sws_matches:
+                for sws in sws_matches:
+                    G.add_node(sws, type="SWS")
+                    G.add_edge(current_req, sws)
+    return G
+
+def process_pdf_chunks(pdf_path: str):
+    doc = pymupdf.open(pdf_path)
+    toc = doc.get_toc()
+    sections = {}
+    for i, (level, title, page_num) in enumerate(toc):
+        start = page_num - 1
+        end = toc[i + 1][2] - 1 if (i + 1) < len(toc) else doc.page_count
+        text = "".join(doc[p].get_text() for p in range(start, end))
+        if text.strip():
+            sections[title.strip()] = text
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    chunks = []
+    for title, content in sections.items():
+        for chunk in splitter.split_text(content):
+            chunks.append({"title": title, "content": chunk})
+    return chunks
+
+# ---- PathRAG Pipeline ----
+def init_pathrag():
+    # Keyword names must match the PathRAG dataclass fields
+    # (embedding_func, llm_model_func, vector_storage)
+    pr = PathRAG(
+        embedding_func=sentence_transformer_embedding,
+        llm_model_func=hf_model_complete,
+        vector_storage="ChromaVectorDBStorage",  # modify for FAISS, Milvus, etc.
+    )
+    return pr
+
+def index_pdf_with_pathrag(pathrag: PathRAG, pdf_path: str):
+    chunks = process_pdf_chunks(pdf_path)
+    for chunk in chunks:
+        # prepend the section title so it survives chunk-level retrieval
+        pathrag.insert(f"[{chunk['title']}]\n{chunk['content']}")
+    # Optional: add images or graph data
+    tables = get_toc_tables(pdf_path)
+    graph = build_dependency_graph(tables)
+    # You can serialize graph edges and index them as metadata if needed
+    return graph
+
+def query_pathrag(pathrag: PathRAG, question: str):
+    params = QueryParam(top_k=5)
+    return pathrag.query(question, param=params)
+
+# ---- Usage ----
+if __name__ == "__main__":
+    pr = init_pathrag()
+    index_pdf_with_pathrag(pr, PDF_PATH)
+    answer = query_pathrag(pr, "What are the dependencies for SRS_XYZ?")
+    print("Answer:", answer)
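
The SRS-to-SWS tracing logic in `build_dependency_graph` is easiest to verify on a synthetic row. A self-contained toy check (the requirement IDs below are made up for illustration):

```python
import re
import networkx as nx

# one synthetic "Requirements" tracing row, shaped like the rows main.py scans
tables = [[["[SRS_Rte_00051]", "satisfied by [SWS_Rte_01034] and [SWS_Rte_02512]"]]]

req_pattern = re.compile(r"\[(SRS_[A-Za-z0-9_]+)\]")
sat_pattern = re.compile(r"\[(SWS_[A-Za-z0-9_]+)\]")
G = nx.DiGraph()
current_req = None
for table in tables:
    for row in table:
        row_text = " ".join(cell for cell in row if cell)
        srs_matches = req_pattern.findall(row_text)
        if srs_matches:
            current_req = srs_matches[0]
        for sws in sat_pattern.findall(row_text):
            G.add_edge(current_req, sws)

print(list(G.edges()))
# [('SRS_Rte_00051', 'SWS_Rte_01034'), ('SRS_Rte_00051', 'SWS_Rte_02512')]
```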
PathRAG/operate.py CHANGED
@@ -36,7 +36,7 @@ from .prompt import GRAPH_FIELD_SEP, PROMPTS
 
 
 def chunking_by_token_size(
-    content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o"
+    content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="mistralai/Mistral-7B-Instruct-v0.3"
 ):
     tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
     results = []
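
For reference, `chunking_by_token_size` slides a window of `max_token_size` tokens with `overlap_token_size` tokens of overlap, i.e. a stride of 1024 - 128 = 896. A standalone sketch of the same arithmetic; `cl100k_base` is an assumption here, since tiktoken has no encoding registered for Hugging Face model ids like the new Mistral default (see the utils.py fallback below):

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # assumed encoding; no Mistral mapping exists
tokens = enc.encode("some long requirement text " * 400)
max_tokens, overlap = 1024, 128
stride = max_tokens - overlap  # 896 tokens of fresh content per chunk
chunks = [tokens[i:i + max_tokens] for i in range(0, len(tokens), stride)]
print(len(tokens), len(chunks), [len(c) for c in chunks[:3]])
```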
PathRAG/prompt.py CHANGED
@@ -2,285 +2,77 @@ GRAPH_FIELD_SEP = "<SEP>"
 
 PROMPTS = {}
 
+# --- Language and Delimiters ---
 PROMPTS["DEFAULT_LANGUAGE"] = "English"
 PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
 PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
 PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
 PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
 
-PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event", "category"]
+# --- AUTOSAR-specific entity types ---
+PROMPTS["DEFAULT_ENTITY_TYPES"] = [
+    "requirement_id", "component", "signal", "interface", "function", "role", "document_section"
+]
 
+# --- Entity and relationship extraction ---
 PROMPTS["entity_extraction"] = """-Goal-
-Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
-Use {language} as output language.
-
--Steps-
-1. Identify all entities. For each identified entity, extract the following information:
-- entity_name: Name of the entity, use same language as input text. If English, capitalized the name.
-- entity_type: One of the following types: [{entity_types}]
-- entity_description: Comprehensive description of the entity's attributes and activities
-Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
-
-2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
-For each pair of related entities, extract the following information:
-- source_entity: name of the source entity, as identified in step 1
-- target_entity: name of the target entity, as identified in step 1
-- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
-- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
-- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
-Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)
-
-3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
-Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
-
-4. Return output in {language} as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
-
-5. When finished, output {completion_delimiter}
-
-######################
--Examples-
-######################
-{examples}
-
-#############################
--Real Data-
-######################
-Entity_types: {entity_types}
-Text: {input_text}
-######################
-Output:
+From AUTOSAR technical documents (text, tables, and images), identify all entities and relationships relevant to system design and requirements.
+Entities may include:
+- requirement_id (e.g., SRS_XXX, SWS_XXX)
+- component (SWC, ECU, etc.)
+- signal/interface
+- functions or roles
+
+Use {language} as the output language.
+
+-Steps-
+1. Identify all entities with attributes:
+("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
+
+2. Identify relationships:
+("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)
+
+3. Include **images or diagrams**:
+For any image or table relevant to the section, extract a short caption/summary and link it:
+("image"{tuple_delimiter}<image_id_or_ref>{tuple_delimiter}<description>)
+
+4. List high-level section keywords:
+("content_keywords"{tuple_delimiter}<keywords>)
+
+5. Use **{record_delimiter}** to separate items and end with {completion_delimiter}.
 """
 
-PROMPTS["entity_extraction_examples"] = [
-    """Example 1:
-
-Entity_types: [person, technology, mission, organization, location]
-Text:
-while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
-
-Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”
-
-The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.
-
-It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths
-################
-Output:
-("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter}
-("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter}
-("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter}
-("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter}
-("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter}
-("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter}
-("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter}
-("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter}
-("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter}
-("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter}
-("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter}
-#############################""",
-    """Example 2:
-
-Entity_types: [person, technology, mission, organization, location]
-Text:
-They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve.
-
-Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril.
-
-Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly
-#############
-Output:
-("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter}
-("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter}
-("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter}
-("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter}
-("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter}
-("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter}
-#############################""",
-    """Example 3:
-
-Entity_types: [person, role, technology, organization, event, location, concept]
-Text:
-their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data.
-
-"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning."
-
-Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back."
-
-Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history.
-
-The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation
-#############
-Output:
-("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter}
-("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter}
-("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter}
-("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter}
-("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter}
-("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter}
-("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter}
-("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter}
-("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter}
-("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter}
-("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter}
-#############################""",
-]
-
-PROMPTS[
-    "summarize_entity_descriptions"
-] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
-Given one or two entities, and a list of descriptions, all related to the same entity or group of entities.
-Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
-If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
-Make sure it is written in third person, and include the entity names so we the have full context.
-Use {language} as output language.
-
-#######
--Data-
+# --- Summarize entity descriptions ---
+PROMPTS["summarize_entity_descriptions"] = """Summarize the descriptions for one or more AUTOSAR entities.
+Resolve contradictions and combine all details into a single, clear description.
 Entities: {entity_name}
-Description List: {description_list}
-#######
+Descriptions: {description_list}
 Output:
 """
 
-PROMPTS[
-    "entiti_continue_extraction"
-] = """MANY entities were missed in the last extraction. Add them below using the same format:
-"""
-
-PROMPTS[
-    "entiti_if_loop_extraction"
-] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added.
-"""
-
-PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question."
-
+# --- RAG responses for technical content ---
 PROMPTS["rag_response"] = """---Role---
-
-You are a helpful assistant responding to questions about data in the tables provided.
-
+You are a technical assistant answering questions about AUTOSAR requirements and design documents (text, tables, images).
 
 ---Goal---
-
-Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
-If you don't know the answer, just say so. Do not make anything up.
-Do not include information where the supporting evidence for it is not provided.
+Provide a concise and accurate answer using the supplied context (PDF extractions, embeddings, graphs).
+- If you cannot find the answer, say so.
+- Highlight related requirements, components, and images if relevant.
 
 ---Target response length and format---
-
 {response_type}
 
----Data tables---
-
+---Context---
 {context_data}
-
-Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
 """
 
-PROMPTS["keywords_extraction"] = """---Role---
-
-You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query.
-
----Goal---
-
-Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms.
-
----Instructions---
-
-- Output the keywords in JSON format.
-- The JSON should have two keys:
-  - "high_level_keywords" for overarching concepts or themes.
-  - "low_level_keywords" for specific entities or details.
-
-######################
--Examples-
-######################
-{examples}
-
-#############################
--Real Data-
-######################
-Query: {query}
-######################
-The `Output` should be human text, not unicode characters. Keep the same language as `Query`.
-Output:
-
-"""
-
-PROMPTS["keywords_extraction_examples"] = [
-    """Example 1:
-
-Query: "How does international trade influence global economic stability?"
-################
-Output:
-{{
-  "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"],
-  "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"]
-}}
-#############################""",
-    """Example 2:
-
-Query: "What are the environmental consequences of deforestation on biodiversity?"
-################
-Output:
-{{
-  "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"],
-  "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"]
-}}
-#############################""",
-    """Example 3:
-
-Query: "What is the role of education in reducing poverty?"
-################
-Output:
-{{
-  "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"],
-  "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"]
-}}
-#############################""",
-]
-
-
-PROMPTS["naive_rag_response"] = """---Role---
-
-You are a helpful assistant responding to questions about documents provided.
-
-
----Goal---
-
-Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
-If you don't know the answer, just say so. Do not make anything up.
-Do not include information where the supporting evidence for it is not provided.
-
----Target response length and format---
-
-{response_type}
-
----Documents---
-
-{content_data}
-
-Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
-"""
-
-PROMPTS[
-    "similarity_check"
-] = """Please analyze the similarity between these two questions:
+# --- Dependency / Similarity ---
+PROMPTS["similarity_check"] = """Compare two questions about AUTOSAR or system design:
 
-Question 1: {original_prompt}
-Question 2: {cached_prompt}
+Q1: {original_prompt}
+Q2: {cached_prompt}
 
-Please evaluate the following two points and provide a similarity score between 0 and 1 directly:
-1. Whether these two questions are semantically similar
-2. Whether the answer to Question 2 can be used to answer Question 1
-Similarity score criteria:
-0: Completely unrelated or answer cannot be reused, including but not limited to:
-   - The questions have different topics
-   - The locations mentioned in the questions are different
-   - The times mentioned in the questions are different
-   - The specific individuals mentioned in the questions are different
-   - The specific events mentioned in the questions are different
-   - The background information in the questions is different
-   - The key conditions in the questions are different
-1: Identical and answer can be directly reused
-0.5: Partially related and answer needs modification to be used
-Return only a number between 0-1, without any additional content.
+Score 0-1:
+0 = unrelated, 1 = identical, 0.5 = partially related.
+Only output the score.
 """
requirements.txt → PathRAG/requirements.txt RENAMED
File without changes
PathRAG/utils.py CHANGED
@@ -61,9 +61,15 @@ class EmbeddingFunc:
         else:
             self._semaphore = UnlimitedSemaphore()
 
+
     async def __call__(self, *args, **kwargs) -> np.ndarray:
         async with self._semaphore:
-            return await self.func(*args, **kwargs)
+            # accept both sync functions (e.g. SentenceTransformer.encode) and coroutines
+            result = self.func(*args, **kwargs)
+            if asyncio.iscoroutine(result):
+                return await result
+            return result
+
 
 
 def locate_json_string_body_from_string(content: str) -> Union[str, None]:
@@ -147,15 +152,19 @@ def write_json(json_obj, file_name):
     json.dump(json_obj, f, indent=2, ensure_ascii=False)
 
 
-def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o-mini"):
+def encode_string_by_tiktoken(content: str, model_name: str = None):
     global ENCODER
-    if ENCODER is None:
-        ENCODER = tiktoken.encoding_for_model(model_name)
-    tokens = ENCODER.encode(content)
-    return tokens
+    if model_name is None:
+        model_name = os.environ.get("DEFAULT_TOKENIZER_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
+    try:
+        encoder = tiktoken.encoding_for_model(model_name)
+    except KeyError:
+        # tiktoken has no mapping for HF model ids such as the Mistral default
+        encoder = tiktoken.get_encoding("cl100k_base")
+    return encoder.encode(content)
 
 
-def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o-mini"):
+def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "mistralai/Mistral-7B-Instruct-v0.3"):
     global ENCODER
     if ENCODER is None:
         ENCODER = tiktoken.encoding_for_model(model_name)
@@ -515,13 +520,14 @@ async def save_to_cache(hashing_kv, cache_data: CacheData):
     await hashing_kv.upsert({cache_data.mode: mode_cache})
 
 
-def safe_unicode_decode(content):
-    unicode_escape_pattern = re.compile(r"\\u([0-9a-fA-F]{4})")
-
-    def replace_unicode_escape(match):
-        return chr(int(match.group(1), 16))
-
-    decoded_content = unicode_escape_pattern.sub(
-        replace_unicode_escape, content.decode("utf-8")
-    )
-
-    return decoded_content
+def safe_unicode_decode(content: bytes):
+    try:
+        unicode_escape_pattern = re.compile(r"\\u([0-9a-fA-F]{4})")
+
+        def replace_unicode_escape(match):
+            return chr(int(match.group(1), 16))
+
+        return unicode_escape_pattern.sub(replace_unicode_escape, content.decode("utf-8"))
+    except Exception as e:
+        logger.warning(f"Unicode decode failed: {e}")
+        return content.decode("utf-8", errors="ignore")
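
The `EmbeddingFunc.__call__` change is what lets PathRAG accept the synchronous `SentenceTransformer.encode` as well as async embedding functions. A minimal check, assuming the dataclass keeps its `embedding_dim`/`max_token_size`/`func` fields (`fake_encode` below is illustrative only):

```python
import asyncio
import numpy as np
from PathRAG.utils import EmbeddingFunc

def fake_encode(texts):  # stand-in for SentenceTransformer.encode (synchronous)
    return np.zeros((len(texts), 384))

func = EmbeddingFunc(embedding_dim=384, max_token_size=512, func=fake_encode)
vecs = asyncio.run(func(["hello", "world"]))  # sync result passes straight through
print(vecs.shape)  # (2, 384)
```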