Prakhar Bhandari committed on
Commit
babec93
·
1 Parent(s): ae26cc4

Modular v2.0

Browse files
kg_builder/src/__pycache__/api_connections.cpython-39.pyc CHANGED
Binary files a/kg_builder/src/__pycache__/api_connections.cpython-39.pyc and b/kg_builder/src/__pycache__/api_connections.cpython-39.pyc differ
 
kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc CHANGED
Binary files a/kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc and b/kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc differ
 
kg_builder/src/__pycache__/models.cpython-39.pyc ADDED
Binary file (1.65 kB). View file
 
kg_builder/src/__pycache__/query_graph.cpython-39.pyc CHANGED
Binary files a/kg_builder/src/__pycache__/query_graph.cpython-39.pyc and b/kg_builder/src/__pycache__/query_graph.cpython-39.pyc differ
 
kg_builder/src/__pycache__/utils.cpython-39.pyc ADDED
Binary file (1.66 kB). View file
 
kg_builder/src/api_connections.py CHANGED
@@ -2,6 +2,12 @@
2
  from langchain_community.graphs import Neo4jGraph
3
  from dotenv import load_dotenv
4
  import os
 
 
 
 
 
 
5
 
6
  load_dotenv() # This loads the variables from .env into os.environ
7
 
@@ -17,4 +23,52 @@ graph = Neo4jGraph(
17
  password=password
18
  )
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
 
 
 
 
 
 
 
 
2
  from langchain_community.graphs import Neo4jGraph
3
  from dotenv import load_dotenv
4
  import os
5
+ from langchain.chains.openai_functions import create_structured_output_chain
6
+ from langchain_openai import ChatOpenAI
7
+ from langchain.prompts import ChatPromptTemplate
8
+ from models import KnowledgeGraph
9
+ from typing import Optional, List
10
+
11
 
12
  load_dotenv() # This loads the variables from .env into os.environ
13
 
 
23
  password=password
24
  )
25
 
26
+ def get_llm():
27
+ api_key = os.getenv("OPENAI_API_KEY")
28
+ if not api_key:
29
+ raise ValueError("No OpenAI API key found in environment variables.")
30
+ return ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
31
+
32
+ def get_extraction_chain(
33
+ allowed_nodes: Optional[List[str]] = None,
34
+ allowed_rels: Optional[List[str]] = None
35
+ ):
36
+ llm = get_llm()
37
+ prompt = ChatPromptTemplate.from_messages(
38
+ [(
39
+ "system",
40
+ f"""# Knowledge Graph Instructions for GPT-4
41
+ ## 1. Overview
42
+ You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
43
+ - **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
44
+ - The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.
45
+
46
+ ## 2. Labeling Nodes
47
+ - **Consistency**: Utilize uniform labels for node types to maintain clarity.
48
+ - For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**.
49
+ - **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
50
+
51
+ {'- **Allowed Node Labels:**' + ", ".join(['Drug', 'Symptom', 'Treatment', 'MedicalCondition', 'ResearchStudy']) if allowed_nodes else ""}
52
+ {'- **Allowed Relationship Types**:' + ", ".join(['Treats', 'Causes', 'Researches', 'Recommends']) if allowed_rels else ""}
53
+
54
+ ## 3. Handling Numerical Data and Dates
55
+ - Integrate numerical data and dates as attributes of the corresponding nodes.
56
+ - **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
57
+ - **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.
58
+
59
+ ## 4. Coreference Resolution
60
+ - **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
61
+ - For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.
62
+
63
+ ## 5. Relationship Naming Conventions
64
+ - **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
65
+ - For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination.
66
+ - **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.
67
 
68
+ ## 6. Strict Compliance
69
+ Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
70
+ """),
71
+ ("human", "Use the given format to extract information from the following input: {input}"),
72
+ ("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
73
+ ])
74
+ return create_structured_output_chain(KnowledgeGraph, llm, prompt)
kg_builder/src/knowledge_graph_builder.py CHANGED
@@ -1,5 +1,4 @@
1
 
2
- # Add to knowledge_graph_builder.py
3
  from api_connections import graph
4
 
5
  from langchain_community.graphs.graph_document import (
@@ -10,118 +9,16 @@ from langchain_community.graphs.graph_document import (
10
  from langchain.schema import Document
11
  from typing import List, Dict, Any, Optional
12
  from langchain.pydantic_v1 import Field, BaseModel
13
-
14
- class Property(BaseModel):
15
- """A single property consisting of key and value"""
16
- key: str = Field(..., description="key")
17
- value: str = Field(..., description="value")
18
-
19
- class Node(BaseNode):
20
- properties: Optional[List[Property]] = Field(
21
- None, description="List of node properties")
22
-
23
- class Relationship(BaseRelationship):
24
- properties: Optional[List[Property]] = Field(
25
- None, description="List of relationship properties"
26
- )
27
-
28
- class KnowledgeGraph(BaseModel):
29
- """Generate a knowledge graph with entities and relationships."""
30
- nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
31
- rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")
32
-
33
- def format_property_key(s: str) -> str:
34
- words = s.split()
35
- if not words:
36
- return s
37
- first_word = words[0].lower()
38
- capitalized_words = [word.capitalize() for word in words[1:]]
39
- return "".join([first_word] + capitalized_words)
40
-
41
- def props_to_dict(props) -> dict:
42
- """Convert properties to a dictionary."""
43
- properties = {}
44
- if not props:
45
- return properties
46
- for p in props:
47
- properties[format_property_key(p.key)] = p.value
48
- return properties
49
-
50
- def map_to_base_node(node: Node) -> BaseNode:
51
- """Map the KnowledgeGraph Node to the base Node."""
52
- properties = props_to_dict(node.properties) if node.properties else {}
53
- properties["name"] = node.id.title() # Assuming nodes have an 'id' attribute for this operation
54
- return BaseNode(
55
- id=node.id.title(), type=node.type.capitalize(), properties=properties
56
- )
57
-
58
- def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
59
- """Map the KnowledgeGraph Relationship to the base Relationship."""
60
- source = map_to_base_node(rel.source)
61
- target = map_to_base_node(rel.target)
62
- properties = props_to_dict(rel.properties) if rel.properties else {}
63
- return BaseRelationship(
64
- source=source, target=target, type=rel.type, properties=properties
65
- )
66
-
67
- import os
68
- from dotenv import load_dotenv
69
- load_dotenv() # This loads the variables from .env into os.environ
70
 
71
  from langchain.chains.openai_functions import (
72
  create_openai_fn_chain,
73
  create_structured_output_runnable,
74
  create_structured_output_chain,
75
  )
76
- from langchain_openai import ChatOpenAI
77
- from langchain.prompts import ChatPromptTemplate
78
-
79
- # Setting the OpenAI API key for usage in LLM calls
80
- os.environ["OPENAI_API_KEY"]
81
- llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
82
-
83
- def get_extraction_chain(
84
- allowed_nodes: Optional[List[str]] = None,
85
- allowed_rels: Optional[List[str]] = None
86
- ):
87
- prompt = ChatPromptTemplate.from_messages(
88
- [(
89
- "system",
90
- f"""# Knowledge Graph Instructions for GPT-4
91
- ## 1. Overview
92
- You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
93
- - **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
94
- - The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.
95
-
96
- ## 2. Labeling Nodes
97
- - **Consistency**: Utilize uniform labels for node types to maintain clarity.
98
- - For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**.
99
- - **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
100
-
101
- {'- **Allowed Node Labels:**' + ", ".join(['Drug', 'Symptom', 'Treatment', 'MedicalCondition', 'ResearchStudy']) if allowed_nodes else ""}
102
- {'- **Allowed Relationship Types**:' + ", ".join(['Treats', 'Causes', 'Researches', 'Recommends']) if allowed_rels else ""}
103
-
104
- ## 3. Handling Numerical Data and Dates
105
- - Integrate numerical data and dates as attributes of the corresponding nodes.
106
- - **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
107
- - **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.
108
-
109
- ## 4. Coreference Resolution
110
- - **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
111
- - For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.
112
-
113
- ## 5. Relationship Naming Conventions
114
- - **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
115
- - For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination.
116
- - **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.
117
 
118
- ## 6. Strict Compliance
119
- Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
120
- """),
121
- ("human", "Use the given format to extract information from the following input: {input}"),
122
- ("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
123
- ])
124
- return create_structured_output_chain(KnowledgeGraph, llm, prompt)
125
 
126
  def extract_and_store_graph(
127
  document: Document,
 
1
 
 
2
  from api_connections import graph
3
 
4
  from langchain_community.graphs.graph_document import (
 
9
  from langchain.schema import Document
10
  from typing import List, Dict, Any, Optional
11
  from langchain.pydantic_v1 import Field, BaseModel
12
+ from models import Node, Relationship, KnowledgeGraph
13
+ from utils import map_to_base_node, map_to_base_relationship
14
+ from api_connections import get_extraction_chain
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  from langchain.chains.openai_functions import (
17
  create_openai_fn_chain,
18
  create_structured_output_runnable,
19
  create_structured_output_chain,
20
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
 
 
 
 
 
 
 
22
 
23
  def extract_and_store_graph(
24
  document: Document,
kg_builder/src/models.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.graphs.graph_document import (
2
+ Node as BaseNode,
3
+ Relationship as BaseRelationship,
4
+ GraphDocument,
5
+ )
6
+ from langchain.schema import Document
7
+ from typing import List, Dict, Any, Optional
8
+ from langchain.pydantic_v1 import Field, BaseModel
9
+
10
+ class Property(BaseModel):
11
+ """A single property consisting of key and value"""
12
+ key: str = Field(..., description="key")
13
+ value: str = Field(..., description="value")
14
+
15
+ class Node(BaseNode):
16
+ properties: Optional[List[Property]] = Field(
17
+ None, description="List of node properties")
18
+
19
+ class Relationship(BaseRelationship):
20
+ properties: Optional[List[Property]] = Field(
21
+ None, description="List of relationship properties"
22
+ )
23
+
24
+ class KnowledgeGraph(BaseModel):
25
+ """Generate a knowledge graph with entities and relationships."""
26
+ nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
27
+ rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")
kg_builder/src/utils.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.graphs.graph_document import (
2
+ Node as BaseNode,
3
+ Relationship as BaseRelationship,
4
+ GraphDocument,
5
+ )
6
+ from models import Node, Relationship
7
+
8
+ def format_property_key(s: str) -> str:
9
+ words = s.split()
10
+ if not words:
11
+ return s
12
+ first_word = words[0].lower()
13
+ capitalized_words = [word.capitalize() for word in words[1:]]
14
+ return "".join([first_word] + capitalized_words)
15
+
16
+ def props_to_dict(props) -> dict:
17
+ """Convert properties to a dictionary."""
18
+ properties = {}
19
+ if not props:
20
+ return properties
21
+ for p in props:
22
+ properties[format_property_key(p.key)] = p.value
23
+ return properties
24
+
25
+ def map_to_base_node(node: Node) -> BaseNode:
26
+ """Map the KnowledgeGraph Node to the base Node."""
27
+ properties = props_to_dict(node.properties) if node.properties else {}
28
+ properties["name"] = node.id.title() # Assuming nodes have an 'id' attribute for this operation
29
+ return BaseNode(
30
+ id=node.id.title(), type=node.type.capitalize(), properties=properties
31
+ )
32
+
33
+ def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
34
+ """Map the KnowledgeGraph Relationship to the base Relationship."""
35
+ source = map_to_base_node(rel.source)
36
+ target = map_to_base_node(rel.target)
37
+ properties = props_to_dict(rel.properties) if rel.properties else {}
38
+ return BaseRelationship(
39
+ source=source, target=target, type=rel.type, properties=properties
40
+ )