syedsalma2003 committed on
Commit
0723c36
·
1 Parent(s): 00947f0

Migrate database to Neo4j Aura for permanent deployment

Browse files
Files changed (4) hide show
  1. .streamlit/secrets.toml +4 -11
  2. app.py +35 -19
  3. config.py +99 -8
  4. requirements.txt +5 -7
.streamlit/secrets.toml CHANGED
@@ -1,17 +1,10 @@
1
  GOOGLE_API_KEY = "AIzaSyBXdf7KxATDxLDbPZWRhqgZZHgq78_dYDY"
2
 
3
 
4
- # --- ArangoDB Credentials ---
5
- # Use the "APPLICATION ENDPOINT" from your screenshot
6
- ARANGO_HOST = "https://f35a5e8cb378.arangodb.cloud:8529"
7
-
8
- # The database we will create/use inside the deployment
9
- ARANGO_DATABASE = "llm_graph"
10
 
11
- # The default superuser for ArangoDB
12
- ARANGO_USER = "root"
13
 
14
- # Get this by clicking the "eye" icon in your screenshot
15
- ARANGO_PASSWORD = "WkXOGo6mpWF6EYMNEHoJ"
 
16
 
17
- GOOGLE_CSE_ID = "44c6f5678e40a4b20"
 
1
  GOOGLE_API_KEY = "AIzaSyBXdf7KxATDxLDbPZWRhqgZZHgq78_dYDY"
2
 
3
 
4
+ GOOGLE_CSE_ID = "44c6f5678e40a4b20"
 
 
 
 
 
5
 
 
 
6
 
7
+ NEO4J_URI = "neo4j+s://0508fa6e.databases.neo4j.io"
8
+ NEO4J_USERNAME = "neo4j"
9
+ NEO4J_PASSWORD = "bjauiIAEV9NY0nELhcfMlTjYU2hs75Sqh_qw9Uaki94"
10
 
 
app.py CHANGED
@@ -2,9 +2,10 @@
2
 
3
  import streamlit as st
4
  from config import get_llm, get_embeddings_model
5
- from data_processing import process_uploaded_pdf # Simplified import
6
  from vector_store import create_faiss_vector_store
7
- from graph_db import get_arangodb_graph, populate_graph_from_docs, get_graph_qa_chain
 
8
  from qa_chain import generate_response
9
  from visualization import visualize_graph_from_query
10
  from google.api_core.exceptions import ResourceExhausted
@@ -15,7 +16,8 @@ st.title("🧠 AI Research Partner")
15
  # --- Initialize Models and Connections ---
16
  llm = get_llm()
17
  embeddings = get_embeddings_model()
18
- graph = get_arangodb_graph()
 
19
 
20
  # --- Session State Management ---
21
  if "docs" not in st.session_state:
@@ -29,30 +31,42 @@ if "processed_sources" not in st.session_state:
29
 
30
  # --- UI: Sidebar for Data Ingestion ---
31
  with st.sidebar:
32
- st.header("1. Upload Documents")
33
 
34
- # PDF Uploader is now the only input method
35
  uploaded_files = st.file_uploader(
36
  "Upload PDF documents",
37
  type="pdf",
38
  accept_multiple_files=True
39
  )
 
40
 
41
- if st.button("Process Documents"):
42
- if not uploaded_files:
43
- st.warning("Please upload at least one PDF.")
44
  else:
45
  new_docs = []
46
  with st.spinner("Processing documents... This may take a few minutes."):
47
  try:
48
- for file in uploaded_files:
49
- if file.name not in st.session_state.processed_sources:
50
- st.info(f"Processing PDF: {file.name}")
51
- pdf_docs = process_uploaded_pdf(file)
52
- if pdf_docs: # Ensure docs were processed
 
 
 
 
53
  populate_graph_from_docs(graph, pdf_docs, llm, file.name)
54
  new_docs.extend(pdf_docs)
55
  st.session_state.processed_sources.add(file.name)
 
 
 
 
 
 
 
 
56
  except ResourceExhausted:
57
  st.error("API Quota Reached during processing. Please try again tomorrow.")
58
  except Exception as e:
@@ -66,9 +80,9 @@ with st.sidebar:
66
  else:
67
  st.info("No new documents to process.")
68
 
69
- st.header("Processed Documents")
70
- st.markdown(f"**{len(st.session_state.processed_sources)}** documents loaded.")
71
- with st.expander("View Documents"):
72
  for source in st.session_state.processed_sources:
73
  st.write(f"- {source}")
74
 
@@ -79,7 +93,6 @@ st.header("2. Ask Your Research Question")
79
  for i, message in enumerate(st.session_state.messages):
80
  with st.chat_message(message["role"]):
81
  if isinstance(message["content"], dict):
82
- # Re-display the tabbed output from history
83
  tab_list = [
84
  "✅ Main Answer", " perspectives", "🔬 Analytical Insights",
85
  "💡 Creative Insights", "🔎 Recommendations", "📚 Sources & Details"
@@ -133,9 +146,12 @@ if prompt := st.chat_input("Ask a question about your documents..."):
133
  st.subheader("Document Sources (from Vector Search)")
134
  st.text_area("Semantic Context", result["semantic_sources"], height=200)
135
  st.subheader("Knowledge Graph Context")
136
- st.code(result["graph_source"], language="sql")
137
  st.subheader("Visual Knowledge Map")
138
- visualize_graph_from_query(graph, result["graph_source"])
 
 
 
139
 
140
  st.session_state.messages.append({"role": "assistant", "content": result})
141
 
 
2
 
3
  import streamlit as st
4
  from config import get_llm, get_embeddings_model
5
+ from data_processing import process_uploaded_pdf, process_url
6
  from vector_store import create_faiss_vector_store
7
+ # Import the correct Neo4j functions from graph_db
8
+ from graph_db import get_neo4j_graph, populate_graph_from_docs, get_graph_qa_chain
9
  from qa_chain import generate_response
10
  from visualization import visualize_graph_from_query
11
  from google.api_core.exceptions import ResourceExhausted
 
16
  # --- Initialize Models and Connections ---
17
  llm = get_llm()
18
  embeddings = get_embeddings_model()
19
+ # Initialize the Neo4j graph connection
20
+ graph = get_neo4j_graph()
21
 
22
  # --- Session State Management ---
23
  if "docs" not in st.session_state:
 
31
 
32
  # --- UI: Sidebar for Data Ingestion ---
33
  with st.sidebar:
34
+ st.header("1. Add Data Sources")
35
 
 
36
  uploaded_files = st.file_uploader(
37
  "Upload PDF documents",
38
  type="pdf",
39
  accept_multiple_files=True
40
  )
41
+ url_input = st.text_input("Or enter a website URL")
42
 
43
+ if st.button("Process Sources"):
44
+ if not uploaded_files and not url_input:
45
+ st.warning("Please upload a PDF or enter a URL.")
46
  else:
47
  new_docs = []
48
  with st.spinner("Processing documents... This may take a few minutes."):
49
  try:
50
+ # Clear the entire Neo4j database before processing new files
51
+ st.info("Clearing old graph data...")
52
+ graph.query("MATCH (n) DETACH DELETE n")
53
+
54
+ if uploaded_files:
55
+ for file in uploaded_files:
56
+ if file.name not in st.session_state.processed_sources:
57
+ st.info(f"Processing PDF: {file.name}")
58
+ pdf_docs = process_uploaded_pdf(file)
59
  populate_graph_from_docs(graph, pdf_docs, llm, file.name)
60
  new_docs.extend(pdf_docs)
61
  st.session_state.processed_sources.add(file.name)
62
+
63
+ if url_input and url_input not in st.session_state.processed_sources:
64
+ st.info(f"Processing URL: {url_input}")
65
+ url_docs = process_url(url_input)
66
+ populate_graph_from_docs(graph, url_docs, llm, url_input)
67
+ new_docs.extend(url_docs)
68
+ st.session_state.processed_sources.add(url_input)
69
+
70
  except ResourceExhausted:
71
  st.error("API Quota Reached during processing. Please try again tomorrow.")
72
  except Exception as e:
 
80
  else:
81
  st.info("No new documents to process.")
82
 
83
+ st.header("Processed Sources")
84
+ st.markdown(f"**{len(st.session_state.processed_sources)}** sources loaded.")
85
+ with st.expander("View Sources"):
86
  for source in st.session_state.processed_sources:
87
  st.write(f"- {source}")
88
 
 
93
  for i, message in enumerate(st.session_state.messages):
94
  with st.chat_message(message["role"]):
95
  if isinstance(message["content"], dict):
 
96
  tab_list = [
97
  "✅ Main Answer", " perspectives", "🔬 Analytical Insights",
98
  "💡 Creative Insights", "🔎 Recommendations", "📚 Sources & Details"
 
146
  st.subheader("Document Sources (from Vector Search)")
147
  st.text_area("Semantic Context", result["semantic_sources"], height=200)
148
  st.subheader("Knowledge Graph Context")
149
+ st.code(result["graph_source"], language="sql") # Displaying the Cypher query result
150
  st.subheader("Visual Knowledge Map")
151
+ # Note: Visualization may not work as well with the Cypher chain's text output.
152
+ # This is a known area for future improvement.
153
+ st.warning("Visualization is experimental and may not render for all queries.")
154
+ # visualize_graph_from_query(graph, result["graph_source"]) # Commented out for stability
155
 
156
  st.session_state.messages.append({"role": "assistant", "content": result})
157
 
config.py CHANGED
@@ -1,19 +1,17 @@
1
- # config.py
2
 
3
  import streamlit as st
4
  from langchain_google_genai import ChatGoogleGenerativeAI
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
 
7
  # --- LLM and EMBEDDING MODELS ---
8
- LLM_MODEL_NAME = "gemini-2.5-pro"
9
-
10
  EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
11
 
12
- # --- NEW: ARANGODB DATABASE ---
13
- ARANGO_HOST = st.secrets.get("ARANGO_HOST")
14
- ARANGO_DATABASE = st.secrets.get("ARANGO_DATABASE")
15
- ARANGO_USER = st.secrets.get("ARANGO_USER")
16
- ARANGO_PASSWORD = st.secrets.get("ARANGO_PASSWORD")
17
 
18
  # --- GLOBAL INITIALIZATIONS ---
19
  @st.cache_resource
@@ -23,3 +21,96 @@ def get_llm():
23
  @st.cache_resource
24
  def get_embeddings_model():
25
  return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # config.py (Final Neo4j Version)
2
 
3
  import streamlit as st
4
  from langchain_google_genai import ChatGoogleGenerativeAI
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
 
7
  # --- LLM and EMBEDDING MODELS ---
8
+ LLM_MODEL_NAME = "gemini-1.5-flash-latest"
 
9
  EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
10
 
11
+ # --- NEW: NEO4J DATABASE ---
12
+ NEO4J_URI = st.secrets.get("NEO4J_URI")
13
+ NEO4J_USERNAME = st.secrets.get("NEO4J_USERNAME")
14
+ NEO4J_PASSWORD = st.secrets.get("NEO4J_PASSWORD")
 
15
 
16
  # --- GLOBAL INITIALIZATIONS ---
17
  @st.cache_resource
 
21
  @st.cache_resource
22
  def get_embeddings_model():
23
  return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
24
+
25
+ # --- graph_db.py: Neo4j graph helpers ---
26
+
27
+ # graph_db.py (Final Neo4j Version)
28
+
29
+ from langchain_neo4j import Neo4jGraph
30
+ from langchain.chains import GraphCypherQAChain
31
+ from langchain.prompts import PromptTemplate
32
+ from langchain_core.output_parsers import JsonOutputParser
33
+ from schemas import TripletList
34
+ from config import NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD
35
+ import streamlit as st
36
+
37
@st.cache_resource
def get_neo4j_graph():
    """Build the Neo4j graph client from the configured connection settings.

    The function is decorated with ``st.cache_resource``, so Streamlit is
    expected to hand back the same client on later calls instead of opening
    a new connection each time.

    Returns:
        A ``Neo4jGraph`` created from ``NEO4J_URI``, ``NEO4J_USERNAME`` and
        ``NEO4J_PASSWORD``.
    """
    connection_settings = {
        "url": NEO4J_URI,
        "username": NEO4J_USERNAME,
        "password": NEO4J_PASSWORD,
    }
    return Neo4jGraph(**connection_settings)
45
+
46
def format_cypher_relationship(rel: str) -> str:
    """Sanitizes a string for use as a Cypher relationship type.

    Every character that is not a letter, digit or underscore is replaced
    with an underscore and the result is upper-cased. Spaces and hyphens are
    therefore handled exactly as before, while punctuation -- in particular
    backticks, which would terminate a backtick-quoted relationship type and
    allow arbitrary Cypher to follow -- can no longer reach the query text.

    Args:
        rel: Raw relation text (for example, as extracted by an LLM).

    Returns:
        The sanitized relationship type. If ``rel`` contains no letters or
        digits at all (including the empty string), ``"RELATED_TO"`` is
        returned so that the caller never builds an empty relationship type.
    """
    sanitized = "".join(
        ch if ch.isalnum() or ch == "_" else "_" for ch in rel
    ).upper()

    # Nothing meaningful survived sanitization: fall back to a generic type
    # rather than emitting an empty or underscore-only identifier.
    if not sanitized.strip("_"):
        return "RELATED_TO"
    return sanitized
50
+
51
def populate_graph_from_docs(graph, docs, llm, source_name: str):
    """
    Extracts entities and relationships and populates the Neo4j graph using MERGE.

    Each chunk is sent through a prompt -> LLM -> JSON-parser chain, the
    result is validated as a ``TripletList``, and every (head, relation, tail)
    triplet is MERGEd into the graph so that re-processing the same content
    does not create duplicate nodes or relationships.

    Args:
        graph: Graph client exposing ``query(cypher, params=...)``.
        docs: Sized sequence of chunks exposing ``page_content``. ``None`` or
            an empty sequence is reported with a warning and skipped.
        llm: Model placed between the extraction prompt and the JSON parser.
        source_name: Value stored in the ``source`` property of nodes and
            relationships created by this call; also used in UI messages.

    Returns:
        None. Progress, per-chunk failures and the final outcome are reported
        through Streamlit widgets.
    """
    if not docs:
        st.warning(f"No content was extracted from '{source_name}'; nothing to add to the graph.")
        return

    # Create an index for faster lookups on the 'id' property of nodes
    graph.query("CREATE INDEX IF NOT EXISTS FOR (n:Entity) ON (n.id)")

    extraction_prompt = PromptTemplate.from_template(
        """
    You are an expert data analyst... (Prompt is the same as before)
    TEXT:
    {chunk}
    """
    )

    extraction_chain = extraction_prompt | llm | JsonOutputParser()

    # MERGE is Neo4j's equivalent of UPSERT. The literal Cypher map braces are
    # doubled because this template goes through str.format(); with single
    # braces, format() treats "{id: $head}" as a replacement field and raises
    # KeyError('id') for every triplet. Only {relation} is substituted; the
    # values themselves are passed as query parameters.
    merge_template = """
    MERGE (h:Entity {{id: $head}})
    ON CREATE SET h.source = $source
    MERGE (t:Entity {{id: $tail}})
    ON CREATE SET t.source = $source
    MERGE (h)-[r:`{relation}`]->(t)
    ON CREATE SET r.source = $source
    """

    st.write(f"Extracting knowledge from '{source_name}' and populating graph...")
    progress_bar = st.progress(0)

    total_chunks = len(docs)
    failed_chunks = 0

    for i, doc in enumerate(docs):
        try:
            extracted_json = extraction_chain.invoke({"chunk": doc.page_content})
            validated_triplets = TripletList.parse_obj(extracted_json)

            for triplet in validated_triplets.triplets:
                # Relationship types cannot be parameterized, so the type is
                # interpolated. Double any backtick so the value cannot close
                # the backtick-quoted identifier.
                relation_type = format_cypher_relationship(triplet.relation).replace("`", "``")

                graph.query(
                    merge_template.format(relation=relation_type),
                    params={
                        "head": triplet.head,
                        "tail": triplet.tail,
                        "source": source_name,
                    },
                )

        except Exception as e:
            # Best-effort ingestion: one bad chunk must not abort the rest.
            # NOTE(review): this also swallows provider quota errors raised by
            # the LLM call, so a caller-level quota handler may never see them
            # -- confirm whether those should be re-raised.
            failed_chunks += 1
            st.error(f"Failed to process chunk {i+1}. Error: {e}")

        # Advance the bar for failed chunks too, so it always reaches 100%.
        progress_bar.progress((i + 1) / total_chunks, text=f"Processing chunk {i+1}/{total_chunks}")

    if failed_chunks:
        st.warning(
            f"Knowledge from '{source_name}' was only partially added: "
            f"{failed_chunks} of {total_chunks} chunks failed."
        )
    else:
        st.success(f"Knowledge from '{source_name}' has been added to the graph!")
107
+
108
def get_graph_qa_chain(graph, llm):
    """Build a question-answering chain over the Neo4j graph.

    Args:
        graph: Graph client; its ``refresh_schema()`` is called before the
            chain is constructed.
        llm: Language model handed to the chain.

    Returns:
        The chain produced by ``GraphCypherQAChain.from_llm``.
    """
    # Refresh first so the chain is built against the graph's current schema.
    graph.refresh_schema()

    chain_options = {
        "graph": graph,
        "llm": llm,
        "verbose": True,
        "allow_dangerous_requests": True,
    }
    qa_chain = GraphCypherQAChain.from_llm(**chain_options)
    return qa_chain
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- # requirements.txt (Final, Corrected for Deployment)
2
 
3
  # --- Core Frameworks ---
4
  streamlit>=1.33.0
@@ -9,8 +9,6 @@ langchain-community>=0.0.34
9
  # --- LLM & Embeddings ---
10
  langchain-google-genai>=1.0.1
11
  google-generativeai>=0.4.1
12
-
13
- # --- Core Scientific & ML Libraries (No version pins for better cross-platform compatibility) ---
14
  numpy
15
  scikit-learn
16
  sentence-transformers>=2.2.2
@@ -22,12 +20,12 @@ beautifulsoup4>=4.12.3
22
  # --- Databases (Vector & Graph) ---
23
  faiss-cpu>=1.7.4
24
 
25
- # --- ArangoDB Packages ---
26
- python-arango>=7.8.0
27
- langchain-arangodb>=0.1.0
28
 
29
  # --- Web Search & Visualization ---
30
- # langchain-community is already listed above
31
  streamlit-agraph>=0.0.38
32
 
33
  # --- Utilities ---
 
1
+ # requirements.txt (Final for Neo4j Deployment)
2
 
3
  # --- Core Frameworks ---
4
  streamlit>=1.33.0
 
9
  # --- LLM & Embeddings ---
10
  langchain-google-genai>=1.0.1
11
  google-generativeai>=0.4.1
 
 
12
  numpy
13
  scikit-learn
14
  sentence-transformers>=2.2.2
 
20
  # --- Databases (Vector & Graph) ---
21
  faiss-cpu>=1.7.4
22
 
23
+ # --- NEW: Neo4j Packages ---
24
+ neo4j>=5.18.0
25
+ langchain-neo4j>=0.0.5
26
 
27
  # --- Web Search & Visualization ---
28
+ langchain-community>=0.0.34
29
  streamlit-agraph>=0.0.38
30
 
31
  # --- Utilities ---