Spaces:

SALMA003
/

ai-research-partner

Sleeping

App Files Files Community

syedsalma2003 committed on Aug 29

Commit

0723c36

1 Parent(s): 00947f0

Migrate database to Neo4j Aura for permanent deployment

Browse files

Files changed (4) hide show

.streamlit/secrets.toml +4 -11
app.py +35 -19
config.py +99 -8
requirements.txt +5 -7

.streamlit/secrets.toml CHANGED Viewed

@@ -1,17 +1,10 @@
 GOOGLE_API_KEY = "AIzaSyBXdf7KxATDxLDbPZWRhqgZZHgq78_dYDY"
-# --- ArangoDB Credentials ---
-# Use the "APPLICATION ENDPOINT" from your screenshot
-ARANGO_HOST = "https://f35a5e8cb378.arangodb.cloud:8529"
-# The database we will create/use inside the deployment
-ARANGO_DATABASE = "llm_graph"
-# The default superuser for ArangoDB
-ARANGO_USER = "root"
-# Get this by clicking the "eye" icon in your screenshot
-ARANGO_PASSWORD = "WkXOGo6mpWF6EYMNEHoJ"
-GOOGLE_CSE_ID = "44c6f5678e40a4b20"

 GOOGLE_API_KEY = "AIzaSyBXdf7KxATDxLDbPZWRhqgZZHgq78_dYDY"
+GOOGLE_CSE_ID = "44c6f5678e40a4b20"
+NEO4J_URI = "neo4j+s://0508fa6e.databases.neo4j.io"
+NEO4J_USERNAME = "neo4j"
+NEO4J_PASSWORD = "bjauiIAEV9NY0nELhcfMlTjYU2hs75Sqh_qw9Uaki94"

app.py CHANGED Viewed

@@ -2,9 +2,10 @@
 import streamlit as st
 from config import get_llm, get_embeddings_model
-from data_processing import process_uploaded_pdf # Simplified import
 from vector_store import create_faiss_vector_store
-from graph_db import get_arangodb_graph, populate_graph_from_docs, get_graph_qa_chain
 from qa_chain import generate_response
 from visualization import visualize_graph_from_query
 from google.api_core.exceptions import ResourceExhausted
@@ -15,7 +16,8 @@ st.title("🧠 AI Research Partner")
 # --- Initialize Models and Connections ---
 llm = get_llm()
 embeddings = get_embeddings_model()
-graph = get_arangodb_graph()
 # --- Session State Management ---
 if "docs" not in st.session_state:
@@ -29,30 +31,42 @@ if "processed_sources" not in st.session_state:
 # --- UI: Sidebar for Data Ingestion ---
 with st.sidebar:
-    st.header("1. Upload Documents")
-    # PDF Uploader is now the only input method
     uploaded_files = st.file_uploader(
         "Upload PDF documents",
         type="pdf",
         accept_multiple_files=True
     )
-    if st.button("Process Documents"):
-        if not uploaded_files:
-            st.warning("Please upload at least one PDF.")
         else:
             new_docs = []
             with st.spinner("Processing documents... This may take a few minutes."):
                 try:
-                    for file in uploaded_files:
-                        if file.name not in st.session_state.processed_sources:
-                            st.info(f"Processing PDF: {file.name}")
-                            pdf_docs = process_uploaded_pdf(file)
-                            if pdf_docs: # Ensure docs were processed
                                 populate_graph_from_docs(graph, pdf_docs, llm, file.name)
                                 new_docs.extend(pdf_docs)
                                 st.session_state.processed_sources.add(file.name)
                 except ResourceExhausted:
                     st.error("API Quota Reached during processing. Please try again tomorrow.")
                 except Exception as e:
@@ -66,9 +80,9 @@ with st.sidebar:
             else:
                 st.info("No new documents to process.")
-    st.header("Processed Documents")
-    st.markdown(f"**{len(st.session_state.processed_sources)}** documents loaded.")
-    with st.expander("View Documents"):
         for source in st.session_state.processed_sources:
             st.write(f"- {source}")
@@ -79,7 +93,6 @@ st.header("2. Ask Your Research Question")
 for i, message in enumerate(st.session_state.messages):
     with st.chat_message(message["role"]):
         if isinstance(message["content"], dict):
-            # Re-display the tabbed output from history
             tab_list = [
                 "✅ Main Answer", " perspectives", "🔬 Analytical Insights",
                 "💡 Creative Insights", "🔎 Recommendations", "📚 Sources & Details"
@@ -133,9 +146,12 @@ if prompt := st.chat_input("Ask a question about your documents..."):
                         st.subheader("Document Sources (from Vector Search)")
                         st.text_area("Semantic Context", result["semantic_sources"], height=200)
                         st.subheader("Knowledge Graph Context")
-                        st.code(result["graph_source"], language="sql")
                         st.subheader("Visual Knowledge Map")
-                        visualize_graph_from_query(graph, result["graph_source"])
                     st.session_state.messages.append({"role": "assistant", "content": result})

 import streamlit as st
 from config import get_llm, get_embeddings_model
+from data_processing import process_uploaded_pdf, process_url
 from vector_store import create_faiss_vector_store
+# Import the correct Neo4j functions from graph_db
+from graph_db import get_neo4j_graph, populate_graph_from_docs, get_graph_qa_chain
 from qa_chain import generate_response
 from visualization import visualize_graph_from_query
 from google.api_core.exceptions import ResourceExhausted
 # --- Initialize Models and Connections ---
 llm = get_llm()
 embeddings = get_embeddings_model()
+# Initialize the Neo4j graph connection
+graph = get_neo4j_graph()
 # --- Session State Management ---
 if "docs" not in st.session_state:
 # --- UI: Sidebar for Data Ingestion ---
 with st.sidebar:
+    st.header("1. Add Data Sources")
     uploaded_files = st.file_uploader(
         "Upload PDF documents",
         type="pdf",
         accept_multiple_files=True
     )
+    url_input = st.text_input("Or enter a website URL")
+    if st.button("Process Sources"):
+        if not uploaded_files and not url_input:
+            st.warning("Please upload a PDF or enter a URL.")
         else:
             new_docs = []
             with st.spinner("Processing documents... This may take a few minutes."):
                 try:
+                    # Clear the entire Neo4j database before processing new files
+                    st.info("Clearing old graph data...")
+                    graph.query("MATCH (n) DETACH DELETE n")
+                    if uploaded_files:
+                        for file in uploaded_files:
+                            if file.name not in st.session_state.processed_sources:
+                                st.info(f"Processing PDF: {file.name}")
+                                pdf_docs = process_uploaded_pdf(file)
                                 populate_graph_from_docs(graph, pdf_docs, llm, file.name)
                                 new_docs.extend(pdf_docs)
                                 st.session_state.processed_sources.add(file.name)
+                    if url_input and url_input not in st.session_state.processed_sources:
+                        st.info(f"Processing URL: {url_input}")
+                        url_docs = process_url(url_input)
+                        populate_graph_from_docs(graph, url_docs, llm, url_input)
+                        new_docs.extend(url_docs)
+                        st.session_state.processed_sources.add(url_input)
                 except ResourceExhausted:
                     st.error("API Quota Reached during processing. Please try again tomorrow.")
                 except Exception as e:
             else:
                 st.info("No new documents to process.")
+    st.header("Processed Sources")
+    st.markdown(f"**{len(st.session_state.processed_sources)}** sources loaded.")
+    with st.expander("View Sources"):
         for source in st.session_state.processed_sources:
             st.write(f"- {source}")
 for i, message in enumerate(st.session_state.messages):
     with st.chat_message(message["role"]):
         if isinstance(message["content"], dict):
             tab_list = [
                 "✅ Main Answer", " perspectives", "🔬 Analytical Insights",
                 "💡 Creative Insights", "🔎 Recommendations", "📚 Sources & Details"
                         st.subheader("Document Sources (from Vector Search)")
                         st.text_area("Semantic Context", result["semantic_sources"], height=200)
                         st.subheader("Knowledge Graph Context")
+                        st.code(result["graph_source"], language="sql") # Displaying the Cypher query result
                         st.subheader("Visual Knowledge Map")
+                        # Note: Visualization may not work as well with the Cypher chain's text output.
+                        # This is a known area for future improvement.
+                        st.warning("Visualization is experimental and may not render for all queries.")
+                        # visualize_graph_from_query(graph, result["graph_source"]) # Commented out for stability
                     st.session_state.messages.append({"role": "assistant", "content": result})

config.py CHANGED Viewed

@@ -1,19 +1,17 @@
-# config.py
 import streamlit as st
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_community.embeddings import HuggingFaceEmbeddings
 # --- LLM and EMBEDDING MODELS ---
-LLM_MODEL_NAME = "gemini-2.5-pro"
 EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
-# --- NEW: ARANGODB DATABASE ---
-ARANGO_HOST = st.secrets.get("ARANGO_HOST")
-ARANGO_DATABASE = st.secrets.get("ARANGO_DATABASE")
-ARANGO_USER = st.secrets.get("ARANGO_USER")
-ARANGO_PASSWORD = st.secrets.get("ARANGO_PASSWORD")
 # --- GLOBAL INITIALIZATIONS ---
 @st.cache_resource
@@ -23,3 +21,96 @@ def get_llm():
 @st.cache_resource
 def get_embeddings_model():
     """Return a HuggingFaceEmbeddings instance for EMBEDDING_MODEL_NAME."""
     return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

+# config.py (Final Neo4j Version)
 import streamlit as st
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_community.embeddings import HuggingFaceEmbeddings
 # --- LLM and EMBEDDING MODELS ---
+LLM_MODEL_NAME = "gemini-1.5-flash-latest"
 EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+# --- NEW: NEO4J DATABASE ---
+NEO4J_URI = st.secrets.get("NEO4J_URI")
+NEO4J_USERNAME = st.secrets.get("NEO4J_USERNAME")
+NEO4J_PASSWORD = st.secrets.get("NEO4J_PASSWORD")
 # --- GLOBAL INITIALIZATIONS ---
 @st.cache_resource
 @st.cache_resource
 def get_embeddings_model():
     """Return a HuggingFaceEmbeddings instance for EMBEDDING_MODEL_NAME."""
     return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
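+# --- B. graph_db.py (The Biggest Change) ---
+# NOTE(review): the code below looks like graph_db.py content pasted into config.py;
+# app.py imports these functions from graph_db, so confirm it also lives there.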
+# graph_db.py (Final Neo4j Version)
+from langchain_neo4j import Neo4jGraph
+from langchain.chains import GraphCypherQAChain
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import JsonOutputParser
+from schemas import TripletList
+from config import NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD
+import streamlit as st
+@st.cache_resource
+def get_neo4j_graph():
+    """Build the Neo4jGraph connection from the configured URI and credentials."""
+    connection_settings = {
+        "url": NEO4J_URI,
+        "username": NEO4J_USERNAME,
+        "password": NEO4J_PASSWORD,
+    }
+    return Neo4jGraph(**connection_settings)
+def format_cypher_relationship(rel: str) -> str:
+    """Turn free-form relation text into a safe Cypher relationship type.
+
+    Every non-word character (spaces, hyphens, backticks, punctuation, ...)
+    becomes an underscore and the result is upper-cased, e.g.
+    ``"works at"`` -> ``"WORKS_AT"``. Removing backticks matters because the
+    result is placed inside a backtick-quoted identifier in the query text.
+
+    Returns ``"RELATED_TO"`` when nothing but underscores would remain
+    (empty or punctuation-only input), since an empty type is invalid.
+    """
+    # Imported locally so the function does not rely on a module-level import.
+    import re
+
+    cleaned = re.sub(r"\W", "_", rel).upper()
+    return cleaned if cleaned.strip("_") else "RELATED_TO"
+def populate_graph_from_docs(graph, docs, llm, source_name: str):
+    """
+    Extract triplets from each document chunk and MERGE them into the Neo4j graph.
+
+    Args:
+        graph: Graph object exposing ``query(cypher, params=...)``.
+        docs: Chunks with a ``page_content`` attribute; ``None``/empty is a no-op.
+        llm: Model piped after the extraction prompt.
+        source_name: Stored as the ``source`` property on nodes and
+            relationships created by this call.
+
+    A failure inside one chunk is reported with ``st.error`` and processing
+    continues with the next chunk.
+    """
+    if not docs:
+        st.warning(f"No content found in '{source_name}'; nothing was added to the graph.")
+        return
+    # Create an index for faster lookups on the 'id' property of nodes
+    graph.query("CREATE INDEX IF NOT EXISTS FOR (n:Entity) ON (n.id)")
+    # NOTE(review): the prompt body is a placeholder ("same as before") -- restore
+    # the full extraction instructions before relying on the output.
+    extraction_prompt = PromptTemplate.from_template(
+        """
+        You are an expert data analyst... (Prompt is the same as before)
+        TEXT:
+        {chunk}
+        """
+    )
+    extraction_chain = extraction_prompt | llm | JsonOutputParser()
+    st.write(f"Extracting knowledge from '{source_name}' and populating graph...")
+    progress_bar = st.progress(0)
+    total_chunks = len(docs)
+    for i, doc in enumerate(docs):
+        try:
+            extracted_json = extraction_chain.invoke({"chunk": doc.page_content})
+            validated_triplets = TripletList.parse_obj(extracted_json)
+            for triplet in validated_triplets.triplets:
+                # The relationship type is spliced into the query text (only
+                # head/tail/source travel as parameters). A backtick would end
+                # the quoted type, so drop any that survive sanitizing.
+                relation_type = format_cypher_relationship(triplet.relation).replace("`", "")
+                # Built by concatenation, not str.format(): the Cypher map
+                # literals ({id: $head}) contain braces that format() would
+                # treat as replacement fields and fail on.
+                # MERGE creates nodes/relationships only if they do not exist yet.
+                cypher_query = (
+                    "MERGE (h:Entity {id: $head})\n"
+                    "ON CREATE SET h.source = $source\n"
+                    "MERGE (t:Entity {id: $tail})\n"
+                    "ON CREATE SET t.source = $source\n"
+                    "MERGE (h)-[r:`" + relation_type + "`]->(t)\n"
+                    "ON CREATE SET r.source = $source"
+                )
+                graph.query(
+                    cypher_query,
+                    params={
+                        "head": triplet.head,
+                        "tail": triplet.tail,
+                        "source": source_name,
+                    }
+                )
+        except Exception as e:
+            st.error(f"Failed to process chunk {i+1}. Error: {e}")
+        # Updated outside the try so the bar also advances past failed chunks.
+        progress_bar.progress((i + 1) / total_chunks, text=f"Processing chunk {i+1}/{total_chunks}")
+    st.success(f"Knowledge from '{source_name}' has been added to the graph!")
+def get_graph_qa_chain(graph, llm):
+    """Build a GraphCypherQAChain over ``graph`` using ``llm``."""
+    # Refresh first: the QA chain needs the graph's current schema.
+    graph.refresh_schema()
+    qa_chain = GraphCypherQAChain.from_llm(
+        llm=llm,
+        graph=graph,
+        allow_dangerous_requests=True,
+        verbose=True,
+    )
+    return qa_chain

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-# requirements.txt (Final, Corrected for Deployment)
 # --- Core Frameworks ---
 streamlit>=1.33.0
@@ -9,8 +9,6 @@ langchain-community>=0.0.34
 # --- LLM & Embeddings ---
 langchain-google-genai>=1.0.1
 google-generativeai>=0.4.1
-# --- Core Scientific & ML Libraries (No version pins for better cross-platform compatibility) ---
 numpy
 scikit-learn
 sentence-transformers>=2.2.2
@@ -22,12 +20,12 @@ beautifulsoup4>=4.12.3
 # --- Databases (Vector & Graph) ---
 faiss-cpu>=1.7.4
-# --- ArangoDB Packages ---
-python-arango>=7.8.0
-langchain-arangodb>=0.1.0
 # --- Web Search & Visualization ---
-# langchain-community is already listed above
 streamlit-agraph>=0.0.38
 # --- Utilities ---

+# requirements.txt (Final for Neo4j Deployment)
 # --- Core Frameworks ---
 streamlit>=1.33.0
 # --- LLM & Embeddings ---
 langchain-google-genai>=1.0.1
 google-generativeai>=0.4.1
 numpy
 scikit-learn
 sentence-transformers>=2.2.2
 # --- Databases (Vector & Graph) ---
 faiss-cpu>=1.7.4
+# --- NEW: Neo4j Packages ---
+neo4j>=5.18.0
+langchain-neo4j>=0.0.5
 # --- Web Search & Visualization ---
+langchain-community>=0.0.34
 streamlit-agraph>=0.0.38
 # --- Utilities ---