Spaces:

MerveA
/

InsightRAG_Chatbot

Runtime error

App Files Files Community

MerveA committed on Oct 26

Commit

27cfd4d

1 Parent(s): e649674

Fix langchain dependency for HF Space

Browse files

Files changed (4) hide show

.gitignore +1 -0
README.md +1 -1
app.py +201 -96
requirements.txt +2 -1

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .env

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: "InsightRAG_Chatbot"
 emoji: "🤖"
 colorFrom: "purple"
 colorTo: "blue"

 ---
+title: "RAG Chatbot: ML/AI Assistant"
 emoji: "🤖"
 colorFrom: "purple"
 colorTo: "blue"

app.py CHANGED Viewed

@@ -1,10 +1,23 @@
 import streamlit as st
 import os
 import json
 import time
 from datetime import datetime
-# Page configuration
 st.set_page_config(
     page_title="🤖 RAG Chatbot: ML/AI Assistant",
     page_icon="🤖",
@@ -12,13 +25,8 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
-# Initialize session state immediately
-if 'messages' not in st.session_state:
-    st.session_state.messages = []
-if 'rag_system' not in st.session_state:
-    st.session_state.rag_system = None
-if 'initialized' not in st.session_state:
-    st.session_state.initialized = False
 # Custom CSS for better styling
 st.markdown("""
@@ -58,24 +66,88 @@ st.markdown("""
 </style>
 """, unsafe_allow_html=True)
-# RAG System Functions
 def initialize_rag_system(api_key):
     """Initialize the RAG system with all components"""
     try:
         # Set API key
         os.environ['GOOGLE_API_KEY'] = api_key
-        # Import required libraries with error handling
-        try:
-            from sentence_transformers import SentenceTransformer
-            import chromadb
-            from chromadb.config import Settings
-            import google.generativeai as genai
-            import re
-        except ImportError as e:
-            st.error(f"Import error: {e}")
-            return None
         # Initialize embedding model
         embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -88,117 +160,148 @@ def initialize_rag_system(api_key):
         collection_name = "ml_ai_knowledge"
         try:
             collection = chroma_client.get_collection(collection_name)
         except:
             collection = chroma_client.create_collection(
                 name=collection_name,
-                metadata={"description": "ML/AI knowledge base"}
             )
         # Check if collection already has data
         existing_count = collection.count()
         if existing_count == 0:
-            # Load sample data for demo
-            sample_texts = [
-                "Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data. Deep learning uses neural networks with multiple layers to process complex patterns in data.",
-                "Neural networks are computing systems inspired by biological neural networks. They consist of interconnected nodes that process information using a connectionist approach.",
-                "Supervised learning uses labeled training data to learn a mapping from inputs to outputs. Common algorithms include linear regression, decision trees, and support vector machines.",
-                "Unsupervised learning finds hidden patterns in data without labeled examples. Clustering algorithms like K-means group similar data points together.",
-                "Natural language processing combines computational linguistics with machine learning to help computers understand human language. It includes tasks like text classification and sentiment analysis.",
-                "Computer vision enables machines to interpret and understand visual information from the world. It uses deep learning models like convolutional neural networks.",
-                "Reinforcement learning is a type of machine learning where agents learn to make decisions by interacting with an environment and receiving rewards or penalties.",
-                "Feature engineering is the process of selecting and transforming raw data into features that can be used by machine learning algorithms. Good features can significantly improve model performance.",
-                "Cross-validation is a technique used to assess how well a machine learning model generalizes to new data. It involves splitting data into training and validation sets multiple times.",
-                "Overfitting occurs when a model learns the training data too well and performs poorly on new data. Regularization techniques help prevent overfitting.",
-                "Gradient descent is an optimization algorithm used to minimize the cost function in machine learning models. It iteratively adjusts parameters to find the minimum of the function.",
-                "Backpropagation is a method used to train neural networks by calculating gradients and updating weights. It works by propagating errors backward through the network layers.",
-                "Convolutional Neural Networks (CNNs) are specialized neural networks designed for processing grid-like data such as images. They use convolutional layers to detect local features.",
-                "Transformers are a type of neural network architecture that uses attention mechanisms to process sequential data. They are the foundation of modern language models like GPT.",
-                "Large Language Models (LLMs) are AI systems trained on vast amounts of text data to understand and generate human-like text. They can perform various language tasks.",
-                "Generative AI refers to AI systems that can create new content, such as text, images, or code. It differs from predictive AI which focuses on making predictions.",
-                "Transfer learning is a technique where a model trained on one task is adapted for a different but related task. It can significantly reduce training time and improve performance.",
-                "Hyperparameter tuning is the process of finding the optimal hyperparameters for a machine learning model. Common methods include grid search and random search.",
-                "Regularization techniques like L1 and L2 regularization help prevent overfitting by adding penalty terms to the loss function. They encourage simpler models.",
-                "Activation functions introduce non-linearity into neural networks. Common activation functions include ReLU, sigmoid, and tanh."
-            ]
-            # Add sample documents to Chroma
             all_chunks = []
             chunk_ids = []
             chunk_metadatas = []
-            for i, text in enumerate(sample_texts):
-                chunk_id = f"sample_doc_{i}"
-                metadata = {
-                    "source": f"sample_doc_{i}",
-                    "chunk_index": 0,
-                    "total_chunks": 1,
-                    "text_length": len(text)
-                }
-                all_chunks.append(text)
-                chunk_ids.append(chunk_id)
-                chunk_metadatas.append(metadata)
-            # Add documents to Chroma
-            collection.add(
-                documents=all_chunks,
-                ids=chunk_ids,
-                metadatas=chunk_metadatas
-            )
-        # Initialize Gemini using direct API
-        genai.configure(api_key=api_key)
         return {
             'embedding_model': embedding_model,
             'chroma_client': chroma_client,
             'collection': collection,
-            'genai': genai
         }
     except Exception as e:
         st.error(f"Error initializing RAG system: {e}")
         return None
-def rag_pipeline(query, rag_system, n_results=5):
-    """Complete RAG pipeline using direct Gemini API"""
     try:
-        collection = rag_system['collection']
-        genai = rag_system['genai']
-        # Retrieve relevant documents
         results = collection.query(
             query_texts=[query],
             n_results=n_results
         )
         documents = results['documents'][0]
         distances = results['distances'][0]
-        if not documents:
-            return "I couldn't find relevant information for your query. Please try asking about machine learning, deep learning, or AI topics."
-        # Create context
-        context = "\n\n".join(documents)
-        # Generate answer using direct Gemini API
-        model = genai.GenerativeModel('gemini-2.0-flash-exp')
-        prompt = f"""You are an AI assistant specialized in machine learning, deep learning, and artificial intelligence.
-        Use the provided context to answer questions accurately and comprehensively. If the context doesn't contain enough
-        information, you can supplement with your general knowledge, but always prioritize the provided context.
-        Provide clear, well-structured answers with examples when appropriate.
-        Context:
-        {context}
-        Question: {query}
-        Please provide a comprehensive answer based on the context above."""
-        response = model.generate_content(prompt)
-        return response.text, documents, distances
     except Exception as e:
         return f"Error generating response: {e}", [], []
@@ -207,7 +310,7 @@ def rag_pipeline(query, rag_system, n_results=5):
 st.markdown("""
 <div class="main-header">
     <h1>🤖 RAG Chatbot: ML/AI Assistant</h1>
-    <p>Powered by Google Gemini 2.5 Flash + Chroma + Direct API</p>
 </div>
 """, unsafe_allow_html=True)
@@ -221,7 +324,8 @@ with st.sidebar:
         type="password",
         help="Get your API key from Google AI Studio"
     )
     if api_key:
         os.environ['GOOGLE_API_KEY'] = api_key
@@ -281,13 +385,14 @@ if not st.session_state.initialized:
     deep learning, AI, and related topics using:
     - **🤖 Generation Model**: Google Gemini 2.5 Flash
     - **🗄️ Vector Database**: Chroma
-    - **📚 Dataset**: Sample ML/AI knowledge base
     - **🌐 Interface**: Streamlit
     ### 🚀 How It Works
-    1. **Data Loading**: Sample ML/AI content is loaded
     2. **Embedding**: Text is processed and embedded using sentence transformers
     3. **Storage**: Embeddings are stored in Chroma vector database
     4. **Retrieval**: Relevant context is retrieved for user queries
@@ -360,7 +465,7 @@ else:
 st.markdown("---")
 st.markdown("""
 <div style="text-align: center; color: #666; padding: 1rem;">
-    <p>🤖 RAG Chatbot | Powered by Google Gemini 2.5 Flash + Chroma</p>
-    <p>📚 Knowledge Base: ML/AI Sample Dataset</p>
 </div>
 """, unsafe_allow_html=True)

 import streamlit as st
 import os
 import json
+import chromadb
+from chromadb.config import Settings
+from sentence_transformers import SentenceTransformer
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.schema import HumanMessage, SystemMessage
 import time
 from datetime import datetime
+import uuid
+import pandas as pd
+import numpy as np
+from datasets import load_dataset
+from tqdm import tqdm
+import re
+from dotenv import load_dotenv
+import os
 st.set_page_config(
     page_title="🤖 RAG Chatbot: ML/AI Assistant",
     page_icon="🤖",
     initial_sidebar_state="expanded"
 )
+load_dotenv()
+api_key = os.environ.get("GOOGLE_API_KEY")
 # Custom CSS for better styling
 st.markdown("""
 </style>
 """, unsafe_allow_html=True)
# Seed the Streamlit session state; keys that already exist are left untouched.
for _state_key, _initial_value in (
    ('messages', []),
    ('rag_system', None),
    ('initialized', False),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
+# RAG System Functions (from notebook)
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping, word-based chunks.

    Args:
        text: Text to split; words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Chunks
        of 50 characters or fewer are dropped as too small to be useful.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
            in the range ``0 <= overlap < chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunk = ' '.join(words[start:start + chunk_size])
        if len(chunk) > 50:  # Only keep substantial chunks
            chunks.append(chunk)
        # Once a window reaches the end of the text, any further window would
        # only repeat words already covered by this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
# Substrings (matched case-insensitively) that mark a sample as ML/AI related.
_ML_KEYWORDS = (
    'machine learning', 'deep learning', 'neural network', 'artificial intelligence',
    'algorithm', 'model', 'training', 'data', 'feature', 'classification',
    'regression', 'clustering', 'optimization', 'gradient', 'tensor',
)

# Small built-in corpus used when the remote dataset cannot be loaded.
_FALLBACK_TEXTS = (
    "Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data. Deep learning uses neural networks with multiple layers to process complex patterns in data.",
    "Neural networks are computing systems inspired by biological neural networks. They consist of interconnected nodes that process information using a connectionist approach.",
    "Supervised learning uses labeled training data to learn a mapping from inputs to outputs. Common algorithms include linear regression, decision trees, and support vector machines.",
    "Unsupervised learning finds hidden patterns in data without labeled examples. Clustering algorithms like K-means group similar data points together.",
    "Natural language processing combines computational linguistics with machine learning to help computers understand human language. It includes tasks like text classification and sentiment analysis.",
    "Computer vision enables machines to interpret and understand visual information from the world. It uses deep learning models like convolutional neural networks.",
    "Reinforcement learning is a type of machine learning where agents learn to make decisions by interacting with an environment and receiving rewards or penalties.",
    "Feature engineering is the process of selecting and transforming raw data into features that can be used by machine learning algorithms. Good features can significantly improve model performance.",
    "Cross-validation is a technique used to assess how well a machine learning model generalizes to new data. It involves splitting data into training and validation sets multiple times.",
    "Overfitting occurs when a model learns the training data too well and performs poorly on new data. Regularization techniques help prevent overfitting.",
)


def _clean_if_relevant(text):
    """Return the whitespace-normalised text if it qualifies, else None."""
    lowered = text.lower()  # lower-case once instead of once per keyword
    if not any(keyword in lowered for keyword in _ML_KEYWORDS):
        return None
    cleaned = re.sub(r'\s+', ' ', text).strip()
    # Only keep texts of reasonable length (not too short or too long)
    if 100 <= len(cleaned) <= 2000:
        return cleaned
    return None


def load_and_process_dataset(max_samples=1000, max_scanned=None):
    """Load ML/AI-related text samples from The Pile, with a built-in fallback.

    Streams the train split of ``EleutherAI/the_pile`` and keeps samples that
    contain at least one ML/AI keyword and, after whitespace normalisation,
    are between 100 and 2000 characters long. If loading or streaming raises,
    a small hard-coded sample corpus is returned instead.

    Args:
        max_samples: Stop once this many matching samples were collected.
        max_scanned: Optional cap on how many streamed samples are inspected.
            ``None`` (the default) keeps scanning until ``max_samples``
            matches are found or the stream ends.

    Returns:
        A list of cleaned text strings.
    """
    print("📚 Loading The Pile dataset...")
    try:
        dataset = load_dataset("EleutherAI/the_pile", split="train", streaming=True)
        texts = []
        print("🔍 Filtering ML/AI related content...")
        for scanned, sample in enumerate(tqdm(dataset, desc="Processing samples")):
            if len(texts) >= max_samples:
                break
            if max_scanned is not None and scanned >= max_scanned:
                break
            cleaned = _clean_if_relevant(sample['text'])
            if cleaned is not None:
                texts.append(cleaned)
        print(f"✅ Loaded {len(texts)} ML/AI related text samples")
        return texts
    except Exception as e:
        # Best-effort: any failure while loading/streaming falls back to the
        # built-in corpus so the app can still start.
        print(f"❌ Error loading dataset: {e}")
        print("🔄 Using fallback sample data...")
        texts = list(_FALLBACK_TEXTS)
        print(f"✅ Using {len(texts)} sample texts")
        return texts
 def initialize_rag_system(api_key):
     """Initialize the RAG system with all components"""
     try:
         # Set API key
         os.environ['GOOGLE_API_KEY'] = api_key
         # Initialize embedding model
         embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
         collection_name = "ml_ai_knowledge"
         try:
             collection = chroma_client.get_collection(collection_name)
+            print(f"✅ Found existing collection: {collection_name}")
         except:
             collection = chroma_client.create_collection(
                 name=collection_name,
+                metadata={"description": "ML/AI knowledge base from The Pile dataset"}
             )
+            print(f"✅ Created new collection: {collection_name}")
         # Check if collection already has data
         existing_count = collection.count()
+        print(f"📊 Current documents in collection: {existing_count}")
         if existing_count == 0:
+            print("🔄 Adding new documents to collection...")
+            # Load and process dataset
+            texts = load_and_process_dataset()
             all_chunks = []
             chunk_ids = []
             chunk_metadatas = []
+            for i, text in enumerate(tqdm(texts, desc="Processing texts")):
+                chunks = chunk_text(text)
+                for j, chunk in enumerate(chunks):
+                    chunk_id = f"doc_{i}_chunk_{j}"
+                    metadata = {
+                        "source": f"the_pile_doc_{i}",
+                        "chunk_index": j,
+                        "total_chunks": len(chunks),
+                        "text_length": len(chunk)
+                    }
+                    all_chunks.append(chunk)
+                    chunk_ids.append(chunk_id)
+                    chunk_metadatas.append(metadata)
+            print(f"📊 Created {len(all_chunks)} text chunks")
+            # Add documents to Chroma in batches to avoid memory issues
+            batch_size = 100
+            for i in tqdm(range(0, len(all_chunks), batch_size), desc="Adding to Chroma"):
+                batch_chunks = all_chunks[i:i + batch_size]
+                batch_ids = chunk_ids[i:i + batch_size]
+                batch_metadatas = chunk_metadatas[i:i + batch_size]
+                collection.add(
+                    documents=batch_chunks,
+                    ids=batch_ids,
+                    metadatas=batch_metadatas
+                )
+            print("✅ All documents added to Chroma!")
+        else:
+            print("✅ Collection already contains data, skipping addition")
+        # Initialize Gemini
+        llm = ChatGoogleGenerativeAI(
+            model="gemini-2.0-flash-exp",
+            temperature=0.7,
+            max_output_tokens=1024,
+            convert_system_message_to_human=True
+        )
         return {
             'embedding_model': embedding_model,
             'chroma_client': chroma_client,
             'collection': collection,
+            'llm': llm
         }
     except Exception as e:
         st.error(f"Error initializing RAG system: {e}")
         return None
def retrieve_relevant_docs(query, collection, n_results=5):
    """Look up the entries in ``collection`` that best match ``query``.

    Args:
        query: The query text to search with.
        collection: Object exposing ``query(query_texts=..., n_results=...)``.
        n_results: Number of results requested from the collection.

    Returns:
        A ``(documents, metadatas, distances)`` tuple for the single query
        sent, or three empty lists if the lookup raised.
    """
    try:
        hits = collection.query(query_texts=[query], n_results=n_results)
        # Exactly one query text was sent, so take the first entry of each field.
        docs, metas, dists = (
            hits[field][0] for field in ('documents', 'metadatas', 'distances')
        )
    except Exception as e:
        print(f"Error retrieving documents: {e}")
        return [], [], []
    return docs, metas, dists
def create_context(documents):
    """Join the retrieved documents into one string, separated by blank lines."""
    separator = "\n\n"
    return separator.join(documents)
def generate_answer(query, context, llm):
    """Ask the LLM to answer ``query`` using the retrieved ``context``.

    Args:
        query: The user's question.
        context: Context text placed ahead of the question in the prompt.
        llm: Chat model object exposing ``invoke(messages)``.

    Returns:
        The ``content`` of the model's response, or an
        ``"Error generating answer: ..."`` string if anything raised.
    """
    instructions = """You are an AI assistant specialized in machine learning, deep learning, and artificial intelligence.
    Use the provided context to answer questions accurately and comprehensively. If the context doesn't contain enough
    information, you can supplement with your general knowledge, but always prioritize the provided context.
    Provide clear, well-structured answers with examples when appropriate."""

    question_block = f"""Context:
    {context}
    Question: {query}
    Please provide a comprehensive answer based on the context above."""

    try:
        conversation = [
            SystemMessage(content=instructions),
            HumanMessage(content=question_block),
        ]
        return llm.invoke(conversation).content
    except Exception as e:
        # Failures are reported as the answer text rather than raised.
        return f"Error generating answer: {e}"
def rag_pipeline(query, rag_system, n_results=5):
    """Run the complete RAG pipeline (retrieve, build context, generate).

    Args:
        query: The user's question.
        rag_system: Mapping that provides the ``'collection'`` and ``'llm'``
            entries used for retrieval and generation.
        n_results: Number of documents to retrieve.

    Returns:
        Always a 3-tuple ``(answer, documents, distances)``. When nothing was
        retrieved, or when an error occurred, ``answer`` holds an explanatory
        message and the two lists are empty.
    """
    try:
        collection = rag_system['collection']
        llm = rag_system['llm']
        # Retrieve relevant documents
        documents, _metadatas, distances = retrieve_relevant_docs(query, collection, n_results)
        if not documents:
            # Keep the 3-tuple shape so callers can always unpack the result;
            # returning a bare string here would break tuple unpacking.
            return (
                "I couldn't find relevant information for your query. Please try asking about machine learning, deep learning, or AI topics.",
                [],
                [],
            )
        # Create context
        context = create_context(documents)
        # Generate answer
        answer = generate_answer(query, context, llm)
        return answer, documents, distances
    except Exception as e:
        return f"Error generating response: {e}", [], []
 st.markdown("""
 <div class="main-header">
     <h1>🤖 RAG Chatbot: ML/AI Assistant</h1>
+    <p>Powered by Google Gemini 2.5 Flash + LangChain + Chroma</p>
 </div>
 """, unsafe_allow_html=True)
         type="password",
         help="Get your API key from Google AI Studio"
     )
     if api_key:
         os.environ['GOOGLE_API_KEY'] = api_key
     deep learning, AI, and related topics using:
     - **🤖 Generation Model**: Google Gemini 2.5 Flash
+    - **🔗 RAG Framework**: LangChain
     - **🗄️ Vector Database**: Chroma
+    - **📚 Dataset**: The Pile (EleutherAI/the_pile) from Hugging Face
     - **🌐 Interface**: Streamlit
     ### 🚀 How It Works
+    1. **Data Loading**: Text data from The Pile dataset is loaded and filtered for ML/AI content
     2. **Embedding**: Text is processed and embedded using sentence transformers
     3. **Storage**: Embeddings are stored in Chroma vector database
     4. **Retrieval**: Relevant context is retrieved for user queries
 st.markdown("---")
 st.markdown("""
 <div style="text-align: center; color: #666; padding: 1rem;">
+    <p>🤖 RAG Chatbot | Powered by Google Gemini 2.5 Flash + LangChain + Chroma</p>
+    <p>📚 Knowledge Base: The Pile Dataset (EleutherAI/the_pile)</p>
 </div>
 """, unsafe_allow_html=True)

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ google-generativeai==0.3.2
 numpy==1.24.3
 pandas==2.0.3
 tqdm==4.66.1
-huggingface-hub>=0.16.4,<1.0.0

 numpy==1.24.3
 pandas==2.0.3
 tqdm==4.66.1
+huggingface-hub>=0.16.4,<1.0.0
+gradio