bharathmunakala commited on
Commit
0786686
·
verified ·
1 Parent(s): 48e68b9

Upload 8 files

Browse files
Files changed (8) hide show
  1. .streamlit/config.toml +55 -0
  2. Dockerfile +32 -0
  3. README.md +146 -11
  4. app.py +321 -0
  5. database.py +161 -0
  6. ingest.py +142 -0
  7. requirements.txt +18 -0
  8. test_mongodb_integration.py +129 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [server]
2
+ enableStaticServing = true
3
+
4
+ [[theme.fontFaces]]
5
+ family = "SpaceGrotesk"
6
+ url = "app/static/SpaceGrotesk-VariableFont_wght.ttf"
7
+
8
+ [[theme.fontFaces]]
9
+ family = "SpaceMono"
10
+ url = "app/static/SpaceMono-Bold.ttf"
11
+ style = "normal"
12
+ weight = 700
13
+
14
+ [[theme.fontFaces]]
15
+ family = "SpaceMono"
16
+ url = "app/static/SpaceMono-BoldItalic.ttf"
17
+ style = "italic"
18
+ weight = 700
19
+
20
+ [[theme.fontFaces]]
21
+ family = "SpaceMono"
22
+ url = "app/static/SpaceMono-Italic.ttf"
23
+ style = "italic"
24
+ weight = 400
25
+
26
+ [[theme.fontFaces]]
27
+ family = "SpaceMono"
28
+ url = "app/static/SpaceMono-Regular.ttf"
29
+ style = "normal"
30
+ weight = 400
31
+
32
+ [theme]
33
+ primaryColor = "#cb785c"
34
+ backgroundColor = "#fdfdf8"
35
+ secondaryBackgroundColor = "#ecebe3"
36
+ textColor = "#3d3a2a"
37
+ linkColor = "#3d3a2a"
38
+ borderColor = "#d3d2ca"
39
+ showWidgetBorder = true
40
+ baseRadius = "0.75rem"
41
+ buttonRadius = "full"
42
+ font = "SpaceGrotesk"
43
+ headingFontWeights = [600,500,500,500,500,500]
44
+ headingFontSizes = ["3rem", "2rem"]
45
+ codeFont = "SpaceMono"
46
+ codeFontSize = ".75rem"
47
+ codeBackgroundColor = "#ecebe4"
48
+ showSidebarBorder = true
49
+ chartCategoricalColors = ["#0ea5e9", "#059669", "#fbbf24"]
50
+
51
+ [theme.sidebar]
52
+ backgroundColor = "#f0f0ec"
53
+ secondaryBackgroundColor = "#ecebe3"
54
+ headingFontSizes = ["1.6rem", "1.4rem", "1.2rem"]
55
+ dataframeHeaderBackgroundColor = "#e4e4e0"
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python image as base
2
+ FROM python:3.10-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Set environment variables
8
+ ENV PYTHONDONTWRITEBYTECODE=1 \
9
+ PYTHONUNBUFFERED=1 \
10
+ STREAMLIT_SERVER_PORT=8501 \
11
+ STREAMLIT_SERVER_HEADLESS=true \
12
+ STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
13
+
14
+ # Install system dependencies
15
+ RUN apt-get update && apt-get install -y --no-install-recommends \
16
+ build-essential \
17
+ && rm -rf /var/lib/apt/lists/*
18
+
19
+ # Copy requirements first to leverage Docker cache
20
+ COPY requirements.txt .
21
+
22
+ # Install Python dependencies
23
+ RUN pip install --no-cache-dir -r requirements.txt
24
+
25
+ # Copy the rest of the application
26
+ COPY . .
27
+
28
+ # Expose the port Streamlit runs on
29
+ EXPOSE 8501
30
+
31
+ # Command to run the application
32
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md CHANGED
@@ -1,11 +1,146 @@
1
- ---
2
- title: Role Base Access Control
3
- emoji: 🐨
4
- colorFrom: purple
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Departmental RAG System with Role-Based Access Control
2
+
3
+ A production-ready Streamlit-based RAG (Retrieval-Augmented Generation) application with MongoDB authentication and role-based access control. This application allows users to securely access and query department-specific documents based on their assigned roles.
4
+
5
+ ![Image](https://github.com/user-attachments/assets/f8a91110-a40f-4f7f-8c76-43fd0932e10c)
6
+
7
+ ## ✨ Features
8
+
9
+ - **Secure Authentication**: MongoDB-based user authentication
10
+ - **Role-Based Access Control (RBAC)**: Granular access to department-specific documents
11
+ - **Efficient Document Retrieval**: ChromaDB vector store with Cohere embeddings
12
+ - **Natural Language Queries**: Powered by Groq's LLM with Cohere embeddings
13
+ - **Modern UI**: Clean, responsive interface with dark/light theme support
14
+ - **Department-Specific Knowledge Bases**: Isolated document repositories for each department
15
+
16
+ ## 🛠️ Prerequisites
17
+
18
+ - Python 3.8+
19
+ - MongoDB Atlas account (or local MongoDB instance)
20
+ - Cohere API key (for text embeddings)
21
+ - Groq API key (for LLM, optional but recommended)
22
+
23
+ ## 🚀 Quick Start
24
+
25
+ 1. **Clone the repository**
26
+ ```bash
27
+ git clone https://github.com/Bharath8080/RBAC_RAG_V4.git
28
+ cd RBAC_RAG_V4
29
+ ```
30
+
31
+ 2. **Set up the environment**
32
+ ```bash
33
+ # Install uv (if not already installed)
34
+ pip install uv
35
+
36
+ # Create and activate virtual environment
37
+ uv venv
38
+ .venv\Scripts\activate # On Windows
39
+ # OR
40
+ source .venv/bin/activate # On Unix/macOS
41
+
42
+ # Install dependencies
43
+ uv pip install -r requirements.txt
44
+ ```
45
+
46
+ 3. **Configure environment variables**
47
+ Create a `.env` file in the project root with the following content:
48
+ ```
49
+ # MongoDB Configuration
50
+ MONGO_URI=mongodb+srv://<username>:<password>@<cluster-address>/<database>?retryWrites=true&w=majority
51
+ DB_NAME=rag_system
52
+
53
+ # Cohere API Key (for embeddings)
54
+ COHERE_API_KEY=your_cohere_api_key_here
55
+
56
+ # Optional: Groq API Key (for LLM)
57
+ GROQ_API_KEY=your_groq_api_key_here
58
+ ```
59
+ Replace the placeholder values with your actual credentials.
60
+
61
+ 4. **Set up the vector store**
62
+ ```bash
63
+ python ingest.py
64
+ ```
65
+ This will process and index all documents in the `resources/data/` directory.
66
+
67
+ 5. **Initialize the database with sample users**
68
+ ```bash
69
+ python ingest_db.py
70
+ ```
71
+ This will create sample user accounts in your MongoDB database.
72
+
73
+ 6. **Run the application**
74
+ ```bash
75
+ streamlit run app.py
76
+ ```
77
+
78
+ 7. **Access the application**
79
+ Open your browser and navigate to `http://localhost:8501`
80
+
81
+ ## 👥 Default User Accounts
82
+
83
+ | Username | Password | Department |
84
+ |----------|-------------|-------------|
85
+ | Tony | password123 | engineering |
86
+ | Bruce | securepass | marketing |
87
+ | Sam | financepass | finance |
88
+ | Natasha | hrpass123 | hr |
89
+
90
+ ## 📁 Project Structure
91
+
92
+ ```
93
+ RBAC_RAG_V4/
94
+ ├── .env.example # Example environment variables
95
+ ├── app.py # Main Streamlit application
96
+ ├── database.py # Database connection and user management
97
+ ├── ingest.py # Document processing and vector store creation
98
+ ├── ingest_db.py # Initialize MongoDB with sample users
99
+ ├── requirements.txt # Python dependencies
100
+ ├── resources/
101
+ │ └── data/ # Department-specific documents
102
+ │ ├── engineering/
103
+ │ ├── finance/
104
+ │ ├── hr/
105
+ │ └── marketing/
106
+ └── chroma_db/ # ChromaDB vector store (created after first run)
107
+ ```
108
+
109
+ ## 🔧 Configuration
110
+
111
+ ### Environment Variables
112
+
113
+ - `MONGO_URI`: MongoDB connection string
114
+ - `DB_NAME`: Database name (default: `rag_system`)
115
+ - `COHERE_API_KEY`: Required for text embeddings
116
+ - `GROQ_API_KEY`: Required for LLM responses (if the Groq client cannot be initialized, the app falls back to LlamaIndex's default LLM settings)
117
+
118
+ ### Adding New Users
119
+
120
+ 1. Run the `ingest_db.py` script with new user data
121
+ 2. Or add users directly to MongoDB in the `users` collection
122
+
123
+ ### Adding New Documents
124
+
125
+ 1. Place documents in the appropriate department folder under `resources/data/`
126
+ 2. Run `python ingest.py` to update the vector store
127
+
128
+ ## 🛡️ Security Notes
129
+
130
+ - Always store sensitive information in environment variables, never in code
131
+ - Use strong, unique passwords for MongoDB access
132
+ - Regularly update your dependencies for security patches
133
+ - Consider implementing rate limiting in production
134
+ - Enable MongoDB network access restrictions
135
+
136
+ ## 📄 License
137
+
138
+ This project is part of the Codebasics Resume Project Challenge.
139
+
140
+ ## 🙏 Acknowledgments
141
+
142
+ - [Streamlit](https://streamlit.io/) for the web framework
143
+ - [ChromaDB](https://www.trychroma.com/) for vector storage
144
+ - [Cohere](https://cohere.com/) for embeddings
145
+ - [Groq](https://groq.com/) for LLM inference
146
+ - [MongoDB](https://www.mongodb.com/) for user management
app.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __import__('pysqlite3')
2
+ import sys
3
+ sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
4
+
5
+ import sqlite3
6
+
7
+ import os
8
+ import streamlit as st
9
+ import chromadb
10
+ from typing import Dict, Optional, Any
11
+ from pathlib import Path
12
+ from dotenv import load_dotenv
13
+ from llama_index.core import VectorStoreIndex, StorageContext, Settings
14
+ from llama_index.vector_stores.chroma import ChromaVectorStore
15
+ from llama_index.llms.groq import Groq
16
+ from llama_index.embeddings.cohere import CohereEmbedding
17
+
18
+
19
+ from arize.otel import register
20
+ from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
21
+
22
# Load environment variables FIRST: the ARIZE_* values read just below via
# os.getenv() can only come from a local .env file if it has been loaded.
load_dotenv()

# Setup OTel via Arize's convenience function
tracer_provider = register(
    space_id=os.getenv("ARIZE_SPACE_ID"),
    api_key=os.getenv("ARIZE_API_KEY"),
    project_name="rbacrag",  # Choose a project name
)

# Instrument LlamaIndex so query/LLM calls are traced through the provider above
LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)

# Import database module (importing it creates the module-level `db` connection)
from database import db, initialize_users

# Initialize default users
# NOTE(review): Streamlit re-executes this script on every interaction, so this
# call presumably runs on each rerun — confirm that is acceptable.
initialize_users()

# Role-based access control for documents: role -> document categories it may read
ROLE_ACCESS = {
    "hr": ["hr", "general"],
    "engineering": ["engineering", "general"],
    "finance": ["finance", "general"],
    "marketing": ["marketing", "general"],
}
48
+
49
+ # Initialize session state
50
def initialize_session_state():
    """Seed ``st.session_state`` with a default for every key that is not set yet.

    Keys that already exist are left untouched, so this is safe to call on
    every script run as well as right after the state has been cleared.
    """
    defaults = {
        "authenticated": False,
        "username": None,
        "role": None,
        "messages": [],
        "vector_index": None,
        "query_engine": None,
    }
    for key, default_value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default_value
64
+
65
# Set page config (login view: centered layout, sidebar hidden)
st.set_page_config(
    page_title="Departmental RAG System",
    page_icon="🔒",  # lock emoji; was stored as mis-decoded bytes
    layout="centered",
    initial_sidebar_state="collapsed",
)

# Initialize session state
initialize_session_state()
75
+
76
+ # Authentication functions
77
def login(username: str, password: str) -> bool:
    """
    Authenticate user and set session state

    On success the session is marked authenticated, the user's name and role
    are stored, the chat history is reset to a welcome message, and a rerun
    is requested so the UI switches to the chat view.

    Args:
        username: The username to authenticate
        password: The password to verify

    Returns:
        bool: True if authentication was successful, False otherwise
    """
    # Only the database call sits inside the try: the rerun request below is a
    # control-flow signal and must never be subject to the broad handler.
    try:
        user = db.verify_user(username, password)
    except Exception as e:
        st.error(f"An error occurred during login: {str(e)}")
        return False

    if not user:
        return False

    st.session_state.authenticated = True
    st.session_state.username = user["username"]
    st.session_state.role = user["role"]
    st.session_state.messages = [
        {"role": "assistant", "content": f"Welcome, {user['username']}! How can I assist you today?"}
    ]
    st.rerun()  # Rerun to update the UI
    # NOTE(review): presumably not reached when st.rerun() interrupts the
    # script; kept so the documented return contract holds either way.
    return True
103
+
104
def logout():
    """
    End the current session: wipe the session state, restore the defaults,
    report the logout and request a rerun.
    """
    departing_user = st.session_state.get("username", "Unknown")

    # Drop everything (auth flags, chat history, cached engine) and re-seed defaults
    st.session_state.clear()
    initialize_session_state()

    # NOTE(review): the rerun right after presumably replaces this message
    # before it can be read — confirm.
    st.success(f"Successfully logged out {departing_user}")
    st.rerun()  # Rerun to update the UI
113
+
114
@st.cache_resource
def load_vector_index(role: str):
    """Open the persisted Chroma collection for ``role`` as a LlamaIndex index.

    Reads ``COHERE_API_KEY`` from the environment, installs the Cohere
    embedding model as the global LlamaIndex default, and loads the
    ``documents`` collection stored under ``./chroma_db/<role>``.

    Any failure (including a missing API key) is shown with ``st.error`` and
    the script run is stopped with ``st.stop()``.
    """
    try:
        api_key = os.getenv("COHERE_API_KEY")
        if not api_key:
            raise ValueError("COHERE_API_KEY not found in environment variables")

        # Embedding model, also registered as the library-wide default
        embedder = CohereEmbedding(
            cohere_api_key=api_key,
            model_name="embed-english-v3.0",
            input_type="search_document",
        )
        Settings.embed_model = embedder

        # Role-specific persistent Chroma store and its "documents" collection
        client = chromadb.PersistentClient(path=f"./chroma_db/{role}")
        collection = client.get_collection("documents")
        store = ChromaVectorStore(chroma_collection=collection)

        return VectorStoreIndex.from_vector_store(
            vector_store=store,
            storage_context=StorageContext.from_defaults(vector_store=store),
            embed_model=embedder,
        )
    except Exception as e:
        st.error(f"Error loading vector index: {str(e)}")
        st.stop()
154
+
155
def chat_interface():
    """Main chat interface

    Renders the heading and stored chat history, builds a query engine over
    the index for the logged-in user's role, and answers whatever is typed
    into the chat input. Both the question and the answer (or the error text)
    are appended to ``st.session_state.messages``.
    """
    # Add styled heading (the closing tag must match the opening <h2>)
    st.markdown(
        f"<h2 style='color: #1407fa;'>💬 {st.session_state.role.capitalize()} Department Chat</h2>",
        unsafe_allow_html=True,
    )

    # Display chat messages
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Load the appropriate index for the user's role
    index = load_vector_index(st.session_state.role)

    # Initialize Groq LLM
    try:
        llm = Groq(
            model="llama3-8b-8192",
            api_key=os.getenv("GROQ_API_KEY"),
            temperature=0.5,
            system_prompt=f"You are a helpful assistant specialized in {st.session_state.role} department documents. Answer the user queries with the help of the provided context with high accuracy and precision."
        )

        # Create query engine with the LLM
        query_engine = index.as_query_engine(
            llm=llm,
            similarity_top_k=3,
            response_mode="compact"
        )
    except Exception as e:
        st.error(f"Error initializing LLM: {str(e)}")
        st.warning("Falling back to default LLM settings. Some features may be limited.")
        # No explicit llm: the engine uses whatever LlamaIndex has configured by default
        query_engine = index.as_query_engine(
            similarity_top_k=3,
            response_mode="compact"
        )

    # Chat input
    if prompt := st.chat_input(f"Ask about {st.session_state.role} documents..."):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})

        # Display user message
        with st.chat_message("user"):
            st.markdown(prompt)

        # Get and display assistant response
        with st.chat_message("assistant"):
            message_placeholder = st.empty()

            try:
                # Get response from query engine
                response = query_engine.query(prompt)
                full_response = str(response)
                message_placeholder.markdown(full_response)
            except Exception as e:
                # Show the failure in place of the answer and keep it in the history
                full_response = f"Error generating response: {str(e)}"
                message_placeholder.error(full_response)

        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": full_response})
217
+
218
def show_login_form():
    """Display the login form

    Injects the page CSS, renders the header, the username/password form and
    a collapsible list of demo credentials. On submit, empty fields are
    rejected locally; otherwise the credentials are passed to ``login``.
    """
    st.markdown(
        """
        <style>
        .main {
            background-color: #1a1a2e;
            color: white;
        }
        .stTextInput > div > div > input {
            background-color: #2a2a3e;
            color: white;
            border: 1px solid #4a4a6a;
            border-radius: 8px;
        }
        .stTextInput > div > div > input::placeholder {
            color: #a0a0b0 !important;
            opacity: 1 !important;
        }
        .stButton > button {
            background-color: #e94560;
            color: white;
            border: none;
            border-radius: 8px;
            padding: 10px 20px;
            font-size: 16px;
            width: 100%;
        }
        .stButton > button:hover {
            background-color: #d83450;
        }
        h1, h2, h3, h4, h5, h6 {
            color: white;
        }
        .st-emotion-cache-1r6slb0 {
            border: 1px solid #4a4a6a;
            border-radius: 12px;
            padding: 2rem;
            background-color: #232339;
        }
        </style>
        """,
        unsafe_allow_html=True
    )
    # Lock emoji header (was stored as mis-decoded bytes)
    st.markdown('<div style="text-align: center; margin-top: -80px; margin-bottom: 30px;"><h1 style="font-size: 3rem;">🔒</h1></div>', unsafe_allow_html=True)
    st.markdown('<h1 style="text-align: center; margin-bottom: 20px;">Department Portal</h1>', unsafe_allow_html=True)
    st.markdown('<p style="text-align: center; color: #a0a0b0; margin-bottom: 30px;">Sign in to access your department\'s knowledge base</p>', unsafe_allow_html=True)

    with st.container():
        with st.form("login_form", border=True):
            username = st.text_input("Username", placeholder="Enter your username")
            password = st.text_input("Password", type="password", placeholder="Enter your password")
            login_button = st.form_submit_button("Sign In")

            if login_button:
                if not username or not password:
                    st.error("Please enter both username and password")
                elif login(username, password):
                    st.success(f"Welcome, {username}! Redirecting...")
                else:
                    st.error("Invalid username or password")

        # NOTE(review): demo credentials are shown to every visitor — confirm
        # this is intended outside of a demo deployment.
        with st.expander("Need demo credentials?"):
            st.markdown("""
            - **Engineering:** `Tony` / `password123`
            - **Marketing:** `Bruce` / `securepass`
            - **Finance:** `Sam` / `financepass`
            - **HR:** `Natasha` / `hrpass123`
            """)

    st.markdown('<p style="text-align: center; margin-top: 2rem; color: #a0a0b0;">2025 Department RAG System</p>', unsafe_allow_html=True)
289
+
290
+
291
def main():
    """
    Application entry point: show the login form to anonymous visitors,
    otherwise the sidebar (user info, logout) followed by the chat view.
    """
    # Anonymous visitors only ever see the login form
    if not st.session_state.authenticated:
        show_login_form()
        return

    # Authenticated: switch to the wide layout with the sidebar opened
    st.set_page_config(layout="wide", initial_sidebar_state="expanded")
    with st.sidebar:
        st.markdown(f"### Welcome, {st.session_state.username}")
        st.markdown(f"**Role:** {st.session_state.role.capitalize()}")

        if st.button("Logout", key="logout_btn"):
            logout()
            return

        st.markdown("---")
        st.markdown("### About")
        st.markdown("""
        This is a secure departmental RAG system that provides
        role-based access to information across different departments.
        """)

    # Main content area
    chat_interface()


if __name__ == "__main__":
    main()
database.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ from typing import Optional, Dict, Any, List
4
+ from pymongo import MongoClient, ReturnDocument
5
+ from pymongo.errors import DuplicateKeyError, ConnectionFailure
6
+ import bcrypt
7
+ from dotenv import load_dotenv
8
+ import logging
9
+
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+ load_dotenv()
15
+
16
class Database:
    """User store backed by the ``users`` collection of a MongoDB database.

    Passwords are stored as bcrypt hashes (decoded to ``str``). Timestamps are
    written as timezone-aware UTC datetimes.
    """

    def __init__(self):
        """Initialize database connection and ensure indexes

        Reads ``MONGO_URI`` (required) and ``DB_NAME`` (default
        ``rag_system``) from the environment.

        Raises:
            ValueError: if ``MONGO_URI`` is not set.
            Exception: any connection or index error is logged and re-raised.
        """
        try:
            mongo_uri = os.getenv("MONGO_URI")
            if not mongo_uri:
                raise ValueError("MONGO_URI environment variable is not set")

            self.client = MongoClient(
                mongo_uri,
                serverSelectionTimeoutMS=5000,  # 5 second timeout
                connectTimeoutMS=30000,  # 30 second connection timeout
                socketTimeoutMS=45000,  # 45 second socket timeout
                connect=False  # Lazy connection
            )

            # Test the connection (this forces the lazy client to connect now)
            self.client.admin.command('ping')

            self.db = self.client[os.getenv("DB_NAME", "rag_system")]
            self.users = self.db["users"]
            self._create_indexes()
            logger.info("Successfully connected to MongoDB")

        except Exception as e:
            logger.error(f"Failed to connect to MongoDB: {str(e)}")
            raise

    @staticmethod
    def _utcnow():
        """Return the current time as a timezone-aware UTC datetime."""
        # Local import: the module header only imports `datetime` itself.
        from datetime import timezone

        # datetime.utcnow() is naive and deprecated since Python 3.12.
        return datetime.now(timezone.utc)

    def _create_indexes(self):
        """Create necessary database indexes (unique index on ``username``)."""
        try:
            # Create unique index on username
            self.users.create_index("username", unique=True)
            logger.info("Created database indexes")
        except Exception as e:
            logger.error(f"Error creating indexes: {str(e)}")
            raise

    def add_user(self, username: str, password: str, role: str) -> bool:
        """Add a new user to the database

        Args:
            username: unique login name.
            password: plain-text password; only its bcrypt hash is stored.
            role: role name; stored lower-cased.

        Returns:
            True if the user was inserted; False on missing fields, a
            duplicate username, or any other error (all are logged).
        """
        if not username or not password or not role:
            logger.warning("Missing required fields for user creation")
            return False

        try:
            hashed = bcrypt.hashpw(password.encode('utf-8'), bcrypt.gensalt())
            user_data = {
                "username": username,
                "password": hashed.decode('utf-8'),
                "role": role.lower(),
                "created_at": self._utcnow(),
                "last_login": None
            }

            result = self.users.insert_one(user_data)
            if result.inserted_id:
                logger.info(f"Created new user: {username}")
                return True
            return False

        except DuplicateKeyError:
            logger.warning(f"Username already exists: {username}")
            return False
        except Exception as e:
            logger.error(f"Error adding user {username}: {str(e)}")
            return False

    def verify_user(self, username: str, password: str) -> Optional[Dict[str, Any]]:
        """Verify user credentials

        Returns:
            A dict with ``username``, ``role`` and ``last_login`` when the
            password matches, otherwise None (unknown user, wrong password,
            or any error — all are logged). ``last_login`` is the value read
            before this call updates it, i.e. the previous login time.
        """
        try:
            user = self.users.find_one({"username": username})
            if not user:
                logger.warning(f"Login attempt for non-existent user: {username}")
                return None

            if bcrypt.checkpw(password.encode('utf-8'), user["password"].encode('utf-8')):
                # Update last login time
                self.users.update_one(
                    {"_id": user["_id"]},
                    {"$set": {"last_login": self._utcnow()}}
                )
                logger.info(f"Successful login for user: {username}")
                return {
                    "username": user["username"],
                    "role": user["role"],
                    "last_login": user.get("last_login")
                }

            logger.warning(f"Failed login attempt for user: {username}")
            return None

        except Exception as e:
            logger.error(f"Error verifying user {username}: {str(e)}")
            return None

    def get_user(self, username: str) -> Optional[Dict[str, Any]]:
        """Get user by username (without sensitive data)

        Returns:
            The user document without its ``password`` field, or None when no
            such user exists or the lookup fails (the error is logged).
        """
        try:
            user = self.users.find_one(
                {"username": username},
                {"password": 0}  # Exclude password from results
            )
            return user
        except Exception as e:
            logger.error(f"Error fetching user {username}: {str(e)}")
            return None


# Initialize database connection (runs at import time; raises if MongoDB is unreachable)
db = Database()
125
+
126
def initialize_users():
    """
    Initialize default users if they don't exist.

    A user that is already present is left untouched and counted as a
    success: it is not re-hashed (bcrypt is deliberately slow) and not
    reported as an error, so repeated calls are cheap and quiet.

    Returns tuple of (success_count, total_users, errors) where
    success_count is the number of default users present afterwards
    (created now or already existing) and errors is a list of messages.
    """
    # NOTE(review): hard-coded demo credentials — confirm these are not
    # seeded in a production deployment.
    default_users = [
        {"username": "Tony", "password": "password123", "role": "engineering"},
        {"username": "Bruce", "password": "securepass", "role": "marketing"},
        {"username": "Sam", "password": "financepass", "role": "finance"},
        {"username": "Peter", "password": "pete123", "role": "engineering"},
        {"username": "Sid", "password": "sidpass123", "role": "marketing"},
        {"username": "Natasha", "password": "hrpass123", "role": "hr"}
    ]

    success_count = 0
    errors = []

    for user in default_users:
        username = user["username"]
        try:
            # Cheap lookup first: skip the bcrypt hash for users that already exist.
            if db.get_user(username) is not None:
                success_count += 1
                logger.info(f"User already exists, skipping: {username}")
                continue

            if db.add_user(username, user["password"], user["role"]):
                success_count += 1
                logger.info(f"Initialized user: {username}")
            else:
                errors.append(f"Failed to add user: {username}")
        except Exception as e:
            error_msg = f"Error initializing user {username}: {str(e)}"
            logger.error(error_msg)
            errors.append(error_msg)

    logger.info(f"User initialization complete. Success: {success_count}/{len(default_users)}")
    if errors:
        logger.warning(f"Encountered {len(errors)} errors during user initialization")

    return success_count, len(default_users), errors
ingest.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chromadb
3
+ from pathlib import Path
4
+ from dotenv import load_dotenv
5
+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings
6
+ from llama_index.vector_stores.chroma import ChromaVectorStore
7
+ from llama_index.embeddings.cohere import CohereEmbedding
8
+
9
# Pull configuration (API keys) from a local .env file, if one exists
load_dotenv()

# The Cohere key is mandatory: abort at import time when it is missing
cohere_api_key = os.getenv("COHERE_API_KEY")
if not cohere_api_key:
    raise ValueError("COHERE_API_KEY not found in environment variables")

# Build the embedding model used for indexing ...
embed_model = CohereEmbedding(
    model_name="embed-english-v3.0",
    input_type="search_document",
    cohere_api_key=cohere_api_key,
)

# ... and register it as LlamaIndex's global default
Settings.embed_model = embed_model
26
+
27
def _load_folder_documents(folder, department, doc_type, progress_label, error_label):
    """Read every .md/.txt/.csv file directly inside ``folder`` into Documents.

    Args:
        folder: ``Path`` of the directory to scan (non-recursive).
        department: value stored in each document's ``department`` metadata.
        doc_type: value stored in each document's ``type`` metadata.
        progress_label: prefix of the per-file progress message.
        error_label: prefix of the per-file error message.

    Returns:
        List of documents; empty if the folder is missing. Files that cannot
        be read are reported and skipped.
    """
    from llama_index.core import Document

    loaded = []
    if not (folder.exists() and folder.is_dir()):
        return loaded

    # sorted() makes the indexing order deterministic across runs
    for file_path in sorted(folder.glob("*")):
        if not (file_path.is_file() and file_path.suffix in ('.md', '.txt', '.csv')):
            continue
        print(f"{progress_label}{file_path.name}...")
        try:
            content = file_path.read_text(encoding='utf-8')
            loaded.append(
                Document(
                    text=content,
                    metadata={
                        "source": str(file_path.name),
                        "department": department,
                        "type": doc_type,
                    },
                )
            )
        except Exception as e:
            print(f"{error_label}{file_path}: {str(e)}")
    return loaded


def process_documents(department: str, base_dir: str = "./resources/data"):
    """
    Process and index documents for a specific department

    Rebuilds the ``documents`` collection under ``./chroma_db/<department>``
    from the files in ``<base_dir>/<department>`` plus the shared files in
    ``<base_dir>/general``. Any existing collection is dropped first.

    Args:
        department: The department name (e.g., 'hr', 'engineering')
        base_dir: Base directory containing department folders
    """
    print(f"Processing documents for {department} department...")

    # Define paths
    dept_path = Path(base_dir) / department
    general_path = Path(base_dir) / "general"
    persist_dir = f"./chroma_db/{department}"

    # Create directory if it doesn't exist
    os.makedirs(persist_dir, exist_ok=True)

    # Initialize Chroma client
    chroma_client = chromadb.PersistentClient(path=persist_dir)

    # Clear existing collection if it exists. Best-effort: a missing
    # collection is expected on the first run. `Exception` (rather than a
    # bare except) lets KeyboardInterrupt/SystemExit propagate.
    try:
        chroma_client.delete_collection("documents")
    except Exception as e:
        print(f"No existing collection cleared: {str(e)}")

    # Create a new collection
    chroma_collection = chroma_client.get_or_create_collection("documents")

    # Create vector store
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Department-specific files first, then the shared "general" files
    documents = _load_folder_documents(
        dept_path, department, "department_specific",
        "Processing ", "Error processing ",
    )
    documents += _load_folder_documents(
        general_path, "general", "general",
        "Processing general document: ", "Error processing general document ",
    )

    if not documents:
        print(f"No documents found for {department} department.")
        return

    print(f"Indexing {len(documents)} documents...")

    # Create index with the documents (embeds them and writes to the vector store)
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,
        embed_model=embed_model
    )

    print(f"✅ Successfully indexed {len(documents)} documents for {department} department")
    print(f"Index stored in: {persist_dir}")
128
+
129
def main():
    """Main function to process documents for all departments"""
    departments = ["hr", "engineering", "finance", "marketing"]
    banner = '=' * 50

    for dept in departments:
        print(f"\n{banner}")
        print(f"Processing {dept.upper()} department")
        print(f"{banner}")
        process_documents(dept)

    # Check-mark emoji restored (was stored as mis-decoded bytes)
    print("\n✅ Document processing completed for all departments!")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ llama-index
3
+ chromadb
4
+ cohere
5
+ python-dotenv
6
+ pandas
7
+ python-multipart
8
+ llama-index-core
9
+ llama-index-readers-file
10
+ llama-index-embeddings-cohere
11
+ ipython
12
+ llama-index-vector-stores-chroma
13
+ llama-index-llms-groq
14
+ pysqlite3-binary
15
+ pymongo
16
+ bcrypt
17
+ openinference-instrumentation-llama-index
18
+ arize-otel
test_mongodb_integration.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test script for MongoDB integration.
3
+ Run this script to verify the MongoDB connection and user management.
4
+ """
5
+ import os
6
+ import sys
7
+ from dotenv import load_dotenv
8
+ from database import db, initialize_users
9
+
10
+ # Load environment variables from .env file
11
+ load_dotenv()
12
+
13
def test_connection():
    """Test MongoDB connection

    Returns:
        True if the server answers a ping, False otherwise (the error is printed).
    """
    try:
        # Test the connection by pinging the database
        db.client.admin.command('ping')
    except Exception as e:
        print(f"❌ Failed to connect to MongoDB: {e}")
        return False

    print("✅ Successfully connected to MongoDB")
    return True
23
+
24
def test_user_management():
    """Test user management functions

    Exercises add_user, verify_user (right and wrong password) and get_user
    against a throw-away account.

    Returns:
        True if every check passes, False at the first failing check.
    """
    test_username = "test_user_123"
    test_password = "test_password_123"
    test_role = "test_role"

    # Clean up test user if exists
    db.users.delete_one({"username": test_username})

    try:
        # Test add_user
        print("\nTesting add_user...")
        if db.add_user(test_username, test_password, test_role):
            print(f"✅ Successfully added test user: {test_username}")
        else:
            print("❌ Failed to add test user")
            return False

        # Test verify_user with correct password
        print("\nTesting verify_user with correct password...")
        user = db.verify_user(test_username, test_password)
        if user and user["username"] == test_username and user["role"] == test_role:
            print("✅ Successfully verified user with correct password")
        else:
            print("❌ Failed to verify user with correct password")
            return False

        # Test verify_user with incorrect password
        print("\nTesting verify_user with incorrect password...")
        user = db.verify_user(test_username, "wrong_password")
        if user is None:
            print("✅ Correctly rejected incorrect password")
        else:
            print("❌ Incorrectly accepted wrong password")
            return False

        # Test get_user
        print("\nTesting get_user...")
        user = db.get_user(test_username)
        if user and user["username"] == test_username and user["role"] == test_role:
            print("✅ Successfully retrieved user details")
        else:
            print("❌ Failed to retrieve user details")
            return False

        return True
    finally:
        # Clean up on every path, so a failed check never leaves the test
        # account behind in the database.
        db.users.delete_one({"username": test_username})
71
+
72
def test_initialize_users():
    """Test user initialization

    WARNING: deletes the default accounts (Tony, Bruce, Sam, Peter, Sid,
    Natasha) from the connected database before re-creating them.

    Returns:
        True if initialize_users() reports every default user as successful.
    """
    print("\nTesting user initialization...")
    try:
        # Clean up any existing test users
        test_usernames = ["Tony", "Bruce", "Sam", "Peter", "Sid", "Natasha"]
        db.users.delete_many({"username": {"$in": test_usernames}})

        success_count, total_users, errors = initialize_users()

        if errors:
            print(f"⚠️ Encountered {len(errors)} errors during user initialization:")
            for error in errors:
                print(f" - {error}")

        if success_count != total_users:
            print(f"❌ Only initialized {success_count}/{total_users} users")
            return False

        print(f"✅ Successfully initialized {success_count}/{total_users} users")
        return True
    except Exception as e:
        print(f"❌ Error during user initialization test: {str(e)}")
        return False
96
+
97
if __name__ == "__main__":
    print("=== Testing MongoDB Integration ===")

    # Check if required environment variables are set
    required_vars = ['MONGO_URI']
    missing_vars = [var for var in required_vars if not os.getenv(var)]

    if missing_vars:
        print("❌ Missing required environment variables:")
        for var in missing_vars:
            print(f" - {var}")
        print("\nPlease create a .env file with these variables. See .env.example")
        sys.exit(1)

    # Run tests; without a working connection nothing else can run
    connection_ok = test_connection()
    if not connection_ok:
        print("\n❌ Connection test failed. Please check your MongoDB connection details.")
        sys.exit(1)

    print("\n=== Running User Management Tests ===")
    user_tests_ok = test_user_management()

    print("\n=== Running User Initialization Test ===")
    init_ok = test_initialize_users()

    # The exit status mirrors the overall result (0 = all passed)
    if user_tests_ok and init_ok:
        print("\n✅ All tests passed!")
        sys.exit(0)

    print("\n❌ Some tests failed")
    sys.exit(1)