tony-42069 committed on
Commit
f3dfbd4
·
1 Parent(s): e0eceb3

Simplified PDF processing and dependencies

Browse files
.deployment ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [config]
2
+ SCM_DO_BUILD_DURING_DEPLOYMENT=true
3
+ PYTHON_ENABLE_GUNICORN=false
.dockerignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ .env
4
+ __pycache__
5
+ *.pyc
6
+ vector_store/
7
+ venv/
8
+ .pytest_cache/
9
+ logs/
.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Next.js
2
+ .next/
3
+ node_modules/
4
+ out/
5
+
6
+ # Virtual environment
7
+ venv/
8
+ env/
9
+ ENV/
10
+
11
+ # Python
12
+ __pycache__/
13
+ *.py[cod]
14
+ *$py.class
15
+
16
+ # Distribution / packaging
17
+ dist/
18
+ build/
19
+ *.egg-info/
20
+
21
+ # Local development settings
22
+ .env
23
+ .env.local
24
+
25
+ # IDE
26
+ .vscode/
27
+ .idea/
28
+
29
+ # Operating System
30
+ .DS_Store
31
+ Thumbs.db
32
+
33
+ # Misc
34
+ *.pem
35
+ .vercel
Dockerfile CHANGED
@@ -1,23 +1,11 @@
1
  FROM python:3.10-slim
2
 
3
- WORKDIR /home/user/app
4
-
5
- # Install git-lfs and other dependencies
6
- RUN apt-get update && \
7
- apt-get install -y git git-lfs poppler-utils && \
8
- rm -rf /var/lib/apt/lists/* && \
9
- git lfs install
10
 
11
  # Copy requirements first for better caching
12
  COPY requirements.txt .
13
  RUN pip install -r requirements.txt
14
 
15
- # Initialize git-lfs and copy the application
16
- COPY .gitattributes .
17
- COPY Dataset/Commercial\ Lending\ 101.pdf Dataset/
18
- RUN ls -la Dataset && \
19
- stat Dataset/Commercial\ Lending\ 101.pdf
20
-
21
  # Copy the rest of the application
22
  COPY . .
23
 
@@ -26,5 +14,5 @@ ENV PORT=8501
26
 
27
  EXPOSE ${PORT}
28
 
29
- # Use the correct path to app.py and make port configurable
30
- CMD ["streamlit", "run", "app.py", "--server.port=${PORT}", "--server.address=0.0.0.0"]
 
1
  FROM python:3.10-slim
2
 
3
+ WORKDIR /app
 
 
 
 
 
 
4
 
5
  # Copy requirements first for better caching
6
  COPY requirements.txt .
7
  RUN pip install -r requirements.txt
8
 
 
 
 
 
 
 
9
  # Copy the rest of the application
10
  COPY . .
11
 
 
14
 
15
  EXPOSE ${PORT}
16
 
17
+ # Use the correct path to app.py
18
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
app/config.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Configuration management for the CRE Chatbot application.
"""
import os
from dotenv import load_dotenv

# Load environment variables from a local .env file, if present.
load_dotenv()

# Azure OpenAI Configuration
# NOTE: the API key is read from the AZURE_OPENAI_KEY environment variable,
# even though the module-level constant is named AZURE_OPENAI_API_KEY.
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_KEY')
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME')
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME')

# Application Configuration
MAX_CHUNK_SIZE = 1000
OVERLAP_SIZE = 200
TEMPERATURE = 0.7
MAX_TOKENS = 500

# Logging Configuration
LOG_LEVEL = "INFO"
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
LOG_FILE = "logs/app.log"

# Vector Store Configuration
VECTOR_STORE_PATH = "vector_store"

def validate_config():
    """Validate that all required configuration variables are set.

    Raises:
        ValueError: listing every required environment variable that is unset.
    """
    # BUG FIX: the old list checked 'AZURE_OPENAI_API_KEY', an environment
    # variable this module never reads -- the key is loaded from
    # 'AZURE_OPENAI_KEY' above -- so validation failed even when the key
    # was correctly configured. Check the names that are actually read.
    required_vars = [
        'AZURE_OPENAI_ENDPOINT',
        'AZURE_OPENAI_KEY',
        'AZURE_OPENAI_DEPLOYMENT_NAME',
        'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
    ]

    missing_vars = [var for var in required_vars if not os.getenv(var)]

    if missing_vars:
        raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")

# Validate at import time so misconfiguration fails fast.
validate_config()
app/logging.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Logging configuration for the CRE Chatbot application.
"""
import logging
import os
from logging.handlers import RotatingFileHandler
from .config import LOG_LEVEL, LOG_FORMAT, LOG_FILE

# Rotate log files once they reach 10 MB.
_MAX_BYTES = 10485760

def setup_logging():
    """Set up logging configuration for the application.

    Safe to call more than once (e.g. on Streamlit script reruns):
    handlers are attached only the first time, so log lines are not
    duplicated by repeated initialization.

    Returns:
        dict: component name -> logging.Logger for 'api', 'pdf', 'rag', 'app'.
    """
    # Create logs directory if it doesn't exist
    os.makedirs('logs', exist_ok=True)

    # Set up root logger
    logger = logging.getLogger()
    logger.setLevel(LOG_LEVEL)

    # BUG FIX: previously every call appended a fresh console handler and
    # file handler to the root logger, so each re-initialization multiplied
    # every emitted log line. Attach handlers only when none exist yet.
    if not logger.handlers:
        formatter = logging.Formatter(LOG_FORMAT)

        # Console Handler
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

        # File Handler
        file_handler = RotatingFileHandler(
            LOG_FILE,
            maxBytes=_MAX_BYTES,
            backupCount=5
        )
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Component loggers; these also propagate to the root handlers in
    # addition to writing their own logs/<name>.log file.
    loggers = {
        name: setup_component_logger(name)
        for name in ('api', 'pdf', 'rag', 'app')
    }

    return loggers

def setup_component_logger(name):
    """Set up (or return the already-configured) logger for one component."""
    logger = logging.getLogger(name)
    logger.setLevel(LOG_LEVEL)

    # Idempotent: skip handler creation if this logger was configured before,
    # otherwise repeated setup duplicates every line in logs/<name>.log.
    if not logger.handlers:
        handler = RotatingFileHandler(
            f'logs/{name}.log',
            maxBytes=_MAX_BYTES,
            backupCount=3
        )
        handler.setFormatter(logging.Formatter(LOG_FORMAT))
        logger.addHandler(handler)

    return logger
app/main.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Main Streamlit application for the CRE Chatbot.
"""
import logging
import streamlit as st
from io import BytesIO
import sys
import os

# Add the project root to Python path so `app.*` and `src.*` imports resolve
# when this file is launched directly via `streamlit run`.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.config import validate_config, AZURE_OPENAI_DEPLOYMENT_NAME
from app.logging import setup_logging
from src.pdf_processor import PDFProcessor
from src.rag_engine import RAGEngine

# Setup logging (runs at import time; returns the component logger map)
loggers = setup_logging()
logger = logging.getLogger('app')

# Page configuration
st.set_page_config(
    page_title="CRE Knowledge Assistant",
    page_icon="🏢",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for the page background and chat bubbles
st.markdown("""
<style>
    .main {
        background-color: #f5f5f5;
    }
    .stApp {
        max-width: 1200px;
        margin: 0 auto;
    }
    .chat-message {
        padding: 1.5rem;
        border-radius: 0.5rem;
        margin-bottom: 1rem;
        display: flex;
        flex-direction: column;
    }
    .chat-message.user {
        background-color: #e3f2fd;
    }
    .chat-message.assistant {
        background-color: #f3e5f5;
    }
    .chat-message .message {
        margin-top: 0.5rem;
    }
</style>
""", unsafe_allow_html=True)

# Initialize session state on first run (Streamlit re-executes this script
# on every interaction; these guards make the state survive reruns).
if 'rag_engine' not in st.session_state:
    st.session_state.rag_engine = None
if 'pdf_processor' not in st.session_state:
    st.session_state.pdf_processor = PDFProcessor()
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'uploaded_pdfs' not in st.session_state:
    st.session_state.uploaded_pdfs = set()
69
def initialize_rag_engine(deployment_name: str):
    """Create the RAG engine in session state, surfacing failures in the UI."""
    try:
        engine = RAGEngine(deployment_name)
    except Exception as e:
        logger.error(f"Error initializing the application: {str(e)}")
        st.error(f"Error initializing the application: {str(e)}")
    else:
        st.session_state.rag_engine = engine
        logger.info("RAG Engine initialized successfully")
77
+
78
def process_pdf(pdf_file):
    """Process an uploaded PDF and add its chunks to the vector store.

    Args:
        pdf_file: Streamlit UploadedFile exposing `.name` and `.read()`.
    """
    try:
        # Check if PDF was already processed
        if pdf_file.name in st.session_state.uploaded_pdfs:
            st.warning(f"'{pdf_file.name}' has already been processed!")
            return

        with st.spinner(f"Processing {pdf_file.name}..."):
            # Read PDF content
            pdf_content = pdf_file.read()

            # Process PDF and get chunks
            chunks = st.session_state.pdf_processor.process_pdf(
                BytesIO(pdf_content)
            )

            # BUG FIX: PDFProcessor.process_pdf returns dicts of the form
            # {'text': ..., 'metadata': {...}}; the old code indexed them as
            # tuples (chunk[0] / chunk[1]), which raises KeyError on a dict.
            texts = [chunk['text'] for chunk in chunks]
            metadata = [{"source": pdf_file.name, **chunk['metadata']} for chunk in chunks]
            st.session_state.rag_engine.add_documents(texts, metadata)

            # Mark PDF as processed
            st.session_state.uploaded_pdfs.add(pdf_file.name)

            st.success(f"Successfully processed '{pdf_file.name}'!")
            logger.info(f"PDF '{pdf_file.name}' processed and added to vector store")

    except Exception as e:
        logger.error(f"Error processing PDF: {str(e)}")
        st.error(f"Error processing PDF: {str(e)}")
109
+
110
def display_chat_message(role: str, content: str):
    """Render one chat bubble for either the user or the assistant."""
    speaker = 'You' if role == 'user' else 'Assistant'
    bubble = f"""
    <div class="chat-message {role}">
        <div class="role"><strong>{speaker}:</strong></div>
        <div class="message">{content}</div>
    </div>
    """
    with st.container():
        st.markdown(bubble, unsafe_allow_html=True)
119
+
120
def main():
    """Render the full application: header, sidebar, and chat interface."""
    # Page header
    col1, col2 = st.columns([2, 1])
    with col1:
        st.title("🏢 CRE Knowledge Assistant")
        st.markdown("*Your AI guide for commercial real estate concepts*")

    # Sidebar: knowledge-base management and model settings.
    with st.sidebar:
        st.header("📚 Knowledge Base")
        st.markdown("Upload your CRE documents to enhance the assistant's knowledge.")

        # Collapsible model configuration
        with st.expander("⚙️ Model Configuration"):
            deployment_name = st.text_input(
                "Model Deployment Name",
                value=AZURE_OPENAI_DEPLOYMENT_NAME,
                help="Enter your Azure OpenAI model deployment name"
            )

        # Lazily create the RAG engine the first time through.
        if not st.session_state.rag_engine:
            initialize_rag_engine(deployment_name)

        # Document upload
        st.subheader("📄 Upload Documents")
        uploaded_files = st.file_uploader(
            "Choose PDF files",
            type="pdf",
            accept_multiple_files=True,
            help="Upload one or more PDF files to add to the knowledge base"
        )
        if uploaded_files:
            for pdf_file in uploaded_files:
                process_pdf(pdf_file)

        # List everything already ingested.
        if st.session_state.uploaded_pdfs:
            st.subheader("📚 Processed Documents")
            for pdf_name in st.session_state.uploaded_pdfs:
                st.markdown(f"✓ {pdf_name}")

    # Main chat interface (only once the engine exists).
    if st.session_state.rag_engine:
        # Replay the conversation so far.
        for message in st.session_state.chat_history:
            display_chat_message(
                role=message["role"],
                content=message["content"]
            )

        user_question = st.text_input(
            "Ask a question about commercial real estate:",
            placeholder="e.g., What is LTV? How is DSCR calculated?",
            key="user_question"
        )

        if user_question:
            try:
                # Record the question, answer it, record the answer.
                st.session_state.chat_history.append({
                    "role": "user",
                    "content": user_question
                })

                with st.spinner("Generating answer..."):
                    response = st.session_state.rag_engine.query(user_question)

                    st.session_state.chat_history.append({
                        "role": "assistant",
                        "content": response["answer"]
                    })

                    # Show the newest exchange without waiting for a rerun.
                    display_chat_message("user", user_question)
                    display_chat_message("assistant", response["answer"])

            except Exception as e:
                logger.error(f"Error generating answer: {str(e)}")
                st.error(f"Error generating answer: {str(e)}")

    else:
        st.info("👆 Please upload PDF documents in the sidebar to start asking questions!")

if __name__ == "__main__":
    main()
frontend/main.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Minimal Streamlit frontend that talks to the backend over HTTP."""
import streamlit as st
import requests
import sys
import os

# Add the project root to Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.config import validate_config
from app.logging import setup_logging

# BUG FIX: `requests` requires absolute URLs; posting to "api/process_pdf"
# raises requests.exceptions.MissingSchema. The backend base URL is taken
# from the API_BASE_URL environment variable, defaulting to the local
# development server.
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000").rstrip("/")

def main():
    """Render the upload + query UI and proxy both to the backend API."""
    # Setup logging
    setup_logging()

    st.set_page_config(
        page_title="CRE Knowledge Assistant",
        page_icon="🤖",
        layout="wide"
    )

    st.title("CRE Knowledge Assistant")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

    if uploaded_file:
        # Convert file to bytes
        file_bytes = uploaded_file.getvalue()

        # Send to API endpoint
        response = requests.post(
            f"{API_BASE_URL}/api/process_pdf",
            files={"file": (uploaded_file.name, file_bytes, "application/pdf")}
        )

        if response.status_code == 200:
            st.success("PDF processed successfully!")
        else:
            st.error("Error processing PDF")

    # Query input
    query = st.text_input("Ask a question about your documents:")

    if query:
        # Send query to API endpoint
        response = requests.post(
            f"{API_BASE_URL}/api/query",
            json={"query": query}
        )

        if response.status_code == 200:
            result = response.json()
            st.write("Answer:", result["answer"])
        else:
            st.error("Error processing query")

if __name__ == "__main__":
    main()
frontend/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ streamlit==1.29.0
2
+ requests==2.31.0
3
+ python-dotenv==1.0.0
index.html ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
<!DOCTYPE html>
<html>
<head>
    <title>CRE Knowledge Assistant</title>
    <style>
        /* Shell page fills the viewport; the embedded app owns scrolling. */
        body, html {
            margin: 0;
            padding: 0;
            height: 100%;
            overflow: hidden;
        }
        iframe {
            width: 100%;
            height: 100vh;
            border: none;
        }
    </style>
</head>
<body>
    <!-- The Streamlit app served under /api, embedded full-screen. -->
    <iframe src="/api" allow="camera;microphone"></iframe>
    <script>
        // Handle any messages from the Streamlit app
        window.addEventListener('message', function (e) {
            if (e.data.type === 'streamlit') {
                console.log('Received message from Streamlit:', e.data);
            }
        });
    </script>
</body>
</html>
pdf_processor.py CHANGED
@@ -1,34 +1,17 @@
1
  from typing import List, Dict
2
  import os
3
- import subprocess
4
- import tempfile
5
- import pypdf
6
  from langchain.document_loaders import PyPDFLoader
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
 
9
  class PDFProcessor:
10
  def __init__(self):
11
  self.text_splitter = RecursiveCharacterTextSplitter(
12
- chunk_size=500,
13
- chunk_overlap=50,
14
  length_function=len,
15
- separators=["\n\n", "\n", ".", " ", ""]
16
  )
17
 
18
- def extract_text_with_pdftotext(self, pdf_path: str) -> str:
19
- """Use pdftotext (from poppler-utils) to extract text."""
20
- try:
21
- result = subprocess.run(
22
- ['pdftotext', pdf_path, '-'],
23
- capture_output=True,
24
- text=True,
25
- check=True
26
- )
27
- return result.stdout
28
- except Exception as e:
29
- print(f"pdftotext extraction failed: {str(e)}")
30
- return ""
31
-
32
  def process_pdf(self, pdf_path: str) -> List[Dict]:
33
  """
34
  Process a PDF file and return chunks of text with metadata.
@@ -39,85 +22,21 @@ class PDFProcessor:
39
  Returns:
40
  List[Dict]: List of text chunks with metadata
41
  """
42
- print(f"Processing PDF at: {os.path.abspath(pdf_path)}")
43
-
44
- if not os.path.exists(pdf_path):
45
- raise FileNotFoundError(f"PDF file not found at {pdf_path}")
46
-
47
- file_size = os.path.getsize(pdf_path)
48
- print(f"PDF file exists, size: {file_size} bytes")
49
-
50
- if file_size < 1000: # Less than 1KB
51
- raise ValueError(f"PDF file seems too small ({file_size} bytes). Might be corrupted or a pointer file.")
52
-
53
- # Try all three methods
54
- methods = [
55
- ("PyPDFLoader", self._try_pypdf_loader),
56
- ("pypdf", self._try_pypdf_direct),
57
- ("pdftotext", self._try_pdftotext)
58
- ]
59
-
60
- last_error = None
61
- for method_name, method in methods:
62
- try:
63
- print(f"\nTrying {method_name} method...")
64
- chunks = method(pdf_path)
65
- if chunks:
66
- print(f"Successfully extracted {len(chunks)} chunks using {method_name}")
67
- return chunks
68
- except Exception as e:
69
- print(f"Error with {method_name}: {str(e)}")
70
- last_error = e
71
-
72
- raise Exception(f"All PDF processing methods failed. Last error: {str(last_error)}")
73
-
74
- def _try_pypdf_loader(self, pdf_path: str) -> List[Dict]:
75
  loader = PyPDFLoader(pdf_path)
76
  pages = loader.load()
77
- print(f"Loaded {len(pages)} pages")
78
 
79
- chunks = []
80
- for page in pages:
81
- content = page.page_content.strip()
82
- if content:
83
- page_chunks = self.text_splitter.split_text(content)
84
- for chunk in page_chunks:
85
- if chunk.strip():
86
- chunks.append({
87
- 'text': chunk,
88
- 'metadata': {'page': page.metadata['page']}
89
- })
90
- return chunks
91
-
92
- def _try_pypdf_direct(self, pdf_path: str) -> List[Dict]:
93
- with open(pdf_path, 'rb') as file:
94
- pdf = pypdf.PdfReader(file)
95
- print(f"Opened PDF with {len(pdf.pages)} pages")
96
-
97
- chunks = []
98
- for page_num in range(len(pdf.pages)):
99
- content = pdf.pages[page_num].extract_text().strip()
100
- if content:
101
- page_chunks = self.text_splitter.split_text(content)
102
- for chunk in page_chunks:
103
- if chunk.strip():
104
- chunks.append({
105
- 'text': chunk,
106
- 'metadata': {'page': page_num + 1}
107
- })
108
- return chunks
109
-
110
- def _try_pdftotext(self, pdf_path: str) -> List[Dict]:
111
- text = self.extract_text_with_pdftotext(pdf_path)
112
- if not text.strip():
113
- return []
114
-
115
- chunks = []
116
- page_chunks = self.text_splitter.split_text(text)
117
- for i, chunk in enumerate(page_chunks):
118
- if chunk.strip():
119
- chunks.append({
120
- 'text': chunk,
121
- 'metadata': {'page': 1} # Page info not available with this method
122
- })
123
- return chunks
 
from typing import List, Dict
import os
import tempfile
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

class PDFProcessor:
    """Split PDF documents into overlapping text chunks for retrieval."""

    def __init__(self):
        # Chunk sizes mirror MAX_CHUNK_SIZE / OVERLAP_SIZE in app/config.py.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def process_pdf(self, pdf_path) -> List[Dict]:
        """
        Process a PDF file and return chunks of text with metadata.

        Args:
            pdf_path: filesystem path to a PDF file, or a binary file-like
                object (e.g. io.BytesIO) containing PDF data.

        Returns:
            List[Dict]: chunks of the form
                {'text': str, 'metadata': {'page': int}} with 1-based pages.
        """
        # ROBUSTNESS FIX: callers in app/main.py pass a BytesIO, but
        # PyPDFLoader only accepts a filesystem path, so file-like input
        # previously failed. Spool file-like input to a temporary file.
        tmp_path = None
        try:
            if hasattr(pdf_path, 'read'):
                with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
                    tmp.write(pdf_path.read())
                    tmp_path = tmp.name
                pdf_path = tmp_path

            # Load PDF
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()

            # Split text into chunks
            chunks = self.text_splitter.split_documents(pages)

            # Format chunks with metadata (PyPDF page numbers are 0-based).
            processed_chunks = []
            for chunk in chunks:
                processed_chunks.append({
                    'text': chunk.page_content,
                    'metadata': {
                        'page': chunk.metadata.get('page', 0) + 1
                    }
                })

            return processed_chunks
        finally:
            # Remove the temporary spool file, if one was created.
            if tmp_path is not None and os.path.exists(tmp_path):
                os.remove(tmp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,15 +1,10 @@
1
  streamlit==1.29.0
2
  openai==1.6.1
3
  python-dotenv==1.0.0
4
- pypdf==3.17.1
5
- PyPDF2==3.0.1
6
  langchain==0.0.352
7
  chromadb==0.4.18
8
  pydantic==2.5.2
9
  pydantic-settings==2.1.0
10
  azure-storage-blob==12.19.0
11
  numpy>=1.22.5
12
- duckdb==0.9.2
13
- typing-inspect==0.8.0
14
- overrides==7.3.1
15
- tiktoken==0.5.1
 
1
  streamlit==1.29.0
2
  openai==1.6.1
3
  python-dotenv==1.0.0
 
 
4
  langchain==0.0.352
5
  chromadb==0.4.18
6
  pydantic==2.5.2
7
  pydantic-settings==2.1.0
8
  azure-storage-blob==12.19.0
9
  numpy>=1.22.5
10
+ pypdf==3.17.1
 
 
 
startup.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
#!/bin/sh
# Azure App Service startup script: launch the Streamlit app on port 8000.
# `exec` replaces the shell so Streamlit receives shutdown signals directly.
exec streamlit run app/main.py --server.port 8000 --server.address 0.0.0.0