Spaces:
Sleeping
Sleeping
tony-42069
committed on
Commit
·
f3dfbd4
1
Parent(s):
e0eceb3
Simplified PDF processing and dependencies
Browse files- .deployment +3 -0
- .dockerignore +9 -0
- .gitignore +35 -0
- Dockerfile +3 -15
- app/__init__.py +1 -0
- app/config.py +45 -0
- app/logging.py +59 -0
- app/main.py +209 -0
- frontend/main.py +59 -0
- frontend/requirements.txt +3 -0
- index.html +30 -0
- pdf_processor.py +18 -99
- requirements.txt +1 -6
- startup.sh +2 -0
.deployment
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[config]
|
2 |
+
SCM_DO_BUILD_DURING_DEPLOYMENT=true
|
3 |
+
PYTHON_ENABLE_GUNICORN=false
|
.dockerignore
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.git
|
2 |
+
.gitignore
|
3 |
+
.env
|
4 |
+
__pycache__
|
5 |
+
*.pyc
|
6 |
+
vector_store/
|
7 |
+
venv/
|
8 |
+
.pytest_cache/
|
9 |
+
logs/
|
.gitignore
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Next.js
|
2 |
+
.next/
|
3 |
+
node_modules/
|
4 |
+
out/
|
5 |
+
|
6 |
+
# Virtual environment
|
7 |
+
venv/
|
8 |
+
env/
|
9 |
+
ENV/
|
10 |
+
|
11 |
+
# Python
|
12 |
+
__pycache__/
|
13 |
+
*.py[cod]
|
14 |
+
*$py.class
|
15 |
+
|
16 |
+
# Distribution / packaging
|
17 |
+
dist/
|
18 |
+
build/
|
19 |
+
*.egg-info/
|
20 |
+
|
21 |
+
# Local development settings
|
22 |
+
.env
|
23 |
+
.env.local
|
24 |
+
|
25 |
+
# IDE
|
26 |
+
.vscode/
|
27 |
+
.idea/
|
28 |
+
|
29 |
+
# Operating System
|
30 |
+
.DS_Store
|
31 |
+
Thumbs.db
|
32 |
+
|
33 |
+
# Misc
|
34 |
+
*.pem
|
35 |
+
.vercel
|
Dockerfile
CHANGED
@@ -1,23 +1,11 @@
|
|
1 |
FROM python:3.10-slim
|
2 |
|
3 |
-
WORKDIR /
|
4 |
-
|
5 |
-
# Install git-lfs and other dependencies
|
6 |
-
RUN apt-get update && \
|
7 |
-
apt-get install -y git git-lfs poppler-utils && \
|
8 |
-
rm -rf /var/lib/apt/lists/* && \
|
9 |
-
git lfs install
|
10 |
|
11 |
# Copy requirements first for better caching
|
12 |
COPY requirements.txt .
|
13 |
RUN pip install -r requirements.txt
|
14 |
|
15 |
-
# Initialize git-lfs and copy the application
|
16 |
-
COPY .gitattributes .
|
17 |
-
COPY Dataset/Commercial\ Lending\ 101.pdf Dataset/
|
18 |
-
RUN ls -la Dataset && \
|
19 |
-
stat Dataset/Commercial\ Lending\ 101.pdf
|
20 |
-
|
21 |
# Copy the rest of the application
|
22 |
COPY . .
|
23 |
|
@@ -26,5 +14,5 @@ ENV PORT=8501
|
|
26 |
|
27 |
EXPOSE ${PORT}
|
28 |
|
29 |
-
# Use the correct path to app.py
|
30 |
-
CMD ["streamlit", "run", "app.py", "--server.port
|
|
|
1 |
FROM python:3.10-slim
|
2 |
|
3 |
+
WORKDIR /app
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
# Copy requirements first for better caching
|
6 |
COPY requirements.txt .
|
7 |
RUN pip install -r requirements.txt
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
# Copy the rest of the application
|
10 |
COPY . .
|
11 |
|
|
|
14 |
|
15 |
EXPOSE ${PORT}
|
16 |
|
17 |
+
# Use the correct path to app.py
|
18 |
+
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
app/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
app/config.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Configuration management for the CRE Chatbot application.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
|
7 |
+
# Load environment variables
|
8 |
+
load_dotenv()
|
9 |
+
|
10 |
+
# Azure OpenAI Configuration
|
11 |
+
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
|
12 |
+
AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_KEY')
|
13 |
+
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME')
|
14 |
+
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME')
|
15 |
+
|
16 |
+
# Application Configuration
|
17 |
+
MAX_CHUNK_SIZE = 1000
|
18 |
+
OVERLAP_SIZE = 200
|
19 |
+
TEMPERATURE = 0.7
|
20 |
+
MAX_TOKENS = 500
|
21 |
+
|
22 |
+
# Logging Configuration
|
23 |
+
LOG_LEVEL = "INFO"
|
24 |
+
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
25 |
+
LOG_FILE = "logs/app.log"
|
26 |
+
|
27 |
+
# Vector Store Configuration
|
28 |
+
VECTOR_STORE_PATH = "vector_store"
|
29 |
+
|
30 |
+
def validate_config():
    """Validate that all required Azure OpenAI environment variables are set.

    Raises:
        ValueError: If any required environment variable is unset or empty,
            listing the missing variable names.
    """
    # NOTE: the module reads the API key from the AZURE_OPENAI_KEY env var
    # (see AZURE_OPENAI_API_KEY assignment above), so that is the name that
    # must be validated here -- not 'AZURE_OPENAI_API_KEY', which is only
    # the Python-side constant name and would never be found in the env.
    required_vars = [
        'AZURE_OPENAI_ENDPOINT',
        'AZURE_OPENAI_KEY',
        'AZURE_OPENAI_DEPLOYMENT_NAME',
        'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
    ]

    missing_vars = [var for var in required_vars if not os.getenv(var)]

    if missing_vars:
        raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
|
43 |
+
|
44 |
+
# Validate that all required configuration variables are set.
|
45 |
+
validate_config()
|
app/logging.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Logging configuration for the CRE Chatbot application.
|
3 |
+
"""
|
4 |
+
import logging
|
5 |
+
import os
|
6 |
+
from logging.handlers import RotatingFileHandler
|
7 |
+
from .config import LOG_LEVEL, LOG_FORMAT, LOG_FILE
|
8 |
+
|
9 |
+
def setup_logging():
    """Set up logging for the application and return per-component loggers.

    Configures the root logger with a console handler and a rotating file
    handler, then builds dedicated loggers for each major component.

    Safe to call more than once (e.g. on every Streamlit rerun): handlers
    are attached to the root logger only on the first call, so log lines
    are not duplicated.

    Returns:
        dict: Component name -> logging.Logger for 'api', 'pdf', 'rag', 'app'.
    """
    # Create logs directory if it doesn't exist
    os.makedirs('logs', exist_ok=True)

    # Set up root logger
    logger = logging.getLogger()
    logger.setLevel(LOG_LEVEL)

    # Attach handlers only once; without this guard each call would add
    # another console + file handler and every message would repeat.
    if not logger.handlers:
        formatter = logging.Formatter(LOG_FORMAT)

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

        # Rotating file handler: 10MB per file, keep 5 backups
        file_handler = RotatingFileHandler(
            LOG_FILE,
            maxBytes=10485760,  # 10MB
            backupCount=5
        )
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Create separate loggers for different components
    loggers = {
        'api': setup_component_logger('api'),
        'pdf': setup_component_logger('pdf'),
        'rag': setup_component_logger('rag'),
        'app': setup_component_logger('app')
    }

    return loggers
|
44 |
+
|
45 |
+
def setup_component_logger(name):
    """Set up and return a logger for a specific component.

    Args:
        name: Component name; its log lines go to ``logs/<name>.log``.

    Returns:
        logging.Logger: The configured component logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(LOG_LEVEL)

    # Attach the file handler only once -- this function runs again on
    # every Streamlit rerun via setup_logging(), and logging.getLogger()
    # returns the same logger object each time.
    if not logger.handlers:
        # Ensure the log directory exists even when this is called
        # directly, before setup_logging() has created it.
        os.makedirs('logs', exist_ok=True)

        # Component-specific rotating log file: 10MB per file, 3 backups
        handler = RotatingFileHandler(
            f'logs/{name}.log',
            maxBytes=10485760,  # 10MB
            backupCount=3
        )
        handler.setFormatter(logging.Formatter(LOG_FORMAT))
        logger.addHandler(handler)

    return logger
|
app/main.py
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Main Streamlit application for the CRE Chatbot.
|
3 |
+
"""
|
4 |
+
import logging
|
5 |
+
import streamlit as st
|
6 |
+
from io import BytesIO
|
7 |
+
import sys
|
8 |
+
import os
|
9 |
+
|
10 |
+
# Add the project root to Python path
|
11 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
12 |
+
|
13 |
+
from app.config import validate_config, AZURE_OPENAI_DEPLOYMENT_NAME
|
14 |
+
from app.logging import setup_logging
|
15 |
+
from src.pdf_processor import PDFProcessor
|
16 |
+
from src.rag_engine import RAGEngine
|
17 |
+
|
18 |
+
# Setup logging
|
19 |
+
loggers = setup_logging()
|
20 |
+
logger = logging.getLogger('app')
|
21 |
+
|
22 |
+
# Page configuration
|
23 |
+
st.set_page_config(
|
24 |
+
page_title="CRE Knowledge Assistant",
|
25 |
+
page_icon="🏢",
|
26 |
+
layout="wide",
|
27 |
+
initial_sidebar_state="expanded"
|
28 |
+
)
|
29 |
+
|
30 |
+
# Custom CSS
|
31 |
+
st.markdown("""
|
32 |
+
<style>
|
33 |
+
.main {
|
34 |
+
background-color: #f5f5f5;
|
35 |
+
}
|
36 |
+
.stApp {
|
37 |
+
max-width: 1200px;
|
38 |
+
margin: 0 auto;
|
39 |
+
}
|
40 |
+
.chat-message {
|
41 |
+
padding: 1.5rem;
|
42 |
+
border-radius: 0.5rem;
|
43 |
+
margin-bottom: 1rem;
|
44 |
+
display: flex;
|
45 |
+
flex-direction: column;
|
46 |
+
}
|
47 |
+
.chat-message.user {
|
48 |
+
background-color: #e3f2fd;
|
49 |
+
}
|
50 |
+
.chat-message.assistant {
|
51 |
+
background-color: #f3e5f5;
|
52 |
+
}
|
53 |
+
.chat-message .message {
|
54 |
+
margin-top: 0.5rem;
|
55 |
+
}
|
56 |
+
</style>
|
57 |
+
""", unsafe_allow_html=True)
|
58 |
+
|
59 |
+
# Initialize session state
|
60 |
+
if 'rag_engine' not in st.session_state:
|
61 |
+
st.session_state.rag_engine = None
|
62 |
+
if 'pdf_processor' not in st.session_state:
|
63 |
+
st.session_state.pdf_processor = PDFProcessor()
|
64 |
+
if 'chat_history' not in st.session_state:
|
65 |
+
st.session_state.chat_history = []
|
66 |
+
if 'uploaded_pdfs' not in st.session_state:
|
67 |
+
st.session_state.uploaded_pdfs = set()
|
68 |
+
|
69 |
+
def initialize_rag_engine(deployment_name: str):
    """Initialize the RAG engine with error handling.

    Stores the engine in ``st.session_state.rag_engine``; on failure the
    error is logged and shown in the UI instead of being raised, leaving
    session state unchanged.

    Args:
        deployment_name: Azure OpenAI model deployment name passed to RAGEngine.
    """
    try:
        st.session_state.rag_engine = RAGEngine(deployment_name)
        logger.info("RAG Engine initialized successfully")
    except Exception as e:
        logger.error(f"Error initializing the application: {str(e)}")
        st.error(f"Error initializing the application: {str(e)}")
|
77 |
+
|
78 |
+
def process_pdf(pdf_file):
    """Process an uploaded PDF and add its chunks to the vector store.

    Skips files already processed in this session (tracked by name in
    ``st.session_state.uploaded_pdfs``). Errors are logged and surfaced
    in the UI rather than raised.

    Args:
        pdf_file: Streamlit UploadedFile containing the PDF to ingest.
    """
    try:
        # Check if PDF was already processed
        if pdf_file.name in st.session_state.uploaded_pdfs:
            st.warning(f"'{pdf_file.name}' has already been processed!")
            return

        with st.spinner(f"Processing {pdf_file.name}..."):
            # Read PDF content
            pdf_content = pdf_file.read()

            # Process PDF and get chunks
            chunks = st.session_state.pdf_processor.process_pdf(
                BytesIO(pdf_content)
            )

            # PDFProcessor.process_pdf returns dicts keyed by 'text' and
            # 'metadata' (see pdf_processor.py), not (text, metadata)
            # tuples -- indexing with chunk[0]/chunk[1] would raise KeyError.
            texts = [chunk['text'] for chunk in chunks]
            metadata = [{"source": pdf_file.name, **chunk['metadata']} for chunk in chunks]
            st.session_state.rag_engine.add_documents(texts, metadata)

            # Mark PDF as processed
            st.session_state.uploaded_pdfs.add(pdf_file.name)

            st.success(f"Successfully processed '{pdf_file.name}'!")
            logger.info(f"PDF '{pdf_file.name}' processed and added to vector store")

    except Exception as e:
        logger.error(f"Error processing PDF: {str(e)}")
        st.error(f"Error processing PDF: {str(e)}")
|
109 |
+
|
110 |
+
def display_chat_message(role: str, content: str):
    """Display a chat message with proper styling.

    Args:
        role: Either 'user' or 'assistant'; selects the CSS class and the
            displayed speaker label.
        content: Message text, interpolated into raw HTML.
            NOTE(review): content is not HTML-escaped before rendering with
            unsafe_allow_html=True -- confirm inputs are trusted.
    """
    with st.container():
        st.markdown(f"""
        <div class="chat-message {role}">
            <div class="role"><strong>{'You' if role == 'user' else 'Assistant'}:</strong></div>
            <div class="message">{content}</div>
        </div>
        """, unsafe_allow_html=True)
|
119 |
+
|
120 |
+
def main():
    """Main application function.

    Renders the full Streamlit page: header, a sidebar for model
    configuration and PDF upload, and the chat interface. The chat area is
    only shown once a RAG engine exists in session state; otherwise the
    user is prompted to upload documents first.
    """
    # Header
    col1, col2 = st.columns([2, 1])
    with col1:
        st.title("🏢 CRE Knowledge Assistant")
        st.markdown("*Your AI guide for commercial real estate concepts*")

    # Sidebar
    with st.sidebar:
        st.header("📚 Knowledge Base")
        st.markdown("Upload your CRE documents to enhance the assistant's knowledge.")

        # Model configuration (collapsible); defaults to the deployment
        # name from app.config but lets the user override it per session.
        with st.expander("⚙️ Model Configuration"):
            deployment_name = st.text_input(
                "Model Deployment Name",
                value=AZURE_OPENAI_DEPLOYMENT_NAME,
                help="Enter your Azure OpenAI model deployment name"
            )

        # Initialize RAG engine if not already done
        if not st.session_state.rag_engine:
            initialize_rag_engine(deployment_name)

        # PDF upload section
        st.subheader("📄 Upload Documents")
        uploaded_files = st.file_uploader(
            "Choose PDF files",
            type="pdf",
            accept_multiple_files=True,
            help="Upload one or more PDF files to add to the knowledge base"
        )

        # process_pdf() itself skips files already ingested this session
        if uploaded_files:
            for pdf_file in uploaded_files:
                process_pdf(pdf_file)

        # Show processed documents
        if st.session_state.uploaded_pdfs:
            st.subheader("📚 Processed Documents")
            for pdf_name in st.session_state.uploaded_pdfs:
                st.markdown(f"✓ {pdf_name}")

    # Main chat interface
    if st.session_state.rag_engine:
        # Display chat history
        for message in st.session_state.chat_history:
            display_chat_message(
                role=message["role"],
                content=message["content"]
            )

        # Chat input
        user_question = st.text_input(
            "Ask a question about commercial real estate:",
            placeholder="e.g., What is LTV? How is DSCR calculated?",
            key="user_question"
        )

        if user_question:
            try:
                # Add user message to chat
                st.session_state.chat_history.append({
                    "role": "user",
                    "content": user_question
                })

                with st.spinner("Generating answer..."):
                    response = st.session_state.rag_engine.query(user_question)

                # Add assistant response to chat
                st.session_state.chat_history.append({
                    "role": "assistant",
                    "content": response["answer"]
                })

                # Display latest messages immediately (history above was
                # rendered before these two were appended)
                display_chat_message("user", user_question)
                display_chat_message("assistant", response["answer"])

            except Exception as e:
                logger.error(f"Error generating answer: {str(e)}")
                st.error(f"Error generating answer: {str(e)}")

    else:
        st.info("👆 Please upload PDF documents in the sidebar to start asking questions!")
|
207 |
+
|
208 |
+
if __name__ == "__main__":
|
209 |
+
main()
|
frontend/main.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import requests
|
3 |
+
import sys
|
4 |
+
import os
|
5 |
+
|
6 |
+
# Add the project root to Python path
|
7 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
8 |
+
|
9 |
+
from app.config import validate_config
|
10 |
+
from app.logging import setup_logging
|
11 |
+
|
12 |
+
def main():
    """Streamlit frontend: upload PDFs and query them via the backend API.

    Reads the backend base URL from the API_BASE_URL environment variable
    (default http://localhost:8000); errors from the API are shown in the UI.
    """
    # Setup logging
    setup_logging()

    st.set_page_config(
        page_title="CRE Knowledge Assistant",
        page_icon="🤖",
        layout="wide"
    )

    st.title("CRE Knowledge Assistant")

    # requests requires an absolute URL -- posting to a bare "api/..."
    # path raises requests.exceptions.MissingSchema before any request
    # is sent, so build full URLs from a configurable base.
    api_base = os.getenv("API_BASE_URL", "http://localhost:8000")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

    if uploaded_file:
        # Convert file to bytes
        file_bytes = uploaded_file.getvalue()

        # Send to API endpoint
        response = requests.post(
            f"{api_base}/api/process_pdf",
            files={"file": (uploaded_file.name, file_bytes, "application/pdf")}
        )

        if response.status_code == 200:
            st.success("PDF processed successfully!")
        else:
            st.error("Error processing PDF")

    # Query input
    query = st.text_input("Ask a question about your documents:")

    if query:
        # Send query to API endpoint
        response = requests.post(
            f"{api_base}/api/query",
            json={"query": query}
        )

        if response.status_code == 200:
            result = response.json()
            st.write("Answer:", result["answer"])
        else:
            st.error("Error processing query")
|
57 |
+
|
58 |
+
if __name__ == "__main__":
|
59 |
+
main()
|
frontend/requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
streamlit==1.29.0
|
2 |
+
requests==2.31.0
|
3 |
+
python-dotenv==1.0.0
|
index.html
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html>
|
3 |
+
<head>
|
4 |
+
<title>CRE Knowledge Assistant</title>
|
5 |
+
<style>
|
6 |
+
body, html {
|
7 |
+
margin: 0;
|
8 |
+
padding: 0;
|
9 |
+
height: 100%;
|
10 |
+
overflow: hidden;
|
11 |
+
}
|
12 |
+
iframe {
|
13 |
+
width: 100%;
|
14 |
+
height: 100vh;
|
15 |
+
border: none;
|
16 |
+
}
|
17 |
+
</style>
|
18 |
+
</head>
|
19 |
+
<body>
|
20 |
+
<iframe src="/api" allow="camera;microphone"></iframe>
|
21 |
+
<script>
|
22 |
+
window.addEventListener('message', function(e) {
|
23 |
+
// Handle any messages from the Streamlit app
|
24 |
+
if (e.data.type === 'streamlit') {
|
25 |
+
console.log('Received message from Streamlit:', e.data);
|
26 |
+
}
|
27 |
+
});
|
28 |
+
</script>
|
29 |
+
</body>
|
30 |
+
</html>
|
pdf_processor.py
CHANGED
@@ -1,34 +1,17 @@
|
|
1 |
from typing import List, Dict
|
2 |
import os
|
3 |
-
import subprocess
|
4 |
-
import tempfile
|
5 |
-
import pypdf
|
6 |
from langchain.document_loaders import PyPDFLoader
|
7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
8 |
|
9 |
class PDFProcessor:
|
10 |
def __init__(self):
|
11 |
self.text_splitter = RecursiveCharacterTextSplitter(
|
12 |
-
chunk_size=
|
13 |
-
chunk_overlap=
|
14 |
length_function=len,
|
15 |
-
separators=["\n\n", "\n", "
|
16 |
)
|
17 |
|
18 |
-
def extract_text_with_pdftotext(self, pdf_path: str) -> str:
|
19 |
-
"""Use pdftotext (from poppler-utils) to extract text."""
|
20 |
-
try:
|
21 |
-
result = subprocess.run(
|
22 |
-
['pdftotext', pdf_path, '-'],
|
23 |
-
capture_output=True,
|
24 |
-
text=True,
|
25 |
-
check=True
|
26 |
-
)
|
27 |
-
return result.stdout
|
28 |
-
except Exception as e:
|
29 |
-
print(f"pdftotext extraction failed: {str(e)}")
|
30 |
-
return ""
|
31 |
-
|
32 |
def process_pdf(self, pdf_path: str) -> List[Dict]:
|
33 |
"""
|
34 |
Process a PDF file and return chunks of text with metadata.
|
@@ -39,85 +22,21 @@ class PDFProcessor:
|
|
39 |
Returns:
|
40 |
List[Dict]: List of text chunks with metadata
|
41 |
"""
|
42 |
-
|
43 |
-
|
44 |
-
if not os.path.exists(pdf_path):
|
45 |
-
raise FileNotFoundError(f"PDF file not found at {pdf_path}")
|
46 |
-
|
47 |
-
file_size = os.path.getsize(pdf_path)
|
48 |
-
print(f"PDF file exists, size: {file_size} bytes")
|
49 |
-
|
50 |
-
if file_size < 1000: # Less than 1KB
|
51 |
-
raise ValueError(f"PDF file seems too small ({file_size} bytes). Might be corrupted or a pointer file.")
|
52 |
-
|
53 |
-
# Try all three methods
|
54 |
-
methods = [
|
55 |
-
("PyPDFLoader", self._try_pypdf_loader),
|
56 |
-
("pypdf", self._try_pypdf_direct),
|
57 |
-
("pdftotext", self._try_pdftotext)
|
58 |
-
]
|
59 |
-
|
60 |
-
last_error = None
|
61 |
-
for method_name, method in methods:
|
62 |
-
try:
|
63 |
-
print(f"\nTrying {method_name} method...")
|
64 |
-
chunks = method(pdf_path)
|
65 |
-
if chunks:
|
66 |
-
print(f"Successfully extracted {len(chunks)} chunks using {method_name}")
|
67 |
-
return chunks
|
68 |
-
except Exception as e:
|
69 |
-
print(f"Error with {method_name}: {str(e)}")
|
70 |
-
last_error = e
|
71 |
-
|
72 |
-
raise Exception(f"All PDF processing methods failed. Last error: {str(last_error)}")
|
73 |
-
|
74 |
-
def _try_pypdf_loader(self, pdf_path: str) -> List[Dict]:
|
75 |
loader = PyPDFLoader(pdf_path)
|
76 |
pages = loader.load()
|
77 |
-
print(f"Loaded {len(pages)} pages")
|
78 |
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
with open(pdf_path, 'rb') as file:
|
94 |
-
pdf = pypdf.PdfReader(file)
|
95 |
-
print(f"Opened PDF with {len(pdf.pages)} pages")
|
96 |
-
|
97 |
-
chunks = []
|
98 |
-
for page_num in range(len(pdf.pages)):
|
99 |
-
content = pdf.pages[page_num].extract_text().strip()
|
100 |
-
if content:
|
101 |
-
page_chunks = self.text_splitter.split_text(content)
|
102 |
-
for chunk in page_chunks:
|
103 |
-
if chunk.strip():
|
104 |
-
chunks.append({
|
105 |
-
'text': chunk,
|
106 |
-
'metadata': {'page': page_num + 1}
|
107 |
-
})
|
108 |
-
return chunks
|
109 |
-
|
110 |
-
def _try_pdftotext(self, pdf_path: str) -> List[Dict]:
|
111 |
-
text = self.extract_text_with_pdftotext(pdf_path)
|
112 |
-
if not text.strip():
|
113 |
-
return []
|
114 |
-
|
115 |
-
chunks = []
|
116 |
-
page_chunks = self.text_splitter.split_text(text)
|
117 |
-
for i, chunk in enumerate(page_chunks):
|
118 |
-
if chunk.strip():
|
119 |
-
chunks.append({
|
120 |
-
'text': chunk,
|
121 |
-
'metadata': {'page': 1} # Page info not available with this method
|
122 |
-
})
|
123 |
-
return chunks
|
|
|
1 |
from typing import List, Dict
|
2 |
import os
|
|
|
|
|
|
|
3 |
from langchain.document_loaders import PyPDFLoader
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
|
6 |
class PDFProcessor:
|
7 |
    def __init__(self):
        """Initialize the processor with a recursive character splitter.

        Chunks are up to 1000 characters with 200 characters of overlap,
        preferring to split on paragraph, then line, then word boundaries.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
def process_pdf(self, pdf_path: str) -> List[Dict]:
|
16 |
"""
|
17 |
Process a PDF file and return chunks of text with metadata.
|
|
|
22 |
Returns:
|
23 |
List[Dict]: List of text chunks with metadata
|
24 |
"""
|
25 |
+
# Load PDF
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
loader = PyPDFLoader(pdf_path)
|
27 |
pages = loader.load()
|
|
|
28 |
|
29 |
+
# Split text into chunks
|
30 |
+
chunks = self.text_splitter.split_documents(pages)
|
31 |
+
|
32 |
+
# Format chunks with metadata
|
33 |
+
processed_chunks = []
|
34 |
+
for chunk in chunks:
|
35 |
+
processed_chunks.append({
|
36 |
+
'text': chunk.page_content,
|
37 |
+
'metadata': {
|
38 |
+
'page': chunk.metadata.get('page', 0) + 1
|
39 |
+
}
|
40 |
+
})
|
41 |
+
|
42 |
+
return processed_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,15 +1,10 @@
|
|
1 |
streamlit==1.29.0
|
2 |
openai==1.6.1
|
3 |
python-dotenv==1.0.0
|
4 |
-
pypdf==3.17.1
|
5 |
-
PyPDF2==3.0.1
|
6 |
langchain==0.0.352
|
7 |
chromadb==0.4.18
|
8 |
pydantic==2.5.2
|
9 |
pydantic-settings==2.1.0
|
10 |
azure-storage-blob==12.19.0
|
11 |
numpy>=1.22.5
|
12 |
-
|
13 |
-
typing-inspect==0.8.0
|
14 |
-
overrides==7.3.1
|
15 |
-
tiktoken==0.5.1
|
|
|
1 |
streamlit==1.29.0
|
2 |
openai==1.6.1
|
3 |
python-dotenv==1.0.0
|
|
|
|
|
4 |
langchain==0.0.352
|
5 |
chromadb==0.4.18
|
6 |
pydantic==2.5.2
|
7 |
pydantic-settings==2.1.0
|
8 |
azure-storage-blob==12.19.0
|
9 |
numpy>=1.22.5
|
10 |
+
pypdf==3.17.1
|
|
|
|
|
|
startup.sh
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
#!/bin/sh
|
2 |
+
streamlit run app/main.py --server.port 8000 --server.address 0.0.0.0
|