tony-42069 committed on
Commit
f3dfbd4
·
1 Parent(s): e0eceb3

Simplified PDF processing and dependencies

Browse files
.deployment ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [config]
2
+ SCM_DO_BUILD_DURING_DEPLOYMENT=true
3
+ PYTHON_ENABLE_GUNICORN=false
.dockerignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ .env
4
+ __pycache__
5
+ *.pyc
6
+ vector_store/
7
+ venv/
8
+ .pytest_cache/
9
+ logs/
.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Next.js
2
+ .next/
3
+ node_modules/
4
+ out/
5
+
6
+ # Virtual environment
7
+ venv/
8
+ env/
9
+ ENV/
10
+
11
+ # Python
12
+ __pycache__/
13
+ *.py[cod]
14
+ *$py.class
15
+
16
+ # Distribution / packaging
17
+ dist/
18
+ build/
19
+ *.egg-info/
20
+
21
+ # Local development settings
22
+ .env
23
+ .env.local
24
+
25
+ # IDE
26
+ .vscode/
27
+ .idea/
28
+
29
+ # Operating System
30
+ .DS_Store
31
+ Thumbs.db
32
+
33
+ # Misc
34
+ *.pem
35
+ .vercel
Dockerfile CHANGED
@@ -1,23 +1,11 @@
1
  FROM python:3.10-slim
2
 
3
- WORKDIR /home/user/app
4
-
5
- # Install git-lfs and other dependencies
6
- RUN apt-get update && \
7
- apt-get install -y git git-lfs poppler-utils && \
8
- rm -rf /var/lib/apt/lists/* && \
9
- git lfs install
10
 
11
  # Copy requirements first for better caching
12
  COPY requirements.txt .
13
  RUN pip install -r requirements.txt
14
 
15
- # Initialize git-lfs and copy the application
16
- COPY .gitattributes .
17
- COPY Dataset/Commercial\ Lending\ 101.pdf Dataset/
18
- RUN ls -la Dataset && \
19
- stat Dataset/Commercial\ Lending\ 101.pdf
20
-
21
  # Copy the rest of the application
22
  COPY . .
23
 
@@ -26,5 +14,5 @@ ENV PORT=8501
26
 
27
  EXPOSE ${PORT}
28
 
29
- # Use the correct path to app.py and make port configurable
30
- CMD ["streamlit", "run", "app.py", "--server.port=${PORT}", "--server.address=0.0.0.0"]
 
1
  FROM python:3.10-slim
2
 
3
+ WORKDIR /app
 
 
 
 
 
 
4
 
5
  # Copy requirements first for better caching
6
  COPY requirements.txt .
7
  RUN pip install -r requirements.txt
8
 
 
 
 
 
 
 
9
  # Copy the rest of the application
10
  COPY . .
11
 
 
14
 
15
  EXPOSE ${PORT}
16
 
17
+ # Use the correct path to app.py
18
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
app/config.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Configuration management for the CRE Chatbot application.
"""
import os
from dotenv import load_dotenv

# Load environment variables from a local .env file, if present.
load_dotenv()

# Azure OpenAI Configuration
# NOTE: the API key is read from the AZURE_OPENAI_KEY environment variable,
# even though the module-level constant is named AZURE_OPENAI_API_KEY.
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_KEY')
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME')
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME')

# Application Configuration
MAX_CHUNK_SIZE = 1000
OVERLAP_SIZE = 200
TEMPERATURE = 0.7
MAX_TOKENS = 500

# Logging Configuration
LOG_LEVEL = "INFO"
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
LOG_FILE = "logs/app.log"

# Vector Store Configuration
VECTOR_STORE_PATH = "vector_store"

def validate_config():
    """Validate that all required configuration variables are set.

    Raises:
        ValueError: listing every required environment variable that is unset.
    """
    # BUG FIX: the old list checked 'AZURE_OPENAI_API_KEY', an environment
    # variable this module never reads -- the key is loaded from
    # 'AZURE_OPENAI_KEY' above -- so validation failed even when the key
    # was correctly configured. Check the names that are actually read.
    required_vars = [
        'AZURE_OPENAI_ENDPOINT',
        'AZURE_OPENAI_KEY',
        'AZURE_OPENAI_DEPLOYMENT_NAME',
        'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
    ]

    missing_vars = [var for var in required_vars if not os.getenv(var)]

    if missing_vars:
        raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")

# Validate at import time so misconfiguration fails fast.
validate_config()
app/logging.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Logging configuration for the CRE Chatbot application.
"""
import logging
import os
from logging.handlers import RotatingFileHandler
from .config import LOG_LEVEL, LOG_FORMAT, LOG_FILE

# Rotate log files once they reach 10 MB.
_MAX_BYTES = 10485760

def setup_logging():
    """Set up logging configuration for the application.

    Safe to call more than once (e.g. on Streamlit script reruns):
    handlers are attached only the first time, so log lines are not
    duplicated by repeated initialization.

    Returns:
        dict: component name -> logging.Logger for 'api', 'pdf', 'rag', 'app'.
    """
    # Create logs directory if it doesn't exist
    os.makedirs('logs', exist_ok=True)

    # Set up root logger
    logger = logging.getLogger()
    logger.setLevel(LOG_LEVEL)

    # BUG FIX: previously every call appended a fresh console handler and
    # file handler to the root logger, so each re-initialization multiplied
    # every emitted log line. Attach handlers only when none exist yet.
    if not logger.handlers:
        formatter = logging.Formatter(LOG_FORMAT)

        # Console Handler
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

        # File Handler
        file_handler = RotatingFileHandler(
            LOG_FILE,
            maxBytes=_MAX_BYTES,
            backupCount=5
        )
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Component loggers; these also propagate to the root handlers in
    # addition to writing their own logs/<name>.log file.
    loggers = {
        name: setup_component_logger(name)
        for name in ('api', 'pdf', 'rag', 'app')
    }

    return loggers

def setup_component_logger(name):
    """Set up (or return the already-configured) logger for one component."""
    logger = logging.getLogger(name)
    logger.setLevel(LOG_LEVEL)

    # Idempotent: skip handler creation if this logger was configured before,
    # otherwise repeated setup duplicates every line in logs/<name>.log.
    if not logger.handlers:
        handler = RotatingFileHandler(
            f'logs/{name}.log',
            maxBytes=_MAX_BYTES,
            backupCount=3
        )
        handler.setFormatter(logging.Formatter(LOG_FORMAT))
        logger.addHandler(handler)

    return logger
app/main.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Main Streamlit application for the CRE Chatbot.
"""
import logging
import streamlit as st
from io import BytesIO
import sys
import os

# Add the project root to Python path so `app.*` and `src.*` imports resolve
# when this file is launched directly via `streamlit run`.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.config import validate_config, AZURE_OPENAI_DEPLOYMENT_NAME
from app.logging import setup_logging
from src.pdf_processor import PDFProcessor
from src.rag_engine import RAGEngine

# Setup logging (runs at import time; returns the component logger map)
loggers = setup_logging()
logger = logging.getLogger('app')

# Page configuration
st.set_page_config(
    page_title="CRE Knowledge Assistant",
    page_icon="🏢",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for the page background and chat bubbles
st.markdown("""
<style>
    .main {
        background-color: #f5f5f5;
    }
    .stApp {
        max-width: 1200px;
        margin: 0 auto;
    }
    .chat-message {
        padding: 1.5rem;
        border-radius: 0.5rem;
        margin-bottom: 1rem;
        display: flex;
        flex-direction: column;
    }
    .chat-message.user {
        background-color: #e3f2fd;
    }
    .chat-message.assistant {
        background-color: #f3e5f5;
    }
    .chat-message .message {
        margin-top: 0.5rem;
    }
</style>
""", unsafe_allow_html=True)

# Initialize session state on first run (Streamlit re-executes this script
# on every interaction; these guards make the state survive reruns).
if 'rag_engine' not in st.session_state:
    st.session_state.rag_engine = None
if 'pdf_processor' not in st.session_state:
    st.session_state.pdf_processor = PDFProcessor()
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'uploaded_pdfs' not in st.session_state:
    st.session_state.uploaded_pdfs = set()
69
def initialize_rag_engine(deployment_name: str):
    """Create the RAG engine in session state, surfacing failures in the UI."""
    try:
        engine = RAGEngine(deployment_name)
    except Exception as e:
        logger.error(f"Error initializing the application: {str(e)}")
        st.error(f"Error initializing the application: {str(e)}")
    else:
        st.session_state.rag_engine = engine
        logger.info("RAG Engine initialized successfully")
77
+
78
def process_pdf(pdf_file):
    """Process an uploaded PDF and add its chunks to the vector store.

    Args:
        pdf_file: Streamlit UploadedFile exposing `.name` and `.read()`.
    """
    try:
        # Check if PDF was already processed
        if pdf_file.name in st.session_state.uploaded_pdfs:
            st.warning(f"'{pdf_file.name}' has already been processed!")
            return

        with st.spinner(f"Processing {pdf_file.name}..."):
            # Read PDF content
            pdf_content = pdf_file.read()

            # Process PDF and get chunks
            chunks = st.session_state.pdf_processor.process_pdf(
                BytesIO(pdf_content)
            )

            # BUG FIX: PDFProcessor.process_pdf returns dicts of the form
            # {'text': ..., 'metadata': {...}}; the old code indexed them as
            # tuples (chunk[0] / chunk[1]), which raises KeyError on a dict.
            texts = [chunk['text'] for chunk in chunks]
            metadata = [{"source": pdf_file.name, **chunk['metadata']} for chunk in chunks]
            st.session_state.rag_engine.add_documents(texts, metadata)

            # Mark PDF as processed
            st.session_state.uploaded_pdfs.add(pdf_file.name)

            st.success(f"Successfully processed '{pdf_file.name}'!")
            logger.info(f"PDF '{pdf_file.name}' processed and added to vector store")

    except Exception as e:
        logger.error(f"Error processing PDF: {str(e)}")
        st.error(f"Error processing PDF: {str(e)}")
109
+
110
def display_chat_message(role: str, content: str):
    """Render one chat bubble for either the user or the assistant."""
    speaker = 'You' if role == 'user' else 'Assistant'
    bubble = f"""
    <div class="chat-message {role}">
        <div class="role"><strong>{speaker}:</strong></div>
        <div class="message">{content}</div>
    </div>
    """
    with st.container():
        st.markdown(bubble, unsafe_allow_html=True)
119
+
120
def main():
    """Render the full application: header, sidebar, and chat interface."""
    # Page header
    col1, col2 = st.columns([2, 1])
    with col1:
        st.title("🏢 CRE Knowledge Assistant")
        st.markdown("*Your AI guide for commercial real estate concepts*")

    # Sidebar: knowledge-base management and model settings.
    with st.sidebar:
        st.header("📚 Knowledge Base")
        st.markdown("Upload your CRE documents to enhance the assistant's knowledge.")

        # Collapsible model configuration
        with st.expander("⚙️ Model Configuration"):
            deployment_name = st.text_input(
                "Model Deployment Name",
                value=AZURE_OPENAI_DEPLOYMENT_NAME,
                help="Enter your Azure OpenAI model deployment name"
            )

        # Lazily create the RAG engine the first time through.
        if not st.session_state.rag_engine:
            initialize_rag_engine(deployment_name)

        # Document upload
        st.subheader("📄 Upload Documents")
        uploaded_files = st.file_uploader(
            "Choose PDF files",
            type="pdf",
            accept_multiple_files=True,
            help="Upload one or more PDF files to add to the knowledge base"
        )
        if uploaded_files:
            for pdf_file in uploaded_files:
                process_pdf(pdf_file)

        # List everything already ingested.
        if st.session_state.uploaded_pdfs:
            st.subheader("📚 Processed Documents")
            for pdf_name in st.session_state.uploaded_pdfs:
                st.markdown(f"✓ {pdf_name}")

    # Main chat interface (only once the engine exists).
    if st.session_state.rag_engine:
        # Replay the conversation so far.
        for message in st.session_state.chat_history:
            display_chat_message(
                role=message["role"],
                content=message["content"]
            )

        user_question = st.text_input(
            "Ask a question about commercial real estate:",
            placeholder="e.g., What is LTV? How is DSCR calculated?",
            key="user_question"
        )

        if user_question:
            try:
                # Record the question, answer it, record the answer.
                st.session_state.chat_history.append({
                    "role": "user",
                    "content": user_question
                })

                with st.spinner("Generating answer..."):
                    response = st.session_state.rag_engine.query(user_question)

                    st.session_state.chat_history.append({
                        "role": "assistant",
                        "content": response["answer"]
                    })

                    # Show the newest exchange without waiting for a rerun.
                    display_chat_message("user", user_question)
                    display_chat_message("assistant", response["answer"])

            except Exception as e:
                logger.error(f"Error generating answer: {str(e)}")
                st.error(f"Error generating answer: {str(e)}")

    else:
        st.info("👆 Please upload PDF documents in the sidebar to start asking questions!")

if __name__ == "__main__":
    main()
frontend/main.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Minimal Streamlit frontend that talks to the backend over HTTP."""
import streamlit as st
import requests
import sys
import os

# Add the project root to Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.config import validate_config
from app.logging import setup_logging

# BUG FIX: `requests` requires absolute URLs; posting to "api/process_pdf"
# raises requests.exceptions.MissingSchema. The backend base URL is taken
# from the API_BASE_URL environment variable, defaulting to the local
# development server.
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000").rstrip("/")

def main():
    """Render the upload + query UI and proxy both to the backend API."""
    # Setup logging
    setup_logging()

    st.set_page_config(
        page_title="CRE Knowledge Assistant",
        page_icon="🤖",
        layout="wide"
    )

    st.title("CRE Knowledge Assistant")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

    if uploaded_file:
        # Convert file to bytes
        file_bytes = uploaded_file.getvalue()

        # Send to API endpoint
        response = requests.post(
            f"{API_BASE_URL}/api/process_pdf",
            files={"file": (uploaded_file.name, file_bytes, "application/pdf")}
        )

        if response.status_code == 200:
            st.success("PDF processed successfully!")
        else:
            st.error("Error processing PDF")

    # Query input
    query = st.text_input("Ask a question about your documents:")

    if query:
        # Send query to API endpoint
        response = requests.post(
            f"{API_BASE_URL}/api/query",
            json={"query": query}
        )

        if response.status_code == 200:
            result = response.json()
            st.write("Answer:", result["answer"])
        else:
            st.error("Error processing query")

if __name__ == "__main__":
    main()
frontend/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ streamlit==1.29.0
2
+ requests==2.31.0
3
+ python-dotenv==1.0.0
index.html ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
<!DOCTYPE html>
<html>
<head>
    <title>CRE Knowledge Assistant</title>
    <style>
        /* Shell page fills the viewport; the embedded app owns scrolling. */
        body, html {
            margin: 0;
            padding: 0;
            height: 100%;
            overflow: hidden;
        }
        iframe {
            width: 100%;
            height: 100vh;
            border: none;
        }
    </style>
</head>
<body>
    <!-- The Streamlit app served under /api, embedded full-screen. -->
    <iframe src="/api" allow="camera;microphone"></iframe>
    <script>
        // Handle any messages from the Streamlit app
        window.addEventListener('message', function (e) {
            if (e.data.type === 'streamlit') {
                console.log('Received message from Streamlit:', e.data);
            }
        });
    </script>
</body>
</html>
pdf_processor.py CHANGED
@@ -1,34 +1,17 @@
1
  from typing import List, Dict
2
  import os
3
- import subprocess
4
- import tempfile
5
- import pypdf
6
  from langchain.document_loaders import PyPDFLoader
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
 
9
  class PDFProcessor:
10
  def __init__(self):
11
  self.text_splitter = RecursiveCharacterTextSplitter(
12
- chunk_size=500,
13
- chunk_overlap=50,
14
  length_function=len,
15
- separators=["\n\n", "\n", ".", " ", ""]
16
  )
17
 
18
- def extract_text_with_pdftotext(self, pdf_path: str) -> str:
19
- """Use pdftotext (from poppler-utils) to extract text."""
20
- try:
21
- result = subprocess.run(
22
- ['pdftotext', pdf_path, '-'],
23
- capture_output=True,
24
- text=True,
25
- check=True
26
- )
27
- return result.stdout
28
- except Exception as e:
29
- print(f"pdftotext extraction failed: {str(e)}")
30
- return ""
31
-
32
  def process_pdf(self, pdf_path: str) -> List[Dict]:
33
  """
34
  Process a PDF file and return chunks of text with metadata.
@@ -39,85 +22,21 @@ class PDFProcessor:
39
  Returns:
40
  List[Dict]: List of text chunks with metadata
41
  """
42
- print(f"Processing PDF at: {os.path.abspath(pdf_path)}")
43
-
44
- if not os.path.exists(pdf_path):
45
- raise FileNotFoundError(f"PDF file not found at {pdf_path}")
46
-
47
- file_size = os.path.getsize(pdf_path)
48
- print(f"PDF file exists, size: {file_size} bytes")
49
-
50
- if file_size < 1000: # Less than 1KB
51
- raise ValueError(f"PDF file seems too small ({file_size} bytes). Might be corrupted or a pointer file.")
52
-
53
- # Try all three methods
54
- methods = [
55
- ("PyPDFLoader", self._try_pypdf_loader),
56
- ("pypdf", self._try_pypdf_direct),
57
- ("pdftotext", self._try_pdftotext)
58
- ]
59
-
60
- last_error = None
61
- for method_name, method in methods:
62
- try:
63
- print(f"\nTrying {method_name} method...")
64
- chunks = method(pdf_path)
65
- if chunks:
66
- print(f"Successfully extracted {len(chunks)} chunks using {method_name}")
67
- return chunks
68
- except Exception as e:
69
- print(f"Error with {method_name}: {str(e)}")
70
- last_error = e
71
-
72
- raise Exception(f"All PDF processing methods failed. Last error: {str(last_error)}")
73
-
74
- def _try_pypdf_loader(self, pdf_path: str) -> List[Dict]:
75
  loader = PyPDFLoader(pdf_path)
76
  pages = loader.load()
77
- print(f"Loaded {len(pages)} pages")
78
 
79
- chunks = []
80
- for page in pages:
81
- content = page.page_content.strip()
82
- if content:
83
- page_chunks = self.text_splitter.split_text(content)
84
- for chunk in page_chunks:
85
- if chunk.strip():
86
- chunks.append({
87
- 'text': chunk,
88
- 'metadata': {'page': page.metadata['page']}
89
- })
90
- return chunks
91
-
92
- def _try_pypdf_direct(self, pdf_path: str) -> List[Dict]:
93
- with open(pdf_path, 'rb') as file:
94
- pdf = pypdf.PdfReader(file)
95
- print(f"Opened PDF with {len(pdf.pages)} pages")
96
-
97
- chunks = []
98
- for page_num in range(len(pdf.pages)):
99
- content = pdf.pages[page_num].extract_text().strip()
100
- if content:
101
- page_chunks = self.text_splitter.split_text(content)
102
- for chunk in page_chunks:
103
- if chunk.strip():
104
- chunks.append({
105
- 'text': chunk,
106
- 'metadata': {'page': page_num + 1}
107
- })
108
- return chunks
109
-
110
- def _try_pdftotext(self, pdf_path: str) -> List[Dict]:
111
- text = self.extract_text_with_pdftotext(pdf_path)
112
- if not text.strip():
113
- return []
114
-
115
- chunks = []
116
- page_chunks = self.text_splitter.split_text(text)
117
- for i, chunk in enumerate(page_chunks):
118
- if chunk.strip():
119
- chunks.append({
120
- 'text': chunk,
121
- 'metadata': {'page': 1} # Page info not available with this method
122
- })
123
- return chunks
 
from typing import List, Dict
import os
import tempfile
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

class PDFProcessor:
    """Split PDF documents into overlapping text chunks for retrieval."""

    def __init__(self):
        # Chunk sizes mirror MAX_CHUNK_SIZE / OVERLAP_SIZE in app/config.py.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def process_pdf(self, pdf_path) -> List[Dict]:
        """
        Process a PDF file and return chunks of text with metadata.

        Args:
            pdf_path: filesystem path to a PDF file, or a binary file-like
                object (e.g. io.BytesIO) containing PDF data.

        Returns:
            List[Dict]: chunks of the form
                {'text': str, 'metadata': {'page': int}} with 1-based pages.
        """
        # ROBUSTNESS FIX: callers in app/main.py pass a BytesIO, but
        # PyPDFLoader only accepts a filesystem path, so file-like input
        # previously failed. Spool file-like input to a temporary file.
        tmp_path = None
        try:
            if hasattr(pdf_path, 'read'):
                with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
                    tmp.write(pdf_path.read())
                    tmp_path = tmp.name
                pdf_path = tmp_path

            # Load PDF
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()

            # Split text into chunks
            chunks = self.text_splitter.split_documents(pages)

            # Format chunks with metadata (PyPDF page numbers are 0-based).
            processed_chunks = []
            for chunk in chunks:
                processed_chunks.append({
                    'text': chunk.page_content,
                    'metadata': {
                        'page': chunk.metadata.get('page', 0) + 1
                    }
                })

            return processed_chunks
        finally:
            # Remove the temporary spool file, if one was created.
            if tmp_path is not None and os.path.exists(tmp_path):
                os.remove(tmp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,15 +1,10 @@
1
  streamlit==1.29.0
2
  openai==1.6.1
3
  python-dotenv==1.0.0
4
- pypdf==3.17.1
5
- PyPDF2==3.0.1
6
  langchain==0.0.352
7
  chromadb==0.4.18
8
  pydantic==2.5.2
9
  pydantic-settings==2.1.0
10
  azure-storage-blob==12.19.0
11
  numpy>=1.22.5
12
- duckdb==0.9.2
13
- typing-inspect==0.8.0
14
- overrides==7.3.1
15
- tiktoken==0.5.1
 
1
  streamlit==1.29.0
2
  openai==1.6.1
3
  python-dotenv==1.0.0
 
 
4
  langchain==0.0.352
5
  chromadb==0.4.18
6
  pydantic==2.5.2
7
  pydantic-settings==2.1.0
8
  azure-storage-blob==12.19.0
9
  numpy>=1.22.5
10
+ pypdf==3.17.1
 
 
 
startup.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
#!/bin/sh
# Azure App Service startup script: launch the Streamlit app on port 8000.
# `exec` replaces the shell so Streamlit receives shutdown signals directly.
exec streamlit run app/main.py --server.port 8000 --server.address 0.0.0.0