Codequestt committed on
Commit
bae2003
·
verified ·
1 Parent(s): ec80195

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -18
app.py CHANGED
@@ -159,35 +159,132 @@ class GraphState(TypedDict):
159
  documents: List[str]
160
 
161
def process_documents(temp_dir):
    """Process documents from the extracted zip folder.

    Each file in ``temp_dir`` is treated as a saved HTML page whose filename
    encodes its source URL ("=" stands in for "/").  The page title and main
    content are extracted with BeautifulSoup.

    Returns:
        pd.DataFrame: columns ``chunk`` (extracted text) and ``url``.
    """
    d = {"chunk": [], "url": []}

    for path in os.listdir(temp_dir):
        if os.path.isfile(os.path.join(temp_dir, path)):
            # Filenames encode URLs with "=" in place of "/".
            url = "https://" + path.replace("=", "/")
            file_path = os.path.join(temp_dir, path)

            try:
                with open(file_path, 'r', encoding='utf-8') as stream:
                    content = stream.read()
                soup = BeautifulSoup(content, "html.parser")

                title = soup.find("title")
                # BUGFIX: title.string is None when <title> contains nested
                # markup; without the extra guard the AttributeError falls
                # into the except below and the whole file is discarded.
                title_text = title.string.replace(" | Dataiku", "") if title and title.string else "No Title"

                # Prefer the <main> element; fall back to the whole page.
                main_content = soup.find("main")
                text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)

                full_content = f"{title_text}\n\n{text_content}"

                d["chunk"].append(full_content)
                d["url"].append(url)
            except Exception as e:
                # Best-effort: log and keep going with the remaining files.
                print(f"Error processing file {path}: {str(e)}")
                continue

    return pd.DataFrame(d)
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  def setup_rag_system(temp_dir):
192
  """Initialize the RAG system with the provided documents."""
193
  # Initialize embedding model
 
159
  documents: List[str]
160
 
161
def process_documents(temp_dir):
    """Process documents from the extracted zip folder with enhanced error handling.

    Each file in ``temp_dir`` is treated as a saved HTML page whose filename
    encodes its source URL ("=" stands in for "/").  Files are read with a
    small list of fallback encodings; the page title and main content are
    extracted with BeautifulSoup.

    Returns:
        pd.DataFrame: columns ``chunk`` (extracted text) and ``url``.

    Raises:
        ValueError: if no file yields any usable content.
    """
    d = {"chunk": [], "url": []}

    # Debug information
    print(f"Scanning directory: {temp_dir}")
    print(f"Directory contents: {os.listdir(temp_dir)}")

    file_count = 0
    processed_count = 0
    error_count = 0

    for path in os.listdir(temp_dir):
        file_count += 1
        if os.path.isfile(os.path.join(temp_dir, path)):
            try:
                file_path = os.path.join(temp_dir, path)
                print(f"Processing file: {path}")

                # Try different encodings; stop at the first that decodes.
                encodings = ['utf-8', 'latin-1', 'cp1252']
                content = None

                for encoding in encodings:
                    try:
                        with open(file_path, 'r', encoding=encoding) as stream:
                            content = stream.read()
                        break
                    except UnicodeDecodeError:
                        continue

                if content is None:
                    print(f"Failed to read file {path} with any encoding")
                    error_count += 1
                    continue

                soup = BeautifulSoup(content, "html.parser")

                title = soup.find("title")
                # BUGFIX: title.string is None when <title> contains nested
                # markup; without the extra guard the AttributeError falls
                # into the except below and the whole file is discarded.
                title_text = title.string.replace(" | Dataiku", "") if title and title.string else "No Title"

                # Prefer the <main> element; fall back to the whole page.
                main_content = soup.find("main")
                text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)

                if not text_content.strip():
                    print(f"No content extracted from {path}")
                    error_count += 1
                    continue

                full_content = f"{title_text}\n\n{text_content}"

                d["chunk"].append(full_content)
                # Filenames encode URLs with "=" in place of "/".
                d["url"].append("https://" + path.replace("=", "/"))
                processed_count += 1
                print(f"Successfully processed {path}")

            except Exception as e:
                # Best-effort: log, count, and keep going with other files.
                print(f"Error processing file {path}: {str(e)}")
                error_count += 1
                continue

    print(f"\nProcessing Summary:")
    print(f"Total files found: {file_count}")
    print(f"Successfully processed: {processed_count}")
    print(f"Errors encountered: {error_count}")

    if not d["chunk"]:
        raise ValueError(f"No valid documents were processed. Processed {file_count} files with {error_count} errors.")

    return pd.DataFrame(d)
231
 
232
def handle_upload(zip_file, csv_file):
    """Handle file uploads and process requirements with enhanced error handling.

    Extracts the uploaded ZIP of documentation pages, reads the requirements
    CSV, builds the RAG pipeline, and answers every requirement in turn.

    Returns:
        pd.DataFrame: requirement/response pairs on success, or a single-row
        frame with an ``error`` column if processing fails outright.
    """
    try:
        # Scratch area for the extracted archive; removed in the finally below.
        temp_dir = tempfile.mkdtemp()
        print(f"Created temporary directory: {temp_dir}")

        try:
            print(f"Extracting ZIP file: {zip_file.name}")
            with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
                print(f"ZIP contents: {zip_ref.namelist()}")

            print("Processing CSV file...")
            requirements_df = preprocess_csv(csv_file)
            print(f"Found {len(requirements_df)} requirements")

            print("Setting up RAG system...")
            rag_chain = create_workflow(setup_rag_system(temp_dir))

            rows = []
            total = len(requirements_df)
            for i, requirement in enumerate(requirements_df['requirement'], 1):
                print(f"Processing requirement {i}/{total}")
                try:
                    answer = rag_chain.invoke(requirement)
                except Exception as e:
                    # A single failed requirement becomes its own response
                    # row rather than aborting the whole batch.
                    answer = f"Error processing requirement: {str(e)}"
                    print(answer)
                rows.append({
                    'requirement': requirement,
                    'response': answer
                })

            return pd.DataFrame(rows)

        finally:
            # Cleanup runs whether processing succeeded or raised.
            print(f"Cleaning up temporary directory: {temp_dir}")
            shutil.rmtree(temp_dir)

    except Exception as e:
        error_msg = f"Processing error: {str(e)}"
        print(error_msg)
        return pd.DataFrame([{'error': error_msg}])
285
+
286
+ # The rest of the code remains the same...
287
+
288
  def setup_rag_system(temp_dir):
289
  """Initialize the RAG system with the provided documents."""
290
  # Initialize embedding model