Codequestt committed on
Commit
eb2aa50
·
verified ·
1 Parent(s): 21ad442

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -12
app.py CHANGED
@@ -158,25 +158,29 @@ class GraphState(TypedDict):
158
  decision: str
159
  documents: List[str]
160
 
 
 
 
 
161
  def process_documents(temp_dir):
162
  """Process documents from the extracted zip folder with enhanced error handling."""
163
  d = {"chunk": [], "url": []}
164
 
165
  # Debug information
166
  print(f"Scanning directory: {temp_dir}")
167
- print(f"Directory contents: {os.listdir(temp_dir)}")
168
 
169
  file_count = 0
170
  processed_count = 0
171
  error_count = 0
172
 
173
- for path in os.listdir(temp_dir):
174
- file_count += 1
175
- if os.path.isfile(os.path.join(temp_dir, path)):
 
 
 
 
176
  try:
177
- file_path = os.path.join(temp_dir, path)
178
- print(f"Processing file: {path}")
179
-
180
  # Try different encodings
181
  encodings = ['utf-8', 'latin-1', 'cp1252']
182
  content = None
@@ -190,7 +194,7 @@ def process_documents(temp_dir):
190
  continue
191
 
192
  if content is None:
193
- print(f"Failed to read file {path} with any encoding")
194
  error_count += 1
195
  continue
196
 
@@ -203,19 +207,19 @@ def process_documents(temp_dir):
203
  text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
204
 
205
  if not text_content.strip():
206
- print(f"No content extracted from {path}")
207
  error_count += 1
208
  continue
209
 
210
  full_content = f"{title_text}\n\n{text_content}"
211
 
212
  d["chunk"].append(full_content)
213
- d["url"].append("https://" + path.replace("=", "/"))
214
  processed_count += 1
215
- print(f"Successfully processed {path}")
216
 
217
  except Exception as e:
218
- print(f"Error processing file {path}: {str(e)}")
219
  error_count += 1
220
  continue
221
 
@@ -348,6 +352,11 @@ def handle_upload(zip_file, csv_file):
348
  zip_ref.extractall(temp_dir)
349
  print(f"ZIP contents: {zip_ref.namelist()}")
350
 
 
 
 
 
 
351
  # Preprocess and read requirements CSV
352
  print("Processing CSV file...")
353
  requirements_df = preprocess_csv(csv_file)
 
158
  decision: str
159
  documents: List[str]
160
 
161
+ import os
162
+ from bs4 import BeautifulSoup
163
+ import pandas as pd
164
+
165
  def process_documents(temp_dir):
166
  """Process documents from the extracted zip folder with enhanced error handling."""
167
  d = {"chunk": [], "url": []}
168
 
169
  # Debug information
170
  print(f"Scanning directory: {temp_dir}")
 
171
 
172
  file_count = 0
173
  processed_count = 0
174
  error_count = 0
175
 
176
+ # Recursively traverse the directory
177
+ for root, dirs, files in os.walk(temp_dir):
178
+ for file_name in files:
179
+ file_count += 1
180
+ file_path = os.path.join(root, file_name)
181
+ print(f"Processing file: {file_path}")
182
+
183
  try:
 
 
 
184
  # Try different encodings
185
  encodings = ['utf-8', 'latin-1', 'cp1252']
186
  content = None
 
194
  continue
195
 
196
  if content is None:
197
+ print(f"Failed to read file {file_path} with any encoding")
198
  error_count += 1
199
  continue
200
 
 
207
  text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
208
 
209
  if not text_content.strip():
210
+ print(f"No content extracted from {file_path}")
211
  error_count += 1
212
  continue
213
 
214
  full_content = f"{title_text}\n\n{text_content}"
215
 
216
  d["chunk"].append(full_content)
217
+ d["url"].append("https://" + file_name.replace("=", "/"))
218
  processed_count += 1
219
+ print(f"Successfully processed {file_path}")
220
 
221
  except Exception as e:
222
+ print(f"Error processing file {file_path}: {str(e)}")
223
  error_count += 1
224
  continue
225
 
 
352
  zip_ref.extractall(temp_dir)
353
  print(f"ZIP contents: {zip_ref.namelist()}")
354
 
355
+ # Process documents
356
+ print("Processing documents...")
357
+ df = process_documents(temp_dir)
358
+ print(f"Processed {len(df)} documents")
359
+
360
  # Preprocess and read requirements CSV
361
  print("Processing CSV file...")
362
  requirements_df = preprocess_csv(csv_file)