Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -158,25 +158,29 @@ class GraphState(TypedDict):
|
|
158 |
decision: str
|
159 |
documents: List[str]
|
160 |
|
|
|
|
|
|
|
|
|
161 |
def process_documents(temp_dir):
|
162 |
"""Process documents from the extracted zip folder with enhanced error handling."""
|
163 |
d = {"chunk": [], "url": []}
|
164 |
|
165 |
# Debug information
|
166 |
print(f"Scanning directory: {temp_dir}")
|
167 |
-
print(f"Directory contents: {os.listdir(temp_dir)}")
|
168 |
|
169 |
file_count = 0
|
170 |
processed_count = 0
|
171 |
error_count = 0
|
172 |
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
|
|
176 |
try:
|
177 |
-
file_path = os.path.join(temp_dir, path)
|
178 |
-
print(f"Processing file: {path}")
|
179 |
-
|
180 |
# Try different encodings
|
181 |
encodings = ['utf-8', 'latin-1', 'cp1252']
|
182 |
content = None
|
@@ -190,7 +194,7 @@ def process_documents(temp_dir):
|
|
190 |
continue
|
191 |
|
192 |
if content is None:
|
193 |
-
print(f"Failed to read file {
|
194 |
error_count += 1
|
195 |
continue
|
196 |
|
@@ -203,19 +207,19 @@ def process_documents(temp_dir):
|
|
203 |
text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
|
204 |
|
205 |
if not text_content.strip():
|
206 |
-
print(f"No content extracted from {
|
207 |
error_count += 1
|
208 |
continue
|
209 |
|
210 |
full_content = f"{title_text}\n\n{text_content}"
|
211 |
|
212 |
d["chunk"].append(full_content)
|
213 |
-
d["url"].append("https://" +
|
214 |
processed_count += 1
|
215 |
-
print(f"Successfully processed {
|
216 |
|
217 |
except Exception as e:
|
218 |
-
print(f"Error processing file {
|
219 |
error_count += 1
|
220 |
continue
|
221 |
|
@@ -348,6 +352,11 @@ def handle_upload(zip_file, csv_file):
|
|
348 |
zip_ref.extractall(temp_dir)
|
349 |
print(f"ZIP contents: {zip_ref.namelist()}")
|
350 |
|
|
|
|
|
|
|
|
|
|
|
351 |
# Preprocess and read requirements CSV
|
352 |
print("Processing CSV file...")
|
353 |
requirements_df = preprocess_csv(csv_file)
|
|
|
158 |
decision: str
|
159 |
documents: List[str]
|
160 |
|
161 |
+
import os
|
162 |
+
from bs4 import BeautifulSoup
|
163 |
+
import pandas as pd
|
164 |
+
|
165 |
def process_documents(temp_dir):
|
166 |
"""Process documents from the extracted zip folder with enhanced error handling."""
|
167 |
d = {"chunk": [], "url": []}
|
168 |
|
169 |
# Debug information
|
170 |
print(f"Scanning directory: {temp_dir}")
|
|
|
171 |
|
172 |
file_count = 0
|
173 |
processed_count = 0
|
174 |
error_count = 0
|
175 |
|
176 |
+
# Recursively traverse the directory
|
177 |
+
for root, dirs, files in os.walk(temp_dir):
|
178 |
+
for file_name in files:
|
179 |
+
file_count += 1
|
180 |
+
file_path = os.path.join(root, file_name)
|
181 |
+
print(f"Processing file: {file_path}")
|
182 |
+
|
183 |
try:
|
|
|
|
|
|
|
184 |
# Try different encodings
|
185 |
encodings = ['utf-8', 'latin-1', 'cp1252']
|
186 |
content = None
|
|
|
194 |
continue
|
195 |
|
196 |
if content is None:
|
197 |
+
print(f"Failed to read file {file_path} with any encoding")
|
198 |
error_count += 1
|
199 |
continue
|
200 |
|
|
|
207 |
text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
|
208 |
|
209 |
if not text_content.strip():
|
210 |
+
print(f"No content extracted from {file_path}")
|
211 |
error_count += 1
|
212 |
continue
|
213 |
|
214 |
full_content = f"{title_text}\n\n{text_content}"
|
215 |
|
216 |
d["chunk"].append(full_content)
|
217 |
+
d["url"].append("https://" + file_name.replace("=", "/"))
|
218 |
processed_count += 1
|
219 |
+
print(f"Successfully processed {file_path}")
|
220 |
|
221 |
except Exception as e:
|
222 |
+
print(f"Error processing file {file_path}: {str(e)}")
|
223 |
error_count += 1
|
224 |
continue
|
225 |
|
|
|
352 |
zip_ref.extractall(temp_dir)
|
353 |
print(f"ZIP contents: {zip_ref.namelist()}")
|
354 |
|
355 |
+
# Process documents
|
356 |
+
print("Processing documents...")
|
357 |
+
df = process_documents(temp_dir)
|
358 |
+
print(f"Processed {len(df)} documents")
|
359 |
+
|
360 |
# Preprocess and read requirements CSV
|
361 |
print("Processing CSV file...")
|
362 |
requirements_df = preprocess_csv(csv_file)
|