Update app.py
app.py CHANGED
@@ -159,35 +159,132 @@ class GraphState(TypedDict):
     documents: List[str]
 
 def process_documents(temp_dir):
-    """Process documents from the extracted zip folder."""
+    """Process documents from the extracted zip folder with enhanced error handling."""
     d = {"chunk": [], "url": []}
 
+    # Debug information
+    print(f"Scanning directory: {temp_dir}")
+    print(f"Directory contents: {os.listdir(temp_dir)}")
+
+    file_count = 0
+    processed_count = 0
+    error_count = 0
+
     for path in os.listdir(temp_dir):
+        file_count += 1
         if os.path.isfile(os.path.join(temp_dir, path)):
-            url = "https://" + path.replace("=", "/")
-            file_path = os.path.join(temp_dir, path)
-
             try:
-                # [removed lines 171-184 not shown]
+                file_path = os.path.join(temp_dir, path)
+                print(f"Processing file: {path}")
+
+                # Try different encodings
+                encodings = ['utf-8', 'latin-1', 'cp1252']
+                content = None
+
+                for encoding in encodings:
+                    try:
+                        with open(file_path, 'r', encoding=encoding) as stream:
+                            content = stream.read()
+                        break
+                    except UnicodeDecodeError:
+                        continue
+
+                if content is None:
+                    print(f"Failed to read file {path} with any encoding")
+                    error_count += 1
+                    continue
+
+                soup = BeautifulSoup(content, "html.parser")
+
+                title = soup.find("title")
+                title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
+
+                main_content = soup.find("main")
+                text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
+
+                if not text_content.strip():
+                    print(f"No content extracted from {path}")
+                    error_count += 1
+                    continue
+
+                full_content = f"{title_text}\n\n{text_content}"
+
+                d["chunk"].append(full_content)
+                d["url"].append("https://" + path.replace("=", "/"))
+                processed_count += 1
+                print(f"Successfully processed {path}")
+
             except Exception as e:
                 print(f"Error processing file {path}: {str(e)}")
+                error_count += 1
                 continue
 
+    print(f"\nProcessing Summary:")
+    print(f"Total files found: {file_count}")
+    print(f"Successfully processed: {processed_count}")
+    print(f"Errors encountered: {error_count}")
+
+    if not d["chunk"]:
+        raise ValueError(f"No valid documents were processed. Processed {file_count} files with {error_count} errors.")
+
     return pd.DataFrame(d)
 
+def handle_upload(zip_file, csv_file):
+    """Handle file uploads and process requirements with enhanced error handling."""
+    try:
+        # Create temporary directory
+        temp_dir = tempfile.mkdtemp()
+        print(f"Created temporary directory: {temp_dir}")
+
+        try:
+            # Extract zip file
+            print(f"Extracting ZIP file: {zip_file.name}")
+            with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
+                zip_ref.extractall(temp_dir)
+                print(f"ZIP contents: {zip_ref.namelist()}")
+
+            # Preprocess and read requirements CSV
+            print("Processing CSV file...")
+            requirements_df = preprocess_csv(csv_file)
+            print(f"Found {len(requirements_df)} requirements")
+
+            # Setup RAG system
+            print("Setting up RAG system...")
+            vector_store = setup_rag_system(temp_dir)
+            rag_chain = create_workflow(vector_store)
+
+            # Process requirements
+            results = []
+            for idx, req in enumerate(requirements_df['requirement'], 1):
+                print(f"Processing requirement {idx}/{len(requirements_df)}")
+                try:
+                    response = rag_chain.invoke(req)
+                    results.append({
+                        'requirement': req,
+                        'response': response
+                    })
+                except Exception as e:
+                    error_msg = f"Error processing requirement: {str(e)}"
+                    print(error_msg)
+                    results.append({
+                        'requirement': req,
+                        'response': error_msg
+                    })
+
+            return pd.DataFrame(results)
+
+        finally:
+            # Cleanup
+            print(f"Cleaning up temporary directory: {temp_dir}")
+            shutil.rmtree(temp_dir)
+
+    except Exception as e:
+        error_msg = f"Processing error: {str(e)}"
+        print(error_msg)
+        return pd.DataFrame([{'error': error_msg}])
+
+# The rest of the code remains the same...
+
 def setup_rag_system(temp_dir):
     """Initialize the RAG system with the provided documents."""
     # Initialize embedding model
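For context, the new handle_upload function reads zip_file.name (consistent with Gradio File upload objects) and returns a pandas DataFrame, so it can be dropped straight into the Space's UI. The sketch below shows one plausible wiring; it is not part of this commit, and the component choices and labels are assumptions.

# Hypothetical Gradio wiring for handle_upload (not part of this commit).
# Component choices and labels are assumptions; handle_upload returns a
# DataFrame, which gr.Dataframe can display directly.
import gradio as gr

with gr.Blocks() as demo:
    zip_input = gr.File(label="Documentation ZIP", file_types=[".zip"])
    csv_input = gr.File(label="Requirements CSV", file_types=[".csv"])
    run_button = gr.Button("Process")
    results_table = gr.Dataframe(label="Results")

    # handle_upload(zip_file, csv_file) -> pd.DataFrame
    run_button.click(fn=handle_upload, inputs=[zip_input, csv_input], outputs=results_table)

demo.launch()

Because handle_upload already catches exceptions and returns an error DataFrame, a front end wired this way keeps rendering a result table even when processing fails.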