Update app.py
app.py CHANGED
@@ -159,35 +159,132 @@ class GraphState(TypedDict):
     documents: List[str]
 
 def process_documents(temp_dir):
-    """Process documents from the extracted zip folder."""
+    """Process documents from the extracted zip folder with enhanced error handling."""
     d = {"chunk": [], "url": []}
 
+    # Debug information
+    print(f"Scanning directory: {temp_dir}")
+    print(f"Directory contents: {os.listdir(temp_dir)}")
+
+    file_count = 0
+    processed_count = 0
+    error_count = 0
+
     for path in os.listdir(temp_dir):
+        file_count += 1
         if os.path.isfile(os.path.join(temp_dir, path)):
-            url = "https://" + path.replace("=", "/")
-            file_path = os.path.join(temp_dir, path)
-
             try:
-                # [removed lines 171-184 not shown]
+                file_path = os.path.join(temp_dir, path)
+                print(f"Processing file: {path}")
+
+                # Try different encodings
+                encodings = ['utf-8', 'latin-1', 'cp1252']
+                content = None
+
+                for encoding in encodings:
+                    try:
+                        with open(file_path, 'r', encoding=encoding) as stream:
+                            content = stream.read()
+                        break
+                    except UnicodeDecodeError:
+                        continue
+
+                if content is None:
+                    print(f"Failed to read file {path} with any encoding")
+                    error_count += 1
+                    continue
+
+                soup = BeautifulSoup(content, "html.parser")
+
+                title = soup.find("title")
+                title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
+
+                main_content = soup.find("main")
+                text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
+
+                if not text_content.strip():
+                    print(f"No content extracted from {path}")
+                    error_count += 1
+                    continue
+
+                full_content = f"{title_text}\n\n{text_content}"
+
+                d["chunk"].append(full_content)
+                d["url"].append("https://" + path.replace("=", "/"))
+                processed_count += 1
+                print(f"Successfully processed {path}")
+
             except Exception as e:
                 print(f"Error processing file {path}: {str(e)}")
+                error_count += 1
                 continue
 
+    print(f"\nProcessing Summary:")
+    print(f"Total files found: {file_count}")
+    print(f"Successfully processed: {processed_count}")
+    print(f"Errors encountered: {error_count}")
+
+    if not d["chunk"]:
+        raise ValueError(f"No valid documents were processed. Processed {file_count} files with {error_count} errors.")
+
     return pd.DataFrame(d)
 
+def handle_upload(zip_file, csv_file):
+    """Handle file uploads and process requirements with enhanced error handling."""
+    try:
+        # Create temporary directory
+        temp_dir = tempfile.mkdtemp()
+        print(f"Created temporary directory: {temp_dir}")
+
+        try:
+            # Extract zip file
+            print(f"Extracting ZIP file: {zip_file.name}")
+            with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
+                zip_ref.extractall(temp_dir)
+                print(f"ZIP contents: {zip_ref.namelist()}")
+
+            # Preprocess and read requirements CSV
+            print("Processing CSV file...")
+            requirements_df = preprocess_csv(csv_file)
+            print(f"Found {len(requirements_df)} requirements")
+
+            # Setup RAG system
+            print("Setting up RAG system...")
+            vector_store = setup_rag_system(temp_dir)
+            rag_chain = create_workflow(vector_store)
+
+            # Process requirements
+            results = []
+            for idx, req in enumerate(requirements_df['requirement'], 1):
+                print(f"Processing requirement {idx}/{len(requirements_df)}")
+                try:
+                    response = rag_chain.invoke(req)
+                    results.append({
+                        'requirement': req,
+                        'response': response
+                    })
+                except Exception as e:
+                    error_msg = f"Error processing requirement: {str(e)}"
+                    print(error_msg)
+                    results.append({
+                        'requirement': req,
+                        'response': error_msg
+                    })
+
+            return pd.DataFrame(results)
+
+        finally:
+            # Cleanup
+            print(f"Cleaning up temporary directory: {temp_dir}")
+            shutil.rmtree(temp_dir)
+
+    except Exception as e:
+        error_msg = f"Processing error: {str(e)}"
+        print(error_msg)
+        return pd.DataFrame([{'error': error_msg}])
+
+# The rest of the code remains the same...
+
 def setup_rag_system(temp_dir):
     """Initialize the RAG system with the provided documents."""
     # Initialize embedding model
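For context, the new handle_upload function reads zip_file.name (consistent with Gradio File upload objects) and returns a pandas DataFrame, so it can be dropped straight into the Space's UI. The sketch below shows one plausible wiring; it is not part of this commit, and the component choices and labels are assumptions.

# Hypothetical Gradio wiring for handle_upload (not part of this commit).
# Component choices and labels are assumptions; handle_upload returns a
# DataFrame, which gr.Dataframe can display directly.
import gradio as gr

with gr.Blocks() as demo:
    zip_input = gr.File(label="Documentation ZIP", file_types=[".zip"])
    csv_input = gr.File(label="Requirements CSV", file_types=[".csv"])
    run_button = gr.Button("Process")
    results_table = gr.Dataframe(label="Results")

    # handle_upload(zip_file, csv_file) -> pd.DataFrame
    run_button.click(fn=handle_upload, inputs=[zip_input, csv_input], outputs=results_table)

demo.launch()

Because handle_upload already catches exceptions and returns an error DataFrame, a front end wired this way keeps rendering a result table even when processing fails.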