Spaces:

Codequestt
/

ReqChek

Sleeping

App Files Files Community

Codequestt committed on Feb 11

Commit

33cccc8

verified ·

1 Parent(s): b406149

Update app.py

Browse files

Files changed (1) hide show

app.py +270 -84

app.py CHANGED Viewed

@@ -1,121 +1,307 @@
 import gradio as gr
 import pandas as pd
 import os
-import io
 import zipfile
 import shutil
 from bs4 import BeautifulSoup
 from typing import List, TypedDict
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain_core.documents import Document
-from langchain_core.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_nvidia_ai_endpoints import ChatNVIDIA
 from langchain_community.tools.tavily_search import TavilySearchResults
 from langgraph.graph import END, StateGraph, START
 import chromadb
-# ... (Keep all necessary imports from section 1 here)
-def process_documents(folder_path):
-    """Process documents from the uploaded folder."""
     d = {"chunk": [], "url": []}
-    for path in os.listdir(folder_path):
-        if not path.endswith(".html"):  # Skip non-HTML files
-            continue
-        url = "https://" + path.replace("=", "/")
-        file_path = os.path.join(folder_path, path)
-        with open(file_path, 'rb') as stream:
-            content = stream.read().decode("utf-8")
-            soup = BeautifulSoup(content, "html.parser")
-            title = soup.find("title")
-            title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
-            main_content = soup.find("main")
-            text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
-            full_content = f"{title_text}\n\n{text_content}"
-            d["chunk"].append(full_content)
-            d["url"].append(url)
     return pd.DataFrame(d)
-def setup_rag_system(folder_path):
     """Initialize the RAG system with the provided documents."""
-    # ... (Keep your existing setup_rag_system implementation here)
     return vector_store
 def create_workflow(vector_store):
     """Create the RAG workflow."""
-    # ... (Keep your existing workflow creation code here)
-    return workflow.compile()
-def handle_upload(folder_files, csv_file):
     try:
         # Create temporary directory
-        temp_dir = "temp_upload"
-        os.makedirs(temp_dir, exist_ok=True)
-        # Process document files
-        doc_dir = os.path.join(temp_dir, "docs")
-        os.makedirs(doc_dir, exist_ok=True)
-        # Handle zip file or individual files
-        for file in folder_files:
-            if file.name.endswith('.zip'):
-                with zipfile.ZipFile(io.BytesIO(file.read())) as zip_ref:
-                    zip_ref.extractall(doc_dir)
-            else:
-                with open(os.path.join(doc_dir, file.name), "wb") as f:
-                    f.write(file.read())
-        # Process CSV requirements
-        csv_content = csv_file.read()
-        requirements_df = pd.read_csv(io.BytesIO(csv_content), encoding='latin-1')
-        requirements = requirements_df.iloc[:, 0].tolist()  # Get first column
-        # Setup RAG system
-        vector_store = setup_rag_system(doc_dir)
-        app = create_workflow(vector_store)
-        # Process requirements
-        results = []
-        for question in requirements:
-            inputs = {"question": question}
-            output = app.invoke(inputs)
-            results.append({
-                "Requirement": question,
-                "Response": output.get("generation", "No response generated")
-            })
-        # Cleanup
-        shutil.rmtree(temp_dir)
-        return pd.DataFrame(results)
     except Exception as e:
-        return pd.DataFrame({"Error": [str(e)]})
-def create_gradio_interface():
-    iface = gr.Interface(
-        fn=handle_upload,
-        inputs=[
-            gr.File(file_count="multiple", label="Upload Documents (ZIP or HTML files)"),
-            gr.File(label="Upload Requirements CSV", type="binary")
-        ],
-        outputs=gr.Dataframe(),
-        title="RAG System for RFP Analysis",
-        description="Upload documents (ZIP or HTML files) and a CSV file with requirements."
-    )
-    return iface
 if __name__ == "__main__":
-    iface = create_gradio_interface()
     iface.launch()

+# import gradio as gr
+# import pandas as pd
+# import os
+# import io
+# import zipfile
+# import shutil
+# from bs4 import BeautifulSoup
+# from typing import List, TypedDict
+# from langchain_huggingface import HuggingFaceEmbeddings
+# from langchain_community.vectorstores import Chroma
+# from langchain_core.documents import Document
+# from langchain_core.prompts import PromptTemplate
+# from langchain_core.output_parsers import StrOutputParser
+# from langchain_core.runnables import RunnablePassthrough
+# from langchain_nvidia_ai_endpoints import ChatNVIDIA
+# from langchain_community.tools.tavily_search import TavilySearchResults
+# from langgraph.graph import END, StateGraph, START
+# import chromadb
+# # ... (Keep all necessary imports from section 1 here)
+# def process_documents(folder_path):
+#     """Process documents from the uploaded folder."""
+#     d = {"chunk": [], "url": []}
+#     for path in os.listdir(folder_path):
+#         if not path.endswith(".html"):  # Skip non-HTML files
+#             continue
+#         url = "https://" + path.replace("=", "/")
+#         file_path = os.path.join(folder_path, path)
+#         with open(file_path, 'rb') as stream:
+#             content = stream.read().decode("utf-8")
+#             soup = BeautifulSoup(content, "html.parser")
+#             title = soup.find("title")
+#             title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
+#             main_content = soup.find("main")
+#             text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
+#             full_content = f"{title_text}\n\n{text_content}"
+#             d["chunk"].append(full_content)
+#             d["url"].append(url)
+#     return pd.DataFrame(d)
+# def setup_rag_system(folder_path):
+#     """Initialize the RAG system with the provided documents."""
+#     # ... (Keep your existing setup_rag_system implementation here)
+#     return vector_store
+# def create_workflow(vector_store):
+#     """Create the RAG workflow."""
+#     # ... (Keep your existing workflow creation code here)
+#     return workflow.compile()
+# def handle_upload(folder_files, csv_file):
+#     try:
+#         # Create temporary directory
+#         temp_dir = "temp_upload"
+#         os.makedirs(temp_dir, exist_ok=True)
+#         # Process document files
+#         doc_dir = os.path.join(temp_dir, "docs")
+#         os.makedirs(doc_dir, exist_ok=True)
+#         # Handle zip file or individual files
+#         for file in folder_files:
+#             if file.name.endswith('.zip'):
+#                 with zipfile.ZipFile(io.BytesIO(file.read())) as zip_ref:
+#                     zip_ref.extractall(doc_dir)
+#             else:
+#                 with open(os.path.join(doc_dir, file.name), "wb") as f:
+#                     f.write(file.read())
+#         # Process CSV requirements
+#         csv_content = csv_file.read()
+#         requirements_df = pd.read_csv(io.BytesIO(csv_content), encoding='latin-1')
+#         requirements = requirements_df.iloc[:, 0].tolist()  # Get first column
+#         # Setup RAG system
+#         vector_store = setup_rag_system(doc_dir)
+#         app = create_workflow(vector_store)
+#         # Process requirements
+#         results = []
+#         for question in requirements:
+#             inputs = {"question": question}
+#             output = app.invoke(inputs)
+#             results.append({
+#                 "Requirement": question,
+#                 "Response": output.get("generation", "No response generated")
+#             })
+#         # Cleanup
+#         shutil.rmtree(temp_dir)
+#         return pd.DataFrame(results)
+#     except Exception as e:
+#         return pd.DataFrame({"Error": [str(e)]})
+# def create_gradio_interface():
+#     iface = gr.Interface(
+#         fn=handle_upload,
+#         inputs=[
+#             gr.File(file_count="multiple", label="Upload Documents (ZIP or HTML files)"),
+#             gr.File(label="Upload Requirements CSV", type="binary")
+#         ],
+#         outputs=gr.Dataframe(),
+#         title="RAG System for RFP Analysis",
+#         description="Upload documents (ZIP or HTML files) and a CSV file with requirements."
+#     )
+#     return iface
+# if __name__ == "__main__":
+#     iface = create_gradio_interface()
+#     iface.launch()
 import gradio as gr
 import pandas as pd
 import os
+import torch
 import zipfile
+import tempfile
 import shutil
 from bs4 import BeautifulSoup
 from typing import List, TypedDict
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain_core.documents import Document
+from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_nvidia_ai_endpoints import ChatNVIDIA
+from langchain_core.pydantic_v1 import BaseModel, Field
 from langchain_community.tools.tavily_search import TavilySearchResults
 from langgraph.graph import END, StateGraph, START
 import chromadb
+def process_documents(temp_dir):
+    """Process documents from the extracted zip folder."""
     d = {"chunk": [], "url": []}
+    for path in os.listdir(temp_dir):
+        if os.path.isfile(os.path.join(temp_dir, path)):
+            url = "https://" + path.replace("=", "/")
+            file_path = os.path.join(temp_dir, path)
+            try:
+                with open(file_path, 'r', encoding='utf-8') as stream:
+                    content = stream.read()
+                    soup = BeautifulSoup(content, "html.parser")
+                    title = soup.find("title")
+                    title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
+                    main_content = soup.find("main")
+                    text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
+                    full_content = f"{title_text}\n\n{text_content}"
+                    d["chunk"].append(full_content)
+                    d["url"].append(url)
+            except Exception as e:
+                print(f"Error processing file {path}: {str(e)}")
+                continue
     return pd.DataFrame(d)
+def setup_rag_system(temp_dir):
     """Initialize the RAG system with the provided documents."""
+    # Initialize embedding model
+    model_name = "dunzhang/stella_en_1.5B_v5"
+    model_kwargs = {'trust_remote_code': 'True'}
+    embedding_model = HuggingFaceEmbeddings(
+        model_name=model_name,
+        show_progress=True,
+        model_kwargs=model_kwargs
+    )
+    # Process documents
+    df = process_documents(temp_dir)
+    if df.empty:
+        raise ValueError("No valid documents were processed")
+    df["chunk_id"] = range(len(df))
+    # Create documents list
+    list_of_documents = [
+        Document(
+            page_content=record['chunk'],
+            metadata={"source_url": record['url']}
+        )
+        for record in df[['chunk', 'url']].to_dict(orient='records')
+    ]
+    # Setup vector store
+    ids = [str(i) for i in df['chunk_id'].to_list()]
+    client = chromadb.PersistentClient(path=tempfile.mkdtemp())  # Use temporary directory
+    vector_store = Chroma(
+        client=client,
+        collection_name="rag-chroma",
+        embedding_function=embedding_model,
+    )
+    # Add documents in batches
+    batch_size = 100  # Smaller batch size for better memory management
+    for i in range(0, len(list_of_documents), batch_size):
+        end_idx = min(i + batch_size, len(list_of_documents))
+        vector_store.add_documents(
+            documents=list_of_documents[i:end_idx],
+            ids=ids[i:end_idx]
+        )
     return vector_store
 def create_workflow(vector_store):
     """Create the RAG workflow."""
+    retriever = vector_store.as_retriever(search_kwargs={"k": 7})
+    llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)
+    rag_prompt = PromptTemplate.from_template(
+        """You are an assistant for responding to Request For Proposal documents for a
+        bidder in the field of Data Science and Engineering. Use the following pieces
+        of retrieved context to respond to the requests. If you don't know the answer,
+        just say that you don't know.
+        Question: {question}
+        Context: {context}
+        Answer:"""
+    )
+    def format_docs(result):
+        return "\n\n".join(doc.page_content for doc in result)
+    rag_chain = (
+        {"context": retriever | format_docs, "question": RunnablePassthrough()}
+        | rag_prompt
+        | llm
+        | StrOutputParser()
+    )
+    return rag_chain
+def handle_upload(zip_file, csv_file):
+    """Handle file uploads and process requirements."""
     try:
         # Create temporary directory
+        temp_dir = tempfile.mkdtemp()
+        try:
+            # Extract zip file
+            with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
+                zip_ref.extractall(temp_dir)
+            # Read requirements CSV
+            requirements_df = pd.read_csv(csv_file.name, encoding='latin-1')
+            if 'requirement' not in requirements_df.columns:
+                raise ValueError("CSV file must contain a 'requirement' column")
+            # Setup RAG system
+            vector_store = setup_rag_system(temp_dir)
+            rag_chain = create_workflow(vector_store)
+            # Process requirements
+            results = []
+            for req in requirements_df['requirement']:
+                try:
+                    response = rag_chain.invoke(req)
+                    results.append({
+                        'requirement': req,
+                        'response': response
+                    })
+                except Exception as e:
+                    results.append({
+                        'requirement': req,
+                        'response': f"Error processing requirement: {str(e)}"
+                    })
+            return pd.DataFrame(results)
+        finally:
+            # Cleanup
+            shutil.rmtree(temp_dir)
     except Exception as e:
+        return pd.DataFrame([{'error': str(e)}])
+# Create and launch the Gradio interface
+iface = gr.Interface(
+    fn=handle_upload,
+    inputs=[
+        gr.File(label="Upload ZIP folder containing URLs"),
+        gr.File(label="Upload Requirements CSV")
+    ],
+    outputs=gr.Dataframe(),
+    title="RAG System for RFP Analysis",
+    description="Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.",
+    examples=[],
+    cache_examples=False
+)
 if __name__ == "__main__":
     iface.launch()