Spaces:

Codequestt
/

ReqChek

Sleeping

App Files Files Community

Codequestt committed on Feb 11

Commit

b406149

verified ·

1 Parent(s): 814f14b

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -141

app.py CHANGED Viewed

@@ -1,37 +1,32 @@
 import gradio as gr
 import pandas as pd
 import os
-import torch
 from bs4 import BeautifulSoup
 from typing import List, TypedDict
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain_core.documents import Document
-from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_nvidia_ai_endpoints import ChatNVIDIA
-from langchain_core.pydantic_v1 import BaseModel, Field
 from langchain_community.tools.tavily_search import TavilySearchResults
 from langgraph.graph import END, StateGraph, START
 import chromadb
-class GradeDocuments(BaseModel):
-    """Binary score for relevance check on retrieved documents."""
     # Single string field; its description asks for the literal values 'yes' or 'no'.
-    binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")
-class GraphState(TypedDict):
-    """Represents the state of our graph."""
     # Set by the caller when invoking the compiled graph ({"question": ...}).
-    question: str
     # Read back by the caller from the graph output as output["generation"].
-    generation: str
     # NOTE(review): not read or written in the visible code; presumably a routing decision set by a graph node - confirm.
-    decision: str
     # NOTE(review): not read or written in the visible code; presumably the retrieved document texts - confirm.
-    documents: List[str]
 def process_documents(folder_path):
     """Process documents from the uploaded folder."""
     d = {"chunk": [], "url": []}
     for path in os.listdir(folder_path):
         url = "https://" + path.replace("=", "/")
         file_path = os.path.join(folder_path, path)
@@ -54,152 +49,73 @@ def process_documents(folder_path):
 def setup_rag_system(folder_path):
     """Initialize the RAG system with the provided documents."""
-    # Initialize embedding model
-    model_name = "dunzhang/stella_en_1.5B_v5"
-    model_kwargs = {'trust_remote_code': 'True'}
-    embedding_model = HuggingFaceEmbeddings(
-        model_name=model_name,
-        show_progress=True,
-        model_kwargs=model_kwargs
-    )
-    # Process documents
-    df = process_documents(folder_path)
-    df["chunk_id"] = range(len(df))
-    # Create documents list
-    list_of_documents = [
-        Document(
-            page_content=record['chunk'],
-            metadata={"source_url": record['url']}
-        )
-        for record in df[['chunk', 'url']].to_dict(orient='records')
-    ]
-    # Setup vector store
-    ids = [str(i) for i in df['chunk_id'].to_list()]
-    client = chromadb.PersistentClient()
-    vector_store = Chroma(
-        client=client,
-        collection_name="rag-chroma",
-        embedding_function=embedding_model,
-    )
-    # Add documents in batches
-    start_index = 0
-    max_batch_size = 5461
-    total_len = len(list_of_documents)
-    for i in range(1, total_len//5461 + 2):
-        end_index = i*5461
-        if 54500 - start_index < 5461:
-            vector_store.add_documents(documents=list_of_documents[start_index:], ids=ids[start_index:])
-            break
-        else:
-            vector_store.add_documents(
-                documents=list_of_documents[start_index:end_index],
-                ids=ids[start_index:end_index]
-            )
-        start_index = end_index
     return vector_store
 def create_workflow(vector_store):
     """Create the RAG workflow."""
     # Builds a retriever over vector_store, an LLM client, a web-search tool
     # and a retrieval-augmented prompt chain, then compiles a StateGraph.
     # NOTE(review): in the visible body no nodes or edges are added to the
     # graph, and web_search_tool / rag_chain are never referenced after they
     # are created; the placeholder comments near the end must be replaced
     # with the real node definitions before compile() can be meaningful.
-    # Initialize components
-    retriever = vector_store.as_retriever(search_kwargs={"k": 7})
-    llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)
-    web_search_tool = TavilySearchResults(k=3)
-    # Create prompt templates and chains
-    rag_prompt = PromptTemplate.from_template(
-        """You are an assistant for responding to Request For Proposal documents for a
-        bidder in the field of Data Science and Engineering. Use the following pieces
-        of retrieved context to respond to the requests. If you don't know the answer,
-        just say that you don't know.
-        Question: {question}
-        Context: {context}
-        Answer:"""
-    )
     # Joins the page_content of each retrieved document with blank lines.
-    def format_docs(result):
-        return "\n\n".join(doc.page_content for doc in result)
     # The incoming question is passed through unchanged and also sent to the
     # retriever, whose formatted output fills the {context} slot.
-    rag_chain = (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | rag_prompt
-        | llm
-        | StrOutputParser()
-    )
-    # Create workflow graph
-    workflow = StateGraph(GraphState)
-    # Define nodes and edges (similar to your original code)
-    # ... (Add all your node definitions and graph construction here)
     return workflow.compile()
-def process_requirements(folder_path, csv_file):
-    """Process requirements from CSV and generate responses."""
-    # Setup RAG system
-    vector_store = setup_rag_system(folder_path)
-    app = create_workflow(vector_store)
-    # Read requirements
-    requirements = pd.read_csv(csv_file, encoding='latin-1')
-    results = []
-    for request in requirements:
-        inputs = {"question": request}
-        output = app.invoke(inputs)
-        results.append({
-            "request": request,
-            "response": output["generation"]
-        })
-    return pd.DataFrame(results)
 def create_gradio_interface():
-    """Create the Gradio interface."""
-    def handle_upload(folder, csv):
-        try:
-            # Save uploaded files
-            folder_path = "temp_docs"
-            os.makedirs(folder_path, exist_ok=True)
-            for file in folder:
-                file_path = os.path.join(folder_path, file.name)
-                with open(file_path, "wb") as f:
-                    f.write(file.read())
-            # Process requirements
-            results_df = process_requirements(folder_path, csv.name)
-            # Cleanup
-            for file in os.listdir(folder_path):
-                os.remove(os.path.join(folder_path, file))
-            os.rmdir(folder_path)
-            return results_df
-        except Exception as e:
-            return f"Error: {str(e)}"
-    # Create interface
     iface = gr.Interface(
         fn=handle_upload,
         inputs=[
-            gr.File(file_count="multiple", label="Upload Document Folder"),
-            gr.File(label="Upload Requirements CSV")
         ],
         outputs=gr.Dataframe(),
         title="RAG System for RFP Analysis",
-        description="Upload a folder of documents and a CSV file with requirements to analyze."
     )
     return iface
-# Create and launch the interface
 # Guarded so that importing this module does not start the web server.
 if __name__ == "__main__":
     iface = create_gradio_interface()
     iface.launch()

 import gradio as gr
 import pandas as pd
 import os
+import io
+import zipfile
+import shutil
 from bs4 import BeautifulSoup
 from typing import List, TypedDict
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain_core.documents import Document
+from langchain_core.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_nvidia_ai_endpoints import ChatNVIDIA
 from langchain_community.tools.tavily_search import TavilySearchResults
 from langgraph.graph import END, StateGraph, START
 import chromadb
+# ... (Keep all necessary imports from section 1 here)
 def process_documents(folder_path):
     """Process documents from the uploaded folder."""
     d = {"chunk": [], "url": []}
     for path in os.listdir(folder_path):
+        if not path.endswith(".html"):  # Skip non-HTML files
+            continue
         url = "https://" + path.replace("=", "/")
         file_path = os.path.join(folder_path, path)
 def setup_rag_system(folder_path):
     """Initialize the RAG system with the provided documents."""
+    # ... (Keep your existing setup_rag_system implementation here)
     return vector_store
 def create_workflow(vector_store):
     """Create the RAG workflow."""
     # NOTE(review): nothing in this body assigns `workflow`, and vector_store
     # is not used. Unless a module-level `workflow` exists outside this view,
     # the return below raises NameError; the placeholder comment must be
     # replaced with the actual graph-construction code.
+    # ... (Keep your existing workflow creation code here)
     return workflow.compile()
+def handle_upload(folder_files, csv_file):
+    try:
+        # Create temporary directory
+        temp_dir = "temp_upload"
+        os.makedirs(temp_dir, exist_ok=True)
+        # Process document files
+        doc_dir = os.path.join(temp_dir, "docs")
+        os.makedirs(doc_dir, exist_ok=True)
+        # Handle zip file or individual files
+        for file in folder_files:
+            if file.name.endswith('.zip'):
+                with zipfile.ZipFile(io.BytesIO(file.read())) as zip_ref:
+                    zip_ref.extractall(doc_dir)
+            else:
+                with open(os.path.join(doc_dir, file.name), "wb") as f:
+                    f.write(file.read())
+        # Process CSV requirements
+        csv_content = csv_file.read()
+        requirements_df = pd.read_csv(io.BytesIO(csv_content), encoding='latin-1')
+        requirements = requirements_df.iloc[:, 0].tolist()  # Get first column
+        # Setup RAG system
+        vector_store = setup_rag_system(doc_dir)
+        app = create_workflow(vector_store)
+        # Process requirements
+        results = []
+        for question in requirements:
+            inputs = {"question": question}
+            output = app.invoke(inputs)
+            results.append({
+                "Requirement": question,
+                "Response": output.get("generation", "No response generated")
+            })
+        # Cleanup
+        shutil.rmtree(temp_dir)
+        return pd.DataFrame(results)
+    except Exception as e:
+        return pd.DataFrame({"Error": [str(e)]})
 def create_gradio_interface():
     # Builds and returns (without launching) a gr.Interface that passes two
     # file inputs to handle_upload and renders its return value in a
     # gr.Dataframe.
     iface = gr.Interface(
         fn=handle_upload,
         inputs=[
             # First input: several files may be selected at once.
+            gr.File(file_count="multiple", label="Upload Documents (ZIP or HTML files)"),
             # NOTE(review): with type="binary" Gradio is documented to hand the
             # callback raw bytes rather than a file object - confirm that
             # handle_upload treats the CSV argument accordingly.
+            gr.File(label="Upload Requirements CSV", type="binary")
         ],
         outputs=gr.Dataframe(),
         title="RAG System for RFP Analysis",
+        description="Upload documents (ZIP or HTML files) and a CSV file with requirements."
     )
     return iface
 # Guarded so that importing this module does not start the web server.
 if __name__ == "__main__":
     iface = create_gradio_interface()
     iface.launch()