Spaces:

Codequestt
/

ReqChek

Sleeping

App Files Files Community

Codequestt committed on Feb 10

Commit

74aaf3b

verified ·

1 Parent(s): 9f6b322

Update app.py

Browse files

Files changed (1) hide show

app.py +170 -152

app.py CHANGED Viewed

@@ -1,187 +1,205 @@
-import os
 import gradio as gr
-from PyPDF2 import PdfReader
 from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_core.documents import Document
-import chromadb
 from langchain_community.vectorstores import Chroma
-from langchain_nvidia_ai_endpoints import ChatNVIDIA
-from langchain_core.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.pydantic_v1 import BaseModel, Field
-from langgraph.graph import StateGraph, END
-from typing import List, TypedDict
-import pandas as pd
-# Set API keys
-os.environ["TAVILY_API_KEY"] = "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"
-os.environ["NVIDIA_API_KEY"] = "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"
-os.environ["LANGCHAIN_PROJECT"] = "RAG Compliance Checker"
-# Initialize embedding model
-model_name = "dunzhang/stella_en_1.5B_v5"
-embedding_model = HuggingFaceEmbeddings(
-    model_name=model_name,
-    model_kwargs={'trust_remote_code': True},
-    show_progress=True
-)
-# Define data models
 class GradeDocuments(BaseModel):
-    binary_score: str = Field(description="Relevance score 'yes' or 'no'")
 class GraphState(TypedDict):
     question: str
     generation: str
     decision: str
-    documents: List[Document]
-def create_workflow(retriever):
-    # Define workflow nodes
-    def retrieve(state):
-        print("---RETRIEVING DOCUMENTS---")
-        question = state["question"]
-        documents = retriever.invoke(question)
-        return {"documents": documents, "question": question}
-    def grade_documents(state):
-        print("---GRADING DOCUMENTS---")
-        question = state["question"]
-        documents = state["documents"]
-        llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct")
-        grader = llm.with_structured_output(GradeDocuments)
-        system = """You are a relevance grader. Determine if the document contains
-        information related to the question. Answer 'yes' or 'no'."""
-        prompt = ChatPromptTemplate.from_messages([
-            ("system", system),
-            ("human", "Document:\n{document}\n\nQuestion: {question}")
-        ])
-        filtered_docs = []
-        for doc in documents:
-            response = (prompt | grader).invoke({
-                "question": question,
-                "document": doc.page_content
-            })
-            if response.binary_score == "yes":
-                filtered_docs.append(doc)
-        return {"documents": filtered_docs, "question": question}
-    def generate_response(state):
-        print("---GENERATING RESPONSE---")
-        question = state["question"]
-        documents = state["documents"]
-        template = """Answer the question using only the context below:
-        Context: {context}
-        Question: {question}"""
-        prompt = PromptTemplate.from_template(template)
-        llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct")
-        chain = (
-            {"context": lambda _: "\n\n".join(d.page_content for d in documents), "question": RunnablePassthrough()}
-            | prompt
-            | llm
-            | StrOutputParser()
-        )
-        return {"generation": chain.invoke(question)}
-    # Build workflow
-    workflow = StateGraph(GraphState)
-    workflow.add_node("retrieve", retrieve)
-    workflow.add_node("grade", grade_documents)
-    workflow.add_node("generate", generate_response)
-    workflow.add_edge("retrieve", "grade")
-    workflow.add_conditional_edges(
-        "grade",
-        lambda state: "generate" if len(state["documents"]) > 0 else END,
-        {"generate": "generate"}
     )
-    workflow.add_edge("generate", END)
-    return workflow.compile()
-def process_documents(folder_path):
-    """Process PDF files from uploaded folder"""
-    documents = []
-    for filename in os.listdir(folder_path):
-        if filename.endswith(".pdf"):
-            path = os.path.join(folder_path, filename)
-            try:
-                reader = PdfReader(path)
-                text = "\n".join([page.extract_text() for page in reader.pages])
-                documents.append(Document(
-                    page_content=text,
-                    metadata={"source": filename}
-                ))
-            except Exception as e:
-                print(f"Error processing {filename}: {str(e)}")
-    return documents
-def analyze_requirements(csv_file, documents):
-    """Main analysis function"""
-    # Create vector store
     client = chromadb.PersistentClient()
     vector_store = Chroma(
         client=client,
-        collection_name="dynamic_rag",
-        embedding_function=embedding_model
     )
     # Add documents in batches
-    batch_size = 500
-    for i in range(0, len(documents), batch_size):
-        batch = documents[i:i+batch_size]
-        vector_store.add_documents(batch, ids=[str(n) for n in range(len(batch))])
-    retriever = vector_store.as_retriever(search_kwargs={"k": 5})
-    app = create_workflow(retriever)
-    # Process requirements
-    df = pd.read_csv(csv_file.name)
-    results = []
-    for req in df['Requirement']:
-        response = app.invoke({"question": req})
         results.append({
-            "Requirement": req,
-            "Response": response["generation"],
-            "Status": "Processed"
         })
     return pd.DataFrame(results)
-# Gradio interface
-with gr.Blocks(title="RAG Compliance Checker") as interface:
-    gr.Markdown("# AI Compliance Assistant")
-    gr.Markdown("Upload documents and requirements CSV for compliance analysis")
-    with gr.Row():
-        with gr.Column():
-            doc_upload = gr.File(label="Upload Documents Folder", file_count="directory")
-            csv_upload = gr.File(label="Upload Requirements CSV", file_types=[".csv"])
-            submit_btn = gr.Button("Analyze", variant="primary")
-        with gr.Column():
-            results_table = gr.DataFrame(
-                label="Analysis Results",
-                headers=["Requirement", "Response", "Status"],
-                interactive=False
-            )
-            status = gr.Textbox(label="Processing Status")
-    submit_btn.click(
-        fn=lambda doc, csv: analyze_requirements(csv, process_documents(doc)),
-        inputs=[doc_upload, csv_upload],
-        outputs=results_table,
-        api_name="analyze"
     )
 if __name__ == "__main__":
-    interface.launch(server_name="0.0.0.0", server_port=7860, share=True)

 import gradio as gr
+import pandas as pd
+import os
+import torch
+from bs4 import BeautifulSoup
+from typing import List, TypedDict
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
+from langchain_core.documents import Document
+from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+from langchain_nvidia_ai_endpoints import ChatNVIDIA
 from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_community.tools.tavily_search import TavilySearchResults
+from langgraph.graph import END, StateGraph, START
+import chromadb
 class GradeDocuments(BaseModel):
+    """Binary score for relevance check on retrieved documents."""
+    binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")
 class GraphState(TypedDict):
+    """Represents the state of our graph."""
     question: str
     generation: str
     decision: str
+    documents: List[str]
def process_documents(folder_path):
    """Process documents from the uploaded folder.

    Every regular file directly inside ``folder_path`` is parsed as HTML.
    The chunk text is the page title (with the `` | Dataiku`` suffix
    removed) followed by the text of the ``<main>`` element, or of the
    whole page when there is no ``<main>``. The source URL is rebuilt from
    the file name by prefixing ``https://`` and turning ``=`` into ``/``.

    Args:
        folder_path: Directory containing the saved HTML files.

    Returns:
        A ``pandas.DataFrame`` with one row per file and the columns
        ``chunk`` (text) and ``url`` (reconstructed source URL).
    """
    chunks = []
    urls = []
    # Sorted so that row order (and the chunk ids derived from it later)
    # is reproducible; os.listdir makes no ordering promise.
    for name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, name)
        # Sub-directories and other non-files cannot be opened as documents.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, "rb") as stream:
            # One badly encoded page should not abort the whole batch.
            content = stream.read().decode("utf-8", errors="replace")

        soup = BeautifulSoup(content, "html.parser")

        title = soup.find("title")
        # ``title.string`` is None for an empty or nested <title>, so use
        # get_text(), which always returns a str.
        raw_title = title.get_text() if title is not None else ""
        title_text = raw_title.replace(" | Dataiku", "") or "No Title"

        main_content = soup.find("main")
        body = main_content if main_content is not None else soup
        text_content = body.get_text(strip=True)

        chunks.append(f"{title_text}\n\n{text_content}")
        urls.append("https://" + name.replace("=", "/"))

    return pd.DataFrame({"chunk": chunks, "url": urls})
def setup_rag_system(folder_path):
    """Initialize the RAG system with the provided documents.

    Loads the embedding model, turns every file in ``folder_path`` into a
    ``Document`` (via ``process_documents``) and indexes them in the
    ``rag-chroma`` Chroma collection.

    Args:
        folder_path: Directory containing the source documents.

    Returns:
        The populated ``Chroma`` vector store.
    """
    # Initialize embedding model
    embedding_model = HuggingFaceEmbeddings(
        model_name="dunzhang/stella_en_1.5B_v5",
        show_progress=True,
        # Must be a real boolean; the string 'True' only worked by truthiness.
        model_kwargs={"trust_remote_code": True},
    )

    # Process documents
    df = process_documents(folder_path)
    df["chunk_id"] = range(len(df))

    # Create documents list
    list_of_documents = [
        Document(page_content=chunk, metadata={"source_url": url})
        for chunk, url in zip(df["chunk"], df["url"])
    ]
    ids = [str(i) for i in df["chunk_id"].to_list()]

    # Setup vector store
    # NOTE(review): the persistent client keeps the collection between runs,
    # so entries from an earlier, larger upload may remain - confirm whether
    # the collection should be reset per request.
    client = chromadb.PersistentClient()
    vector_store = Chroma(
        client=client,
        collection_name="rag-chroma",
        embedding_function=embedding_model,
    )

    # Add documents in batches.
    # 5461 is presumably the maximum batch size accepted by the Chroma
    # client - TODO confirm against the installed version.
    max_batch_size = 5461
    # Stepping by the batch size never yields an empty slice, so nothing is
    # submitted for an empty folder or after an exact multiple of the size.
    for start_index in range(0, len(list_of_documents), max_batch_size):
        end_index = start_index + max_batch_size
        vector_store.add_documents(
            documents=list_of_documents[start_index:end_index],
            ids=ids[start_index:end_index],
        )
    return vector_store
def create_workflow(vector_store):
    """Create the RAG workflow.

    Builds and compiles a linear LangGraph pipeline over ``GraphState``:

    1. ``retrieve`` - fetch the 7 nearest documents for the question.
    2. ``grade``    - keep only documents the LLM grades as relevant.
    3. ``generate`` - answer the question from the remaining context.

    ``generate`` always runs, even when grading removed every document, so
    the final state always contains a ``"generation"`` entry; the prompt
    instructs the model to say it does not know in that case.

    Args:
        vector_store: Vector store exposing ``as_retriever``.

    Returns:
        The compiled graph. Call ``invoke({"question": text})`` on it.
    """
    # Initialize components
    retriever = vector_store.as_retriever(search_kwargs={"k": 7})
    llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)

    # Create prompt templates and chains
    rag_prompt = PromptTemplate.from_template(
        """You are an assistant for responding to Request For Proposal documents for a
        bidder in the field of Data Science and Engineering. Use the following pieces
        of retrieved context to respond to the requests. If you don't know the answer,
        just say that you don't know.
        Question: {question}
        Context: {context}
        Answer:"""
    )
    # The context is passed in explicitly so that only graded documents
    # reach the model (the retriever is not called a second time here).
    answer_chain = rag_prompt | llm | StrOutputParser()

    grade_prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "You are a relevance grader. Determine if the document contains "
            "information related to the question. Answer 'yes' or 'no'.",
        ),
        ("human", "Document:\n{document}\n\nQuestion: {question}"),
    ])
    grader = grade_prompt | llm.with_structured_output(GradeDocuments)

    def format_docs(result):
        return "\n\n".join(doc.page_content for doc in result)

    def retrieve(state):
        question = state["question"]
        return {"documents": retriever.invoke(question), "question": question}

    def grade_documents(state):
        question = state["question"]
        relevant = []
        for doc in state["documents"]:
            verdict = grader.invoke(
                {"question": question, "document": doc.page_content}
            )
            # Structured output may come back empty or with stray
            # whitespace/case; treat anything but a clear "yes" as "no".
            if verdict is not None and verdict.binary_score.strip().lower() == "yes":
                relevant.append(doc)
        return {"documents": relevant, "question": question}

    def generate(state):
        generation = answer_chain.invoke({
            "context": format_docs(state["documents"]),
            "question": state["question"],
        })
        return {"generation": generation}

    # Create workflow graph
    workflow = StateGraph(GraphState)
    # Node names must differ from the GraphState keys.
    workflow.add_node("retrieve", retrieve)
    workflow.add_node("grade", grade_documents)
    workflow.add_node("generate", generate)
    workflow.add_edge(START, "retrieve")
    workflow.add_edge("retrieve", "grade")
    workflow.add_edge("grade", "generate")
    workflow.add_edge("generate", END)
    return workflow.compile()
def process_requirements(folder_path, csv_file):
    """Process requirements from CSV and generate responses.

    Args:
        folder_path: Directory with the source documents to index.
        csv_file: Path (or file-like object) of the requirements CSV, read
            as latin-1. The ``Requirement`` column is used when present,
            otherwise the first column.

    Returns:
        A ``pandas.DataFrame`` with the columns ``request`` and
        ``response``, one row per non-empty requirement.
    """
    # Read requirements first so a malformed CSV fails before the
    # expensive embedding / indexing step.
    requirements = pd.read_csv(csv_file, encoding='latin-1')
    if "Requirement" in requirements.columns:
        column = "Requirement"
    else:
        column = requirements.columns[0]

    # Setup RAG system
    vector_store = setup_rag_system(folder_path)
    app = create_workflow(vector_store)

    results = []
    # Iterating the DataFrame itself would yield column labels, not rows,
    # so walk the chosen column's values instead.
    for request in requirements[column].dropna().astype(str):
        if not request.strip():
            continue
        output = app.invoke({"question": request})
        results.append({
            "request": request,
            "response": output["generation"]
        })
    # Explicit columns keep the output shape stable when there are no rows.
    return pd.DataFrame(results, columns=["request", "response"])
def create_gradio_interface():
    """Create the Gradio interface.

    Returns:
        A ``gr.Interface`` taking a set of document files and a
        requirements CSV, and showing the resulting DataFrame.
    """
    import shutil
    import tempfile

    def _upload_path(item):
        """Return the on-disk path of an upload (path string or object with ``.name``)."""
        if isinstance(item, (str, os.PathLike)):
            return os.fspath(item)
        return item.name

    def handle_upload(folder, csv):
        if not folder or csv is None:
            raise gr.Error("Please upload the documents and a requirements CSV.")
        try:
            # A private directory per request: concurrent requests do not
            # share files, and it is removed even when processing fails.
            with tempfile.TemporaryDirectory(prefix="temp_docs_") as folder_path:
                for item in folder:
                    source = _upload_path(item)
                    # Copy under the base name only. Joining the full upload
                    # path would resolve back to the upload itself, and the
                    # file name is what the source URL is derived from.
                    target = os.path.join(folder_path, os.path.basename(source))
                    shutil.copy(source, target)
                # Process requirements
                return process_requirements(folder_path, _upload_path(csv))
        except gr.Error:
            raise
        except Exception as e:
            # The output is a Dataframe, so a plain string cannot be shown
            # there; surface the failure as a Gradio error instead.
            raise gr.Error(f"Error: {str(e)}") from e

    # Create interface
    iface = gr.Interface(
        fn=handle_upload,
        inputs=[
            gr.File(file_count="multiple", label="Upload Document Folder"),
            gr.File(label="Upload Requirements CSV")
        ],
        outputs=gr.Dataframe(),
        title="RAG System for RFP Analysis",
        description="Upload a folder of documents and a CSV file with requirements to analyze."
    )
    return iface
# Script entry point: build the Gradio UI and start it.
if __name__ == "__main__":
    create_gradio_interface().launch()