Codequestt committed
Commit 74aaf3b · verified · 1 Parent(s): 9f6b322

Update app.py

Files changed (1)
  1. app.py +170 -152
app.py CHANGED
@@ -1,187 +1,205 @@
- import os
  import gradio as gr
- from PyPDF2 import PdfReader
  from langchain_huggingface import HuggingFaceEmbeddings
- from langchain_core.documents import Document
- import chromadb
  from langchain_community.vectorstores import Chroma
- from langchain_nvidia_ai_endpoints import ChatNVIDIA
- from langchain_core.prompts import PromptTemplate
  from langchain_core.output_parsers import StrOutputParser
  from langchain_core.pydantic_v1 import BaseModel, Field
- from langgraph.graph import StateGraph, END
- from typing import List, TypedDict
- import pandas as pd
-
- # Set API keys
- os.environ["TAVILY_API_KEY"] = "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"
- os.environ["NVIDIA_API_KEY"] = "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"
- os.environ["LANGCHAIN_PROJECT"] = "RAG Compliance Checker"
-
- # Initialize embedding model
- model_name = "dunzhang/stella_en_1.5B_v5"
- embedding_model = HuggingFaceEmbeddings(
-     model_name=model_name,
-     model_kwargs={'trust_remote_code': True},
-     show_progress=True
- )

- # Define data models
  class GradeDocuments(BaseModel):
-     binary_score: str = Field(description="Relevance score 'yes' or 'no'")

  class GraphState(TypedDict):
      question: str
      generation: str
      decision: str
-     documents: List[Document]

- def create_workflow(retriever):
-     # Define workflow nodes
-     def retrieve(state):
-         print("---RETRIEVING DOCUMENTS---")
-         question = state["question"]
-         documents = retriever.invoke(question)
-         return {"documents": documents, "question": question}
-
-     def grade_documents(state):
-         print("---GRADING DOCUMENTS---")
-         question = state["question"]
-         documents = state["documents"]
-
-         llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct")
-         grader = llm.with_structured_output(GradeDocuments)
-
-         system = """You are a relevance grader. Determine if the document contains
-         information related to the question. Answer 'yes' or 'no'."""
-         prompt = ChatPromptTemplate.from_messages([
-             ("system", system),
-             ("human", "Document:\n{document}\n\nQuestion: {question}")
-         ])
-
-         filtered_docs = []
-         for doc in documents:
-             response = (prompt | grader).invoke({
-                 "question": question,
-                 "document": doc.page_content
-             })
-             if response.binary_score == "yes":
-                 filtered_docs.append(doc)
-
-         return {"documents": filtered_docs, "question": question}
-
-     def generate_response(state):
-         print("---GENERATING RESPONSE---")
-         question = state["question"]
-         documents = state["documents"]
-
-         template = """Answer the question using only the context below:
-         Context: {context}
-         Question: {question}"""
-
-         prompt = PromptTemplate.from_template(template)
-         llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct")
-
-         chain = (
-             {"context": lambda _: "\n\n".join(d.page_content for d in documents), "question": RunnablePassthrough()}
-             | prompt
-             | llm
-             | StrOutputParser()
-         )

-         return {"generation": chain.invoke(question)}

-     # Build workflow
-     workflow = StateGraph(GraphState)
-     workflow.add_node("retrieve", retrieve)
-     workflow.add_node("grade", grade_documents)
-     workflow.add_node("generate", generate_response)
-
-     workflow.add_edge("retrieve", "grade")
-     workflow.add_conditional_edges(
-         "grade",
-         lambda state: "generate" if len(state["documents"]) > 0 else END,
-         {"generate": "generate"}
      )
-     workflow.add_edge("generate", END)

-     return workflow.compile()
-
- def process_documents(folder_path):
-     """Process PDF files from uploaded folder"""
-     documents = []
-     for filename in os.listdir(folder_path):
-         if filename.endswith(".pdf"):
-             path = os.path.join(folder_path, filename)
-             try:
-                 reader = PdfReader(path)
-                 text = "\n".join([page.extract_text() for page in reader.pages])
-                 documents.append(Document(
-                     page_content=text,
-                     metadata={"source": filename}
-                 ))
-             except Exception as e:
-                 print(f"Error processing {filename}: {str(e)}")
-     return documents
-
- def analyze_requirements(csv_file, documents):
-     """Main analysis function"""
-     # Create vector store
      client = chromadb.PersistentClient()
      vector_store = Chroma(
          client=client,
-         collection_name="dynamic_rag",
-         embedding_function=embedding_model
      )

      # Add documents in batches
-     batch_size = 500
-     for i in range(0, len(documents), batch_size):
-         batch = documents[i:i+batch_size]
-         vector_store.add_documents(batch, ids=[str(n) for n in range(len(batch))])

-     retriever = vector_store.as_retriever(search_kwargs={"k": 5})
-     app = create_workflow(retriever)

-     # Process requirements
-     df = pd.read_csv(csv_file.name)
-     results = []

-     for req in df['Requirement']:
-         response = app.invoke({"question": req})
          results.append({
-             "Requirement": req,
-             "Response": response["generation"],
-             "Status": "Processed"
          })

      return pd.DataFrame(results)

- # Gradio interface
- with gr.Blocks(title="RAG Compliance Checker") as interface:
-     gr.Markdown("# AI Compliance Assistant")
-     gr.Markdown("Upload documents and requirements CSV for compliance analysis")
-
-     with gr.Row():
-         with gr.Column():
-             doc_upload = gr.File(label="Upload Documents Folder", file_count="directory")
-             csv_upload = gr.File(label="Upload Requirements CSV", file_types=[".csv"])
-             submit_btn = gr.Button("Analyze", variant="primary")
-
-         with gr.Column():
-             results_table = gr.DataFrame(
-                 label="Analysis Results",
-                 headers=["Requirement", "Response", "Status"],
-                 interactive=False
-             )
-             status = gr.Textbox(label="Processing Status")

-     submit_btn.click(
-         fn=lambda doc, csv: analyze_requirements(csv, process_documents(doc)),
-         inputs=[doc_upload, csv_upload],
-         outputs=results_table,
-         api_name="analyze"
      )

  if __name__ == "__main__":
-     interface.launch(server_name="0.0.0.0", server_port=7860, share=True)

  import gradio as gr
+ import pandas as pd
+ import os
+ import torch
+ from bs4 import BeautifulSoup
+ from typing import List, TypedDict
  from langchain_huggingface import HuggingFaceEmbeddings
  from langchain_community.vectorstores import Chroma
+ from langchain_core.documents import Document
+ from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
  from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_nvidia_ai_endpoints import ChatNVIDIA
  from langchain_core.pydantic_v1 import BaseModel, Field
+ from langchain_community.tools.tavily_search import TavilySearchResults
+ from langgraph.graph import END, StateGraph, START
+ import chromadb

  class GradeDocuments(BaseModel):
+     """Binary score for relevance check on retrieved documents."""
+     binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")

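+ # GradeDocuments is what the previous revision fed to
+ # llm.with_structured_output(GradeDocuments) in its grading node; the minimal
+ # wiring in create_workflow below does not use it yet.
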
  class GraphState(TypedDict):
+     """Represents the state of our graph."""
      question: str
      generation: str
      decision: str
+     documents: List[str]

+ def process_documents(folder_path):
+     """Process documents from the uploaded folder."""
+     d = {"chunk": [], "url": []}
+
+     for path in os.listdir(folder_path):
+         url = "https://" + path.replace("=", "/")
+         file_path = os.path.join(folder_path, path)
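+         # Assumption: each filename encodes its source URL, with '=' standing in
+         # for '/' ("docs.example.com=page.html" -> "https://docs.example.com/page.html").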
+
+         with open(file_path, 'rb') as stream:
+             content = stream.read().decode("utf-8")
+             soup = BeautifulSoup(content, "html.parser")
+
+         title = soup.find("title")
+         title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
+
+         main_content = soup.find("main")
+         text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
+
+         full_content = f"{title_text}\n\n{text_content}"
+
+         d["chunk"].append(full_content)
+         d["url"].append(url)
+
+     return pd.DataFrame(d)
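
+ # process_documents yields one row per file: 'chunk' holds the page title plus
+ # the main text, 'url' the reconstructed source address.
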
+ def setup_rag_system(folder_path):
+     """Initialize the RAG system with the provided documents."""
+     # Initialize embedding model
+     model_name = "dunzhang/stella_en_1.5B_v5"
+     model_kwargs = {'trust_remote_code': True}  # a real boolean, not the string 'True'
+     embedding_model = HuggingFaceEmbeddings(
+         model_name=model_name,
+         show_progress=True,
+         model_kwargs=model_kwargs
      )

+     # Process documents
+     df = process_documents(folder_path)
+     df["chunk_id"] = range(len(df))
+
+     # Create documents list
+     list_of_documents = [
+         Document(
+             page_content=record['chunk'],
+             metadata={"source_url": record['url']}
+         )
+         for record in df[['chunk', 'url']].to_dict(orient='records')
+     ]
+
+     # Setup vector store
+     ids = [str(i) for i in df['chunk_id'].to_list()]
      client = chromadb.PersistentClient()
      vector_store = Chroma(
          client=client,
+         collection_name="rag-chroma",
+         embedding_function=embedding_model,
      )

      # Add documents in batches
+     # Chroma caps how many records can be inserted per call (5461 with its
+     # default settings), so add the documents in chunks of max_batch_size.
+     # The original loop compared against a hardcoded 54500 instead of the
+     # actual document count.
+     max_batch_size = 5461
+     for start_index in range(0, len(list_of_documents), max_batch_size):
+         end_index = start_index + max_batch_size
+         vector_store.add_documents(
+             documents=list_of_documents[start_index:end_index],
+             ids=ids[start_index:end_index]
+         )

+     return vector_store
+
+ def create_workflow(vector_store):
+     """Create the RAG workflow."""
+     # Initialize components
+     retriever = vector_store.as_retriever(search_kwargs={"k": 7})
+     llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)
+     web_search_tool = TavilySearchResults(k=3)  # reserved for the web-search fallback node
+
+     # Create prompt templates and chains
+     rag_prompt = PromptTemplate.from_template(
+         """You are an assistant for responding to Request For Proposal documents for a
+         bidder in the field of Data Science and Engineering. Use the following pieces
+         of retrieved context to respond to the requests. If you don't know the answer,
+         just say that you don't know.
+
+         Question: {question}
+         Context: {context}
+         Answer:"""
+     )
+
+     def format_docs(result):
+         return "\n\n".join(doc.page_content for doc in result)
+
+     rag_chain = (
+         {"context": retriever | format_docs, "question": RunnablePassthrough()}
+         | rag_prompt
+         | llm
+         | StrOutputParser()
+     )
+
+     # Create workflow graph
+     workflow = StateGraph(GraphState)
+
+     # Minimal wiring so the graph compiles and answers questions; the grading
+     # and web-search nodes from the previous revision can be reattached here.
+     def generate(state):
+         """Run the RAG chain; retrieval happens inside the chain itself."""
+         return {"generation": rag_chain.invoke(state["question"])}
+
+     workflow.add_node("generate", generate)
+     workflow.add_edge(START, "generate")
+     workflow.add_edge("generate", END)
+
+     return workflow.compile()
+
+ def process_requirements(folder_path, csv_file):
+     """Process requirements from CSV and generate responses."""
+     # Setup RAG system
+     vector_store = setup_rag_system(folder_path)
+     app = create_workflow(vector_store)
+
+     # Read requirements
+     requirements = pd.read_csv(csv_file, encoding='latin-1')
+
+     results = []
+     # Iterate the 'Requirement' column; iterating the DataFrame itself would
+     # only yield its column names.
+     for request in requirements['Requirement']:
+         inputs = {"question": request}
+         output = app.invoke(inputs)
          results.append({
+             "request": request,
+             "response": output["generation"]
          })

      return pd.DataFrame(results)

+ def create_gradio_interface():
+     """Create the Gradio interface."""
+     def handle_upload(folder, csv):
+         try:
+             # Save uploaded files
+             folder_path = "temp_docs"
+             os.makedirs(folder_path, exist_ok=True)
+             for file in folder:
+                 # file.name is Gradio's absolute temp path; join on its basename,
+                 # since os.path.join with an absolute second argument would discard
+                 # folder_path and then open("wb") would truncate the upload itself.
+                 dest_path = os.path.join(folder_path, os.path.basename(file.name))
+                 with open(file.name, "rb") as src, open(dest_path, "wb") as dst:
+                     dst.write(src.read())
+
+             # Process requirements
+             results_df = process_requirements(folder_path, csv.name)
+
+             # Cleanup
+             for file in os.listdir(folder_path):
+                 os.remove(os.path.join(folder_path, file))
+             os.rmdir(folder_path)
+
+             return results_df
+         except Exception as e:
+             return f"Error: {str(e)}"

+     # Create interface
+     iface = gr.Interface(
+         fn=handle_upload,
+         inputs=[
+             gr.File(file_count="multiple", label="Upload Document Folder"),
+             gr.File(label="Upload Requirements CSV")
+         ],
+         outputs=gr.Dataframe(),
+         title="RAG System for RFP Analysis",
+         description="Upload a folder of documents and a CSV file with requirements to analyze."
+     )
+
+     return iface

+ # Create and launch the interface
  if __name__ == "__main__":
+     iface = create_gradio_interface()
+     iface.launch()
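
For local testing, a minimal driver for the committed pipeline: a sketch assuming app.py is importable as app, a temp_docs/ folder of the '='-encoded HTML files described above, and a CSV with a Requirement column (the column name follows the removed revision; the requirement text here is illustrative).

    import pandas as pd

    from app import process_requirements

    # Build a one-line requirements CSV (latin-1, matching the reader in app.py).
    pd.DataFrame(
        {"Requirement": ["Describe the platform's support for role-based access control."]}
    ).to_csv("requirements.csv", index=False, encoding="latin-1")

    # Runs document ingestion, vector-store setup, and generation end to end.
    results = process_requirements("temp_docs", "requirements.csv")
    print(results)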