Codequestt committed
Commit 33cccc8 · verified · 1 Parent(s): b406149

Update app.py

Files changed (1): app.py +270 -84
app.py CHANGED
@@ -1,121 +1,307 @@
  import gradio as gr
  import pandas as pd
  import os
- import io
  import zipfile
  import shutil
  from bs4 import BeautifulSoup
  from typing import List, TypedDict
  from langchain_huggingface import HuggingFaceEmbeddings
  from langchain_community.vectorstores import Chroma
  from langchain_core.documents import Document
- from langchain_core.prompts import PromptTemplate
  from langchain_core.output_parsers import StrOutputParser
  from langchain_core.runnables import RunnablePassthrough
  from langchain_nvidia_ai_endpoints import ChatNVIDIA
  from langchain_community.tools.tavily_search import TavilySearchResults
  from langgraph.graph import END, StateGraph, START
  import chromadb

- # ... (Keep all necessary imports from section 1 here)
-
- def process_documents(folder_path):
-     """Process documents from the uploaded folder."""
      d = {"chunk": [], "url": []}

-     for path in os.listdir(folder_path):
-         if not path.endswith(".html"): # Skip non-HTML files
-             continue
-
-         url = "https://" + path.replace("=", "/")
-         file_path = os.path.join(folder_path, path)
-
-         with open(file_path, 'rb') as stream:
-             content = stream.read().decode("utf-8")
-             soup = BeautifulSoup(content, "html.parser")
-
-             title = soup.find("title")
-             title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
-
-             main_content = soup.find("main")
-             text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)

-             full_content = f"{title_text}\n\n{text_content}"
-
-             d["chunk"].append(full_content)
-             d["url"].append(url)

      return pd.DataFrame(d)

- def setup_rag_system(folder_path):
      """Initialize the RAG system with the provided documents."""
-     # ... (Keep your existing setup_rag_system implementation here)
      return vector_store

  def create_workflow(vector_store):
      """Create the RAG workflow."""
-     # ... (Keep your existing workflow creation code here)
-     return workflow.compile()

- def handle_upload(folder_files, csv_file):
      try:
          # Create temporary directory
-         temp_dir = "temp_upload"
-         os.makedirs(temp_dir, exist_ok=True)

-         # Process document files
-         doc_dir = os.path.join(temp_dir, "docs")
-         os.makedirs(doc_dir, exist_ok=True)
-
-         # Handle zip file or individual files
-         for file in folder_files:
-             if file.name.endswith('.zip'):
-                 with zipfile.ZipFile(io.BytesIO(file.read())) as zip_ref:
-                     zip_ref.extractall(doc_dir)
-             else:
-                 with open(os.path.join(doc_dir, file.name), "wb") as f:
-                     f.write(file.read())
-
-         # Process CSV requirements
-         csv_content = csv_file.read()
-         requirements_df = pd.read_csv(io.BytesIO(csv_content), encoding='latin-1')
-         requirements = requirements_df.iloc[:, 0].tolist() # Get first column
-
-         # Setup RAG system
-         vector_store = setup_rag_system(doc_dir)
-         app = create_workflow(vector_store)
-
-         # Process requirements
-         results = []
-         for question in requirements:
-             inputs = {"question": question}
-             output = app.invoke(inputs)
-             results.append({
-                 "Requirement": question,
-                 "Response": output.get("generation", "No response generated")
-             })
-
-         # Cleanup
-         shutil.rmtree(temp_dir)
-
-         return pd.DataFrame(results)
-
      except Exception as e:
-         return pd.DataFrame({"Error": [str(e)]})

- def create_gradio_interface():
-     iface = gr.Interface(
-         fn=handle_upload,
-         inputs=[
-             gr.File(file_count="multiple", label="Upload Documents (ZIP or HTML files)"),
-             gr.File(label="Upload Requirements CSV", type="binary")
-         ],
-         outputs=gr.Dataframe(),
-         title="RAG System for RFP Analysis",
-         description="Upload documents (ZIP or HTML files) and a CSV file with requirements."
-     )
-     return iface

  if __name__ == "__main__":
-     iface = create_gradio_interface()
      iface.launch()
 
+ # import gradio as gr
+ # import pandas as pd
+ # import os
+ # import io
+ # import zipfile
+ # import shutil
+ # from bs4 import BeautifulSoup
+ # from typing import List, TypedDict
+ # from langchain_huggingface import HuggingFaceEmbeddings
+ # from langchain_community.vectorstores import Chroma
+ # from langchain_core.documents import Document
+ # from langchain_core.prompts import PromptTemplate
+ # from langchain_core.output_parsers import StrOutputParser
+ # from langchain_core.runnables import RunnablePassthrough
+ # from langchain_nvidia_ai_endpoints import ChatNVIDIA
+ # from langchain_community.tools.tavily_search import TavilySearchResults
+ # from langgraph.graph import END, StateGraph, START
+ # import chromadb
+
+ # # ... (Keep all necessary imports from section 1 here)
+
+ # def process_documents(folder_path):
+ #     """Process documents from the uploaded folder."""
+ #     d = {"chunk": [], "url": []}
+
+ #     for path in os.listdir(folder_path):
+ #         if not path.endswith(".html"): # Skip non-HTML files
+ #             continue
+
+ #         url = "https://" + path.replace("=", "/")
+ #         file_path = os.path.join(folder_path, path)
+
+ #         with open(file_path, 'rb') as stream:
+ #             content = stream.read().decode("utf-8")
+ #             soup = BeautifulSoup(content, "html.parser")
+
+ #             title = soup.find("title")
+ #             title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
+
+ #             main_content = soup.find("main")
+ #             text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
+
+ #             full_content = f"{title_text}\n\n{text_content}"
+
+ #             d["chunk"].append(full_content)
+ #             d["url"].append(url)
+
+ #     return pd.DataFrame(d)
+
+ # def setup_rag_system(folder_path):
+ #     """Initialize the RAG system with the provided documents."""
+ #     # ... (Keep your existing setup_rag_system implementation here)
+ #     return vector_store
+
+ # def create_workflow(vector_store):
+ #     """Create the RAG workflow."""
+ #     # ... (Keep your existing workflow creation code here)
+ #     return workflow.compile()
+
+ # def handle_upload(folder_files, csv_file):
+ #     try:
+ #         # Create temporary directory
+ #         temp_dir = "temp_upload"
+ #         os.makedirs(temp_dir, exist_ok=True)
+
+ #         # Process document files
+ #         doc_dir = os.path.join(temp_dir, "docs")
+ #         os.makedirs(doc_dir, exist_ok=True)
+
+ #         # Handle zip file or individual files
+ #         for file in folder_files:
+ #             if file.name.endswith('.zip'):
+ #                 with zipfile.ZipFile(io.BytesIO(file.read())) as zip_ref:
+ #                     zip_ref.extractall(doc_dir)
+ #             else:
+ #                 with open(os.path.join(doc_dir, file.name), "wb") as f:
+ #                     f.write(file.read())
+
+ #         # Process CSV requirements
+ #         csv_content = csv_file.read()
+ #         requirements_df = pd.read_csv(io.BytesIO(csv_content), encoding='latin-1')
+ #         requirements = requirements_df.iloc[:, 0].tolist() # Get first column
+
+ #         # Setup RAG system
+ #         vector_store = setup_rag_system(doc_dir)
+ #         app = create_workflow(vector_store)
+
+ #         # Process requirements
+ #         results = []
+ #         for question in requirements:
+ #             inputs = {"question": question}
+ #             output = app.invoke(inputs)
+ #             results.append({
+ #                 "Requirement": question,
+ #                 "Response": output.get("generation", "No response generated")
+ #             })
+
+ #         # Cleanup
+ #         shutil.rmtree(temp_dir)
+
+ #         return pd.DataFrame(results)
+
+ #     except Exception as e:
+ #         return pd.DataFrame({"Error": [str(e)]})
+
+ # def create_gradio_interface():
+ #     iface = gr.Interface(
+ #         fn=handle_upload,
+ #         inputs=[
+ #             gr.File(file_count="multiple", label="Upload Documents (ZIP or HTML files)"),
+ #             gr.File(label="Upload Requirements CSV", type="binary")
+ #         ],
+ #         outputs=gr.Dataframe(),
+ #         title="RAG System for RFP Analysis",
+ #         description="Upload documents (ZIP or HTML files) and a CSV file with requirements."
+ #     )
+ #     return iface
+
+ # if __name__ == "__main__":
+ #     iface = create_gradio_interface()
+ #     iface.launch()
+
  import gradio as gr
  import pandas as pd
  import os
+ import torch
  import zipfile
+ import tempfile
  import shutil
  from bs4 import BeautifulSoup
  from typing import List, TypedDict
  from langchain_huggingface import HuggingFaceEmbeddings
  from langchain_community.vectorstores import Chroma
  from langchain_core.documents import Document
+ from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
  from langchain_core.output_parsers import StrOutputParser
  from langchain_core.runnables import RunnablePassthrough
  from langchain_nvidia_ai_endpoints import ChatNVIDIA
+ from langchain_core.pydantic_v1 import BaseModel, Field
  from langchain_community.tools.tavily_search import TavilySearchResults
  from langgraph.graph import END, StateGraph, START
  import chromadb

+ def process_documents(temp_dir):
+     """Process documents from the extracted zip folder."""
      d = {"chunk": [], "url": []}

+     for path in os.listdir(temp_dir):
+         if os.path.isfile(os.path.join(temp_dir, path)):
+             url = "https://" + path.replace("=", "/")
+             file_path = os.path.join(temp_dir, path)
+
+             try:
+                 with open(file_path, 'r', encoding='utf-8') as stream:
+                     content = stream.read()
+                     soup = BeautifulSoup(content, "html.parser")
+
+                     title = soup.find("title")
+                     title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
+
+                     main_content = soup.find("main")
+                     text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
+
+                     full_content = f"{title_text}\n\n{text_content}"
+
+                     d["chunk"].append(full_content)
+                     d["url"].append(url)
+             except Exception as e:
+                 print(f"Error processing file {path}: {str(e)}")
+                 continue

      return pd.DataFrame(d)

+ def setup_rag_system(temp_dir):
      """Initialize the RAG system with the provided documents."""
+     # Initialize embedding model
+     model_name = "dunzhang/stella_en_1.5B_v5"
+     model_kwargs = {'trust_remote_code': 'True'}
+     embedding_model = HuggingFaceEmbeddings(
+         model_name=model_name,
+         show_progress=True,
+         model_kwargs=model_kwargs
+     )
+
+     # Process documents
+     df = process_documents(temp_dir)
+     if df.empty:
+         raise ValueError("No valid documents were processed")
+
+     df["chunk_id"] = range(len(df))
+
+     # Create documents list
+     list_of_documents = [
+         Document(
+             page_content=record['chunk'],
+             metadata={"source_url": record['url']}
+         )
+         for record in df[['chunk', 'url']].to_dict(orient='records')
+     ]
+
+     # Setup vector store
+     ids = [str(i) for i in df['chunk_id'].to_list()]
+     client = chromadb.PersistentClient(path=tempfile.mkdtemp())  # Use temporary directory
+     vector_store = Chroma(
+         client=client,
+         collection_name="rag-chroma",
+         embedding_function=embedding_model,
+     )
+
+     # Add documents in batches
+     batch_size = 100  # Smaller batch size for better memory management
+     for i in range(0, len(list_of_documents), batch_size):
+         end_idx = min(i + batch_size, len(list_of_documents))
+         vector_store.add_documents(
+             documents=list_of_documents[i:end_idx],
+             ids=ids[i:end_idx]
+         )
+
      return vector_store

  def create_workflow(vector_store):
      """Create the RAG workflow."""
+     retriever = vector_store.as_retriever(search_kwargs={"k": 7})
+     llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)
+
+     rag_prompt = PromptTemplate.from_template(
+         """You are an assistant for responding to Request For Proposal documents for a
+         bidder in the field of Data Science and Engineering. Use the following pieces
+         of retrieved context to respond to the requests. If you don't know the answer,
+         just say that you don't know.
+         Question: {question}
+         Context: {context}
+         Answer:"""
+     )
+
+     def format_docs(result):
+         return "\n\n".join(doc.page_content for doc in result)
+
+     rag_chain = (
+         {"context": retriever | format_docs, "question": RunnablePassthrough()}
+         | rag_prompt
+         | llm
+         | StrOutputParser()
+     )
+
+     return rag_chain

+ def handle_upload(zip_file, csv_file):
+     """Handle file uploads and process requirements."""
      try:
          # Create temporary directory
+         temp_dir = tempfile.mkdtemp()
+
+         try:
+             # Extract zip file
+             with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
+                 zip_ref.extractall(temp_dir)
+
+             # Read requirements CSV
+             requirements_df = pd.read_csv(csv_file.name, encoding='latin-1')
+             if 'requirement' not in requirements_df.columns:
+                 raise ValueError("CSV file must contain a 'requirement' column")
+
+             # Setup RAG system
+             vector_store = setup_rag_system(temp_dir)
+             rag_chain = create_workflow(vector_store)
+
+             # Process requirements
+             results = []
+             for req in requirements_df['requirement']:
+                 try:
+                     response = rag_chain.invoke(req)
+                     results.append({
+                         'requirement': req,
+                         'response': response
+                     })
+                 except Exception as e:
+                     results.append({
+                         'requirement': req,
+                         'response': f"Error processing requirement: {str(e)}"
+                     })
+
+             return pd.DataFrame(results)
+
+         finally:
+             # Cleanup
+             shutil.rmtree(temp_dir)
+
      except Exception as e:
+         return pd.DataFrame([{'error': str(e)}])

+ # Create and launch the Gradio interface
+ iface = gr.Interface(
+     fn=handle_upload,
+     inputs=[
+         gr.File(label="Upload ZIP folder containing URLs"),
+         gr.File(label="Upload Requirements CSV")
+     ],
+     outputs=gr.Dataframe(),
+     title="RAG System for RFP Analysis",
+     description="Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.",
+     examples=[],
+     cache_examples=False
+ )

  if __name__ == "__main__":
      iface.launch()
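
Note: the sketch below is not part of the commit; it is a minimal local smoke test for the new handle_upload entry point. It assumes the updated app.py is importable as `app`, that an NVIDIA API key is configured for ChatNVIDIA, and that there is enough memory to load the stella_en_1.5B_v5 embedding model. The ZIP must contain HTML files whose names encode their source URLs with "=" in place of "/" (the convention process_documents expects), the CSV must contain a 'requirement' column, and the SimpleNamespace stand-ins only mimic the .name attribute that Gradio's File components expose.

# Hypothetical local smoke test for handle_upload (not part of the commit).
import os, tempfile, zipfile
from types import SimpleNamespace
import pandas as pd
from app import handle_upload  # assumed import path for this Space's app.py

work = tempfile.mkdtemp()

# Requirements CSV: handle_upload requires a 'requirement' column.
csv_path = os.path.join(work, "requirements.csv")
pd.DataFrame({"requirement": ["Describe the platform's deployment options."]}).to_csv(csv_path, index=False)

# One HTML page; the file name encodes its URL with "=" standing in for "/".
html_name = "docs.example.com=deployment.html"
with open(os.path.join(work, html_name), "w", encoding="utf-8") as f:
    f.write("<html><head><title>Deployment | Dataiku</title></head>"
            "<body><main>Models can be deployed as services.</main></body></html>")

# Zip the HTML page the way the Gradio UI expects a single uploaded archive.
zip_path = os.path.join(work, "docs.zip")
with zipfile.ZipFile(zip_path, "w") as zf:
    zf.write(os.path.join(work, html_name), arcname=html_name)

# Gradio's File components pass objects exposing a .name path; mimic that here.
result = handle_upload(SimpleNamespace(name=zip_path), SimpleNamespace(name=csv_path))
print(result)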