Codequestt committed (verified)
Commit b406149
1 Parent(s): 814f14b

Update app.py

Files changed (1)
  1. app.py +57 -141
app.py CHANGED
@@ -1,37 +1,32 @@
 import gradio as gr
 import pandas as pd
 import os
-import torch
+import io
+import zipfile
+import shutil
 from bs4 import BeautifulSoup
 from typing import List, TypedDict
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain_core.documents import Document
-from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
+from langchain_core.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_nvidia_ai_endpoints import ChatNVIDIA
-from langchain_core.pydantic_v1 import BaseModel, Field
 from langchain_community.tools.tavily_search import TavilySearchResults
 from langgraph.graph import END, StateGraph, START
 import chromadb
 
-class GradeDocuments(BaseModel):
-    """Binary score for relevance check on retrieved documents."""
-    binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")
-
-class GraphState(TypedDict):
-    """Represents the state of our graph."""
-    question: str
-    generation: str
-    decision: str
-    documents: List[str]
+# ... (Keep all necessary imports from section 1 here)
 
 def process_documents(folder_path):
     """Process documents from the uploaded folder."""
     d = {"chunk": [], "url": []}
 
     for path in os.listdir(folder_path):
+        if not path.endswith(".html"):  # Skip non-HTML files
+            continue
+
         url = "https://" + path.replace("=", "/")
         file_path = os.path.join(folder_path, path)
 
@@ -54,152 +49,73 @@ def process_documents(folder_path):
 
 def setup_rag_system(folder_path):
     """Initialize the RAG system with the provided documents."""
-    # Initialize embedding model
-    model_name = "dunzhang/stella_en_1.5B_v5"
-    model_kwargs = {'trust_remote_code': 'True'}
-    embedding_model = HuggingFaceEmbeddings(
-        model_name=model_name,
-        show_progress=True,
-        model_kwargs=model_kwargs
-    )
-
-    # Process documents
-    df = process_documents(folder_path)
-    df["chunk_id"] = range(len(df))
-
-    # Create documents list
-    list_of_documents = [
-        Document(
-            page_content=record['chunk'],
-            metadata={"source_url": record['url']}
-        )
-        for record in df[['chunk', 'url']].to_dict(orient='records')
-    ]
-
-    # Setup vector store
-    ids = [str(i) for i in df['chunk_id'].to_list()]
-    client = chromadb.PersistentClient()
-    vector_store = Chroma(
-        client=client,
-        collection_name="rag-chroma",
-        embedding_function=embedding_model,
-    )
-
-    # Add documents in batches
-    start_index = 0
-    max_batch_size = 5461
-    total_len = len(list_of_documents)
-
-    for i in range(1, total_len//5461 + 2):
-        end_index = i*5461
-        if 54500 - start_index < 5461:
-            vector_store.add_documents(documents=list_of_documents[start_index:], ids=ids[start_index:])
-            break
-        else:
-            vector_store.add_documents(
-                documents=list_of_documents[start_index:end_index],
-                ids=ids[start_index:end_index]
-            )
-            start_index = end_index
-
+    # ... (Keep your existing setup_rag_system implementation here)
     return vector_store
 
 def create_workflow(vector_store):
     """Create the RAG workflow."""
-    # Initialize components
-    retriever = vector_store.as_retriever(search_kwargs={"k": 7})
-    llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)
-    web_search_tool = TavilySearchResults(k=3)
-
-    # Create prompt templates and chains
-    rag_prompt = PromptTemplate.from_template(
-        """You are an assistant for responding to Request For Proposal documents for a
-        bidder in the field of Data Science and Engineering. Use the following pieces
-        of retrieved context to respond to the requests. If you don't know the answer,
-        just say that you don't know.
-
-        Question: {question}
-        Context: {context}
-        Answer:"""
-    )
-
-    def format_docs(result):
-        return "\n\n".join(doc.page_content for doc in result)
-
-    rag_chain = (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | rag_prompt
-        | llm
-        | StrOutputParser()
-    )
-
-    # Create workflow graph
-    workflow = StateGraph(GraphState)
-
-    # Define nodes and edges (similar to your original code)
-    # ... (Add all your node definitions and graph construction here)
-
+    # ... (Keep your existing workflow creation code here)
     return workflow.compile()
 
-def process_requirements(folder_path, csv_file):
-    """Process requirements from CSV and generate responses."""
-    # Setup RAG system
-    vector_store = setup_rag_system(folder_path)
-    app = create_workflow(vector_store)
-
-    # Read requirements
-    requirements = pd.read_csv(csv_file, encoding='latin-1')
-
-    results = []
-    for request in requirements:
-        inputs = {"question": request}
-        output = app.invoke(inputs)
-        results.append({
-            "request": request,
-            "response": output["generation"]
-        })
-
-    return pd.DataFrame(results)
+def handle_upload(folder_files, csv_file):
+    try:
+        # Create temporary directory
+        temp_dir = "temp_upload"
+        os.makedirs(temp_dir, exist_ok=True)
+
+        # Process document files
+        doc_dir = os.path.join(temp_dir, "docs")
+        os.makedirs(doc_dir, exist_ok=True)
+
+        # Handle zip file or individual files
+        for file in folder_files:
+            if file.name.endswith('.zip'):
+                with zipfile.ZipFile(io.BytesIO(file.read())) as zip_ref:
+                    zip_ref.extractall(doc_dir)
+            else:
+                with open(os.path.join(doc_dir, file.name), "wb") as f:
+                    f.write(file.read())
+
+        # Process CSV requirements
+        csv_content = csv_file.read()
+        requirements_df = pd.read_csv(io.BytesIO(csv_content), encoding='latin-1')
+        requirements = requirements_df.iloc[:, 0].tolist()  # Get first column
+
+        # Setup RAG system
+        vector_store = setup_rag_system(doc_dir)
+        app = create_workflow(vector_store)
+
+        # Process requirements
+        results = []
+        for question in requirements:
+            inputs = {"question": question}
+            output = app.invoke(inputs)
+            results.append({
+                "Requirement": question,
+                "Response": output.get("generation", "No response generated")
+            })
+
+        # Cleanup
+        shutil.rmtree(temp_dir)
+
+        return pd.DataFrame(results)
+
+    except Exception as e:
+        return pd.DataFrame({"Error": [str(e)]})
 
 def create_gradio_interface():
-    """Create the Gradio interface."""
-    def handle_upload(folder, csv):
-        try:
-            # Save uploaded files
-            folder_path = "temp_docs"
-            os.makedirs(folder_path, exist_ok=True)
-            for file in folder:
-                file_path = os.path.join(folder_path, file.name)
-                with open(file_path, "wb") as f:
-                    f.write(file.read())
-
-            # Process requirements
-            results_df = process_requirements(folder_path, csv.name)
-
-            # Cleanup
-            for file in os.listdir(folder_path):
-                os.remove(os.path.join(folder_path, file))
-            os.rmdir(folder_path)
-
-            return results_df
-        except Exception as e:
-            return f"Error: {str(e)}"
-
-    # Create interface
     iface = gr.Interface(
         fn=handle_upload,
         inputs=[
-            gr.File(file_count="multiple", label="Upload Document Folder"),
-            gr.File(label="Upload Requirements CSV")
+            gr.File(file_count="multiple", label="Upload Documents (ZIP or HTML files)"),
+            gr.File(label="Upload Requirements CSV", type="binary")
         ],
         outputs=gr.Dataframe(),
        title="RAG System for RFP Analysis",
-        description="Upload a folder of documents and a CSV file with requirements to analyze."
+        description="Upload documents (ZIP or HTML files) and a CSV file with requirements."
     )
-
     return iface
 
-# Create and launch the interface
 if __name__ == "__main__":
     iface = create_gradio_interface()
     iface.launch()
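
The corpus filenames encode their source URLs, with "/" flattened to "=", and process_documents reverses that mapping. A worked example with a hypothetical filename makes the round trip concrete:

# Hypothetical filename; the real corpus files follow the same "="-for-"/" scheme.
path = "www.example.com=solutions=data-engineering.html"
url = "https://" + path.replace("=", "/")
print(url)  # https://www.example.com/solutions/data-engineering.html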
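The commit leaves setup_rag_system's body elided. Worth noting for whoever restores it: the deleted batching loop tested `if 54500 - start_index < 5461:`, hard-coding one particular corpus length into the stopping condition, so it only terminates correctly for a corpus of that size. A minimal sketch of the same batching driven by the actual list length, assuming only the vector_store.add_documents(documents=..., ids=...) call already used in the deleted code (the helper name add_in_batches is illustrative, not from the repo):

def add_in_batches(vector_store, documents, ids, max_batch_size=5461):
    """Add documents to the vector store in slices of at most max_batch_size.

    5461 is the batch cap used in the original code; the bound here comes
    from len(documents) rather than a hard-coded corpus size.
    """
    total_len = len(documents)
    for start_index in range(0, total_len, max_batch_size):
        end_index = min(start_index + max_batch_size, total_len)
        vector_store.add_documents(
            documents=documents[start_index:end_index],
            ids=ids[start_index:end_index],
        )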
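create_workflow likewise still needs the node definitions behind its placeholder comment, and the new handle_upload assumes app.invoke({"question": ...}) returns a state dict with a "generation" key. A minimal sketch of a graph satisfying that contract, assuming a single node that runs the RAG chain; the RunnableLambda stub stands in for the real rag_chain built inside create_workflow, and GraphState is copied from the deleted code:

from typing import List, TypedDict

from langchain_core.runnables import RunnableLambda
from langgraph.graph import END, StateGraph, START

class GraphState(TypedDict):
    """Graph state carried between nodes (as defined in the original file)."""
    question: str
    generation: str
    decision: str
    documents: List[str]

# Stand-in for the real retriever | prompt | llm | parser chain.
rag_chain = RunnableLambda(lambda question: f"(stub answer for: {question})")

def generate(state: GraphState) -> dict:
    # Run the chain on the question and store the answer under "generation",
    # the key handle_upload reads from the final state.
    return {"generation": rag_chain.invoke(state["question"])}

workflow = StateGraph(GraphState)
workflow.add_node("generate", generate)
workflow.add_edge(START, "generate")
workflow.add_edge("generate", END)
app = workflow.compile()

# Usage: the compiled graph returns the final state dict.
result = app.invoke({"question": "Describe your data engineering experience."})
print(result["generation"])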