Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,37 +1,32 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import os
|
4 |
-
import
|
|
|
|
|
5 |
from bs4 import BeautifulSoup
|
6 |
from typing import List, TypedDict
|
7 |
from langchain_huggingface import HuggingFaceEmbeddings
|
8 |
from langchain_community.vectorstores import Chroma
|
9 |
from langchain_core.documents import Document
|
10 |
-
from langchain_core.prompts import PromptTemplate
|
11 |
from langchain_core.output_parsers import StrOutputParser
|
12 |
from langchain_core.runnables import RunnablePassthrough
|
13 |
from langchain_nvidia_ai_endpoints import ChatNVIDIA
|
14 |
-
from langchain_core.pydantic_v1 import BaseModel, Field
|
15 |
from langchain_community.tools.tavily_search import TavilySearchResults
|
16 |
from langgraph.graph import END, StateGraph, START
|
17 |
import chromadb
|
18 |
|
19 |
-
|
20 |
-
"""Binary score for relevance check on retrieved documents."""
|
21 |
-
binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")
|
22 |
-
|
23 |
-
class GraphState(TypedDict):
    """Represents the state of our graph."""

    # The request/requirement text being answered.
    question: str
    # The generated answer text.
    generation: str
    # NOTE(review): presumably a routing decision between graph branches — confirm.
    decision: str
    # NOTE(review): presumably the retrieved document texts — confirm.
    documents: List[str]
|
29 |
|
30 |
def process_documents(folder_path):
|
31 |
"""Process documents from the uploaded folder."""
|
32 |
d = {"chunk": [], "url": []}
|
33 |
|
34 |
for path in os.listdir(folder_path):
|
|
|
|
|
|
|
35 |
url = "https://" + path.replace("=", "/")
|
36 |
file_path = os.path.join(folder_path, path)
|
37 |
|
@@ -54,152 +49,73 @@ def process_documents(folder_path):
|
|
54 |
|
55 |
def setup_rag_system(folder_path):
    """Initialize the RAG system with the provided documents.

    Embeds every chunk returned by ``process_documents(folder_path)`` and
    stores it in the "rag-chroma" Chroma collection.

    Args:
        folder_path: Directory handed to ``process_documents``.

    Returns:
        The populated ``Chroma`` vector store.
    """
    # Initialize embedding model
    model_name = "dunzhang/stella_en_1.5B_v5"
    # A real boolean; the previous value was the string 'True'.
    model_kwargs = {"trust_remote_code": True}
    embedding_model = HuggingFaceEmbeddings(
        model_name=model_name,
        show_progress=True,
        model_kwargs=model_kwargs,
    )

    # Process documents
    df = process_documents(folder_path)
    df["chunk_id"] = range(len(df))

    # Create documents list
    list_of_documents = [
        Document(
            page_content=record["chunk"],
            metadata={"source_url": record["url"]},
        )
        for record in df[["chunk", "url"]].to_dict(orient="records")
    ]

    # Setup vector store
    ids = [str(i) for i in df["chunk_id"].to_list()]
    # NOTE(review): the client is persistent and ids restart at "0" on every
    # call, so repeated calls target the same collection with the same ids —
    # confirm that is intended.
    client = chromadb.PersistentClient()
    vector_store = Chroma(
        client=client,
        collection_name="rag-chroma",
        embedding_function=embedding_model,
    )

    # Add documents in batches. The step is driven by the actual number of
    # documents (the old loop compared against a hard-coded 54500), so no
    # batch is ever empty or larger than max_batch_size.
    max_batch_size = 5461
    for start_index in range(0, len(list_of_documents), max_batch_size):
        end_index = start_index + max_batch_size
        vector_store.add_documents(
            documents=list_of_documents[start_index:end_index],
            ids=ids[start_index:end_index],
        )

    return vector_store
|
106 |
|
107 |
def create_workflow(vector_store):
    """Create the RAG workflow.

    Builds a retrieval-augmented answer chain on top of ``vector_store`` and
    wraps it in a compiled LangGraph graph.

    Args:
        vector_store: Vector store exposing ``as_retriever``.

    Returns:
        The compiled graph. Invoking it with ``{"question": ...}`` yields a
        state that contains ``"generation"``.
    """
    # Initialize components
    retriever = vector_store.as_retriever(search_kwargs={"k": 7})
    llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)
    # NOTE(review): the previous code also built TavilySearchResults(k=3) but
    # never used it; re-add it together with the web-search node when that
    # branch is implemented.

    # Create prompt templates and chains
    rag_prompt = PromptTemplate.from_template(
        "You are an assistant for responding to Request For Proposal documents for a\n"
        "bidder in the field of Data Science and Engineering. Use the following pieces\n"
        "of retrieved context to respond to the requests. If you don't know the answer,\n"
        "just say that you don't know.\n"
        "\n"
        "Question: {question}\n"
        "Context: {context}\n"
        "Answer:"
    )

    def format_docs(result):
        """Join the page contents of the retrieved documents into one context string."""
        return "\n\n".join(doc.page_content for doc in result)

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | rag_prompt
        | llm
        | StrOutputParser()
    )

    def generate(state):
        """Answer the question held in the state with the RAG chain."""
        return {"generation": rag_chain.invoke(state["question"])}

    # Create workflow graph. The graph used to be compiled without any node
    # or entry point; a single generate step makes it runnable.
    # NOTE(review): grading / web-search routing is not implemented here.
    workflow = StateGraph(GraphState)
    workflow.add_node("generate", generate)
    workflow.add_edge(START, "generate")
    workflow.add_edge("generate", END)

    return workflow.compile()
|
143 |
|
144 |
-
def
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
|
162 |
-
|
|
|
163 |
|
164 |
def create_gradio_interface():
|
165 |
-
"""Create the Gradio interface."""
|
166 |
-
def handle_upload(folder, csv):
|
167 |
-
try:
|
168 |
-
# Save uploaded files
|
169 |
-
folder_path = "temp_docs"
|
170 |
-
os.makedirs(folder_path, exist_ok=True)
|
171 |
-
for file in folder:
|
172 |
-
file_path = os.path.join(folder_path, file.name)
|
173 |
-
with open(file_path, "wb") as f:
|
174 |
-
f.write(file.read())
|
175 |
-
|
176 |
-
# Process requirements
|
177 |
-
results_df = process_requirements(folder_path, csv.name)
|
178 |
-
|
179 |
-
# Cleanup
|
180 |
-
for file in os.listdir(folder_path):
|
181 |
-
os.remove(os.path.join(folder_path, file))
|
182 |
-
os.rmdir(folder_path)
|
183 |
-
|
184 |
-
return results_df
|
185 |
-
except Exception as e:
|
186 |
-
return f"Error: {str(e)}"
|
187 |
-
|
188 |
-
# Create interface
|
189 |
iface = gr.Interface(
|
190 |
fn=handle_upload,
|
191 |
inputs=[
|
192 |
-
gr.File(file_count="multiple", label="Upload
|
193 |
-
gr.File(label="Upload Requirements CSV")
|
194 |
],
|
195 |
outputs=gr.Dataframe(),
|
196 |
title="RAG System for RFP Analysis",
|
197 |
-
description="Upload
|
198 |
)
|
199 |
-
|
200 |
return iface
|
201 |
|
202 |
-
# Create and launch the interface
|
203 |
if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script.
    iface = create_gradio_interface()
    iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import os
|
4 |
+
import io
|
5 |
+
import zipfile
|
6 |
+
import shutil
|
7 |
from bs4 import BeautifulSoup
|
8 |
from typing import List, TypedDict
|
9 |
from langchain_huggingface import HuggingFaceEmbeddings
|
10 |
from langchain_community.vectorstores import Chroma
|
11 |
from langchain_core.documents import Document
|
12 |
+
from langchain_core.prompts import PromptTemplate
|
13 |
from langchain_core.output_parsers import StrOutputParser
|
14 |
from langchain_core.runnables import RunnablePassthrough
|
15 |
from langchain_nvidia_ai_endpoints import ChatNVIDIA
|
|
|
16 |
from langchain_community.tools.tavily_search import TavilySearchResults
|
17 |
from langgraph.graph import END, StateGraph, START
|
18 |
import chromadb
|
19 |
|
20 |
+
# ... (Keep all necessary imports from section 1 here)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
def process_documents(folder_path):
|
23 |
"""Process documents from the uploaded folder."""
|
24 |
d = {"chunk": [], "url": []}
|
25 |
|
26 |
for path in os.listdir(folder_path):
|
27 |
+
if not path.endswith(".html"): # Skip non-HTML files
|
28 |
+
continue
|
29 |
+
|
30 |
url = "https://" + path.replace("=", "/")
|
31 |
file_path = os.path.join(folder_path, path)
|
32 |
|
|
|
49 |
|
50 |
def setup_rag_system(folder_path):
    """Initialize the RAG system with the provided documents.

    Embeds every chunk returned by ``process_documents(folder_path)`` and
    stores it in the "rag-chroma" Chroma collection.

    Args:
        folder_path: Directory handed to ``process_documents``.

    Returns:
        The populated ``Chroma`` vector store.
    """
    # Initialize embedding model
    embedding_model = HuggingFaceEmbeddings(
        model_name="dunzhang/stella_en_1.5B_v5",
        show_progress=True,
        model_kwargs={"trust_remote_code": True},
    )

    # Process documents and give every chunk a sequential id
    df = process_documents(folder_path)
    df["chunk_id"] = range(len(df))

    list_of_documents = [
        Document(
            page_content=record["chunk"],
            metadata={"source_url": record["url"]},
        )
        for record in df[["chunk", "url"]].to_dict(orient="records")
    ]
    ids = [str(i) for i in df["chunk_id"].to_list()]

    # NOTE(review): the client is persistent and ids restart at "0" on every
    # call, so repeated calls target the same collection with the same ids —
    # confirm that is intended.
    client = chromadb.PersistentClient()
    vector_store = Chroma(
        client=client,
        collection_name="rag-chroma",
        embedding_function=embedding_model,
    )

    # Add documents in fixed-size batches; stepping over the real document
    # count means no batch is empty or larger than max_batch_size.
    max_batch_size = 5461
    for start_index in range(0, len(list_of_documents), max_batch_size):
        end_index = start_index + max_batch_size
        vector_store.add_documents(
            documents=list_of_documents[start_index:end_index],
            ids=ids[start_index:end_index],
        )

    return vector_store
|
54 |
|
55 |
class _WorkflowState(TypedDict):
    """State passed between the nodes of the RAG workflow graph."""

    # The request/requirement text being answered.
    question: str
    # The generated answer text.
    generation: str
    # NOTE(review): not written by the current graph; kept for future routing.
    decision: str
    # NOTE(review): not written by the current graph; kept for retrieved texts.
    documents: List[str]


def create_workflow(vector_store):
    """Create the RAG workflow.

    Builds a retrieval-augmented answer chain on top of ``vector_store`` and
    wraps it in a compiled LangGraph graph.

    Args:
        vector_store: Vector store exposing ``as_retriever``.

    Returns:
        The compiled graph. Invoking it with ``{"question": ...}`` yields a
        state that contains ``"generation"``.
    """
    retriever = vector_store.as_retriever(search_kwargs={"k": 7})
    llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)

    rag_prompt = PromptTemplate.from_template(
        "You are an assistant for responding to Request For Proposal documents for a\n"
        "bidder in the field of Data Science and Engineering. Use the following pieces\n"
        "of retrieved context to respond to the requests. If you don't know the answer,\n"
        "just say that you don't know.\n"
        "\n"
        "Question: {question}\n"
        "Context: {context}\n"
        "Answer:"
    )

    def format_docs(result):
        """Join the page contents of the retrieved documents into one context string."""
        return "\n\n".join(doc.page_content for doc in result)

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | rag_prompt
        | llm
        | StrOutputParser()
    )

    def generate(state):
        """Answer the question held in the state with the RAG chain."""
        return {"generation": rag_chain.invoke(state["question"])}

    # The body used to be a placeholder that returned an undefined name; a
    # single generate step gives callers a runnable graph.
    # NOTE(review): grading / web-search routing is not implemented here.
    workflow = StateGraph(_WorkflowState)
    workflow.add_node("generate", generate)
    workflow.add_edge(START, "generate")
    workflow.add_edge("generate", END)

    return workflow.compile()
|
59 |
|
60 |
+
def _upload_name(upload):
    """Return the base file name of an upload given as a path or a file-like object."""
    if isinstance(upload, (str, os.PathLike)):
        source = upload
    else:
        source = getattr(upload, "name", "")
    # Only the base name is kept: upload handles may carry an absolute
    # temp-file path, and joining that onto a target directory would point
    # back at the upload itself.
    return os.path.basename(str(source))


def _upload_bytes(upload):
    """Return the raw content of an upload given as bytes, a path, or a file-like object."""
    if isinstance(upload, (bytes, bytearray)):
        return bytes(upload)
    if isinstance(upload, (str, os.PathLike)):
        with open(upload, "rb") as handle:
            return handle.read()
    # File-like wrapper: prefer the backing file on disk, because the handle
    # itself may already be positioned at end-of-file.
    backing_path = getattr(upload, "name", None)
    if isinstance(backing_path, str) and os.path.isfile(backing_path):
        with open(backing_path, "rb") as handle:
            return handle.read()
    return upload.read()


def handle_upload(folder_files, csv_file):
    """Answer every requirement in the uploaded CSV against the uploaded documents.

    Args:
        folder_files: Uploaded documents (individual files and/or ZIP
            archives). Each item may be a path string or a file-like object
            with a ``name``; ``None`` is treated as "no documents".
        csv_file: Uploaded requirements CSV as raw bytes, a path string, or a
            file-like object. Requirements are read from its first column.

    Returns:
        A DataFrame with "Requirement" and "Response" columns, or a
        single-column "Error" DataFrame when anything fails.
    """
    import tempfile  # local import: not part of the module-level imports

    temp_dir = None
    try:
        if csv_file is None:
            raise ValueError("No requirements CSV was uploaded")

        # A unique scratch directory per call, so stale files from an earlier
        # run can never leak into this one.
        temp_dir = tempfile.mkdtemp(prefix="rag_upload_")
        doc_dir = os.path.join(temp_dir, "docs")
        os.makedirs(doc_dir, exist_ok=True)

        # Handle zip file or individual files
        for upload in folder_files or []:
            name = _upload_name(upload)
            content = _upload_bytes(upload)
            if name.lower().endswith(".zip"):
                with zipfile.ZipFile(io.BytesIO(content)) as zip_ref:
                    zip_ref.extractall(doc_dir)
            else:
                if not name:
                    raise ValueError("Uploaded document has no file name")
                with open(os.path.join(doc_dir, name), "wb") as f:
                    f.write(content)

        # Process CSV requirements (first column, blank cells dropped)
        requirements_df = pd.read_csv(
            io.BytesIO(_upload_bytes(csv_file)), encoding="latin-1"
        )
        requirements = requirements_df.iloc[:, 0].dropna().tolist()
        if not requirements:
            # Nothing to answer: skip building the RAG system.
            return pd.DataFrame(columns=["Requirement", "Response"])

        # Setup RAG system
        vector_store = setup_rag_system(doc_dir)
        app = create_workflow(vector_store)

        # Process requirements
        results = []
        for question in requirements:
            output = app.invoke({"question": question})
            results.append({
                "Requirement": question,
                "Response": output.get("generation", "No response generated"),
            })

        return pd.DataFrame(results, columns=["Requirement", "Response"])

    except Exception as e:
        # UI boundary: report the failure in the output table instead of raising.
        return pd.DataFrame({"Error": [str(e)]})

    finally:
        # Cleanup runs on success and on failure alike.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
|
105 |
|
106 |
def create_gradio_interface():
    """Create the Gradio interface.

    Wires ``handle_upload`` to two file inputs (a multi-file document upload
    and a requirements CSV configured with ``type="binary"``) and a dataframe
    output.

    Returns:
        The configured ``gr.Interface``; the caller launches it.
    """
    iface = gr.Interface(
        fn=handle_upload,
        inputs=[
            gr.File(file_count="multiple", label="Upload Documents (ZIP or HTML files)"),
            gr.File(label="Upload Requirements CSV", type="binary"),
        ],
        outputs=gr.Dataframe(),
        title="RAG System for RFP Analysis",
        description="Upload documents (ZIP or HTML files) and a CSV file with requirements.",
    )
    return iface
|
118 |
|
|
|
119 |
if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script.
    iface = create_gradio_interface()
    iface.launch()
|