Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -120,6 +120,303 @@
|
|
120 |
# iface = create_gradio_interface()
|
121 |
# iface.launch()
|
122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
import gradio as gr
|
124 |
import pandas as pd
|
125 |
import os
|
@@ -144,7 +441,6 @@ import io
|
|
144 |
|
145 |
# Environment variables setup
|
146 |
os.environ["TAVILY_API_KEY"] = "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"
|
147 |
-
os.environ["NVIDIA_API_KEY"] = "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"
|
148 |
os.environ["LANGCHAIN_PROJECT"] = "RAG project"
|
149 |
|
150 |
class GradeDocuments(BaseModel):
|
@@ -158,10 +454,6 @@ class GraphState(TypedDict):
|
|
158 |
decision: str
|
159 |
documents: List[str]
|
160 |
|
161 |
-
import os
|
162 |
-
from bs4 import BeautifulSoup
|
163 |
-
import pandas as pd
|
164 |
-
|
165 |
def process_documents(temp_dir):
|
166 |
"""Process documents from the extracted zip folder with enhanced error handling."""
|
167 |
d = {"chunk": [], "url": []}
|
@@ -233,10 +525,6 @@ def process_documents(temp_dir):
|
|
233 |
|
234 |
return pd.DataFrame(d)
|
235 |
|
236 |
-
|
237 |
-
|
238 |
-
# The rest of the code remains the same...
|
239 |
-
|
240 |
def setup_rag_system(temp_dir):
|
241 |
"""Initialize the RAG system with the provided documents."""
|
242 |
# Initialize embedding model
|
@@ -338,9 +626,12 @@ def preprocess_csv(csv_file):
|
|
338 |
except Exception as e2:
|
339 |
raise ValueError(f"Could not process CSV file: {str(e2)}")
|
340 |
|
341 |
-
def handle_upload(zip_file, csv_file):
|
342 |
"""Handle file uploads and process requirements with enhanced error handling."""
|
343 |
try:
|
|
|
|
|
|
|
344 |
# Create temporary directory
|
345 |
temp_dir = tempfile.mkdtemp()
|
346 |
print(f"Created temporary directory: {temp_dir}")
|
@@ -396,18 +687,21 @@ def handle_upload(zip_file, csv_file):
|
|
396 |
error_msg = f"Processing error: {str(e)}"
|
397 |
print(error_msg)
|
398 |
return pd.DataFrame([{'error': error_msg}])
|
|
|
399 |
def main():
|
400 |
"""Main function to run the Gradio interface."""
|
401 |
iface = gr.Interface(
|
402 |
fn=handle_upload,
|
403 |
inputs=[
|
404 |
gr.File(label="Upload ZIP folder containing URLs", file_types=[".zip"]),
|
405 |
-
gr.File(label="Upload Requirements CSV", file_types=[".csv", ".txt"])
|
|
|
406 |
],
|
407 |
outputs=gr.Dataframe(),
|
408 |
title="RAG System for RFP Analysis",
|
409 |
description="""Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.
|
410 |
-
The CSV file should contain requirements either as a single column or with a 'requirement' column header.
|
|
|
411 |
examples=[],
|
412 |
cache_examples=False
|
413 |
)
|
|
|
120 |
# iface = create_gradio_interface()
|
121 |
# iface.launch()
|
122 |
|
123 |
+
# import gradio as gr
|
124 |
+
# import pandas as pd
|
125 |
+
# import os
|
126 |
+
# import torch
|
127 |
+
# import zipfile
|
128 |
+
# import tempfile
|
129 |
+
# import shutil
|
130 |
+
# from bs4 import BeautifulSoup
|
131 |
+
# from typing import List, TypedDict
|
132 |
+
# from langchain_huggingface import HuggingFaceEmbeddings
|
133 |
+
# from langchain_community.vectorstores import Chroma
|
134 |
+
# from langchain_core.documents import Document
|
135 |
+
# from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
|
136 |
+
# from langchain_core.output_parsers import StrOutputParser
|
137 |
+
# from langchain_core.runnables import RunnablePassthrough
|
138 |
+
# from langchain_nvidia_ai_endpoints import ChatNVIDIA
|
139 |
+
# from langchain_core.pydantic_v1 import BaseModel, Field
|
140 |
+
# from langchain_community.tools.tavily_search import TavilySearchResults
|
141 |
+
# from langgraph.graph import END, StateGraph, START
|
142 |
+
# import chromadb
|
143 |
+
# import io
|
144 |
+
|
145 |
+
# # Environment variables setup
|
146 |
+
# os.environ["TAVILY_API_KEY"] = "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"
|
147 |
+
# os.environ["NVIDIA_API_KEY"] = "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"
|
148 |
+
# os.environ["LANGCHAIN_PROJECT"] = "RAG project"
|
149 |
+
|
150 |
+
# class GradeDocuments(BaseModel):
|
151 |
+
# """Binary score for relevance check on retrieved documents."""
|
152 |
+
# binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")
|
153 |
+
|
154 |
+
# class GraphState(TypedDict):
|
155 |
+
# """Represents the state of our graph."""
|
156 |
+
# question: str
|
157 |
+
# generation: str
|
158 |
+
# decision: str
|
159 |
+
# documents: List[str]
|
160 |
+
|
161 |
+
# import os
|
162 |
+
# from bs4 import BeautifulSoup
|
163 |
+
# import pandas as pd
|
164 |
+
|
165 |
+
# def process_documents(temp_dir):
|
166 |
+
# """Process documents from the extracted zip folder with enhanced error handling."""
|
167 |
+
# d = {"chunk": [], "url": []}
|
168 |
+
|
169 |
+
# # Debug information
|
170 |
+
# print(f"Scanning directory: {temp_dir}")
|
171 |
+
|
172 |
+
# file_count = 0
|
173 |
+
# processed_count = 0
|
174 |
+
# error_count = 0
|
175 |
+
|
176 |
+
# # Recursively traverse the directory
|
177 |
+
# for root, dirs, files in os.walk(temp_dir):
|
178 |
+
# for file_name in files:
|
179 |
+
# file_count += 1
|
180 |
+
# file_path = os.path.join(root, file_name)
|
181 |
+
# print(f"Processing file: {file_path}")
|
182 |
+
|
183 |
+
# try:
|
184 |
+
# # Try different encodings
|
185 |
+
# encodings = ['utf-8', 'latin-1', 'cp1252']
|
186 |
+
# content = None
|
187 |
+
|
188 |
+
# for encoding in encodings:
|
189 |
+
# try:
|
190 |
+
# with open(file_path, 'r', encoding=encoding) as stream:
|
191 |
+
# content = stream.read()
|
192 |
+
# break
|
193 |
+
# except UnicodeDecodeError:
|
194 |
+
# continue
|
195 |
+
|
196 |
+
# if content is None:
|
197 |
+
# print(f"Failed to read file {file_path} with any encoding")
|
198 |
+
# error_count += 1
|
199 |
+
# continue
|
200 |
+
|
201 |
+
# soup = BeautifulSoup(content, "html.parser")
|
202 |
+
|
203 |
+
# title = soup.find("title")
|
204 |
+
# title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
|
205 |
+
|
206 |
+
# main_content = soup.find("main")
|
207 |
+
# text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
|
208 |
+
|
209 |
+
# if not text_content.strip():
|
210 |
+
# print(f"No content extracted from {file_path}")
|
211 |
+
# error_count += 1
|
212 |
+
# continue
|
213 |
+
|
214 |
+
# full_content = f"{title_text}\n\n{text_content}"
|
215 |
+
|
216 |
+
# d["chunk"].append(full_content)
|
217 |
+
# d["url"].append("https://" + file_name.replace("=", "/"))
|
218 |
+
# processed_count += 1
|
219 |
+
# print(f"Successfully processed {file_path}")
|
220 |
+
|
221 |
+
# except Exception as e:
|
222 |
+
# print(f"Error processing file {file_path}: {str(e)}")
|
223 |
+
# error_count += 1
|
224 |
+
# continue
|
225 |
+
|
226 |
+
# print(f"\nProcessing Summary:")
|
227 |
+
# print(f"Total files found: {file_count}")
|
228 |
+
# print(f"Successfully processed: {processed_count}")
|
229 |
+
# print(f"Errors encountered: {error_count}")
|
230 |
+
|
231 |
+
# if not d["chunk"]:
|
232 |
+
# raise ValueError(f"No valid documents were processed. Processed {file_count} files with {error_count} errors.")
|
233 |
+
|
234 |
+
# return pd.DataFrame(d)
|
235 |
+
|
236 |
+
|
237 |
+
|
238 |
+
# # The rest of the code remains the same...
|
239 |
+
|
240 |
+
# def setup_rag_system(temp_dir):
|
241 |
+
# """Initialize the RAG system with the provided documents."""
|
242 |
+
# # Initialize embedding model
|
243 |
+
# model_name = "dunzhang/stella_en_1.5B_v5"
|
244 |
+
# model_kwargs = {'trust_remote_code': 'True'}
|
245 |
+
# embedding_model = HuggingFaceEmbeddings(
|
246 |
+
# model_name=model_name,
|
247 |
+
# show_progress=True,
|
248 |
+
# model_kwargs=model_kwargs
|
249 |
+
# )
|
250 |
+
|
251 |
+
# # Process documents
|
252 |
+
# df = process_documents(temp_dir)
|
253 |
+
# if df.empty:
|
254 |
+
# raise ValueError("No valid documents were processed")
|
255 |
+
|
256 |
+
# df["chunk_id"] = range(len(df))
|
257 |
+
|
258 |
+
# # Create documents list
|
259 |
+
# list_of_documents = [
|
260 |
+
# Document(
|
261 |
+
# page_content=record['chunk'],
|
262 |
+
# metadata={"source_url": record['url']}
|
263 |
+
# )
|
264 |
+
# for record in df[['chunk', 'url']].to_dict(orient='records')
|
265 |
+
# ]
|
266 |
+
|
267 |
+
# # Setup vector store
|
268 |
+
# ids = [str(i) for i in df['chunk_id'].to_list()]
|
269 |
+
# client = chromadb.PersistentClient(path=tempfile.mkdtemp())
|
270 |
+
# vector_store = Chroma(
|
271 |
+
# client=client,
|
272 |
+
# collection_name="rag-chroma",
|
273 |
+
# embedding_function=embedding_model,
|
274 |
+
# )
|
275 |
+
|
276 |
+
# # Add documents in batches
|
277 |
+
# batch_size = 100
|
278 |
+
# for i in range(0, len(list_of_documents), batch_size):
|
279 |
+
# end_idx = min(i + batch_size, len(list_of_documents))
|
280 |
+
# vector_store.add_documents(
|
281 |
+
# documents=list_of_documents[i:end_idx],
|
282 |
+
# ids=ids[i:end_idx]
|
283 |
+
# )
|
284 |
+
|
285 |
+
# return vector_store
|
286 |
+
|
287 |
+
# def create_workflow(vector_store):
|
288 |
+
# """Create the RAG workflow."""
|
289 |
+
# retriever = vector_store.as_retriever(search_kwargs={"k": 7})
|
290 |
+
# llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)
|
291 |
+
|
292 |
+
# rag_prompt = PromptTemplate.from_template(
|
293 |
+
# """You are an assistant for responding to Request For Proposal documents for a
|
294 |
+
# bidder in the field of Data Science and Engineering. Use the following pieces
|
295 |
+
# of retrieved context to respond to the requests. If you don't know the answer,
|
296 |
+
# just say that you don't know. Provide detailed responses with specific examples
|
297 |
+
# and capabilities where possible.
|
298 |
+
|
299 |
+
# Question: {question}
|
300 |
+
# Context: {context}
|
301 |
+
# Answer:"""
|
302 |
+
# )
|
303 |
+
|
304 |
+
# def format_docs(result):
|
305 |
+
# return "\n\n".join(doc.page_content for doc in result)
|
306 |
+
|
307 |
+
# rag_chain = (
|
308 |
+
# {"context": retriever | format_docs, "question": RunnablePassthrough()}
|
309 |
+
# | rag_prompt
|
310 |
+
# | llm
|
311 |
+
# | StrOutputParser()
|
312 |
+
# )
|
313 |
+
|
314 |
+
# return rag_chain
|
315 |
+
|
316 |
+
# def preprocess_csv(csv_file):
|
317 |
+
# """Preprocess the CSV file to ensure proper format."""
|
318 |
+
# try:
|
319 |
+
# # First try reading as is
|
320 |
+
# df = pd.read_csv(csv_file.name, encoding='latin-1')
|
321 |
+
|
322 |
+
# # If there's only one column and no header
|
323 |
+
# if len(df.columns) == 1 and df.columns[0] != 'requirement':
|
324 |
+
# # Read again with no header and assign column name
|
325 |
+
# df = pd.read_csv(csv_file.name, encoding='latin-1', header=None, names=['requirement'])
|
326 |
+
|
327 |
+
# # If there's no 'requirement' column, assume first column is requirements
|
328 |
+
# if 'requirement' not in df.columns:
|
329 |
+
# df = df.rename(columns={df.columns[0]: 'requirement'})
|
330 |
+
|
331 |
+
# return df
|
332 |
+
# except Exception as e:
|
333 |
+
# # If standard CSV reading fails, try reading as plain text
|
334 |
+
# try:
|
335 |
+
# with open(csv_file.name, 'r', encoding='latin-1') as f:
|
336 |
+
# requirements = f.read().strip().split('\n')
|
337 |
+
# return pd.DataFrame({'requirement': requirements})
|
338 |
+
# except Exception as e2:
|
339 |
+
# raise ValueError(f"Could not process CSV file: {str(e2)}")
|
340 |
+
|
341 |
+
# def handle_upload(zip_file, csv_file):
|
342 |
+
# """Handle file uploads and process requirements with enhanced error handling."""
|
343 |
+
# try:
|
344 |
+
# # Create temporary directory
|
345 |
+
# temp_dir = tempfile.mkdtemp()
|
346 |
+
# print(f"Created temporary directory: {temp_dir}")
|
347 |
+
|
348 |
+
# try:
|
349 |
+
# # Extract zip file
|
350 |
+
# print(f"Extracting ZIP file: {zip_file.name}")
|
351 |
+
# with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
|
352 |
+
# zip_ref.extractall(temp_dir)
|
353 |
+
# print(f"ZIP contents: {zip_ref.namelist()}")
|
354 |
+
|
355 |
+
# # Process documents
|
356 |
+
# print("Processing documents...")
|
357 |
+
# df = process_documents(temp_dir)
|
358 |
+
# print(f"Processed {len(df)} documents")
|
359 |
+
|
360 |
+
# # Preprocess and read requirements CSV
|
361 |
+
# print("Processing CSV file...")
|
362 |
+
# requirements_df = preprocess_csv(csv_file)
|
363 |
+
# print(f"Found {len(requirements_df)} requirements")
|
364 |
+
|
365 |
+
# # Setup RAG system
|
366 |
+
# print("Setting up RAG system...")
|
367 |
+
# vector_store = setup_rag_system(temp_dir)
|
368 |
+
# rag_chain = create_workflow(vector_store)
|
369 |
+
|
370 |
+
# # Process requirements
|
371 |
+
# results = []
|
372 |
+
# for idx, req in enumerate(requirements_df['requirement'], 1):
|
373 |
+
# print(f"Processing requirement {idx}/{len(requirements_df)}")
|
374 |
+
# try:
|
375 |
+
# response = rag_chain.invoke(req)
|
376 |
+
# results.append({
|
377 |
+
# 'requirement': req,
|
378 |
+
# 'response': response
|
379 |
+
# })
|
380 |
+
# except Exception as e:
|
381 |
+
# error_msg = f"Error processing requirement: {str(e)}"
|
382 |
+
# print(error_msg)
|
383 |
+
# results.append({
|
384 |
+
# 'requirement': req,
|
385 |
+
# 'response': error_msg
|
386 |
+
# })
|
387 |
+
|
388 |
+
# return pd.DataFrame(results)
|
389 |
+
|
390 |
+
# finally:
|
391 |
+
# # Cleanup
|
392 |
+
# print(f"Cleaning up temporary directory: {temp_dir}")
|
393 |
+
# shutil.rmtree(temp_dir)
|
394 |
+
|
395 |
+
# except Exception as e:
|
396 |
+
# error_msg = f"Processing error: {str(e)}"
|
397 |
+
# print(error_msg)
|
398 |
+
# return pd.DataFrame([{'error': error_msg}])
|
399 |
+
# def main():
|
400 |
+
# """Main function to run the Gradio interface."""
|
401 |
+
# iface = gr.Interface(
|
402 |
+
# fn=handle_upload,
|
403 |
+
# inputs=[
|
404 |
+
# gr.File(label="Upload ZIP folder containing URLs", file_types=[".zip"]),
|
405 |
+
# gr.File(label="Upload Requirements CSV", file_types=[".csv", ".txt"])
|
406 |
+
# ],
|
407 |
+
# outputs=gr.Dataframe(),
|
408 |
+
# title="RAG System for RFP Analysis",
|
409 |
+
# description="""Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.
|
410 |
+
# The CSV file should contain requirements either as a single column or with a 'requirement' column header.""",
|
411 |
+
# examples=[],
|
412 |
+
# cache_examples=False
|
413 |
+
# )
|
414 |
+
|
415 |
+
# iface.launch(share=True)
|
416 |
+
|
417 |
+
# if __name__ == "__main__":
|
418 |
+
# main()
|
419 |
+
|
420 |
import gradio as gr
|
421 |
import pandas as pd
|
422 |
import os
|
|
|
441 |
|
442 |
# Environment variables setup
|
443 |
os.environ["TAVILY_API_KEY"] = "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"
|
|
|
444 |
os.environ["LANGCHAIN_PROJECT"] = "RAG project"
|
445 |
|
446 |
class GradeDocuments(BaseModel):
|
|
|
454 |
decision: str
|
455 |
documents: List[str]
|
456 |
|
|
|
|
|
|
|
|
|
457 |
def process_documents(temp_dir):
|
458 |
"""Process documents from the extracted zip folder with enhanced error handling."""
|
459 |
d = {"chunk": [], "url": []}
|
|
|
525 |
|
526 |
return pd.DataFrame(d)
|
527 |
|
|
|
|
|
|
|
|
|
528 |
def setup_rag_system(temp_dir):
|
529 |
"""Initialize the RAG system with the provided documents."""
|
530 |
# Initialize embedding model
|
|
|
626 |
except Exception as e2:
|
627 |
raise ValueError(f"Could not process CSV file: {str(e2)}")
|
628 |
|
629 |
+
def handle_upload(zip_file, csv_file, nvidia_api_key):
|
630 |
"""Handle file uploads and process requirements with enhanced error handling."""
|
631 |
try:
|
632 |
+
# Set the NVIDIA API key from user input
|
633 |
+
os.environ["NVIDIA_API_KEY"] = nvidia_api_key
|
634 |
+
|
635 |
# Create temporary directory
|
636 |
temp_dir = tempfile.mkdtemp()
|
637 |
print(f"Created temporary directory: {temp_dir}")
|
|
|
687 |
error_msg = f"Processing error: {str(e)}"
|
688 |
print(error_msg)
|
689 |
return pd.DataFrame([{'error': error_msg}])
|
690 |
+
|
691 |
def main():
|
692 |
"""Main function to run the Gradio interface."""
|
693 |
iface = gr.Interface(
|
694 |
fn=handle_upload,
|
695 |
inputs=[
|
696 |
gr.File(label="Upload ZIP folder containing URLs", file_types=[".zip"]),
|
697 |
+
gr.File(label="Upload Requirements CSV", file_types=[".csv", ".txt"]),
|
698 |
+
gr.Textbox(label="Enter your NVIDIA API Key", type="password")
|
699 |
],
|
700 |
outputs=gr.Dataframe(),
|
701 |
title="RAG System for RFP Analysis",
|
702 |
description="""Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.
|
703 |
+
The CSV file should contain requirements either as a single column or with a 'requirement' column header.
|
704 |
+
Enter your NVIDIA API key to use the service.""",
|
705 |
examples=[],
|
706 |
cache_examples=False
|
707 |
)
|