Codequestt committed on
Commit
260d06a
·
verified ·
1 Parent(s): b392da4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +306 -12
app.py CHANGED
@@ -120,6 +120,303 @@
120
  # iface = create_gradio_interface()
121
  # iface.launch()
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  import gradio as gr
124
  import pandas as pd
125
  import os
@@ -144,7 +441,6 @@ import io
144
 
145
  # Environment variables setup
146
  os.environ["TAVILY_API_KEY"] = "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"
147
- os.environ["NVIDIA_API_KEY"] = "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"
148
  os.environ["LANGCHAIN_PROJECT"] = "RAG project"
149
 
150
  class GradeDocuments(BaseModel):
@@ -158,10 +454,6 @@ class GraphState(TypedDict):
158
  decision: str
159
  documents: List[str]
160
 
161
- import os
162
- from bs4 import BeautifulSoup
163
- import pandas as pd
164
-
165
  def process_documents(temp_dir):
166
  """Process documents from the extracted zip folder with enhanced error handling."""
167
  d = {"chunk": [], "url": []}
@@ -233,10 +525,6 @@ def process_documents(temp_dir):
233
 
234
  return pd.DataFrame(d)
235
 
236
-
237
-
238
- # The rest of the code remains the same...
239
-
240
  def setup_rag_system(temp_dir):
241
  """Initialize the RAG system with the provided documents."""
242
  # Initialize embedding model
@@ -338,9 +626,12 @@ def preprocess_csv(csv_file):
338
  except Exception as e2:
339
  raise ValueError(f"Could not process CSV file: {str(e2)}")
340
 
341
- def handle_upload(zip_file, csv_file):
342
  """Handle file uploads and process requirements with enhanced error handling."""
343
  try:
 
 
 
344
  # Create temporary directory
345
  temp_dir = tempfile.mkdtemp()
346
  print(f"Created temporary directory: {temp_dir}")
@@ -396,18 +687,21 @@ def handle_upload(zip_file, csv_file):
396
  error_msg = f"Processing error: {str(e)}"
397
  print(error_msg)
398
  return pd.DataFrame([{'error': error_msg}])
 
399
  def main():
400
  """Main function to run the Gradio interface."""
401
  iface = gr.Interface(
402
  fn=handle_upload,
403
  inputs=[
404
  gr.File(label="Upload ZIP folder containing URLs", file_types=[".zip"]),
405
- gr.File(label="Upload Requirements CSV", file_types=[".csv", ".txt"])
 
406
  ],
407
  outputs=gr.Dataframe(),
408
  title="RAG System for RFP Analysis",
409
  description="""Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.
410
- The CSV file should contain requirements either as a single column or with a 'requirement' column header.""",
 
411
  examples=[],
412
  cache_examples=False
413
  )
 
120
  # iface = create_gradio_interface()
121
  # iface.launch()
122
 
123
+ # import gradio as gr
124
+ # import pandas as pd
125
+ # import os
126
+ # import torch
127
+ # import zipfile
128
+ # import tempfile
129
+ # import shutil
130
+ # from bs4 import BeautifulSoup
131
+ # from typing import List, TypedDict
132
+ # from langchain_huggingface import HuggingFaceEmbeddings
133
+ # from langchain_community.vectorstores import Chroma
134
+ # from langchain_core.documents import Document
135
+ # from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
136
+ # from langchain_core.output_parsers import StrOutputParser
137
+ # from langchain_core.runnables import RunnablePassthrough
138
+ # from langchain_nvidia_ai_endpoints import ChatNVIDIA
139
+ # from langchain_core.pydantic_v1 import BaseModel, Field
140
+ # from langchain_community.tools.tavily_search import TavilySearchResults
141
+ # from langgraph.graph import END, StateGraph, START
142
+ # import chromadb
143
+ # import io
144
+
145
+ # # Environment variables setup
146
+ # os.environ["TAVILY_API_KEY"] = "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"
147
+ # os.environ["NVIDIA_API_KEY"] = "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"
148
+ # os.environ["LANGCHAIN_PROJECT"] = "RAG project"
149
+
150
+ # class GradeDocuments(BaseModel):
151
+ # """Binary score for relevance check on retrieved documents."""
152
+ # binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")
153
+
154
+ # class GraphState(TypedDict):
155
+ # """Represents the state of our graph."""
156
+ # question: str
157
+ # generation: str
158
+ # decision: str
159
+ # documents: List[str]
160
+
161
+ # import os
162
+ # from bs4 import BeautifulSoup
163
+ # import pandas as pd
164
+
165
+ # def process_documents(temp_dir):
166
+ # """Process documents from the extracted zip folder with enhanced error handling."""
167
+ # d = {"chunk": [], "url": []}
168
+
169
+ # # Debug information
170
+ # print(f"Scanning directory: {temp_dir}")
171
+
172
+ # file_count = 0
173
+ # processed_count = 0
174
+ # error_count = 0
175
+
176
+ # # Recursively traverse the directory
177
+ # for root, dirs, files in os.walk(temp_dir):
178
+ # for file_name in files:
179
+ # file_count += 1
180
+ # file_path = os.path.join(root, file_name)
181
+ # print(f"Processing file: {file_path}")
182
+
183
+ # try:
184
+ # # Try different encodings
185
+ # encodings = ['utf-8', 'latin-1', 'cp1252']
186
+ # content = None
187
+
188
+ # for encoding in encodings:
189
+ # try:
190
+ # with open(file_path, 'r', encoding=encoding) as stream:
191
+ # content = stream.read()
192
+ # break
193
+ # except UnicodeDecodeError:
194
+ # continue
195
+
196
+ # if content is None:
197
+ # print(f"Failed to read file {file_path} with any encoding")
198
+ # error_count += 1
199
+ # continue
200
+
201
+ # soup = BeautifulSoup(content, "html.parser")
202
+
203
+ # title = soup.find("title")
204
+ # title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
205
+
206
+ # main_content = soup.find("main")
207
+ # text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
208
+
209
+ # if not text_content.strip():
210
+ # print(f"No content extracted from {file_path}")
211
+ # error_count += 1
212
+ # continue
213
+
214
+ # full_content = f"{title_text}\n\n{text_content}"
215
+
216
+ # d["chunk"].append(full_content)
217
+ # d["url"].append("https://" + file_name.replace("=", "/"))
218
+ # processed_count += 1
219
+ # print(f"Successfully processed {file_path}")
220
+
221
+ # except Exception as e:
222
+ # print(f"Error processing file {file_path}: {str(e)}")
223
+ # error_count += 1
224
+ # continue
225
+
226
+ # print(f"\nProcessing Summary:")
227
+ # print(f"Total files found: {file_count}")
228
+ # print(f"Successfully processed: {processed_count}")
229
+ # print(f"Errors encountered: {error_count}")
230
+
231
+ # if not d["chunk"]:
232
+ # raise ValueError(f"No valid documents were processed. Processed {file_count} files with {error_count} errors.")
233
+
234
+ # return pd.DataFrame(d)
235
+
236
+
237
+
238
+ # # The rest of the code remains the same...
239
+
240
+ # def setup_rag_system(temp_dir):
241
+ # """Initialize the RAG system with the provided documents."""
242
+ # # Initialize embedding model
243
+ # model_name = "dunzhang/stella_en_1.5B_v5"
244
+ # model_kwargs = {'trust_remote_code': 'True'}
245
+ # embedding_model = HuggingFaceEmbeddings(
246
+ # model_name=model_name,
247
+ # show_progress=True,
248
+ # model_kwargs=model_kwargs
249
+ # )
250
+
251
+ # # Process documents
252
+ # df = process_documents(temp_dir)
253
+ # if df.empty:
254
+ # raise ValueError("No valid documents were processed")
255
+
256
+ # df["chunk_id"] = range(len(df))
257
+
258
+ # # Create documents list
259
+ # list_of_documents = [
260
+ # Document(
261
+ # page_content=record['chunk'],
262
+ # metadata={"source_url": record['url']}
263
+ # )
264
+ # for record in df[['chunk', 'url']].to_dict(orient='records')
265
+ # ]
266
+
267
+ # # Setup vector store
268
+ # ids = [str(i) for i in df['chunk_id'].to_list()]
269
+ # client = chromadb.PersistentClient(path=tempfile.mkdtemp())
270
+ # vector_store = Chroma(
271
+ # client=client,
272
+ # collection_name="rag-chroma",
273
+ # embedding_function=embedding_model,
274
+ # )
275
+
276
+ # # Add documents in batches
277
+ # batch_size = 100
278
+ # for i in range(0, len(list_of_documents), batch_size):
279
+ # end_idx = min(i + batch_size, len(list_of_documents))
280
+ # vector_store.add_documents(
281
+ # documents=list_of_documents[i:end_idx],
282
+ # ids=ids[i:end_idx]
283
+ # )
284
+
285
+ # return vector_store
286
+
287
+ # def create_workflow(vector_store):
288
+ # """Create the RAG workflow."""
289
+ # retriever = vector_store.as_retriever(search_kwargs={"k": 7})
290
+ # llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)
291
+
292
+ # rag_prompt = PromptTemplate.from_template(
293
+ # """You are an assistant for responding to Request For Proposal documents for a
294
+ # bidder in the field of Data Science and Engineering. Use the following pieces
295
+ # of retrieved context to respond to the requests. If you don't know the answer,
296
+ # just say that you don't know. Provide detailed responses with specific examples
297
+ # and capabilities where possible.
298
+
299
+ # Question: {question}
300
+ # Context: {context}
301
+ # Answer:"""
302
+ # )
303
+
304
+ # def format_docs(result):
305
+ # return "\n\n".join(doc.page_content for doc in result)
306
+
307
+ # rag_chain = (
308
+ # {"context": retriever | format_docs, "question": RunnablePassthrough()}
309
+ # | rag_prompt
310
+ # | llm
311
+ # | StrOutputParser()
312
+ # )
313
+
314
+ # return rag_chain
315
+
316
+ # def preprocess_csv(csv_file):
317
+ # """Preprocess the CSV file to ensure proper format."""
318
+ # try:
319
+ # # First try reading as is
320
+ # df = pd.read_csv(csv_file.name, encoding='latin-1')
321
+
322
+ # # If there's only one column and no header
323
+ # if len(df.columns) == 1 and df.columns[0] != 'requirement':
324
+ # # Read again with no header and assign column name
325
+ # df = pd.read_csv(csv_file.name, encoding='latin-1', header=None, names=['requirement'])
326
+
327
+ # # If there's no 'requirement' column, assume first column is requirements
328
+ # if 'requirement' not in df.columns:
329
+ # df = df.rename(columns={df.columns[0]: 'requirement'})
330
+
331
+ # return df
332
+ # except Exception as e:
333
+ # # If standard CSV reading fails, try reading as plain text
334
+ # try:
335
+ # with open(csv_file.name, 'r', encoding='latin-1') as f:
336
+ # requirements = f.read().strip().split('\n')
337
+ # return pd.DataFrame({'requirement': requirements})
338
+ # except Exception as e2:
339
+ # raise ValueError(f"Could not process CSV file: {str(e2)}")
340
+
341
+ # def handle_upload(zip_file, csv_file):
342
+ # """Handle file uploads and process requirements with enhanced error handling."""
343
+ # try:
344
+ # # Create temporary directory
345
+ # temp_dir = tempfile.mkdtemp()
346
+ # print(f"Created temporary directory: {temp_dir}")
347
+
348
+ # try:
349
+ # # Extract zip file
350
+ # print(f"Extracting ZIP file: {zip_file.name}")
351
+ # with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
352
+ # zip_ref.extractall(temp_dir)
353
+ # print(f"ZIP contents: {zip_ref.namelist()}")
354
+
355
+ # # Process documents
356
+ # print("Processing documents...")
357
+ # df = process_documents(temp_dir)
358
+ # print(f"Processed {len(df)} documents")
359
+
360
+ # # Preprocess and read requirements CSV
361
+ # print("Processing CSV file...")
362
+ # requirements_df = preprocess_csv(csv_file)
363
+ # print(f"Found {len(requirements_df)} requirements")
364
+
365
+ # # Setup RAG system
366
+ # print("Setting up RAG system...")
367
+ # vector_store = setup_rag_system(temp_dir)
368
+ # rag_chain = create_workflow(vector_store)
369
+
370
+ # # Process requirements
371
+ # results = []
372
+ # for idx, req in enumerate(requirements_df['requirement'], 1):
373
+ # print(f"Processing requirement {idx}/{len(requirements_df)}")
374
+ # try:
375
+ # response = rag_chain.invoke(req)
376
+ # results.append({
377
+ # 'requirement': req,
378
+ # 'response': response
379
+ # })
380
+ # except Exception as e:
381
+ # error_msg = f"Error processing requirement: {str(e)}"
382
+ # print(error_msg)
383
+ # results.append({
384
+ # 'requirement': req,
385
+ # 'response': error_msg
386
+ # })
387
+
388
+ # return pd.DataFrame(results)
389
+
390
+ # finally:
391
+ # # Cleanup
392
+ # print(f"Cleaning up temporary directory: {temp_dir}")
393
+ # shutil.rmtree(temp_dir)
394
+
395
+ # except Exception as e:
396
+ # error_msg = f"Processing error: {str(e)}"
397
+ # print(error_msg)
398
+ # return pd.DataFrame([{'error': error_msg}])
399
+ # def main():
400
+ # """Main function to run the Gradio interface."""
401
+ # iface = gr.Interface(
402
+ # fn=handle_upload,
403
+ # inputs=[
404
+ # gr.File(label="Upload ZIP folder containing URLs", file_types=[".zip"]),
405
+ # gr.File(label="Upload Requirements CSV", file_types=[".csv", ".txt"])
406
+ # ],
407
+ # outputs=gr.Dataframe(),
408
+ # title="RAG System for RFP Analysis",
409
+ # description="""Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.
410
+ # The CSV file should contain requirements either as a single column or with a 'requirement' column header.""",
411
+ # examples=[],
412
+ # cache_examples=False
413
+ # )
414
+
415
+ # iface.launch(share=True)
416
+
417
+ # if __name__ == "__main__":
418
+ # main()
419
+
420
  import gradio as gr
421
  import pandas as pd
422
  import os
 
441
 
442
  # Environment variables setup
443
  os.environ["TAVILY_API_KEY"] = "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"
 
444
  os.environ["LANGCHAIN_PROJECT"] = "RAG project"
445
 
446
  class GradeDocuments(BaseModel):
 
454
  decision: str
455
  documents: List[str]
456
 
 
 
 
 
457
  def process_documents(temp_dir):
458
  """Process documents from the extracted zip folder with enhanced error handling."""
459
  d = {"chunk": [], "url": []}
 
525
 
526
  return pd.DataFrame(d)
527
 
 
 
 
 
528
  def setup_rag_system(temp_dir):
529
  """Initialize the RAG system with the provided documents."""
530
  # Initialize embedding model
 
626
  except Exception as e2:
627
  raise ValueError(f"Could not process CSV file: {str(e2)}")
628
 
629
+ def handle_upload(zip_file, csv_file, nvidia_api_key):
630
  """Handle file uploads and process requirements with enhanced error handling."""
631
  try:
632
+ # Set the NVIDIA API key from user input
633
+ os.environ["NVIDIA_API_KEY"] = nvidia_api_key
634
+
635
  # Create temporary directory
636
  temp_dir = tempfile.mkdtemp()
637
  print(f"Created temporary directory: {temp_dir}")
 
687
  error_msg = f"Processing error: {str(e)}"
688
  print(error_msg)
689
  return pd.DataFrame([{'error': error_msg}])
690
+
691
  def main():
692
  """Main function to run the Gradio interface."""
693
  iface = gr.Interface(
694
  fn=handle_upload,
695
  inputs=[
696
  gr.File(label="Upload ZIP folder containing URLs", file_types=[".zip"]),
697
+ gr.File(label="Upload Requirements CSV", file_types=[".csv", ".txt"]),
698
+ gr.Textbox(label="Enter your NVIDIA API Key", type="password")
699
  ],
700
  outputs=gr.Dataframe(),
701
  title="RAG System for RFP Analysis",
702
  description="""Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.
703
+ The CSV file should contain requirements either as a single column or with a 'requirement' column header.
704
+ Enter your NVIDIA API key to use the service.""",
705
  examples=[],
706
  cache_examples=False
707
  )