Anne31415 committed
Commit 4a9dfc8
1 Parent(s): e7ea365

Update app.py

Files changed (1)
  1. app.py +77 -34
app.py CHANGED
@@ -56,7 +56,7 @@ repo2 = Repository(
     clone_from="Anne31415/Chat_Store", # Replace with your repository URL
     token=os.environ["HUB_TOKEN"] # Use the secret token to authenticate
 )
-repo.git_pull() # Pull the latest changes (if any)
+repo2.git_pull() # Pull the latest changes (if any)
 
 
 # Step 2: Load the PDF File
@@ -69,16 +69,6 @@ pdf_path3 = "Private_Book/Kosten_Strukturdaten_RAG_vorbereited.pdf"
 api_key = os.getenv("OPENAI_API_KEY")
 # Retrieve the API key from st.secrets
 
-@st.cache_data
-def extract_text_from_pdf(pdf_path):
-    text = ""
-    reader = PdfReader(pdf_path)
-    for page in reader.pages:
-        text += page.extract_text() + " " # Concatenate text from each page
-    return text
-
-# Use the function to get pdf_text
-pdf_text = extract_text_from_pdf(pdf_path3)
 
 
 @st.cache_resource
@@ -126,6 +116,8 @@ def load_vector_store(file_path, store_name, force_reload=False):
     return VectorStore
 
 
+
+
 # Utility function to load text from a PDF
 def load_pdf_text(file_path):
     pdf_reader = PdfReader(file_path)
@@ -134,6 +126,22 @@ def load_pdf_text(file_path):
         text += page.extract_text() or "" # Add fallback for pages where text extraction fails
     return text
 
+
+
+# Utility function to load text from a PDF and split it into pages
+def load_pdf_text_by_page(file_path):
+    pdf_reader = PdfReader(file_path)
+    pages_text = []
+    for page in pdf_reader.pages:
+        # Extract text for each page and add it to the list
+        page_text = page.extract_text() or "" # Add fallback for pages where text extraction fails
+        pages_text.append(page_text)
+    return pages_text
+
+# Use the new function to get a list of texts, each representing a page
+pdf_pages = load_pdf_text_by_page(pdf_path3)
+
+
 def load_chatbot():
     #return load_qa_chain(llm=OpenAI(), chain_type="stuff")
     return load_qa_chain(llm=OpenAI(model_name="gpt-3.5-turbo-instruct"), chain_type="stuff")
@@ -245,7 +253,17 @@ def display_session_id():
     session_id = st.session_state['session_id']
     st.sidebar.markdown(f"**Ihre Session ID:** `{session_id}`")
     st.sidebar.markdown("Verwenden Sie diese ID als Referenz bei Mitteilungen oder Rückmeldungen.")
+
+def preprocess_and_store_pdf_text(pdf_path, collection, text_splitter):
+
+    # Load and split the PDF text
+    text = load_pdf_text(pdf_path)
+    chunks = text_splitter.split_text(text=text)
 
+    # Store each chunk as a separate document in CromA DB
+    for i, chunk in enumerate(chunks):
+        document_id = f"Chunk_{i+1}"
+        collection.add(documents=[chunk], ids=[document_id])
 
 def page1():
     try:
@@ -489,11 +507,19 @@ def page2():
 
 
 
+
+
 def page3():
     try:
         # Basic layout setup
        st.title("Kosten- und Strukturdaten der Krankenhäuser")
 
+
+        # Initialize text splitter
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200, length_function=len)
+
+
+
         # Initialize CromA client and handle collection
         chroma_client = chromadb.Client()
         try:
@@ -506,10 +532,7 @@ def page3():
 
         # Add documents to the collection if not already done
         if "documents_added" not in st.session_state:
-            collection.add(
-                documents=[pdf_text],
-                ids=[("Kosten_Strukturdaten0602204")]
-            )
+            preprocess_and_store_pdf_text(pdf_path3, collection, text_splitter)
             st.session_state["documents_added"] = True
 
         # Display chat history
@@ -522,25 +545,14 @@ def page3():
             full_query = ask_bot(query)
             st.session_state['chat_history_page3'].append(("User", query, "new"))
 
+            # Query the CromA collection with error handling
+            try:
+                results = collection.query(query_texts=[full_query], n_results=5)
+                response = process_croma_results(results)
+            except Exception as query_exception:
+                log_error(f"CromA DB query error: {query_exception}") # Logging function to be implemented
+                response = "An error occurred while processing your query."
 
-            # Query the CromA collection
-            results = collection.query(
-                query_texts=[full_query],
-                n_results=5
-            )
-
-            # Process and display response from CromA results
-            if results and results['documents']:
-                try:
-                    # Accessing the first document of the first result
-                    top_document = results['documents'][0][0] # Adjusted access
-                    response = f"Top result: {top_document}"
-                except KeyError as ke:
-                    st.error(f"KeyError encountered: {ke}")
-                    response = "Error in processing the response."
-            else:
-                response = "No results found for your query."
-
             st.session_state['chat_history_page3'].append(("Eve", response, "new"))
 
 
@@ -551,7 +563,38 @@ def page3():
             st.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)
 
     except Exception as e:
-        st.error(f"An error occurred: {repr(e)}")
+        log_error(f"General error in page3: {e}") # Log general errors
+        st.error(f"An unexpected error occurred: {repr(e)}")
+
+def log_error(message):
+    """
+    Logs an error message. Can be enhanced to write to a file or external logging service.
+    """
+    # Example: Print to console, can be replaced with file logging or external service logging
+    print(message)
+
+def process_croma_results(results):
+    """
+    Process the query results from CromA DB and generate a response.
+    """
+    if results and results['documents']:
+        try:
+            # Example processing: Extract and concatenate texts from top documents
+            top_documents = results['documents'][0] # Adjusted access
+            response_texts = [doc['text'] for doc in top_documents if 'text' in doc]
+            response = " ".join(response_texts[:3]) # Limiting to top 3 documents for brevity
+        except KeyError as ke:
+            response = "Error in processing the response."
+    else:
+        response = "No results found for your query."
+    return response
+
+# TODO: Implement additional error handling and logging
+# TODO: Review for security and performance improvements
+
+# This is a modified snippet focusing on the querying and response handling for CromA DB.
+# The full integration requires updating the main application code.
+
 
 
 def page4():
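
For readers following the new ingestion path, the sketch below shows the chunk-and-store flow that preprocess_and_store_pdf_text implements, in isolation. It is a minimal sketch, assuming an in-memory chromadb client, langchain's RecursiveCharacterTextSplitter, and pypdf; the collection name demo_collection and the path example.pdf are placeholders, not values from app.py.

# --- Sketch: chunk-and-store flow (illustrative, not part of the commit) ---
import chromadb
from pypdf import PdfReader  # app.py may import PdfReader from PyPDF2 instead
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_pdf_text(file_path):
    # Concatenate the extractable text of every page; fall back to "" per page
    reader = PdfReader(file_path)
    return "".join(page.extract_text() or "" for page in reader.pages)

def preprocess_and_store_pdf_text(pdf_path, collection, text_splitter):
    # Split the PDF text into overlapping chunks and store each chunk as its own document
    chunks = text_splitter.split_text(text=load_pdf_text(pdf_path))
    for i, chunk in enumerate(chunks):
        collection.add(documents=[chunk], ids=[f"Chunk_{i+1}"])

if __name__ == "__main__":
    client = chromadb.Client()  # in-memory client, as in app.py
    collection = client.get_or_create_collection("demo_collection")  # hypothetical name
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200, length_function=len)
    preprocess_and_store_pdf_text("example.pdf", collection, splitter)  # hypothetical path
    print(collection.count(), "chunks stored")

Storing each chunk under its own id is what lets the query step later return only the most relevant passages instead of the whole PDF.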
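On the retrieval side, the companion sketch below queries the same collection and turns the result into a short response. In the chromadb releases I am aware of, query() returns results['documents'] as a list of lists of plain strings (one inner list per query text), so the illustrative summarize_query_results helper joins those strings directly; the helper name and the sample query text are placeholders, not taken from app.py.

# --- Sketch: query-and-respond flow (illustrative, not part of the commit) ---
import chromadb

def summarize_query_results(results, max_docs=3):
    # results["documents"] is assumed to be a list of lists of strings,
    # one inner list per query text; join the top matches for the first query.
    docs = results.get("documents") or []
    if not docs or not docs[0]:
        return "No results found for your query."
    return " ".join(docs[0][:max_docs])

if __name__ == "__main__":
    client = chromadb.Client()
    collection = client.get_or_create_collection("demo_collection")  # hypothetical name
    try:
        results = collection.query(query_texts=["Kostendaten"], n_results=5)  # sample query
        response = summarize_query_results(results)
    except Exception as exc:
        print(f"Chroma query error: {exc}")  # stand-in for a real logging call
        response = "An error occurred while processing your query."
    print(response)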