Anne31415 committed
Commit 4a9dfc8
1 Parent(s): e7ea365

Update app.py

Files changed (1)
  1. app.py +77 -34
app.py CHANGED
@@ -56,7 +56,7 @@ repo2 = Repository(
     clone_from="Anne31415/Chat_Store", # Replace with your repository URL
     token=os.environ["HUB_TOKEN"] # Use the secret token to authenticate
 )
-repo.git_pull() # Pull the latest changes (if any)
+repo2.git_pull() # Pull the latest changes (if any)
 
 
 # Step 2: Load the PDF File
@@ -69,16 +69,6 @@ pdf_path3 = "Private_Book/Kosten_Strukturdaten_RAG_vorbereited.pdf"
 api_key = os.getenv("OPENAI_API_KEY")
 # Retrieve the API key from st.secrets
 
-@st.cache_data
-def extract_text_from_pdf(pdf_path):
-    text = ""
-    reader = PdfReader(pdf_path)
-    for page in reader.pages:
-        text += page.extract_text() + " " # Concatenate text from each page
-    return text
-
-# Use the function to get pdf_text
-pdf_text = extract_text_from_pdf(pdf_path3)
 
 
 @st.cache_resource
@@ -126,6 +116,8 @@ def load_vector_store(file_path, store_name, force_reload=False):
     return VectorStore
 
 
+
+
 # Utility function to load text from a PDF
 def load_pdf_text(file_path):
     pdf_reader = PdfReader(file_path)
@@ -134,6 +126,22 @@ def load_pdf_text(file_path):
         text += page.extract_text() or "" # Add fallback for pages where text extraction fails
     return text
 
+
+
+# Utility function to load text from a PDF and split it into pages
+def load_pdf_text_by_page(file_path):
+    pdf_reader = PdfReader(file_path)
+    pages_text = []
+    for page in pdf_reader.pages:
+        # Extract text for each page and add it to the list
+        page_text = page.extract_text() or "" # Add fallback for pages where text extraction fails
+        pages_text.append(page_text)
+    return pages_text
+
+# Use the new function to get a list of texts, each representing a page
+pdf_pages = load_pdf_text_by_page(pdf_path3)
+
+
 def load_chatbot():
     #return load_qa_chain(llm=OpenAI(), chain_type="stuff")
     return load_qa_chain(llm=OpenAI(model_name="gpt-3.5-turbo-instruct"), chain_type="stuff")
@@ -245,7 +253,17 @@ def display_session_id():
     session_id = st.session_state['session_id']
     st.sidebar.markdown(f"**Ihre Session ID:** `{session_id}`")
     st.sidebar.markdown("Verwenden Sie diese ID als Referenz bei Mitteilungen oder Rückmeldungen.")
+
+def preprocess_and_store_pdf_text(pdf_path, collection, text_splitter):
+
+    # Load and split the PDF text
+    text = load_pdf_text(pdf_path)
+    chunks = text_splitter.split_text(text=text)
 
+    # Store each chunk as a separate document in CromA DB
+    for i, chunk in enumerate(chunks):
+        document_id = f"Chunk_{i+1}"
+        collection.add(documents=[chunk], ids=[document_id])
 
 def page1():
     try:
@@ -489,11 +507,19 @@ def page2():
 
 
 
+
+
 def page3():
     try:
         # Basic layout setup
        st.title("Kosten- und Strukturdaten der Krankenhäuser")
 
+
+        # Initialize text splitter
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200, length_function=len)
+
+
+
         # Initialize CromA client and handle collection
         chroma_client = chromadb.Client()
         try:
@@ -506,10 +532,7 @@ def page3():
 
         # Add documents to the collection if not already done
         if "documents_added" not in st.session_state:
-            collection.add(
-                documents=[pdf_text],
-                ids=[("Kosten_Strukturdaten0602204")]
-            )
+            preprocess_and_store_pdf_text(pdf_path3, collection, text_splitter)
             st.session_state["documents_added"] = True
 
         # Display chat history
@@ -522,25 +545,14 @@ def page3():
             full_query = ask_bot(query)
             st.session_state['chat_history_page3'].append(("User", query, "new"))
 
+            # Query the CromA collection with error handling
+            try:
+                results = collection.query(query_texts=[full_query], n_results=5)
+                response = process_croma_results(results)
+            except Exception as query_exception:
+                log_error(f"CromA DB query error: {query_exception}") # Logging function to be implemented
+                response = "An error occurred while processing your query."
 
-            # Query the CromA collection
-            results = collection.query(
-                query_texts=[full_query],
-                n_results=5
-            )
-
-            # Process and display response from CromA results
-            if results and results['documents']:
-                try:
-                    # Accessing the first document of the first result
-                    top_document = results['documents'][0][0] # Adjusted access
-                    response = f"Top result: {top_document}"
-                except KeyError as ke:
-                    st.error(f"KeyError encountered: {ke}")
-                    response = "Error in processing the response."
-            else:
-                response = "No results found for your query."
-
             st.session_state['chat_history_page3'].append(("Eve", response, "new"))
 
 
@@ -551,7 +563,38 @@ def page3():
             st.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)
 
     except Exception as e:
-        st.error(f"An error occurred: {repr(e)}")
+        log_error(f"General error in page3: {e}") # Log general errors
+        st.error(f"An unexpected error occurred: {repr(e)}")
+
+def log_error(message):
+    """
+    Logs an error message. Can be enhanced to write to a file or external logging service.
+    """
+    # Example: Print to console, can be replaced with file logging or external service logging
+    print(message)
+
+def process_croma_results(results):
+    """
+    Process the query results from CromA DB and generate a response.
+    """
+    if results and results['documents']:
+        try:
+            # Example processing: Extract and concatenate texts from top documents
+            top_documents = results['documents'][0] # Adjusted access
+            response_texts = [doc['text'] for doc in top_documents if 'text' in doc]
+            response = " ".join(response_texts[:3]) # Limiting to top 3 documents for brevity
+        except KeyError as ke:
+            response = "Error in processing the response."
+    else:
+        response = "No results found for your query."
+    return response
+
+# TODO: Implement additional error handling and logging
+# TODO: Review for security and performance improvements
+
+# This is a modified snippet focusing on the querying and response handling for CromA DB.
+# The full integration requires updating the main application code.
+
 
 
 def page4():
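
For readers following the new ingestion path, the sketch below shows the chunk-and-store flow that preprocess_and_store_pdf_text implements, in isolation. It is a minimal sketch, assuming an in-memory chromadb client, langchain's RecursiveCharacterTextSplitter, and pypdf; the collection name demo_collection and the path example.pdf are placeholders, not values from app.py.

# --- Sketch: chunk-and-store flow (illustrative, not part of the commit) ---
import chromadb
from pypdf import PdfReader  # app.py may import PdfReader from PyPDF2 instead
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_pdf_text(file_path):
    # Concatenate the extractable text of every page; fall back to "" per page
    reader = PdfReader(file_path)
    return "".join(page.extract_text() or "" for page in reader.pages)

def preprocess_and_store_pdf_text(pdf_path, collection, text_splitter):
    # Split the PDF text into overlapping chunks and store each chunk as its own document
    chunks = text_splitter.split_text(text=load_pdf_text(pdf_path))
    for i, chunk in enumerate(chunks):
        collection.add(documents=[chunk], ids=[f"Chunk_{i+1}"])

if __name__ == "__main__":
    client = chromadb.Client()  # in-memory client, as in app.py
    collection = client.get_or_create_collection("demo_collection")  # hypothetical name
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200, length_function=len)
    preprocess_and_store_pdf_text("example.pdf", collection, splitter)  # hypothetical path
    print(collection.count(), "chunks stored")

Storing each chunk under its own id is what lets the query step later return only the most relevant passages instead of the whole PDF.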
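On the retrieval side, the companion sketch below queries the same collection and turns the result into a short response. In the chromadb releases I am aware of, query() returns results['documents'] as a list of lists of plain strings (one inner list per query text), so the illustrative summarize_query_results helper joins those strings directly; the helper name and the sample query text are placeholders, not taken from app.py.

# --- Sketch: query-and-respond flow (illustrative, not part of the commit) ---
import chromadb

def summarize_query_results(results, max_docs=3):
    # results["documents"] is assumed to be a list of lists of strings,
    # one inner list per query text; join the top matches for the first query.
    docs = results.get("documents") or []
    if not docs or not docs[0]:
        return "No results found for your query."
    return " ".join(docs[0][:max_docs])

if __name__ == "__main__":
    client = chromadb.Client()
    collection = client.get_or_create_collection("demo_collection")  # hypothetical name
    try:
        results = collection.query(query_texts=["Kostendaten"], n_results=5)  # sample query
        response = summarize_query_results(results)
    except Exception as exc:
        print(f"Chroma query error: {exc}")  # stand-in for a real logging call
        response = "An error occurred while processing your query."
    print(response)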