import hashlib

import streamlit as st
from pinecone import Pinecone
import google.generativeai as genai

# Import your data processing functions
from data_processor import (
    get_document_text,
    split_text_into_chunks,
    generate_embeddings,
    index_chunks_in_pinecone,
)
# --- Page Configuration ---
st.set_page_config(
    page_title="Insurance DocAI 🤖",
    page_icon="📄",
    layout="wide",
)
# --- API and Client Initialization ---
# Use st.secrets for secure handling of API keys on Streamlit Cloud / Hugging Face.
try:
    GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]
    genai.configure(api_key=GOOGLE_API_KEY)
    pc = Pinecone(api_key=PINECONE_API_KEY)
    INDEX_NAME = "hackrx-policy-index"
except Exception:
    st.error(
        "🚨 Could not find API keys. Please add them to the secrets "
        "management in your deployment environment.",
        icon="🚨",
    )
    st.stop()
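
# For reference, the corresponding .streamlit/secrets.toml (or Space secrets)
# entries look like this:
#
#   GOOGLE_API_KEY = "your-google-api-key"
#   PINECONE_API_KEY = "your-pinecone-api-key"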

# --- Helper Functions (adapted from your main.py) ---
def create_doc_id_from_url(url: str) -> str:
    """Creates a stable SHA-256 hash of the URL to use as a document ID (namespace)."""
    return hashlib.sha256(url.encode("utf-8")).hexdigest()
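
# The digest is a 64-character hex string, so the same URL always maps to the
# same Pinecone namespace and a re-submitted document reuses its existing vectors.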

def generate_answer_with_gemini(question: str, context: str) -> str:
    """Generates an answer using Gemini based on the provided context."""
    model = genai.GenerativeModel("gemini-1.5-flash-latest")
    prompt = f"""
You are an expert insurance policy analyst.
Based ONLY on the context provided below from an insurance document, answer the user's question concisely.
Do not use any external knowledge or make assumptions.
If the answer cannot be found in the provided context, state that clearly.

CONTEXT:
---
{context}
---

QUESTION: {question}

ANSWER:
"""
    try:
        response = model.generate_content(prompt)
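        # response.text raises when the candidate has no parts (for example, a
        # safety-blocked response), so check response.parts first.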
        return response.text.strip() if response.parts else "The model's response was empty."
    except Exception as e:
        return f"An error occurred while generating the answer: {e}"

# --- Caching ---
# Use Streamlit's caching to avoid re-processing the same document repeatedly.
@st.cache_data(show_spinner=False)
def process_document(doc_url):
    """
    Full pipeline: downloads, chunks, embeds, and indexes a document.
    This function is cached, so it only runs once per URL.
    """
    with st.spinner(f"Processing document: {doc_url}... This may take a moment."):
        namespace = create_doc_id_from_url(doc_url)
        index = pc.Index(INDEX_NAME)

        # Check whether the document was already processed by inspecting its namespace.
        ns_stats = index.describe_index_stats().namespaces.get(namespace)
        if ns_stats is not None and ns_stats.vector_count > 0:
            st.success(f"Document '{doc_url}' is already processed and ready for questions.")
            return namespace

        # Full processing pipeline
        document_text = get_document_text(doc_url)
        if not document_text:
            st.error("Failed to retrieve or extract text from the document.")
            return None

        chunks = split_text_into_chunks(document_text)
        if not chunks:
            st.error("Failed to split document into chunks.")
            return None

        embeddings = generate_embeddings(chunks)
        if not embeddings:
            st.error("Failed to generate embeddings.")
            return None

        index_chunks_in_pinecone(chunks, embeddings, INDEX_NAME, namespace=namespace)
        st.success(f"Successfully processed and indexed document: {doc_url}")
        return namespace
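
# st.cache_data keys the cache on the doc_url argument, so reruns with the same
# URL return the stored namespace without re-running the pipeline.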

# --- Streamlit UI ---
st.title("📄 Insurance DocAI: Your Insurance Policy Expert")
st.markdown("Enter the URL of an insurance policy document (PDF) and ask questions about it.")

# Initialize session state for conversation history
if "messages" not in st.session_state:
    st.session_state.messages = []
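
# Streamlit re-executes the whole script on every interaction; st.session_state
# persists across those reruns, which is what keeps the chat history alive.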

# Input for document URL
doc_url = st.text_input(
    "Enter the Document URL",
    placeholder="https://your-document-url.pdf",
    key="doc_url_input",
)

if doc_url:
    # Process the document and get the namespace.
    namespace = process_document(doc_url)
    if namespace:
        st.info("Document is ready. You can now ask questions below.")

        # Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])
        # Accept user input
        if prompt := st.chat_input("Ask a question about the policy"):
            # Add user message to chat history
            st.session_state.messages.append({"role": "user", "content": prompt})
            # Display user message in chat message container
            with st.chat_message("user"):
                st.markdown(prompt)

            # Display assistant response in chat message container
            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                with st.spinner("Thinking..."):
                    # 1. Generate an embedding for the question.
                    question_embedding_response = genai.embed_content(
                        model="models/embedding-001",
                        content=prompt,
                        task_type="retrieval_query",
                    )
                    question_embedding = question_embedding_response["embedding"]
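                    # embedding-001 produces 768-dimensional vectors; the
                    # Pinecone index must have been created with that dimension.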
                    # 2. Query Pinecone for the most relevant context chunks.
                    index = pc.Index(INDEX_NAME)
                    search_results = index.query(
                        vector=question_embedding,
                        top_k=5,
                        include_metadata=True,
                        namespace=namespace,
                    )
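                    # top_k=5 returns the five most similar chunks; each match
                    # carries its original chunk text under metadata["text"].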
                    # 3. Assemble the context and generate the answer.
                    context_chunks = [match.metadata["text"] for match in search_results.matches]
                    context = "\n\n".join(context_chunks)
                    answer = generate_answer_with_gemini(prompt, context)
                message_placeholder.markdown(answer)

            # Add assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": answer})