# Insurance_DocAI / app.py
import streamlit as st
import hashlib
import time
from pinecone import Pinecone
import google.generativeai as genai
# Import your data processing functions
from data_processor import (
    get_document_text,
    split_text_into_chunks,
    generate_embeddings,
    index_chunks_in_pinecone,
)
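
# Helpers imported from data_processor.py. Inferred from the call sites below
# (an assumption, not a documented contract): get_document_text(url) returns the
# extracted text, split_text_into_chunks(text) returns a list of strings,
# generate_embeddings(chunks) returns one vector per chunk, and
# index_chunks_in_pinecone(chunks, embeddings, index_name, namespace=...) upserts
# the vectors with each chunk's text stored under the "text" metadata key,
# which the query path below reads back via match.metadata['text'].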
# --- Page Configuration ---
st.set_page_config(
    page_title="Insurance DocAI 🤖",
    page_icon="📄",
    layout="wide",
)
# --- API and Client Initialization ---
# Use st.secrets for secure handling of API keys on Streamlit Cloud/Hugging Face
try:
    GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]
    genai.configure(api_key=GOOGLE_API_KEY)
    pc = Pinecone(api_key=PINECONE_API_KEY)
    INDEX_NAME = "hackrx-policy-index"
except Exception:
    st.error("Could not find API keys. Please add GOOGLE_API_KEY and PINECONE_API_KEY to the secrets management in your deployment environment.", icon="🚨")
    st.stop()
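
# When running locally, st.secrets reads .streamlit/secrets.toml, e.g.:
#   GOOGLE_API_KEY = "your-google-api-key"
#   PINECONE_API_KEY = "your-pinecone-api-key"
# On Streamlit Cloud or Hugging Face Spaces, set the same keys as deployment secrets.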
# --- Helper Functions (adapted from your main.py) ---
def create_doc_id_from_url(url: str) -> str:
    """Creates a stable SHA-256 hash of the URL to use as a document ID (namespace)."""
    return hashlib.sha256(url.encode('utf-8')).hexdigest()
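
# Example: create_doc_id_from_url("https://example.com/policy.pdf") always yields
# the same 64-character hex digest, so one URL maps to exactly one Pinecone namespace.
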
def generate_answer_with_gemini(question: str, context: str) -> str:
    """Generates an answer using Gemini based on the provided context."""
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    prompt = f"""
    You are an expert insurance policy analyst.
    Based ONLY on the context provided below from an insurance document, answer the user's question concisely.
    Do not use any external knowledge or make assumptions.
    If the answer cannot be found in the provided context, state that clearly.

    CONTEXT:
    ---
    {context}
    ---

    QUESTION: {question}

    ANSWER:
    """
    try:
        response = model.generate_content(prompt)
        # response.text raises if the model returned no candidates (e.g. a safety
        # block), so check response.parts before reading it.
        return response.text.strip() if response.parts else "The model's response was empty."
    except Exception as e:
        return f"An error occurred while generating the answer: {e}"
# --- Caching ---
# Use Streamlit's caching to avoid re-processing the same document repeatedly.
@st.cache_data(show_spinner=False)
def process_document(doc_url):
    """
    Full pipeline: downloads, chunks, embeds, and indexes a document.
    This function is cached, so it only runs once per URL.
    """
    with st.spinner(f"Processing document: {doc_url}... This may take a moment."):
        namespace = create_doc_id_from_url(doc_url)
        index = pc.Index(INDEX_NAME)

        # Check if the document is already processed by inspecting its namespace
        stats = index.describe_index_stats()
        if stats.get('namespaces', {}).get(namespace, {}).get('vector_count', 0) > 0:
            st.success(f"Document '{doc_url}' is already processed and ready for questions.")
            return namespace

        # Full processing pipeline
        document_text = get_document_text(doc_url)
        if not document_text:
            st.error("Failed to retrieve or extract text from the document.")
            return None

        chunks = split_text_into_chunks(document_text)
        if not chunks:
            st.error("Failed to split the document into chunks.")
            return None

        embeddings = generate_embeddings(chunks)
        if not embeddings:
            st.error("Failed to generate embeddings.")
            return None

        index_chunks_in_pinecone(chunks, embeddings, INDEX_NAME, namespace=namespace)
        st.success(f"Successfully processed and indexed document: {doc_url}")
        return namespace
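
# Caveat: st.cache_data also caches the None returned on failure, so a transient
# download error for a URL will persist until the cache is cleared (e.g. with
# st.cache_data.clear()).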
# --- Streamlit UI ---
st.title("📄 Insurance DocAI: Your Insurance Policy Expert")
st.markdown("Enter the URL of an insurance policy document (PDF) and ask questions about it.")
# Initialize session state for conversation history
if "messages" not in st.session_state:
st.session_state.messages = []
# Input for document URL
doc_url = st.text_input("Enter the Document URL", placeholder="https://your-document-url.pdf", key="doc_url_input")
# The namespace stays None until a document has been processed successfully.
namespace = None
if doc_url:
    # Process the document and get the namespace
    namespace = process_document(doc_url)
    if namespace:
        st.info("Document is ready. You can now ask questions below.")
# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Accept user input
if prompt := st.chat_input("Ask a question about the policy"):
    if not namespace:
        # No processed document means there is no namespace to query yet.
        st.warning("Please enter a document URL above and wait for it to finish processing before asking questions.")
        st.stop()

    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})
    # Display user message in chat message container
    with st.chat_message("user"):
        st.markdown(prompt)

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        with st.spinner("Thinking..."):
            # 1. Generate an embedding for the question
            question_embedding_response = genai.embed_content(
                model="models/embedding-001",
                content=prompt,
                task_type="retrieval_query",
            )
            question_embedding = question_embedding_response['embedding']
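            # models/embedding-001 returns 768-dimensional vectors; this assumes
            # the Pinecone index was created with dimension=768 to match.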

            # 2. Query Pinecone for relevant context
            index = pc.Index(INDEX_NAME)
            search_results = index.query(
                vector=question_embedding,
                top_k=5,
                include_metadata=True,
                namespace=namespace,
            )

            # 3. Assemble the context and generate the answer
            context_chunks = [match.metadata['text'] for match in search_results.matches]
            context = "\n\n".join(context_chunks)
            answer = generate_answer_with_gemini(prompt, context)
            message_placeholder.markdown(answer)

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": answer})
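
# To run locally (assuming streamlit, pinecone, and google-generativeai are
# installed): streamlit run app.py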