import hashlib

import streamlit as st
from pinecone import Pinecone
import google.generativeai as genai

# Import your data processing functions
from data_processor import (
    get_document_text,
    split_text_into_chunks,
    generate_embeddings,
    index_chunks_in_pinecone,
)
# --- Page Configuration ---
st.set_page_config(
    page_title="Insurance DocAI 🤖",
    page_icon="📄",
    layout="wide",
)
# --- API and Client Initialization ---
# Use st.secrets for secure handling of API keys on Streamlit Cloud / Hugging Face.
try:
    GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]
    genai.configure(api_key=GOOGLE_API_KEY)
    pc = Pinecone(api_key=PINECONE_API_KEY)
    INDEX_NAME = "hackrx-policy-index"
except Exception:
    st.error(
        "🚨 Could not find API keys. Please add them to the secrets "
        "management in your deployment environment.",
        icon="🚨",
    )
    st.stop()
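
# For reference, the corresponding .streamlit/secrets.toml (or Space secrets)
# entries look like this:
#
#   GOOGLE_API_KEY = "your-google-api-key"
#   PINECONE_API_KEY = "your-pinecone-api-key"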

# --- Helper Functions (adapted from your main.py) ---
def create_doc_id_from_url(url: str) -> str:
    """Creates a stable SHA-256 hash of the URL to use as a document ID (namespace)."""
    return hashlib.sha256(url.encode("utf-8")).hexdigest()
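
# The digest is a 64-character hex string, so the same URL always maps to the
# same Pinecone namespace and a re-submitted document reuses its existing vectors.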

def generate_answer_with_gemini(question: str, context: str) -> str:
    """Generates an answer using Gemini based on the provided context."""
    model = genai.GenerativeModel("gemini-1.5-flash-latest")
    prompt = f"""
You are an expert insurance policy analyst.
Based ONLY on the context provided below from an insurance document, answer the user's question concisely.
Do not use any external knowledge or make assumptions.
If the answer cannot be found in the provided context, state that clearly.

CONTEXT:
---
{context}
---

QUESTION: {question}

ANSWER:
"""
    try:
        response = model.generate_content(prompt)
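        # response.text raises when the candidate has no parts (for example, a
        # safety-blocked response), so check response.parts first.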
        return response.text.strip() if response.parts else "The model's response was empty."
    except Exception as e:
        return f"An error occurred while generating the answer: {e}"

# --- Caching ---
# Use Streamlit's caching to avoid re-processing the same document repeatedly.
@st.cache_data(show_spinner=False)
def process_document(doc_url):
    """
    Full pipeline: downloads, chunks, embeds, and indexes a document.
    This function is cached, so it only runs once per URL.
    """
    with st.spinner(f"Processing document: {doc_url}... This may take a moment."):
        namespace = create_doc_id_from_url(doc_url)
        index = pc.Index(INDEX_NAME)

        # Check whether the document was already processed by inspecting its namespace.
        ns_stats = index.describe_index_stats().namespaces.get(namespace)
        if ns_stats is not None and ns_stats.vector_count > 0:
            st.success(f"Document '{doc_url}' is already processed and ready for questions.")
            return namespace

        # Full processing pipeline
        document_text = get_document_text(doc_url)
        if not document_text:
            st.error("Failed to retrieve or extract text from the document.")
            return None

        chunks = split_text_into_chunks(document_text)
        if not chunks:
            st.error("Failed to split document into chunks.")
            return None

        embeddings = generate_embeddings(chunks)
        if not embeddings:
            st.error("Failed to generate embeddings.")
            return None

        index_chunks_in_pinecone(chunks, embeddings, INDEX_NAME, namespace=namespace)
        st.success(f"Successfully processed and indexed document: {doc_url}")
        return namespace
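
# st.cache_data keys the cache on the doc_url argument, so reruns with the same
# URL return the stored namespace without re-running the pipeline.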

# --- Streamlit UI ---
st.title("📄 Insurance DocAI: Your Insurance Policy Expert")
st.markdown("Enter the URL of an insurance policy document (PDF) and ask questions about it.")

# Initialize session state for conversation history
if "messages" not in st.session_state:
    st.session_state.messages = []
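
# Streamlit re-executes the whole script on every interaction; st.session_state
# persists across those reruns, which is what keeps the chat history alive.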

# Input for document URL
doc_url = st.text_input(
    "Enter the Document URL",
    placeholder="https://your-document-url.pdf",
    key="doc_url_input",
)

if doc_url:
    # Process the document and get the namespace.
    namespace = process_document(doc_url)
    if namespace:
        st.info("Document is ready. You can now ask questions below.")

        # Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])
        # Accept user input
        if prompt := st.chat_input("Ask a question about the policy"):
            # Add user message to chat history
            st.session_state.messages.append({"role": "user", "content": prompt})
            # Display user message in chat message container
            with st.chat_message("user"):
                st.markdown(prompt)

            # Display assistant response in chat message container
            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                with st.spinner("Thinking..."):
                    # 1. Generate an embedding for the question.
                    question_embedding_response = genai.embed_content(
                        model="models/embedding-001",
                        content=prompt,
                        task_type="retrieval_query",
                    )
                    question_embedding = question_embedding_response["embedding"]
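                    # embedding-001 produces 768-dimensional vectors; the
                    # Pinecone index must have been created with that dimension.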
                    # 2. Query Pinecone for the most relevant context chunks.
                    index = pc.Index(INDEX_NAME)
                    search_results = index.query(
                        vector=question_embedding,
                        top_k=5,
                        include_metadata=True,
                        namespace=namespace,
                    )
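                    # top_k=5 returns the five most similar chunks; each match
                    # carries its original chunk text under metadata["text"].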
                    # 3. Assemble the context and generate the answer.
                    context_chunks = [match.metadata["text"] for match in search_results.matches]
                    context = "\n\n".join(context_chunks)
                    answer = generate_answer_with_gemini(prompt, context)
                message_placeholder.markdown(answer)

            # Add assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": answer})