# chat_document / app.py
# Waseemhassan771's picture
# Update app.py
# 4834e9d verified
import os
import streamlit as st
import fitz # PyMuPDF
from google.cloud import language_v1
import requests
import json
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
# Load the environment variables from the .env file
load_dotenv()
google_api_key = os.getenv('GOOGLE_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')

# Initialize Pinecone; any failure is reported in the UI and halts the script.
try:
    pc = Pinecone(api_key=pinecone_api_key)
except Exception as e:
    st.error(f"Error initializing Pinecone: {e}")
    st.stop()

index_name = 'pdf-analysis'

# Look up the existing index names under the same error handling as the other
# Pinecone calls, so a failure here is reported in the UI instead of surfacing
# as an unhandled traceback.
try:
    existing_index_names = pc.list_indexes().names()
except Exception as e:
    st.error(f"Error listing Pinecone indexes: {e}")
    st.stop()

# Create the index only when it does not exist yet.
if index_name not in existing_index_names:
    try:
        pc.create_index(
            name=index_name,
            dimension=768,
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2',
            ),
        )
    except Exception as e:
        st.error(f"Error creating Pinecone index: {e}")
        st.stop()
# Function to analyze entities and get embeddings using the API key
def get_embeddings(text, api_key, timeout=30):
    """Post *text* to the Google Natural Language ``analyzeEntities`` endpoint.

    NOTE(review): despite the function name, the request targets
    ``documents:analyzeEntities``; the return value is that call's decoded
    JSON body, not a list of numeric vectors. Confirm callers that treat the
    result as embeddings.

    Args:
        text: Plain-text content to analyze.
        api_key: Google API key used to authenticate the request.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the app indefinitely.

    Returns:
        The decoded JSON response, or ``None`` when the request fails or the
        body is not valid JSON (the error is reported through ``st.error``).
    """
    url = "https://language.googleapis.com/v1/documents:analyzeEntities"
    headers = {
        "Content-Type": "application/json",
    }
    if api_key:
        # Send the key as a header instead of a ``?key=`` query parameter:
        # the exception text shown to the user below includes the request
        # URL, so a key embedded in the URL would be displayed in the UI.
        headers["X-Goog-Api-Key"] = api_key
    data = {
        "document": {
            "type": "PLAIN_TEXT",
            "content": text,
        },
        "encodingType": "UTF8",
    }
    try:
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        st.error(f"Error getting embeddings: {e}")
        return None
# Streamlit app: upload a PDF, extract its text, push the analysis results to
# Pinecone, and let the user query the index.
st.title("Chat with Your Document")
st.write("Upload a PDF file to chat with its content using Google's Language API and Pinecone.")

# File upload
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
if uploaded_file is not None:
    try:
        # Load the PDF file. The context manager closes the document as soon
        # as the text has been extracted, even if a page fails to load.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf_document:
            pdf_text = "".join(
                pdf_document.load_page(page_num).get_text()
                for page_num in range(pdf_document.page_count)
            )

        # Get embeddings for the PDF text
        embeddings = get_embeddings(pdf_text, google_api_key)
        if embeddings is None:
            st.stop()

        # NOTE(review): the query path below indexes an entity with ['name'],
        # so each item under 'entities' is a mapping, not a list of floats.
        # Pinecone presumably rejects these as vector values -- confirm and
        # replace with a real embedding source.
        entities = embeddings.get('entities', [])
        vectors = [(str(i), embedding) for i, embedding in enumerate(entities)]

        # Create or connect to Pinecone index
        index = pc.Index(index_name)
        if vectors:
            # Skip the call entirely when there is nothing to store.
            index.upsert(vectors)

        # Chat with the document
        user_input = st.text_input("Ask a question about the document:")
        if st.button("Ask"):
            if user_input:
                # Get embeddings for the user query
                user_query_embeddings = get_embeddings(user_input, google_api_key)
                if user_query_embeddings is None:
                    st.stop()

                query_entities = user_query_embeddings.get('entities', [])
                if not query_entities:
                    # Avoid an IndexError/KeyError when the response carries
                    # no entities for the question.
                    st.write("No entities were found in your question. Please rephrase it.")
                else:
                    # NOTE(review): this is the first entity's 'name' value,
                    # not a numeric vector -- confirm what the index expects.
                    query_vector = query_entities[0]['name']

                    # Perform similarity search. The vector is passed by
                    # keyword: the Pinecone client used here does not accept
                    # positional arguments to query().
                    results = index.query(vector=query_vector, top_k=5)

                    response_text = "Relevant information from the document:\n"
                    for result in results['matches']:
                        # NOTE(review): vectors are upserted above as
                        # (id, values) pairs without metadata; confirm that a
                        # match actually exposes a 'text' field.
                        response_text += f"Text: {result['text']}, Score: {result['score']}\n"
                    st.write(response_text.strip())
            else:
                st.write("Please enter a question to ask.")

        # Display the PDF text
        st.write("Extracted Text from PDF:")
        st.write(pdf_text)
    except Exception as e:
        st.error(f"Error processing PDF file: {e}")
##