Spaces:

Waseemhassan771
/

chat_document

Running

App Files Files Community

chat_document / app.py

Waseemhassan771

Update app.py

4834e9d verified 6 months ago

raw

history blame contribute delete

3.66 kB

	import os
	import streamlit as st
	import fitz # PyMuPDF
	from google.cloud import language_v1
	import requests
	import json
	from dotenv import load_dotenv
	from pinecone import Pinecone, ServerlessSpec

	# Load the environment variables from the .env file
	load_dotenv()
	google_api_key = os.getenv('GOOGLE_API_KEY')
	pinecone_api_key = os.getenv('PINECONE_API_KEY')

	# Initialize Pinecone. On failure the error is rendered in the page and
	# st.stop() ends this script run, so `pc` is always bound past this point.
	try:
	    pc = Pinecone(api_key=pinecone_api_key)
	except Exception as e:
	    st.error(f"Error initializing Pinecone: {e}")
	    st.stop()

	index_name = 'pdf-analysis'

	# Listing the indexes can fail as well (e.g. rejected credentials or no
	# connectivity); report it the same way as the other Pinecone calls instead
	# of letting a raw traceback reach the page.
	try:
	    existing_index_names = pc.list_indexes().names()
	except Exception as e:
	    st.error(f"Error listing Pinecone indexes: {e}")
	    st.stop()

	# Create the index on first use only; later runs find it in the listing.
	if index_name not in existing_index_names:
	    try:
	        pc.create_index(
	            name=index_name,
	            dimension=768,
	            metric='euclidean',
	            spec=ServerlessSpec(
	                cloud='aws',
	                region='us-west-2',
	            ),
	        )
	    except Exception as e:
	        st.error(f"Error creating Pinecone index: {e}")
	        st.stop()

	# Call the Google Natural Language analyzeEntities endpoint with an API key
	def get_embeddings(text, api_key, timeout=30):
	    """Send *text* to the analyzeEntities endpoint and return the parsed JSON.

	    Args:
	        text: Plain-text content to analyse.
	        api_key: Google API key used to authenticate the request.
	        timeout: Seconds to wait for the service before giving up, so a
	            stalled connection cannot hang the Streamlit script run.

	    Returns:
	        The decoded JSON response body, or ``None`` when the request fails
	        (the failure is reported to the page with ``st.error``).

	    NOTE(review): despite the function name, analyzeEntities is documented
	    as an entity-extraction endpoint whose response lists entity records,
	    not numeric embedding vectors -- confirm this is the intended service.
	    """
	    url = "https://language.googleapis.com/v1/documents:analyzeEntities"
	    headers = {
	        "Content-Type": "application/json",
	        # Send the key as a header rather than in the query string: the
	        # text of a requests HTTP error includes the request URL, and that
	        # text is displayed to the user below.
	        "x-goog-api-key": str(api_key),
	    }
	    data = {
	        "document": {
	            "type": "PLAIN_TEXT",
	            "content": text,
	        },
	        "encodingType": "UTF8",
	    }
	    try:
	        response = requests.post(url, headers=headers, json=data, timeout=timeout)
	        response.raise_for_status()
	        return response.json()
	    except requests.exceptions.RequestException as e:
	        st.error(f"Error getting embeddings: {e}")
	        return None

	# Streamlit app
	st.title("Chat with Your Document")
	st.write("Upload a PDF file to chat with its content using Google's Language API and Pinecone.")

	# File upload
	uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

	if uploaded_file is not None:
	    try:
	        # getvalue() returns the complete upload regardless of the buffer's
	        # current read position; the context manager closes the document
	        # once its text has been extracted.
	        with fitz.open(stream=uploaded_file.getvalue(), filetype="pdf") as pdf_document:
	            pdf_text = "".join(page.get_text() for page in pdf_document)

	        if not pdf_text.strip():
	            # Nothing to analyse (for example an image-only PDF).
	            st.warning("No extractable text was found in this PDF.")
	            st.stop()

	        # Analyse the PDF text
	        embeddings = get_embeddings(pdf_text, google_api_key)
	        if embeddings is None:
	            st.stop()

	        # NOTE(review): the items under 'entities' are entity records from
	        # the Language API, and the query below uses an entity *name*; neither
	        # is a list of floats, so Pinecone is expected to reject both the
	        # upsert and the query until a real embedding source is wired in.
	        # The 'entities' key is read defensively because it may be absent
	        # when nothing was detected -- TODO confirm against the API response.
	        entities = embeddings.get('entities') or []
	        vectors = [(str(position), entity) for position, entity in enumerate(entities)]

	        # Connect to the Pinecone index and store the document data
	        index = pc.Index(index_name)
	        if vectors:
	            index.upsert(vectors)

	        # Chat with the document
	        user_input = st.text_input("Ask a question about the document:")
	        if st.button("Ask"):
	            if user_input:
	                # Analyse the user query
	                user_query_embeddings = get_embeddings(user_input, google_api_key)
	                if user_query_embeddings is None:
	                    st.stop()

	                query_entities = user_query_embeddings.get('entities') or []
	                if not query_entities:
	                    # Previously an IndexError here was reported as a PDF
	                    # processing failure, which pointed at the wrong cause.
	                    st.write("No entities were found in the question. Please rephrase it.")
	                else:
	                    query_vector = query_entities[0]['name']

	                    # Perform similarity search. The current Pinecone client
	                    # only accepts keyword arguments for query().
	                    results = index.query(
	                        vector=query_vector,
	                        top_k=5,
	                        include_metadata=True,
	                    )
	                    lines = ["Relevant information from the document:"]
	                    for match in results['matches']:
	                        # Matches expose id/score/metadata rather than a
	                        # 'text' field; fall back to the id when the vector
	                        # was stored without a metadata 'text' entry.
	                        metadata = match.get('metadata') or {}
	                        text = metadata.get('text', match['id'])
	                        lines.append(f"Text: {text}, Score: {match['score']}")

	                    st.write("\n".join(lines))
	            else:
	                st.write("Please enter a question to ask.")

	        # Display the PDF text
	        st.write("Extracted Text from PDF:")
	        st.write(pdf_text)
	    except Exception as e:
	        st.error(f"Error processing PDF file: {e}")
	##