Spaces:
Sleeping
Sleeping
File size: 3,659 Bytes
43bb7df eefd852 8c6a92e bbe7373 fcb237a 43bb7df eefd852 bbe7373 43bb7df bbe7373 c601db3 fcb237a bbe7373 fcb237a c601db3 fcb237a c601db3 bbe7373 8c6a92e c601db3 43bb7df bbe7373 43bb7df c601db3 43bb7df c601db3 bbe7373 c601db3 43bb7df c601db3 eefd852 c601db3 43bb7df c601db3 4834e9d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import os
import streamlit as st
import fitz # PyMuPDF
from google.cloud import language_v1
import requests
import json
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
# Merge any .env file into the process environment, then read the two API
# keys the app uses (os.getenv yields None for a variable that is not set).
load_dotenv()
google_api_key, pinecone_api_key = (
    os.getenv(var_name) for var_name in ('GOOGLE_API_KEY', 'PINECONE_API_KEY')
)
# Initialize the Pinecone client and make sure the index used by this app
# exists. Every failure is reported in the Streamlit UI and halts the script.
try:
    pc = Pinecone(api_key=pinecone_api_key)
except Exception as e:
    st.error(f"Error initializing Pinecone: {e}")
    st.stop()

index_name = 'pdf-analysis'

# Listing indexes talks to the Pinecone service, so it gets the same
# report-and-stop handling as the other Pinecone calls instead of surfacing
# as an unhandled traceback.
try:
    existing_index_names = pc.list_indexes().names()
except Exception as e:
    st.error(f"Error listing Pinecone indexes: {e}")
    st.stop()

if index_name not in existing_index_names:
    try:
        pc.create_index(
            name=index_name,
            dimension=768,
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2',
            ),
        )
    except Exception as e:
        st.error(f"Error creating Pinecone index: {e}")
        st.stop()
# Call Google's Natural Language entity-analysis endpoint for a piece of text.
def get_embeddings(text, api_key, timeout=30):
    """Send ``text`` to the ``documents:analyzeEntities`` endpoint.

    Despite the function name, the endpoint requested here is the entity
    analysis one; the decoded JSON body is handed back unchanged.

    Args:
        text: Plain-text content to analyze.
        api_key: Google API key used to authenticate the request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response, or ``None`` when the request fails (the
        failure is shown to the user through ``st.error``).
    """
    url = "https://language.googleapis.com/v1/documents:analyzeEntities"
    headers = {
        "Content-Type": "application/json",
        # The key travels in a header rather than the query string: requests
        # puts the request URL into its HTTP error messages, and the handler
        # below displays the exception text, which would expose the key.
        "X-Goog-Api-Key": api_key,
    }
    data = {
        "document": {
            "type": "PLAIN_TEXT",
            "content": text,
        },
        "encodingType": "UTF8",
    }
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the Streamlit script indefinitely.
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        st.error(f"Error getting embeddings: {e}")
        return None
# Streamlit app: upload a PDF, analyze its text, push the result to Pinecone,
# and let the user query the index.
st.title("Chat with Your Document")
st.write("Upload a PDF file to chat with its content using Google's Language API and Pinecone.")

# File upload
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    try:
        # Open the PDF in a context manager so the document is closed even
        # when text extraction raises.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf_document:
            pdf_text = "".join(page.get_text() for page in pdf_document)

        # Analyze the PDF text
        embeddings = get_embeddings(pdf_text, google_api_key)
        if embeddings is None:
            st.stop()

        # The response may carry no 'entities' list; treat that as empty
        # instead of failing with a KeyError.
        # NOTE(review): these items come from an entity-analysis response,
        # while the index is created with dimension=768 earlier in this file.
        # Confirm they are in the (id, vector) form Pinecone expects.
        document_entities = embeddings.get('entities', [])
        vectors = [(str(i), embedding) for i, embedding in enumerate(document_entities)]

        # Connect to the Pinecone index and store the items (nothing to send
        # when the analysis produced no entities).
        index = pc.Index(index_name)
        if vectors:
            index.upsert(vectors)

        # Chat with the document
        user_input = st.text_input("Ask a question about the document:")
        if st.button("Ask"):
            if user_input:
                # Analyze the user query
                user_query_embeddings = get_embeddings(user_input, google_api_key)
                if user_query_embeddings is None:
                    st.stop()

                query_entities = user_query_embeddings.get('entities', [])
                if not query_entities:
                    # Previously this case crashed with an IndexError on [0].
                    st.write("No entities were found in the question; please try rephrasing it.")
                else:
                    # NOTE(review): this uses the first entity's 'name' as the
                    # query vector; Pinecone documents `vector` as a list of
                    # floats. Confirm what should be sent here.
                    query_vector = query_entities[0]['name']

                    # Perform similarity search; the vector is passed by
                    # keyword, as the Pinecone client's query() expects.
                    results = index.query(vector=query_vector, top_k=5)

                    # NOTE(review): assumes each match exposes a 'text' field;
                    # verify against the Pinecone query response shape.
                    response_lines = ["Relevant information from the document:"]
                    response_lines.extend(
                        f"Text: {result['text']}, Score: {result['score']}"
                        for result in results['matches']
                    )
                    st.write("\n".join(response_lines).strip())
            else:
                st.write("Please enter a question to ask.")

        # Display the PDF text
        st.write("Extracted Text from PDF:")
        st.write(pdf_text)
    except Exception as e:
        st.error(f"Error processing PDF file: {e}")
## |