File size: 3,659 Bytes
43bb7df
 
 
eefd852
8c6a92e
bbe7373
 
fcb237a
43bb7df
 
 
eefd852
bbe7373
43bb7df
bbe7373
c601db3
 
 
 
 
fcb237a
bbe7373
fcb237a
c601db3
 
 
 
 
 
 
 
 
fcb237a
c601db3
 
 
bbe7373
 
 
8c6a92e
 
 
 
 
 
 
 
 
 
 
c601db3
 
 
 
 
 
 
 
43bb7df
 
 
bbe7373
43bb7df
 
 
 
 
c601db3
 
 
 
 
 
 
43bb7df
c601db3
 
 
 
 
bbe7373
c601db3
 
 
43bb7df
c601db3
 
 
 
 
 
 
 
 
eefd852
c601db3
 
 
 
 
 
 
 
 
43bb7df
c601db3
 
 
 
 
4834e9d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import streamlit as st
import fitz  # PyMuPDF
from google.cloud import language_v1
import requests
import json
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# Load the environment variables from the .env file
load_dotenv()
google_api_key = os.getenv('GOOGLE_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')

# Initialize Pinecone
try:
    pc = Pinecone(api_key=pinecone_api_key)
except Exception as e:
    st.error(f"Error initializing Pinecone: {e}")
    st.stop()

index_name = 'pdf-analysis'

# list_indexes() can fail as well (for example on bad credentials or a
# network problem), so it is guarded the same way as the other Pinecone
# calls instead of letting a raw traceback reach the page.
try:
    existing_index_names = pc.list_indexes().names()
except Exception as e:
    st.error(f"Error listing Pinecone indexes: {e}")
    st.stop()

# Create the index on first run only; later runs reuse the existing one.
if index_name not in existing_index_names:
    try:
        pc.create_index(
            name=index_name,
            dimension=768,
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )
    except Exception as e:
        st.error(f"Error creating Pinecone index: {e}")
        st.stop()

# Function to analyze entities and get embeddings using the API key
def get_embeddings(text, api_key, timeout=30):
    """Send *text* to the Cloud Natural Language ``analyzeEntities`` endpoint.

    Args:
        text: Plain-text content to analyze.
        api_key: Google API key used to authenticate the request.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The decoded JSON response body, or ``None`` when the request could
        not be completed (the failure is reported with ``st.error``).

    NOTE(review): despite the name, the endpoint called here is entity
    analysis; callers read the ``entities`` key of the response. Confirm
    whether a dedicated embedding endpoint was intended.
    """
    if not api_key:
        st.error("Error getting embeddings: no Google API key was provided")
        return None

    url = "https://language.googleapis.com/v1/documents:analyzeEntities"
    headers = {
        "Content-Type": "application/json",
        # The key travels in a header rather than the query string because
        # requests includes the full URL in its HTTP error messages, which
        # are shown to the user below.
        "X-Goog-Api-Key": api_key,
    }
    data = {
        "document": {
            "type": "PLAIN_TEXT",
            "content": text
        },
        "encodingType": "UTF8"
    }
    try:
        # Without a timeout a stalled connection would hang the app forever.
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        st.error(f"Error getting embeddings: {e}")
        return None

# Streamlit app
st.title("Chat with Your Document")
st.write("Upload a PDF file to chat with its content using Google's Language API and Pinecone.")

# File upload
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    try:
        # Extract the text of every page. The context manager closes the
        # document even if extraction fails part-way through.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf_document:
            pdf_text = "".join(
                pdf_document.load_page(page_num).get_text()
                for page_num in range(pdf_document.page_count)
            )

        if not pdf_text.strip():
            # Nothing to send to the language API; stop before making a
            # request with empty content.
            st.warning("No extractable text was found in this PDF.")
            st.stop()

        # Analyze the PDF text
        embeddings = get_embeddings(pdf_text, google_api_key)
        if embeddings is None:
            st.stop()
        entities = embeddings.get('entities', [])

        # NOTE(review): the values stored below are the entity records
        # returned by analyzeEntities, and the query further down uses an
        # entity name (a string). The index is created with dimension=768,
        # so Pinecone expects lists of 768 floats here; a real embedding
        # source is needed before upsert/query can succeed.
        vectors = [(str(i), entity) for i, entity in enumerate(entities)]
        # Remember which entity each id refers to so matches can be shown
        # as text; query matches do not carry a 'text' field.
        entity_names = {str(i): entity.get('name', '') for i, entity in enumerate(entities)}

        # Connect to the Pinecone index and store the document's entries
        index = pc.Index(index_name)
        if vectors:
            index.upsert(vectors)

        # Chat with the document
        user_input = st.text_input("Ask a question about the document:")
        if st.button("Ask"):
            if user_input:
                # Analyze the user query
                user_query_embeddings = get_embeddings(user_input, google_api_key)
                if user_query_embeddings is None:
                    st.stop()
                query_entities = user_query_embeddings.get('entities', [])
                if not query_entities:
                    # Avoid an IndexError when the question has no entities.
                    st.write("No entities were recognized in the question. Please try rephrasing it.")
                else:
                    query_vector = query_entities[0]['name']

                    # Perform similarity search. The client requires keyword
                    # arguments for query().
                    results = index.query(vector=query_vector, top_k=5)
                    response_lines = ["Relevant information from the document:"]
                    for result in results['matches']:
                        # Fall back to the raw id for entries that were not
                        # written during this run.
                        match_text = entity_names.get(result['id'], result['id'])
                        response_lines.append(f"Text: {match_text}, Score: {result['score']}")

                    st.write("\n".join(response_lines))
            else:
                st.write("Please enter a question to ask.")

        # Display the PDF text
        st.write("Extracted Text from PDF:")
        st.write(pdf_text)
    except Exception as e:
        st.error(f"Error processing PDF file: {e}")
##