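"""Streamlit "PDF Explorer": upload a PDF, retrieve the page most relevant
to a question with FAISS + MiniLM embeddings, and answer the question via
the Hugging Face Inference API (Meta-Llama-3-8B-Instruct)."""
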
import os

import requests
import streamlit as st
import faiss
import numpy as np
import torch
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModel
# from transformers import pipeline

# Hugging Face API token, read from the environment
my_token = os.getenv('my_repo_token')

# Function to get sentence embeddings using a pre-trained model.
# Note: the tokenizer and model are re-loaded on every call; for repeated
# queries it is cheaper to load them once and reuse them.
def get_embeddings(texts, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean-pool over tokens, masking out padding so that shorter texts in
    # the batch are not skewed by pad-token hidden states
    mask = inputs['attention_mask'].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = (summed / counts).cpu().numpy()

    return embeddings
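
# Hedged shape check (hypothetical inputs; all-MiniLM-L6-v2 produces
# 384-dimensional vectors):
#
#   embs = get_embeddings(["hello world", "goodbye"])
#   embs.shape  # -> (2, 384)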

# Function to find the most relevant context for a question using FAISS
def find_most_relevant_context(contexts, question, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    # Embed the question and all contexts in a single batch
    all_texts = [question] + contexts
    embeddings = get_embeddings(all_texts, model_name=model_name)

    # Separate the question embedding from the context embeddings;
    # FAISS expects contiguous float32 arrays
    question_embedding = np.ascontiguousarray(embeddings[:1], dtype='float32')
    context_embeddings = np.ascontiguousarray(embeddings[1:], dtype='float32')

    # Build an exact (brute-force) L2 index over the contexts
    dimension = context_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(context_embeddings)

    # Retrieve the single nearest context to the question
    _, indices = index.search(question_embedding, 1)
    most_relevant_index = indices[0][0]
    return contexts[most_relevant_index]
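
# Hedged usage sketch of the retrieval helper in isolation
# (hypothetical strings, not part of the app flow):
#
#   pages = ["Cats are mammals.", "Paris is the capital of France."]
#   find_most_relevant_context(pages, "What is the capital of France?")
#   # -> "Paris is the capital of France."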

# Hugging Face Inference API endpoints; only the Llama 3 endpoint is queried
# below, the other two are kept as drop-in alternatives
API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b"
API_URL_2 = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1"
API_URL_LLMA = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
headers = {"Authorization": f"Bearer {my_token}"}
# pipe = pipeline("text-generation", model="mistralai/Mixtral-8x7B-v0.1", token=my_token)

def query(payload):
    response = requests.post(API_URL_LLMA, headers=headers, json=payload)
    # Fail fast on HTTP errors (bad token, model still loading, rate limit)
    # instead of failing later on an unexpected JSON shape
    response.raise_for_status()
    return response.json()
    # return pipe(payload)
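
# Hedged sketch of the expected response shape on success (a list of
# generation dicts; hypothetical prompt):
#
#   out = query({"inputs": "What is 2 + 2? Answer:"})
#   text = out[0]["generated_text"]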

# Answer a question from the PDF content by prompting the hosted model
def answer_question_from_pdf(pdf_text, question):
    prompt = ("Based on this content: " + pdf_text
              + " The Question is: " + question
              + " Provide the answer with a max length of about 1500.")
    result = query({"inputs": prompt})
    answer = result[0]["generated_text"]
    # Strip everything up to and including a literal "Answer" marker,
    # if the model emitted one
    marker = answer.find("Answer")
    if marker != -1:
        answer = answer[marker + len("Answer"):]
    return answer
# Function to extract text from the PDF, one string per page
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    pdf_arr = []
    for page in pdf_reader.pages:
        # extract_text() can return None on image-only pages; substitute ""
        pdf_arr.append(page.extract_text() or "")
    return pdf_arr
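
# Hedged note: no OCR fallback is applied here, so scanned/image-only pages
# come back as empty strings and cannot be retrieved as context.
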
# Streamlit app
st.title("PDF Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    pdf_arr = extract_text_from_pdf(uploaded_file)

    st.write("PDF Uploaded Successfully.")

    # Text input for entering a question
    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if question:
            # Retrieve the most relevant page first, then answer from it,
            # so retrieval only runs once a question has been submitted
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")