import streamlit as st
from PyPDF2 import PdfReader
import pytesseract
from PIL import Image
import fitz
import io
# from transformers import pipeline
import requests
import os
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
# Hugging Face access token, read from the environment so it is never
# hard-coded in the source; None when the variable is not set.
my_token = os.environ.get('my_repo_token')
# Function to get embeddings using a pre-trained model
def get_embeddings(texts, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Encode one or more texts into sentence embeddings.

    The tokenizer and model named by ``model_name`` are loaded, the texts are
    tokenized as a single padded/truncated batch, and the token embeddings of
    each text are averaged into one vector.

    Args:
        texts: A string or a list of strings to encode.
        model_name: Hugging Face model identifier passed to
            ``AutoTokenizer.from_pretrained`` and ``AutoModel.from_pretrained``.

    Returns:
        A NumPy array with one embedding row per input text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Shorter texts in the batch are padded; weight every token by its
    # attention mask so padding positions do not dilute the average.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against a division by zero for an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).cpu().numpy()
# Function to find the most relevant context using FAISS
def find_most_relevant_context(contexts, question, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Return the context whose embedding is closest to the question's.

    The question and all contexts are embedded in one batch, the context
    embeddings are stored in a flat L2 FAISS index, and the single nearest
    neighbour of the question embedding is looked up.

    Args:
        contexts: Sequence of candidate text passages.
        question: The question to match against the passages.
        model_name: Embedding model identifier forwarded to ``get_embeddings``.

    Returns:
        The best-matching element of ``contexts``, or an empty string when
        ``contexts`` is empty.
    """
    # Nothing to search: avoid building an index with zero vectors.
    if not contexts:
        return ""

    # Embed the question together with the contexts so they share one batch.
    all_texts = [question] + list(contexts)
    embeddings = get_embeddings(all_texts, model_name=model_name)
    # FAISS expects C-contiguous float32 matrices.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    question_embedding = embeddings[:1]
    context_embeddings = embeddings[1:]

    index = faiss.IndexFlatL2(context_embeddings.shape[1])
    # The index must actually contain the context vectors; searching an empty
    # index yields -1, which would silently pick the last context.
    index.add(context_embeddings)

    _, indices = index.search(question_embedding, 1)
    most_relevant_index = int(indices[0][0])
    return contexts[most_relevant_index]
# Hugging Face Inference API endpoints for the candidate text-generation models.
# Only API_URL_LLMA is referenced by query() in this file; the other two are
# not used by the code visible here.
API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b"
API_URL_2 = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1"
API_URL_LLMA = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
# Bearer-token header sent with every inference request.
# NOTE(review): if my_token is None this becomes "Bearer None" - confirm the
# environment variable is always set in deployment.
headers = {"Authorization": f"Bearer {my_token}"}
# Disabled alternative: run the model locally through a transformers pipeline.
# pipe = pipeline("text-generation", model="mistralai/Mixtral-8x7B-v0.1", token = my_token)
def query(payload, timeout=60):
    """POST ``payload`` to the Llama inference endpoint and return the decoded JSON.

    Args:
        payload: JSON-serialisable request body (e.g. ``{"inputs": "..."}``).
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the app indefinitely.

    Returns:
        The parsed JSON response body.
    """
    response = requests.post(
        API_URL_LLMA,
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    return response.json()
# Answer a question about the PDF using the hosted text-generation model
def answer_question_from_pdf(pdf_text, question):
    """Ask the text-generation model to answer ``question`` from ``pdf_text``.

    Args:
        pdf_text: The PDF passage to use as context for the model.
        question: The user's question.

    Returns:
        The generated text following the first "Answer" marker, or the whole
        generated continuation when the model did not emit that marker.

    Raises:
        RuntimeError: If the inference API returns an error payload or an
            empty result instead of generated text.
    """
    prompt = (
        "Based on this content: " + pdf_text
        + " The Question is: " + question
        + " Provide the answer with max lenghth of about 1500"
    )
    response = query({"inputs": prompt})

    # A successful call yields a list of generations; a dict is handled here
    # as an error payload (e.g. the model is still loading).
    if isinstance(response, dict):
        raise RuntimeError(f"Inference API error: {response.get('error', response)}")
    if not response:
        raise RuntimeError("Inference API returned no generated text")

    generated = response[0]["generated_text"]
    # Drop the echoed prompt so an "Answer" occurring inside the PDF text or
    # the question is not mistaken for the model's answer marker.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]

    # partition() leaves the text intact when the marker is missing, unlike
    # find() + slice, which would cut off the first characters on -1.
    _, marker, tail = generated.partition("Answer")
    return tail if marker else generated
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: A path or binary file-like object accepted by ``PdfReader``.

    Returns:
        A list with one string per page that has extractable text, in page
        order. Pages that yield no text are skipped.
    """
    pdf_reader = PdfReader(pdf_file)
    pdf_arr = []
    for page in pdf_reader.pages:
        # Treat a missing extraction result as empty text.
        page_text = page.extract_text() or ""
        if page_text.strip():
            pdf_arr.append(page_text)
    return pdf_arr
# Streamlit app
st.title("PDF Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # Extract text from uploaded PDF (one entry per page)
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")

    # Text input for entering a question
    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if not question:
            st.write("Please enter a question.")
        elif not pdf_arr:
            # Without any page text there is nothing to retrieve from.
            st.write("No extractable text was found in this PDF.")
        else:
            # Retrieval is done only on demand, so the embedding model is not
            # run on every rerun or for an empty question.
            pdf_text = find_most_relevant_context(pdf_arr, question)
            # Get the answer from the backend
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
else:
    st.write("Please upload a PDF file.")