import os

import faiss
import numpy as np
import requests
import streamlit as st
import torch
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModel

# Hugging Face Inference API token, read from the environment
my_token = os.getenv('my_repo_token')
# Compute mean-pooled sentence embeddings with a pre-trained transformer.
# Note: the tokenizer and model are reloaded on every call; fine for a demo,
# but a cached loader would be faster for repeated queries.
def get_embeddings(texts, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the token embeddings into one vector per input text
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings
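
# Illustrative sketch, not called by the app: the same retrieval can use
# cosine similarity by L2-normalizing the embeddings and switching to an
# inner-product index (faiss.IndexFlatIP), which is often preferred for
# sentence-transformer embeddings.
def build_cosine_index(context_embeddings):
    vectors = np.asarray(context_embeddings, dtype='float32').copy()
    faiss.normalize_L2(vectors)  # in-place L2 normalization
    index = faiss.IndexFlatIP(vectors.shape[1])  # inner product == cosine on unit vectors
    index.add(vectors)
    return index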
# Function to find the most relevant context using FAISS
def find_most_relevant_context(contexts, question, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    # Get embeddings for the question and all contexts in one batch
    all_texts = [question] + contexts
    embeddings = get_embeddings(all_texts, model_name=model_name)

    # Separate the question embedding and context embeddings
    question_embedding = embeddings[0]
    context_embeddings = embeddings[1:]

    # Create a FAISS index and add context embeddings
    dimension = context_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(context_embeddings)

    # Search for the nearest neighbor to the question embedding
    _, indices = index.search(question_embedding.reshape(1, -1), 1)

    # Get the most relevant context
    most_relevant_index = indices[0][0]
    return contexts[most_relevant_index]
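
# Illustrative sketch, not called by the app: retrieving the k nearest
# contexts instead of only the single best one can give the model more
# evidence to answer from.
def find_top_k_contexts(contexts, question, k=3):
    all_texts = [question] + contexts
    embeddings = get_embeddings(all_texts)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings[1:])
    k = min(k, len(contexts))  # don't ask for more neighbors than stored vectors
    _, indices = index.search(embeddings[0].reshape(1, -1), k)
    return [contexts[i] for i in indices[0]]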
# Candidate Inference API endpoints; only API_URL_LLMA is used below.
API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b"
API_URL_2 = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1"
API_URL_LLMA = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
headers = {"Authorization": f"Bearer {my_token}"}
# Send a text-generation request to the Hugging Face Inference API.
def query(payload):
    response = requests.post(API_URL_LLMA, headers=headers, json=payload)
    return response.json()
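
# Illustrative sketch, not called by the app: the hosted Inference API
# returns HTTP 503 while a model is cold-loading, so a small retry loop
# makes the call more robust. The retry count and wait are example values.
def query_with_retry(payload, retries=3, wait_seconds=10):
    import time  # local import keeps this sketch self-contained
    response = None
    for _ in range(retries):
        response = requests.post(API_URL_LLMA, headers=headers, json=payload)
        if response.status_code != 503:
            break
        time.sleep(wait_seconds)  # give the model time to load, then retry
    return response.json()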
# Answer a question about the PDF content via the Inference API.
def answer_question_from_pdf(pdf_text, question):
    prompt = (
        "Based on this content: " + pdf_text
        + " The Question is: " + question
        + " Provide the answer with a max length of about 1500"
    )
    answer = query({"inputs": prompt})
    answer = answer[0]["generated_text"]
    # Keep only the text after the "Answer" marker, if the model emitted one
    marker = answer.find("Answer")
    if marker != -1:
        answer = answer[marker + len("Answer"):]
    return answer
# Extract the text of each PDF page into a list, one string per page.
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    pdf_arr = []
    for page in pdf_reader.pages:
        # extract_text() can return None for image-only pages
        pdf_arr.append(page.extract_text() or "")
    return pdf_arr
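
# Illustrative sketch, not called by the app: whole pages can be long
# contexts, so splitting each page into fixed-size word chunks gives
# finer-grained retrieval. The chunk size is an arbitrary example value.
def split_into_chunks(pages, words_per_chunk=200):
    chunks = []
    for page_text in pages:
        words = page_text.split()
        for start in range(0, len(words), words_per_chunk):
            chunks.append(" ".join(words[start:start + words_per_chunk]))
    return chunks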
# Streamlit app
st.title("PDF Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF, one string per page
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")

    # Text input for entering a question
    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if question:
            # Retrieve the most relevant page, then ask the model
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")