|
import streamlit as st |
|
from PyPDF2 import PdfReader |
|
import pytesseract |
|
from PIL import Image |
|
import fitz |
|
import io |
|
|
|
import requests |
|
import os |
|
|
|
import faiss |
|
import numpy as np |
|
from transformers import AutoTokenizer, AutoModel |
|
import torch |
|
# Hugging Face API token read from the environment; None if 'my_repo_token'
# is unset, which will make Inference API calls fail with 401.
my_token = os.getenv('my_repo_token')
|
|
|
def get_embeddings(texts, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Return mean-pooled last-hidden-state embeddings for a list of texts.

    Args:
        texts: list of strings (a single string also works with the tokenizer,
            but callers here always pass a list).
        model_name: Hugging Face model id to embed with.

    Returns:
        numpy.ndarray of shape (len(texts), hidden_dim), dtype float32.
    """
    # Cache tokenizer/model per model_name on the function object: loading
    # from disk on every call is very slow and this runs on each Streamlit
    # rerun. Function-attribute cache avoids any new module dependency.
    cache = getattr(get_embeddings, "_model_cache", None)
    if cache is None:
        cache = {}
        get_embeddings._model_cache = cache
    if model_name not in cache:
        cache[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    tokenizer, model = cache[model_name]

    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)
        # Mean over the sequence dimension -> one vector per input text.
        embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    return embeddings
|
|
|
|
|
def find_most_relevant_context(contexts, question, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Return the single context string closest (L2) to the question embedding.

    Args:
        contexts: non-empty list of candidate text chunks.
        question: the query string.
        model_name: embedding model id, forwarded to get_embeddings.

    Returns:
        The element of `contexts` whose embedding is nearest to the question's.

    Raises:
        ValueError: if `contexts` is empty (previously this surfaced as a
            confusing IndexError via faiss returning index -1).
    """
    if not contexts:
        raise ValueError("contexts must be a non-empty list of strings")

    # Embed question and contexts in one batch so they share padding/model.
    all_texts = [question] + contexts
    embeddings = get_embeddings(all_texts, model_name=model_name)

    # faiss requires contiguous float32 arrays; keep the question as a
    # (1, dim) matrix so no reshape is needed for the search call.
    question_embedding = np.ascontiguousarray(embeddings[:1], dtype=np.float32)
    context_embeddings = np.ascontiguousarray(embeddings[1:], dtype=np.float32)

    # Exact (brute-force) L2 nearest-neighbour index over the contexts.
    index = faiss.IndexFlatL2(context_embeddings.shape[1])
    index.add(context_embeddings)

    # Top-1 neighbour of the question.
    _, indices = index.search(question_embedding, 1)

    most_relevant_index = int(indices[0][0])
    return contexts[most_relevant_index]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face Inference API endpoints. Only API_URL_LLMA is used by query();
# the gemma and mistral URLs are kept as alternative backends.
API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b"

API_URL_2 = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1"

API_URL_LLMA = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"

# Bearer auth header; my_token comes from the 'my_repo_token' env var.
headers = {"Authorization": f"Bearer {my_token}"}
|
|
|
|
|
def query(payload, timeout=60):
    """POST `payload` to the Llama-3 Inference API endpoint and return the JSON body.

    Args:
        payload: JSON-serializable dict, e.g. {"inputs": "<prompt>"}.
        timeout: seconds before the request is aborted. Without a timeout,
            requests.post can block forever and freeze the Streamlit app.

    Returns:
        The decoded JSON response — a list of generations on success, or an
        error dict (e.g. {"error": ...}) on failure; callers must check.
    """
    response = requests.post(API_URL_LLMA, headers=headers, json=payload, timeout=timeout)

    return response.json()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def answer_question_from_pdf(pdf_text, question):
    """Ask the Inference API to answer `question` from `pdf_text` context.

    Args:
        pdf_text: the context chunk to ground the answer in.
        question: the user's question.

    Returns:
        The model's answer text, with any leading text up to and including
        the first "Answer" marker stripped when present.

    Raises:
        RuntimeError: if the API returns an error payload instead of a
            generation list (previously this crashed with KeyError/TypeError).
    """
    # "lenghth" typo fixed in the prompt sent to the model.
    prompt = (
        "Based on this content: " + pdf_text
        + " The Question is: " + question
        + " Provide the answer with max length of about 1500"
    )
    result = query({"inputs": prompt})

    # On failure the API returns a dict such as {"error": "..."} rather than
    # a list of generations — surface that clearly instead of a KeyError.
    if not isinstance(result, list) or not result or "generated_text" not in result[0]:
        raise RuntimeError(f"Unexpected API response: {result}")

    answer = result[0]["generated_text"]
    marker = answer.find("Answer")
    # Only strip when the marker actually exists; find() == -1 previously
    # sliced off the first 5 characters of the answer (-1 + 6 == 5).
    if marker != -1:
        answer = answer[marker + len("Answer"):]
    return answer
|
|
|
def extract_text_from_pdf(pdf_file):
    """Extract text from every page of a PDF.

    Args:
        pdf_file: path or binary file-like object accepted by PyPDF2.PdfReader
            (Streamlit's UploadedFile works).

    Returns:
        list[str]: one string per page, in page order. Pages with no
        extractable text yield "" rather than None, so downstream embedding
        code never receives None.
    """
    pdf_reader = PdfReader(pdf_file)
    # Iterate pages directly instead of range(len(...)); extract_text() can
    # return None for image-only pages, so coalesce to "".
    return [(page.extract_text() or "") for page in pdf_reader.pages]
|
|
|
# --- Streamlit UI: upload a PDF, pick the most relevant page, answer a question.
st.title("PDF Explorer")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:

    pdf_arr = extract_text_from_pdf(uploaded_file)

    st.write("PDF Uploaded Successfully.")

    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if question:
            # Retrieval runs only after a non-empty question is submitted.
            # (Previously it ran on every rerun — including with an empty
            # question before the user typed anything — loading the embedding
            # model and searching faiss for nothing.)
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")
|
|