import streamlit as st
from PyPDF2 import PdfReader
import requests
import os
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

# Hugging Face API token, read from the environment
my_token = os.getenv('my_repo_token')

# Function to embed a list of texts with a pre-trained sentence-transformer model
def get_embeddings(texts, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool over real tokens only: a plain mean over last_hidden_state
    # would also average the padding positions in a batch of unequal lengths
    mask = inputs['attention_mask'].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = (summed / counts).cpu().numpy()
    return embeddings
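
# A possible optimization (a sketch, not part of the original flow): the
# tokenizer and model above are rebuilt on every call, which is slow across
# Streamlit reruns. Streamlit's st.cache_resource can keep them in memory;
# load_embedder is a hypothetical helper name.
@st.cache_resource
def load_embedder(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    # Loaded once per process and reused on subsequent calls
    return AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name)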

# Function to find the most relevant context using FAISS
def find_most_relevant_context(contexts, question, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    # Get embeddings for the question and all contexts in one batch
    all_texts = [question] + contexts
    embeddings = get_embeddings(all_texts, model_name=model_name)
    # Separate the question embedding from the context embeddings
    question_embedding = embeddings[0]
    context_embeddings = embeddings[1:]
    # Create a flat L2 FAISS index and add the context embeddings
    dimension = context_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(context_embeddings)
    # Search for the nearest neighbour to the question embedding
    _, indices = index.search(question_embedding.reshape(1, -1), 1)
    # Return the most relevant context
    most_relevant_index = indices[0][0]
    return contexts[most_relevant_index]
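
# For example (hypothetical values): with contexts ["cats purr", "dogs bark"]
# and the question "Which animal barks?", the second context is returned,
# since its embedding is nearest to the question's under L2 distance.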

API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b"
API_URL_2 = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1"
API_URL_LLMA = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
headers = {"Authorization": f"Bearer {my_token}"}

# Send a text-generation request to the hosted Llama 3 8B Instruct model
def query(payload):
    response = requests.post(API_URL_LLMA, headers=headers, json=payload)
    return response.json()
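
# The Inference API can return an error object (for example while the model
# is still loading) instead of a list of generations. A minimal retry wrapper
# is sketched below; query_with_retry is a hypothetical helper, and the retry
# count and wait time are arbitrary choices.
import time

def query_with_retry(payload, retries=3, wait_seconds=5):
    for _ in range(retries):
        result = query(payload)
        if isinstance(result, list):
            # A list payload is the normal successful response shape
            return result
        time.sleep(wait_seconds)
    return result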

# Answer the question by prompting the hosted model with the given PDF context
def answer_question_from_pdf(pdf_text, question):
    prompt = ("Based on this content: " + pdf_text
              + " The Question is: " + question
              + " Provide the answer with a max length of about 1500.")
    result = query({"inputs": prompt})
    answer = result[0]["generated_text"]
    # The model echoes the prompt; keep only the text after "Answer", if present
    marker = answer.find("Answer")
    if marker != -1:
        answer = answer[marker + len("Answer"):]
    return answer

# Function to extract the text of each PDF page into a list (one entry per page)
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    pdf_arr = []
    for page in pdf_reader.pages:
        # extract_text can return None for pages without a text layer
        pdf_arr.append(page.extract_text() or "")
    return pdf_arr
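
# Caveat: the embedding model truncates long inputs (truncation=True above),
# so only the start of a very long page is embedded. Splitting pages into
# smaller chunks before indexing is one way around this; a minimal sketch
# with a hypothetical chunk size follows.
def split_into_chunks(pages, chunk_chars=1000):
    # Flatten the page texts into fixed-size character chunks
    chunks = []
    for page in pages:
        for start in range(0, len(page), chunk_chars):
            chunks.append(page[start:start + chunk_chars])
    return chunks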

# Streamlit app
st.title("PDF Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")
    # Text input for entering a question
    question = st.text_input("Ask a question about the PDF")
    if st.button("Get Answer"):
        if question:
            # Retrieve the most relevant page, then ask the model about it
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")