import os

import faiss
import numpy as np
import requests
import streamlit as st
import torch
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModel

# Hugging Face Inference API token, read from the environment
my_token = os.getenv('my_repo_token')
# Compute mean-pooled sentence embeddings with a pre-trained transformer.
# Note: the tokenizer and model are reloaded on every call; fine for a demo,
# but a cached loader would be faster for repeated queries.
def get_embeddings(texts, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the token embeddings into one vector per input text
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings
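
# Illustrative sketch, not called by the app: the same retrieval can use
# cosine similarity by L2-normalizing the embeddings and switching to an
# inner-product index (faiss.IndexFlatIP), which is often preferred for
# sentence-transformer embeddings.
def build_cosine_index(context_embeddings):
    vectors = np.asarray(context_embeddings, dtype='float32').copy()
    faiss.normalize_L2(vectors)  # in-place L2 normalization
    index = faiss.IndexFlatIP(vectors.shape[1])  # inner product == cosine on unit vectors
    index.add(vectors)
    return index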
# Function to find the most relevant context using FAISS
def find_most_relevant_context(contexts, question, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    # Get embeddings for the question and all contexts in one batch
    all_texts = [question] + contexts
    embeddings = get_embeddings(all_texts, model_name=model_name)

    # Separate the question embedding and context embeddings
    question_embedding = embeddings[0]
    context_embeddings = embeddings[1:]

    # Create a FAISS index and add context embeddings
    dimension = context_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(context_embeddings)

    # Search for the nearest neighbor to the question embedding
    _, indices = index.search(question_embedding.reshape(1, -1), 1)

    # Get the most relevant context
    most_relevant_index = indices[0][0]
    return contexts[most_relevant_index]
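
# Illustrative sketch, not called by the app: retrieving the k nearest
# contexts instead of only the single best one can give the model more
# evidence to answer from.
def find_top_k_contexts(contexts, question, k=3):
    all_texts = [question] + contexts
    embeddings = get_embeddings(all_texts)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings[1:])
    k = min(k, len(contexts))  # don't ask for more neighbors than stored vectors
    _, indices = index.search(embeddings[0].reshape(1, -1), k)
    return [contexts[i] for i in indices[0]]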
# Candidate Inference API endpoints; only API_URL_LLMA is used below.
API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b"
API_URL_2 = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1"
API_URL_LLMA = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
headers = {"Authorization": f"Bearer {my_token}"}
# Send a text-generation request to the Hugging Face Inference API.
def query(payload):
    response = requests.post(API_URL_LLMA, headers=headers, json=payload)
    return response.json()
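
# Illustrative sketch, not called by the app: the hosted Inference API
# returns HTTP 503 while a model is cold-loading, so a small retry loop
# makes the call more robust. The retry count and wait are example values.
def query_with_retry(payload, retries=3, wait_seconds=10):
    import time  # local import keeps this sketch self-contained
    response = None
    for _ in range(retries):
        response = requests.post(API_URL_LLMA, headers=headers, json=payload)
        if response.status_code != 503:
            break
        time.sleep(wait_seconds)  # give the model time to load, then retry
    return response.json()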
# Answer a question about the PDF content via the Inference API.
def answer_question_from_pdf(pdf_text, question):
    prompt = (
        "Based on this content: " + pdf_text
        + " The Question is: " + question
        + " Provide the answer with a max length of about 1500"
    )
    answer = query({"inputs": prompt})
    answer = answer[0]["generated_text"]
    # Keep only the text after the "Answer" marker, if the model emitted one
    marker = answer.find("Answer")
    if marker != -1:
        answer = answer[marker + len("Answer"):]
    return answer
# Extract the text of each PDF page into a list, one string per page.
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    pdf_arr = []
    for page in pdf_reader.pages:
        # extract_text() can return None for image-only pages
        pdf_arr.append(page.extract_text() or "")
    return pdf_arr
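
# Illustrative sketch, not called by the app: whole pages can be long
# contexts, so splitting each page into fixed-size word chunks gives
# finer-grained retrieval. The chunk size is an arbitrary example value.
def split_into_chunks(pages, words_per_chunk=200):
    chunks = []
    for page_text in pages:
        words = page_text.split()
        for start in range(0, len(words), words_per_chunk):
            chunks.append(" ".join(words[start:start + words_per_chunk]))
    return chunks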
# Streamlit app
st.title("PDF Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF, one string per page
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")

    # Text input for entering a question
    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if question:
            # Retrieve the most relevant page, then ask the model
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")