# pdf_reader/app.py
import io

import streamlit as st
import pytesseract
from PIL import Image
import fitz  # PyMuPDF
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ollama import Client

# Point pytesseract at the Tesseract binary (adjust this path for your platform)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Client for the local Ollama instance
client = Client(host='http://localhost:11434')
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the page whose TF-IDF vector is most similar to the question."""
    # Vectorize the question together with the page texts, capping the vocabulary size
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
    # Compute cosine similarity between the question (row 0) and each context
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    # Pick the context with the highest similarity
    most_relevant_index = similarity_scores.argmax()
    return contexts[most_relevant_index]
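
# Returning only the single best page can miss answers that span pages. A
# minimal sketch of a top-k variant (find_top_k_contexts and the parameter k
# are hypothetical additions, not part of the original app):
def find_top_k_contexts(contexts, question, k=3, max_features=10000):
    """Return the k pages most similar to the question, joined into one string."""
    vectorizer = TfidfVectorizer(max_features=max_features)
    matrix = vectorizer.fit_transform([question] + contexts)
    scores = cosine_similarity(matrix[0:1], matrix[1:]).flatten()
    # argsort is ascending, so reverse and take the first k indices
    top_indices = scores.argsort()[::-1][:k]
    return "\n\n".join(contexts[i] for i in top_indices)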
# Raw HTTP endpoint for Ollama's generate API (kept as an alternative to the
# ollama client used below; the app itself does not call it)
ollama_url = "http://localhost:11434/api/generate"
ollama_headers = {"Content-Type": "application/json"}

def query(payload):
    response = requests.post(ollama_url, headers=ollama_headers, json=payload)
    response.raise_for_status()
    return response.json()
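
# A minimal sketch of how query() could be invoked (example_generate is a
# hypothetical helper, not called by the app; it assumes the mixtral:8x7b
# model is pulled locally). /api/generate streams by default, so
# "stream": False requests a single JSON object whose text is in "response".
def example_generate(prompt):
    result = query({
        "model": "mixtral:8x7b",
        "prompt": prompt,
        "stream": False,  # one JSON object instead of a stream of chunks
    })
    return result["response"]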
# Answer a question by sending the retrieved PDF context to the local Ollama model
def answer_question_from_pdf(pdf_text, question):
    response = client.chat(
        model='mixtral:8x7b',
        messages=[
            {
                'role': 'user',
                'content': "Based on this content: " + pdf_text
                           + " The question is: " + question
                           + " Provide the answer with a maximum length of about 100 words.",
            },
        ],
    )
    # client.chat returns a response object; extract just the answer text
    return response['message']['content']
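
# For long answers the blocking call above can feel slow. A minimal streaming
# sketch (answer_question_streaming is a hypothetical variant, not wired into
# the app; its output could be rendered incrementally with st.write_stream on
# a recent Streamlit version):
def answer_question_streaming(pdf_text, question):
    stream = client.chat(
        model='mixtral:8x7b',
        messages=[{
            'role': 'user',
            'content': "Based on this content: " + pdf_text
                       + " The question is: " + question,
        }],
        stream=True,  # yield partial responses as they are generated
    )
    # Yield plain text chunks as they arrive
    for chunk in stream:
        yield chunk['message']['content']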
# Extract text from each page of the PDF by rendering it and running OCR
def extract_text_from_pdf(pdf_file):
    # Open the uploaded file from its in-memory bytes
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    pdf_arr = []
    # Iterate through each page
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        # Render the page to an image
        pix = page.get_pixmap()
        img = Image.open(io.BytesIO(pix.tobytes()))
        # Perform OCR on the rendered image
        pdf_text = pytesseract.image_to_string(img)
        pdf_arr.append(pdf_text)
    return pdf_arr
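
# OCR is only needed for scanned PDFs. For born-digital PDFs, PyMuPDF can read
# the embedded text layer directly, which is much faster. A minimal sketch of
# that approach with an OCR fallback (extract_text_fast is a hypothetical
# alternative, not wired into the app):
def extract_text_fast(pdf_file):
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    pages = []
    for page in pdf_document:
        text = page.get_text()
        if not text.strip():
            # No embedded text on this page; fall back to rendering + OCR
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            text = pytesseract.image_to_string(img)
        pages.append(text)
    return pages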
# Streamlit app
st.title("PDF Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")
    # Text input for entering a question
    question = st.text_input("Ask a question about the PDF")
    if st.button("Get Answer"):
        if question:
            # Retrieve the most relevant page, then ask the model
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")