# pdf_reader/app.py
import io

import streamlit as st
import pytesseract
from PIL import Image
import fitz  # PyMuPDF
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ollama import Client

# Point pytesseract at the Tesseract binary (adjust this path for your platform)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Client for the local Ollama instance
client = Client(host='http://localhost:11434')
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the page whose TF-IDF vector is most similar to the question."""
    # Vectorize the question together with the page texts, capping the vocabulary size
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
    # Compute cosine similarity between the question (row 0) and each context
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    # Pick the context with the highest similarity
    most_relevant_index = similarity_scores.argmax()
    return contexts[most_relevant_index]
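
# Returning only the single best page can miss answers that span pages. A
# minimal sketch of a top-k variant (find_top_k_contexts and the parameter k
# are hypothetical additions, not part of the original app):
def find_top_k_contexts(contexts, question, k=3, max_features=10000):
    """Return the k pages most similar to the question, joined into one string."""
    vectorizer = TfidfVectorizer(max_features=max_features)
    matrix = vectorizer.fit_transform([question] + contexts)
    scores = cosine_similarity(matrix[0:1], matrix[1:]).flatten()
    # argsort is ascending, so reverse and take the first k indices
    top_indices = scores.argsort()[::-1][:k]
    return "\n\n".join(contexts[i] for i in top_indices)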
# Raw HTTP endpoint for Ollama's generate API (kept as an alternative to the
# ollama client used below; the app itself does not call it)
ollama_url = "http://localhost:11434/api/generate"
ollama_headers = {"Content-Type": "application/json"}

def query(payload):
    response = requests.post(ollama_url, headers=ollama_headers, json=payload)
    response.raise_for_status()
    return response.json()
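
# A minimal sketch of how query() could be invoked (example_generate is a
# hypothetical helper, not called by the app; it assumes the mixtral:8x7b
# model is pulled locally). /api/generate streams by default, so
# "stream": False requests a single JSON object whose text is in "response".
def example_generate(prompt):
    result = query({
        "model": "mixtral:8x7b",
        "prompt": prompt,
        "stream": False,  # one JSON object instead of a stream of chunks
    })
    return result["response"]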
# Answer a question by sending the retrieved PDF context to the local Ollama model
def answer_question_from_pdf(pdf_text, question):
    response = client.chat(
        model='mixtral:8x7b',
        messages=[
            {
                'role': 'user',
                'content': "Based on this content: " + pdf_text
                           + " The question is: " + question
                           + " Provide the answer with a maximum length of about 100 words.",
            },
        ],
    )
    # client.chat returns a response object; extract just the answer text
    return response['message']['content']
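
# For long answers the blocking call above can feel slow. A minimal streaming
# sketch (answer_question_streaming is a hypothetical variant, not wired into
# the app; its output could be rendered incrementally with st.write_stream on
# a recent Streamlit version):
def answer_question_streaming(pdf_text, question):
    stream = client.chat(
        model='mixtral:8x7b',
        messages=[{
            'role': 'user',
            'content': "Based on this content: " + pdf_text
                       + " The question is: " + question,
        }],
        stream=True,  # yield partial responses as they are generated
    )
    # Yield plain text chunks as they arrive
    for chunk in stream:
        yield chunk['message']['content']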
# Extract text from each page of the PDF by rendering it and running OCR
def extract_text_from_pdf(pdf_file):
    # Open the uploaded file from its in-memory bytes
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    pdf_arr = []
    # Iterate through each page
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        # Render the page to an image
        pix = page.get_pixmap()
        img = Image.open(io.BytesIO(pix.tobytes()))
        # Perform OCR on the rendered image
        pdf_text = pytesseract.image_to_string(img)
        pdf_arr.append(pdf_text)
    return pdf_arr
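
# OCR is only needed for scanned PDFs. For born-digital PDFs, PyMuPDF can read
# the embedded text layer directly, which is much faster. A minimal sketch of
# that approach with an OCR fallback (extract_text_fast is a hypothetical
# alternative, not wired into the app):
def extract_text_fast(pdf_file):
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    pages = []
    for page in pdf_document:
        text = page.get_text()
        if not text.strip():
            # No embedded text on this page; fall back to rendering + OCR
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            text = pytesseract.image_to_string(img)
        pages.append(text)
    return pages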
# Streamlit app
st.title("PDF Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")
    # Text input for entering a question
    question = st.text_input("Ask a question about the PDF")
    if st.button("Get Answer"):
        if question:
            # Retrieve the most relevant page, then ask the model
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")