# Streamlit PDF Explorer: OCR an uploaded PDF and answer questions via a local Ollama model.
import streamlit as st
import pytesseract
from PIL import Image
import fitz
import io
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ollama import Client
# Ollama chat client pointed at a locally running server (default port 11434).
client = Client(host='http://localhost:11434')
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the context most similar to *question* by TF-IDF cosine similarity.

    Args:
        contexts: Non-empty list of candidate text passages (one per PDF page).
        question: The user's question text.
        max_features: Cap on the TF-IDF vocabulary size.

    Returns:
        The context with the highest cosine similarity to the question.
        Falls back to the first context when similarity cannot be computed
        (e.g. blank question or no shared vocabulary at all).

    Raises:
        ValueError: If *contexts* is empty.
    """
    if not contexts:
        raise ValueError("contexts must be a non-empty list")
    try:
        tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
        # Row 0 is the question; rows 1..n are the candidate contexts.
        tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
        # Cosine similarity of the question row against every context row.
        similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
        return contexts[similarity_scores.argmax()]
    except ValueError:
        # "empty vocabulary" from the vectorizer (all-blank / stop-word-only
        # input): best-effort fallback instead of crashing the UI.
        return contexts[0]
# Raw REST endpoint and headers for Ollama's /api/generate (used by query()).
ollama_url = "http://localhost:11434/api/generate"
ollama_headers = {"Content-Type": "application/json"}
def query(payload):
    """POST *payload* to the local Ollama generate endpoint and return its JSON reply.

    Args:
        payload: JSON-serializable dict for the Ollama /api/generate API.

    Returns:
        The parsed JSON response body.

    Raises:
        requests.HTTPError: If the server returns an error status code.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.post(
        ollama_url,
        headers=ollama_headers,
        json=payload,
        timeout=60,  # don't hang the UI forever if the Ollama server is down
    )
    # Surface HTTP errors explicitly instead of trying to parse an error body.
    response.raise_for_status()
    return response.json()
# Mock function for answering questions from the PDF
# Replace this with your actual backend function
def answer_question_from_pdf(pdf_text, question):
    """Ask the local Ollama model to answer *question* using *pdf_text* as context.

    Args:
        pdf_text: Text of the most relevant PDF page (already OCR'd).
        question: The user's question.

    Returns:
        The chat response object returned by the Ollama client.
    """
    # Build the prompt once for readability; fixes the "lenghth" typo that was
    # sent to the model in the original prompt string.
    prompt = (
        "Based on this content: " + pdf_text
        + " The Question is: " + question
        + " Provide the answer with max length of about 100"
    )
    return client.chat(
        model='mixtral:8x7b',
        messages=[
            {
                'role': 'user',
                'content': prompt,
            },
        ],
    )
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """OCR every page of an uploaded PDF and return the extracted text per page.

    Args:
        pdf_file: File-like object (e.g. Streamlit UploadedFile) holding a PDF.

    Returns:
        List of strings, one OCR'd text blob per page, in page order.
    """
    # Configure the Tesseract binary once — the original reassigned this on
    # every page iteration. NOTE(review): Windows-specific path; make this
    # configurable for other platforms.
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        pdf_arr = []
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            # Rasterize the page, then OCR the rendered image.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            pdf_arr.append(pytesseract.image_to_string(img))
        return pdf_arr
    finally:
        # Release document resources even if rendering/OCR raises.
        pdf_document.close()
# --- Streamlit UI -----------------------------------------------------------
st.title("PDF Explorer")

# File uploader for the source document.
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # OCR the uploaded PDF into a list of per-page text blobs.
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")

    # Text input for the user's question.
    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if question:
            # Select the most relevant page only once we actually have a
            # question — the original ran the TF-IDF ranking on every rerun,
            # even while the question box was still empty.
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")