Study_Assistant / app.py
Khd-B's picture
Update app.py
51c0f70 verified
import pdfplumber
from sentence_transformers import SentenceTransformer
import streamlit as st
from gtts import gTTS
import os
from sklearn.metrics.pairwise import cosine_similarity
# Function to extract text from a limited number of pages in a PDF
@st.cache_resource
def load_pdf_and_extract_text(pdf_path, max_pages=20):
    """Return text fragments from the first ``max_pages`` pages of a PDF.

    Each page's extracted text is split on ``'. '``; fragments are stripped
    of surrounding whitespace and empty fragments are dropped so they are
    never offered as candidate answers later on.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        max_pages: Maximum number of leading pages to read.

    Returns:
        A list of non-empty text fragments in page order. The list is empty
        when no page yields any text or when no pages are processed.
    """
    all_sentences = []
    with pdfplumber.open(pdf_path) as pdf:
        # The document may be shorter than max_pages; use the real count so
        # the message and the progress fraction reflect the work being done.
        page_limit = max(0, min(len(pdf.pages), max_pages))
        st.write(f"Total pages to process: {page_limit}")
        if page_limit == 0:
            return all_sentences
        # Create a single bar and update it in place; calling st.progress()
        # inside the loop would add a new bar to the page for every PDF page.
        progress_bar = st.progress(0.0)
        for i, page in enumerate(pdf.pages[:page_limit]):
            st.write(f"Processing page {i + 1}...")
            text = page.extract_text()
            if text:
                all_sentences.extend(
                    fragment.strip()
                    for fragment in text.split('. ')
                    if fragment.strip()
                )
            progress_bar.progress((i + 1) / page_limit)  # Update progress
    return all_sentences
# Source document for the assistant. The path is relative, so the PDF must be
# present in the app's working directory.
pdf_path = "Accounting.pdf" # Ensure this is uploaded to your space
# Text fragments extracted from the PDF. respond_to_query() indexes into this
# list with the position of the best-matching embedding, so its order must
# match the embeddings built from it.
all_sentences = load_pdf_and_extract_text(pdf_path)
# Sentence-embedding model used by both create_embeddings() and
# respond_to_query(); it must exist before either of them is called.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Build the embedding tensor for the extracted sentences
@st.cache_resource
def create_embeddings(sentences):
    """Encode ``sentences`` with the module-level ``model``.

    Args:
        sentences: The texts to embed, passed straight to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_tensor=True``.
    """
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return embeddings
# Embeddings for the entries of all_sentences, computed when the script runs;
# respond_to_query() compares each query embedding against them.
pdf_embeddings = create_embeddings(all_sentences)
# Function to respond to user query
def respond_to_query(query):
    """Return the extracted PDF fragment most similar to ``query``.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against ``pdf_embeddings``; the entry of
    ``all_sentences`` with the highest similarity is returned.

    Args:
        query: The user's question as a string.

    Returns:
        The best-matching fragment from ``all_sentences``, or a short
        explanatory message when no text was extracted from the document.
    """
    if not all_sentences:
        # With nothing extracted there is nothing to rank, and the similarity
        # computation below would raise on the empty embeddings.
        return "No text could be extracted from the document."
    query_embedding = model.encode(query, convert_to_tensor=True)
    # scikit-learn converts its inputs to NumPy arrays, which fails for
    # tensors that live on an accelerator; .cpu() is a no-op for CPU tensors.
    similarities = cosine_similarity(
        query_embedding.cpu().reshape(1, -1),
        pdf_embeddings.cpu(),
    )
    # argmax() yields a NumPy integer; use a plain int as the list index.
    best_match_index = int(similarities.argmax())
    return all_sentences[best_match_index]
# Streamlit app: ask a question, show the best-matching passage, and offer
# a spoken version of it.
st.title("Study Assistant")
query = st.text_input("Type your question:")
submit_button = st.button("Ask")
if submit_button:
    # Treat whitespace-only input the same as an empty question.
    if query.strip():
        response = respond_to_query(query)
        # Show the answer first so it stays visible even if speech synthesis
        # fails below.
        st.write(response)
        # Text-to-Speech. gTTS needs network access and rejects empty text,
        # so a failure here is reported as a warning instead of crashing the
        # app after the answer has already been found.
        try:
            tts = gTTS(response)
            tts.save("response.mp3")
        except Exception as exc:  # UI boundary: report and carry on
            st.warning(f"Could not generate audio: {exc}")
        else:
            # Let the browser play the generated file.
            st.audio("response.mp3", format="audio/mpeg")
    else:
        st.write("Please enter a question.")