# Spaces: Sleeping
import os

import pdfplumber
import streamlit as st
from gtts import gTTS, gTTSError
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Function to extract text from a limited number of pages in a PDF
def load_pdf_and_extract_text(pdf_path, max_pages=20):
    """Extract sentence-like fragments from the leading pages of a PDF.

    Progress is reported in the Streamlit app: the number of pages that
    will be read, one status line per page, and a single progress bar.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        max_pages: Upper bound on the number of leading pages to read.
            Values below 1 read no pages.

    Returns:
        A list of non-empty text fragments, obtained by splitting each
        page's extracted text on ``'. '`` and stripping surrounding
        whitespace. Pages without extractable text contribute nothing.
    """
    all_sentences = []
    with pdfplumber.open(pdf_path) as pdf:
        # Slice up front so the reported count and the progress fraction
        # describe the pages actually read, not the whole document.
        pages = pdf.pages[:max(max_pages, 0)]
        st.write(f"Total pages to process: {len(pages)}")
        # One bar, updated in place: calling st.progress() per page would
        # add a new bar to the app on every iteration.
        progress_bar = st.progress(0) if pages else None
        for page_number, page in enumerate(pages, start=1):
            st.write(f"Processing page {page_number}...")
            text = page.extract_text()
            if text:
                # Drop blank fragments so they can never be embedded or
                # returned as an (empty) answer.
                all_sentences.extend(
                    fragment.strip()
                    for fragment in text.split('. ')
                    if fragment.strip()
                )
            progress_bar.progress(page_number / len(pages))  # Update progress
    return all_sentences
# Load your PDF file
pdf_path = "Accounting.pdf"  # Ensure this is uploaded to your space
if not os.path.isfile(pdf_path):
    # Report a missing upload in the app and end this script run, rather
    # than letting the PDF reader fail with a raw traceback.
    st.error(f"PDF file not found: {pdf_path}")
    st.stop()
all_sentences = load_pdf_and_extract_text(pdf_path)
if not all_sentences:
    # Nothing to embed or search (e.g. a scanned, image-only PDF); stop
    # before the embedding and similarity steps receive an empty corpus.
    st.error(f"No text could be extracted from: {pdf_path}")
    st.stop()
# Initialize the model
def _load_model():
    """Construct the sentence-embedding model used for retrieval."""
    return SentenceTransformer('all-MiniLM-L6-v2')


# Streamlit re-runs this script on every widget interaction; caching the
# loader keeps one model instance alive instead of rebuilding it each time.
# Streamlit releases without st.cache_resource simply load it uncached.
if hasattr(st, "cache_resource"):
    _load_model = st.cache_resource(_load_model)

model = _load_model()
# Create embeddings from extracted sentences
def create_embeddings(sentences):
    """Encode *sentences* with the module-level embedding model.

    Args:
        sentences: The texts to encode, passed straight to ``model.encode``.

    Returns:
        The result of ``model.encode(..., convert_to_tensor=True)``: the
        embeddings in tensor form, in the same order as *sentences*.
    """
    return model.encode(sentences, convert_to_tensor=True)


# Encoding the whole corpus is the expensive step, and Streamlit re-runs
# this script on every widget interaction. Cache the result (keyed on the
# sentence list) so it is computed once per distinct corpus. The public
# create_embeddings() stays uncached for any other caller; Streamlit
# releases without st.cache_resource fall back to encoding on each run.
_embed_corpus = create_embeddings
if hasattr(st, "cache_resource"):
    _embed_corpus = st.cache_resource(create_embeddings)

pdf_embeddings = _embed_corpus(all_sentences)
# Function to respond to user query
def _as_host_array(embeddings):
    """Return *embeddings* as a NumPy array in host memory."""
    # Torch tensors expose detach()/cpu(); scikit-learn can only consume
    # host arrays, and a tensor living on a GPU cannot be converted
    # implicitly. Anything else is assumed to be array-like already.
    if hasattr(embeddings, "detach"):
        return embeddings.detach().cpu().numpy()
    return embeddings


def respond_to_query(query):
    """Return the extracted sentence most similar to *query*.

    The query is embedded with the module-level model and compared by
    cosine similarity against ``pdf_embeddings``; the entry of
    ``all_sentences`` with the highest score is returned.

    Args:
        query: The user's question as a string.

    Returns:
        The best-matching sentence from ``all_sentences``.

    Raises:
        ValueError: If ``all_sentences`` is empty, so there is nothing to
            match against.
    """
    if not all_sentences:
        raise ValueError("No sentences are available to search.")
    query_embedding = _as_host_array(
        model.encode(query, convert_to_tensor=True)
    )
    # Shape (1, number_of_sentences): one row of scores for the one query.
    similarities = cosine_similarity(
        query_embedding.reshape(1, -1), _as_host_array(pdf_embeddings)
    )
    best_match_index = int(similarities.argmax())
    return all_sentences[best_match_index]
# Streamlit app
st.title("Study Assistant")
query = st.text_input("Type your question:")
submit_button = st.button("Ask")
if submit_button:
    # Whitespace-only input carries no question either.
    if query.strip():
        response = respond_to_query(query)
        # Show the text answer first so that a text-to-speech failure
        # (gTTS needs network access) can never hide it.
        st.write(response)
        # Text-to-Speech
        try:
            tts = gTTS(response)
            tts.save("response.mp3")
        except (gTTSError, AssertionError) as err:
            # gTTSError: the speech request failed; AssertionError: gTTS
            # rejects empty text. Either way the written answer stands.
            st.warning(f"Audio is unavailable: {err}")
        else:
            # Play in the user's browser; this does not depend on audio
            # hardware on the server hosting the app.
            st.audio("response.mp3", format="audio/mpeg")
    else:
        st.write("Please enter a question.")