import streamlit as st
from streamlit_option_menu import option_menu
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import requests
import os
import time
# Page configuration
st.set_page_config(
    page_title="PDF Study Assistant",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="collapsed"
)
# Custom CSS for colorful design
st.markdown("""
<style>
    :root {
        --primary: #ff4b4b;
        --secondary: #ff9a3d;
        --accent1: #ffcb74;
        --accent2: #3a86ff;
        --background: #f0f2f6;
        --card: #ffffff;
    }
    .stApp {
        background: linear-gradient(135deg, var(--background) 0%, #e0e5ec 100%);
    }
    .stButton>button {
        background: linear-gradient(to right, var(--secondary), var(--primary));
        color: white;
        border-radius: 12px;
        padding: 8px 20px;
        font-weight: 600;
    }
    .stTextInput>div>div>input {
        border-radius: 12px;
        border: 2px solid var(--accent2);
        padding: 10px;
    }
    .card {
        background: var(--card);
        border-radius: 15px;
        box-shadow: 0 8px 16px rgba(0,0,0,0.1);
        padding: 20px;
        margin-bottom: 20px;
    }
    .header {
        background: linear-gradient(to right, var(--accent2), var(--primary));
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        text-align: center;
        margin-bottom: 30px;
    }
    .tab-content {
        animation: fadeIn 0.5s ease-in-out;
    }
    .error {
        background-color: #ffebee;
        border-left: 4px solid #f44336;
        padding: 10px;
        margin: 10px 0;
    }
    .info {
        background-color: #e3f2fd;
        border-left: 4px solid #2196f3;
        padding: 10px;
        margin: 10px 0;
    }
    .success {
        background-color: #e8f5e9;
        border-left: 4px solid #4caf50;
        padding: 10px;
        margin: 10px 0;
    }
    @keyframes fadeIn {
        from { opacity: 0; }
        to { opacity: 1; }
    }
</style>
""", unsafe_allow_html=True)
# Initialize session state
if 'pdf_processed' not in st.session_state:
    st.session_state.pdf_processed = False
if 'vector_store' not in st.session_state:
    st.session_state.vector_store = None
if 'pages' not in st.session_state:
    st.session_state.pages = []
if 'history' not in st.session_state:
    st.session_state.history = []
if 'token_valid' not in st.session_state:
    st.session_state.token_valid = None
# Load embedding model with caching (st.cache_resource keeps the model in
# memory across Streamlit reruns instead of reloading it on every interaction)
@st.cache_resource
def load_embedding_model():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
def check_token_validity():
    """Check if the token is valid by making a simple API call"""
    if not os.getenv("HF_API_KEY"):
        return False
    try:
        headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"}
        response = requests.get("https://huggingface.co/api/whoami-v2", headers=headers, timeout=10)
        return response.status_code == 200
    except requests.RequestException:
        return False
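
# Note: the Inference API accepts anonymous requests, but they are heavily
# rate-limited; sending a valid HF token as a Bearer header raises the quota
# and is required for gated models.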
def query_hf_inference_api(prompt, max_tokens=200, model="google/flan-t5-base", _retries=0):
    """Query Hugging Face Inference API with better error handling"""
    API_URL = f"https://api-inference.huggingface.co/models/{model}"
    headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"} if os.getenv('HF_API_KEY') else {}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_tokens,
            "temperature": 0.5,
            "do_sample": False
        }
    }
    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
        if response.status_code == 200:
            result = response.json()
            return result[0]['generated_text'] if result else ""
        elif response.status_code == 403:
            # Detailed debug information
            st.session_state.token_valid = check_token_validity()
            debug_info = f"""
            <div class="error">
                <h4>403 Forbidden Error</h4>
                <p>Token is set: <strong>{'Yes' if os.getenv('HF_API_KEY') else 'No'}</strong></p>
                <p>Token valid: <strong>{'Yes' if st.session_state.token_valid else 'No'}</strong></p>
                <p>Model: {model}</p>
                <p>Possible solutions:</p>
                <ol>
                    <li>Visit the <a href="https://huggingface.co/{model}" target="_blank">model page</a> and click "Agree and access repository"</li>
                    <li>Ensure your token has "read" permissions</li>
                    <li>Wait 5-10 minutes after accepting terms</li>
                    <li>Try a different model using the dropdown below</li>
                </ol>
            </div>
            """
            st.markdown(debug_info, unsafe_allow_html=True)
            return ""
        elif response.status_code == 429:
            # Back off and retry, but give up after three attempts instead of
            # recursing without bound
            if _retries >= 3:
                st.error("Rate limit exceeded. Please try again later.")
                return ""
            st.warning("Rate limit exceeded. Waiting and retrying...")
            time.sleep(3)
            return query_hf_inference_api(prompt, max_tokens, model, _retries + 1)
        else:
            st.error(f"API Error {response.status_code}: {response.text[:200]}")
            return ""
    except requests.RequestException as e:
        st.error(f"Connection error: {str(e)}")
        return ""
def process_pdf(pdf_file):
    """Extract text from PDF and create vector store"""
    with st.spinner("📖 Reading PDF..."):
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        text = ""
        st.session_state.pages = []
        for page in doc:
            page_text = page.get_text()
            text += page_text
            st.session_state.pages.append(page_text)
        doc.close()
    with st.spinner("🔄 Processing text..."):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)
        embeddings = load_embedding_model()
        st.session_state.vector_store = FAISS.from_texts(chunks, embeddings)
    st.session_state.pdf_processed = True
    st.success("✅ PDF processed successfully!")
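
# Note on chunking: the 200-character overlap in process_pdf means a sentence
# that straddles a chunk boundary still appears intact in at least one chunk,
# which keeps it retrievable by the similarity search below.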
def ask_question(question, model_choice):
    """Answer a question using the vector store and Hugging Face API"""
    if not st.session_state.vector_store:
        return "PDF not processed yet", []
    # Find relevant passages
    docs = st.session_state.vector_store.similarity_search(question, k=3)
    context = "\n\n".join([doc.page_content[:500] for doc in docs])
    # Format prompt for the model
    prompt = f"""
Based on the following context, answer the question.
If the answer isn't in the context, say "I don't know".

Context:
{context}

Question: {question}

Answer:
"""
    # Query the model
    answer = query_hf_inference_api(prompt, model=model_choice)
    # Add to history
    st.session_state.history.append({
        "question": question,
        "answer": answer,
        "sources": [doc.page_content for doc in docs],
        "model": model_choice
    })
    return answer, docs
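
# Strategy: alternate over the chunks of the selected pages -- even-numbered
# chunks seed a generated question, and the following odd-numbered chunk
# supplies the context for its answer.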
def generate_qa_for_chapter(start_page, end_page, model_choice):
    """Generate Q&A for specific chapter pages"""
    if start_page < 1 or end_page > len(st.session_state.pages) or start_page > end_page:
        st.error("Invalid page range")
        return []
    chapter_text = "\n".join(st.session_state.pages[start_page-1:end_page])
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        length_function=len
    )
    chunks = text_splitter.split_text(chapter_text)
    qa_pairs = []
    with st.spinner(f"🧠 Generating Q&A for pages {start_page}-{end_page}..."):
        for i, chunk in enumerate(chunks):
            if i % 2 == 0:  # Generate question
                prompt = f"Based on this text, generate one study question: {chunk[:500]}"
                question = query_hf_inference_api(prompt, model=model_choice, max_tokens=100)
                if question and not question.endswith("?"):
                    question += "?"
                if question:  # Only add if we got a valid question
                    qa_pairs.append((question, ""))
            else:  # Generate answer
                # Only answer the latest question if it is still unanswered,
                # so a failed question generation doesn't overwrite an earlier answer
                if qa_pairs and qa_pairs[-1][1] == "":
                    prompt = f"Answer this question: {qa_pairs[-1][0]} using this context: {chunk[:500]}"
                    answer = query_hf_inference_api(prompt, model=model_choice, max_tokens=200)
                    qa_pairs[-1] = (qa_pairs[-1][0], answer)
    return qa_pairs
# App header
st.markdown("<h1 class='header'>📚 PDF Study Assistant</h1>", unsafe_allow_html=True)

# Model selection
MODEL_OPTIONS = {
    "google/flan-t5-base": "T5 Base (Recommended)",
    "google/flan-t5-large": "T5 Large (Requires Auth)",
    "mrm8488/t5-base-finetuned-question-generation-ap": "Question Generation",
    "declare-lab/flan-alpaca-base": "Alpaca Base"
}
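# The keys above are Hugging Face Hub model IDs passed straight to the
# Inference API; the values are the human-readable labels shown in the dropdown.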
# Debug info panel
with st.expander("🔧 Debug Information", expanded=False):
    st.subheader("Hugging Face Token Status")
    # Check token validity
    token_valid = check_token_validity()
    st.session_state.token_valid = token_valid
    col1, col2 = st.columns(2)
    with col1:
        st.write(f"Token is set: {'✅ Yes' if os.getenv('HF_API_KEY') else '❌ No'}")
    with col2:
        st.write(f"Token is valid: {'✅ Yes' if token_valid else '❌ No'}")
    if os.getenv('HF_API_KEY'):
        st.markdown("""
        <div class="info">
            <p>Your token is set but we're still having issues. Try these steps:</p>
            <ol>
                <li>Visit the model page for your selected model</li>
                <li>Click "Agree and access repository"</li>
                <li>Wait 5-10 minutes for changes to propagate</li>
                <li>Try a different model from the dropdown</li>
            </ol>
        </div>
        """, unsafe_allow_html=True)
    else:
        st.markdown("""
        <div class="error">
            <p>Token is not set! Add it in your Space secrets:</p>
            <ol>
                <li>Go to your Space → Settings → Secrets</li>
                <li>Add <code>HF_API_KEY</code> with your token</li>
                <li>Redeploy the Space</li>
            </ol>
            <p>Get your token: <a href="https://huggingface.co/settings/tokens" target="_blank">https://huggingface.co/settings/tokens</a></p>
        </div>
        """, unsafe_allow_html=True)
# PDF upload section
with st.container():
    st.subheader("📤 Upload Your Textbook/Notes")
    # Provide a non-empty label (required by Streamlit) but hide it, since the
    # subheader above already labels the widget
    pdf_file = st.file_uploader(
        "Upload PDF",
        type="pdf",
        label_visibility="collapsed"
    )
# Main content
if pdf_file:
    # Process on first upload, and reprocess whenever a different file is uploaded
    if not st.session_state.pdf_processed or st.session_state.get("processed_file") != pdf_file.name:
        process_pdf(pdf_file)
        st.session_state.processed_file = pdf_file.name
    if st.session_state.pdf_processed:
        # Model selection
        st.subheader("Model Selection")
        model_choice = st.selectbox(
            "Choose AI model:",
            options=list(MODEL_OPTIONS.keys()),
            format_func=lambda x: MODEL_OPTIONS[x],
            help="Some models require accepting terms on Hugging Face"
        )
        # Navigation tabs
        selected_tab = option_menu(
            None,
            ["Ask Questions", "Generate Chapter Q&A", "History"],
            icons=["chat", "book", "clock-history"],
            menu_icon="cast",
            default_index=0,
            orientation="horizontal",
            styles={
                "container": {"padding": "0!important", "background-color": "#f9f9f9"},
                "nav-link": {"font-size": "16px", "font-weight": "bold"},
                "nav-link-selected": {"background": "linear-gradient(to right, #3a86ff, #ff4b4b)"},
            }
        )
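        # option_menu returns the label of the selected entry, so the tab
        # branches below compare against the literal strings passed above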
        # Question Answering tab
        if selected_tab == "Ask Questions":
            st.markdown("### 💬 Ask Questions About Your Document")
            user_question = st.text_input("Type your question here:", key="user_question")
            if user_question:
                with st.spinner("🤔 Thinking..."):
                    answer, docs = ask_question(user_question, model_choice)
                if answer:
                    st.markdown(f"<div class='card'><b>Answer:</b> {answer}</div>", unsafe_allow_html=True)
                    with st.expander("📄 See source passages"):
                        for i, doc in enumerate(docs):
                            st.markdown(f"**Passage {i+1}:** {doc.page_content[:500]}...")
        # Chapter Q&A generation tab
        elif selected_tab == "Generate Chapter Q&A":
            st.markdown("### 📝 Generate Q&A for Specific Chapter")
            col1, col2 = st.columns(2)
            with col1:
                start_page = st.number_input("Start Page", min_value=1, max_value=len(st.session_state.pages), value=1)
            with col2:
                end_page = st.number_input("End Page", min_value=1, max_value=len(st.session_state.pages), value=min(5, len(st.session_state.pages)))
            if st.button("Generate Q&A", key="generate_qa"):
                qa_pairs = generate_qa_for_chapter(start_page, end_page, model_choice)
                if qa_pairs:
                    st.markdown(f"<h4>📝 Generated Questions for Pages {start_page}-{end_page}</h4>", unsafe_allow_html=True)
                    for i, (question, answer) in enumerate(qa_pairs):
                        st.markdown(f"""
                        <div class='card'>
                            <b>Q{i+1}:</b> {question}<br>
                            <b>A{i+1}:</b> {answer}
                        </div>
                        """, unsafe_allow_html=True)
                else:
                    st.warning("No Q&A pairs generated. Try a different page range.")
        # History tab
        elif selected_tab == "History":
            st.markdown("### ⏳ Question History")
            if not st.session_state.history:
                st.info("No questions asked yet.")
            else:
                for i, item in enumerate(reversed(st.session_state.history)):
                    with st.expander(f"Q{i+1}: {item['question']} ({MODEL_OPTIONS.get(item['model'], item['model'])})"):
                        st.markdown(f"**Answer:** {item['answer']}")
                        st.markdown("**Source Passages:**")
                        for j, source in enumerate(item['sources']):
                            st.markdown(f"{j+1}. {source[:500]}...")
# Footer
st.markdown("---")
st.markdown("""
<div style="text-align: center; padding: 20px;">
    Built with ❤️ for students | PDF Study Assistant v4.1
</div>
""", unsafe_allow_html=True)