Spaces:

raymondEDS
/

dev_LMS

Sleeping

dev_LMS / src /streamlit_app.py

raymondEDS

Using pdf workaround

aa1280b 6 months ago

17.4 kB

	import streamlit as st
	import PyPDF2
	import io
	import base64
	from datetime import datetime
	import json
	import tempfile
	import os

	# Configure the browser tab title/icon and the overall page layout.
	st.set_page_config(
	    page_title="Dev LMS",
	    page_icon="📚",
	    layout="wide",
	    initial_sidebar_state="expanded",
	)

	# Seed per-session defaults; keys that already exist are left untouched so
	# values survive script reruns.
	for _state_key, _state_default in (
	    ('uploaded_documents', {}),
	    ('current_user', "User"),
	):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _state_default

	def save_document_info(filename, file_content, file_type, temp_path=None):
	    """Append a record describing an uploaded document to session state.

	    Args:
	        filename: Name stored under the record's 'filename' key.
	        file_content: Payload of the file. ``bytes`` are decoded as latin-1
	            for storage; any other object is stored via ``str()``.
	        file_type: Label stored under the record's 'file_type' key.
	        temp_path: Optional path stored under 'temp_path' so later code can
	            re-read the file from disk.
	    """
	    # Make sure the list exists before anything else, then reuse the reference.
	    document_list = st.session_state.uploaded_documents.setdefault('documents', [])

	    uploaded_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	    payload_size = len(file_content)
	    if isinstance(file_content, bytes):
	        # latin-1 maps every byte to a code point, so this decode cannot fail.
	        stored_content = file_content.decode('latin-1')
	    else:
	        stored_content = str(file_content)

	    document_list.append({
	        'filename': filename,
	        'upload_time': uploaded_at,
	        'file_type': file_type,
	        'size': payload_size,
	        'content': stored_content,
	        'temp_path': temp_path,
	    })

	def extract_pdf_text_from_temp(temp_path):
	    """Return the text of the PDF stored at ``temp_path``.

	    The text of each page is followed by a newline. If opening or parsing
	    the file raises, the error is shown with ``st.error`` and an empty
	    string is returned instead.
	    """
	    try:
	        with open(temp_path, "rb") as pdf_handle:
	            reader = PyPDF2.PdfReader(pdf_handle)
	            # Pages must be read while the file handle is still open.
	            return "".join(
	                page.extract_text() + "\n" for page in reader.pages
	            )
	    except Exception as exc:
	        st.error(f"Error reading PDF: {str(exc)}")
	        return ""

	def extract_pdf_text_from_memory(uploaded_file):
	    """Return the text of a PDF passed directly to ``PyPDF2.PdfReader``.

	    The text of each page is followed by a newline. Any exception is
	    reported with ``st.error`` and results in an empty string.
	    """
	    try:
	        reader = PyPDF2.PdfReader(uploaded_file)
	        page_chunks = []
	        for page in reader.pages:
	            page_chunks.append(page.extract_text() + "\n")
	        return "".join(page_chunks)
	    except Exception as exc:
	        st.error(f"Error reading PDF: {str(exc)}")
	        return ""

	def cleanup_temp_file(temp_path):
	    """Remove ``temp_path`` from disk when it is set and the file exists.

	    Failures are surfaced as a ``st.warning`` rather than raised.
	    """
	    try:
	        # Nothing to do for an empty path or a file that is already gone.
	        if not temp_path or not os.path.exists(temp_path):
	            return
	        os.remove(temp_path)
	    except Exception as exc:
	        st.warning(f"Could not clean up temporary file: {str(exc)}")

	def main():
	    """Draw the sidebar navigation and render the page the user selected."""
	    # Insertion order of this mapping is also the order shown in the selectbox.
	    page_renderers = {
	        "Dashboard": show_dashboard,
	        "Upload Documents": show_upload_documents,
	        "My Documents": show_my_documents,
	        "Document Library": show_document_library,
	        "Settings": show_settings,
	    }

	    with st.sidebar:
	        st.title("📚 Dev LMS")
	        st.markdown("---")
	        page = st.selectbox("Navigation", list(page_renderers))

	    # Main content area: an unrecognised selection renders nothing.
	    render_page = page_renderers.get(page)
	    if render_page is not None:
	        render_page()

	def show_dashboard():
	    """Render the dashboard: three summary metrics and the latest uploads."""
	    st.title("📊 Dashboard")
	    st.markdown("---")

	    documents = st.session_state.uploaded_documents.get('documents', [])

	    # One metric per column; every metric shows the same fixed delta of "0".
	    metric_rows = (
	        ("Total Documents", len(documents)),
	        ("System Status", "Active"),
	        ("Storage Used", "Session"),
	    )
	    for column, (metric_label, metric_value) in zip(st.columns(3), metric_rows):
	        with column:
	            st.metric(
	                label=metric_label,
	                value=metric_value,
	                delta="0",
	            )

	    st.markdown("---")

	    st.subheader("📈 Recent Activity")
	    if not documents:
	        st.info("No documents uploaded yet. Start by uploading a PDF document!")
	        return

	    # Only the five most recently appended documents are listed.
	    for doc in documents[-5:]:
	        with st.container():
	            name_col, time_col, type_col = st.columns([3, 2, 1])
	            with name_col:
	                st.write(f"{doc['filename']}")
	            with time_col:
	                st.write(doc['upload_time'])
	            with type_col:
	                st.write(f"{doc['file_type']}")
	            st.markdown("---")

	def show_upload_documents():
	    """Render the upload page: file picker, text preview and upload button.

	    While a file is selected, every script run writes its bytes to a
	    temporary ``.pdf`` file so the preview can be extracted from disk.
	    That temp file is kept only when the user clicks "Upload Document"
	    and the document is saved (its path is stored with the document).
	    On every other run, including preview-only reruns and failures, the
	    temp file is removed again so repeated reruns do not pile up files.
	    """
	    st.title("📤 Upload Documents")
	    st.markdown("---")

	    # Add information about file upload
	    st.info("💡 Note: File upload uses temporary storage for better compatibility with Hugging Face Spaces.")

	    uploaded_file = st.file_uploader(
	        "Choose a PDF file",
	        type=['pdf'],
	        help="Upload PDF documents to the LMS (max 200MB)",
	        accept_multiple_files=False
	    )

	    if uploaded_file is not None:
	        try:
	            # Display file info
	            file_details = {
	                "Filename": uploaded_file.name,
	                "File size": f"{uploaded_file.size / 1024:.2f} KB",
	                "File type": uploaded_file.type
	            }

	            st.write("File Details:")
	            for key, value in file_details.items():
	                st.write(f"- {key}: {value}")

	            temp_path = None
	            # Flipped to True only once the document record that references
	            # the temp file has been saved; otherwise the file is deleted.
	            keep_temp_file = False
	            try:
	                bytes_data = uploaded_file.getvalue()
	                with tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as temp:
	                    # Record the path before writing so a failed write can
	                    # still be cleaned up.
	                    temp_path = temp.name
	                    temp.write(bytes_data)

	                st.success(f"📁 File temporarily stored at: {temp_path}")

	                # Extract and display PDF content using temporary file
	                pdf_text = extract_pdf_text_from_temp(temp_path)

	                if pdf_text.strip():
	                    st.subheader("📄 Document Preview")
	                    with st.expander("View extracted text"):
	                        st.text_area("PDF Content", pdf_text, height=300)
	                else:
	                    st.warning("⚠️ Could not extract text from this PDF. The file may be image-based or encrypted.")

	                # Upload button
	                if st.button("Upload Document", type="primary"):
	                    try:
	                        # Save document info with temporary file path
	                        save_document_info(
	                            uploaded_file.name,
	                            bytes_data,
	                            "PDF",
	                            temp_path
	                        )
	                        keep_temp_file = True

	                        st.success(f"✅ Document '{uploaded_file.name}' uploaded successfully!")
	                        st.balloons()

	                        # Restart the script so the rest of the app sees the
	                        # new document. The `finally` below still runs, but
	                        # keep_temp_file is already True at this point.
	                        st.rerun()

	                    except Exception as e:
	                        st.error(f"❌ Error uploading document: {str(e)}")
	                        st.info("💡 Try uploading a smaller file or refresh the page.")

	            except Exception as e:
	                st.error(f"❌ Error creating temporary file: {str(e)}")
	                st.info("💡 Please try uploading a different PDF file.")
	            finally:
	                # Single cleanup point: covers preview-only runs and every
	                # error path above.
	                if not keep_temp_file:
	                    cleanup_temp_file(temp_path)

	        except Exception as e:
	            st.error(f"❌ Error processing file: {str(e)}")
	            st.info("💡 Please try uploading a different PDF file.")

	    # Add helpful tips
	    with st.expander("💡 Upload Tips"):
	        st.markdown("""
	        For best results:
	        - Use PDF files under 200MB
	        - Ensure PDFs contain text (not just images)
	        - Avoid password-protected PDFs
	        - If upload fails, try refreshing the page

	        Technical details:
	        - Files are temporarily stored on the server
	        - Text extraction uses temporary file processing
	        - Automatic cleanup of temporary files

	        Supported formats: PDF only
	        """)

	def show_my_documents():
	    """Render the "My Documents" page: a searchable list with per-row viewer.

	    Each row shows filename, upload time and type plus a "View" button.
	    When pressed, the viewer prefers text freshly extracted from the
	    document's temp file and falls back to the content stored in session
	    state.
	    """
	    st.title("📁 My Documents")
	    st.markdown("---")

	    all_docs = st.session_state.uploaded_documents.get('documents', [])
	    if not all_docs:
	        st.info("You haven't uploaded any documents yet.")
	        return

	    search_term = st.text_input("🔍 Search documents", placeholder="Enter filename or content...")

	    # Case-insensitive match against the filename or the stored content.
	    visible_docs = all_docs
	    if search_term:
	        needle = search_term.lower()
	        visible_docs = []
	        for candidate in all_docs:
	            if (needle in candidate['filename'].lower()
	                    or needle in candidate.get('content', '').lower()):
	                visible_docs.append(candidate)

	        if not visible_docs:
	            st.warning("No documents match your search criteria.")
	            return

	    for idx, doc in enumerate(visible_docs):
	        with st.container():
	            name_col, time_col, type_col, action_col = st.columns([3, 2, 1, 1])

	            with name_col:
	                st.write(f"{doc['filename']}")

	            with time_col:
	                st.write(doc['upload_time'])

	            with type_col:
	                st.write(f"{doc['file_type']}")

	            with action_col:
	                if st.button(f"View {idx}", key=f"view_{idx}"):
	                    st.subheader(f"📄 {doc['filename']}")
	                    st.write(f"Uploaded: {doc['upload_time']}")
	                    st.write(f"Size: {doc['size']} bytes")

	                    stored_text = doc.get('content')
	                    source_path = doc.get('temp_path')

	                    if source_path and os.path.exists(source_path):
	                        try:
	                            # Prefer a fresh extraction from the temp file.
	                            fresh_text = extract_pdf_text_from_temp(source_path)
	                            if fresh_text.strip():
	                                st.text_area("Document Content (Fresh Extract)", fresh_text, height=400, key=f"fresh_content_{idx}")
	                            elif stored_text:
	                                st.text_area("Document Content (Stored)", stored_text, height=400, key=f"content_{idx}")
	                        except Exception as exc:
	                            st.warning(f"Could not read from temporary file: {str(exc)}")
	                            if stored_text:
	                                st.text_area("Document Content (Stored)", stored_text, height=400, key=f"content_{idx}")
	                    elif stored_text:
	                        # No usable temp file: show what was stored at upload.
	                        st.text_area("Document Content", stored_text, height=400, key=f"content_{idx}")

	            st.markdown("---")

	def show_document_library():
	    """Render the "Document Library" page: searchable list of all documents.

	    Rows show filename, upload time and type plus a "View" button. The
	    viewer shows text re-extracted from the document's temp file when that
	    file exists, otherwise the content stored in session state.
	    """
	    st.title("📚 Document Library")
	    st.markdown("---")

	    library_docs = st.session_state.uploaded_documents.get('documents', [])
	    if not library_docs:
	        st.info("No documents have been uploaded to the system yet.")
	        return

	    search_term = st.text_input("🔍 Search all documents", placeholder="Enter filename or content...")

	    # Case-insensitive match against the filename or the stored content.
	    matches = library_docs
	    if search_term:
	        wanted = search_term.lower()
	        matches = [
	            entry for entry in library_docs
	            if wanted in entry['filename'].lower()
	            or wanted in entry.get('content', '').lower()
	        ]

	        if not matches:
	            st.warning("No documents match your search criteria.")
	            return

	    for row, doc in enumerate(matches):
	        with st.container():
	            name_col, time_col, type_col, action_col = st.columns([3, 2, 1, 1])

	            with name_col:
	                st.write(f"{doc['filename']}")

	            with time_col:
	                st.write(doc['upload_time'])

	            with type_col:
	                st.write(f"{doc['file_type']}")

	            with action_col:
	                if st.button(f"View {row}", key=f"lib_view_{row}"):
	                    st.subheader(f"📄 {doc['filename']}")
	                    st.write(f"Uploaded: {doc['upload_time']}")
	                    st.write(f"Size: {doc['size']} bytes")

	                    fallback_text = doc.get('content')
	                    pdf_path = doc.get('temp_path')

	                    if pdf_path and os.path.exists(pdf_path):
	                        try:
	                            # Prefer a fresh extraction from the temp file.
	                            extracted = extract_pdf_text_from_temp(pdf_path)
	                            if extracted.strip():
	                                st.text_area("Document Content (Fresh Extract)", extracted, height=400, key=f"lib_fresh_content_{row}")
	                            elif fallback_text:
	                                st.text_area("Document Content (Stored)", fallback_text, height=400, key=f"lib_content_{row}")
	                        except Exception as exc:
	                            st.warning(f"Could not read from temporary file: {str(exc)}")
	                            if fallback_text:
	                                st.text_area("Document Content (Stored)", fallback_text, height=400, key=f"lib_content_{row}")
	                    elif fallback_text:
	                        # No usable temp file: show what was stored at upload.
	                        st.text_area("Document Content", fallback_text, height=400, key=f"lib_content_{row}")

	            st.markdown("---")

	def show_settings():
	    """Render the settings page.

	    Sections, in order: static system information, JSON export of all
	    documents, clearing all documents (deleting their temp files first),
	    pruning of ``temp_path`` references whose file no longer exists, and
	    a two-metric system status summary.
	    """
	    st.title("⚙️ Settings")
	    st.markdown("---")

	    st.subheader("🔧 System Information")
	    for info_line in (
	        "Version: Dev LMS v1.0",
	        "Features:",
	        "- PDF document upload with temporary storage",
	        "- Document search and preview",
	        "- Document library",
	        "- Session-based storage",
	    ):
	        st.write(info_line)

	    st.markdown("---")

	    # Export data option
	    if st.button("📥 Export All Data"):
	        export_docs = st.session_state.uploaded_documents.get('documents', [])
	        if not export_docs:
	            st.info("No data to export.")
	        else:
	            export_json = json.dumps(
	                {
	                    'export_date': datetime.now().isoformat(),
	                    'documents': export_docs,
	                },
	                indent=2,
	            )
	            st.download_button(
	                label="Download JSON Export",
	                data=export_json,
	                file_name="lms_data_export.json",
	                mime="application/json",
	            )

	    st.markdown("---")

	    # Clear data option
	    if st.button("🗑️ Clear All Data"):
	        docs_to_clear = st.session_state.uploaded_documents.get('documents')
	        if not docs_to_clear:
	            st.info("No documents to clear.")
	        else:
	            # Delete the backing temp files before dropping the records.
	            for doc in docs_to_clear:
	                if doc.get('temp_path'):
	                    cleanup_temp_file(doc['temp_path'])

	            st.session_state.uploaded_documents['documents'] = []
	            st.success("All documents and temporary files have been cleared!")
	            st.rerun()

	    st.markdown("---")

	    # Cleanup temporary files option
	    if st.button("🧹 Cleanup Temporary Files"):
	        stale_refs = 0
	        for doc in st.session_state.uploaded_documents.get('documents', []):
	            recorded_path = doc.get('temp_path')
	            if recorded_path and not os.path.exists(recorded_path):
	                # The file is gone; drop the dangling reference.
	                doc.pop('temp_path', None)
	                stale_refs += 1

	        if stale_refs > 0:
	            st.success(f"Cleaned up {stale_refs} missing temporary file references!")
	        else:
	            st.info("No cleanup needed - all temporary files are properly managed.")

	    st.markdown("---")

	    # System status
	    st.subheader("📊 System Status")
	    documents = st.session_state.uploaded_documents.get('documents', [])
	    live_temp_files = [
	        doc for doc in documents
	        if doc.get('temp_path') and os.path.exists(doc['temp_path'])
	    ]

	    docs_col, temp_col = st.columns(2)
	    with docs_col:
	        st.metric("Total Documents", len(documents))
	    with temp_col:
	        st.metric("Active Temp Files", len(live_temp_files))

	# Entry point: render the app when this file is executed as a script.
	if __name__ == "__main__":
	    main()