Spaces:

docsumo
/

ocr-results

Running

App Files Files

xet

Community

ocr-results / app.py

spookie-boogie

Update the markdown folder name

84dea15 verified 7 months ago

raw

history blame contribute delete

6.53 kB

	import streamlit as st
	import os
	import glob
	from pathlib import Path

	# Configure the page: browser-tab title, wide layout, and a sidebar that
	# starts collapsed.
	# NOTE(review): Streamlit expects set_page_config to run before other st.*
	# calls in the script — confirm nothing gets moved above it.
	st.set_page_config(
	page_title="OCR analysis results",
	layout="wide",
	initial_sidebar_state="collapsed"
	)

	# Inject a custom stylesheet. It restyles the tab bar (.stTabs selectors) and
	# defines the .image-container, .markdown-container and .toggle-container
	# classes that later st.markdown("<div class=...>") calls in this file refer to.
	# unsafe_allow_html=True is needed so the <style> tag is passed through as HTML
	# rather than shown as text.
	st.markdown("""
	<style>
	.main { padding-top: 1rem; }
	.stTabs [data-baseweb="tab-list"] {
	gap: 1rem;
	margin-bottom: 1rem;
	}
	.stTabs [data-baseweb="tab"] {
	height: 50px;
	white-space: pre-wrap;
	border-radius: 4px 4px 0 0;
	font-weight: bold;
	letter-spacing: 1px;
	}
	.image-container {
	display: flex;
	justify-content: center;
	}
	.markdown-container {
	margin-top: 2rem;
	border: 1px solid #f0f0f0;
	padding: 1rem;
	border-radius: 5px;
	background-color: #f9f9f9;
	}
	.toggle-container {
	margin-bottom: 1rem;
	}
	</style>
	""", unsafe_allow_html=True)

	def get_all_docs(repo_path="markdowns"):
	    """Collect every document directory found two levels below ``repo_path``.

	    Expected layout: ``<repo_path>/<folder_id>/<doc_id>/``. Only directories
	    are considered at both levels; plain files are ignored. Folder ids are
	    used to walk the tree but are not part of the returned ids.

	    Args:
	        repo_path: Root directory to scan. Defaults to ``"markdowns"``.

	    Returns:
	        A list of ``(doc_id, doc_path)`` tuples, where ``doc_id`` is the base
	        name of the document directory and ``doc_path`` is its path built on
	        top of ``repo_path``. Returns an empty list when ``repo_path`` does
	        not exist. The list is in glob order (not sorted).
	    """
	    if not os.path.exists(repo_path):
	        return []

	    all_docs = []

	    # glob.escape keeps characters such as "[" or "*" in real directory names
	    # from being interpreted as pattern syntax; only the trailing "*" is a
	    # wildcard.
	    folder_pattern = os.path.join(glob.escape(repo_path), '*')
	    folder_paths = [f for f in glob.glob(folder_pattern) if os.path.isdir(f)]

	    # For each folder, every sub-directory is one document.
	    for folder_path in folder_paths:
	        doc_pattern = os.path.join(glob.escape(folder_path), '*')
	        for doc_path in glob.glob(doc_pattern):
	            if os.path.isdir(doc_path):
	                all_docs.append((os.path.basename(doc_path), doc_path))

	    return all_docs

	def read_markdown_file(file_path):
	    """Return the UTF-8 text of a markdown file, or a message if it can't be read.

	    This helper reports problems as text instead of raising, so the caller
	    can display the result directly.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        The file contents on success; ``"Markdown file not found."`` when the
	        file does not exist; otherwise a string starting with
	        ``"Error reading markdown file: "`` followed by the error text.
	    """
	    # Open directly rather than checking os.path.exists first: the file could
	    # disappear between the check and the open.
	    try:
	        with open(file_path, 'r', encoding='utf-8') as f:
	            return f.read()
	    except FileNotFoundError:
	        return "Markdown file not found."
	    except (OSError, ValueError) as e:
	        # OSError: unreadable path (permissions, is a directory, ...).
	        # ValueError: includes UnicodeDecodeError for non-UTF-8 content.
	        return f"Error reading markdown file: {str(e)}"

	def display_model_outputs(doc_path):
	    """Render one tab per ``*.md`` file in ``doc_path`` with that file's content.

	    Each markdown file is treated as the output of one model; the file name
	    without its ``.md`` extension is the model name, shown upper-cased as the
	    tab label. A checkbox switches every tab between the raw markdown source
	    (code block) and the rendered markdown.

	    Args:
	        doc_path: Directory of the current document.

	    Returns:
	        None. Shows a warning and returns early when the directory contains
	        no markdown files.
	    """
	    # Escape the directory so characters like "[" in its name are matched
	    # literally; only "*.md" acts as a pattern.
	    md_files = glob.glob(os.path.join(glob.escape(doc_path), "*.md"))

	    if not md_files:
	        st.warning("No markdown files found for this document")
	        return

	    # Pair each model name with the real path it came from. splitext removes
	    # only the final extension, so a name that itself contains ".md" (e.g.
	    # "model.md.v2.md") is kept intact and the file is still found.
	    # Sorting the pairs gives a consistent tab order by model name.
	    models = sorted(
	        (os.path.splitext(os.path.basename(md_file))[0], md_file)
	        for md_file in md_files
	    )

	    # Tab labels are the upper-cased model names.
	    display_names = [name.upper() for name, _ in models]

	    # Initialize show_parsed in session_state if not already set
	    if 'show_parsed' not in st.session_state:
	        st.session_state.show_parsed = False

	    # Toggle for raw/parsed markdown. The on_change callback copies the widget
	    # value into session_state.show_parsed so the choice is remembered and
	    # used as the checkbox's initial value on later runs.
	    # NOTE(review): each st.markdown call is likely rendered as its own
	    # element, so these opening/closing <div> pairs may not actually wrap the
	    # widget between them — confirm in the rendered page.
	    st.markdown("<div class='toggle-container'>", unsafe_allow_html=True)
	    show_parsed = st.checkbox(
	        "Show Parsed Markdown",
	        value=st.session_state.show_parsed,
	        key="parsed_markdown_toggle",
	        on_change=lambda: setattr(st.session_state, 'show_parsed', st.session_state.parsed_markdown_toggle)
	    )
	    st.markdown("</div>", unsafe_allow_html=True)

	    # Create tabs for each model
	    tabs = st.tabs(display_names)

	    for tab, (_, md_path) in zip(tabs, models):
	        md_content = read_markdown_file(md_path)

	        with tab:
	            if show_parsed:
	                # NOTE(review): file content is rendered with HTML enabled;
	                # this is only safe if the markdown files are trusted.
	                st.markdown(md_content, unsafe_allow_html=True)
	            else:
	                st.markdown("<div class='markdown-container'>", unsafe_allow_html=True)
	                st.code(md_content, language="markdown")
	                st.markdown("</div>", unsafe_allow_html=True)

	def main():
	    """Run the Streamlit app: choose a document, show its image and model outputs.

	    Documents are discovered under the fixed ``"markdowns"`` directory via
	    ``get_all_docs``. The index of the document being viewed is kept in
	    ``st.session_state.current_index`` so it survives reruns; the Previous
	    and Next buttons move it with wrap-around and trigger a rerun. For the
	    current document the function shows ``image.jpg`` from its directory (if
	    present) followed by the per-model markdown tabs.

	    Returns:
	        None. Returns early, after showing an error, when no documents are
	        found.
	    """
	    st.title("Document Analysis Leaderboard")

	    # Get all doc-ids from the fixed repository path
	    repo_path = "markdowns"

	    with st.spinner("Loading documents..."):
	        all_docs = get_all_docs(repo_path)

	    if not all_docs:
	        st.error(f"No documents found in {repo_path}. Please check the directory structure.")
	        # Tell the user whether the root is missing or merely empty.
	        if os.path.exists(repo_path):
	            st.info(f"The path {repo_path} exists, but no documents were found.")
	        else:
	            st.info(f"The path {repo_path} does not exist.")
	        return

	    # Sort (doc_id, doc_path) tuples so navigation order is stable across reruns
	    all_docs.sort()

	    # Initialize session state for current index
	    if 'current_index' not in st.session_state:
	        st.session_state.current_index = 0

	    # Keep the stored index valid if there are now fewer documents than before
	    st.session_state.current_index = min(st.session_state.current_index, len(all_docs) - 1)

	    # Current document info
	    doc_id, doc_path = all_docs[st.session_state.current_index]

	    # Navigation buttons: narrow side columns for the buttons, wide centre
	    # column for the document heading.
	    col1, col2, col3 = st.columns([1, 4, 1])

	    with col1:
	        if st.button("← Previous", use_container_width=True):
	            # Modulo makes "previous" on the first document wrap to the last.
	            st.session_state.current_index = (st.session_state.current_index - 1) % len(all_docs)
	            st.rerun()

	    with col2:
	        st.markdown(f"### Document: {doc_id}")
	        st.caption(f"Document {st.session_state.current_index + 1} of {len(all_docs)}")

	    with col3:
	        if st.button("Next →", use_container_width=True):
	            # Modulo makes "next" on the last document wrap to the first.
	            st.session_state.current_index = (st.session_state.current_index + 1) % len(all_docs)
	            st.rerun()

	    # Display document image with reduced size and centered
	    image_path = os.path.join(doc_path, "image.jpg")

	    try:
	        if os.path.exists(image_path):
	            # Use columns to center and size the image
	            col1, col2, col3 = st.columns([1, 2, 1])
	            with col2:
	                st.markdown("<div class='image-container'>", unsafe_allow_html=True)
	                st.image(image_path, width=500)  # Fixed width for smaller size
	                st.markdown("</div>", unsafe_allow_html=True)
	        else:
	            st.info("Image not available for this document")
	    except Exception as e:
	        # Best effort: a broken image must not stop the model outputs from showing.
	        st.error(f"Error loading image: {str(e)}")

	    # Add separator between image and model outputs
	    st.markdown("<hr style='margin: 2rem 0;'>", unsafe_allow_html=True)

	    # Display model outputs
	    display_model_outputs(doc_path)

	# Entry point: start the app when this file is executed as a script.
	if __name__ == "__main__":
	    main()