# ocr-results / app.py
# spookie-boogie's picture
# Update the markdown folder name
# 84dea15 verified
import streamlit as st
import os
import glob
from pathlib import Path
# Configure the page: title, wide layout, and a sidebar that starts collapsed.
_PAGE_SETTINGS = {
    "page_title": "OCR analysis results",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_PAGE_SETTINGS)
# Inject the app's stylesheet: tab spacing/typography plus the image, markdown
# and toggle wrapper classes referenced by the HTML snippets emitted below.
_CUSTOM_CSS = """
<style>
.main { padding-top: 1rem; }
.stTabs [data-baseweb="tab-list"] {
gap: 1rem;
margin-bottom: 1rem;
}
.stTabs [data-baseweb="tab"] {
height: 50px;
white-space: pre-wrap;
border-radius: 4px 4px 0 0;
font-weight: bold;
letter-spacing: 1px;
}
.image-container {
display: flex;
justify-content: center;
}
.markdown-container {
margin-top: 2rem;
border: 1px solid #f0f0f0;
padding: 1rem;
border-radius: 5px;
background-color: #f9f9f9;
}
.toggle-container {
margin-bottom: 1rem;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
def get_all_docs(repo_path="markdowns"):
    """Collect every document directory under the two-level markdowns tree.

    Expected layout: ``repo_path/folder_id/doc_id/``. Folder IDs are only
    used to locate documents; they are not part of the returned IDs.

    Args:
        repo_path: Root directory that holds the ``folder_id`` directories.

    Returns:
        A list of ``(doc_id, doc_path)`` tuples, where ``doc_id`` is the
        directory name and ``doc_path`` is ``repo_path/folder_id/doc_id``.
        Entries are ordered by folder name, then by document name. The list
        is empty when ``repo_path`` is missing or is not a directory.
    """
    if not os.path.isdir(repo_path):
        return []

    def _visible_subdirs(parent):
        """Return the sorted paths of non-hidden directories inside *parent*."""
        try:
            names = sorted(os.listdir(parent))
        except OSError:
            # Unreadable directory: treat it as empty rather than crash the UI.
            return []
        subdirs = []
        for name in names:
            # Names are used literally, so characters such as "[" or "?" in a
            # directory name cannot be misread as glob pattern syntax.
            # Dot-entries are skipped, matching what a "*" glob would return.
            if name.startswith("."):
                continue
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                subdirs.append(path)
        return subdirs

    all_docs = []
    for folder_path in _visible_subdirs(repo_path):
        for doc_path in _visible_subdirs(folder_path):
            all_docs.append((os.path.basename(doc_path), doc_path))
    return all_docs
def read_markdown_file(file_path):
    """Return the UTF-8 text of a markdown file, or a message if it cannot be read.

    A missing path yields ``"Markdown file not found."``; any failure while
    opening or decoding yields ``"Error reading markdown file: <reason>"``.
    The function never raises, so the result can be displayed directly.
    """
    # Guard clause: nothing to read when the path does not exist.
    if not os.path.exists(file_path):
        return "Markdown file not found."

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
    except Exception as exc:
        return f"Error reading markdown file: {str(exc)}"
    return text
def display_model_outputs(doc_path):
    """Render each model's markdown output for one document in its own tab.

    Every ``*.md`` file directly inside ``doc_path`` is treated as one model's
    output: the file name without its extension is the model name, and its
    upper-cased form is the tab label. A checkbox switches all tabs between
    the raw markdown source (shown as a code block) and rendered markdown.

    Args:
        doc_path: Directory holding the document's per-model ``.md`` files.
    """
    # Escape the directory part so glob metacharacters in it ("[", "?", "*")
    # are matched literally; only the "*.md" suffix acts as a pattern.
    md_files = glob.glob(os.path.join(glob.escape(doc_path), "*.md"))
    if not md_files:
        st.warning("No markdown files found for this document")
        return

    # Strip only the trailing extension. A blanket replace(".md", "") would
    # also remove ".md" from the middle of a name, and the path rebuilt from
    # that shortened name would not exist. Keeping the real path next to the
    # name avoids rebuilding it at all. Sorting gives a stable tab order.
    models = sorted(
        (os.path.splitext(os.path.basename(md_file))[0], md_file)
        for md_file in md_files
    )

    # Remember the last toggle choice under its own session key; it seeds the
    # checkbox's initial value below.
    # NOTE(review): presumably this mirror exists so the choice survives runs
    # in which the checkbox is not rendered - confirm against Streamlit's
    # widget-state lifetime rules.
    if 'show_parsed' not in st.session_state:
        st.session_state.show_parsed = False

    def _remember_toggle():
        """Copy the checkbox's widget state into the persistent session key."""
        st.session_state.show_parsed = st.session_state.parsed_markdown_toggle

    st.markdown("<div class='toggle-container'>", unsafe_allow_html=True)
    show_parsed = st.checkbox(
        "Show Parsed Markdown",
        value=st.session_state.show_parsed,
        key="parsed_markdown_toggle",
        on_change=_remember_toggle,
    )
    st.markdown("</div>", unsafe_allow_html=True)

    # One tab per model, labelled with the upper-cased model name.
    tabs = st.tabs([name.upper() for name, _ in models])
    for tab, (_, md_path) in zip(tabs, models):
        md_content = read_markdown_file(md_path)
        with tab:
            if show_parsed:
                # HTML embedded in the markdown is rendered, not escaped.
                st.markdown(md_content, unsafe_allow_html=True)
            else:
                st.markdown("<div class='markdown-container'>", unsafe_allow_html=True)
                st.code(md_content, language="markdown")
                st.markdown("</div>", unsafe_allow_html=True)
def main():
    """Run the Streamlit app.

    Loads all documents from the fixed ``markdowns`` directory, shows one at a
    time with wrap-around Previous/Next navigation, displays the document's
    ``image.jpg`` when present, and renders the per-model markdown outputs.
    The selected position is kept in ``st.session_state.current_index``.
    """
    st.title("Document Analysis Leaderboard")

    # Get all doc-ids from the fixed repository path
    repo_path = "markdowns"
    with st.spinner("Loading documents..."):
        all_docs = get_all_docs(repo_path)

    if not all_docs:
        st.error(f"No documents found in {repo_path}. Please check the directory structure.")
        if os.path.exists(repo_path):
            st.info(f"The path {repo_path} exists, but no documents were found.")
        else:
            st.info(f"The path {repo_path} does not exist.")
        return

    # Sort docs by doc_id for consistent ordering
    all_docs.sort()
    total_docs = len(all_docs)

    if 'current_index' not in st.session_state:
        st.session_state.current_index = 0
    # The document set may have shrunk since the index was stored; clamp it.
    st.session_state.current_index = min(st.session_state.current_index, total_docs - 1)

    doc_id, doc_path = all_docs[st.session_state.current_index]

    # Navigation row. The labels use \u escapes for the arrows so the source
    # stays ASCII and the glyphs cannot be corrupted by a wrong file encoding.
    prev_col, title_col, next_col = st.columns([1, 4, 1])
    with prev_col:
        if st.button("\u2190 Previous", use_container_width=True):
            # Modulo wraps from the first document back to the last.
            st.session_state.current_index = (st.session_state.current_index - 1) % total_docs
            # doc_id/doc_path above were read before the click; rerun so the
            # page is rebuilt for the newly selected document.
            st.rerun()
    with title_col:
        st.markdown(f"### Document: {doc_id}")
        st.caption(f"Document {st.session_state.current_index + 1} of {total_docs}")
    with next_col:
        if st.button("Next \u2192", use_container_width=True):
            st.session_state.current_index = (st.session_state.current_index + 1) % total_docs
            st.rerun()

    # Display document image with reduced size and centered
    image_path = os.path.join(doc_path, "image.jpg")
    try:
        if os.path.exists(image_path):
            # The outer columns act as margins; the image goes in the middle.
            _, image_col, _ = st.columns([1, 2, 1])
            with image_col:
                st.markdown("<div class='image-container'>", unsafe_allow_html=True)
                st.image(image_path, width=500)  # Fixed width for smaller size
                st.markdown("</div>", unsafe_allow_html=True)
        else:
            st.info("Image not available for this document")
    except Exception as e:
        # UI boundary: report the problem in the page instead of aborting it.
        st.error(f"Error loading image: {str(e)}")

    # Add separator between image and model outputs
    st.markdown("<hr style='margin: 2rem 0;'>", unsafe_allow_html=True)

    display_model_outputs(doc_path)
# Start the app only when this file is run as the main script, not on import.
if __name__ == "__main__":
    main()