# NOTE: "Spaces: Running" page-header text from the web view this file was copied from; not part of the app.
import glob
import os
from pathlib import Path

import streamlit as st
# Set page configuration. This is kept ahead of every other Streamlit call in
# the file so the page title and layout apply to the whole page.
st.set_page_config(
    page_title="OCR analysis results",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# Custom CSS for a cleaner interface. The <style> tags start at column 0 of the
# string so the Markdown renderer treats the whole block as raw HTML.
st.markdown("""
<style>
    .main { padding-top: 1rem; }
    .stTabs [data-baseweb="tab-list"] {
        gap: 1rem;
        margin-bottom: 1rem;
    }
    .stTabs [data-baseweb="tab"] {
        height: 50px;
        white-space: pre-wrap;
        border-radius: 4px 4px 0 0;
        font-weight: bold;
        letter-spacing: 1px;
    }
    .image-container {
        display: flex;
        justify-content: center;
    }
    .markdown-container {
        margin-top: 2rem;
        border: 1px solid #f0f0f0;
        padding: 1rem;
        border-radius: 5px;
        background-color: #f9f9f9;
    }
    .toggle-container {
        margin-bottom: 1rem;
    }
</style>
""", unsafe_allow_html=True)
def get_all_docs(repo_path="markdowns"):
    """Collect every document directory found two levels below ``repo_path``.

    The expected layout is ``repo_path/<folder_id>/<doc_id>/``. Only
    directories are considered at both levels; plain files are ignored, and
    so are dot-prefixed names (the ``*`` glob pattern does not match them).

    Args:
        repo_path: Root directory to scan. It is taken literally: glob
            metacharacters such as ``[`` or ``*`` in the path are escaped.

    Returns:
        A sorted list of ``(doc_id, doc_path)`` tuples, where ``doc_id`` is
        the base name of the document directory. Returns an empty list when
        ``repo_path`` is not an existing directory.
    """
    if not os.path.isdir(repo_path):
        return []

    all_docs = []
    # glob.escape keeps brackets/wildcards in real directory names from being
    # interpreted as patterns, which would otherwise silently match nothing.
    folder_pattern = os.path.join(glob.escape(repo_path), "*")
    for folder_path in glob.glob(folder_pattern):
        if not os.path.isdir(folder_path):
            continue
        doc_pattern = os.path.join(glob.escape(folder_path), "*")
        for doc_path in glob.glob(doc_pattern):
            if os.path.isdir(doc_path):
                all_docs.append((os.path.basename(doc_path), doc_path))

    # glob order follows the filesystem; sort so callers get a stable order.
    all_docs.sort()
    return all_docs
def read_markdown_file(file_path):
    """Read a UTF-8 markdown file and return its text.

    Failures are reported in-band so the caller can display them directly
    instead of handling exceptions.

    Args:
        file_path: Path of the markdown file to read.

    Returns:
        The file contents on success; ``"Markdown file not found."`` when the
        path does not exist; otherwise a string starting with
        ``"Error reading markdown file: "`` followed by the error text (for
        example when the path is a directory or the bytes are not valid UTF-8).
    """
    # Open directly rather than checking os.path.exists first: the file could
    # disappear between the check and the open.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except (FileNotFoundError, NotADirectoryError):
        return "Markdown file not found."
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError raised for non-UTF-8 content.
        return f"Error reading markdown file: {str(e)}"
def display_model_outputs(doc_path):
    """Render one tab per ``*.md`` file in ``doc_path``, raw or parsed.

    Each markdown file is treated as the output of one model; the file name
    without its ``.md`` extension is the model name, shown upper-cased as the
    tab label. A "Show Parsed Markdown" checkbox switches all tabs between the
    raw text (code block) and rendered markdown. The checkbox value is
    mirrored into ``st.session_state.show_parsed``.

    Args:
        doc_path: Directory of the current document. Taken literally; glob
            metacharacters in the path are escaped.

    Returns:
        None. Shows a warning and returns early when no markdown files exist.
    """
    md_files = glob.glob(os.path.join(glob.escape(doc_path), "*.md"))
    if not md_files:
        st.warning("No markdown files found for this document")
        return

    # Map model name -> actual file path. splitext strips only the trailing
    # extension, and keeping the globbed path avoids rebuilding it from a name
    # that may have been altered (e.g. "v1.mdx.md").
    md_by_model = {
        os.path.splitext(os.path.basename(md_file))[0]: md_file
        for md_file in md_files
    }
    model_names = sorted(md_by_model)  # Ensure consistent tab order
    display_names = [name.upper() for name in model_names]

    # Initialize show_parsed in session_state if not already set
    if 'show_parsed' not in st.session_state:
        st.session_state.show_parsed = False

    def sync_show_parsed():
        # Copy the widget's value into the persistent flag on every toggle.
        st.session_state.show_parsed = st.session_state.parsed_markdown_toggle

    # Toggle for raw/parsed markdown that preserves state
    st.markdown("<div class='toggle-container'>", unsafe_allow_html=True)
    show_parsed = st.checkbox(
        "Show Parsed Markdown",
        value=st.session_state.show_parsed,
        key="parsed_markdown_toggle",
        on_change=sync_show_parsed,
    )
    st.markdown("</div>", unsafe_allow_html=True)

    # Create tabs for each model and fill them in the same order
    for tab, model_name in zip(st.tabs(display_names), model_names):
        md_content = read_markdown_file(md_by_model[model_name])
        with tab:
            if show_parsed:
                # NOTE(review): unsafe_allow_html renders any HTML contained
                # in the model output; confirm the markdown files are trusted.
                st.markdown(md_content, unsafe_allow_html=True)
            else:
                st.markdown("<div class='markdown-container'>", unsafe_allow_html=True)
                st.code(md_content, language="markdown")
                st.markdown("</div>", unsafe_allow_html=True)
def main():
    """Run the Streamlit app: pick a document, show its image and model outputs.

    Documents are discovered under the fixed ``markdowns`` directory and
    sorted. The index of the document being shown lives in
    ``st.session_state.current_index``; the Previous/Next buttons move it with
    wrap-around and trigger a rerun. For the current document the function
    shows ``image.jpg`` (when present) followed by the per-model markdown tabs.
    """
    st.title("Document Analysis Leaderboard")

    # Get all doc-ids from the fixed repository path
    repo_path = "markdowns"
    with st.spinner("Loading documents..."):
        all_docs = get_all_docs(repo_path)

    if not all_docs:
        st.error(f"No documents found in {repo_path}. Please check the directory structure.")
        if os.path.exists(repo_path):
            st.info(f"The path {repo_path} exists, but no documents were found.")
        else:
            st.info(f"The path {repo_path} does not exist.")
        return

    # Sort docs by doc_id for consistent ordering
    all_docs.sort()

    # Initialize session state for current index
    if 'current_index' not in st.session_state:
        st.session_state.current_index = 0

    # Ensure current_index is within bounds (the document list may have
    # shrunk since the index was stored)
    st.session_state.current_index = min(st.session_state.current_index, len(all_docs) - 1)

    # Current document info
    doc_id, doc_path = all_docs[st.session_state.current_index]

    # Navigation buttons. The arrow glyphs are written as escapes so the
    # labels survive the file being re-saved in a non-UTF-8 encoding.
    col1, col2, col3 = st.columns([1, 4, 1])
    with col1:
        if st.button("\u2190 Previous", use_container_width=True):
            # Modulo makes navigation wrap from the first document to the last.
            st.session_state.current_index = (st.session_state.current_index - 1) % len(all_docs)
            st.rerun()
    with col2:
        st.markdown(f"### Document: {doc_id}")
        st.caption(f"Document {st.session_state.current_index + 1} of {len(all_docs)}")
    with col3:
        if st.button("Next \u2192", use_container_width=True):
            st.session_state.current_index = (st.session_state.current_index + 1) % len(all_docs)
            st.rerun()

    # Display document image with reduced size and centered
    image_path = os.path.join(doc_path, "image.jpg")
    try:
        if os.path.exists(image_path):
            # Use columns to center and size the image
            col1, col2, col3 = st.columns([1, 2, 1])
            with col2:
                st.markdown("<div class='image-container'>", unsafe_allow_html=True)
                st.image(image_path, width=500)  # Fixed width for smaller size
                st.markdown("</div>", unsafe_allow_html=True)
        else:
            st.info("Image not available for this document")
    except Exception as e:
        # Best-effort: an unreadable image must not block the model outputs.
        st.error(f"Error loading image: {str(e)}")

    # Add separator between image and model outputs
    st.markdown("<hr style='margin: 2rem 0;'>", unsafe_allow_html=True)

    # Display model outputs
    display_model_outputs(doc_path)
# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()