Spaces:

impresso-project
/

ocrqa-demo

Sleeping

App Files Community

simon-clmtd committed on Sep 12

Commit

31e0f17

verified ·

1 Parent(s): 76cc6b5

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -17

app.py CHANGED Viewed

@@ -59,9 +59,9 @@ def process_ocr_qa(text, lang_choice):
                 # Unknown tokens (potential OCR errors)
                 if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
                     unknown_tokens = diagnostics['unknown_tokens']
-                    output_lines.append(f"❌ Potential OCR errors ({len(unknown_tokens)}): {', '.join(unknown_tokens)}")
                 elif 'unknown_tokens' in diagnostics:
-                    output_lines.append("✨ No potential OCR errors detected!")
             # Other fields
             for key, value in result.items():
@@ -81,7 +81,7 @@ with gr.Blocks(title="OCR QA Demo") as demo:
     gr.HTML(
     """
     <a href="https://impresso-project.ch" target="_blank">
-        <img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg" alt="Impresso Project Logo" style="height: 100px;">
     </a>
     """
 )
@@ -102,37 +102,56 @@ with gr.Blocks(title="OCR QA Demo") as demo:
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(
-                label="Enter OCR Text",
                 value=EXAMPLE_TEXT,
                 lines=8,
-                placeholder="Enter your OCR text here..."
             )
             lang_dropdown = gr.Dropdown(
-                choices=["Auto-detect"] + LANGUAGES,
                 value="de",
-                label="Language"
             )
-            submit_btn = gr.Button("🔍 Analyze OCR Quality", variant="primary")
         with gr.Column():
             with gr.Row():
                 output = gr.Textbox(
-                    label="Analysis Results",
                     lines=15,
-                    placeholder="Results will appear here...",
                     scale=10
                 )
-                info_btn = gr.Button("Pipeline Info", size="sm", scale=1)
     # Info modal/accordion for pipeline details
-    with gr.Accordion("📝 About the OCR QA Pipeline", open=False, visible=False) as info_accordion:
         gr.Markdown(
             """
-            - **Quality Score**: Evaluates the overall quality of OCR text. From 0.0 (poor) to 1.0 (excellent)
-            - **Known tokens**: Words recognized as valid in the selected language
-            - **Potential OCR errors**: Identifies common OCR mistakes and artifacts
-            """
-        )
     submit_btn.click(
         fn=process_ocr_qa,

                 # Unknown tokens (potential OCR errors)
                 if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
                     unknown_tokens = diagnostics['unknown_tokens']
+                    output_lines.append(f"❌ Unrecognized tokens ({len(unknown_tokens)}): {', '.join(unknown_tokens)}")
                 elif 'unknown_tokens' in diagnostics:
+                    output_lines.append("✨ All tokens matched known lexicons – no OCR errors detected.")
             # Other fields
             for key, value in result.items():
     gr.HTML(
     """
     <a href="https://impresso-project.ch" target="_blank">
+        <img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg" alt="Impresso Project Logo" style="height: 84px;">
     </a>
     """
 )
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(
+                label="OCR Text (from digitized sources)",
                 value=EXAMPLE_TEXT,
                 lines=8,
+                placeholder="Paste OCR-processed text from a historical document..."
             )
             lang_dropdown = gr.Dropdown(
+                choices=LANGUAGES,
                 value="de",
+                label="Language of the Text"
             )
+            submit_btn = gr.Button("🔍 Assess OCR Text Quality", variant="primary")
         with gr.Column():
             with gr.Row():
                 output = gr.Textbox(
+                    label="OCR Quality Report",
                     lines=15,
+                    placeholder="The quality assessment will appear here...",
                     scale=10
                 )
+                info_btn = gr.Button("Demo Info", size="md", scale=1)
     # Info modal/accordion for pipeline details
+    with gr.Accordion("📝 About the OCR QA Method", open=False, visible=False) as info_accordion:
         gr.Markdown(
             """
+        ### 📝 About the OCR QA Method
+    This pipeline estimates OCR quality by analyzing the proportion of **unique words** in a text that match curated wordlists for a given language.
+    #### How it works:
+    - **Scoring**: The quality score ranges from **0.0** (poor) to **1.0** (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
+    - **Lexical resources**: Words are matched against precompiled lists derived from **Wikipedia** and **Wortschatz Leipzig**, using **Bloom filters** for fast, memory-efficient lookup.
+    - **Multilingual support**: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
+    - **Diagnostics output**:
+        - ✅ **Known tokens**: Words found in the reference wordlist, presumed correctly OCR’d.
+        - ❌ **Unrecognized tokens**: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
+    #### ⚠️ Limitations:
+    - The wordlists are **not exhaustive**, particularly for **historical vocabulary**, **dialects**, or **named entities**.
+    - The method may fail to flag **short OCR artifacts** (e.g., 1–2 character noise) and **non-alphabetic symbols**.
+    As such, the score should be understood as a **heuristic indicator**, best used for:
+    - Comparative assessments between OCR outputs
+    - Filtering low-quality text from large corpora
+    - Supporting decisions in corpus preparation and annotation workflows
+    It is **not a substitute for manual inspection** or ground-truth evaluation.
+    """
+)
     submit_btn.click(
         fn=process_ocr_qa,