Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -59,9 +59,9 @@ def process_ocr_qa(text, lang_choice):
|
|
| 59 |
# Unknown tokens (potential OCR errors)
|
| 60 |
if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
|
| 61 |
unknown_tokens = diagnostics['unknown_tokens']
|
| 62 |
-
output_lines.append(f"β
|
| 63 |
elif 'unknown_tokens' in diagnostics:
|
| 64 |
-
output_lines.append("β¨
|
| 65 |
|
| 66 |
# Other fields
|
| 67 |
for key, value in result.items():
|
|
@@ -81,7 +81,7 @@ with gr.Blocks(title="OCR QA Demo") as demo:
|
|
| 81 |
gr.HTML(
|
| 82 |
"""
|
| 83 |
<a href="https://impresso-project.ch" target="_blank">
|
| 84 |
-
<img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg" alt="Impresso Project Logo" style="height:
|
| 85 |
</a>
|
| 86 |
"""
|
| 87 |
)
|
|
@@ -102,37 +102,56 @@ with gr.Blocks(title="OCR QA Demo") as demo:
|
|
| 102 |
with gr.Row():
|
| 103 |
with gr.Column():
|
| 104 |
text_input = gr.Textbox(
|
| 105 |
-
label="
|
| 106 |
value=EXAMPLE_TEXT,
|
| 107 |
lines=8,
|
| 108 |
-
placeholder="
|
| 109 |
)
|
| 110 |
lang_dropdown = gr.Dropdown(
|
| 111 |
-
choices=
|
| 112 |
value="de",
|
| 113 |
-
label="Language"
|
| 114 |
)
|
| 115 |
-
submit_btn = gr.Button("π
|
| 116 |
|
| 117 |
with gr.Column():
|
| 118 |
with gr.Row():
|
| 119 |
output = gr.Textbox(
|
| 120 |
-
label="
|
| 121 |
lines=15,
|
| 122 |
-
placeholder="
|
| 123 |
scale=10
|
| 124 |
)
|
| 125 |
-
info_btn = gr.Button("
|
| 126 |
|
| 127 |
# Info modal/accordion for pipeline details
|
| 128 |
-
with gr.Accordion("π About the OCR QA
|
| 129 |
gr.Markdown(
|
| 130 |
"""
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
submit_btn.click(
|
| 138 |
fn=process_ocr_qa,
|
|
|
|
| 59 |
# Unknown tokens (potential OCR errors)
|
| 60 |
if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
|
| 61 |
unknown_tokens = diagnostics['unknown_tokens']
|
| 62 |
+
output_lines.append(f"❌ Unrecognized tokens ({len(unknown_tokens)}): {', '.join(unknown_tokens)}")
|
| 63 |
elif 'unknown_tokens' in diagnostics:
|
| 64 |
+
output_lines.append("✨ All tokens matched known lexicons — no OCR errors detected.")
|
| 65 |
|
| 66 |
# Other fields
|
| 67 |
for key, value in result.items():
|
|
|
|
| 81 |
gr.HTML(
|
| 82 |
"""
|
| 83 |
<a href="https://impresso-project.ch" target="_blank">
|
| 84 |
+
<img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg" alt="Impresso Project Logo" style="height: 84px;">
|
| 85 |
</a>
|
| 86 |
"""
|
| 87 |
)
|
|
|
|
| 102 |
with gr.Row():
|
| 103 |
with gr.Column():
|
| 104 |
text_input = gr.Textbox(
|
| 105 |
+
label="OCR Text (from digitized sources)",
|
| 106 |
value=EXAMPLE_TEXT,
|
| 107 |
lines=8,
|
| 108 |
+
placeholder="Paste OCR-processed text from a historical document..."
|
| 109 |
)
|
| 110 |
lang_dropdown = gr.Dropdown(
|
| 111 |
+
choices=LANGUAGES,
|
| 112 |
value="de",
|
| 113 |
+
label="Language of the Text"
|
| 114 |
)
|
| 115 |
+
submit_btn = gr.Button("π Assess OCR Text Quality", variant="primary")
|
| 116 |
|
| 117 |
with gr.Column():
|
| 118 |
with gr.Row():
|
| 119 |
output = gr.Textbox(
|
| 120 |
+
label="OCR Quality Report",
|
| 121 |
lines=15,
|
| 122 |
+
placeholder="The quality assessment will appear here...",
|
| 123 |
scale=10
|
| 124 |
)
|
| 125 |
+
info_btn = gr.Button("Demo Info", size="md", scale=1)
|
| 126 |
|
| 127 |
# Info modal/accordion for pipeline details
|
| 128 |
+
with gr.Accordion("π About the OCR QA Method", open=False, visible=False) as info_accordion:
|
| 129 |
gr.Markdown(
|
| 130 |
"""
|
| 131 |
+
### π About the OCR QA Method
|
| 132 |
+
|
| 133 |
+
This pipeline estimates OCR quality by analyzing the proportion of **unique words** in a text that match curated wordlists for a given language.
|
| 134 |
+
|
| 135 |
+
#### How it works:
|
| 136 |
+
- **Scoring**: The quality score ranges from **0.0** (poor) to **1.0** (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
|
| 137 |
+
- **Lexical resources**: Words are matched against precompiled lists derived from **Wikipedia** and **Wortschatz Leipzig**, using **Bloom filters** for fast, memory-efficient lookup.
|
| 138 |
+
- **Multilingual support**: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
|
| 139 |
+
- **Diagnostics output**:
|
| 140 |
+
- ✅ **Known tokens**: Words found in the reference wordlist, presumed correctly OCR’d.
|
| 141 |
+
- ❌ **Unrecognized tokens**: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
|
| 142 |
+
|
| 143 |
+
#### ⚠️ Limitations:
|
| 144 |
+
- The wordlists are **not exhaustive**, particularly for **historical vocabulary**, **dialects**, or **named entities**.
|
| 145 |
+
- The method may fail to flag **short OCR artifacts** (e.g., 1–2 character noise) and **non-alphabetic symbols**.
|
| 146 |
+
|
| 147 |
+
As such, the score should be understood as a **heuristic indicator**, best used for:
|
| 148 |
+
- Comparative assessments between OCR outputs
|
| 149 |
+
- Filtering low-quality text from large corpora
|
| 150 |
+
- Supporting decisions in corpus preparation and annotation workflows
|
| 151 |
+
|
| 152 |
+
It is **not a substitute for manual inspection** or ground-truth evaluation.
|
| 153 |
+
"""
|
| 154 |
+
)
|
| 155 |
|
| 156 |
submit_btn.click(
|
| 157 |
fn=process_ocr_qa,
|