simon-clmtd committed on
Commit
31e0f17
·
verified ·
1 Parent(s): 76cc6b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -17
app.py CHANGED
@@ -59,9 +59,9 @@ def process_ocr_qa(text, lang_choice):
59
  # Unknown tokens (potential OCR errors)
60
  if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
61
  unknown_tokens = diagnostics['unknown_tokens']
62
- output_lines.append(f"❌ Potential OCR errors ({len(unknown_tokens)}): {', '.join(unknown_tokens)}")
63
  elif 'unknown_tokens' in diagnostics:
64
- output_lines.append("✨ No potential OCR errors detected!")
65
 
66
  # Other fields
67
  for key, value in result.items():
@@ -81,7 +81,7 @@ with gr.Blocks(title="OCR QA Demo") as demo:
81
  gr.HTML(
82
  """
83
  <a href="https://impresso-project.ch" target="_blank">
84
- <img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg" alt="Impresso Project Logo" style="height: 100px;">
85
  </a>
86
  """
87
  )
@@ -102,37 +102,56 @@ with gr.Blocks(title="OCR QA Demo") as demo:
102
  with gr.Row():
103
  with gr.Column():
104
  text_input = gr.Textbox(
105
- label="Enter OCR Text",
106
  value=EXAMPLE_TEXT,
107
  lines=8,
108
- placeholder="Enter your OCR text here..."
109
  )
110
  lang_dropdown = gr.Dropdown(
111
- choices=["Auto-detect"] + LANGUAGES,
112
  value="de",
113
- label="Language"
114
  )
115
- submit_btn = gr.Button("πŸ” Analyze OCR Quality", variant="primary")
116
 
117
  with gr.Column():
118
  with gr.Row():
119
  output = gr.Textbox(
120
- label="Analysis Results",
121
  lines=15,
122
- placeholder="Results will appear here...",
123
  scale=10
124
  )
125
- info_btn = gr.Button("Pipeline Info", size="sm", scale=1)
126
 
127
  # Info modal/accordion for pipeline details
128
- with gr.Accordion("πŸ“ About the OCR QA Pipeline", open=False, visible=False) as info_accordion:
129
  gr.Markdown(
130
  """
131
- - **Quality Score**: Evaluates the overall quality of OCR text. From 0.0 (poor) to 1.0 (excellent)
132
- - **Known tokens**: Words recognized as valid in the selected language
133
- - **Potential OCR errors**: Identifies common OCR mistakes and artifacts
134
- """
135
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  submit_btn.click(
138
  fn=process_ocr_qa,
 
59
  # Unknown tokens (potential OCR errors)
60
  if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
61
  unknown_tokens = diagnostics['unknown_tokens']
62
+ output_lines.append(f"❌ Unrecognized tokens ({len(unknown_tokens)}): {', '.join(unknown_tokens)}")
63
  elif 'unknown_tokens' in diagnostics:
64
+ output_lines.append("✨ All tokens matched known lexicons – no OCR errors detected.")
65
 
66
  # Other fields
67
  for key, value in result.items():
 
81
  gr.HTML(
82
  """
83
  <a href="https://impresso-project.ch" target="_blank">
84
+ <img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg" alt="Impresso Project Logo" style="height: 84px;">
85
  </a>
86
  """
87
  )
 
102
  with gr.Row():
103
  with gr.Column():
104
  text_input = gr.Textbox(
105
+ label="OCR Text (from digitized sources)",
106
  value=EXAMPLE_TEXT,
107
  lines=8,
108
+ placeholder="Paste OCR-processed text from a historical document..."
109
  )
110
  lang_dropdown = gr.Dropdown(
111
+ choices=LANGUAGES,
112
  value="de",
113
+ label="Language of the Text"
114
  )
115
+ submit_btn = gr.Button("πŸ” Assess OCR Text Quality", variant="primary")
116
 
117
  with gr.Column():
118
  with gr.Row():
119
  output = gr.Textbox(
120
+ label="OCR Quality Report",
121
  lines=15,
122
+ placeholder="The quality assessment will appear here...",
123
  scale=10
124
  )
125
+ info_btn = gr.Button("Demo Info", size="md", scale=1)
126
 
127
  # Info modal/accordion for pipeline details
128
+ with gr.Accordion("πŸ“ About the OCR QA Method", open=False, visible=False) as info_accordion:
129
  gr.Markdown(
130
  """
131
+ ### πŸ“ About the OCR QA Method
132
+
133
+ This pipeline estimates OCR quality by analyzing the proportion of **unique words** in a text that match curated wordlists for a given language.
134
+
135
+ #### How it works:
136
+ - **Scoring**: The quality score ranges from **0.0** (poor) to **1.0** (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
137
+ - **Lexical resources**: Words are matched against precompiled lists derived from **Wikipedia** and **Wortschatz Leipzig**, using **Bloom filters** for fast, memory-efficient lookup.
138
+ - **Multilingual support**: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
139
+ - **Diagnostics output**:
140
+ - ✅ **Known tokens**: Words found in the reference wordlist, presumed correctly OCR’d.
141
+ - ❌ **Unrecognized tokens**: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
142
+
143
+ #### ⚠️ Limitations:
144
+ - The wordlists are **not exhaustive**, particularly for **historical vocabulary**, **dialects**, or **named entities**.
145
+ - The method may fail to flag **short OCR artifacts** (e.g., 1–2 character noise) and **non-alphabetic symbols**.
146
+
147
+ As such, the score should be understood as a **heuristic indicator**, best used for:
148
+ - Comparative assessments between OCR outputs
149
+ - Filtering low-quality text from large corpora
150
+ - Supporting decisions in corpus preparation and annotation workflows
151
+
152
+ It is **not a substitute for manual inspection** or ground-truth evaluation.
153
+ """
154
+ )
155
 
156
  submit_btn.click(
157
  fn=process_ocr_qa,