Spaces:
Sleeping
Sleeping
Thomas Dehaene
committed on
Commit
·
d5101c4
1
Parent(s):
e168969
Add app
Browse files
README.md
CHANGED
@@ -1,37 +1,28 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
---
|
10 |
-
|
11 |
# Configuration
|
12 |
-
|
13 |
`title`: _string_
|
14 |
Display title for the Space
|
15 |
-
|
16 |
`emoji`: _string_
|
17 |
Space emoji (emoji-only character allowed)
|
18 |
-
|
19 |
`colorFrom`: _string_
|
20 |
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
21 |
-
|
22 |
`colorTo`: _string_
|
23 |
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
24 |
-
|
25 |
`sdk`: _string_
|
26 |
Can be either `gradio` or `streamlit`
|
27 |
-
|
28 |
`sdk_version` : _string_
|
29 |
Only applicable for `streamlit` SDK.
|
30 |
See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
|
31 |
-
|
32 |
`app_file`: _string_
|
33 |
Path to your main application file (which contains either `gradio` or `streamlit` Python code).
|
34 |
Path is relative to the root of the repository.
|
35 |
-
|
36 |
`pinned`: _boolean_
|
37 |
-
Whether the Space stays on top of your list.
|
|
|
1 |
---
|
2 |
+
title: ByT5 dOCRtor
|
3 |
+
emoji: 💊
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: yellow
|
6 |
sdk: streamlit
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
---
|
|
|
10 |
# Configuration
|
|
|
11 |
`title`: _string_
|
12 |
Display title for the Space
|
|
|
13 |
`emoji`: _string_
|
14 |
Space emoji (only emoji characters are allowed)
|
|
|
15 |
`colorFrom`: _string_
|
16 |
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
|
|
17 |
`colorTo`: _string_
|
18 |
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
|
|
19 |
`sdk`: _string_
|
20 |
Can be either `gradio` or `streamlit`
|
|
|
21 |
`sdk_version`: _string_
|
22 |
Only applicable for `streamlit` SDK.
|
23 |
See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
|
|
|
24 |
`app_file`: _string_
|
25 |
Path to your main application file (which contains either `gradio` or `streamlit` Python code).
|
26 |
Path is relative to the root of the repository.
|
|
|
27 |
`pinned`: _boolean_
|
28 |
+
Whether the Space stays on top of your list.
|
app.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from textwrap import wrap
|
2 |
+
|
3 |
+
import streamlit as st
|
4 |
+
from transformers import pipeline
|
5 |
+
import nlpaug.augmenter.char as nac
|
6 |
+
|
7 |
+
# Page header: title, one-line description, and short usage instructions.
st.markdown('# ByT5 Dutch OCR Corrector :pill:')
st.write(
    'This app corrects common dutch OCR mistakes, to showcase how this '
    'could be used in an OCR post-processing pipeline.'
)

# The usage text is markdown; the leading newline is part of the original
# string and is preserved.
st.markdown(
    "\n"
    "To use this:\n"
    "- Enter a text with OCR mistakes and hit 'unscramble':point_down:\n"
    "- Or enter a normal text, scramble it :twisted_rightwards_arrows: "
    "and then hit 'unscramble' :point_down:"
)
|
14 |
+
|
15 |
+
@st.cache(
    allow_output_mutation=True,   # do not hash the returned pipeline object
    suppress_st_warning=True,     # the body calls st.spinner inside a cached function
    show_spinner=False,           # a custom spinner message is shown instead
)
def load_model():
    """Build the Dutch OCR-correction pipeline.

    The function is wrapped in ``st.cache``, so Streamlit memoizes the
    returned object instead of rebuilding it on every script rerun.

    Returns:
        The ``transformers`` ``text2text-generation`` pipeline created for
        the ``ml6team/byt5-base-dutch-ocr-correction`` checkpoint.
    """
    # Model and tokenizer are loaded from the same checkpoint.
    checkpoint = 'ml6team/byt5-base-dutch-ocr-correction'
    with st.spinner('Please wait for the model to load...'):
        return pipeline(
            'text2text-generation',
            model=checkpoint,
            tokenizer=checkpoint,
        )
|
26 |
+
|
27 |
+
# Obtain the correction pipeline (served from Streamlit's cache after the
# first call).
ocr_pipeline = load_model()

# ``text`` is the value used to pre-fill the input box; start out empty.
if "text" not in st.session_state:
    st.session_state["text"] = ""

# Two side-by-side columns: input on the left, output on the right.
left_area, right_area = st.columns(2)

# Left column: a form holding the text box and both action buttons.
left_area.header("Input")
form = left_area.form(key='ocrcorrector')
# The text box lives in a placeholder so the scramble handler can replace it.
placeholder = form.empty()
placeholder.empty()
input_text = placeholder.text_area(
    label='Insert text:',
    value=st.session_state["text"],
    key='input_text',
)
scramble_button = form.form_submit_button(label='Scramble')
submit_button = form.form_submit_button(label='Unscramble')

# Right column: only the header is drawn here; the corrected text is added
# when the 'Unscramble' button is pressed.
right_area.header("Output")
|
45 |
+
|
46 |
+
# 'Scramble' pressed: inject simulated OCR errors into the current input and
# show the result in the input box.
if scramble_button:
    aug = nac.OcrAug()
    base_text = st.session_state.input_text
    augmented_data = aug.augment(base_text)
    # Some nlpaug releases return a list of strings from augment() (an empty
    # list for empty input) rather than a plain string. Normalise to a string
    # so the session state and the text area always receive text.
    if isinstance(augmented_data, (list, tuple)):
        augmented_data = " ".join(augmented_data)
    st.session_state.text = augmented_data
    # Remove the widget's stored value before re-creating the text area,
    # presumably so the new widget picks up ``value`` instead of the stale
    # widget state -- TODO confirm against the Streamlit version in use.
    del st.session_state.input_text
    placeholder.empty()
    input_text = placeholder.text_area(value=st.session_state.text, label='Insert text:', key='input_text')
|
55 |
+
|
56 |
+
# 'Unscramble' pressed: run the correction model over the input text and show
# the corrected text in the right column.
if submit_button:
    base_text = st.session_state.input_text
    # Split the text into chunks of at most 128 characters (presumably to keep
    # each chunk within the model's input length -- TODO confirm). Note that
    # textwrap.wrap() collapses whitespace, so line breaks in the input are
    # not preserved.
    chunks = wrap(base_text, 128)
    if chunks:
        output_text = " ".join(x['generated_text'] for x in ocr_pipeline(chunks))
    else:
        # Empty or whitespace-only input yields no chunks; skip the model call
        # instead of handing the pipeline an empty batch.
        output_text = ""
    # Empty heading used as a vertical spacer above the output box.
    right_area.markdown('#####')
    right_area.text_area(value=output_text, label="Corrected text:")
|
61 |
+
|