Thomas Dehaene committed on
Commit
d5101c4
·
1 Parent(s): e168969
Files changed (2) hide show
  1. README.md +5 -14
  2. app.py +61 -0
README.md CHANGED
@@ -1,37 +1,28 @@
1
  ---
2
- title: Byt5_ocr_corrector
3
- emoji: 🚀
4
- colorFrom: gray
5
- colorTo: blue
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
9
  ---
10
-
11
  # Configuration
12
-
13
  `title`: _string_
14
  Display title for the Space
15
-
16
  `emoji`: _string_
17
  Space emoji (emoji-only character allowed)
18
-
19
  `colorFrom`: _string_
20
  Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
21
-
22
  `colorTo`: _string_
23
  Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
24
-
25
  `sdk`: _string_
26
  Can be either `gradio` or `streamlit`
27
-
28
  `sdk_version` : _string_
29
  Only applicable for `streamlit` SDK.
30
  See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
31
-
32
  `app_file`: _string_
33
  Path to your main application file (which contains either `gradio` or `streamlit` Python code).
34
  Path is relative to the root of the repository.
35
-
36
  `pinned`: _boolean_
37
- Whether the Space stays on top of your list.
 
1
  ---
2
+ title: ByT5 dOCRtor
3
+ emoji: 💊
4
+ colorFrom: red
5
+ colorTo: yellow
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
9
  ---
 
10
  # Configuration
 
11
  `title`: _string_
12
  Display title for the Space
 
13
  `emoji`: _string_
14
  Space emoji (emoji-only character allowed)
 
15
  `colorFrom`: _string_
16
  Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
 
17
  `colorTo`: _string_
18
  Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
 
19
  `sdk`: _string_
20
  Can be either `gradio` or `streamlit`
 
21
  `sdk_version` : _string_
22
  Only applicable for `streamlit` SDK.
23
  See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
 
24
  `app_file`: _string_
25
  Path to your main application file (which contains either `gradio` or `streamlit` Python code).
26
  Path is relative to the root of the repository.
 
27
  `pinned`: _boolean_
28
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textwrap import wrap
2
+
3
+ import streamlit as st
4
+ from transformers import pipeline
5
+ import nlpaug.augmenter.char as nac
6
+
7
+ st.markdown('# ByT5 Dutch OCR Corrector :pill:')
8
+ st.write('This app corrects common dutch OCR mistakes, to showcase how this could be used in an OCR post-processing pipeline.')
9
+
10
+ st.markdown("""
11
+ To use this:
12
+ - Enter a text with OCR mistakes and hit 'unscramble':point_down:
13
+ - Or enter a normal text, scramble it :twisted_rightwards_arrows: and then hit 'unscramble' :point_down:""")
14
+
15
+ @st.cache(allow_output_mutation=True,
16
+ suppress_st_warning=True,
17
+ show_spinner=False)
18
+ def load_model():
19
+ with st.spinner('Please wait for the model to load...'):
20
+ ocr_pipeline=pipeline(
21
+ 'text2text-generation',
22
+ model='ml6team/byt5-base-dutch-ocr-correction',
23
+ tokenizer='ml6team/byt5-base-dutch-ocr-correction'
24
+ )
25
+ return ocr_pipeline
26
+
27
+ ocr_pipeline = load_model()
28
+
29
+ if 'text' not in st.session_state:
30
+ st.session_state.text = ""
31
+
32
+ left_area, right_area = st.columns(2)
33
+
34
+ # Format the left area
35
+ left_area.header("Input")
36
+ form = left_area.form(key='ocrcorrector')
37
+ placeholder = form.empty()
38
+ placeholder.empty()
39
+ input_text = placeholder.text_area(value=st.session_state.text, label='Insert text:', key='input_text')
40
+ scramble_button = form.form_submit_button(label='Scramble')
41
+ submit_button = form.form_submit_button(label='Unscramble')
42
+
43
+ # Right area
44
+ right_area.header("Output")
45
+
46
+ if scramble_button:
47
+ aug = nac.OcrAug()
48
+ st.session_state.text = st.session_state.input_text
49
+ base_text = st.session_state.text
50
+ augmented_data = aug.augment(base_text)
51
+ st.session_state.text = augmented_data
52
+ del st.session_state.input_text
53
+ placeholder.empty()
54
+ input_text = placeholder.text_area(value=st.session_state.text, label='Insert text:', key='input_text')
55
+
56
+ if submit_button:
57
+ base_text = st.session_state.input_text
58
+ output_text = " ".join([x['generated_text'] for x in ocr_pipeline(wrap(base_text, 128))])
59
+ right_area.markdown('#####')
60
+ right_area.text_area(value=output_text, label="Corrected text:")
61
+