File size: 5,369 Bytes
d3a07ee
 
 
 
c2302bf
 
d1f2e36
 
 
c2302bf
d3a07ee
 
 
 
 
c2302bf
 
d3a07ee
c2302bf
d3a07ee
 
 
 
 
 
 
 
 
 
 
 
 
d1f2e36
 
 
 
d3a07ee
 
 
 
f73827b
d3a07ee
c2302bf
f73827b
c2302bf
d1f2e36
d3a07ee
 
f73827b
d3a07ee
 
 
 
c2302bf
d1f2e36
 
 
d3a07ee
 
f73827b
d3a07ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1f2e36
d3a07ee
f73827b
 
 
d3a07ee
 
 
c2302bf
d3a07ee
 
d1f2e36
 
 
 
d3a07ee
 
 
 
c2302bf
 
 
 
 
 
 
d1f2e36
879633b
fbc7281
577e1db
fbc7281
 
 
 
 
 
 
 
 
d1f2e36
 
c2302bf
 
 
 
d1f2e36
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import base64
from html import escape
from io import StringIO
from math import ceil

import streamlit as st

from utils import get_resources, simplify

# Configure the browser tab title and icon for this app.
st.set_page_config(page_title="Text Simplification in Dutch", page_icon="🏃")

# Only used for the progress-bar message in this script; presumably mirrors the
# batch size that `utils.simplify` uses internally -- TODO confirm.
BATCH_SIZE = 8

# Make sure the key exists so later reads of the session state never fail.
if "text_to_simplify" not in st.session_state:
    st.session_state["text_to_simplify"] = None

st.title("🏃 Text Simplification in Dutch")

# The checkbox toggles between uploading a file and typing into a text box.
fupload_check = st.checkbox("File upload?")

st.markdown(
    "Make sure that the file or text in the text box contains **one sentence per line**. Empty lines will"
    " be removed."
)
if fupload_check:
    uploaded_file = st.file_uploader("Text file", label_visibility="collapsed")
    if uploaded_file is None:
        st.session_state["text_to_simplify"] = None
    else:
        try:
            # "utf-8-sig" decodes regular UTF-8 and additionally drops a leading
            # byte-order mark, which `str.strip()` would not remove and which
            # would otherwise end up glued to the first sentence.
            uploaded_text = uploaded_file.getvalue().decode("utf-8-sig")
        except UnicodeDecodeError:
            # Report the problem to the user instead of crashing the app.
            st.error(
                "The uploaded file could not be decoded. Please upload a UTF-8 encoded text file.",
                icon="⚠️",
            )
            st.session_state["text_to_simplify"] = None
        else:
            st.session_state["text_to_simplify"] = uploaded_text.strip()
else:
    # Free-text input, pre-filled with a deliberately convoluted example sentence.
    st.session_state["text_to_simplify"] = st.text_area(
        label="Sentences to translate",
        label_visibility="collapsed",
        height=200,
        value="Met het naderen van de zonovergoten middaghemel op deze betoverende dag, waarbij de atmosferische omstandigheden een onbelemmerde convergentie van cumulusbewolking en uitgestrekte stratosferische azuurblauwe wijdheid faciliteren, lijken de geaggregeerde weersverschijnselen van vandaag, die variëren van sporadische plensbuien tot kalme zuchtjes wind en zeldzame opvlammingen van bliksem, de delicate balans tussen meteorologische complexiteit en eenvoud te weerspiegelen, waardoor de gepassioneerde observator met een gevoel van ontzag en verwondering wordt vervuld.",
    ).strip()


def _get_increment_size(num_sents) -> int:
    if num_sents == 1:
        return 100
    else:
        return ceil(100 / num_sents)


# Layout placeholders: the button and the download link share one row, followed
# by slots for the progress bar, an error message and the rendered results.
btn_col, results_col = st.columns(2)
btn_ct = btn_col.empty()
pbar_ct = st.empty()
error_ct = st.empty()
simpl_ct = st.container()
if st.session_state["text_to_simplify"]:
    if btn_ct.button("Simplify text"):
        error_ct.empty()
        # One sentence per line; whitespace-only lines are dropped.
        lines = [
            strip_line for line in st.session_state["text_to_simplify"].splitlines() if (strip_line := line.strip())
        ]
        num_sentences = len(lines)

        progress_text = f"Simplifying sentences in batches of {BATCH_SIZE}..."
        pbar = pbar_ct.progress(0, text=progress_text)
        increment = _get_increment_size(num_sentences)
        percent_done = 0

        model, tokenizer = get_resources()

        simpl_ct.caption("Simplified text")
        output_ct = simpl_ct.empty()
        all_simplifications = []
        items_html = "<ol>"
        # `simplify` is iterated as (input batch, simplified batch) pairs.
        for input_batch, simplifications in simplify(lines, model, tokenizer):
            for input_text, simplification in zip(input_batch, simplifications):
                output_ct.empty()
                # Both values are rendered with `unsafe_allow_html=True`, so they are
                # escaped to keep user- or model-provided markup from being
                # interpreted as HTML. Assumes both are `str` -- TODO confirm
                # against `utils.simplify`.
                items_html += f"""<li>
                    <ul>
                        <li><strong>Input text:</strong> {escape(input_text)}</li>
                        <li><strong>Simplification:</strong> {escape(simplification)}</li>
                    </ul>
                </li>"""
                # Re-render the whole list so results appear one by one.
                output_ct.markdown(items_html + "</ol>", unsafe_allow_html=True)

                percent_done += increment
                # The rounded-up increment can overshoot, hence the clamp to 100.
                # The text is passed again so the message stays visible on updates.
                pbar.progress(min(percent_done, 100), text=progress_text)

            all_simplifications.extend(simplifications)

        pbar.empty()

        # Offer the raw (unescaped) simplifications as a text download, one per line,
        # embedded in the link as a base64 data URI.
        download_text = "\n".join(all_simplifications) + "\n"
        b64 = base64.b64encode(download_text.encode("utf-8")).decode("utf-8")
        results_col.markdown(
            f'<a download="dutch-simplifications.txt" href="data:file/txt;base64,{b64}" title="Download">Download simplifications</a>',
            unsafe_allow_html=True,
        )
else:
    # No usable input: hide the button and tell the user why.
    btn_ct.empty()
    error_ct.error("Text cannot be empty!", icon="⚠️")
    simpl_ct.container()


########################
# Information, socials #
########################
# Static sections at the bottom of the page: the markdown bodies are bound to
# names first, then rendered under their respective headers.
background_md = """This demo highlights work that has been done in light of a master thesis by **Charlotte Van de Velde** as part of the Master of Science in Artificial Intelligence at KU Leuven in 2023. Charlotte is supervised by Vincent Vandeghinste and Bram Vanroy.

Charlotte created a [dataset](https://huggingface.co/datasets/BramVanroy/chatgpt-dutch-simplification) that contains Dutch sentences and their simplified equivalents with ChatGPT. Bram then trained a number of models on this new dataset and built this demo.

The following models are available, all finetuned from the awesome Dutch T5 models by [Yeb Havinga](https://huggingface.co/yhavinga):

- [`BramVanroy/ul2-small-dutch-simplification-mai-2023`](https://huggingface.co/BramVanroy/ul2-small-dutch-simplification-mai-2023)
- [`BramVanroy/ul2-base-dutch-simplification-mai-2023`](https://huggingface.co/BramVanroy/ul2-base-dutch-simplification-mai-2023) (used in this demo)
- [`BramVanroy/ul2-large-dutch-simplification-mai-2023`](https://huggingface.co/BramVanroy/ul2-large-dutch-simplification-mai-2023)

The training code can be found on [Github](https://github.com/BramVanroy/mai-simplification-nl-2023#22-hyperparameter-sweep).

"""

contact_md = "".join(
    (
        "Would you like  additional functionality in the demo, do you have questions, or just want to get in touch?",
        " Give me a shout on [Twitter](https://twitter.com/BramVanroy)",
        " or add me on [LinkedIn](https://www.linkedin.com/in/bramvanroy/)!",
    )
)

st.header("Project background")
st.markdown(background_md)


st.header("Contact ✒️")
st.markdown(contact_md)