Bram Vanroy committed on
Commit
d1f2e36
1 Parent(s): 6fc246d

make style

Browse files
Files changed (2) hide show
  1. app.py +25 -16
  2. utils.py +16 -14
app.py CHANGED
@@ -2,14 +2,11 @@ import base64
2
  from io import StringIO
3
  from math import ceil
4
 
5
- from utils import get_resources, simplify
6
-
7
  import streamlit as st
8
 
9
- st.set_page_config(
10
- page_title="Text Simplification in Dutch",
11
- page_icon="🏃"
12
- )
13
 
14
  BATCH_SIZE = 8
15
 
@@ -33,8 +30,10 @@ if fupload_check:
33
  st.session_state["text_to_simplify"] = None
34
  else:
35
  st.session_state["text_to_simplify"] = st.text_area(
36
- label="Sentences to translate", label_visibility="collapsed", height=200,
37
- value="Met het naderen van de zonovergoten middaghemel op deze betoverende dag, waarbij de atmosferische omstandigheden een onbelemmerde convergentie van cumulusbewolking en uitgestrekte stratosferische azuurblauwe wijdheid faciliteren, lijken de geaggregeerde weersverschijnselen van vandaag, die variëren van sporadische plensbuien tot kalme zuchtjes wind en zeldzame opvlammingen van bliksem, de delicate balans tussen meteorologische complexiteit en eenvoud te weerspiegelen, waardoor de gepassioneerde observator met een gevoel van ontzag en verwondering wordt vervuld."
 
 
38
  ).strip()
39
 
40
 
@@ -44,6 +43,7 @@ def _get_increment_size(num_sents) -> int:
44
  else:
45
  return ceil(100 / (num_sents / BATCH_SIZE))
46
 
 
47
  btn_col, results_col = st.columns(2)
48
  btn_ct = btn_col.empty()
49
  error_ct = st.empty()
@@ -51,7 +51,9 @@ simpl_ct = st.container()
51
  if st.session_state["text_to_simplify"]:
52
  if btn_ct.button("Simplify text"):
53
  error_ct.empty()
54
- lines = [strip_line for line in st.session_state["text_to_simplify"].splitlines() if (strip_line := line.strip())]
 
 
55
  num_sentences = len(lines)
56
 
57
  pbar = st.progress(0, text=f"Simplifying sentences in batches of {BATCH_SIZE}...")
@@ -73,7 +75,7 @@ if st.session_state["text_to_simplify"]:
73
  <li><strong>Simplification:</strong> {simplification}</li>
74
  </ul>
75
  </li>"""
76
- output_ct.markdown(html+"</ol>", unsafe_allow_html=True)
77
 
78
  all_simplifications.extend(simplifications)
79
 
@@ -83,7 +85,10 @@ if st.session_state["text_to_simplify"]:
83
 
84
  all_simplifications = "\n".join(all_simplifications) + "\n"
85
  b64 = base64.b64encode(all_simplifications.encode("utf-8")).decode("utf-8")
86
- results_col.markdown(f'<a download="dutch-simplifications.txt" href="data:file/txt;base64,{b64}" title="Download">Download simplifications</a>', unsafe_allow_html=True)
 
 
 
87
  else:
88
  btn_ct.empty()
89
  error_ct.error("Text cannot be empty!", icon="⚠️")
@@ -95,7 +100,8 @@ else:
95
  ########################
96
  st.header("Project background")
97
 
98
- st.markdown("""This demo highlights work that has been done in light of a master thesis by Charlotte Van de Velde as part of the Master of Science in Artificial Intelligence at KU Leuven in 2023. Charlotte is supervised by Vincent Vandeghinste and Bram Vanroy.
 
99
 
100
  Charlotte created a [dataset](https://huggingface.co/datasets/BramVanroy/chatgpt-dutch-simplification) that contains Dutch sentences and their simplified equivalents with ChatGPT. Bram then trained a number of models on this new dataset.
101
 
@@ -107,11 +113,14 @@ The following models are available, all finetuned from the awesome Dutch T5 mode
107
 
108
  The training code can be found on [Github](https://github.com/BramVanroy/mai-simplification-nl-2023#22-hyperparameter-sweep).
109
 
110
- """)
 
111
 
112
 
113
  st.header("Contact ✒️")
114
 
115
- st.markdown("Would you like additional functionality in the demo, do you have questions, or just want to get in touch?"
116
- " Give me a shout on [Twitter](https://twitter.com/BramVanroy)"
117
- " or add me on [LinkedIn](https://www.linkedin.com/in/bramvanroy/)!")
 
 
 
2
  from io import StringIO
3
  from math import ceil
4
 
 
 
5
  import streamlit as st
6
 
7
+ from utils import get_resources, simplify
8
+
9
+ st.set_page_config(page_title="Text Simplification in Dutch", page_icon="🏃")
 
10
 
11
  BATCH_SIZE = 8
12
 
 
30
  st.session_state["text_to_simplify"] = None
31
  else:
32
  st.session_state["text_to_simplify"] = st.text_area(
33
+ label="Sentences to translate",
34
+ label_visibility="collapsed",
35
+ height=200,
36
+ value="Met het naderen van de zonovergoten middaghemel op deze betoverende dag, waarbij de atmosferische omstandigheden een onbelemmerde convergentie van cumulusbewolking en uitgestrekte stratosferische azuurblauwe wijdheid faciliteren, lijken de geaggregeerde weersverschijnselen van vandaag, die variëren van sporadische plensbuien tot kalme zuchtjes wind en zeldzame opvlammingen van bliksem, de delicate balans tussen meteorologische complexiteit en eenvoud te weerspiegelen, waardoor de gepassioneerde observator met een gevoel van ontzag en verwondering wordt vervuld.",
37
  ).strip()
38
 
39
 
 
43
  else:
44
  return ceil(100 / (num_sents / BATCH_SIZE))
45
 
46
+
47
  btn_col, results_col = st.columns(2)
48
  btn_ct = btn_col.empty()
49
  error_ct = st.empty()
 
51
  if st.session_state["text_to_simplify"]:
52
  if btn_ct.button("Simplify text"):
53
  error_ct.empty()
54
+ lines = [
55
+ strip_line for line in st.session_state["text_to_simplify"].splitlines() if (strip_line := line.strip())
56
+ ]
57
  num_sentences = len(lines)
58
 
59
  pbar = st.progress(0, text=f"Simplifying sentences in batches of {BATCH_SIZE}...")
 
75
  <li><strong>Simplification:</strong> {simplification}</li>
76
  </ul>
77
  </li>"""
78
+ output_ct.markdown(html + "</ol>", unsafe_allow_html=True)
79
 
80
  all_simplifications.extend(simplifications)
81
 
 
85
 
86
  all_simplifications = "\n".join(all_simplifications) + "\n"
87
  b64 = base64.b64encode(all_simplifications.encode("utf-8")).decode("utf-8")
88
+ results_col.markdown(
89
+ f'<a download="dutch-simplifications.txt" href="data:file/txt;base64,{b64}" title="Download">Download simplifications</a>',
90
+ unsafe_allow_html=True,
91
+ )
92
  else:
93
  btn_ct.empty()
94
  error_ct.error("Text cannot be empty!", icon="⚠️")
 
100
  ########################
101
  st.header("Project background")
102
 
103
+ st.markdown(
104
+ """This demo highlights work that has been done in light of a master thesis by Charlotte Van de Velde as part of the Master of Science in Artificial Intelligence at KU Leuven in 2023. Charlotte is supervised by Vincent Vandeghinste and Bram Vanroy.
105
 
106
  Charlotte created a [dataset](https://huggingface.co/datasets/BramVanroy/chatgpt-dutch-simplification) that contains Dutch sentences and their simplified equivalents with ChatGPT. Bram then trained a number of models on this new dataset.
107
 
 
113
 
114
  The training code can be found on [Github](https://github.com/BramVanroy/mai-simplification-nl-2023#22-hyperparameter-sweep).
115
 
116
+ """
117
+ )
118
 
119
 
120
  st.header("Contact ✒️")
121
 
122
+ st.markdown(
123
+ "Would you like additional functionality in the demo, do you have questions, or just want to get in touch?"
124
+ " Give me a shout on [Twitter](https://twitter.com/BramVanroy)"
125
+ " or add me on [LinkedIn](https://www.linkedin.com/in/bramvanroy/)!"
126
+ )
utils.py CHANGED
@@ -1,18 +1,16 @@
1
- from threading import Thread
2
- from typing import Tuple, Generator, List
3
 
4
- from optimum.bettertransformer import BetterTransformer
5
  import streamlit as st
6
  import torch
7
- from torch.quantization import quantize_dynamic
8
  from torch import nn, qint8
9
- from transformers import T5ForConditionalGeneration, T5Tokenizer, TextStreamer, TextIteratorStreamer
 
10
 
11
 
12
  @st.cache_resource(show_spinner=False)
13
  def get_resources(quantize: bool = True, no_cuda: bool = False) -> Tuple[T5ForConditionalGeneration, T5Tokenizer]:
14
- """
15
- """
16
  tokenizer = T5Tokenizer.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023", use_fast=False)
17
  model = T5ForConditionalGeneration.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023")
18
 
@@ -30,20 +28,24 @@ def get_resources(quantize: bool = True, no_cuda: bool = False) -> Tuple[T5ForCo
30
 
31
 
32
  def batchify(iterable, batch_size=16):
 
 
 
 
33
  num_items = len(iterable)
34
  for idx in range(0, num_items, batch_size):
35
- yield iterable[idx:min(idx + batch_size, num_items)]
36
 
37
 
38
  def simplify(
39
- texts: List[str],
40
- model: T5ForConditionalGeneration,
41
- tokenizer: T5Tokenizer,
42
- batch_size: int = 16
43
  ) -> List[str]:
 
 
 
 
 
44
  """
45
- """
46
-
47
  for batch_texts in batchify(texts, batch_size=batch_size):
48
  nlg_batch_texts = ["[NLG] " + text for text in batch_texts]
49
  encoded = tokenizer(nlg_batch_texts, return_tensors="pt", padding=True, truncation=True)
 
1
+ from typing import List, Tuple
 
2
 
 
3
  import streamlit as st
4
  import torch
5
+ from optimum.bettertransformer import BetterTransformer
6
  from torch import nn, qint8
7
+ from torch.quantization import quantize_dynamic
8
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
9
 
10
 
11
  @st.cache_resource(show_spinner=False)
12
  def get_resources(quantize: bool = True, no_cuda: bool = False) -> Tuple[T5ForConditionalGeneration, T5Tokenizer]:
13
+ """Load a T5 model and its (slow) tokenizer"""
 
14
  tokenizer = T5Tokenizer.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023", use_fast=False)
15
  model = T5ForConditionalGeneration.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023")
16
 
 
28
 
29
 
30
  def batchify(iterable, batch_size=16):
31
+ """Turn an iterable in a batch generator
32
+ :param iterable: iterable to batchify
33
+ :param batch_size: batch size
34
+ """
35
  num_items = len(iterable)
36
  for idx in range(0, num_items, batch_size):
37
+ yield iterable[idx : min(idx + batch_size, num_items)]
38
 
39
 
40
  def simplify(
41
+ texts: List[str], model: T5ForConditionalGeneration, tokenizer: T5Tokenizer, batch_size: int = 16
 
 
 
42
  ) -> List[str]:
43
+ """Simplify a given set of texts with a given model and tokenizer. Yields results in batches of 'batch_size'
44
+ :param texts: texts to simplify
45
+ :param model: model to use for simplification
46
+ :param tokenizer: tokenizer to use for simplification
47
+ :param batch_size: batch size to yield results in
48
  """
 
 
49
  for batch_texts in batchify(texts, batch_size=batch_size):
50
  nlg_batch_texts = ["[NLG] " + text for text in batch_texts]
51
  encoded = tokenizer(nlg_batch_texts, return_tensors="pt", padding=True, truncation=True)