Spaces:

BramVanroy
/

mai-simplification-nl-2023-demo

Sleeping

App Files Files Community

Bram Vanroy committed on May 22, 2023

Commit

d1f2e36

•

1 Parent(s): 6fc246d

make style

Browse files

Files changed (2) hide show

app.py +25 -16
utils.py +16 -14

app.py CHANGED Viewed

@@ -2,14 +2,11 @@ import base64
 from io import StringIO
 from math import ceil
-from utils import get_resources, simplify
 import streamlit as st
-st.set_page_config(
-    page_title="Text Simplification in Dutch",
-    page_icon="🏃"
-)
 BATCH_SIZE = 8
@@ -33,8 +30,10 @@ if fupload_check:
         st.session_state["text_to_simplify"] = None
 else:
     st.session_state["text_to_simplify"] = st.text_area(
-        label="Sentences to translate", label_visibility="collapsed", height=200,
-        value="Met het naderen van de zonovergoten middaghemel op deze betoverende dag, waarbij de atmosferische omstandigheden een onbelemmerde convergentie van cumulusbewolking en uitgestrekte stratosferische azuurblauwe wijdheid faciliteren, lijken de geaggregeerde weersverschijnselen van vandaag, die variëren van sporadische plensbuien tot kalme zuchtjes wind en zeldzame opvlammingen van bliksem, de delicate balans tussen meteorologische complexiteit en eenvoud te weerspiegelen, waardoor de gepassioneerde observator met een gevoel van ontzag en verwondering wordt vervuld."
     ).strip()
@@ -44,6 +43,7 @@ def _get_increment_size(num_sents) -> int:
     else:
         return ceil(100 / (num_sents / BATCH_SIZE))
 btn_col, results_col = st.columns(2)
 btn_ct = btn_col.empty()
 error_ct = st.empty()
@@ -51,7 +51,9 @@ simpl_ct = st.container()
 if st.session_state["text_to_simplify"]:
     if btn_ct.button("Simplify text"):
         error_ct.empty()
-        lines = [strip_line for line in st.session_state["text_to_simplify"].splitlines() if (strip_line := line.strip())]
         num_sentences = len(lines)
         pbar = st.progress(0, text=f"Simplifying sentences in batches of {BATCH_SIZE}...")
@@ -73,7 +75,7 @@ if st.session_state["text_to_simplify"]:
                         <li><strong>Simplification:</strong> {simplification}</li>
                     </ul>
                 </li>"""
-                output_ct.markdown(html+"</ol>", unsafe_allow_html=True)
             all_simplifications.extend(simplifications)
@@ -83,7 +85,10 @@ if st.session_state["text_to_simplify"]:
         all_simplifications = "\n".join(all_simplifications) + "\n"
         b64 = base64.b64encode(all_simplifications.encode("utf-8")).decode("utf-8")
-        results_col.markdown(f'<a download="dutch-simplifications.txt" href="data:file/txt;base64,{b64}" title="Download">Download simplifications</a>', unsafe_allow_html=True)
 else:
     btn_ct.empty()
     error_ct.error("Text cannot be empty!", icon="⚠️")
@@ -95,7 +100,8 @@ else:
 ########################
 st.header("Project background")
-st.markdown("""This demo highlights work that has been done in light of a master thesis by Charlotte Van de Velde as part of the Master of Science in Artificial Intelligence at KU Leuven in 2023. Charlotte is supervised by Vincent Vandeghinste and Bram Vanroy.
 Charlotte created a [dataset](https://huggingface.co/datasets/BramVanroy/chatgpt-dutch-simplification) that contains Dutch sentences and their simplified equivalents with ChatGPT. Bram then trained a number of models on this new dataset.
@@ -107,11 +113,14 @@ The following models are available, all finetuned from the awesome Dutch T5 mode
 The training code can be found on [Github](https://github.com/BramVanroy/mai-simplification-nl-2023#22-hyperparameter-sweep).
-""")
 st.header("Contact ✒️")
-st.markdown("Would you like  additional functionality in the demo, do you have questions, or just want to get in touch?"
-            " Give me a shout on [Twitter](https://twitter.com/BramVanroy)"
-            " or add me on [LinkedIn](https://www.linkedin.com/in/bramvanroy/)!")

 from io import StringIO
 from math import ceil
 import streamlit as st
+from utils import get_resources, simplify
+st.set_page_config(page_title="Text Simplification in Dutch", page_icon="🏃")
 BATCH_SIZE = 8
         st.session_state["text_to_simplify"] = None
 else:
     st.session_state["text_to_simplify"] = st.text_area(
+        label="Sentences to translate",
+        label_visibility="collapsed",
+        height=200,
+        value="Met het naderen van de zonovergoten middaghemel op deze betoverende dag, waarbij de atmosferische omstandigheden een onbelemmerde convergentie van cumulusbewolking en uitgestrekte stratosferische azuurblauwe wijdheid faciliteren, lijken de geaggregeerde weersverschijnselen van vandaag, die variëren van sporadische plensbuien tot kalme zuchtjes wind en zeldzame opvlammingen van bliksem, de delicate balans tussen meteorologische complexiteit en eenvoud te weerspiegelen, waardoor de gepassioneerde observator met een gevoel van ontzag en verwondering wordt vervuld.",
     ).strip()
     else:
         return ceil(100 / (num_sents / BATCH_SIZE))
 btn_col, results_col = st.columns(2)
 btn_ct = btn_col.empty()
 error_ct = st.empty()
 if st.session_state["text_to_simplify"]:
     if btn_ct.button("Simplify text"):
         error_ct.empty()
+        lines = [
+            strip_line for line in st.session_state["text_to_simplify"].splitlines() if (strip_line := line.strip())
+        ]
         num_sentences = len(lines)
         pbar = st.progress(0, text=f"Simplifying sentences in batches of {BATCH_SIZE}...")
                         <li><strong>Simplification:</strong> {simplification}</li>
                     </ul>
                 </li>"""
+                output_ct.markdown(html + "</ol>", unsafe_allow_html=True)
             all_simplifications.extend(simplifications)
         all_simplifications = "\n".join(all_simplifications) + "\n"
         b64 = base64.b64encode(all_simplifications.encode("utf-8")).decode("utf-8")
+        results_col.markdown(
+            f'<a download="dutch-simplifications.txt" href="data:file/txt;base64,{b64}" title="Download">Download simplifications</a>',
+            unsafe_allow_html=True,
+        )
 else:
     btn_ct.empty()
     error_ct.error("Text cannot be empty!", icon="⚠️")
 ########################
 st.header("Project background")
+st.markdown(
+    """This demo highlights work that has been done in light of a master thesis by Charlotte Van de Velde as part of the Master of Science in Artificial Intelligence at KU Leuven in 2023. Charlotte is supervised by Vincent Vandeghinste and Bram Vanroy.
 Charlotte created a [dataset](https://huggingface.co/datasets/BramVanroy/chatgpt-dutch-simplification) that contains Dutch sentences and their simplified equivalents with ChatGPT. Bram then trained a number of models on this new dataset.
 The training code can be found on [Github](https://github.com/BramVanroy/mai-simplification-nl-2023#22-hyperparameter-sweep).
+"""
+)
 st.header("Contact ✒️")
+st.markdown(
+    "Would you like  additional functionality in the demo, do you have questions, or just want to get in touch?"
+    " Give me a shout on [Twitter](https://twitter.com/BramVanroy)"
+    " or add me on [LinkedIn](https://www.linkedin.com/in/bramvanroy/)!"
+)

utils.py CHANGED Viewed

@@ -1,18 +1,16 @@
-from threading import Thread
-from typing import Tuple, Generator, List
-from optimum.bettertransformer import BetterTransformer
 import streamlit as st
 import torch
-from torch.quantization import quantize_dynamic
 from torch import nn, qint8
-from transformers import T5ForConditionalGeneration, T5Tokenizer, TextStreamer, TextIteratorStreamer
 @st.cache_resource(show_spinner=False)
 def get_resources(quantize: bool = True, no_cuda: bool = False) -> Tuple[T5ForConditionalGeneration, T5Tokenizer]:
-    """
-    """
     tokenizer = T5Tokenizer.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023", use_fast=False)
     model = T5ForConditionalGeneration.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023")
@@ -30,20 +28,24 @@ def get_resources(quantize: bool = True, no_cuda: bool = False) -> Tuple[T5ForCo
 def batchify(iterable, batch_size=16):
     num_items = len(iterable)
     for idx in range(0, num_items, batch_size):
-        yield iterable[idx:min(idx + batch_size, num_items)]
 def simplify(
-        texts: List[str],
-        model: T5ForConditionalGeneration,
-        tokenizer: T5Tokenizer,
-        batch_size: int = 16
 ) -> List[str]:
     """
-    """
     for batch_texts in batchify(texts, batch_size=batch_size):
         nlg_batch_texts = ["[NLG] " + text for text in batch_texts]
         encoded = tokenizer(nlg_batch_texts, return_tensors="pt", padding=True, truncation=True)

+from typing import  List, Tuple
 import streamlit as st
 import torch
+from optimum.bettertransformer import BetterTransformer
 from torch import nn, qint8
+from torch.quantization import quantize_dynamic
+from transformers import T5ForConditionalGeneration, T5Tokenizer
 @st.cache_resource(show_spinner=False)
 def get_resources(quantize: bool = True, no_cuda: bool = False) -> Tuple[T5ForConditionalGeneration, T5Tokenizer]:
+    """Load a T5 model and its (slow) tokenizer"""
     tokenizer = T5Tokenizer.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023", use_fast=False)
     model = T5ForConditionalGeneration.from_pretrained("BramVanroy/ul2-base-dutch-simplification-mai-2023")
 def batchify(iterable, batch_size=16):
+    """Turn an iterable into a batch generator
+    :param iterable: iterable to batchify
+    :param batch_size: batch size
+    """
     num_items = len(iterable)
     for idx in range(0, num_items, batch_size):
+        yield iterable[idx : min(idx + batch_size, num_items)]
 def simplify(
+    texts: List[str], model: T5ForConditionalGeneration, tokenizer: T5Tokenizer, batch_size: int = 16
 ) -> List[str]:
+    """Simplify a given set of texts with a given model and tokenizer. Yields results in batches of 'batch_size'
+    :param texts: texts to simplify
+    :param model: model to use for simplification
+    :param tokenizer: tokenizer to use for simplification
+    :param batch_size: batch size to yield results in
     """
     for batch_texts in batchify(texts, batch_size=batch_size):
         nlg_batch_texts = ["[NLG] " + text for text in batch_texts]
         encoded = tokenizer(nlg_batch_texts, return_tensors="pt", padding=True, truncation=True)