Bram Vanroy committed on
Commit
c2302bf
1 Parent(s): 5ddc459

push dummy

Browse files
Files changed (7) hide show
  1. .dockerignore +157 -0
  2. .gitignore +237 -0
  3. Dockerfile +21 -0
  4. README.md +8 -6
  5. app.py +43 -0
  6. requirements.txt +8 -0
  7. utils.py +58 -0
.dockerignore ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **/.git
2
+ **/.venv
3
+ **/.mypy_cache
4
+ **/.idea
5
+
6
+ # Byte-compiled / optimized / DLL files
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+
11
+ # C extensions
12
+ *.so
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ share/python-wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+ MANIFEST
33
+
34
+ # PyInstaller
35
+ # Usually these files are written by a python script from a template
36
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
37
+ *.manifest
38
+ *.spec
39
+
40
+ # Installer logs
41
+ pip-log.txt
42
+ pip-delete-this-directory.txt
43
+
44
+ # Unit test / coverage reports
45
+ htmlcov/
46
+ .tox/
47
+ .nox/
48
+ .coverage
49
+ .coverage.*
50
+ .cache
51
+ nosetests.xml
52
+ coverage.xml
53
+ *.cover
54
+ *.py,cover
55
+ .hypothesis/
56
+ .pytest_cache/
57
+ cover/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+ db.sqlite3-journal
68
+
69
+ # Flask stuff:
70
+ instance/
71
+ .webassets-cache
72
+
73
+ # Scrapy stuff:
74
+ .scrapy
75
+
76
+ # Sphinx documentation
77
+ docs/_build/
78
+
79
+ # PyBuilder
80
+ .pybuilder/
81
+ target/
82
+
83
+ # Jupyter Notebook
84
+ .ipynb_checkpoints
85
+
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
+ # pyenv
91
+ # For a library or package, you might want to ignore these files since the code is
92
+ # intended to run in multiple environments; otherwise, check them in:
93
+ # .python-version
94
+
95
+ # pipenv
96
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
98
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
99
+ # install all needed dependencies.
100
+ #Pipfile.lock
101
+
102
+ # poetry
103
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
104
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
105
+ # commonly ignored for libraries.
106
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
107
+ #poetry.lock
108
+
109
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
110
+ __pypackages__/
111
+
112
+ # Celery stuff
113
+ celerybeat-schedule
114
+ celerybeat.pid
115
+
116
+ # SageMath parsed files
117
+ *.sage.py
118
+
119
+ # Environments
120
+ .env
121
+ .venv
122
+ env/
123
+ venv/
124
+ ENV/
125
+ env.bak/
126
+ venv.bak/
127
+
128
+ # Spyder project settings
129
+ .spyderproject
130
+ .spyproject
131
+
132
+ # Rope project settings
133
+ .ropeproject
134
+
135
+ # mkdocs documentation
136
+ /site
137
+
138
+ # mypy
139
+ .mypy_cache/
140
+ .dmypy.json
141
+ dmypy.json
142
+
143
+ # Pyre type checker
144
+ .pyre/
145
+
146
+ # pytype static type analyzer
147
+ .pytype/
148
+
149
+ # Cython debug symbols
150
+ cython_debug/
151
+
152
+ # PyCharm
153
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
154
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
155
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
156
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
157
+ #.idea/
.gitignore ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Pipfile*
2
+ data/*
3
+ *config.json
4
+
5
+
6
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
7
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
8
+
9
+ .idea/
10
+ # User-specific stuff
11
+ .idea/**/workspace.xml
12
+ .idea/**/tasks.xml
13
+ .idea/**/usage.statistics.xml
14
+ .idea/**/dictionaries
15
+ .idea/**/shelf
16
+
17
+ # AWS User-specific
18
+ .idea/**/aws.xml
19
+
20
+ # Generated files
21
+ .idea/**/contentModel.xml
22
+
23
+ # Sensitive or high-churn files
24
+ .idea/**/dataSources/
25
+ .idea/**/dataSources.ids
26
+ .idea/**/dataSources.local.xml
27
+ .idea/**/sqlDataSources.xml
28
+ .idea/**/dynamic.xml
29
+ .idea/**/uiDesigner.xml
30
+ .idea/**/dbnavigator.xml
31
+
32
+ # Gradle
33
+ .idea/**/gradle.xml
34
+ .idea/**/libraries
35
+
36
+ # Gradle and Maven with auto-import
37
+ # When using Gradle or Maven with auto-import, you should exclude module files,
38
+ # since they will be recreated, and may cause churn. Uncomment if using
39
+ # auto-import.
40
+ # .idea/artifacts
41
+ # .idea/compiler.xml
42
+ # .idea/jarRepositories.xml
43
+ # .idea/modules.xml
44
+ # .idea/*.iml
45
+ # .idea/modules
46
+ # *.iml
47
+ # *.ipr
48
+
49
+ # CMake
50
+ cmake-build-*/
51
+
52
+ # Mongo Explorer plugin
53
+ .idea/**/mongoSettings.xml
54
+
55
+ # File-based project format
56
+ *.iws
57
+
58
+ # IntelliJ
59
+ out/
60
+
61
+ # mpeltonen/sbt-idea plugin
62
+ .idea_modules/
63
+
64
+ # JIRA plugin
65
+ atlassian-ide-plugin.xml
66
+
67
+ # Cursive Clojure plugin
68
+ .idea/replstate.xml
69
+
70
+ # SonarLint plugin
71
+ .idea/sonarlint/
72
+
73
+ # Crashlytics plugin (for Android Studio and IntelliJ)
74
+ com_crashlytics_export_strings.xml
75
+ crashlytics.properties
76
+ crashlytics-build.properties
77
+ fabric.properties
78
+
79
+ # Editor-based Rest Client
80
+ .idea/httpRequests
81
+
82
+ # Android studio 3.1+ serialized cache file
83
+ .idea/caches/build_file_checksums.ser
84
+
85
+
86
+ # Byte-compiled / optimized / DLL files
87
+ __pycache__/
88
+ *.py[cod]
89
+ *$py.class
90
+
91
+ # C extensions
92
+ *.so
93
+
94
+ # Distribution / packaging
95
+ .Python
96
+ build/
97
+ develop-eggs/
98
+ dist/
99
+ downloads/
100
+ eggs/
101
+ .eggs/
102
+ lib/
103
+ lib64/
104
+ parts/
105
+ sdist/
106
+ var/
107
+ wheels/
108
+ share/python-wheels/
109
+ *.egg-info/
110
+ .installed.cfg
111
+ *.egg
112
+ MANIFEST
113
+
114
+ # PyInstaller
115
+ # Usually these files are written by a python script from a template
116
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
117
+ *.manifest
118
+ *.spec
119
+
120
+ # Installer logs
121
+ pip-log.txt
122
+ pip-delete-this-directory.txt
123
+
124
+ # Unit test / coverage reports
125
+ htmlcov/
126
+ .tox/
127
+ .nox/
128
+ .coverage
129
+ .coverage.*
130
+ .cache
131
+ nosetests.xml
132
+ coverage.xml
133
+ *.cover
134
+ *.py,cover
135
+ .hypothesis/
136
+ .pytest_cache/
137
+ cover/
138
+
139
+ # Translations
140
+ *.mo
141
+ *.pot
142
+
143
+ # Django stuff:
144
+ *.log
145
+ local_settings.py
146
+ db.sqlite3
147
+ db.sqlite3-journal
148
+
149
+ # Flask stuff:
150
+ instance/
151
+ .webassets-cache
152
+
153
+ # Scrapy stuff:
154
+ .scrapy
155
+
156
+ # Sphinx documentation
157
+ docs/_build/
158
+
159
+ # PyBuilder
160
+ .pybuilder/
161
+ target/
162
+
163
+ # Jupyter Notebook
164
+ .ipynb_checkpoints
165
+
166
+ # IPython
167
+ profile_default/
168
+ ipython_config.py
169
+
170
+ # pyenv
171
+ # For a library or package, you might want to ignore these files since the code is
172
+ # intended to run in multiple environments; otherwise, check them in:
173
+ # .python-version
174
+
175
+ # pipenv
176
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
177
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
178
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
179
+ # install all needed dependencies.
180
+ #Pipfile.lock
181
+
182
+ # poetry
183
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
184
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
185
+ # commonly ignored for libraries.
186
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
187
+ #poetry.lock
188
+
189
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
190
+ __pypackages__/
191
+
192
+ # Celery stuff
193
+ celerybeat-schedule
194
+ celerybeat.pid
195
+
196
+ # SageMath parsed files
197
+ *.sage.py
198
+
199
+ # Environments
200
+ .env
201
+ .venv
202
+ env/
203
+ venv/
204
+ ENV/
205
+ env.bak/
206
+ venv.bak/
207
+
208
+ # Spyder project settings
209
+ .spyderproject
210
+ .spyproject
211
+
212
+ # Rope project settings
213
+ .ropeproject
214
+
215
+ # mkdocs documentation
216
+ /site
217
+
218
+ # mypy
219
+ .mypy_cache/
220
+ .dmypy.json
221
+ dmypy.json
222
+
223
+ # Pyre type checker
224
+ .pyre/
225
+
226
+ # pytype static type analyzer
227
+ .pytype/
228
+
229
+ # Cython debug symbols
230
+ cython_debug/
231
+
232
+ # PyCharm
233
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
234
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
235
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
236
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
237
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10.10

WORKDIR /app

# Copy only the requirements first so the dependency layer stays cached
# when just the application code changes.
COPY ./requirements.txt /app/requirements.txt

RUN pip3 install --no-cache-dir -r /app/requirements.txt

# Run the application as a non-root user with UID 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user
ENV PATH=$HOME/.local/bin:$PATH

WORKDIR $HOME
RUN mkdir app
WORKDIR $HOME/app
# --chown makes the copied files owned by the runtime user instead of root.
COPY --chown=user . $HOME/app

EXPOSE 8501
# Exec form: streamlit is started directly instead of under `/bin/sh -c`,
# so it receives stop signals from the container runtime itself.
CMD ["streamlit", "run", "app.py"]
README.md CHANGED
@@ -1,13 +1,15 @@
1
  ---
2
- title: Mai Simplification Nl 2023 Demo
3
  emoji: 🏃
4
  colorFrom: indigo
5
  colorTo: yellow
6
- sdk: streamlit
7
- sdk_version: 1.19.0
8
  app_file: app.py
9
- pinned: false
10
  license: cc-by-nc-sa-4.0
 
 
 
 
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Dutch Simplification
3
  emoji: 🏃
4
  colorFrom: indigo
5
  colorTo: yellow
6
+ sdk: docker
7
+ app_port: 8501
8
  app_file: app.py
9
+ pinned: true
10
  license: cc-by-nc-sa-4.0
11
+ tags:
12
+ - natural language processing
13
+ - simplification
14
+ - dutch
15
  ---
 
 
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils import get_resources, simplify
2
+
3
+ import streamlit as st
4
+
5
st.set_page_config(
    page_title="Text Simplification in Dutch",
    page_icon="🏃"
)

st.title("🏃 Text Simplification in Dutch")

# Input form: the script only acts on the text once the user presses "Submit".
with st.form("input data"):
    text = st.text_area(label="Input text", value="Met het naderen van de zonovergoten middaghemel op deze betoverende dag, waarbij de atmosferische omstandigheden een onbelemmerde convergentie van cumulusbewolking en uitgestrekte stratosferische azuurblauwe wijdheid faciliteren, lijken de geaggregeerde weersverschijnselen van vandaag, die variëren van sporadische plensbuien tot kalme zuchtjes wind en zeldzame opvlammingen van bliksem, de delicate balans tussen meteorologische complexiteit en eenvoud te weerspiegelen, waardoor de gepassioneerde observator met een gevoel van ontzag en verwondering wordt vervuld.")
    submitted = st.form_submit_button("Submit")

# Placeholder used for both the validation error and the loading notice.
error_ct = st.empty()
if submitted:
    text = text.strip()
    if not text:
        error_ct.error("Text cannot be empty!", icon="⚠️")
    else:
        # This notice is visible while the model resources are being loaded
        # and is removed again right before generation starts.
        error_ct.info("Loading the simplification model...", icon="💻")

        model, tokenizer, streamer = get_resources()
        error_ct.empty()

        # `simplify` yields the cumulative text generated so far, so each
        # update must overwrite the previous one in a single placeholder
        # rather than append a new element to the page.
        output_ct = st.empty()
        for stream_simplification in simplify(text, model, tokenizer, streamer):
            output_ct.write(stream_simplification)


########################
# Information, socials #
########################
st.header("Project background")

# TODO: the project background text is still empty.
st.markdown("""""")


st.header("Contact ✒️")

st.markdown("Would you like additional functionality in the demo, do you have questions, or just want to get in touch?"
            " Give me a shout on [Twitter](https://twitter.com/BramVanroy)"
            " or add me on [LinkedIn](https://www.linkedin.com/in/bramvanroy/)!")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ numpy==1.24.3
2
+ optimum==1.8.6
3
+ torch==2.0.1
4
+ sacremoses==0.0.53
5
+ sentencepiece==0.1.99
6
+ streamlit==1.22.0
7
+ transformers==4.29.2
8
+ tornado==6.3.2
utils.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from threading import Thread
2
+ from typing import Tuple, Generator
3
+
4
+ from optimum.bettertransformer import BetterTransformer
5
+ import streamlit as st
6
+ import torch
7
+ from torch.quantization import quantize_dynamic
8
+ from torch import nn, qint8
9
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, TextStreamer, TextIteratorStreamer
10
+
11
+
12
@st.cache_resource(show_spinner=False)
def get_resources(quantize: bool = True, no_cuda: bool = False) -> Tuple[T5ForConditionalGeneration, T5Tokenizer, TextIteratorStreamer]:
    """Load the simplification model, its tokenizer and a text streamer.

    The result is cached by ``st.cache_resource``, so the objects are created
    once per distinct argument combination and then reused.

    Args:
        quantize: Apply dynamic int8 quantization when the model is not moved
            to CUDA.
        no_cuda: Keep the model off CUDA even when a CUDA device is available.

    Returns:
        A ``(model, tokenizer, streamer)`` tuple. The model is in eval mode.
    """
    model_name = "BramVanroy/ul2-base-dutch-simplification-mai-2023"
    tokenizer = T5Tokenizer.from_pretrained(model_name, use_fast=False)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    model = BetterTransformer.transform(model, keep_original_model=False)
    model.resize_token_embeddings(len(tokenizer))

    if torch.cuda.is_available() and not no_cuda:
        model = model.to("cuda")
    elif quantize:  # Quantization not supported on CUDA
        model = quantize_dynamic(model, {nn.Linear, nn.Dropout, nn.LayerNorm}, dtype=qint8)

    model.eval()

    # The streamer collects extra keyword arguments as ``**decode_kwargs`` and
    # forwards them to ``tokenizer.decode``. They must therefore be passed as
    # individual keywords; wrapping them in a ``decode_kwargs=`` dict would
    # hand ``decode`` a single unknown keyword instead of these options.
    # NOTE(review): the streamer is part of the cached return value, so the
    # same instance is handed out on every call - confirm this is acceptable
    # when several sessions generate at the same time.
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    return model, tokenizer, streamer
31
+
32
+
33
def simplify(
    text: str,
    model: T5ForConditionalGeneration,
    tokenizer: T5Tokenizer,
    streamer: TextIteratorStreamer
) -> Generator[str, None, None]:
    """Simplify ``text`` and stream the result as it is being generated.

    Generation runs in a background thread that feeds ``streamer``; this
    generator consumes the streamer and yields the text produced so far.

    Args:
        text: The input text. The ``"[NLG] "`` prefix is prepended here.
        model: The model whose ``generate`` method produces the output.
        tokenizer: Tokenizer used to encode ``text``.
        streamer: Streamer passed to ``generate`` and iterated for new text.

    Yields:
        The cumulative generated text after each new chunk from the streamer.
    """
    text = "[NLG] " + text

    encoded = tokenizer(text, return_tensors="pt")
    encoded = {k: v.to(model.device) for k, v in encoded.items()}
    gen_kwargs = {
        **encoded,
        "max_new_tokens": 128,
        "streamer": streamer,
    }

    def _generate() -> None:
        # Grad mode is thread-local: entering no_grad in the calling thread
        # does not affect the worker, so it is entered inside the worker.
        with torch.no_grad():
            model.generate(**gen_kwargs)

    thread = Thread(target=_generate)
    thread.start()

    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text

    # The streamer is exhausted, so the worker should be finishing up; wait
    # for it so no generation thread is left behind.
    thread.join()