klasocki committed on
Commit
5760b44
1 Parent(s): 79999ac

Add baseline, server, unit tests and failing integration tests

Browse files
.gitignore CHANGED
@@ -158,3 +158,80 @@ cython_debug/
158
  # and can be added to the global gitignore or merged into this file. For a more nuclear
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
  #.idea/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  # and can be added to the global gitignore or merged into this file. For a more nuclear
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
  #.idea/
161
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
162
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
163
+
164
+ # User-specific stuff
165
+ .idea/**/workspace.xml
166
+ .idea/**/tasks.xml
167
+ .idea/**/usage.statistics.xml
168
+ .idea/**/dictionaries
169
+ .idea/**/shelf
170
+
171
+ # AWS User-specific
172
+ .idea/**/aws.xml
173
+
174
+ # Generated files
175
+ .idea/**/contentModel.xml
176
+
177
+ # Sensitive or high-churn files
178
+ .idea/**/dataSources/
179
+ .idea/**/dataSources.ids
180
+ .idea/**/dataSources.local.xml
181
+ .idea/**/sqlDataSources.xml
182
+ .idea/**/dynamic.xml
183
+ .idea/**/uiDesigner.xml
184
+ .idea/**/dbnavigator.xml
185
+
186
+ # Gradle
187
+ .idea/**/gradle.xml
188
+ .idea/**/libraries
189
+
190
+ # Gradle and Maven with auto-import
191
+ # When using Gradle or Maven with auto-import, you should exclude module files,
192
+ # since they will be recreated, and may cause churn. Uncomment if using
193
+ # auto-import.
194
+ # .idea/artifacts
195
+ # .idea/compiler.xml
196
+ # .idea/jarRepositories.xml
197
+ # .idea/modules.xml
198
+ # .idea/*.iml
199
+ # .idea/modules
200
+ # *.iml
201
+ # *.ipr
202
+
203
+ # CMake
204
+ cmake-build-*/
205
+
206
+ # Mongo Explorer plugin
207
+ .idea/**/mongoSettings.xml
208
+
209
+ # File-based project format
210
+ *.iws
211
+
212
+ # IntelliJ
213
+ out/
214
+
215
+ # mpeltonen/sbt-idea plugin
216
+ .idea_modules/
217
+
218
+ # JIRA plugin
219
+ atlassian-ide-plugin.xml
220
+
221
+ # Cursive Clojure plugin
222
+ .idea/replstate.xml
223
+
224
+ # SonarLint plugin
225
+ .idea/sonarlint/
226
+
227
+ # Crashlytics plugin (for Android Studio and IntelliJ)
228
+ com_crashlytics_export_strings.xml
229
+ crashlytics.properties
230
+ crashlytics-build.properties
231
+ fabric.properties
232
+
233
+ # Editor-based Rest Client
234
+ .idea/httpRequests
235
+
236
+ # Android studio 3.1+ serialized cache file
237
+ .idea/caches/build_file_checksums.ser
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, make_response
2
+ from baseline import fix_commas, create_baseline_pipeline
3
+ import logging
4
+
5
+ app = Flask(__name__)
6
# Obtain the logger from the logging registry rather than instantiating
# logging.Logger directly: a directly constructed Logger has no parent and no
# handlers, so the root configuration applied by basicConfig() never reaches it
# and INFO-level messages are silently dropped.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
8
+
9
+
10
@app.route('/', methods=['GET'])
def root():
    """Return a plain-text welcome message describing how to call the service."""
    # The comma-fixing route in this module accepts POST requests with a JSON
    # body, so the hint describes that instead of a query-string GET.
    return ('Welcome to the comma fixer. Send a POST request to /baseline/fix-commas/ with a JSON body like '
            '{"s": "some text"} to try out the functionality.')
14
+
15
+
16
@app.route('/baseline/fix-commas/', methods=['POST'])
def fix_commas_with_baseline():
    """Fix comma placement in the posted text using the baseline model.

    Expects a JSON object body with a string field ``s``.

    Returns:
        A JSON response ``{"s": <fixed text>}`` with status 200, or a
        plain-text message with status 400 when the body is not a JSON
        object, ``s`` is missing, or ``s`` is not a string.
    """
    # silent=True yields None for a missing or non-JSON body instead of raising,
    # so every malformed request goes through the same explicit 400 path.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 's' not in data:
        return make_response("Parameter 's' missing", 400)
    if not isinstance(data['s'], str):
        return make_response("Parameter 's' must be a string", 400)

    # The pipeline is attached to the app only by the __main__ entry point;
    # load it lazily so the route also works when the module is imported by a
    # WSGI server or the test client.
    baseline = getattr(app, 'baseline_pipeline', None)
    if baseline is None:
        baseline = app.baseline_pipeline = create_baseline_pipeline()

    return make_response(jsonify({"s": fix_commas(baseline, data['s'])}), 200)
23
+
24
+
25
if __name__ == '__main__':
    # Script entry point: build the model pipeline once, attach it to the app
    # object for the request handlers, then start the Flask development server.
    logger.info("Loading the baseline model.")
    loaded_pipeline = create_baseline_pipeline()
    app.baseline_pipeline = loaded_pipeline
    # NOTE(review): debug=True turns on Flask's debugger and reloader; this
    # looks intended for local development only - confirm before deploying.
    app.run(debug=True)
openapi.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openapi: 3.0.3
2
+ info:
3
+ title: Comma fixer
4
+ description: Comma fixer, using machine learning to fix placement of commas within a string of text.
5
+ version: 1.0.0
6
+ servers:
7
+ - url: 'http://localhost:5000'
8
+ paths:
9
+ /baseline/fix-commas:
10
+ post:
11
+ summary: Fixes comma placement in a sentence using the baseline model
12
+ requestBody:
13
+ required: true
14
+ content:
15
+ application/json:
16
+ schema:
17
+ type: object
18
+ properties:
19
+ s:
20
+ type: string
21
+ example: 'This, is a sentence with wrong commas at least some.'
22
+ description: The text with commas to fix, can be removed, added, reordered at will, or left unchanged.
23
+ responses:
24
+ 200:
25
+ description: Commas fixed.
26
+ content:
27
+ application/json:
28
+ schema:
29
+ type: object
30
+ properties:
31
+ s:
32
+ type: string
33
+ example: 'This is a sentence with wrong commas, at least some.'
34
+ description: A text with commas fixed, or unchanged if not necessary.
35
+ TODO WARNING - the text will have spaces normalized and trimmed at the start and end.
36
+ TODO some other punctuation may be changed as well
37
+
38
+ 400:
39
+ description: Input text parameter 's' missing from the JSON request body.
40
+
41
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ flask == 2.2.2
2
+ pytest
3
+ torch == 2.0.1
4
+ transformers == 4.31.0
5
+
6
+ # for the tokenizer of the baseline model
7
+ protobuf == 4.24.0
8
+ sentencepiece==0.1.99
src/baseline.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, NerPipeline
2
+
3
+
4
def create_baseline_pipeline() -> NerPipeline:
    """Build the token-classification ('ner') pipeline used as the baseline.

    The tokenizer and the model are both loaded from the same pretrained
    checkpoint and wrapped in a transformers pipeline.
    """
    checkpoint = "oliverguhr/fullstop-punctuation-multilang-large"
    baseline_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    baseline_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    return pipeline('ner', model=baseline_model, tokenizer=baseline_tokenizer)
8
+
9
+
10
+ def _remove_punctuation(s: str) -> str:
11
+ to_remove = ".,?-:"
12
+ for char in to_remove:
13
+ s = s.replace(char, '')
14
+ return s
15
+
16
+
17
+ def _convert_pipeline_json_to_string(pipeline_json: list[dict]) -> str:
18
+ # TODO is it ok to remove redundant spaces, or should we keep input data as is and only touch commas?
19
+ # TODO don't accept tokens with commas inside words
20
+ return ''.join(
21
+ token['word'].replace('▁', ' ') + token['entity'].replace('0', '')
22
+ for token in pipeline_json
23
+ ).strip()
24
+
25
+
26
def fix_commas(ner_pipeline: NerPipeline, s: str) -> str:
    """Re-punctuate ``s`` using the given pipeline.

    The input is first stripped of its existing punctuation, then passed
    through ``ner_pipeline``; the resulting token predictions are joined
    back into a string.
    """
    unpunctuated = _remove_punctuation(s)
    predictions = ner_pipeline(unpunctuated)
    return _convert_pipeline_json_to_string(predictions)
tests/__init__.py ADDED
File without changes
tests/test_baseline.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from baseline import create_baseline_pipeline, fix_commas, _remove_punctuation
3
+
4
+
5
# Module scope: the pipeline is created once and shared by every test in this
# file instead of being rebuilt for each parametrized case. The tests only
# call it and never modify it.
@pytest.fixture(scope="module")
def baseline_pipeline():
    """Provide the baseline pipeline to the tests in this module."""
    yield create_baseline_pipeline()
8
+
9
+
10
@pytest.mark.parametrize(
    "test_input",
    [
        '',
        'Hello world.',
        'This test string should not have any commas inside it.',
    ],
)
def test_fix_commas_leaves_correct_strings_unchanged(baseline_pipeline, test_input):
    """Inputs that are already correctly punctuated must come back unchanged."""
    fixed = fix_commas(baseline_pipeline, s=test_input)
    assert fixed == test_input
19
+
20
+
21
@pytest.mark.parametrize(
    "test_input, expected",
    [
        ['I, am', 'I am.'],
        [
            'A complex clause however it misses a comma something else and a dot?',
            'A complex claus,e, however, it misses a comma, something else and a dot.',
        ],
    ],
)
def test_fix_commas_fixes_incorrect_commas(baseline_pipeline, test_input, expected):
    """Inputs with wrong or missing punctuation are rewritten to the expected text."""
    fixed = fix_commas(baseline_pipeline, s=test_input)
    assert fixed == expected
31
+
32
+
33
@pytest.mark.parametrize(
    "test_input, expected",
    [
        ['', ''],
        ['Hello world...', 'Hello world'],
        [
            'This: test - string should not, have any commas inside it...?',
            'This test string should not have any commas inside it',
        ],
    ],
)
def test__remove_punctuation(test_input, expected):
    """The helper strips the punctuation characters from its input."""
    stripped = _remove_punctuation(test_input)
    assert stripped == expected
tests/test_integration.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from app import app
4
+ import pytest
5
+
6
+
7
def test_fix_commas_fails_on_no_parameter():
    """A POST without any body must be rejected with HTTP 400."""
    client = app.test_client()
    status = client.post('/baseline/fix-commas/').status_code
    assert status == 400
10
+
11
+
12
# With a single argname, each entry of the value list is passed as-is, so the
# cases are plain strings rather than one-element lists.
@pytest.mark.parametrize(
    "test_input",
    ['',
     'Hello world.',
     'This test string should not have any commas inside it.']
)
def test_fix_commas_plain_string_unchanged(test_input: str):
    """Correctly punctuated text posted to the endpoint is returned unchanged."""
    # The endpoint reads a JSON body, so send the payload with json= (which
    # also sets the JSON content type) instead of form-encoded data=.
    response = app.test_client().post('/baseline/fix-commas/', json={'s': test_input})
    assert response.status_code == 200
    result = json.loads(response.data.decode('utf-8')).get('s')
    assert result == test_input
24
+
25
+
26
@pytest.mark.parametrize(
    "test_input, expected",
    [['', ''],
     ['Hello world.', 'Hello world.'],
     ['This test string should not have any commas inside it.',
      'This test string should not have any commas inside it.'],
     # A case that actually contains a misplaced comma.
     ['I, am', 'I am.']]
)
def test_fix_commas_fixes_wrong_commas(test_input: str, expected: str):
    """Text posted to the endpoint comes back with the expected punctuation."""
    response = app.test_client().post('/baseline/fix-commas/', json={'s': test_input})
    assert response.status_code == 200
    result = json.loads(response.data.decode('utf-8')).get('s')
    assert result == expected