Spaces:

klasocki
/

comma-fixer

Running

App Files Files Community

klasocki committed on Aug 17, 2023

Commit

5760b44

•

1 Parent(s): 79999ac

Add baseline, server, unit tests and failing integration tests

Browse files

Files changed (8) hide show

.gitignore +77 -0
app.py +28 -0
openapi.yaml +41 -0
requirements.txt +8 -0
src/baseline.py +29 -0
tests/__init__.py +0 -0
tests/test_baseline.py +41 -0
tests/test_integration.py +34 -0

.gitignore CHANGED Viewed

@@ -158,3 +158,80 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/

 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+# AWS User-specific
+.idea/**/aws.xml
+# Generated files
+.idea/**/contentModel.xml
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+# CMake
+cmake-build-*/
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+# File-based project format
+*.iws
+# IntelliJ
+out/
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+# JIRA plugin
+atlassian-ide-plugin.xml
+# Cursive Clojure plugin
+.idea/replstate.xml
+# SonarLint plugin
+.idea/sonarlint/
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+# Editor-based Rest Client
+.idea/httpRequests
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser

app.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from flask import Flask, request, jsonify, make_response
+from baseline import fix_commas, create_baseline_pipeline
+import logging
+app = Flask(__name__)
+logger = logging.Logger(__name__)
+logging.basicConfig(level=logging.INFO)
+@app.route('/', methods=['GET'])
+def root():
+    return ("Welcome to the comma fixer. Go to /fix-commas?s='some text' or /baseline/fix-commas?s='some text' to try "
+            "out the functionality.")
+@app.route('/baseline/fix-commas/', methods=['POST'])
+def fix_commas_with_baseline():
+    data = request.get_json()
+    if 's' in data:
+        return make_response(jsonify({"s": fix_commas(app.baseline_pipeline, data['s'])}), 200)
+    else:
+        return make_response("Parameter 's' missing", 400)
+# Script entry point: load the model once, then start the Flask server.
+if __name__ == '__main__':
+    logger.info("Loading the baseline model.")
+    # Stored on the app object so the request handler can reach the pipeline.
+    app.baseline_pipeline = create_baseline_pipeline()
+    app.run(debug=True)

openapi.yaml ADDED Viewed

	@@ -0,0 +1,41 @@

+openapi: 3.0.3
+info:
+  title: Comma fixer
+  description: Comma fixer, using machine learning to fix placement of commas within a string of text.
+  version: 1.0.0
+servers:
+  - url: 'https://localhost:5000'
+paths:
+  /baseline/fix-commas:
+    post:
+      summary: Fixes comma placement in a sentence using the baseline model
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                s:
+                  type: string
+                  example: 'This, is a sentence with wrong commas at least some.'
+                  description: The text whose commas should be fixed. Commas can be removed, added, reordered at will, or left unchanged.
+      responses:
+        200:
+          description: Commas fixed.
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  s:
+                    type: string
+                    example: 'This is a sentence with wrong commas, at least some.'
+                    description: A text with commas fixed, or unchanged if not necessary.
+                      TODO WARNING - the text will have spaces normalized and trimmed at the start and end.
+                      TODO some other punctuation may be changed as well
+        400:
+          description: Input text parameter 's' missing from the request body.

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+flask == 2.2.2
+pytest
+torch == 2.0.1
+transformers == 4.31.0
+# for the tokenizer of the baseline model
+protobuf == 4.24.0
+sentencepiece==0.1.99

src/baseline.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, NerPipeline
+def create_baseline_pipeline() -> NerPipeline:
+    tokenizer = AutoTokenizer.from_pretrained("oliverguhr/fullstop-punctuation-multilang-large")
+    model = AutoModelForTokenClassification.from_pretrained("oliverguhr/fullstop-punctuation-multilang-large")
+    return pipeline('ner', model=model, tokenizer=tokenizer)
+def _remove_punctuation(s: str) -> str:
+    to_remove = ".,?-:"
+    for char in to_remove:
+        s = s.replace(char, '')
+    return s
+def _convert_pipeline_json_to_string(pipeline_json: list[dict]) -> str:
+    # TODO is it ok to remove redundant spaces, or should we keep input data as is and only touch commas?
+    # TODO don't accept tokens with commas inside words
+    return ''.join(
+        token['word'].replace('▁', ' ') + token['entity'].replace('0', '')
+        for token in pipeline_json
+    ).strip()
+def fix_commas(ner_pipeline: NerPipeline, s: str) -> str:
+    return _convert_pipeline_json_to_string(
+        ner_pipeline(_remove_punctuation(s))
+    )

tests/__init__.py ADDED Viewed

File without changes

tests/test_baseline.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import pytest
+from baseline import create_baseline_pipeline, fix_commas, _remove_punctuation
+@pytest.fixture()
+def baseline_pipeline():
+    yield create_baseline_pipeline()
+@pytest.mark.parametrize(
+    "test_input",
+    ['',
+     'Hello world.',
+     'This test string should not have any commas inside it.']
+)
+def test_fix_commas_leaves_correct_strings_unchanged(baseline_pipeline, test_input):
+    result = fix_commas(baseline_pipeline, s=test_input)
+    assert result == test_input
+@pytest.mark.parametrize(
+    "test_input, expected",
+    [
+        ['I, am', 'I am.'],
+        ['A complex     clause however it misses a comma something else and a dot?',
+         'A complex claus,e, however, it misses a comma, something else and a dot.']]
+)
+def test_fix_commas_fixes_incorrect_commas(baseline_pipeline, test_input, expected):
+    result = fix_commas(baseline_pipeline, s=test_input)
+    assert result == expected
+@pytest.mark.parametrize(
+    "test_input, expected",
+    [['', ''],
+     ['Hello world...', 'Hello world'],
+     ['This: test - string should not, have any commas inside it...?',
+      'This test  string should not have any commas inside it']]
+)
+def test__remove_punctuation(test_input, expected):
+    assert _remove_punctuation(test_input) == expected

tests/test_integration.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import json
+from app import app
+import pytest
+def test_fix_commas_fails_on_no_parameter():
+    response = app.test_client().post('/baseline/fix-commas/')
+    assert response.status_code == 400
+@pytest.mark.parametrize(
+    "test_input",
+    [[''],
+     ['Hello world.'],
+     ['This test string should not have any commas inside it.']]
+)
+def test_fix_commas_plain_string_unchanged(test_input: str):
+    response = app.test_client().post('/baseline/fix-commas/', data={'s': test_input})
+    print(response.data.decode('utf-8'))
+    # result = json.loads(response.data.decode('utf-8')).get('s')
+    assert response.status_code == 200
+    # assert result == test_input
+@pytest.mark.parametrize(
+    "test_input, expected",
+    [['', ''],
+     ['Hello world.', 'Hello world.'],
+     ['This test string should not have any commas inside it.',
+      'This test string should not have any commas inside it.']]
+)
+def test_fix_commas_fixes_wrong_commas(test_input: str, expected: str):
+    # Placeholder: no request is made yet, so every parametrized case fails
+    # unconditionally.
+    assert False