Spaces:

klasocki
/

comma-fixer

Sleeping

App Files Files Community

klasocki committed on Aug 24, 2023

Commit

ca2592c

•

1 Parent(s): 65977ce

Integrate the fine-tuned comma fixer into the app

Browse files

Files changed (10) hide show

Dockerfile +4 -2
app.py +2 -1
commafixer/routers/fixer.py +25 -0
commafixer/src/fixer.py +90 -0
openapi.yaml +34 -0
setup.py +2 -1
static/index.html +34 -25
static/script.js +10 -5
tests/{test_baseline.py → test_comma_fixers.py} +45 -6
tests/test_integration.py +46 -45

Dockerfile CHANGED Viewed

@@ -10,9 +10,11 @@ COPY setup.py .
 RUN pip install --upgrade pip
 RUN pip install --no-cache-dir --upgrade .
-COPY commafixer/src/baseline.py commafixer/src/baseline.py
 ENV TRANSFORMERS_CACHE=/coma-fixer/.cache
-RUN python commafixer/src/baseline.py  # This pre-downloads models and tokenizers
 COPY . .

 RUN pip install --upgrade pip
 RUN pip install --no-cache-dir --upgrade .
+# This pre-downloads models and tokenizers
+COPY commafixer/src/ commafixer/src/
 ENV TRANSFORMERS_CACHE=/coma-fixer/.cache
+RUN python commafixer/src/baseline.py
+RUN python commafixer/src/fixer.py
 COPY . .

app.py CHANGED Viewed

@@ -4,9 +4,10 @@ from fastapi import FastAPI
 from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
-from commafixer.routers import baseline
 app = FastAPI()
 app.include_router(baseline.router, prefix='/baseline')
 # Without the realpath hack tests fail

 from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
+from commafixer.routers import baseline, fixer
 app = FastAPI()
+app.include_router(fixer.router, prefix='/fix-commas')
 app.include_router(baseline.router, prefix='/baseline')
 # Without the realpath hack tests fail

commafixer/routers/fixer.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from fastapi import APIRouter, HTTPException
+import logging
+from commafixer.src.fixer import CommaFixer
+logger = logging.Logger(__name__)
+logging.basicConfig(level=logging.INFO)
+router = APIRouter()
+logger.info('Loading the main comma fixer model...')
+router.model = CommaFixer()
+@router.post('/')
+async def fix_commas(data: dict):
+    json_field_name = 's'
+    if json_field_name in data:
+        logger.debug('Fixing commas.')
+        return {json_field_name: router.model.fix_commas(data['s'])}
+    else:
+        msg = f"Text '{json_field_name}' missing"
+        logger.debug(msg)
+        raise HTTPException(status_code=400, detail=msg)

commafixer/src/fixer.py ADDED Viewed

	@@ -0,0 +1,90 @@

+from peft import PeftConfig, PeftModel
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, NerPipeline, RobertaTokenizerFast
+import nltk
+import re
+class CommaFixer:
+    """
+    A wrapper class for the fine-tuned comma fixer model.
+    """
+    def __init__(self, device=-1):
+        self.id2label = {0: 'O', 1: 'B-COMMA'}
+        self.label2id = {'O': 0, 'B-COMMA': 1}
+        self.model, self.tokenizer = self._load_peft_model()
+    def fix_commas(self, s: str) -> str:
+        """
+        The main method for fixing commas using the fine-tuned model.
+        In the future we should think about batching the calls to it, for now it processes requests string by string.
+        :param s: A string with commas to fix, without length restrictions.
+        However, if the string is longer than the length limit (512 tokens), some whitespaces might be trimmed.
+        Example: comma_fixer.fix_commas("One two thre, and four!")
+        :return: A string with commas fixed, example: "One, two, thre and four!"
+        """
+        s_no_commas = re.sub(r'\s*,', '', s)
+        tokenized = self.tokenizer(s_no_commas, return_tensors='pt', return_offsets_mapping=True, return_length=True)
+        # If text too long, split into sentences and fix commas separately.
+        # TODO this is slow, we should think about joining them until length, or maybe a length limit to avoid
+        #  stalling the whole service
+        if tokenized['length'][0] > self.tokenizer.model_max_length:
+            return ' '.join(self.fix_commas(sentence) for sentence in nltk.sent_tokenize(s))
+        logits = self.model(input_ids=tokenized['input_ids'], attention_mask=tokenized['attention_mask']).logits
+        labels = [self.id2label[tag_id.item()] for tag_id in logits.argmax(dim=2).flatten()]
+        return _fix_commas_based_on_labels_and_offsets(labels, s_no_commas, tokenized['offset_mapping'][0])
+    def _load_peft_model(self, model_name="klasocki/roberta-large-lora-ner-comma-fixer") -> tuple[
+        PeftModel, RobertaTokenizerFast]:
+        """
+        Creates the huggingface model and tokenizer.
+        Can also be used for pre-downloading the model and the tokenizer.
+        :param model_name: Name of the model on the huggingface hub.
+        :return: A model with the peft adapter injected and weights merged, and the tokenizer.
+        """
+        config = PeftConfig.from_pretrained(model_name)
+        inference_model = AutoModelForTokenClassification.from_pretrained(
+            config.base_model_name_or_path, num_labels=len(self.id2label), id2label=self.id2label,
+            label2id=self.label2id
+        )
+        tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
+        model = PeftModel.from_pretrained(inference_model, model_name)
+        model = model.merge_and_unload()  # Join LoRa matrices with the main model for faster inference
+        return model, tokenizer
+def _fix_commas_based_on_labels_and_offsets(
+        labels: list[str],
+        original_s: str,
+        offset_map: list[tuple[int, int]]
+) -> str:
+    """
+    This function returns the original string with only commas fixed, based on the predicted labels from the main
+    model and the offsets from the tokenizer.
+    :param labels: Predicted labels for the tokens.
+    Should already be converted to string, since we will look for B-COMMA tags.
+    :param original_s: The original string, used to preserve original spacing and punctuation.
+    :param offset_map: List of offsets in the original string, we will only use the second integer of each pair
+    indicating where the token ended originally in the string.
+    :return: The string with commas fixed, and everything else intact.
+    """
+    result = original_s
+    commas_inserted = 0
+    for i, label in enumerate(labels):
+        current_offset = offset_map[i][1] + commas_inserted
+        if _should_insert_comma(label, result, current_offset):
+            result = result[:current_offset] + ',' + result[current_offset:]
+            commas_inserted += 1
+    return result
+def _should_insert_comma(label, result, current_offset) -> bool:
+    # Only insert commas for the final token of a word, that is, if next word starts with a space.
+    return label == 'B-COMMA' and result[current_offset].isspace()
+if __name__ == "__main__":
+    # Running this module as a script only constructs the fixer, which loads the model and the tokenizer.
+    CommaFixer()  # to pre-download the model and tokenizer

openapi.yaml CHANGED Viewed

@@ -6,6 +6,40 @@ info:
 servers:
   - url: 'https://localhost:5000'
 paths:
   /baseline/fix-commas:
     post:
       summary: Fixes comma placement in a sentence using the baseline model

 servers:
   - url: 'https://localhost:5000'
 paths:
+  /fix-commas:
+    post:
+      summary: Fixes comma placement in a sentence using the fine-tuned model
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                s:
+                  type: string
+                  example: 'This, is a sentence with wrong commas at least some.'
+                  description: The text with commas to fix. Commas can be removed, added, reordered at will, or left
+                    unchanged. Other punctuation, whitespaces and so on will stay intact.
+      responses:
+        200:
+          description: Commas fixed.
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  s:
+                    type: string
+                    example: 'This is a sentence with wrong commas, at least some.'
+                    description: A text with commas fixed, or unchanged if not necessary. Everything other than
+                      commas will stay as it was originally.
+        400:
+          description: A required field missing from the POST request body JSON.
+# TODO remove duplication here
   /baseline/fix-commas:
     post:
       summary: Fixes comma placement in a sentence using the baseline model

setup.py CHANGED Viewed

@@ -11,6 +11,8 @@ setup(
     install_requires=[
         "fastapi == 0.101.1",
         "uvicorn == 0.23.2",
         "torch == 2.0.1",
         "transformers == 4.31.0",
         # for the tokenizer of the baseline model
@@ -22,7 +24,6 @@ setup(
         'training': [
             'datasets==2.14.4',
             'notebook',
-            'peft==0.5.0',
             'seqeval',
             'evaluate==0.4.0'
         ],

     install_requires=[
         "fastapi == 0.101.1",
         "uvicorn == 0.23.2",
+        "nltk == 3.8.1",
+        'peft==0.5.0',
         "torch == 2.0.1",
         "transformers == 4.31.0",
         # for the tokenizer of the baseline model
         'training': [
             'datasets==2.14.4',
             'notebook',
             'seqeval',
             'evaluate==0.4.0'
         ],

static/index.html CHANGED Viewed

@@ -1,36 +1,45 @@
 <!DOCTYPE html>
 <html lang="en">
-  <head>
-    <meta charset="UTF-8" />
-    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>Fast API 🤗 Space served with Uvicorn</title>
-    <link rel="stylesheet" href="style.css" />
     <script type="module" src="script.js"></script>
-  </head>
-  <body>
-    <main>
-      <section id="comma-fixing">
         <h2>Fixing commas using Transformers</h2>
         <p>
-          Model:
-          <a
-            href="https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large"
-            rel="noreferrer"
-            target="_blank"
             >oliverguhr/fullstop-punctuation-multilang-large
-          </a>
         </p>
         <form class="comma-fixing-form">
-          <label for="comma-fixing-input">Text with incorrect commas</label>
-          <input
-            id="comma-fixing-input"
-            type="text"
-            value="This is however a very bad, and terrible sentence grammatically that is."
-          />
-          <button id="comma-fixing-submit">Submit</button>
-          <p class="comma-fixing-output"></p>
         </form>
-      </section>
-    </main>
-  </body>
 </html>

 <!DOCTYPE html>
 <html lang="en">
+<head>
+    <meta charset="UTF-8"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
     <title>Fast API 🤗 Space served with Uvicorn</title>
+    <link rel="stylesheet" href="style.css"/>
     <script type="module" src="script.js"></script>
+</head>
+<body>
+<main>
+    <section id="comma-fixing">
         <h2>Fixing commas using Transformers</h2>
         <p>
+            Fine-tuned model:
+            <a
+                    href="https://huggingface.co/klasocki/roberta-large-lora-ner-comma-fixer"
+                    rel="noreferrer"
+                    target="_blank"
+            >klasocki/roberta-large-lora-ner-comma-fixer
+            </a>
+        </p>
+        <p>
+            Baseline model:
+            <a
+                    href="https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large"
+                    rel="noreferrer"
+                    target="_blank"
             >oliverguhr/fullstop-punctuation-multilang-large
+            </a>
         </p>
         <form class="comma-fixing-form">
+            <label for="comma-fixing-input">Text with incorrect commas</label>
+            <input
+                    id="comma-fixing-input"
+                    type="text"
+                    value="This is however a very bad, and terrible sentence grammatically that is."
+            />
+            <button id="comma-fixing-submit">Submit</button>
+            <p class="comma-fixing-output"></p>
         </form>
+    </section>
+</main>
+</body>
 </html>

static/script.js CHANGED Viewed

@@ -1,7 +1,7 @@
 const commaFixingForm = document.querySelector(".comma-fixing-form");
 const fixCommas = async (text) => {
-    const inferResponse = await fetch(`baseline/fix-commas/`, {
         method: "POST",
         body: JSON.stringify({
             s: text
@@ -9,10 +9,13 @@ const fixCommas = async (text) => {
         headers: {
             "Content-type": "application/json; charset=UTF-8"
         }
-    });
-    const inferJson = await inferResponse.json();
-    return inferJson.s;
 };
 commaFixingForm.addEventListener("submit", async (event) => {
@@ -21,5 +24,7 @@ commaFixingForm.addEventListener("submit", async (event) => {
     const commaFixingInput = document.getElementById("comma-fixing-input");
     const commaFixingParagraph = document.querySelector(".comma-fixing-output");
-    commaFixingParagraph.textContent = await fixCommas(commaFixingInput.value);
 });

 const commaFixingForm = document.querySelector(".comma-fixing-form");
 const fixCommas = async (text) => {
+    let request = {
         method: "POST",
         body: JSON.stringify({
             s: text
         headers: {
             "Content-type": "application/json; charset=UTF-8"
         }
+    };
+    const baselineResponse = await fetch(`baseline/fix-commas/`, request);
+    const fixerResponse = await fetch(`fix-commas/`, request);
+    const baselineJson = await baselineResponse.json();
+    const inferJson = await fixerResponse.json();
+    return {baseline: baselineJson.s, main: inferJson.s};
 };
 commaFixingForm.addEventListener("submit", async (event) => {
     const commaFixingInput = document.getElementById("comma-fixing-input");
     const commaFixingParagraph = document.querySelector(".comma-fixing-output");
+    const fixed = await fixCommas(commaFixingInput.value);
+    commaFixingParagraph.textContent = `Our model: ${fixed.main}\n\nBaseline model: ${fixed.baseline}`
 });

tests/{test_baseline.py → test_comma_fixers.py} RENAMED Viewed

@@ -1,12 +1,19 @@
 import pytest
 from commafixer.src.baseline import BaselineCommaFixer, _remove_punctuation
 @pytest.fixture()
 def baseline_fixer():
     yield BaselineCommaFixer()
 @pytest.mark.parametrize(
     "test_input",
     ['',
@@ -14,9 +21,40 @@ def baseline_fixer():
      'This test string should not have any commas inside it.',
      'aAaalLL the.. weird?~! punctuation.should also . be kept-as is! Only fixing-commas.']
 )
-def test_fix_commas_leaves_correct_strings_unchanged(baseline_fixer, test_input):
-    result = baseline_fixer.fix_commas(s=test_input)
-    assert result == test_input
 @pytest.mark.parametrize(
@@ -35,12 +73,13 @@ def test_fix_commas_leaves_correct_strings_unchanged(baseline_fixer, test_input)
          'nonetheless or we will fail this test.',
          ' This is a sentence. With a lot of useless punctuation!!??. O.o However, we have to insert commas O-O '
          'nonetheless, or we will fail this test.'],
-        [" The ship 's secondary armament consisted of fourteen 45 @-@ calibre 6 @-@ inch ( 152 mm ) quick @-@ firing ( QF ) guns mounted in casemates . Lighter guns consisted of eight 47 @-@ millimetre ( 1 @.@ 9 in ) three @-@ pounder Hotchkiss guns and four 47 @-@ millimetre 2 @.@ 5 @-@ pounder Hotchkiss guns . The ship was also equipped with four submerged 18 @-@ inch torpedo tubes two on each broadside .",
-         " The ship 's secondary armament consisted of fourteen 45 @-@ calibre 6 @-@ inch ( 152 mm ) quick @-@ firing ( QF ) guns mounted in casemates . Lighter guns consisted of eight 47 @-@ millimetre ( 1 @.@ 9 in ), three @-@ pounder Hotchkiss guns and four 47 @-@ millimetre 2 @.@ 5 @-@ pounder Hotchkiss guns . The ship was also equipped with four submerged 18 @-@ inch torpedo tubes, two on each broadside ."]
     ]
 )
-def test_fix_commas_fixes_incorrect_commas(baseline_fixer, test_input, expected):
     result = baseline_fixer.fix_commas(s=test_input)
     assert result == expected

 import pytest
 from commafixer.src.baseline import BaselineCommaFixer, _remove_punctuation
+from commafixer.src.fixer import CommaFixer
+# TODO look up best practices and duplication for tests like these
 @pytest.fixture()
 def baseline_fixer():
     yield BaselineCommaFixer()
+@pytest.fixture()
+def comma_fixer():
+    yield CommaFixer()
 @pytest.mark.parametrize(
     "test_input",
     ['',
      'This test string should not have any commas inside it.',
      'aAaalLL the.. weird?~! punctuation.should also . be kept-as is! Only fixing-commas.']
 )
+class TestCorrectStringsAreUnchanged:
+    def test_model_fix_commas_leaves_correct_strings_unchanged(self, comma_fixer, test_input):
+        result = comma_fixer.fix_commas(s=test_input)
+        assert result == test_input
+    def test_baseline_fix_commas_leaves_correct_strings_unchanged(self, baseline_fixer, test_input):
+        result = baseline_fixer.fix_commas(s=test_input)
+        assert result == test_input
+@pytest.mark.parametrize(
+    "test_input, expected",
+    [
+        ['I, am.', 'I am.'],
+        ['A complex     clause however it misses a comma something else and a dot...?',
+         'A complex     clause, however, it misses a comma, something else and a dot...?'],
+        ['a pen an apple, \tand a pineapple!',
+         'a pen, an apple \tand a pineapple!'],
+        ['Even newlines\ntabs\tand others get preserved.',
+         'Even newlines,\ntabs\tand others get preserved.'],
+        ['I had no Creativity left, therefore, I come here, and write useless examples, for this test.',
+         'I had no Creativity left, therefore I come here and write useless examples for this test.'],
+        [' This is a sentence. With, a lot of, useless punctuation!!??. O.o However we have to insert commas O-O, '
+         'nonetheless or we will fail this test.',
+         ' This is a sentence. With a lot of useless punctuation!!??. O.o However, we have to insert commas O-O '
+         'nonetheless, or we will fail this test.'],
+        [
+            " The ship 's secondary armament consisted of fourteen 45 @-@ calibre 6 @-@ inch ( 152 mm ) quick @-@ firing ( QF ) guns mounted in casemates . Lighter guns consisted of eight 47 @-@ millimetre ( 1 @.@ 9 in ) three @-@ pounder Hotchkiss guns and four 47 @-@ millimetre 2 @.@ 5 @-@ pounder Hotchkiss guns . The ship was also equipped with four submerged 18 @-@ inch torpedo tubes two on each broadside .",
+            " The ship 's secondary armament consisted of fourteen 45 @-@ calibre 6 @-@ inch ( 152 mm ) quick @-@ firing ( QF ) guns mounted in casemates . Lighter guns consisted of eight 47 @-@ millimetre ( 1 @.@ 9 in ) three @-@ pounder Hotchkiss guns and four 47 @-@ millimetre 2 @.@ 5 @-@ pounder Hotchkiss guns . The ship was also equipped with four submerged 18 @-@ inch torpedo tubes, two on each broadside ."]
+    ]
+)
+def test_main_model_fix_commas_fixes_correct_commas(comma_fixer, test_input, expected):
+    result = comma_fixer.fix_commas(s=test_input)
+    assert result == expected
 @pytest.mark.parametrize(
          'nonetheless or we will fail this test.',
          ' This is a sentence. With a lot of useless punctuation!!??. O.o However, we have to insert commas O-O '
          'nonetheless, or we will fail this test.'],
+        [
+            " The ship 's secondary armament consisted of fourteen 45 @-@ calibre 6 @-@ inch ( 152 mm ) quick @-@ firing ( QF ) guns mounted in casemates . Lighter guns consisted of eight 47 @-@ millimetre ( 1 @.@ 9 in ) three @-@ pounder Hotchkiss guns and four 47 @-@ millimetre 2 @.@ 5 @-@ pounder Hotchkiss guns . The ship was also equipped with four submerged 18 @-@ inch torpedo tubes two on each broadside .",
+            " The ship 's secondary armament consisted of fourteen 45 @-@ calibre 6 @-@ inch ( 152 mm ) quick @-@ firing ( QF ) guns mounted in casemates . Lighter guns consisted of eight 47 @-@ millimetre ( 1 @.@ 9 in ), three @-@ pounder Hotchkiss guns and four 47 @-@ millimetre 2 @.@ 5 @-@ pounder Hotchkiss guns . The ship was also equipped with four submerged 18 @-@ inch torpedo tubes, two on each broadside ."]
     ]
 )
+def test_baseline_fix_commas_fixes_incorrect_commas(baseline_fixer, test_input, expected):
     result = baseline_fixer.fix_commas(s=test_input)
     assert result == expected

tests/test_integration.py CHANGED Viewed

@@ -4,50 +4,51 @@ import pytest
 from app import app
-@pytest.fixture()
-def client():
-    yield TestClient(app)
-def test_fix_commas_fails_on_no_parameter(client):
-    response = client.post('/baseline/fix-commas/')
-    assert response.status_code == 422
-def test_fix_commas_fails_on_wrong_parameters(client):
-    response = client.post('/baseline/fix-commas/', json={'text': "Some text."})
-    assert response.status_code == 400
 @pytest.mark.parametrize(
-    "test_input",
-    ['',
-     'Hello world.',
-     'This test string should not have any commas inside it.']
 )
-def test_fix_commas_correct_string_unchanged(client, test_input: str):
-    response = client.post('/baseline/fix-commas/', json={'s': test_input})
-    assert response.status_code == 200
-    assert response.json().get('s') == test_input
-@pytest.mark.parametrize(
-    "test_input, expected",
-    [['I am, here.', 'I am here.'],
-     ['books pens and pencils',
-      'books, pens and pencils']]
-)
-def test_fix_commas_fixes_wrong_commas(client, test_input: str, expected: str):
-    response = client.post('/baseline/fix-commas/', json={'s': test_input})
-    assert response.status_code == 200
-    assert response.json().get('s') == expected
-def test_with_a_very_long_string(client):
-    s = "Just a long string. " * 1000
-    response = client.post('/baseline/fix-commas/', json={'s': s})
-    assert response.status_code == 200
-    assert response.json().get('s') == s

 from app import app
 @pytest.mark.parametrize(
+    "endpoint",
+    ['/fix-commas/',
+     '/baseline/fix-commas/']
 )
+class TestFixCommaApi:
+    @pytest.fixture()
+    def client(self):
+        yield TestClient(app)
+    def test_fix_commas_fails_on_no_parameter(self, client, endpoint):
+        response = client.post(endpoint)
+        assert response.status_code == 422
+    def test_fix_commas_fails_on_wrong_parameters(self, client, endpoint):
+        response = client.post(endpoint, json={'text': "Some text."})
+        assert response.status_code == 400
+    @pytest.mark.parametrize(
+        "test_input",
+        ['',
+         'Hello world.',
+         'This test string should not have any commas inside it.']
+    )
+    def test_fix_commas_correct_string_unchanged(self, client, endpoint, test_input: str):
+        response = client.post(endpoint, json={'s': test_input})
+        assert response.status_code == 200
+        assert response.json().get('s') == test_input
+    @pytest.mark.parametrize(
+        "test_input, expected",
+        [['I am, here.', 'I am here.'],
+         ['books pens and pencils',
+          'books, pens and pencils']]
+    )
+    def test_fix_commas_fixes_wrong_commas(self, client, endpoint, test_input: str, expected: str):
+        response = client.post(endpoint, json={'s': test_input})
+        assert response.status_code == 200
+        assert response.json().get('s') == expected
+    def test_with_a_very_long_string(self, endpoint, client):
+        s = ("Just a long string. " * 200).rstrip()
+        response = client.post(endpoint, json={'s': s})
+        assert response.status_code == 200
+        assert response.json().get('s') == s