Spaces:
Sleeping
Sleeping
Integrate the fine-tuned comma fixer into the app
Browse files- Dockerfile +4 -2
- app.py +2 -1
- commafixer/routers/fixer.py +25 -0
- commafixer/src/fixer.py +90 -0
- openapi.yaml +34 -0
- setup.py +2 -1
- static/index.html +34 -25
- static/script.js +10 -5
- tests/{test_baseline.py → test_comma_fixers.py} +45 -6
- tests/test_integration.py +46 -45
Dockerfile
CHANGED
@@ -10,9 +10,11 @@ COPY setup.py .
|
|
10 |
RUN pip install --upgrade pip
|
11 |
RUN pip install --no-cache-dir --upgrade .
|
12 |
|
13 |
-
|
|
|
14 |
ENV TRANSFORMERS_CACHE=/coma-fixer/.cache
|
15 |
-
RUN python commafixer/src/baseline.py
|
|
|
16 |
|
17 |
COPY . .
|
18 |
|
|
|
10 |
RUN pip install --upgrade pip
|
11 |
RUN pip install --no-cache-dir --upgrade .
|
12 |
|
13 |
+
# This pre-downloads models and tokenizers
|
14 |
+
COPY commafixer/src/ commafixer/src/
|
15 |
ENV TRANSFORMERS_CACHE=/coma-fixer/.cache
|
16 |
+
RUN python commafixer/src/baseline.py
|
17 |
+
RUN python commafixer/src/fixer.py
|
18 |
|
19 |
COPY . .
|
20 |
|
app.py
CHANGED
@@ -4,9 +4,10 @@ from fastapi import FastAPI
|
|
4 |
from fastapi.responses import FileResponse
|
5 |
from fastapi.staticfiles import StaticFiles
|
6 |
|
7 |
-
from commafixer.routers import baseline
|
8 |
|
9 |
app = FastAPI()
|
|
|
10 |
app.include_router(baseline.router, prefix='/baseline')
|
11 |
|
12 |
# Without the realpath hack tests fail
|
|
|
4 |
from fastapi.responses import FileResponse
|
5 |
from fastapi.staticfiles import StaticFiles
|
6 |
|
7 |
+
from commafixer.routers import baseline, fixer
|
8 |
|
9 |
app = FastAPI()
|
10 |
+
app.include_router(fixer.router, prefix='/fix-commas')
|
11 |
app.include_router(baseline.router, prefix='/baseline')
|
12 |
|
13 |
# Without the realpath hack tests fail
|
commafixer/routers/fixer.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter, HTTPException
|
2 |
+
import logging
|
3 |
+
|
4 |
+
from commafixer.src.fixer import CommaFixer
|
5 |
+
|
6 |
+
|
7 |
+
# Obtain the logger through the logging hierarchy: a directly instantiated
# logging.Logger is detached from the root logger, so the basicConfig() call
# below would not apply to it and its INFO/DEBUG records would be dropped.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
|
9 |
+
|
10 |
+
router = APIRouter()
|
11 |
+
|
12 |
+
logger.info('Loading the main comma fixer model...')
|
13 |
+
router.model = CommaFixer()
|
14 |
+
|
15 |
+
|
16 |
+
@router.post('/')
async def fix_commas(data: dict):
    """
    Fix commas in the text sent in the request body using the main model.

    :param data: The parsed JSON body; the text to fix is read from its 's' field.
    :return: A dict with the fixed text under the same 's' field.
    :raises HTTPException: With status 400 when the 's' field is missing from the body.
    """
    json_field_name = 's'

    # Reject malformed requests first so the happy path reads straight through.
    if json_field_name not in data:
        error_detail = f"Text '{json_field_name}' missing"
        logger.debug(error_detail)
        raise HTTPException(status_code=400, detail=error_detail)

    logger.debug('Fixing commas.')
    fixed_text = router.model.fix_commas(data[json_field_name])
    return {json_field_name: fixed_text}
|
commafixer/src/fixer.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re

import nltk
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, NerPipeline, RobertaTokenizerFast
|
5 |
+
|
6 |
+
|
7 |
+
class CommaFixer:
    """
    A wrapper class for the fine-tuned comma fixer model.

    The model and tokenizer are loaded once in the constructor; `fix_commas` is the public entry point.
    """

    def __init__(self, device=-1):
        """
        :param device: NOTE(review): accepted but not used anywhere in this class; kept so existing callers
        passing it keep working.
        """
        self.id2label = {0: 'O', 1: 'B-COMMA'}
        self.label2id = {'O': 0, 'B-COMMA': 1}
        self.model, self.tokenizer = self._load_peft_model()

    def fix_commas(self, s: str) -> str:
        """
        The main method for fixing commas using the fine-tuned model.
        In the future we should think about batching the calls to it, for now it processes requests string by string.
        :param s: A string with commas to fix, without length restrictions.
        However, if the string is longer than the tokenizer's length limit, it is processed piece by piece and
        some whitespaces might be trimmed.
        Example: comma_fixer.fix_commas("One two thre, and four!")
        :return: A string with commas fixed, example: "One, two, thre and four!"
        """
        # Strip the existing commas (and any whitespace right before them); the model re-predicts all of them.
        s_no_commas = re.sub(r'\s*,', '', s)
        tokenized = self.tokenizer(s_no_commas, return_tensors='pt', return_offsets_mapping=True, return_length=True)

        # If text too long, split it into smaller pieces and fix commas separately.
        # TODO this is slow, we should think about joining them until length, or maybe a length limit to avoid
        #  stalling the whole service
        if tokenized['length'][0] > self.tokenizer.model_max_length:
            return self._fix_commas_piecewise(s)

        # Pure inference: no gradients are needed, so do not build the autograd graph.
        with torch.no_grad():
            logits = self.model(input_ids=tokenized['input_ids'], attention_mask=tokenized['attention_mask']).logits
        labels = [self.id2label[tag_id.item()] for tag_id in logits.argmax(dim=2).flatten()]
        return _fix_commas_based_on_labels_and_offsets(labels, s_no_commas, tokenized['offset_mapping'][0])

    def _fix_commas_piecewise(self, s: str) -> str:
        """
        Fixes commas in a text that exceeds the length limit by processing smaller pieces of it.
        :param s: The over-long text, with its original commas.
        :return: The fixed pieces joined with single spaces.
        """
        sentences = nltk.sent_tokenize(s)
        if len(sentences) > 1:
            return ' '.join(self.fix_commas(sentence) for sentence in sentences)

        # A single over-long sentence cannot be shortened by sentence splitting; recursing on it would never
        # terminate. Halve it on whitespace instead, every half has strictly fewer words.
        words = s.split()
        if len(words) < 2:
            return s  # Nothing left to split on, leave the text as it is.
        middle = len(words) // 2
        return ' '.join(self.fix_commas(' '.join(half)) for half in (words[:middle], words[middle:]))

    def _load_peft_model(self, model_name="klasocki/roberta-large-lora-ner-comma-fixer") -> tuple[
            PeftModel, RobertaTokenizerFast]:
        """
        Creates the huggingface model and tokenizer.
        Can also be used for pre-downloading the model and the tokenizer.
        :param model_name: Name of the model on the huggingface hub.
        :return: A model with the peft adapter injected and weights merged, and the tokenizer.
        """
        config = PeftConfig.from_pretrained(model_name)
        inference_model = AutoModelForTokenClassification.from_pretrained(
            config.base_model_name_or_path, num_labels=len(self.id2label), id2label=self.id2label,
            label2id=self.label2id
        )
        tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
        model = PeftModel.from_pretrained(inference_model, model_name)
        model = model.merge_and_unload()  # Join LoRA matrices with the main model for faster inference
        return model, tokenizer
|
56 |
+
|
57 |
+
|
58 |
+
def _fix_commas_based_on_labels_and_offsets(
        labels: list[str],
        original_s: str,
        offset_map: list[tuple[int, int]]
) -> str:
    """
    This function returns the original string with only commas fixed, based on the predicted labels from the main
    model and the offsets from the tokenizer.
    :param labels: Predicted labels for the tokens.
    Should already be converted to string, since we will look for B-COMMA tags.
    :param original_s: The original string, used to preserve original spacing and punctuation.
    :param offset_map: (start, end) offsets of each token in the original string. Each value only needs to be
    convertible with int(), so rows of the tokenizer's offset tensor work as well as plain tuples.
    :return: The string with commas fixed, and everything else intact.
    """
    pieces = []
    copied_up_to = 0  # Index in original_s up to which text has already been emitted.

    for label, (start, end) in zip(labels, offset_map):
        start, end = int(start), int(end)
        # Skip zero-width spans (as reported for special tokens such as <s> and </s>): they do not point at any
        # text. Also skip positions at or before the last inserted comma, so a comma is never duplicated.
        if end <= start or end <= copied_up_to:
            continue
        if _should_insert_comma(label, original_s, end):
            pieces.append(original_s[copied_up_to:end])
            pieces.append(',')
            copied_up_to = end

    pieces.append(original_s[copied_up_to:])
    return ''.join(pieces)


def _should_insert_comma(label, result, current_offset) -> bool:
    """
    Decides whether a comma belongs right after a token.
    :param label: The predicted label of the token.
    :param result: The string the comma would be inserted into.
    :param current_offset: Index in `result` right after the token.
    :return: True only for a B-COMMA token that is directly followed by whitespace.
    """
    # Only insert commas for the final token of a word, that is, if next word starts with a space.
    # A token ending at the very end of the string has no following character, so it never gets a comma.
    return label == 'B-COMMA' and current_offset < len(result) and result[current_offset].isspace()
|
87 |
+
|
88 |
+
|
89 |
+
if __name__ == "__main__":
|
90 |
+
CommaFixer() # to pre-download the model and tokenizer
|
openapi.yaml
CHANGED
@@ -6,6 +6,40 @@ info:
|
|
6 |
servers:
|
7 |
- url: 'https://localhost:5000'
|
8 |
paths:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
/baseline/fix-commas:
|
10 |
post:
|
11 |
summary: Fixes comma placement in a sentence using the baseline model
|
|
|
6 |
servers:
|
7 |
- url: 'https://localhost:5000'
|
8 |
paths:
|
9 |
+
/fix-commas:
|
10 |
+
post:
|
11 |
+
summary: Fixes comma placement in a sentence using the fine-tuned model
|
12 |
+
requestBody:
|
13 |
+
required: true
|
14 |
+
content:
|
15 |
+
application/json:
|
16 |
+
schema:
|
17 |
+
type: object
|
18 |
+
properties:
|
19 |
+
s:
|
20 |
+
type: string
|
21 |
+
example: 'This, is a sentence with wrong commas at least some.'
|
22 |
+
description: The text with commas to fix. Commas can be removed, added, reordered at will, or left
|
23 |
+
unchanged. Other punctuation, whitespaces and so on will stay intact.
|
24 |
+
responses:
|
25 |
+
200:
|
26 |
+
description: Commas fixed.
|
27 |
+
content:
|
28 |
+
application/json:
|
29 |
+
schema:
|
30 |
+
type: object
|
31 |
+
properties:
|
32 |
+
s:
|
33 |
+
type: string
|
34 |
+
example: 'This is a sentence with wrong commas, at least some.'
|
35 |
+
description: A text with commas fixed, or unchanged if not necessary. Everything other than
|
36 |
+
commas will stay as it was originally.
|
37 |
+
|
38 |
+
400:
|
39 |
+
description: A required field missing from the POST request body JSON.
|
40 |
+
|
41 |
+
# TODO remove duplication here
|
42 |
+
|
43 |
/baseline/fix-commas:
|
44 |
post:
|
45 |
summary: Fixes comma placement in a sentence using the baseline model
|
setup.py
CHANGED
@@ -11,6 +11,8 @@ setup(
|
|
11 |
install_requires=[
|
12 |
"fastapi == 0.101.1",
|
13 |
"uvicorn == 0.23.2",
|
|
|
|
|
14 |
"torch == 2.0.1",
|
15 |
"transformers == 4.31.0",
|
16 |
# for the tokenizer of the baseline model
|
@@ -22,7 +24,6 @@ setup(
|
|
22 |
'training': [
|
23 |
'datasets==2.14.4',
|
24 |
'notebook',
|
25 |
-
'peft==0.5.0',
|
26 |
'seqeval',
|
27 |
'evaluate==0.4.0'
|
28 |
],
|
|
|
11 |
install_requires=[
|
12 |
"fastapi == 0.101.1",
|
13 |
"uvicorn == 0.23.2",
|
14 |
+
"nltk == 3.8.1",
|
15 |
+
'peft==0.5.0',
|
16 |
"torch == 2.0.1",
|
17 |
"transformers == 4.31.0",
|
18 |
# for the tokenizer of the baseline model
|
|
|
24 |
'training': [
|
25 |
'datasets==2.14.4',
|
26 |
'notebook',
|
|
|
27 |
'seqeval',
|
28 |
'evaluate==0.4.0'
|
29 |
],
|
static/index.html
CHANGED
@@ -1,36 +1,45 @@
|
|
1 |
<!DOCTYPE html>
|
2 |
<html lang="en">
|
3 |
-
|
4 |
-
<meta charset="UTF-8"
|
5 |
-
<meta name="viewport" content="width=device-width, initial-scale=1.0"
|
6 |
<title>Fast API 🤗 Space served with Uvicorn</title>
|
7 |
-
<link rel="stylesheet" href="style.css"
|
8 |
<script type="module" src="script.js"></script>
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
<h2>Fixing commas using Transformers</h2>
|
14 |
<p>
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
>oliverguhr/fullstop-punctuation-multilang-large
|
21 |
-
|
22 |
</p>
|
23 |
<form class="comma-fixing-form">
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
</form>
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
</html>
|
|
|
1 |
<!DOCTYPE html>
|
2 |
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8"/>
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
|
6 |
<title>Fast API 🤗 Space served with Uvicorn</title>
|
7 |
+
<link rel="stylesheet" href="style.css"/>
|
8 |
<script type="module" src="script.js"></script>
|
9 |
+
</head>
|
10 |
+
<body>
|
11 |
+
<main>
|
12 |
+
<section id="comma-fixing">
|
13 |
<h2>Fixing commas using Transformers</h2>
|
14 |
<p>
|
15 |
+
Fine-tuned model:
|
16 |
+
<a
|
17 |
+
href="https://huggingface.co/klasocki/roberta-large-lora-ner-comma-fixer"
|
18 |
+
rel="noreferrer"
|
19 |
+
target="_blank"
|
20 |
+
>klasocki/roberta-large-lora-ner-comma-fixer
|
21 |
+
</a>
|
22 |
+
</p>
|
23 |
+
<p>
|
24 |
+
Baseline model:
|
25 |
+
<a
|
26 |
+
href="https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large"
|
27 |
+
rel="noreferrer"
|
28 |
+
target="_blank"
|
29 |
>oliverguhr/fullstop-punctuation-multilang-large
|
30 |
+
</a>
|
31 |
</p>
|
32 |
<form class="comma-fixing-form">
|
33 |
+
<label for="comma-fixing-input">Text with incorrect commas</label>
|
34 |
+
<input
|
35 |
+
id="comma-fixing-input"
|
36 |
+
type="text"
|
37 |
+
value="This is however a very bad, and terrible sentence grammatically that is."
|
38 |
+
/>
|
39 |
+
<button id="comma-fixing-submit">Submit</button>
|
40 |
+
<p class="comma-fixing-output"></p>
|
41 |
</form>
|
42 |
+
</section>
|
43 |
+
</main>
|
44 |
+
</body>
|
45 |
</html>
|
static/script.js
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
const commaFixingForm = document.querySelector(".comma-fixing-form");
|
2 |
|
3 |
const fixCommas = async (text) => {
|
4 |
-
|
5 |
method: "POST",
|
6 |
body: JSON.stringify({
|
7 |
s: text
|
@@ -9,10 +9,13 @@ const fixCommas = async (text) => {
|
|
9 |
headers: {
|
10 |
"Content-type": "application/json; charset=UTF-8"
|
11 |
}
|
12 |
-
}
|
13 |
-
const
|
|
|
|
|
|
|
14 |
|
15 |
-
return inferJson.s;
|
16 |
};
|
17 |
|
18 |
commaFixingForm.addEventListener("submit", async (event) => {
|
@@ -21,5 +24,7 @@ commaFixingForm.addEventListener("submit", async (event) => {
|
|
21 |
const commaFixingInput = document.getElementById("comma-fixing-input");
|
22 |
const commaFixingParagraph = document.querySelector(".comma-fixing-output");
|
23 |
|
24 |
-
|
|
|
|
|
25 |
});
|
|
|
1 |
const commaFixingForm = document.querySelector(".comma-fixing-form");
|
2 |
|
3 |
const fixCommas = async (text) => {
|
4 |
+
let request = {
|
5 |
method: "POST",
|
6 |
body: JSON.stringify({
|
7 |
s: text
|
|
|
9 |
headers: {
|
10 |
"Content-type": "application/json; charset=UTF-8"
|
11 |
}
|
12 |
+
};
|
13 |
+
const baselineResponse = await fetch(`baseline/fix-commas/`, request);
|
14 |
+
const fixerResponse = await fetch(`fix-commas/`, request);
|
15 |
+
const baselineJson = await baselineResponse.json();
|
16 |
+
const inferJson = await fixerResponse.json();
|
17 |
|
18 |
+
return {baseline: baselineJson.s, main: inferJson.s};
|
19 |
};
|
20 |
|
21 |
commaFixingForm.addEventListener("submit", async (event) => {
|
|
|
24 |
const commaFixingInput = document.getElementById("comma-fixing-input");
|
25 |
const commaFixingParagraph = document.querySelector(".comma-fixing-output");
|
26 |
|
27 |
+
const fixed = await fixCommas(commaFixingInput.value);
|
28 |
+
|
29 |
+
commaFixingParagraph.textContent = `Our model: ${fixed.main}\n\nBaseline model: ${fixed.baseline}`
|
30 |
});
|
tests/{test_baseline.py → test_comma_fixers.py}
RENAMED
@@ -1,12 +1,19 @@
|
|
1 |
import pytest
|
2 |
from commafixer.src.baseline import BaselineCommaFixer, _remove_punctuation
|
|
|
3 |
|
4 |
|
|
|
5 |
@pytest.fixture()
|
6 |
def baseline_fixer():
|
7 |
yield BaselineCommaFixer()
|
8 |
|
9 |
|
|
|
|
|
|
|
|
|
|
|
10 |
@pytest.mark.parametrize(
|
11 |
"test_input",
|
12 |
['',
|
@@ -14,9 +21,40 @@ def baseline_fixer():
|
|
14 |
'This test string should not have any commas inside it.',
|
15 |
'aAaalLL the.. weird?~! punctuation.should also . be kept-as is! Only fixing-commas.']
|
16 |
)
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
|
22 |
@pytest.mark.parametrize(
|
@@ -35,12 +73,13 @@ def test_fix_commas_leaves_correct_strings_unchanged(baseline_fixer, test_input)
|
|
35 |
'nonetheless or we will fail this test.',
|
36 |
' This is a sentence. With a lot of useless punctuation!!??. O.o However, we have to insert commas O-O '
|
37 |
'nonetheless, or we will fail this test.'],
|
38 |
-
[
|
39 |
-
|
|
|
40 |
|
41 |
]
|
42 |
)
|
43 |
-
def
|
44 |
result = baseline_fixer.fix_commas(s=test_input)
|
45 |
assert result == expected
|
46 |
|
|
|
1 |
import pytest
|
2 |
from commafixer.src.baseline import BaselineCommaFixer, _remove_punctuation
|
3 |
+
from commafixer.src.fixer import CommaFixer
|
4 |
|
5 |
|
6 |
+
# TODO look up best practices and duplication for tests like these
|
7 |
@pytest.fixture()
|
8 |
def baseline_fixer():
|
9 |
yield BaselineCommaFixer()
|
10 |
|
11 |
|
12 |
+
@pytest.fixture()
|
13 |
+
def comma_fixer():
|
14 |
+
yield CommaFixer()
|
15 |
+
|
16 |
+
|
17 |
@pytest.mark.parametrize(
|
18 |
"test_input",
|
19 |
['',
|
|
|
21 |
'This test string should not have any commas inside it.',
|
22 |
'aAaalLL the.. weird?~! punctuation.should also . be kept-as is! Only fixing-commas.']
|
23 |
)
|
24 |
+
class TestCorrectStringsAreUnchanged:
|
25 |
+
def test_model_fix_commas_leaves_correct_strings_unchanged(self, comma_fixer, test_input):
|
26 |
+
result = comma_fixer.fix_commas(s=test_input)
|
27 |
+
assert result == test_input
|
28 |
+
|
29 |
+
def test_baseline_fix_commas_leaves_correct_strings_unchanged(self, baseline_fixer, test_input):
|
30 |
+
result = baseline_fixer.fix_commas(s=test_input)
|
31 |
+
assert result == test_input
|
32 |
+
|
33 |
+
|
34 |
+
@pytest.mark.parametrize(
|
35 |
+
"test_input, expected",
|
36 |
+
[
|
37 |
+
['I, am.', 'I am.'],
|
38 |
+
['A complex clause however it misses a comma something else and a dot...?',
|
39 |
+
'A complex clause, however, it misses a comma, something else and a dot...?'],
|
40 |
+
['a pen an apple, \tand a pineapple!',
|
41 |
+
'a pen, an apple \tand a pineapple!'],
|
42 |
+
['Even newlines\ntabs\tand others get preserved.',
|
43 |
+
'Even newlines,\ntabs\tand others get preserved.'],
|
44 |
+
['I had no Creativity left, therefore, I come here, and write useless examples, for this test.',
|
45 |
+
'I had no Creativity left, therefore I come here and write useless examples for this test.'],
|
46 |
+
[' This is a sentence. With, a lot of, useless punctuation!!??. O.o However we have to insert commas O-O, '
|
47 |
+
'nonetheless or we will fail this test.',
|
48 |
+
' This is a sentence. With a lot of useless punctuation!!??. O.o However, we have to insert commas O-O '
|
49 |
+
'nonetheless, or we will fail this test.'],
|
50 |
+
[
|
51 |
+
" The ship 's secondary armament consisted of fourteen 45 @-@ calibre 6 @-@ inch ( 152 mm ) quick @-@ firing ( QF ) guns mounted in casemates . Lighter guns consisted of eight 47 @-@ millimetre ( 1 @.@ 9 in ) three @-@ pounder Hotchkiss guns and four 47 @-@ millimetre 2 @.@ 5 @-@ pounder Hotchkiss guns . The ship was also equipped with four submerged 18 @-@ inch torpedo tubes two on each broadside .",
|
52 |
+
" The ship 's secondary armament consisted of fourteen 45 @-@ calibre 6 @-@ inch ( 152 mm ) quick @-@ firing ( QF ) guns mounted in casemates . Lighter guns consisted of eight 47 @-@ millimetre ( 1 @.@ 9 in ) three @-@ pounder Hotchkiss guns and four 47 @-@ millimetre 2 @.@ 5 @-@ pounder Hotchkiss guns . The ship was also equipped with four submerged 18 @-@ inch torpedo tubes, two on each broadside ."]
|
53 |
+
]
|
54 |
+
)
|
55 |
+
def test_main_model_fix_commas_fixes_correct_commas(comma_fixer, test_input, expected):
|
56 |
+
result = comma_fixer.fix_commas(s=test_input)
|
57 |
+
assert result == expected
|
58 |
|
59 |
|
60 |
@pytest.mark.parametrize(
|
|
|
73 |
'nonetheless or we will fail this test.',
|
74 |
' This is a sentence. With a lot of useless punctuation!!??. O.o However, we have to insert commas O-O '
|
75 |
'nonetheless, or we will fail this test.'],
|
76 |
+
[
|
77 |
+
" The ship 's secondary armament consisted of fourteen 45 @-@ calibre 6 @-@ inch ( 152 mm ) quick @-@ firing ( QF ) guns mounted in casemates . Lighter guns consisted of eight 47 @-@ millimetre ( 1 @.@ 9 in ) three @-@ pounder Hotchkiss guns and four 47 @-@ millimetre 2 @.@ 5 @-@ pounder Hotchkiss guns . The ship was also equipped with four submerged 18 @-@ inch torpedo tubes two on each broadside .",
|
78 |
+
" The ship 's secondary armament consisted of fourteen 45 @-@ calibre 6 @-@ inch ( 152 mm ) quick @-@ firing ( QF ) guns mounted in casemates . Lighter guns consisted of eight 47 @-@ millimetre ( 1 @.@ 9 in ), three @-@ pounder Hotchkiss guns and four 47 @-@ millimetre 2 @.@ 5 @-@ pounder Hotchkiss guns . The ship was also equipped with four submerged 18 @-@ inch torpedo tubes, two on each broadside ."]
|
79 |
|
80 |
]
|
81 |
)
|
82 |
+
def test_baseline_fix_commas_fixes_incorrect_commas(baseline_fixer, test_input, expected):
|
83 |
result = baseline_fixer.fix_commas(s=test_input)
|
84 |
assert result == expected
|
85 |
|
tests/test_integration.py
CHANGED
@@ -4,50 +4,51 @@ import pytest
|
|
4 |
from app import app
|
5 |
|
6 |
|
7 |
-
@pytest.fixture()
|
8 |
-
def client():
|
9 |
-
yield TestClient(app)
|
10 |
-
|
11 |
-
|
12 |
-
def test_fix_commas_fails_on_no_parameter(client):
|
13 |
-
response = client.post('/baseline/fix-commas/')
|
14 |
-
assert response.status_code == 422
|
15 |
-
|
16 |
-
|
17 |
-
def test_fix_commas_fails_on_wrong_parameters(client):
|
18 |
-
response = client.post('/baseline/fix-commas/', json={'text': "Some text."})
|
19 |
-
assert response.status_code == 400
|
20 |
-
|
21 |
-
|
22 |
@pytest.mark.parametrize(
|
23 |
-
"
|
24 |
-
['',
|
25 |
-
'
|
26 |
-
'This test string should not have any commas inside it.']
|
27 |
)
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
from app import app
|
5 |
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
@pytest.mark.parametrize(
|
8 |
+
"endpoint",
|
9 |
+
['/fix-commas/',
|
10 |
+
'/baseline/fix-commas/']
|
|
|
11 |
)
|
12 |
+
class TestFixCommaApi:
|
13 |
+
@pytest.fixture()
|
14 |
+
def client(self):
|
15 |
+
yield TestClient(app)
|
16 |
+
|
17 |
+
def test_fix_commas_fails_on_no_parameter(self, client, endpoint):
|
18 |
+
response = client.post(endpoint)
|
19 |
+
assert response.status_code == 422
|
20 |
+
|
21 |
+
def test_fix_commas_fails_on_wrong_parameters(self, client, endpoint):
|
22 |
+
response = client.post(endpoint, json={'text': "Some text."})
|
23 |
+
assert response.status_code == 400
|
24 |
+
|
25 |
+
@pytest.mark.parametrize(
|
26 |
+
"test_input",
|
27 |
+
['',
|
28 |
+
'Hello world.',
|
29 |
+
'This test string should not have any commas inside it.']
|
30 |
+
)
|
31 |
+
def test_fix_commas_correct_string_unchanged(self, client, endpoint, test_input: str):
|
32 |
+
response = client.post(endpoint, json={'s': test_input})
|
33 |
+
|
34 |
+
assert response.status_code == 200
|
35 |
+
assert response.json().get('s') == test_input
|
36 |
+
|
37 |
+
@pytest.mark.parametrize(
|
38 |
+
"test_input, expected",
|
39 |
+
[['I am, here.', 'I am here.'],
|
40 |
+
['books pens and pencils',
|
41 |
+
'books, pens and pencils']]
|
42 |
+
)
|
43 |
+
def test_fix_commas_fixes_wrong_commas(self, client, endpoint, test_input: str, expected: str):
|
44 |
+
response = client.post(endpoint, json={'s': test_input})
|
45 |
+
|
46 |
+
assert response.status_code == 200
|
47 |
+
assert response.json().get('s') == expected
|
48 |
+
|
49 |
+
def test_with_a_very_long_string(self, endpoint, client):
|
50 |
+
s = ("Just a long string. " * 200).rstrip()
|
51 |
+
response = client.post(endpoint, json={'s': s})
|
52 |
+
|
53 |
+
assert response.status_code == 200
|
54 |
+
assert response.json().get('s') == s
|