Spaces:
Running
Running
Upload tests/test_data_cleaning.py with huggingface_hub
Browse files- tests/test_data_cleaning.py +24 -0
tests/test_data_cleaning.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from datasets import Dataset
|
| 3 |
+
from mlplo.data_cleaning import is_valid_example, deduplicate_split
|
| 4 |
+
|
| 5 |
+
def test_is_valid_example():
    """Check ``is_valid_example`` on one accepted and one rejected example."""
    # Keyword limits shared by both calls below.
    limits = {
        "min_document_words": 10,
        "max_document_words": 100,
        "min_summary_words": 5,
    }

    # Document built from 50 repetitions of "A ", summary from 10 of "B ".
    accepted = {"text": "A " * 50, "summary": "B " * 10}
    assert is_valid_example(accepted, "text", "summary", **limits)

    # Too short document
    rejected = {"text": "A " * 5, "summary": "B " * 10}
    assert not is_valid_example(rejected, "text", "summary", **limits)
|
| 17 |
+
|
| 18 |
+
def test_deduplicate_split():
    """Check that ``deduplicate_split`` removes the repeated "A" row.

    The input has four rows whose ``text`` values are A, B, A, C; the test
    expects one removal and the remaining texts in the order A, B, C.
    """
    rows = {"text": ["A", "B", "A", "C"], "summary": ["1", "2", "3", "4"]}
    dataset = Dataset.from_dict(rows)

    deduplicated, removed_count = deduplicate_split(dataset, "text")

    assert removed_count == 1
    assert len(deduplicated) == 3
    # Materialize the column as a plain list so the comparison does not
    # depend on the container type that column access returns.
    assert list(deduplicated["text"]) == ["A", "B", "C"]
|