Adive01 committed on
Commit
9237ef1
·
verified ·
1 Parent(s): 907098b

Upload tests/test_data_cleaning.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tests/test_data_cleaning.py +24 -0
tests/test_data_cleaning.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from datasets import Dataset
3
+ from mlplo.data_cleaning import is_valid_example, deduplicate_split
4
+
5
def test_is_valid_example():
    """Exercise ``is_valid_example`` with one accepted and one rejected example.

    Both calls share the same column names and word-count thresholds; only
    the length of the document text differs between them.
    """
    thresholds = {
        "min_document_words": 10,
        "max_document_words": 100,
        "min_summary_words": 5,
    }
    summary_text = "B " * 10

    # Document made of 50 repetitions of "A ": expected to be accepted.
    long_enough = {"text": "A " * 50, "summary": summary_text}
    assert is_valid_example(long_enough, "text", "summary", **thresholds)

    # Too short document: only 5 repetitions against min_document_words=10.
    too_short = {"text": "A " * 5, "summary": summary_text}
    assert not is_valid_example(too_short, "text", "summary", **thresholds)
17
+
18
def test_deduplicate_split():
    """Exercise ``deduplicate_split`` on a four-row dataset with one repeated text.

    The ``text`` column contains "A" twice, so the test expects exactly one
    row to be reported as removed and the remaining texts to be
    ``["A", "B", "C"]``.
    """
    texts = ["A", "B", "A", "C"]
    summaries = ["1", "2", "3", "4"]
    dataset = Dataset.from_dict({"text": texts, "summary": summaries})

    # The function is unpacked as a (dataset, removed-count) pair.
    unique_rows, num_removed = deduplicate_split(dataset, "text")

    assert num_removed == 1
    assert len(unique_rows) == 3
    assert unique_rows["text"] == ["A", "B", "C"]