|
from unittest import TestCase |
|
|
|
from datasets import Dataset |
|
from minhash_deduplication import deduplicate_dataset, make_duplicate_clusters |
|
|
|
|
|
def get_dataset(): |
|
data_dict = { |
|
"repo_name": ["test_repo1", "test_repo2", "test_repo3"], |
|
"path": ["test_1.py", "test_2.py", "unit_test.py"], |
|
"content": ["a " * 20, "a " * 30, "b " * 7], |
|
} |
|
dataset = Dataset.from_dict(data_dict) |
|
return dataset |
|
|
|
|
|
class MakeDuplicateClustersTest(TestCase): |
|
def test_make_duplicate_clusters(self): |
|
ds = get_dataset() |
|
duplicate_clusters = make_duplicate_clusters(ds, 0.85) |
|
self.assertEqual(len(duplicate_clusters[0]), 2) |
|
|
|
def test_deduplicate_dataset(self): |
|
ds = get_dataset() |
|
ds_filter, duplicate_clusters = deduplicate_dataset(ds) |
|
self.assertEqual(len(ds_filter), 2) |
|
print(duplicate_clusters) |
|
self.assertEqual(duplicate_clusters[0][0]["copies"], 2) |
|
self.assertEqual(duplicate_clusters[0][0]["is_extreme"], True) |
|
|