diff --git a/app.py b/app.py index decd997a44c0ba1849df850913d4678d7e7be77d..b25d2225bca7d3cc8186b3dea4cc5b296bf548e2 100644 --- a/app.py +++ b/app.py @@ -6,8 +6,11 @@ from transformers import AutoTokenizer import ast import re - -CACHE_DIR = "cache_ds/" #Use this to build the dataset +version = st.sidebar.selectbox("Choose a version", ["init","local_dedup"]) +if version == "init": + CACHE_DIR = "cache_ds/" #Use this to build the dataset +else: + CACHE_DIR = "local_dedup/" contribution_json = "contributors.json" contribution_dict = json.load(open(contribution_json,"r")) diff --git a/local_dedup/AI4Code/dataset.arrow b/local_dedup/AI4Code/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c94914fb56b5235a194ae018f1332c52ad8ef8eb --- /dev/null +++ b/local_dedup/AI4Code/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10a7f6b87c470ba3134fe3723ff180882d065f609db88fbbe142b4b4620bb580 +size 12556632 diff --git a/local_dedup/AI4Code/dataset_info.json b/local_dedup/AI4Code/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..29dc12aeb66cc6df2ec278d3daf82cbe46ae2672 --- /dev/null +++ b/local_dedup/AI4Code/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f2b26217928d6bea283689fb6af4808a924e8e571e25bdd8b3782da9d6a695d +size 957 diff --git a/local_dedup/AI4Code/state.json b/local_dedup/AI4Code/state.json new file mode 100644 index 0000000000000000000000000000000000000000..5fe085b2d670a75628142dea244bef5dc2f55fc0 --- /dev/null +++ b/local_dedup/AI4Code/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e405f730336401cff3874ed90b3d097ae7a7134a46ba077d3d363277df080b66 +size 256 diff --git a/local_dedup/AMPS/dataset.arrow b/local_dedup/AMPS/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e2d91fb19c8b17901676dac4c939cdb0eab97cbb --- /dev/null +++ b/local_dedup/AMPS/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:053b86f94cbdaa8c553951075aa9f6f3360ec8334fbd94736d59c7d3585e2251 +size 441608 diff --git a/local_dedup/AMPS/dataset_info.json b/local_dedup/AMPS/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c4cc3101443cc45d9a7582dbaf485f51bf58b6d2 --- /dev/null +++ b/local_dedup/AMPS/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f95d82428c02ef084b39df6bbae3dd733fbd7e2d6336d8b2d482830223047183 +size 944 diff --git a/local_dedup/AMPS/state.json b/local_dedup/AMPS/state.json new file mode 100644 index 0000000000000000000000000000000000000000..89dee69cc21407ffa1d9f063c79a7636e248bac7 --- /dev/null +++ b/local_dedup/AMPS/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e9ab0699d0c17d8768b26fa63fd22aaff608ba9d9c238202e782473a4ea885d +size 256 diff --git a/local_dedup/ASFPublicMail_ver2/dataset.arrow b/local_dedup/ASFPublicMail_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..de75f6993a012f451197466846710182d89d1dea --- /dev/null +++ b/local_dedup/ASFPublicMail_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990bbad25277a351bc22f3c26121b02d698e6e002fe9b024be09c7cddefe7e70 +size 131782480 diff --git a/local_dedup/ASFPublicMail_ver2/dataset_info.json b/local_dedup/ASFPublicMail_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..fa0132442caa26f97e267af6d02e1aaeb529cc5c --- /dev/null +++ b/local_dedup/ASFPublicMail_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16063393c7da47937ce963b3f3e3c1ee8970ae45d22f881b938f7d567adcaf3d +size 1259 diff --git a/local_dedup/ASFPublicMail_ver2/state.json b/local_dedup/ASFPublicMail_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..f2a2c70bfcbdbacbb19c9a64974a9d2c1d12faa4 --- /dev/null +++ b/local_dedup/ASFPublicMail_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09d16989887376d31bea4fac729c95e170ee1f80d02501b976a9bbb3ceb871fc +size 256 diff --git a/local_dedup/CodePilePosts/dataset.arrow b/local_dedup/CodePilePosts/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e2db6c1e22631f880287d386a76be86d01fb3995 --- /dev/null +++ b/local_dedup/CodePilePosts/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79420ef99d82108a769a6f048a1bb9b405f98ccace89da8013c6ed6d033e5b1c +size 2131816 diff --git a/local_dedup/CodePilePosts/dataset_info.json b/local_dedup/CodePilePosts/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..ac7afe3164d70a45b68e96f806584cf6089f5397 --- /dev/null +++ b/local_dedup/CodePilePosts/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92391f1839e371c569d8ccd3c691ddf5796870f6063a316baf2b62c92aabe313 +size 958 diff --git a/local_dedup/CodePilePosts/state.json b/local_dedup/CodePilePosts/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ee512cee0d02be44d93beb75dc97b014af5a22ea --- /dev/null +++ b/local_dedup/CodePilePosts/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b120b2e69adbc98ea0700dfa28a816f87ecf57577de8b89df3c67fe92308bea +size 256 diff --git a/local_dedup/CodePileReddit2019_ver2/dataset.arrow b/local_dedup/CodePileReddit2019_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..532e25962fd4ca71c35b371db9bcafda81dc2b07 --- /dev/null +++ b/local_dedup/CodePileReddit2019_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72c37182256491228a36335c885235a32e1c71ee2ed52f739e2c3263853bed32 +size 1783584 diff --git a/local_dedup/CodePileReddit2019_ver2/dataset_info.json b/local_dedup/CodePileReddit2019_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c064b363d28441880e836b217072671146b379e7 --- /dev/null +++ b/local_dedup/CodePileReddit2019_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be965a30f154d072da3cad24b91ed90860c36fe043c4846ce3c6769ca9e0ccf +size 2763 diff --git a/local_dedup/CodePileReddit2019_ver2/state.json b/local_dedup/CodePileReddit2019_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..751d9a6130eabfcb1b19eb3711555ab85f5e5d07 --- /dev/null +++ b/local_dedup/CodePileReddit2019_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d4db67ccd3f8f9d9bc342cfe49cf8067b0ae3939c243d11acd85dbeb040c40 +size 256 diff --git a/local_dedup/CodePileReddit2020_ver2/dataset.arrow b/local_dedup/CodePileReddit2020_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..1d80e38429d4b6421ecc4ea5c19cc6968fef04e8 --- /dev/null +++ b/local_dedup/CodePileReddit2020_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:629cfc15f65cc75f7c1529ce7b5e02ce3ac6b4250fd45a691335bd0dfbec1718 +size 2690040 diff --git a/local_dedup/CodePileReddit2020_ver2/dataset_info.json b/local_dedup/CodePileReddit2020_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..22328087c529ec13a39471f35be08e1226c36f7d --- /dev/null +++ b/local_dedup/CodePileReddit2020_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69329b7f07b6b6bb5c700170cac90f4078ada7db41b00aa3d4293471d2feb94e +size 1253 diff --git a/local_dedup/CodePileReddit2020_ver2/state.json b/local_dedup/CodePileReddit2020_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..2889d8b88b95e909f5e0c7ae581540b50e240dfb --- /dev/null +++ b/local_dedup/CodePileReddit2020_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbbf5c5b0a93102657732c1f013ca5957efe85968b8eb68793c430d6cbf95b72 +size 256 diff --git a/local_dedup/CodePileReddit2021_ver2/dataset.arrow b/local_dedup/CodePileReddit2021_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..461ed6bb5eaf14ab38194df435b36289e6654180 --- /dev/null +++ b/local_dedup/CodePileReddit2021_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8544962a7e998f20f9e543e3166145758d5a5cc6cca0555ef8ed8728897d07e0 +size 2900344 diff --git a/local_dedup/CodePileReddit2021_ver2/dataset_info.json b/local_dedup/CodePileReddit2021_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..6510a3b284098fda1186a9060c9ad6e26f75599a --- /dev/null +++ b/local_dedup/CodePileReddit2021_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e0699a41b3de07ae98d6019e33663d8a4d0765989c1b57b08cca9a0056693fa +size 1253 diff --git a/local_dedup/CodePileReddit2021_ver2/state.json b/local_dedup/CodePileReddit2021_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..4b5f34b7e520a393d0daa32a246b3dd6d2d73556 --- /dev/null +++ b/local_dedup/CodePileReddit2021_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:058c28a4130873b232bd1c60ac96f08f0a0693ca454ab317963f9d4234ee2997 +size 256 diff --git a/local_dedup/CodePileReddit2022_ver2/dataset.arrow b/local_dedup/CodePileReddit2022_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..394b123a7c05960de2b5a7e4be316e0a5580f2dc --- /dev/null +++ b/local_dedup/CodePileReddit2022_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae8987c49610915aa4efae028f8e6504f4d16026dfdcadfa813ec3c0ccc8f955 +size 2762800 diff --git a/local_dedup/CodePileReddit2022_ver2/dataset_info.json b/local_dedup/CodePileReddit2022_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..fe4a470189398d5ed646bed649d60b54c41a8461 --- /dev/null +++ b/local_dedup/CodePileReddit2022_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d60c704f09a60f403589a96e2216516ebf73380269ec409f65172fdcfdf704d2 +size 1253 diff --git a/local_dedup/CodePileReddit2022_ver2/state.json b/local_dedup/CodePileReddit2022_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..a6f619f604af385dbb3c37ae2a42453a4864f2cd --- /dev/null +++ b/local_dedup/CodePileReddit2022_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfb705f514f7d4d454f7ca4042d79bdc504153f7c8bab2ccb993ceb70cd14d0e +size 256 diff --git a/local_dedup/DMMath/dataset.arrow b/local_dedup/DMMath/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..a74a1127caea0de8b893a4483e43bee00bf0a224 --- /dev/null +++ b/local_dedup/DMMath/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:541363919fbedc9f56a9770d0f2b36bd1a0dfe2c7e29221f120a286fce8a8261 +size 8564624 diff --git a/local_dedup/DMMath/dataset_info.json b/local_dedup/DMMath/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..2bfd2720972cbde122c30ba48430d313fffa9233 --- /dev/null +++ b/local_dedup/DMMath/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0136da611caf524a63c1963fd904d7a5d29ffd277d8622d002d86cdcc613cb0e +size 952 diff --git a/local_dedup/DMMath/state.json b/local_dedup/DMMath/state.json new file mode 100644 index 0000000000000000000000000000000000000000..2d718f712a679a4e14ec4a47b49c14c76d0ed416 --- /dev/null +++ b/local_dedup/DMMath/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7488e34da2c713600e94665e3e800d610129b8552fb88e7b53be0beea09c40c1 +size 256 diff --git a/local_dedup/DevDocs/dataset.arrow b/local_dedup/DevDocs/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..29fc39a83947a695c97a2e3231dad7aedcb1a818 --- /dev/null +++ b/local_dedup/DevDocs/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d397e10d33324f31aa5604e6ac2f1359084eb840369c6846d6ba23be744eb775 +size 4400152 diff --git a/local_dedup/DevDocs/dataset_info.json b/local_dedup/DevDocs/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..e171c017ed8e71541fda500946246d2fa006ba96 --- /dev/null +++ b/local_dedup/DevDocs/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aa7112599732b6a2d018c3514d7d64749948b4bf5e224a49b8100e9a5823b35 +size 952 diff --git a/local_dedup/DevDocs/state.json b/local_dedup/DevDocs/state.json new file mode 100644 index 0000000000000000000000000000000000000000..c0e376e96dcd0316582815374b3a065c4b123e9e --- /dev/null +++ b/local_dedup/DevDocs/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5e46075c479c18ccb888cae48c28d7a2973a764591ef73dfe0155c661cb151a +size 256 diff --git a/local_dedup/Enwiki_ver2/dataset.arrow b/local_dedup/Enwiki_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ebd7fbec295ce0c9de462e4df34ca546fa4d4e7c --- /dev/null +++ b/local_dedup/Enwiki_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4dde363d3088a5ac711d23de933ffb08d43b6871c890c6a31b285689a6ebf4d +size 4899320 diff --git a/local_dedup/Enwiki_ver2/dataset_info.json b/local_dedup/Enwiki_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..0f9cb42176f7a91c1ceb837bcbba4bb21ac2e2b3 --- /dev/null +++ b/local_dedup/Enwiki_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd184f28e4452acedda0d110c835b58efaf363901e436d93afc11f2d730ea3c +size 1241 diff --git a/local_dedup/Enwiki_ver2/state.json b/local_dedup/Enwiki_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..37263a3fbef391514c5d73d802bf4250fe35bbb8 --- /dev/null +++ b/local_dedup/Enwiki_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c8b509410155d7b015758265ed9a781a3580b4b1eea31f4c5e63cc03d421799 +size 256 diff --git a/local_dedup/EuroParliamentProceedings_ver2/dataset.arrow b/local_dedup/EuroParliamentProceedings_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..8638bd12650a73a64017d0c98f475bd0747a4391 --- /dev/null +++ b/local_dedup/EuroParliamentProceedings_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e862965d1847aba701bc7f65ca3c44ddf50f86e357a89bc75cb0a67add2d3986 +size 72240848 diff --git a/local_dedup/EuroParliamentProceedings_ver2/dataset_info.json b/local_dedup/EuroParliamentProceedings_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..ac1b2cec04f87f32bb2ef1cb40699e0ac2da5887 --- /dev/null +++ b/local_dedup/EuroParliamentProceedings_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03d3d99fb72a57c05e775f26dacecc1459b6d3262a8d34a48b4538310392da2a +size 1267 diff --git a/local_dedup/EuroParliamentProceedings_ver2/state.json b/local_dedup/EuroParliamentProceedings_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d3e255060aa6a888a8497b8fd6f35ef66a963d27 --- /dev/null +++ b/local_dedup/EuroParliamentProceedings_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13c4046eb4c5ddae956c4ec64fff0b850068b7ed7e0429d54a2649f91f815723 +size 256 diff --git a/local_dedup/FreeLaw_Options_ver2/dataset.arrow b/local_dedup/FreeLaw_Options_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..de6b4a8dc0cc657f4b85ea06d0a5831c5c7ccb63 --- /dev/null +++ b/local_dedup/FreeLaw_Options_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2feb33332bc3538125ca9b53e70a92ce730391d2da53c393d188b9537e517f0f +size 14057864 diff --git a/local_dedup/FreeLaw_Options_ver2/dataset_info.json b/local_dedup/FreeLaw_Options_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..1fc7b138b3248fbbfe1140724113830a35f1faba --- /dev/null +++ b/local_dedup/FreeLaw_Options_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a43628cbced75c4b249dea6088d4e1704afb6dfcc8a6058ea34fedf12fde993 +size 1256 diff --git a/local_dedup/FreeLaw_Options_ver2/state.json b/local_dedup/FreeLaw_Options_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d695d5d84e5bd0758d859a9b0c87cb62a1b5e1a4 --- /dev/null +++ b/local_dedup/FreeLaw_Options_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fd863b68357df401a6530e805ec0418af425aa41815b5a7b6d6b9267c0ce041 +size 256 diff --git a/local_dedup/GithubDiff_ver2/dataset.arrow b/local_dedup/GithubDiff_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..f0661c8d038c072aabee6a07f085259acb2f98b0 --- /dev/null +++ b/local_dedup/GithubDiff_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a391ce41c165462c9623d797c7f3a7aa2b94c7a759b1a34888c725a037e2fb4c +size 7660264 diff --git a/local_dedup/GithubDiff_ver2/dataset_info.json b/local_dedup/GithubDiff_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..2e2c44c7a6f24276258a4e168eeaf477daad5ae9 --- /dev/null +++ b/local_dedup/GithubDiff_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d60979cf00fcfade9d2191d456431ede2161d1687f62d4ffa93f5d3dbf77cb68 +size 961 diff --git a/local_dedup/GithubDiff_ver2/state.json b/local_dedup/GithubDiff_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..60a8ede763ddf842393f34abc4885dc73f195931 --- /dev/null +++ b/local_dedup/GithubDiff_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3c1e14e8ca0bad56c53a5e1495ef59235d5268850eb4dbc7c143c04fbf45024 +size 256 diff --git a/local_dedup/Gutenberg_ver2/dataset.arrow b/local_dedup/Gutenberg_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..64349e3d3d0bceed71c5ac5770a9765f4926374b --- /dev/null +++ b/local_dedup/Gutenberg_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d286efd3d65355727bcacbbfb0972292b9c41928b3ba48e82d134f14e2040e6d +size 369487992 diff --git a/local_dedup/Gutenberg_ver2/dataset_info.json b/local_dedup/Gutenberg_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..b7305ba85bc90a8501f5382ea6561ead5b1f790c --- /dev/null +++ b/local_dedup/Gutenberg_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a540771ad266f40ad4b1174db5a49b64ddb51442318c1621c00d83b16cc78616 +size 1255 diff --git a/local_dedup/Gutenberg_ver2/state.json b/local_dedup/Gutenberg_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..6b4c6f8a9549e1c63ee61222985086441ba426e0 --- /dev/null +++ b/local_dedup/Gutenberg_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c2a0a3d1ffa309aee2d1eaff58a7cc3e6d9be9547171fe052224b84bae508fe +size 256 diff --git a/local_dedup/OtherWiki/dataset.arrow b/local_dedup/OtherWiki/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..4e8ab041cbda4661bb31da650dcedddc9ee7283e --- /dev/null +++ b/local_dedup/OtherWiki/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338dd162af98018824e2fe3b4c49d72ea09fba5526d631fc9c720b7a9d79f9b8 +size 8538312 diff --git a/local_dedup/OtherWiki/dataset_info.json b/local_dedup/OtherWiki/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..51b6cb9cd626b340d47857b1deba30c5b6bfc39b --- /dev/null +++ b/local_dedup/OtherWiki/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:293e2e953e71bdd075e76a301ea735c6756dc3f6dac2f3ad3f9e34ae52e0d070 +size 955 diff --git a/local_dedup/OtherWiki/state.json b/local_dedup/OtherWiki/state.json new file mode 100644 index 0000000000000000000000000000000000000000..688c576d5f45fdbb8a2bbfcf6a93ed7c9645ecfd --- /dev/null +++ b/local_dedup/OtherWiki/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a1288154c3bba167b777854f90edc1fa0920790064186bd35572c53e7b91fe3 +size 256 diff --git a/local_dedup/PileOfLaw_ver2/dataset.arrow b/local_dedup/PileOfLaw_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2cd4009b7b64219d4e593d4e7fe77f6fd9387cd4 --- /dev/null +++ b/local_dedup/PileOfLaw_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d12743c9c222679cb29dfef0e4d366f0e7485d609a06e4537e346908e16d498c +size 17384928 diff --git a/local_dedup/PileOfLaw_ver2/dataset_info.json b/local_dedup/PileOfLaw_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..8531cecf2ab4d88bf1fa5c51d24fafcaf5630710 --- /dev/null +++ b/local_dedup/PileOfLaw_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98ad5e5d4ba412b334e21d829cb1db01e72c813602ba2469cd2071377b12dc0b +size 1250 diff --git a/local_dedup/PileOfLaw_ver2/state.json b/local_dedup/PileOfLaw_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..8156390bec3ac5311ab8fa4fd75da71ba094fd76 --- /dev/null +++ b/local_dedup/PileOfLaw_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bfa4f9290e68501a5faa3104ad1f4e12830e3fa89e697ec4d8e6b7a92b7eeae +size 256 diff --git a/local_dedup/PubMed_ver2/dataset.arrow b/local_dedup/PubMed_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..9819b62eaa792fee7121771dde4a5a78d43246e0 --- /dev/null +++ b/local_dedup/PubMed_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eed529f88c5a1b75c52560b3517d9a839b56689f3286725af0082de1693faa7 +size 46065288 diff --git a/local_dedup/PubMed_ver2/dataset_info.json b/local_dedup/PubMed_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..13a057ae3052a918de5194922cc6e73d31ea43ea --- /dev/null +++ b/local_dedup/PubMed_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:521f6c83673dbf0f0a3d10092c5c8cc05e42e566960d46ee791252633697f6bf +size 1247 diff --git a/local_dedup/PubMed_ver2/state.json b/local_dedup/PubMed_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..8fc91e249fc0bc8b1796d461f6d1f8a59f8efa0a --- /dev/null +++ b/local_dedup/PubMed_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:256623cac70ab01f2dadafc7a067f23b96c55877fa36cf6ac1c42cdc49ef7dce +size 256 diff --git a/local_dedup/S2ORC_ver2/dataset.arrow b/local_dedup/S2ORC_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..900c97db4e5d3552bb2ca8666d9ff021bf979219 --- /dev/null +++ b/local_dedup/S2ORC_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:743be44a89da15f4d89b68e92c1a6af61e2d6a112f27c804d13c24204029a013 +size 25335584 diff --git a/local_dedup/S2ORC_ver2/dataset_info.json b/local_dedup/S2ORC_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..7dcf2189c478e04cb6dd8c64cc02ae3564d91f4f --- /dev/null +++ b/local_dedup/S2ORC_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46577981e14053923ea05858171e6105f2bfdd9b8e02d8ce32237ba11271c768 +size 1246 diff --git a/local_dedup/S2ORC_ver2/state.json b/local_dedup/S2ORC_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ef9bb29d2662bafd9059e8d667056acfd71668f0 --- /dev/null +++ b/local_dedup/S2ORC_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a7bf1fc284e91e12a95bc3fbfae3d4fa2f584647d20ddf9f86e0b5a72ba16fe +size 256 diff --git a/local_dedup/StackExchange_ver3/dataset.arrow b/local_dedup/StackExchange_ver3/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c4d327bb4ecc92899f9ce0a88ecb947d532e1fea --- /dev/null +++ b/local_dedup/StackExchange_ver3/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d3b59321445b76e7f14be3698e906b5e1f1845b793452320e3c87a84dd96cf8 +size 3985640 diff --git a/local_dedup/StackExchange_ver3/dataset_info.json b/local_dedup/StackExchange_ver3/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..829b581443828e514075832e83f9b0708bd6fbe0 --- /dev/null +++ b/local_dedup/StackExchange_ver3/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a4bb3e0606df2100f779f4d3465a9f44366e0465f4244c59932581d33edf477 +size 1249 diff --git a/local_dedup/StackExchange_ver3/state.json b/local_dedup/StackExchange_ver3/state.json new file mode 100644 index 0000000000000000000000000000000000000000..4171455b37e89da59f69ff154d58587c9cc7d34d --- /dev/null +++ b/local_dedup/StackExchange_ver3/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e524d5ba6f0aecb4d6fded16ff2074fd8752364fb2b8bc62e7e16f0c1248c07 +size 256 diff --git a/local_dedup/USPTO_ver2/dataset.arrow b/local_dedup/USPTO_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..6c2653d615d7a8a5b9095d5cc638e3bba8803ecd --- /dev/null +++ b/local_dedup/USPTO_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b2b0d4734dd47221e37cf1ec07f0e126fac8659cd91a090f2984389adbce415 +size 48349960 diff --git a/local_dedup/USPTO_ver2/dataset_info.json b/local_dedup/USPTO_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c7a1891d4b1b7a772e8507d8cf1ab3a514e90755 --- /dev/null +++ b/local_dedup/USPTO_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a54c4830ae6495ef5218ee472dbfef4c2b48cfe5b62d5f48ce044a14b3a0998 +size 1246 diff --git a/local_dedup/USPTO_ver2/state.json b/local_dedup/USPTO_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..eb059391f559888e52ae0fcfe9d44e1b235c599d --- /dev/null +++ b/local_dedup/USPTO_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a7317343166fe5bc51704a19c4c783d8cc77169b33d9136ae3a10104576526b +size 256 diff --git a/local_dedup/UbuntuIRC_ver2/dataset.arrow b/local_dedup/UbuntuIRC_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..3c9a298bb778c7257274bfceee7f90ffcbe23a3b --- /dev/null +++ b/local_dedup/UbuntuIRC_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af57e0d7784dd177a431bacc0effc128f2161d7a4297cc2ff3ec3db672170f34 +size 11262448 diff --git a/local_dedup/UbuntuIRC_ver2/dataset_info.json b/local_dedup/UbuntuIRC_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..236724632985b0e38c04b3d8b650453b6b0278c5 --- /dev/null +++ b/local_dedup/UbuntuIRC_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:846cfd4debc3a3eff59f2b2646424adad4f18e3329c1f5c1d1e7d78b4a36baa4 +size 1436 diff --git a/local_dedup/UbuntuIRC_ver2/state.json b/local_dedup/UbuntuIRC_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..0aafb0cb306b3649b153542427d30bff8dafb135 --- /dev/null +++ b/local_dedup/UbuntuIRC_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64a0c4b109968c525c6fd92c4d5c6274e8c1b18fbe5409a7c0149d3cf5c12f8e +size 256 diff --git a/local_dedup/arXiv_ver2/dataset.arrow b/local_dedup/arXiv_ver2/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..1b86f1a5c47c74dbd5f6c0ec76163ab1b8d8a2da --- /dev/null +++ b/local_dedup/arXiv_ver2/dataset.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4033765f60a05069a895929836161ee7512b689d51891a0b3228dab621d3a9ee +size 50128552 diff --git a/local_dedup/arXiv_ver2/dataset_info.json b/local_dedup/arXiv_ver2/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..76ceeda8aee30781b074d9f41630af94e4762400 --- /dev/null +++ b/local_dedup/arXiv_ver2/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c15da1446b29b726e1adf07e1f0a30a869d2be04336b7e31ad0f261d29697485 +size 1247 diff --git a/local_dedup/arXiv_ver2/state.json b/local_dedup/arXiv_ver2/state.json new file mode 100644 index 0000000000000000000000000000000000000000..672e3cce1270215dc3ace35bb85e9c224b22fc4c --- /dev/null +++ b/local_dedup/arXiv_ver2/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2ef06bb89b38ac23bb3de6d9740a39ff33f5ff10fce3636947900ed780eda92 +size 256