Spaces:
Build error
Build error
Reshinth Adithyan
committed on
Commit
•
d1b0126
1
Parent(s):
9c88e2b
Add local dedup version
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +5 -2
- local_dedup/AI4Code/dataset.arrow +3 -0
- local_dedup/AI4Code/dataset_info.json +3 -0
- local_dedup/AI4Code/state.json +3 -0
- local_dedup/AMPS/dataset.arrow +3 -0
- local_dedup/AMPS/dataset_info.json +3 -0
- local_dedup/AMPS/state.json +3 -0
- local_dedup/ASFPublicMail_ver2/dataset.arrow +3 -0
- local_dedup/ASFPublicMail_ver2/dataset_info.json +3 -0
- local_dedup/ASFPublicMail_ver2/state.json +3 -0
- local_dedup/CodePilePosts/dataset.arrow +3 -0
- local_dedup/CodePilePosts/dataset_info.json +3 -0
- local_dedup/CodePilePosts/state.json +3 -0
- local_dedup/CodePileReddit2019_ver2/dataset.arrow +3 -0
- local_dedup/CodePileReddit2019_ver2/dataset_info.json +3 -0
- local_dedup/CodePileReddit2019_ver2/state.json +3 -0
- local_dedup/CodePileReddit2020_ver2/dataset.arrow +3 -0
- local_dedup/CodePileReddit2020_ver2/dataset_info.json +3 -0
- local_dedup/CodePileReddit2020_ver2/state.json +3 -0
- local_dedup/CodePileReddit2021_ver2/dataset.arrow +3 -0
- local_dedup/CodePileReddit2021_ver2/dataset_info.json +3 -0
- local_dedup/CodePileReddit2021_ver2/state.json +3 -0
- local_dedup/CodePileReddit2022_ver2/dataset.arrow +3 -0
- local_dedup/CodePileReddit2022_ver2/dataset_info.json +3 -0
- local_dedup/CodePileReddit2022_ver2/state.json +3 -0
- local_dedup/DMMath/dataset.arrow +3 -0
- local_dedup/DMMath/dataset_info.json +3 -0
- local_dedup/DMMath/state.json +3 -0
- local_dedup/DevDocs/dataset.arrow +3 -0
- local_dedup/DevDocs/dataset_info.json +3 -0
- local_dedup/DevDocs/state.json +3 -0
- local_dedup/Enwiki_ver2/dataset.arrow +3 -0
- local_dedup/Enwiki_ver2/dataset_info.json +3 -0
- local_dedup/Enwiki_ver2/state.json +3 -0
- local_dedup/EuroParliamentProceedings_ver2/dataset.arrow +3 -0
- local_dedup/EuroParliamentProceedings_ver2/dataset_info.json +3 -0
- local_dedup/EuroParliamentProceedings_ver2/state.json +3 -0
- local_dedup/FreeLaw_Options_ver2/dataset.arrow +3 -0
- local_dedup/FreeLaw_Options_ver2/dataset_info.json +3 -0
- local_dedup/FreeLaw_Options_ver2/state.json +3 -0
- local_dedup/GithubDiff_ver2/dataset.arrow +3 -0
- local_dedup/GithubDiff_ver2/dataset_info.json +3 -0
- local_dedup/GithubDiff_ver2/state.json +3 -0
- local_dedup/Gutenberg_ver2/dataset.arrow +3 -0
- local_dedup/Gutenberg_ver2/dataset_info.json +3 -0
- local_dedup/Gutenberg_ver2/state.json +3 -0
- local_dedup/OtherWiki/dataset.arrow +3 -0
- local_dedup/OtherWiki/dataset_info.json +3 -0
- local_dedup/OtherWiki/state.json +3 -0
- local_dedup/PileOfLaw_ver2/dataset.arrow +3 -0
app.py
CHANGED
@@ -6,8 +6,11 @@ from transformers import AutoTokenizer
|
|
6 |
import ast
|
7 |
import re
|
8 |
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
11 |
contribution_json = "contributors.json"
|
12 |
|
13 |
contribution_dict = json.load(open(contribution_json,"r"))
|
|
|
6 |
import ast
|
7 |
import re
|
8 |
|
9 |
+
version = st.sidebar.selectbox("Choose a version", ["init","local_dedup"])
|
10 |
+
if version == "init":
|
11 |
+
CACHE_DIR = "cache_ds/" #Use this to build the dataset
|
12 |
+
else:
|
13 |
+
CACHE_DIR = "local_dedup/"
|
14 |
contribution_json = "contributors.json"
|
15 |
|
16 |
contribution_dict = json.load(open(contribution_json,"r"))
|
local_dedup/AI4Code/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10a7f6b87c470ba3134fe3723ff180882d065f609db88fbbe142b4b4620bb580
|
3 |
+
size 12556632
|
local_dedup/AI4Code/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f2b26217928d6bea283689fb6af4808a924e8e571e25bdd8b3782da9d6a695d
|
3 |
+
size 957
|
local_dedup/AI4Code/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e405f730336401cff3874ed90b3d097ae7a7134a46ba077d3d363277df080b66
|
3 |
+
size 256
|
local_dedup/AMPS/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:053b86f94cbdaa8c553951075aa9f6f3360ec8334fbd94736d59c7d3585e2251
|
3 |
+
size 441608
|
local_dedup/AMPS/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f95d82428c02ef084b39df6bbae3dd733fbd7e2d6336d8b2d482830223047183
|
3 |
+
size 944
|
local_dedup/AMPS/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7e9ab0699d0c17d8768b26fa63fd22aaff608ba9d9c238202e782473a4ea885d
|
3 |
+
size 256
|
local_dedup/ASFPublicMail_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:990bbad25277a351bc22f3c26121b02d698e6e002fe9b024be09c7cddefe7e70
|
3 |
+
size 131782480
|
local_dedup/ASFPublicMail_ver2/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16063393c7da47937ce963b3f3e3c1ee8970ae45d22f881b938f7d567adcaf3d
|
3 |
+
size 1259
|
local_dedup/ASFPublicMail_ver2/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09d16989887376d31bea4fac729c95e170ee1f80d02501b976a9bbb3ceb871fc
|
3 |
+
size 256
|
local_dedup/CodePilePosts/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:79420ef99d82108a769a6f048a1bb9b405f98ccace89da8013c6ed6d033e5b1c
|
3 |
+
size 2131816
|
local_dedup/CodePilePosts/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:92391f1839e371c569d8ccd3c691ddf5796870f6063a316baf2b62c92aabe313
|
3 |
+
size 958
|
local_dedup/CodePilePosts/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6b120b2e69adbc98ea0700dfa28a816f87ecf57577de8b89df3c67fe92308bea
|
3 |
+
size 256
|
local_dedup/CodePileReddit2019_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:72c37182256491228a36335c885235a32e1c71ee2ed52f739e2c3263853bed32
|
3 |
+
size 1783584
|
local_dedup/CodePileReddit2019_ver2/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5be965a30f154d072da3cad24b91ed90860c36fe043c4846ce3c6769ca9e0ccf
|
3 |
+
size 2763
|
local_dedup/CodePileReddit2019_ver2/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:80d4db67ccd3f8f9d9bc342cfe49cf8067b0ae3939c243d11acd85dbeb040c40
|
3 |
+
size 256
|
local_dedup/CodePileReddit2020_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:629cfc15f65cc75f7c1529ce7b5e02ce3ac6b4250fd45a691335bd0dfbec1718
|
3 |
+
size 2690040
|
local_dedup/CodePileReddit2020_ver2/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69329b7f07b6b6bb5c700170cac90f4078ada7db41b00aa3d4293471d2feb94e
|
3 |
+
size 1253
|
local_dedup/CodePileReddit2020_ver2/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dbbf5c5b0a93102657732c1f013ca5957efe85968b8eb68793c430d6cbf95b72
|
3 |
+
size 256
|
local_dedup/CodePileReddit2021_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8544962a7e998f20f9e543e3166145758d5a5cc6cca0555ef8ed8728897d07e0
|
3 |
+
size 2900344
|
local_dedup/CodePileReddit2021_ver2/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7e0699a41b3de07ae98d6019e33663d8a4d0765989c1b57b08cca9a0056693fa
|
3 |
+
size 1253
|
local_dedup/CodePileReddit2021_ver2/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:058c28a4130873b232bd1c60ac96f08f0a0693ca454ab317963f9d4234ee2997
|
3 |
+
size 256
|
local_dedup/CodePileReddit2022_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae8987c49610915aa4efae028f8e6504f4d16026dfdcadfa813ec3c0ccc8f955
|
3 |
+
size 2762800
|
local_dedup/CodePileReddit2022_ver2/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d60c704f09a60f403589a96e2216516ebf73380269ec409f65172fdcfdf704d2
|
3 |
+
size 1253
|
local_dedup/CodePileReddit2022_ver2/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dfb705f514f7d4d454f7ca4042d79bdc504153f7c8bab2ccb993ceb70cd14d0e
|
3 |
+
size 256
|
local_dedup/DMMath/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:541363919fbedc9f56a9770d0f2b36bd1a0dfe2c7e29221f120a286fce8a8261
|
3 |
+
size 8564624
|
local_dedup/DMMath/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0136da611caf524a63c1963fd904d7a5d29ffd277d8622d002d86cdcc613cb0e
|
3 |
+
size 952
|
local_dedup/DMMath/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7488e34da2c713600e94665e3e800d610129b8552fb88e7b53be0beea09c40c1
|
3 |
+
size 256
|
local_dedup/DevDocs/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d397e10d33324f31aa5604e6ac2f1359084eb840369c6846d6ba23be744eb775
|
3 |
+
size 4400152
|
local_dedup/DevDocs/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1aa7112599732b6a2d018c3514d7d64749948b4bf5e224a49b8100e9a5823b35
|
3 |
+
size 952
|
local_dedup/DevDocs/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f5e46075c479c18ccb888cae48c28d7a2973a764591ef73dfe0155c661cb151a
|
3 |
+
size 256
|
local_dedup/Enwiki_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c4dde363d3088a5ac711d23de933ffb08d43b6871c890c6a31b285689a6ebf4d
|
3 |
+
size 4899320
|
local_dedup/Enwiki_ver2/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6bd184f28e4452acedda0d110c835b58efaf363901e436d93afc11f2d730ea3c
|
3 |
+
size 1241
|
local_dedup/Enwiki_ver2/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6c8b509410155d7b015758265ed9a781a3580b4b1eea31f4c5e63cc03d421799
|
3 |
+
size 256
|
local_dedup/EuroParliamentProceedings_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e862965d1847aba701bc7f65ca3c44ddf50f86e357a89bc75cb0a67add2d3986
|
3 |
+
size 72240848
|
local_dedup/EuroParliamentProceedings_ver2/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:03d3d99fb72a57c05e775f26dacecc1459b6d3262a8d34a48b4538310392da2a
|
3 |
+
size 1267
|
local_dedup/EuroParliamentProceedings_ver2/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13c4046eb4c5ddae956c4ec64fff0b850068b7ed7e0429d54a2649f91f815723
|
3 |
+
size 256
|
local_dedup/FreeLaw_Options_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2feb33332bc3538125ca9b53e70a92ce730391d2da53c393d188b9537e517f0f
|
3 |
+
size 14057864
|
local_dedup/FreeLaw_Options_ver2/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2a43628cbced75c4b249dea6088d4e1704afb6dfcc8a6058ea34fedf12fde993
|
3 |
+
size 1256
|
local_dedup/FreeLaw_Options_ver2/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0fd863b68357df401a6530e805ec0418af425aa41815b5a7b6d6b9267c0ce041
|
3 |
+
size 256
|
local_dedup/GithubDiff_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a391ce41c165462c9623d797c7f3a7aa2b94c7a759b1a34888c725a037e2fb4c
|
3 |
+
size 7660264
|
local_dedup/GithubDiff_ver2/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d60979cf00fcfade9d2191d456431ede2161d1687f62d4ffa93f5d3dbf77cb68
|
3 |
+
size 961
|
local_dedup/GithubDiff_ver2/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a3c1e14e8ca0bad56c53a5e1495ef59235d5268850eb4dbc7c143c04fbf45024
|
3 |
+
size 256
|
local_dedup/Gutenberg_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d286efd3d65355727bcacbbfb0972292b9c41928b3ba48e82d134f14e2040e6d
|
3 |
+
size 369487992
|
local_dedup/Gutenberg_ver2/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a540771ad266f40ad4b1174db5a49b64ddb51442318c1621c00d83b16cc78616
|
3 |
+
size 1255
|
local_dedup/Gutenberg_ver2/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9c2a0a3d1ffa309aee2d1eaff58a7cc3e6d9be9547171fe052224b84bae508fe
|
3 |
+
size 256
|
local_dedup/OtherWiki/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:338dd162af98018824e2fe3b4c49d72ea09fba5526d631fc9c720b7a9d79f9b8
|
3 |
+
size 8538312
|
local_dedup/OtherWiki/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:293e2e953e71bdd075e76a301ea735c6756dc3f6dac2f3ad3f9e34ae52e0d070
|
3 |
+
size 955
|
local_dedup/OtherWiki/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2a1288154c3bba167b777854f90edc1fa0920790064186bd35572c53e7b91fe3
|
3 |
+
size 256
|
local_dedup/PileOfLaw_ver2/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d12743c9c222679cb29dfef0e4d366f0e7485d609a06e4537e346908e16d498c
|
3 |
+
size 17384928
|