Reshinth Adithyan committed on
Commit
d1b0126
1 Parent(s): 9c88e2b

Add local dedup version

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +5 -2
  2. local_dedup/AI4Code/dataset.arrow +3 -0
  3. local_dedup/AI4Code/dataset_info.json +3 -0
  4. local_dedup/AI4Code/state.json +3 -0
  5. local_dedup/AMPS/dataset.arrow +3 -0
  6. local_dedup/AMPS/dataset_info.json +3 -0
  7. local_dedup/AMPS/state.json +3 -0
  8. local_dedup/ASFPublicMail_ver2/dataset.arrow +3 -0
  9. local_dedup/ASFPublicMail_ver2/dataset_info.json +3 -0
  10. local_dedup/ASFPublicMail_ver2/state.json +3 -0
  11. local_dedup/CodePilePosts/dataset.arrow +3 -0
  12. local_dedup/CodePilePosts/dataset_info.json +3 -0
  13. local_dedup/CodePilePosts/state.json +3 -0
  14. local_dedup/CodePileReddit2019_ver2/dataset.arrow +3 -0
  15. local_dedup/CodePileReddit2019_ver2/dataset_info.json +3 -0
  16. local_dedup/CodePileReddit2019_ver2/state.json +3 -0
  17. local_dedup/CodePileReddit2020_ver2/dataset.arrow +3 -0
  18. local_dedup/CodePileReddit2020_ver2/dataset_info.json +3 -0
  19. local_dedup/CodePileReddit2020_ver2/state.json +3 -0
  20. local_dedup/CodePileReddit2021_ver2/dataset.arrow +3 -0
  21. local_dedup/CodePileReddit2021_ver2/dataset_info.json +3 -0
  22. local_dedup/CodePileReddit2021_ver2/state.json +3 -0
  23. local_dedup/CodePileReddit2022_ver2/dataset.arrow +3 -0
  24. local_dedup/CodePileReddit2022_ver2/dataset_info.json +3 -0
  25. local_dedup/CodePileReddit2022_ver2/state.json +3 -0
  26. local_dedup/DMMath/dataset.arrow +3 -0
  27. local_dedup/DMMath/dataset_info.json +3 -0
  28. local_dedup/DMMath/state.json +3 -0
  29. local_dedup/DevDocs/dataset.arrow +3 -0
  30. local_dedup/DevDocs/dataset_info.json +3 -0
  31. local_dedup/DevDocs/state.json +3 -0
  32. local_dedup/Enwiki_ver2/dataset.arrow +3 -0
  33. local_dedup/Enwiki_ver2/dataset_info.json +3 -0
  34. local_dedup/Enwiki_ver2/state.json +3 -0
  35. local_dedup/EuroParliamentProceedings_ver2/dataset.arrow +3 -0
  36. local_dedup/EuroParliamentProceedings_ver2/dataset_info.json +3 -0
  37. local_dedup/EuroParliamentProceedings_ver2/state.json +3 -0
  38. local_dedup/FreeLaw_Options_ver2/dataset.arrow +3 -0
  39. local_dedup/FreeLaw_Options_ver2/dataset_info.json +3 -0
  40. local_dedup/FreeLaw_Options_ver2/state.json +3 -0
  41. local_dedup/GithubDiff_ver2/dataset.arrow +3 -0
  42. local_dedup/GithubDiff_ver2/dataset_info.json +3 -0
  43. local_dedup/GithubDiff_ver2/state.json +3 -0
  44. local_dedup/Gutenberg_ver2/dataset.arrow +3 -0
  45. local_dedup/Gutenberg_ver2/dataset_info.json +3 -0
  46. local_dedup/Gutenberg_ver2/state.json +3 -0
  47. local_dedup/OtherWiki/dataset.arrow +3 -0
  48. local_dedup/OtherWiki/dataset_info.json +3 -0
  49. local_dedup/OtherWiki/state.json +3 -0
  50. local_dedup/PileOfLaw_ver2/dataset.arrow +3 -0
app.py CHANGED
@@ -6,8 +6,11 @@ from transformers import AutoTokenizer
6
  import ast
7
  import re
8
 
9
-
10
- CACHE_DIR = "cache_ds/" #Use this to build the dataset
 
 
 
11
  contribution_json = "contributors.json"
12
 
13
  contribution_dict = json.load(open(contribution_json,"r"))
6
  import ast
7
  import re
8
 
9
+ version = st.sidebar.selectbox("Choose a version", ["init","local_dedup"])
10
+ if version == "init":
11
+     CACHE_DIR = "cache_ds/" #Use this to build the dataset
12
+ else:
13
+     CACHE_DIR = "local_dedup/"
14
  contribution_json = "contributors.json"
15
 
16
  contribution_dict = json.load(open(contribution_json,"r"))
local_dedup/AI4Code/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10a7f6b87c470ba3134fe3723ff180882d065f609db88fbbe142b4b4620bb580
3
+ size 12556632
local_dedup/AI4Code/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f2b26217928d6bea283689fb6af4808a924e8e571e25bdd8b3782da9d6a695d
3
+ size 957
local_dedup/AI4Code/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e405f730336401cff3874ed90b3d097ae7a7134a46ba077d3d363277df080b66
3
+ size 256
local_dedup/AMPS/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:053b86f94cbdaa8c553951075aa9f6f3360ec8334fbd94736d59c7d3585e2251
3
+ size 441608
local_dedup/AMPS/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f95d82428c02ef084b39df6bbae3dd733fbd7e2d6336d8b2d482830223047183
3
+ size 944
local_dedup/AMPS/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e9ab0699d0c17d8768b26fa63fd22aaff608ba9d9c238202e782473a4ea885d
3
+ size 256
local_dedup/ASFPublicMail_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:990bbad25277a351bc22f3c26121b02d698e6e002fe9b024be09c7cddefe7e70
3
+ size 131782480
local_dedup/ASFPublicMail_ver2/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16063393c7da47937ce963b3f3e3c1ee8970ae45d22f881b938f7d567adcaf3d
3
+ size 1259
local_dedup/ASFPublicMail_ver2/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09d16989887376d31bea4fac729c95e170ee1f80d02501b976a9bbb3ceb871fc
3
+ size 256
local_dedup/CodePilePosts/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79420ef99d82108a769a6f048a1bb9b405f98ccace89da8013c6ed6d033e5b1c
3
+ size 2131816
local_dedup/CodePilePosts/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92391f1839e371c569d8ccd3c691ddf5796870f6063a316baf2b62c92aabe313
3
+ size 958
local_dedup/CodePilePosts/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b120b2e69adbc98ea0700dfa28a816f87ecf57577de8b89df3c67fe92308bea
3
+ size 256
local_dedup/CodePileReddit2019_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72c37182256491228a36335c885235a32e1c71ee2ed52f739e2c3263853bed32
3
+ size 1783584
local_dedup/CodePileReddit2019_ver2/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5be965a30f154d072da3cad24b91ed90860c36fe043c4846ce3c6769ca9e0ccf
3
+ size 2763
local_dedup/CodePileReddit2019_ver2/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80d4db67ccd3f8f9d9bc342cfe49cf8067b0ae3939c243d11acd85dbeb040c40
3
+ size 256
local_dedup/CodePileReddit2020_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:629cfc15f65cc75f7c1529ce7b5e02ce3ac6b4250fd45a691335bd0dfbec1718
3
+ size 2690040
local_dedup/CodePileReddit2020_ver2/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69329b7f07b6b6bb5c700170cac90f4078ada7db41b00aa3d4293471d2feb94e
3
+ size 1253
local_dedup/CodePileReddit2020_ver2/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbbf5c5b0a93102657732c1f013ca5957efe85968b8eb68793c430d6cbf95b72
3
+ size 256
local_dedup/CodePileReddit2021_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8544962a7e998f20f9e543e3166145758d5a5cc6cca0555ef8ed8728897d07e0
3
+ size 2900344
local_dedup/CodePileReddit2021_ver2/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e0699a41b3de07ae98d6019e33663d8a4d0765989c1b57b08cca9a0056693fa
3
+ size 1253
local_dedup/CodePileReddit2021_ver2/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:058c28a4130873b232bd1c60ac96f08f0a0693ca454ab317963f9d4234ee2997
3
+ size 256
local_dedup/CodePileReddit2022_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae8987c49610915aa4efae028f8e6504f4d16026dfdcadfa813ec3c0ccc8f955
3
+ size 2762800
local_dedup/CodePileReddit2022_ver2/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60c704f09a60f403589a96e2216516ebf73380269ec409f65172fdcfdf704d2
3
+ size 1253
local_dedup/CodePileReddit2022_ver2/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfb705f514f7d4d454f7ca4042d79bdc504153f7c8bab2ccb993ceb70cd14d0e
3
+ size 256
local_dedup/DMMath/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:541363919fbedc9f56a9770d0f2b36bd1a0dfe2c7e29221f120a286fce8a8261
3
+ size 8564624
local_dedup/DMMath/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0136da611caf524a63c1963fd904d7a5d29ffd277d8622d002d86cdcc613cb0e
3
+ size 952
local_dedup/DMMath/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7488e34da2c713600e94665e3e800d610129b8552fb88e7b53be0beea09c40c1
3
+ size 256
local_dedup/DevDocs/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d397e10d33324f31aa5604e6ac2f1359084eb840369c6846d6ba23be744eb775
3
+ size 4400152
local_dedup/DevDocs/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1aa7112599732b6a2d018c3514d7d64749948b4bf5e224a49b8100e9a5823b35
3
+ size 952
local_dedup/DevDocs/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5e46075c479c18ccb888cae48c28d7a2973a764591ef73dfe0155c661cb151a
3
+ size 256
local_dedup/Enwiki_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4dde363d3088a5ac711d23de933ffb08d43b6871c890c6a31b285689a6ebf4d
3
+ size 4899320
local_dedup/Enwiki_ver2/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bd184f28e4452acedda0d110c835b58efaf363901e436d93afc11f2d730ea3c
3
+ size 1241
local_dedup/Enwiki_ver2/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c8b509410155d7b015758265ed9a781a3580b4b1eea31f4c5e63cc03d421799
3
+ size 256
local_dedup/EuroParliamentProceedings_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e862965d1847aba701bc7f65ca3c44ddf50f86e357a89bc75cb0a67add2d3986
3
+ size 72240848
local_dedup/EuroParliamentProceedings_ver2/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03d3d99fb72a57c05e775f26dacecc1459b6d3262a8d34a48b4538310392da2a
3
+ size 1267
local_dedup/EuroParliamentProceedings_ver2/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13c4046eb4c5ddae956c4ec64fff0b850068b7ed7e0429d54a2649f91f815723
3
+ size 256
local_dedup/FreeLaw_Options_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2feb33332bc3538125ca9b53e70a92ce730391d2da53c393d188b9537e517f0f
3
+ size 14057864
local_dedup/FreeLaw_Options_ver2/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a43628cbced75c4b249dea6088d4e1704afb6dfcc8a6058ea34fedf12fde993
3
+ size 1256
local_dedup/FreeLaw_Options_ver2/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fd863b68357df401a6530e805ec0418af425aa41815b5a7b6d6b9267c0ce041
3
+ size 256
local_dedup/GithubDiff_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a391ce41c165462c9623d797c7f3a7aa2b94c7a759b1a34888c725a037e2fb4c
3
+ size 7660264
local_dedup/GithubDiff_ver2/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60979cf00fcfade9d2191d456431ede2161d1687f62d4ffa93f5d3dbf77cb68
3
+ size 961
local_dedup/GithubDiff_ver2/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3c1e14e8ca0bad56c53a5e1495ef59235d5268850eb4dbc7c143c04fbf45024
3
+ size 256
local_dedup/Gutenberg_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d286efd3d65355727bcacbbfb0972292b9c41928b3ba48e82d134f14e2040e6d
3
+ size 369487992
local_dedup/Gutenberg_ver2/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a540771ad266f40ad4b1174db5a49b64ddb51442318c1621c00d83b16cc78616
3
+ size 1255
local_dedup/Gutenberg_ver2/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c2a0a3d1ffa309aee2d1eaff58a7cc3e6d9be9547171fe052224b84bae508fe
3
+ size 256
local_dedup/OtherWiki/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:338dd162af98018824e2fe3b4c49d72ea09fba5526d631fc9c720b7a9d79f9b8
3
+ size 8538312
local_dedup/OtherWiki/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:293e2e953e71bdd075e76a301ea735c6756dc3f6dac2f3ad3f9e34ae52e0d070
3
+ size 955
local_dedup/OtherWiki/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a1288154c3bba167b777854f90edc1fa0920790064186bd35572c53e7b91fe3
3
+ size 256
local_dedup/PileOfLaw_ver2/dataset.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d12743c9c222679cb29dfef0e4d366f0e7485d609a06e4537e346908e16d498c
3
+ size 17384928