diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..0af1c96e63bb252ae57fb10cb245a22160bb224f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +codepep/codepep.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/codepep/.gitattributes b/codepep/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..f530c0a6d77ba8dc74821e51842f23f59a219954 --- /dev/null +++ b/codepep/.gitattributes @@ -0,0 +1,56 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.lz4 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +# Audio files - uncompressed +*.pcm filter=lfs diff=lfs merge=lfs -text +*.sam filter=lfs diff=lfs merge=lfs -text +*.raw filter=lfs diff=lfs merge=lfs -text +# Audio files - compressed +*.aac filter=lfs diff=lfs merge=lfs -text +*.flac filter=lfs diff=lfs merge=lfs -text +*.mp3 filter=lfs diff=lfs merge=lfs -text +*.ogg filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +# Image files - uncompressed +*.bmp filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.tiff filter=lfs diff=lfs merge=lfs -text +# Image files - compressed +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text +codepep.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/codepep/README.md b/codepep/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82c5c5b2b139dfa5a5f38c0c1d48caf6c5644528 --- /dev/null +++ b/codepep/README.md @@ -0,0 +1,8 @@ +--- +license: apache-2.0 +--- + +- min(1.0, int(dat['copies'])/20 + dat['ratio']/10 + dat['alpha_frac']* 0.1 + 0.5*(1-dat['avg_score'])) if avg_score exists +- min(1.0, int(dat['copies'])/20 + dat['ratio']/10 + dat['alpha_frac']* 0.1) otherwise + +- avg_score is the pep8 score, lower meaning better. \ No newline at end of file diff --git a/codepep/codepep.jsonl b/codepep/codepep.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b3d5764b30489ef9eca3b7090e14246f962eb1f --- /dev/null +++ b/codepep/codepep.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06fea27330fb381a9fa0c68326fbf85c365b49b80a6a8d497ff521fc0d5ab211 +size 14422553423 diff --git a/freelaw/.gitattributes b/freelaw/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..f4f3945bd7150d3e12988485c42da1f8c29c59f8 --- /dev/null +++ b/freelaw/.gitattributes @@ -0,0 +1,54 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.lz4 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +# Audio files - uncompressed +*.pcm filter=lfs diff=lfs merge=lfs -text +*.sam filter=lfs diff=lfs merge=lfs -text +*.raw filter=lfs diff=lfs merge=lfs -text +# Audio files - compressed +*.aac filter=lfs diff=lfs merge=lfs -text +*.flac filter=lfs diff=lfs merge=lfs -text +*.mp3 filter=lfs diff=lfs merge=lfs -text +*.ogg filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +# Image files - uncompressed +*.bmp filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.tiff filter=lfs diff=lfs merge=lfs -text +# Image files - compressed +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text diff --git a/freelaw/data/test/domain_test_0.jsonl.zst b/freelaw/data/test/domain_test_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..313d9119e0d7f6a6d85594e9a5ea9996c764f67e --- /dev/null +++ b/freelaw/data/test/domain_test_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf8e5ee5995335cffa43477e661a153e977d09e75fc4e167379eafdc6cc99ad2 +size 11870943 diff --git a/freelaw/data/test/domain_test_1.jsonl.zst b/freelaw/data/test/domain_test_1.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..700b38c11872ecf0d480733e0e93d646dfac0642 --- /dev/null +++ b/freelaw/data/test/domain_test_1.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7bf707f6d94399227df5fdd15f105e5869cb2c994adea94dfa6e839c566548f +size 12065676 diff --git a/freelaw/data/test/domain_test_2.jsonl.zst b/freelaw/data/test/domain_test_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..46d9ec27c2be29479327d096942faf7facce5a0d --- /dev/null +++ b/freelaw/data/test/domain_test_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c18a847b74e7af785637639d56c6678e75153b8df9509c0768e7c6a7c3a86dde +size 1849160 diff --git a/freelaw/data/test/pile_test_0.jsonl.zst b/freelaw/data/test/pile_test_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..5b4edc6346737e38fc49e141b154e7272992d1cd --- /dev/null +++ b/freelaw/data/test/pile_test_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6135e5d41433fafc6497fc0a5e8f9f8aacbd3f66ffd4c65d0b987456c0fe552b +size 10979518 diff --git a/freelaw/data/train/domain_01_0.jsonl.zst b/freelaw/data/train/domain_01_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..35c469bacb14d21ebc5fc7087726bd1a2872f39b --- /dev/null +++ b/freelaw/data/train/domain_01_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:804f1d5baf0294505eebfcd33bb7b9b92c91cdafd08eb89d1acb0999f6e5a904 +size 11934103 diff --git a/freelaw/data/train/domain_01_1.jsonl.zst b/freelaw/data/train/domain_01_1.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..041fdd87cfe8e1bdf25ad6b434f9b71b8a3b457b --- /dev/null +++ b/freelaw/data/train/domain_01_1.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb727034408ccc2b87cbdb9b9e8071efde6f1905dd3d39a8c1af1da16fd7d9e7 +size 12510062 diff --git a/freelaw/data/train/domain_01_10.jsonl.zst b/freelaw/data/train/domain_01_10.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..80b5d80ed51b579572ff88a72563d660f19134f9 --- /dev/null +++ b/freelaw/data/train/domain_01_10.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f77372e1169392d3eeede15961acb6b75c5188246ce7e1ac16a3e36471e9649 +size 12577487 diff --git a/freelaw/data/train/domain_01_11.jsonl.zst b/freelaw/data/train/domain_01_11.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..de6fbbc4803b702a61f100576d63dc45a3b64e43 --- /dev/null +++ b/freelaw/data/train/domain_01_11.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ba7b130a4b676692a445a56e479dfebd355bcfeca30d838d8851b62647296e7 +size 12378731 diff --git a/freelaw/data/train/domain_01_12.jsonl.zst b/freelaw/data/train/domain_01_12.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..c9a1568e6091d2e19383ff5bd43b75fee706c359 --- /dev/null +++ b/freelaw/data/train/domain_01_12.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97541fdb9e46f60ab9cc9f25050cfeb5575aa1eaf32922559aefaf8a573b8b92 +size 12549909 diff --git a/freelaw/data/train/domain_01_13.jsonl.zst b/freelaw/data/train/domain_01_13.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..60187c27afb8cfad5b963fb58a68ce3f3a474fe2 --- /dev/null +++ b/freelaw/data/train/domain_01_13.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c408a71052ba1b95fe2c6065c07a3e90f76f47d0339388583d40929c23bff1d +size 11155713 diff --git a/freelaw/data/train/domain_01_14.jsonl.zst b/freelaw/data/train/domain_01_14.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..312be3ef56e02bc94fe856cdda9092d35eb2c697 --- /dev/null +++ b/freelaw/data/train/domain_01_14.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ee5d4b137b00ca24b8be798ef99ba670392bf849b041f64eb1d4331f5ca271e +size 11502836 diff --git a/freelaw/data/train/domain_01_15.jsonl.zst b/freelaw/data/train/domain_01_15.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d784c01ba9293b1c2e7c750d8ee3d638926990be --- /dev/null +++ b/freelaw/data/train/domain_01_15.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43f1b9a0adacb7032135ce245b06ed7746b34dc854611b5cb45d1fb9f7518405 +size 12507147 diff --git a/freelaw/data/train/domain_01_16.jsonl.zst b/freelaw/data/train/domain_01_16.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..0c69684c5193cbd2508b286affad0bdc047a6390 --- /dev/null +++ b/freelaw/data/train/domain_01_16.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea2af1b72bae903b5a2398c596c8ea073834f847eb35fc1d9bb5701176580391 +size 12555984 diff --git a/freelaw/data/train/domain_01_17.jsonl.zst b/freelaw/data/train/domain_01_17.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f727ace2b03b9931b7802c76da72aeb8fe117547 --- /dev/null +++ b/freelaw/data/train/domain_01_17.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ff4d7bbbdc8e62d7175aa849980f4f6123296d11278d986478ae139ee8ea28e +size 12759768 diff --git a/freelaw/data/train/domain_01_18.jsonl.zst b/freelaw/data/train/domain_01_18.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..b0bc364cc83774be99c5d4939d9660fe2f223aa3 --- /dev/null +++ b/freelaw/data/train/domain_01_18.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1cd093ba163b326b9ae7c49ee9219a33a74a64f312102c965df45fdb127f216 +size 11924511 diff --git a/freelaw/data/train/domain_01_19.jsonl.zst b/freelaw/data/train/domain_01_19.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..2388518ee7f2ce2f0a32291d814b409f9c2cde84 --- /dev/null +++ b/freelaw/data/train/domain_01_19.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d991771d9bd097ef35de6bc8d5a1c25485248fbe64d45544ceaedba6330e0a6f +size 11663875 diff --git a/freelaw/data/train/domain_01_2.jsonl.zst b/freelaw/data/train/domain_01_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..559b55c174331a82ae92e7105c7a3d169df7a382 --- /dev/null +++ b/freelaw/data/train/domain_01_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61fcde34f5c97bbd570d0d765e211f0655f182e4c8ced538c3f59304741f2de3 +size 11842354 diff --git a/freelaw/data/train/domain_01_20.jsonl.zst b/freelaw/data/train/domain_01_20.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..766cedc63c00fd7166c925799d7a4a4f02f5ccbf --- /dev/null +++ b/freelaw/data/train/domain_01_20.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18a5f49fa84e4cd95daa46abc023769503ae07d7688c9538be041e178d534f03 +size 12210895 diff --git a/freelaw/data/train/domain_01_21.jsonl.zst b/freelaw/data/train/domain_01_21.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..24b253fe9085a551bdc22bb9c55a1414ef903c6a --- /dev/null +++ b/freelaw/data/train/domain_01_21.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce8623f8f18808c6210ac7ccde6e8c41cd1f4990f4287898836db1c37b7a0f05 +size 12250919 diff --git a/freelaw/data/train/domain_01_22.jsonl.zst b/freelaw/data/train/domain_01_22.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..e16d1d54fd8b024e3a013a727fc32451745c3df5 --- /dev/null +++ b/freelaw/data/train/domain_01_22.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cecc0cdb91ba19611987fa0a4aba84b3fdcdc21e8d273d8d062ab414e9a97ea +size 11989544 diff --git a/freelaw/data/train/domain_01_23.jsonl.zst b/freelaw/data/train/domain_01_23.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..836d0542959ad323f4ba12c48cb8ea10bc636e63 --- /dev/null +++ b/freelaw/data/train/domain_01_23.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3cafc1461b64d30ef59c742a39e57349946d1950746018c7a658bf199c49986 +size 12273878 diff --git a/freelaw/data/train/domain_01_24.jsonl.zst b/freelaw/data/train/domain_01_24.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..61bc7244ff788fb821afba117f437648f8c7c3c4 --- /dev/null +++ b/freelaw/data/train/domain_01_24.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e62fb39a6e7094387bc56df6c4718873953743c485c70c8d9e8a3f217b1eccd6 +size 11795303 diff --git a/freelaw/data/train/domain_01_25.jsonl.zst b/freelaw/data/train/domain_01_25.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f8c3aec8411c82dac70253b1b6d71dcb04994649 --- /dev/null +++ b/freelaw/data/train/domain_01_25.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:962a638b7110cf77ee2d906a240cd2510a7a383b45f870e774dbf205dbd609d7 +size 12770172 diff --git a/freelaw/data/train/domain_01_26.jsonl.zst b/freelaw/data/train/domain_01_26.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..dd0b33b4aee6e0b8a147d1cadad72f9d99bf3b16 --- /dev/null +++ b/freelaw/data/train/domain_01_26.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c13864937c13db4f2af3e74a454d43215ad161fe98c7fb45b396aaa46c02a77a +size 12182129 diff --git a/freelaw/data/train/domain_01_27.jsonl.zst b/freelaw/data/train/domain_01_27.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..072afa1d122980eaa821519c6fe9d1f0cdd5d88a --- /dev/null +++ b/freelaw/data/train/domain_01_27.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:439492c4b3c14ad53000b9d768fedcd23fec82a75ca92a626d0e2aa6f23a76e2 +size 12981424 diff --git a/freelaw/data/train/domain_01_28.jsonl.zst b/freelaw/data/train/domain_01_28.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..2c4174c11d94e1c768e3e25b0d049b7d2593e4f4 --- /dev/null +++ b/freelaw/data/train/domain_01_28.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bcf561358fc5a7c0a4cbee1ea0fcb6044988c69eb1d4dc8e2dce98d0450b440 +size 12532539 diff --git a/freelaw/data/train/domain_01_29.jsonl.zst b/freelaw/data/train/domain_01_29.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f6f1f841121121f6067b7817cef4e3945766a265 --- /dev/null +++ b/freelaw/data/train/domain_01_29.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b1f852ab988a1a91f8e1e6942e9b145ba1fdad9d8b90bd0a503df202d36256f +size 12198384 diff --git a/freelaw/data/train/domain_01_3.jsonl.zst b/freelaw/data/train/domain_01_3.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..da5aa9654ed69122ebfdf2222c61dfe9e606a6dc --- /dev/null +++ b/freelaw/data/train/domain_01_3.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71f161d3a41046865addf92dd7c322ec8f582cd7b1301f3e01e5e2b9c1607e93 +size 11466123 diff --git a/freelaw/data/train/domain_01_30.jsonl.zst b/freelaw/data/train/domain_01_30.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..86c47ed71b08cba757162e262e84ffb776d85963 --- /dev/null +++ b/freelaw/data/train/domain_01_30.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f59f16496a8cfbd2319025cbb46cb8a6b9b9c453ca15e6a0c1613ee8ea2a9f9 +size 12854302 diff --git a/freelaw/data/train/domain_01_31.jsonl.zst b/freelaw/data/train/domain_01_31.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..93f6fc97d5e65cf86277a402561cf81125e7b203 --- /dev/null +++ b/freelaw/data/train/domain_01_31.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:543b1869130ee352548ea8782592275d9309b35f6e69ce3afb9163ccc864c3ae +size 12593967 diff --git a/freelaw/data/train/domain_01_32.jsonl.zst b/freelaw/data/train/domain_01_32.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..628d08d41ec1ad38c01a2ca4a19809118bf7ac49 --- /dev/null +++ b/freelaw/data/train/domain_01_32.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24adb00fbdfc70a24e727ef8dfd4b596aaf31f454db75e018ff52c3ba85db1d4 +size 11564791 diff --git a/freelaw/data/train/domain_01_33.jsonl.zst b/freelaw/data/train/domain_01_33.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..15d7d9687270bcf3282d75555b96f08ab55577ec --- /dev/null +++ b/freelaw/data/train/domain_01_33.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a260a43f5793bc9d38ccf6ef9c8f8821438920260c833e84c6aea3cb746d94f1 +size 12301268 diff --git a/freelaw/data/train/domain_01_34.jsonl.zst b/freelaw/data/train/domain_01_34.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..c69eee7cf3058f1aab0723da09de8e87d5ea63ac --- /dev/null +++ b/freelaw/data/train/domain_01_34.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f323af4fb398a15f8a03755cb70cdb1d1b2b1ee3786328d228009700275708a +size 12019258 diff --git a/freelaw/data/train/domain_01_35.jsonl.zst b/freelaw/data/train/domain_01_35.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..b450105b2ed4e5fcaa691ad0853eed1e9a58895e --- /dev/null +++ b/freelaw/data/train/domain_01_35.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f3b9a7474d2085c679c3cd99098860c9f3af57e67375143401176997a64f292 +size 12291143 diff --git a/freelaw/data/train/domain_01_36.jsonl.zst b/freelaw/data/train/domain_01_36.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..507fac4588632192fc8a7836e760128bcfab1124 --- /dev/null +++ b/freelaw/data/train/domain_01_36.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:742ae4c28a2d2f5005243e6287d44588de83966c8cdcc6b5a7c7ab0595b58a09 +size 11732476 diff --git a/freelaw/data/train/domain_01_37.jsonl.zst b/freelaw/data/train/domain_01_37.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..4cdb4fb4c3cdbef29e6b87bd8f97e05044ee60a3 --- /dev/null +++ b/freelaw/data/train/domain_01_37.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:569008e8fc9f2aa0cfc33d958feb58867828256a6341dd3acbe1ad1d07683528 +size 12419656 diff --git a/freelaw/data/train/domain_01_38.jsonl.zst b/freelaw/data/train/domain_01_38.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..e870e5f7c268bd9aae1fbcd8a2630625d2f5c8e4 --- /dev/null +++ b/freelaw/data/train/domain_01_38.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5067c30069016c71907f43bcaa7fd9fd70fbfecf8e48e2efc03d83ffe230954f +size 12171605 diff --git a/freelaw/data/train/domain_01_39.jsonl.zst b/freelaw/data/train/domain_01_39.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..84acbfba340943ce686a261381e4d288121b2f24 --- /dev/null +++ b/freelaw/data/train/domain_01_39.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11ed935538e99f2980e9fdef083448c424cfc2b9693c079ea4530e1558b18082 +size 12422724 diff --git a/freelaw/data/train/domain_01_4.jsonl.zst b/freelaw/data/train/domain_01_4.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..9c93a258f7ffaf0434e2794c19a0b6ca587f9647 --- /dev/null +++ b/freelaw/data/train/domain_01_4.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24c3f5a107aee6e7e3f3e6f93331c4df196f4f74faeacd93d6c4c733551e599e +size 12007688 diff --git a/freelaw/data/train/domain_01_40.jsonl.zst b/freelaw/data/train/domain_01_40.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..503a7e6bd5419ef3484ef0dfd5227223483c8c73 --- /dev/null +++ b/freelaw/data/train/domain_01_40.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0415d1b7e49d4e2338ebfc2a95687073ef027d5fe9589910922f986b0ebac8a9 +size 11475796 diff --git a/freelaw/data/train/domain_01_41.jsonl.zst b/freelaw/data/train/domain_01_41.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..0f92c8afa28757c310b88c711026f0c804e5a75d --- /dev/null +++ b/freelaw/data/train/domain_01_41.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92d18cff3b11c85da2e424ff699453f76883a4780e5c990f683af8623642f39d +size 12165466 diff --git a/freelaw/data/train/domain_01_42.jsonl.zst b/freelaw/data/train/domain_01_42.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a1a9d4cb117ae0f0b4ab7573691024ec151abda6 --- /dev/null +++ b/freelaw/data/train/domain_01_42.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a37e836c0eb737dd2bee81d44996785325b33bca9cdd715d132d7b050541169 +size 12141248 diff --git a/freelaw/data/train/domain_01_43.jsonl.zst b/freelaw/data/train/domain_01_43.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..381afe2659f3d4f0793e61db333530315e13413a --- /dev/null +++ b/freelaw/data/train/domain_01_43.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6d6014e6da9480931c43b726cdadd0736091422c759a363d0a017331962f13 +size 12749737 diff --git a/freelaw/data/train/domain_01_44.jsonl.zst b/freelaw/data/train/domain_01_44.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..9419ed6be09847d9e68c92b565fa18e33cbcab00 --- /dev/null +++ b/freelaw/data/train/domain_01_44.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b316888e7963cdc314193801521e8e4dfdea5c1e28fcf3595d0e55ebbe0a7f42 +size 12084946 diff --git a/freelaw/data/train/domain_01_45.jsonl.zst b/freelaw/data/train/domain_01_45.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..51e91ff440c16b5b0817a840982b4c60bf91e00f --- /dev/null +++ b/freelaw/data/train/domain_01_45.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6bc2923adf386120203842dae5470220e6c26cb66667b3ec0f4162337dad4db +size 12125784 diff --git a/freelaw/data/train/domain_01_46.jsonl.zst b/freelaw/data/train/domain_01_46.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..5788c39dd04f7db4b28b8d911cf01487dee53323 --- /dev/null +++ b/freelaw/data/train/domain_01_46.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:584b9a539ee56dbfe7dcb54447652b2c09de5514fe96995cfc68a166a0db8062 +size 11965776 diff --git a/freelaw/data/train/domain_01_47.jsonl.zst b/freelaw/data/train/domain_01_47.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..2a6f81128f9912346fb4fb301ac2e17784f0b59c --- /dev/null +++ b/freelaw/data/train/domain_01_47.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a119a154a886c64e2fce20d6789d798eccab376751b7fc30b0e7eb670568bdf +size 13124492 diff --git a/freelaw/data/train/domain_01_48.jsonl.zst b/freelaw/data/train/domain_01_48.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..93e5987eba8110851908169a55344dbb35e5c571 --- /dev/null +++ b/freelaw/data/train/domain_01_48.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5fd016994db34078792681e2cb19fdcd059cea541400689b429c8ee85fd2366 +size 12745473 diff --git a/freelaw/data/train/domain_01_49.jsonl.zst b/freelaw/data/train/domain_01_49.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..495b644239634ab080eed0b8429b9899a97dc875 --- /dev/null +++ b/freelaw/data/train/domain_01_49.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d37af66b52bfb6ab94d9cca05b8594acffeb64289529a923f3f6b64ad572ec9 +size 11259382 diff --git a/freelaw/data/train/domain_01_5.jsonl.zst b/freelaw/data/train/domain_01_5.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..cffd7372b0fe300e22733722963117f068d2e538 --- /dev/null +++ b/freelaw/data/train/domain_01_5.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70d639b54418d68748acd3494a8e223a86c77ff2be137af2eb83709eac67b5a2 +size 12175610 diff --git a/freelaw/data/train/domain_01_50.jsonl.zst b/freelaw/data/train/domain_01_50.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..422722c46acdd1efce64fbb7d46f980f76af2a7b --- /dev/null +++ b/freelaw/data/train/domain_01_50.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14fae91138ce9f9df8dc0006bd47354bf7998a013c9462b9e32d31c76fc8bd55 +size 12623090 diff --git a/freelaw/data/train/domain_01_51.jsonl.zst b/freelaw/data/train/domain_01_51.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..6e7d53e974bdb1c71b35285342d173a92467ce8c --- /dev/null +++ b/freelaw/data/train/domain_01_51.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5f6ad19336ca077abf9dc80078e82f958fc66267d76d7793425ac6c7b996a5a +size 12281078 diff --git a/freelaw/data/train/domain_01_52.jsonl.zst b/freelaw/data/train/domain_01_52.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f2b48dfefa30c18e57ac974590562d496e69beb3 --- /dev/null +++ b/freelaw/data/train/domain_01_52.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfbfd93c8434da4aeec98df3483e1b7dd572711a51fb3928f6aa0de50fe18c0 +size 11787335 diff --git a/freelaw/data/train/domain_01_53.jsonl.zst b/freelaw/data/train/domain_01_53.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..49dc18d47c37e431c4b87a9524f790b7d595b676 --- /dev/null +++ b/freelaw/data/train/domain_01_53.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f695373dfaf8e61edbab0e55dc74c68b9c1f558ab70b263f2eddedebcd7dab6 +size 11218804 diff --git a/freelaw/data/train/domain_01_54.jsonl.zst b/freelaw/data/train/domain_01_54.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ca42355158c68029fecb406b4cc199613c1bfa41 --- /dev/null +++ b/freelaw/data/train/domain_01_54.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c1c3cf74524481176f2143dde4e690df05f88395d94eae4515ce6f95deae4cc +size 12087840 diff --git a/freelaw/data/train/domain_01_55.jsonl.zst b/freelaw/data/train/domain_01_55.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..52cd8ff06117ed197851c6b8a65b50ec479f4fbc --- /dev/null +++ b/freelaw/data/train/domain_01_55.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a844dc713683404ba4c97292fd5efc0b72b31c5a4eb10f02618031f2a131b672 +size 12072148 diff --git a/freelaw/data/train/domain_01_56.jsonl.zst b/freelaw/data/train/domain_01_56.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..b42aac1d988a73b1b087958513f8d9c786648199 --- /dev/null +++ b/freelaw/data/train/domain_01_56.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6c58de689ff496f6489b63e660322f593ff61ffcdc8ca10f088cd05f92fc7bd +size 12450319 diff --git a/freelaw/data/train/domain_01_57.jsonl.zst b/freelaw/data/train/domain_01_57.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f71ad9c9dfbdf1ad6c24d2372cbf00cce2e0b2a4 --- /dev/null +++ b/freelaw/data/train/domain_01_57.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dc33d0d5e4332a34ed712373bedd50bef11fc166ac4ec7cdf13456fca4ca94d +size 12343452 diff --git a/freelaw/data/train/domain_01_58.jsonl.zst b/freelaw/data/train/domain_01_58.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..04c1ff5532a1f47ce13e079a8627ac00b9b043a8 --- /dev/null +++ b/freelaw/data/train/domain_01_58.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4918ab83253349604230028f348fe7b0bd1a9745fc146a0609d4b1516a3f1e47 +size 11469414 diff --git a/freelaw/data/train/domain_01_59.jsonl.zst b/freelaw/data/train/domain_01_59.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..4b24c3235333f818cfce6979c0a43ba6aa0ea8f0 --- /dev/null +++ b/freelaw/data/train/domain_01_59.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e5da22a7ee51d5abb7c03f1524388efbd05bcbf7c3f9b99edd9e005abcdd6a8 +size 12170716 diff --git a/freelaw/data/train/domain_01_6.jsonl.zst b/freelaw/data/train/domain_01_6.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..221939e405bf23f2307f276a1dc504e4e4e31867 --- /dev/null +++ b/freelaw/data/train/domain_01_6.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b833086427f9601a8dd254e1d140ad53bcf8e87c1052f2adbe7fa8fd20d80c5 +size 12234072 diff --git a/freelaw/data/train/domain_01_60.jsonl.zst b/freelaw/data/train/domain_01_60.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..bd5285da290045d5bf5b9e592aee8a4d868ec18e --- /dev/null +++ b/freelaw/data/train/domain_01_60.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:680b11ab099de12d9fc2d5873b33fba52ec77dcc81702c4fab543309bd23c164 +size 11886230 diff --git a/freelaw/data/train/domain_01_61.jsonl.zst b/freelaw/data/train/domain_01_61.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..49d825b4777ceff204f51233215ca3565be3d58e --- /dev/null +++ b/freelaw/data/train/domain_01_61.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:152873359ac6d0c150bcd97ddd078072772e56f99a49dc0d34ba71f1072a7d3c +size 11966492 diff --git a/freelaw/data/train/domain_01_62.jsonl.zst b/freelaw/data/train/domain_01_62.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f7894f4fb598ba91e3049af5c8be3e7990f2f9f0 --- /dev/null +++ b/freelaw/data/train/domain_01_62.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15037ff41d0b3f05e06dc6b2a6d8b3839630dc0ac88eb27f4b640c92f623b184 +size 12503256 diff --git a/freelaw/data/train/domain_01_63.jsonl.zst b/freelaw/data/train/domain_01_63.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a7dc5e38891033e15ab6fb1799cf720410e85560 --- /dev/null +++ b/freelaw/data/train/domain_01_63.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d858f01229a0b256b6c42eccaba9d7d8952e04b5cec1ac34d2bbade3b2e915cc +size 12698596 diff --git a/freelaw/data/train/domain_01_64.jsonl.zst b/freelaw/data/train/domain_01_64.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ccda171c3f1fceb7c98bfc5d4e3efc474b7e3719 --- /dev/null +++ b/freelaw/data/train/domain_01_64.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e767b2f33a0203b06c345a54d4d58b24118e8f18317e18ec7ceea5d727722924 +size 12602838 diff --git a/freelaw/data/train/domain_01_65.jsonl.zst b/freelaw/data/train/domain_01_65.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d08b976d9a2b56ca378e9f5f14a473a22f2bec45 --- /dev/null +++ b/freelaw/data/train/domain_01_65.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5371b2e75b0cf77dc12e77ef65a91f31f2175542b59a53769987fcc8b26a3480 +size 12165247 diff --git a/freelaw/data/train/domain_01_66.jsonl.zst b/freelaw/data/train/domain_01_66.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..af0dd2a29d7aee5ce5f510de99f77b287706558e --- /dev/null +++ b/freelaw/data/train/domain_01_66.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5912380a2fe78f215323c1f55b39b6ea307df007588806415ba3e74a9ac6989d +size 11803039 diff --git a/freelaw/data/train/domain_01_67.jsonl.zst b/freelaw/data/train/domain_01_67.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..fec7213397d4eefe8308de8782d6f3b62943f60b --- /dev/null +++ b/freelaw/data/train/domain_01_67.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c63199690fa7188707709257c55e4b859de06950ae65d2b70c99f7f702654907 +size 11728690 diff --git a/freelaw/data/train/domain_01_68.jsonl.zst b/freelaw/data/train/domain_01_68.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..b4c5b0c15cd02421a81b90aa599cd34e815c64d0 --- /dev/null +++ b/freelaw/data/train/domain_01_68.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83f0ec75e745ac47754105e7d61ce3b306c2ab25f1309a965c9bd3ec99e91fb0 +size 13210006 diff --git a/freelaw/data/train/domain_01_69.jsonl.zst b/freelaw/data/train/domain_01_69.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f6c75b2e0d5aa328deb73871ed755bbad30f23f7 --- /dev/null +++ b/freelaw/data/train/domain_01_69.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:206323dd4d5a0838b432fd5e24021e0a69d56579cffceab7f4343b538b4b823d +size 12813621 diff --git a/freelaw/data/train/domain_01_7.jsonl.zst b/freelaw/data/train/domain_01_7.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..114063bde5b763e0a830970d106d39fa2c5f8cf0 --- /dev/null +++ b/freelaw/data/train/domain_01_7.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2acf78f724b14cb6fc8b980d04962199b32b5965f6f25b152f8604b554bad6 +size 11831331 diff --git a/freelaw/data/train/domain_01_70.jsonl.zst b/freelaw/data/train/domain_01_70.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..22da1bc403d03f0d28b2885bcfb95df41053fada --- /dev/null +++ b/freelaw/data/train/domain_01_70.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e43cf6a59b1406218dca91189e5d84aa3f15fcb449da4a30067a29a2f749011 +size 2761053 diff --git a/freelaw/data/train/domain_01_8.jsonl.zst b/freelaw/data/train/domain_01_8.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..11bfcf4009a44bfed30c29e20d9cd07781a71373 --- /dev/null +++ b/freelaw/data/train/domain_01_8.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b13bf0e5514e52fc20d0b347d10c1f8d646fdbc980cb3dea38902ed20d51a1d6 +size 12993828 diff --git a/freelaw/data/train/domain_01_9.jsonl.zst b/freelaw/data/train/domain_01_9.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..1ece9390ba93bc4b4ddd41e5b1c8a34f39fec190 --- /dev/null +++ b/freelaw/data/train/domain_01_9.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c32f9e91f90364c0d9cdf798c1eda929a7242ec4098e15d832c44835cbbc4ce1 +size 11962453 diff --git a/freelaw/data/train/pile_01_40.jsonl.zst b/freelaw/data/train/pile_01_40.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..1a805dc775c67c21263fbef5e311fc99b272f783 --- /dev/null +++ b/freelaw/data/train/pile_01_40.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2767a1eea6879c4ed9a626de39039017e682ba30f65444e680dd328224fc8ca5 +size 212730917 diff --git a/freelaw/data/train/pile_01_54.jsonl.zst b/freelaw/data/train/pile_01_54.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..fd6dceb1532c56557f55274c69e94c29a7762806 --- /dev/null +++ b/freelaw/data/train/pile_01_54.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dda222798e44ee2a2fcbe56761f5c578b0dac0ebf4880b8c81fc21d8bf7ddd82 +size 215886741 diff --git a/freelaw/data/val/domain_val_0.jsonl.zst b/freelaw/data/val/domain_val_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..26e5dd327b877cf6716b02d697d35cef69a835a3 --- /dev/null +++ b/freelaw/data/val/domain_val_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73159119d683fa23a7a36a88afa91bea482a06877966be8b79620d7349346655 +size 11734738 diff --git a/freelaw/data/val/domain_val_1.jsonl.zst b/freelaw/data/val/domain_val_1.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..02de8425227c0f142c6c801376eeb59c4bb4d31c --- /dev/null +++ b/freelaw/data/val/domain_val_1.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:762d2cf9b931047baa36c5ac3dff0aeaa360965dfdd6491d15d273d0ad22563d +size 12092135 diff --git a/freelaw/data/val/domain_val_2.jsonl.zst b/freelaw/data/val/domain_val_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..64e23313e0932d9202d966edc23a4d3021c03119 --- /dev/null +++ b/freelaw/data/val/domain_val_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ba3743f1d754f0f07a765934201613f23f998dca14fac412378ddf067c5c0d5 +size 1844210 diff --git a/freelaw/data/val/pile_val_2.jsonl.zst b/freelaw/data/val/pile_val_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f103fd87109432facd9bb327126476f41236d586 --- /dev/null +++ b/freelaw/data/val/pile_val_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0191005309464753e3f2a72d09cb7c4e8f4db4e634e5f92e8a9ef7a4ddf86106 +size 10391998 diff --git a/freelaw/freelaw.py b/freelaw/freelaw.py new file mode 100644 index 0000000000000000000000000000000000000000..e29bba648ced4e4980a35909ea09bb92054fbe46 --- /dev/null +++ b/freelaw/freelaw.py @@ -0,0 +1,86 @@ +import io +import json +import os +from glob import glob + +import datasets +import zstandard as zstd +from datasets import GeneratorBasedBuilder +from datasets.utils import Version +from huggingface_hub import snapshot_download + +# Requires REPO_NAME and file name to be same e.g. uspto.py +REPO_NAME = "Multi-Domain-Expert-Layers/freelaw" + +class PileDomainDataset(GeneratorBasedBuilder): + VERSION = Version("1.0.0") + + def _info(self): + return datasets.DatasetInfo( + description="Pile Domain Dataset", + features=datasets.Features( + { + "text": datasets.Value("string"), + } + ), + supervised_keys=None, + ) + + def _split_generators(self, dl_manager): + + dl_path = snapshot_download(repo_id=REPO_NAME, repo_type="dataset") + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/train"), + "split": None, + }, + ), + datasets.SplitGenerator( + name="validation", + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/val"), + "split": None, + }, + ), + datasets.SplitGenerator( + name="validation_pile", + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/val"), + "split": "pile", + }, + ), + datasets.SplitGenerator( + name="validation_domain", + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/val"), + "split": "domain", + }, + ), + datasets.SplitGenerator( + name="test_pile", + gen_kwargs={"data_dir": os.path.join(dl_path, "data/test"), "split": "pile"}, + ), + datasets.SplitGenerator( + name="test_domain", + gen_kwargs={"data_dir": os.path.join(dl_path, "data/test"), "split": "domain"}, + ), + ] + + def _generate_examples(self, data_dir, split): + dctx = zstd.ZstdDecompressor() + idx = -1 + file_paths = glob(os.path.join(data_dir, f"*.jsonl.zst")) + if split is not None: + file_paths = [f for f in file_paths if split in f] + for file in file_paths: + with open(file, "rb") as f: + reader = dctx.stream_reader(f) + buffer = io.BufferedReader(reader) + for _, line in enumerate(buffer.readlines()): + data = json.loads(line) + idx += 1 + yield idx, data + diff --git a/github/.gitattributes b/github/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..f4f3945bd7150d3e12988485c42da1f8c29c59f8 --- /dev/null +++ b/github/.gitattributes @@ -0,0 +1,54 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.lz4 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +# Audio files - uncompressed +*.pcm filter=lfs diff=lfs merge=lfs -text +*.sam filter=lfs diff=lfs merge=lfs -text +*.raw filter=lfs diff=lfs merge=lfs -text +# Audio files - compressed +*.aac filter=lfs diff=lfs merge=lfs -text +*.flac filter=lfs diff=lfs merge=lfs -text +*.mp3 filter=lfs diff=lfs merge=lfs -text +*.ogg filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +# Image files - uncompressed +*.bmp filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.tiff filter=lfs diff=lfs merge=lfs -text +# Image files - compressed +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text diff --git a/github/README.md b/github/README.md new file mode 100644 index 0000000000000000000000000000000000000000..154df8298fab5ecf322016157858e08cd1bccbe1 --- /dev/null +++ b/github/README.md @@ -0,0 +1,3 @@ +--- +license: apache-2.0 +--- diff --git a/github/data/test/domain_test_0.jsonl.zst b/github/data/test/domain_test_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f8a93ec8a486f70502f3069b43976f5ee567f528 --- /dev/null +++ b/github/data/test/domain_test_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa829dc6d6e5e90c73a338877e056fa0d90d985906517983790dc2c673c0d4cf +size 11248383 diff --git a/github/data/test/domain_test_1.jsonl.zst b/github/data/test/domain_test_1.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..451510f4d2ce9d6e00f76c07c0f5b467cd8e764f --- /dev/null +++ b/github/data/test/domain_test_1.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9ad61b60e09b914b911b2affe1a5752c8985b9f53c4f2b714957b1b99b7e636 +size 11481504 diff --git a/github/data/test/domain_test_2.jsonl.zst b/github/data/test/domain_test_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..26ea40a7fb21a8f63d9eb7a6b38e6d5b83896c5b --- /dev/null +++ b/github/data/test/domain_test_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:387312c7406a430390f5d9b19f249774a34048189197815b4aebcb0c85e8657f +size 1603704 diff --git a/github/data/test/pile_test_0.jsonl.zst b/github/data/test/pile_test_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..074d22653b14d69e3bbdc8adbdc5667114270049 --- /dev/null +++ b/github/data/test/pile_test_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b2efd59329179a472fda9c6d5665ca6bc6d66acdaf3adb0d06a80deedc4c73d +size 39200703 diff --git a/github/data/train/domain_01_0.jsonl.zst b/github/data/train/domain_01_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..40c68b5c1b494b8e1f8a5cf3cb5b9be6053a9a24 --- /dev/null +++ b/github/data/train/domain_01_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eda12761875010598017ae483b12e3fa6195f809d67785890f69f0a4126c351 +size 11691825 diff --git a/github/data/train/domain_01_1.jsonl.zst b/github/data/train/domain_01_1.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..78bbed161e01bdac6bf483b44ab2ecd717871e3b --- /dev/null +++ b/github/data/train/domain_01_1.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7412cac6c85e990de91cdbc5f1fdc25d7d14629a817320c1ce19d4fee9ca127c +size 12338583 diff --git a/github/data/train/domain_01_10.jsonl.zst b/github/data/train/domain_01_10.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..970012660016037fc6c491804da5cf32a94af610 --- /dev/null +++ b/github/data/train/domain_01_10.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f4954eefdc32f796803afdc7fecb4dbac58120dfce79f566f5400324d451c8 +size 11932886 diff --git a/github/data/train/domain_01_11.jsonl.zst b/github/data/train/domain_01_11.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..df85eb4e881fe79df494a221d59b4895ab6bd9a4 --- /dev/null +++ b/github/data/train/domain_01_11.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b301c3de583f19488330105ccb43f40052fd6465341238b130be9ea0dd7596 +size 11266370 diff --git a/github/data/train/domain_01_12.jsonl.zst b/github/data/train/domain_01_12.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..8bd198af6fa9b9cf66ea9cdb657cae2e3c7b9e73 --- /dev/null +++ b/github/data/train/domain_01_12.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2134238c92348b174bdfb6963a2fcd067c75d405be7be23a630904450db65e64 +size 11925449 diff --git a/github/data/train/domain_01_13.jsonl.zst b/github/data/train/domain_01_13.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..8911b2ae482af49701e62b2ff3df1f1b7f81833e --- /dev/null +++ b/github/data/train/domain_01_13.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:776ef18b903cdbab18f62d18a4ede5c2d28f46d5d3892e7f1dde3c6c58fb4458 +size 11894905 diff --git a/github/data/train/domain_01_14.jsonl.zst b/github/data/train/domain_01_14.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..e893faee15705144639be85a26e0c9871d83d5df --- /dev/null +++ b/github/data/train/domain_01_14.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01d0c708dfbe5354c4673d11a7d8ec270133f2f0783dc61fa377dd7d73933e30 +size 11495280 diff --git a/github/data/train/domain_01_15.jsonl.zst b/github/data/train/domain_01_15.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..6b9f92f0fde194374cae6e1d6ca3b57e47d7a7da --- /dev/null +++ b/github/data/train/domain_01_15.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eb73e31ea58895884cea427656bf2f33daeef8be6c097f8e69e92b84d812e3a +size 11931464 diff --git a/github/data/train/domain_01_16.jsonl.zst b/github/data/train/domain_01_16.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..e5e139d25546aef2d35aae973e260910414ec18b --- /dev/null +++ b/github/data/train/domain_01_16.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a63717467d0165270f4f8f37266fe78b0c37a18492dc7f3968b58fc3bd61e0b +size 11684821 diff --git a/github/data/train/domain_01_17.jsonl.zst b/github/data/train/domain_01_17.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..2a85f5e5678b83cf4f98c06968bb4e7b797b5ada --- /dev/null +++ b/github/data/train/domain_01_17.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe49c3dd81829075785c949b6d367d23f2dc107b0ae7cf07e612d570891eb1bd +size 11760492 diff --git a/github/data/train/domain_01_18.jsonl.zst b/github/data/train/domain_01_18.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..486eb5b00a55408815fc24017790dba7d838b0f1 --- /dev/null +++ b/github/data/train/domain_01_18.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd5b1daf945f21afdc9a725e6148abaf049e148f0e2fb4fe63d54d9e2043de3d +size 11616204 diff --git a/github/data/train/domain_01_19.jsonl.zst b/github/data/train/domain_01_19.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..bc7637f3165e50ca40fc418ceef12a57877de662 --- /dev/null +++ b/github/data/train/domain_01_19.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3ca0cdc8b175bdd503fbae3bc076aa03781723611c8c907546dc8e4bdca890a +size 11543215 diff --git a/github/data/train/domain_01_2.jsonl.zst b/github/data/train/domain_01_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..fdeefa42503ddeaaf07381b40ebb57d70f61e3e2 --- /dev/null +++ b/github/data/train/domain_01_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4153dc2fcaa131a92e32d8906648822ed3c17bc333d523a32dc5827cbd47489b +size 11446893 diff --git a/github/data/train/domain_01_20.jsonl.zst b/github/data/train/domain_01_20.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..fb1920676bcb024585a1b0ebeb378ac1ba2f8545 --- /dev/null +++ b/github/data/train/domain_01_20.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060b839247d9a4658f02c054419031287a68240987573759167da43e0d2c12f1 +size 11593607 diff --git a/github/data/train/domain_01_21.jsonl.zst b/github/data/train/domain_01_21.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..3bf36db75521186ff9b8a5a737eaff3bf895429b --- /dev/null +++ b/github/data/train/domain_01_21.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:000101bbe58cf78f51b5969d209a2389a861fbd53ed04115c2ba4943edff0fe5 +size 11660069 diff --git a/github/data/train/domain_01_22.jsonl.zst b/github/data/train/domain_01_22.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f84e2f4370b5c747cebbcdc77ae7ce848e6f0d11 --- /dev/null +++ b/github/data/train/domain_01_22.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bd95201cca3698810f8eecb32b0f4c04ec50b12670c62b5e3c8adb6c57bd26f +size 11684626 diff --git a/github/data/train/domain_01_23.jsonl.zst b/github/data/train/domain_01_23.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..befb461787ceedfd67207df64d5a5eb749ca704b --- /dev/null +++ b/github/data/train/domain_01_23.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbbbaa121e508aa972d3014c850da904837a1011726cb626baefa9d3835c4a49 +size 11249119 diff --git a/github/data/train/domain_01_24.jsonl.zst b/github/data/train/domain_01_24.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..eb3c6dc6eacff9640188b39937e500ce6f9d8733 --- /dev/null +++ b/github/data/train/domain_01_24.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe664ef850818b01983c8a878a7998095c9e77fd3bb872e4ac8e3512a731dc14 +size 11398248 diff --git a/github/data/train/domain_01_25.jsonl.zst b/github/data/train/domain_01_25.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f180634c96328b7f76e3b03dc63a1805b16c5301 --- /dev/null +++ b/github/data/train/domain_01_25.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7c561ee39283252c68fabab074da5fae7cff87d81c80cb3fcf589e66e3f07ef +size 11574796 diff --git a/github/data/train/domain_01_26.jsonl.zst b/github/data/train/domain_01_26.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a1ce6c284eca44ee6190f518cd0d38167ffe0112 --- /dev/null +++ b/github/data/train/domain_01_26.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcbfa039aab693b48ef74f9aa5a15243906b8d12fda56dfead6599cba980eead +size 11732903 diff --git a/github/data/train/domain_01_27.jsonl.zst b/github/data/train/domain_01_27.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..2913a7a123ca46c656ccfafdc49593e66c61b4a9 --- /dev/null +++ b/github/data/train/domain_01_27.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0ff31ba085ebc3904968d6fb89738fb04529ba9bcf6f6b886aa9aa9239f65f2 +size 11631898 diff --git a/github/data/train/domain_01_28.jsonl.zst b/github/data/train/domain_01_28.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..3adf2c90041c7a3feaf21444ebe9b2d852bffa5c --- /dev/null +++ b/github/data/train/domain_01_28.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:267ad9015c1bbf1945d687c2ec9ef0474861f104b45236ce64a2b10592ee4848 +size 11780765 diff --git a/github/data/train/domain_01_29.jsonl.zst b/github/data/train/domain_01_29.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a24aa921a7c6c6217a284811b49801f26bc69a2e --- /dev/null +++ b/github/data/train/domain_01_29.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe203cbfff87d93ca1aa2d4b24e660aa90beef9358694e062d37b90849926b5 +size 11749345 diff --git a/github/data/train/domain_01_3.jsonl.zst b/github/data/train/domain_01_3.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..463b0a8a168715d521afdd95405b191ce23611a5 --- /dev/null +++ b/github/data/train/domain_01_3.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6ab2bc4523223ef13ca5a57caad2178a3bcd854bde9fbc8c564b5a832c3c308 +size 11665530 diff --git a/github/data/train/domain_01_30.jsonl.zst b/github/data/train/domain_01_30.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..8df4fe2bfb198a767d397846e4c16599e2101b9b --- /dev/null +++ b/github/data/train/domain_01_30.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a002af22405dedbd4d8cedccff9ce8d13bcc9b0b4d1775d0b1ef6279c8985666 +size 11479228 diff --git a/github/data/train/domain_01_31.jsonl.zst b/github/data/train/domain_01_31.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..aefa0997ce3da101e4e93fefe4e2318acb00a47f --- /dev/null +++ b/github/data/train/domain_01_31.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a37026bcd5155c88432088f8de562e126c3fc601c63cebcd500843aac3146822 +size 11629505 diff --git a/github/data/train/domain_01_32.jsonl.zst b/github/data/train/domain_01_32.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..5f05a17ddf6bba51bb7784707b744da0fa4a7f40 --- /dev/null +++ b/github/data/train/domain_01_32.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8730cbb49651727fff9a33d2e935e1430b4f1d4aa96a462928193efd432c5d8a +size 11703561 diff --git a/github/data/train/domain_01_33.jsonl.zst b/github/data/train/domain_01_33.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..faf2f70b2713ca88147753d77c2d18f11ca4241d --- /dev/null +++ b/github/data/train/domain_01_33.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f52dda0254c73ab4aec206c4653e80a43abb9c18f735f5cfe967c99699cd2da0 +size 11951738 diff --git a/github/data/train/domain_01_34.jsonl.zst b/github/data/train/domain_01_34.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d075c5d400eb8466cc2d36e9b2815d7da5fd81c7 --- /dev/null +++ b/github/data/train/domain_01_34.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e6b4ae73eef5d46a6dcb8e66a4296ed7645c039e03834c635aa52b4f1503ad7 +size 11292067 diff --git a/github/data/train/domain_01_35.jsonl.zst b/github/data/train/domain_01_35.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..8a0d70e021d3c99a26f2dcb9fc3184f173adf674 --- /dev/null +++ b/github/data/train/domain_01_35.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50fa3769ee291e7e6f5140dc4a6d74916427aef40475cfe4bac5f8f9844bb02a +size 10955876 diff --git a/github/data/train/domain_01_36.jsonl.zst b/github/data/train/domain_01_36.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..beb8516d64b3c679e352ae462617b2adca6a4a55 --- /dev/null +++ b/github/data/train/domain_01_36.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de930911d8cf9fed5ed48fbc8e510c84ff359b3fd3231e46b3539a4b1a1be21 +size 11463025 diff --git a/github/data/train/domain_01_37.jsonl.zst b/github/data/train/domain_01_37.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..2a213d4f23980c659271606846ca854705f48e3d --- /dev/null +++ b/github/data/train/domain_01_37.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7ffeaaaa4638f90ba06844cbd74ace244365628f61aaedcf38dc1b5cd88b74e +size 11793944 diff --git a/github/data/train/domain_01_38.jsonl.zst b/github/data/train/domain_01_38.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..75317498ec0833e78981f8113674a7f127407ae6 --- /dev/null +++ b/github/data/train/domain_01_38.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e05213067bc5d5416dac7ed7ff19738c3ad77cbf2dd6b0bd4ace5935d094bfd7 +size 11515219 diff --git a/github/data/train/domain_01_39.jsonl.zst b/github/data/train/domain_01_39.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..60cdc2e2db60c292fe4637b31de76f52a55a9423 --- /dev/null +++ b/github/data/train/domain_01_39.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c582541912897870a707d16c74c9f1ac25bd1f4dd22e3303d2ba59af786871c +size 11985315 diff --git a/github/data/train/domain_01_4.jsonl.zst b/github/data/train/domain_01_4.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..8ee4ee43c10744a5010efe6b58a8feefde0708b2 --- /dev/null +++ b/github/data/train/domain_01_4.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1481b27a589ab979b7a55e5542438d5f5431171ad843c2611dde3e154cce889e +size 11967681 diff --git a/github/data/train/domain_01_40.jsonl.zst b/github/data/train/domain_01_40.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ddcfc7794cb9120bbb23d3c5ee4c2535a9c9d614 --- /dev/null +++ b/github/data/train/domain_01_40.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6209495fa8c12da561b4a7496f6be86272b6cc5933ce0fc7e539a3bb86aa133f +size 11476789 diff --git a/github/data/train/domain_01_41.jsonl.zst b/github/data/train/domain_01_41.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ebece9eacf964a1876aab0a53db472551bb6d104 --- /dev/null +++ b/github/data/train/domain_01_41.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3c085e57a0c00bf9d0f265f63eb8ef46814fcb68affb4472700051b94af15e2 +size 11635646 diff --git a/github/data/train/domain_01_42.jsonl.zst b/github/data/train/domain_01_42.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..64c6534bac07c4942201bc71898a2653b9e5046e --- /dev/null +++ b/github/data/train/domain_01_42.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:792cff11563e0894b1a507e93456ed3282a225e320a898519652d8921f35f58f +size 11266439 diff --git a/github/data/train/domain_01_43.jsonl.zst b/github/data/train/domain_01_43.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..c0c20e13464bef728689dde4769a336c61b2f74a --- /dev/null +++ b/github/data/train/domain_01_43.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d16bbc4362224a48b592d16c466f87b7dfab2c39f437b7bf95bd2847ac81e09 +size 11297326 diff --git a/github/data/train/domain_01_44.jsonl.zst b/github/data/train/domain_01_44.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..4cf165d4a2f5741707cc6c86b4ea381df5b84e99 --- /dev/null +++ b/github/data/train/domain_01_44.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:488aa77b0a11725b6b1625fac4d36cf14042a35f6fbe1d095ce87ad78fed1085 +size 11610262 diff --git a/github/data/train/domain_01_45.jsonl.zst b/github/data/train/domain_01_45.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..03921ef9babf30ee0d2bedb6db5a3928cb649aa2 --- /dev/null +++ b/github/data/train/domain_01_45.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06b929569ccb87adab9ef38da2f36a40abe5081e4d0e8a650fbd8fe0fdd164e3 +size 11522419 diff --git a/github/data/train/domain_01_46.jsonl.zst b/github/data/train/domain_01_46.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..1b113d4f53b8a9d786f690470b3caccb907ec790 --- /dev/null +++ b/github/data/train/domain_01_46.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51f03992719a391674e3f86ac651793ffd9b1e2b52d987815397d6e2e23d6094 +size 11101945 diff --git a/github/data/train/domain_01_47.jsonl.zst b/github/data/train/domain_01_47.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..5f91b5baf463c4a67894770e71945903791f0573 --- /dev/null +++ b/github/data/train/domain_01_47.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92ad818b7efb45d165302cd82611dbc9434f87bee72c76cb3960ece5c408baf7 +size 12029204 diff --git a/github/data/train/domain_01_48.jsonl.zst b/github/data/train/domain_01_48.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..73ba7b1b0f4449de8f3d1fbd858a86afbe3220cf --- /dev/null +++ b/github/data/train/domain_01_48.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86499d12d8ad3af33d66effd12ca3e81f649103287590b84397df03a2bbdcc05 +size 11589986 diff --git a/github/data/train/domain_01_49.jsonl.zst b/github/data/train/domain_01_49.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..52721f9ded7a892025d40299dbbb2b44e45a48f7 --- /dev/null +++ b/github/data/train/domain_01_49.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac38d0ec7db4a678836fed2376c7eabf062f3886886b8ba2fc9e346c73c1987c +size 11378529 diff --git a/github/data/train/domain_01_5.jsonl.zst b/github/data/train/domain_01_5.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..0ddb83e4219efbd951744f73932b187335f6cd7b --- /dev/null +++ b/github/data/train/domain_01_5.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7834886cd73efffeb91d0d110d9fe8dfc14fc9d255827e3ef893c3408379307e +size 11562342 diff --git a/github/data/train/domain_01_50.jsonl.zst b/github/data/train/domain_01_50.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..54288ba6c011abea0a57afb68cf0a3e7756dc068 --- /dev/null +++ b/github/data/train/domain_01_50.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2617fadb9a7241bac3d660f8c26ec32547404f5bac093d83d5235f634bea67a2 +size 11244951 diff --git a/github/data/train/domain_01_51.jsonl.zst b/github/data/train/domain_01_51.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a6d238b9b25e79e9bf66f2a057bb8e3229df4642 --- /dev/null +++ b/github/data/train/domain_01_51.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ad8de5a8e30c5790769aebe098f739159b0421b3f001e80c9a140297eaeaabd +size 11918471 diff --git a/github/data/train/domain_01_52.jsonl.zst b/github/data/train/domain_01_52.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..fe4b52e23595f153263cbc55ddcbb86d859b3a0f --- /dev/null +++ b/github/data/train/domain_01_52.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ea22aaad83067ad7cac656b3d664c2ea22235e39e5bd0d23bb250565d97b239 +size 11477271 diff --git a/github/data/train/domain_01_53.jsonl.zst b/github/data/train/domain_01_53.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d2b2f862a2b8e34271f815d51cbbcf2ce2322c57 --- /dev/null +++ b/github/data/train/domain_01_53.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a38b0edc6a5c7fadb15e886a6a675a5c088e8e9dadbae48c2eba9d586832202d +size 11846738 diff --git a/github/data/train/domain_01_54.jsonl.zst b/github/data/train/domain_01_54.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..19f694e149351b392aa68decb0727cfa85be8fbe --- /dev/null +++ b/github/data/train/domain_01_54.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:025a8c63a3c25eedc24334af67323173cdca6bce9ec3d9343181307725445061 +size 11946423 diff --git a/github/data/train/domain_01_55.jsonl.zst b/github/data/train/domain_01_55.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..61e1672d5ebf91a6db646f48b99f0cc05da39154 --- /dev/null +++ b/github/data/train/domain_01_55.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39301d6ab3865b4dbd6793c1a01ac8623f3096a73e8771c215e19937f29b3f3a +size 11490563 diff --git a/github/data/train/domain_01_56.jsonl.zst b/github/data/train/domain_01_56.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..9bcc2f5c9f18cf0fc2a81cc5b0dee39ac2293b9a --- /dev/null +++ b/github/data/train/domain_01_56.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5568a10872e290ccae0e246c9fce2f0ef2e39b1191e110b32818483e20c31824 +size 11795316 diff --git a/github/data/train/domain_01_57.jsonl.zst b/github/data/train/domain_01_57.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..cd61e5f9a3b20a2630140a3af712758fbd1c1d71 --- /dev/null +++ b/github/data/train/domain_01_57.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1ac302d3c25729500063ce0eb878bbc4045891deae69a3cb3e8af745f0fe05e +size 11428093 diff --git a/github/data/train/domain_01_58.jsonl.zst b/github/data/train/domain_01_58.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..4f41c25d02df7cfce39e32383c467127f39ceff6 --- /dev/null +++ b/github/data/train/domain_01_58.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af18b207a693f8c64ba75c3398d10ac9d88577eed828f2cd5fdb7bc2706b68aa +size 11386949 diff --git a/github/data/train/domain_01_59.jsonl.zst b/github/data/train/domain_01_59.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..60593046793f29537f2aa0990f1a7600724dad90 --- /dev/null +++ b/github/data/train/domain_01_59.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fb6e72e11aca7887619b7d36bbb140bf1a608792c3627df4c3567783ded1592 +size 11941645 diff --git a/github/data/train/domain_01_6.jsonl.zst b/github/data/train/domain_01_6.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d7764a8947e1901d4e764e4c099e2a933f498c7a --- /dev/null +++ b/github/data/train/domain_01_6.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:303de6ec6c41b17b6a26a49e4e99e2630c64e7de8f434d4ced49240703c824f4 +size 11506081 diff --git a/github/data/train/domain_01_60.jsonl.zst b/github/data/train/domain_01_60.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..872fb0fe70e99f069626300cff4dfb5b09d2002b --- /dev/null +++ b/github/data/train/domain_01_60.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2a786e44f24279d0287d37f8528397e14df9a9498aa8bc1863d4124eb3be35c +size 11466147 diff --git a/github/data/train/domain_01_61.jsonl.zst b/github/data/train/domain_01_61.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..11d1345bf5e9761140dc78db95a943c0853ec5bd --- /dev/null +++ b/github/data/train/domain_01_61.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e77c48090b1e068a8df29f2f52c8c79bb609a2eef1d963fc09cd5a0ba9129b52 +size 11954769 diff --git a/github/data/train/domain_01_62.jsonl.zst b/github/data/train/domain_01_62.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..36d78c098dc5b7ae718cc5ad782ad49cfc4bb7d9 --- /dev/null +++ b/github/data/train/domain_01_62.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:568c5b7f4b84255850a6e27cb17f2fdf4b3ce86223ed75bfc9dfdfcf0059eec4 +size 11997815 diff --git a/github/data/train/domain_01_63.jsonl.zst b/github/data/train/domain_01_63.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..bc17dd71c09b667dcc67676a13f0d507963d593b --- /dev/null +++ b/github/data/train/domain_01_63.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d5137671d30729ab6d88fdaf576305ae11a9b420aa92e38707f6d3ed6872d96 +size 11997425 diff --git a/github/data/train/domain_01_64.jsonl.zst b/github/data/train/domain_01_64.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..1947aaa769afcc36c26e6605aa6598be975b5922 --- /dev/null +++ b/github/data/train/domain_01_64.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64efb4a7bf3f511c7b637ffec56c43e6434aa213af611c6d1d4715005bd74501 +size 11846406 diff --git a/github/data/train/domain_01_65.jsonl.zst b/github/data/train/domain_01_65.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..84a9ef1ba4c11d51141ec51cb51bcbb94b3ceb47 --- /dev/null +++ b/github/data/train/domain_01_65.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca3977e90e71cf3fbab841cec96ad158504cfce6265e2a9f7f8f3fbbfc9b54b6 +size 12146210 diff --git a/github/data/train/domain_01_66.jsonl.zst b/github/data/train/domain_01_66.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..61ec0322331da0271788910684804bd7d4c1175b --- /dev/null +++ b/github/data/train/domain_01_66.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ac1a999d9b9f3623456df7260b05c2044225a5d7453f54af88ad8776fe2db7 +size 11717087 diff --git a/github/data/train/domain_01_67.jsonl.zst b/github/data/train/domain_01_67.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..6d35cf9a12af230707fe1c6a5cc7a73ce270c73d --- /dev/null +++ b/github/data/train/domain_01_67.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb75357c730f4bbd91abb7b66dfce4065241521ec3169f51171cc1ec901f505 +size 12052631 diff --git a/github/data/train/domain_01_68.jsonl.zst b/github/data/train/domain_01_68.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ff165078d2174c85a0d2314807fd9806c070aa97 --- /dev/null +++ b/github/data/train/domain_01_68.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa2ade541cb65bf53c513fd4ccee30b05f2c33030a20e56af2eadbe9182ea8fb +size 11265037 diff --git a/github/data/train/domain_01_69.jsonl.zst b/github/data/train/domain_01_69.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..9baabd90daa27242a2a5190f2524d7fce64d30b8 --- /dev/null +++ b/github/data/train/domain_01_69.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561a1b8aa9967e79b97a3f03dba8922cd6b872751d2ecf55998b75db3aa33d9c +size 11349341 diff --git a/github/data/train/domain_01_7.jsonl.zst b/github/data/train/domain_01_7.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f7714267b5bcecea5e265fae510683565396f943 --- /dev/null +++ b/github/data/train/domain_01_7.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81d745fef7853defe038ea7dd367b3112ba8ac34d3e56cfbb944ca6b9a32078c +size 11299183 diff --git a/github/data/train/domain_01_70.jsonl.zst b/github/data/train/domain_01_70.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..4189b4887afe65bd4677afd75059a51d905c2561 --- /dev/null +++ b/github/data/train/domain_01_70.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce16c4d6042ff0b6184575f6c1fa5a4f4963e21fec6cef76dc3c1d6782175ce2 +size 2557427 diff --git a/github/data/train/domain_01_8.jsonl.zst b/github/data/train/domain_01_8.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..4c6914bd61536589f142c754d5f99cf1c8a8e91a --- /dev/null +++ b/github/data/train/domain_01_8.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f41c496162b11f8c94f641ea05c2b9b6ceacebadb1f5d6e3a672c1a84d0d572 +size 11862280 diff --git a/github/data/train/domain_01_9.jsonl.zst b/github/data/train/domain_01_9.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..facfbcd8d1a8e7060724da2772b5dffb9c4e7cd1 --- /dev/null +++ b/github/data/train/domain_01_9.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b33845c6c14b273f8847267f49d6b71c2ed3aae203d55d3e17ba389c5d026672 +size 11365158 diff --git a/github/data/train/pile_01_23.jsonl.zst b/github/data/train/pile_01_23.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..740102fb2cdae309bef1427f76d27a70d2341608 --- /dev/null +++ b/github/data/train/pile_01_23.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4b3996400b605f62a09ae2e02851455bcc4e57c125e38fcddb6f381ae62baf1 +size 217613746 diff --git a/github/data/train/pile_01_37.jsonl.zst b/github/data/train/pile_01_37.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f3d983d1f132ad412d96438273dc96c6d2ed0f2e --- /dev/null +++ b/github/data/train/pile_01_37.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbf4d3fe8c1768b1d70323af586737f3d6adb998b82973f5aaa14a85a0ed9d44 +size 227357304 diff --git a/github/data/train/pile_01_40.jsonl.zst b/github/data/train/pile_01_40.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..1a805dc775c67c21263fbef5e311fc99b272f783 --- /dev/null +++ b/github/data/train/pile_01_40.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2767a1eea6879c4ed9a626de39039017e682ba30f65444e680dd328224fc8ca5 +size 212730917 diff --git a/github/data/train/pile_01_54.jsonl.zst b/github/data/train/pile_01_54.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..fd6dceb1532c56557f55274c69e94c29a7762806 --- /dev/null +++ b/github/data/train/pile_01_54.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dda222798e44ee2a2fcbe56761f5c578b0dac0ebf4880b8c81fc21d8bf7ddd82 +size 215886741 diff --git a/github/data/train/pile_01_59.jsonl.zst b/github/data/train/pile_01_59.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..c0c9dd73cd52b2e2130cc2e791a73cb9103c0fab --- /dev/null +++ b/github/data/train/pile_01_59.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d7f9643bfc78bab56a426f82d298f46e2554b62e2ee905461626b62a8d22b18 +size 212864050 diff --git a/github/data/train/pile_01_6.jsonl.zst b/github/data/train/pile_01_6.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..49f18100660d2273a9a11777b86ccd14d33b9aed --- /dev/null +++ b/github/data/train/pile_01_6.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0b810775e7a78ab25630a7977a86d9229af0fd35c77f36bfe14a44ce5e986a3 +size 214674937 diff --git a/github/data/train/pile_01_61.jsonl.zst b/github/data/train/pile_01_61.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..e8aa5e13b22b028147c9fe99c57cef4ccb1d6a48 --- /dev/null +++ b/github/data/train/pile_01_61.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c1497324d6f99464b389fad349b79aa330b993a2ef791090c7601a96411441 +size 208976819 diff --git a/github/data/val/domain_val_0.jsonl.zst b/github/data/val/domain_val_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..2c13125b3d5e0fcb6bd78a9e55af8193d7620cf6 --- /dev/null +++ b/github/data/val/domain_val_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90969548159b86d47fd0ff405c72cad983626955552b8403956fa647b8011825 +size 11065653 diff --git a/github/data/val/domain_val_1.jsonl.zst b/github/data/val/domain_val_1.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..e87d791ab9ce19945444502355029c470cf8967a --- /dev/null +++ b/github/data/val/domain_val_1.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:195d5eb269bb71194fa38a3f12cee5548e0ad36b8f5225d0664d83d33fe565f6 +size 11635081 diff --git a/github/data/val/domain_val_2.jsonl.zst b/github/data/val/domain_val_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d5e71b832e5ffb8cda9a2a04a80819d622590000 --- /dev/null +++ b/github/data/val/domain_val_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:174eb75c300c64025f2f0623e79f174de0f4cece0d05ccd56843c702514fadae +size 1689896 diff --git a/github/data/val/pile_val_1.jsonl.zst b/github/data/val/pile_val_1.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d46e941637d545e53145465bc9fc1c52a2094640 --- /dev/null +++ b/github/data/val/pile_val_1.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7b9a30d1b42bd49e76a0ae8dbe6f2bc5224f8cd99105c671870d86befcc0f02 +size 40125669 diff --git a/github/data/val/pile_val_2.jsonl.zst b/github/data/val/pile_val_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..c903da555a630270949881f735bce6d78752ff00 --- /dev/null +++ b/github/data/val/pile_val_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7c089eac9379dbc211b8e1e40f7e27ab4dcede3830eeea6a3d2cfa456bdfa63 +size 29966921 diff --git a/github/github.py b/github/github.py new file mode 100644 index 0000000000000000000000000000000000000000..3247ba22322fdc4a4535403e1f089e7bc2c6df7b --- /dev/null +++ b/github/github.py @@ -0,0 +1,86 @@ +import io +import json +import os +from glob import glob + +import datasets +import zstandard as zstd +from datasets import GeneratorBasedBuilder +from datasets.utils import Version +from huggingface_hub import snapshot_download + +# Requires REPO_NAME and file name to be same e.g. uspto.py +REPO_NAME = "Multi-Domain-Expert-Layers/github" + +class PileDomainDataset(GeneratorBasedBuilder): + VERSION = Version("1.0.0") + + def _info(self): + return datasets.DatasetInfo( + description="Pile Domain Dataset", + features=datasets.Features( + { + "text": datasets.Value("string"), + } + ), + supervised_keys=None, + ) + + def _split_generators(self, dl_manager): + + dl_path = snapshot_download(repo_id=REPO_NAME, repo_type="dataset") + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/train"), + "split": None, + }, + ), + datasets.SplitGenerator( + name="validation", + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/val"), + "split": None, + }, + ), + datasets.SplitGenerator( + name="validation_pile", + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/val"), + "split": "pile", + }, + ), + datasets.SplitGenerator( + name="validation_domain", + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/val"), + "split": "domain", + }, + ), + datasets.SplitGenerator( + name="test_pile", + gen_kwargs={"data_dir": os.path.join(dl_path, "data/test"), "split": "pile"}, + ), + datasets.SplitGenerator( + name="test_domain", + gen_kwargs={"data_dir": os.path.join(dl_path, "data/test"), "split": "domain"}, + ), + ] + + def _generate_examples(self, data_dir, split): + dctx = zstd.ZstdDecompressor() + idx = -1 + file_paths = glob(os.path.join(data_dir, f"*.jsonl.zst")) + if split is not None: + file_paths = [f for f in file_paths if split in f] + for file in file_paths: + with open(file, "rb") as f: + reader = dctx.stream_reader(f) + buffer = io.BufferedReader(reader) + for _, line in enumerate(buffer.readlines()): + data = json.loads(line) + idx += 1 + yield idx, data + diff --git a/uspto/.gitattributes b/uspto/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..f4f3945bd7150d3e12988485c42da1f8c29c59f8 --- /dev/null +++ b/uspto/.gitattributes @@ -0,0 +1,54 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.lz4 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +# Audio files - uncompressed +*.pcm filter=lfs diff=lfs merge=lfs -text +*.sam filter=lfs diff=lfs merge=lfs -text +*.raw filter=lfs diff=lfs merge=lfs -text +# Audio files - compressed +*.aac filter=lfs diff=lfs merge=lfs -text +*.flac filter=lfs diff=lfs merge=lfs -text +*.mp3 filter=lfs diff=lfs merge=lfs -text +*.ogg filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +# Image files - uncompressed +*.bmp filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.tiff filter=lfs diff=lfs merge=lfs -text +# Image files - compressed +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text diff --git a/uspto/README.md b/uspto/README.md new file mode 100644 index 0000000000000000000000000000000000000000..154df8298fab5ecf322016157858e08cd1bccbe1 --- /dev/null +++ b/uspto/README.md @@ -0,0 +1,3 @@ +--- +license: apache-2.0 +--- diff --git a/uspto/data/test/domain_test_0.jsonl.zst b/uspto/data/test/domain_test_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..c7a80987a7c0c94e8acdeab7339fe0dea0a40835 --- /dev/null +++ b/uspto/data/test/domain_test_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70440e0cd17c331b7bd0552deef0d40dd3718247d63c691a23ae7b2488c47064 +size 6529625 diff --git a/uspto/data/test/domain_test_1.jsonl.zst b/uspto/data/test/domain_test_1.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..e12aa0a965944b9396747a0170b1782140f83bb3 --- /dev/null +++ b/uspto/data/test/domain_test_1.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:569307f55f200de89aa96c704184ccb0ab0aa97b613a4060dd5d51ffd2133dac +size 6509042 diff --git a/uspto/data/test/domain_test_2.jsonl.zst b/uspto/data/test/domain_test_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ac2106aa2c72d5160a82fced5c8d775f93a2f8d6 --- /dev/null +++ b/uspto/data/test/domain_test_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4b870b20780fd16677d6df0b0affb6697d760cd89817aa3b81b3b7ac19e0b4b +size 970866 diff --git a/uspto/data/test/pile_test_0.jsonl.zst b/uspto/data/test/pile_test_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..984b5e917ecb8bc7f4ec1cb05394a8574bc86413 --- /dev/null +++ b/uspto/data/test/pile_test_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e433955df2cde6352bc6e3d879fc43e217924f636683f7d06203850bc9f44f +size 24110126 diff --git a/uspto/data/train/domain_01_0.jsonl.zst b/uspto/data/train/domain_01_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a3eb49386f18aa9fdb003f96d6cba6239e272b5b --- /dev/null +++ b/uspto/data/train/domain_01_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4373900c9a5910b342ebbae6aa45ba1e803a47cc4f2a7aea48ae9de94f9b8e8 +size 6503794 diff --git a/uspto/data/train/domain_01_1.jsonl.zst b/uspto/data/train/domain_01_1.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..76fea2d5304734783b825246f6571cf63687a7be --- /dev/null +++ b/uspto/data/train/domain_01_1.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a70695c68adbe576b608fcbbe2a8fc256a27bc2313bb7875fb2de5b2a84803c +size 6664613 diff --git a/uspto/data/train/domain_01_10.jsonl.zst b/uspto/data/train/domain_01_10.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..5d06f46224ecb3b428a510e953ac4e97b0846855 --- /dev/null +++ b/uspto/data/train/domain_01_10.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6a5b047181cdb018a9fe98a797d415bf28031ca843bcf519b493777c8cb09bc +size 6654548 diff --git a/uspto/data/train/domain_01_11.jsonl.zst b/uspto/data/train/domain_01_11.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..32dc674284427ff6a6daa82a4dbc47072f8caea5 --- /dev/null +++ b/uspto/data/train/domain_01_11.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15c17555ae983fad1e5cad530767cf4b279b5bb7ef95e335e5e299deb5881b05 +size 6472219 diff --git a/uspto/data/train/domain_01_12.jsonl.zst b/uspto/data/train/domain_01_12.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..9bf462d68ba2ed0d857ce7f9c9bdaf67ecc056b7 --- /dev/null +++ b/uspto/data/train/domain_01_12.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f61ad85c1bbef1c612a108398e05d1d3870bb3db756919713ce6bbd6d61fb85 +size 6439713 diff --git a/uspto/data/train/domain_01_13.jsonl.zst b/uspto/data/train/domain_01_13.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a36f0a73fbe4abe62ce9d288bc57844702b6494f --- /dev/null +++ b/uspto/data/train/domain_01_13.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:115274c2f197301f24edfb003fd26288718c9c43822b78947f1317b21e404297 +size 6578666 diff --git a/uspto/data/train/domain_01_14.jsonl.zst b/uspto/data/train/domain_01_14.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..1044e1000abb2d6bf67e524ce796edac04c9b5e9 --- /dev/null +++ b/uspto/data/train/domain_01_14.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06d5695c51a325e007f02445230e9606cca4cf76a6fda4d68bd464be3668f88d +size 6592106 diff --git a/uspto/data/train/domain_01_15.jsonl.zst b/uspto/data/train/domain_01_15.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..8c146272d1b037e0605a9e4e80ae41dfdd7023c0 --- /dev/null +++ b/uspto/data/train/domain_01_15.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c418ead19cf2d757186854e95acbcefcea9e706f75d8990c5c970807c4ad202 +size 6563978 diff --git a/uspto/data/train/domain_01_16.jsonl.zst b/uspto/data/train/domain_01_16.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..e66e14f8b5e7fef57cf067e1f07d218c6c89e06c --- /dev/null +++ b/uspto/data/train/domain_01_16.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5412b0e69e110664bc1da22800ea98958f80b0af7d81c1763f7d78fe9eb0d5ef +size 6878956 diff --git a/uspto/data/train/domain_01_17.jsonl.zst b/uspto/data/train/domain_01_17.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a5d374ebe4009809be2b738485c8a896eb65d150 --- /dev/null +++ b/uspto/data/train/domain_01_17.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a02c907d55417860d19b13f44c20fc146b44b21ed4a2e9f3930365ea0f0165db +size 6717802 diff --git a/uspto/data/train/domain_01_18.jsonl.zst b/uspto/data/train/domain_01_18.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d976792a78bdcf62a34b218fe3528e7ffc2889d0 --- /dev/null +++ b/uspto/data/train/domain_01_18.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77ded6a014557f72151e96b867ea7e88b870cf32f314813e00c2ccc2dc4d34f +size 6531196 diff --git a/uspto/data/train/domain_01_19.jsonl.zst b/uspto/data/train/domain_01_19.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..b6354d7350a7d2fa2e049128c94bdd08d2f46dd4 --- /dev/null +++ b/uspto/data/train/domain_01_19.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6603b6f52a4580a9c991061fd7ac84b6b3709ed7110be7e552deb90a8e6c5e98 +size 6636666 diff --git a/uspto/data/train/domain_01_2.jsonl.zst b/uspto/data/train/domain_01_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..8dc5e4eff013bb2300a73e8b8babd78ca1891b03 --- /dev/null +++ b/uspto/data/train/domain_01_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65afbb2e0ed14cc9b914771e8dd8393074e6ec66b51b84786c3d5053f8569bef +size 6603627 diff --git a/uspto/data/train/domain_01_20.jsonl.zst b/uspto/data/train/domain_01_20.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d8e425f9b94296afe6ac633b99fb10e7c0a3c56e --- /dev/null +++ b/uspto/data/train/domain_01_20.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17e2e7e5b640e73664d2ddfe51d90d38f73d292ab1e86059b4f38b5f2c5a6dc8 +size 6510463 diff --git a/uspto/data/train/domain_01_21.jsonl.zst b/uspto/data/train/domain_01_21.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..5cecc48e1969bf88c2ecad0ad6e6eea36de4002d --- /dev/null +++ b/uspto/data/train/domain_01_21.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:952b0d2eff25a127b6529d790dad8a97cfc779ad2a96a139d0f0eb259d2297ba +size 6677585 diff --git a/uspto/data/train/domain_01_22.jsonl.zst b/uspto/data/train/domain_01_22.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..436d8116a0f3126546c6b798d55742fad018b314 --- /dev/null +++ b/uspto/data/train/domain_01_22.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da8574ea0857de0c0b83476f701405378d29b8d43548be064c54b0e0d66101c7 +size 6649582 diff --git a/uspto/data/train/domain_01_23.jsonl.zst b/uspto/data/train/domain_01_23.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..1a3dcfafe06d93ae795babde14c2953568ade77d --- /dev/null +++ b/uspto/data/train/domain_01_23.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f020871fdbf00e78198d7de18b12f4b56082c151479a3274e79cfb84f6a0ad1e +size 6488726 diff --git a/uspto/data/train/domain_01_24.jsonl.zst b/uspto/data/train/domain_01_24.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..8a9bea89bc95b20deac7a883369aebdb4d4c715b --- /dev/null +++ b/uspto/data/train/domain_01_24.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fd85bb47430e6e1d15e2fda83c340ddb6c7d7ffe5c64ea47f0b8fdb1f5521e3 +size 6895152 diff --git a/uspto/data/train/domain_01_25.jsonl.zst b/uspto/data/train/domain_01_25.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..527aae21bacccfc129ba6b3c4775d67fd82dcea1 --- /dev/null +++ b/uspto/data/train/domain_01_25.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3661b19111a2220158fb111a88ad886cfeec78c7a1110457bbbafe62bfa559e +size 6752129 diff --git a/uspto/data/train/domain_01_26.jsonl.zst b/uspto/data/train/domain_01_26.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..5123eed6e8b49fe62b9c5b4b171a252ba1311441 --- /dev/null +++ b/uspto/data/train/domain_01_26.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86a4cdb5dcd1a3325fcef89c7c265e38502076c3d87f22749b7be7f23c49f6ae +size 6413566 diff --git a/uspto/data/train/domain_01_27.jsonl.zst b/uspto/data/train/domain_01_27.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f941e3d9ce3961da33cdb5a432e0ff85730e533f --- /dev/null +++ b/uspto/data/train/domain_01_27.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1c37f0e0ed6793c5d6741e6a051eb4da2de8de07c93e022eb37ec917424cef +size 6621363 diff --git a/uspto/data/train/domain_01_28.jsonl.zst b/uspto/data/train/domain_01_28.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..48d35a8bed16edbdd562b38d83417edae7a3dae2 --- /dev/null +++ b/uspto/data/train/domain_01_28.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b0d5577643f89889ec8b9dbb6c171ab343c1f6d809a3889a1959bd4dac0254a +size 6825622 diff --git a/uspto/data/train/domain_01_29.jsonl.zst b/uspto/data/train/domain_01_29.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..c4e4e5c30da9d452c0ece7ee34e2aa0fe85b7bb5 --- /dev/null +++ b/uspto/data/train/domain_01_29.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7264ee84bf6d22529c96c40372fc68b0f5ecbc1ae6bfc1fcc46042fa79237ca +size 6369166 diff --git a/uspto/data/train/domain_01_3.jsonl.zst b/uspto/data/train/domain_01_3.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..6f72a072c9baca87c8d00dcc74e9c60384976e68 --- /dev/null +++ b/uspto/data/train/domain_01_3.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fac52f8529b4a72e3288a1400ba2b4b73c853e7c6c957459dd75e0bf8fe2633b +size 6531858 diff --git a/uspto/data/train/domain_01_30.jsonl.zst b/uspto/data/train/domain_01_30.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..214ac6fe97247a2f8f84e52317924d07ccc4d04f --- /dev/null +++ b/uspto/data/train/domain_01_30.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d96c501fe624bd3f8e32c905cff882e46ef6809b54fb82b305db367fef9eef +size 6631560 diff --git a/uspto/data/train/domain_01_31.jsonl.zst b/uspto/data/train/domain_01_31.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ba92987b1f2e15205a641dbd0398bbe826cdfd71 --- /dev/null +++ b/uspto/data/train/domain_01_31.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8117a599dea6baa56bba3d6bf3435bce9057fa456171160bb195da83353c64ae +size 6841034 diff --git a/uspto/data/train/domain_01_32.jsonl.zst b/uspto/data/train/domain_01_32.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..004844acec19300d28c50e431f664c74d85e3d92 --- /dev/null +++ b/uspto/data/train/domain_01_32.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45e8bc5d363b480e9a7679695711729fe003b49582b99f4c2a19e3be60343284 +size 6612770 diff --git a/uspto/data/train/domain_01_33.jsonl.zst b/uspto/data/train/domain_01_33.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..100e6a1a5deb52127ed11e9391db7a8ba9dca595 --- /dev/null +++ b/uspto/data/train/domain_01_33.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b01fc2d93bb08bbcd554b17cb1b03189a6c307df7dd99c29a6f11d81a3a760e1 +size 6738840 diff --git a/uspto/data/train/domain_01_34.jsonl.zst b/uspto/data/train/domain_01_34.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..13b9a62a90ed2f0bb598750b4a94204b9eee936b --- /dev/null +++ b/uspto/data/train/domain_01_34.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:836ba379275f8013f3c91715fd6906fbeca15143419083c14efed947c1c25420 +size 6463490 diff --git a/uspto/data/train/domain_01_35.jsonl.zst b/uspto/data/train/domain_01_35.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..304348d93115486926296c2a07783f7ebc29f739 --- /dev/null +++ b/uspto/data/train/domain_01_35.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:443c652c8667e6c564c88e868d8df70d5c068f5026b173573b82fcf5051c9f76 +size 6566123 diff --git a/uspto/data/train/domain_01_36.jsonl.zst b/uspto/data/train/domain_01_36.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..50fcc6c0e4305800176e8559457df89e88afc7fa --- /dev/null +++ b/uspto/data/train/domain_01_36.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0599af88189640b769a5d4d4dbfdcbb6a17569102bbc46fadce33f615d6d2d06 +size 6794337 diff --git a/uspto/data/train/domain_01_37.jsonl.zst b/uspto/data/train/domain_01_37.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f566a7d1b47a6085f14a90f075ec6f9c0fdef873 --- /dev/null +++ b/uspto/data/train/domain_01_37.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cf893a66d451ffd31fe5aac7cfcc1e14caf008e9706624dc6cd01e48c2d0b8b +size 6613158 diff --git a/uspto/data/train/domain_01_38.jsonl.zst b/uspto/data/train/domain_01_38.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..707d5a9a6b88dcd0d9789545f99609ddf4c33226 --- /dev/null +++ b/uspto/data/train/domain_01_38.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a75c1ca5a1e3653d7ed6aacb1282e2dafd4d1fbea250e34d2a4f9008e7db1b57 +size 6802060 diff --git a/uspto/data/train/domain_01_39.jsonl.zst b/uspto/data/train/domain_01_39.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..da2cc23c1069cda90e8bb00f468c8373ab823339 --- /dev/null +++ b/uspto/data/train/domain_01_39.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a13b3ee536c53e10b4dc6e1eb107643d1146d0cfbcaff695caf6d2e07ee19b9 +size 6569756 diff --git a/uspto/data/train/domain_01_4.jsonl.zst b/uspto/data/train/domain_01_4.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d2fa48f9bc1770391a396d9b1bb698af8a7b80b7 --- /dev/null +++ b/uspto/data/train/domain_01_4.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bad1884e1386a41d0d29b71af6a5fd85bec2e12112b73828fe4de6209140e11d +size 6909789 diff --git a/uspto/data/train/domain_01_40.jsonl.zst b/uspto/data/train/domain_01_40.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d3a3b634f0e664cc891a105f5da36aaf9102a290 --- /dev/null +++ b/uspto/data/train/domain_01_40.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42758786690d49a3885183f39406a46d1f6fc33d23f5d8be4869186ec326b5e +size 6543524 diff --git a/uspto/data/train/domain_01_41.jsonl.zst b/uspto/data/train/domain_01_41.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..67b9db54b8127e555e03997402e049af097de1d8 --- /dev/null +++ b/uspto/data/train/domain_01_41.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdfeb204ab579ce366a204fd5b871bbcce13917eac135ba490c4003a6773c897 +size 6415953 diff --git a/uspto/data/train/domain_01_42.jsonl.zst b/uspto/data/train/domain_01_42.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..8a6d3d56cbf79154d7a441986c4ba0a745ac66ba --- /dev/null +++ b/uspto/data/train/domain_01_42.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7500593a40630a9978a45845d1c0ee732df5c00c235ee13fe12fdbfa205afd68 +size 6553483 diff --git a/uspto/data/train/domain_01_43.jsonl.zst b/uspto/data/train/domain_01_43.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..523d05055e1e0572f0da769438b8bcb79265db78 --- /dev/null +++ b/uspto/data/train/domain_01_43.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3315c9278508b1cf0d4f76dd18da9770f5b148b533f9ebf2b34deafa0767cf5 +size 6870200 diff --git a/uspto/data/train/domain_01_44.jsonl.zst b/uspto/data/train/domain_01_44.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..e02fad244fb7c52fba9bf0dcd4397381eead45c8 --- /dev/null +++ b/uspto/data/train/domain_01_44.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f7315594770198fe9bc1c5b22f4d8f8088b27ee3294e32e487007db8a4eebe0 +size 6698688 diff --git a/uspto/data/train/domain_01_45.jsonl.zst b/uspto/data/train/domain_01_45.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ee11507e796671fa2f3152813a86e9d6f5c724e4 --- /dev/null +++ b/uspto/data/train/domain_01_45.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a8b6c8b1c3b631fe28938f6627fb7a3441b9c2e648a4ff3ba450ece99a99d42 +size 6676426 diff --git a/uspto/data/train/domain_01_46.jsonl.zst b/uspto/data/train/domain_01_46.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..b81de26dd2b20f0e8b1a191c1db3113b7ea7400e --- /dev/null +++ b/uspto/data/train/domain_01_46.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:655f871226a1b8e0adbbb14e4a2f4213f961075f309e6fb9e5f2a1e6a92b35a7 +size 6505400 diff --git a/uspto/data/train/domain_01_47.jsonl.zst b/uspto/data/train/domain_01_47.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..241e2ca1fbb96794524689a54c447e7d17316ceb --- /dev/null +++ b/uspto/data/train/domain_01_47.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8762a4c05618397e5da527a872edf17ddaccbb8f1a07f23dd21dc263858274d +size 6864439 diff --git a/uspto/data/train/domain_01_48.jsonl.zst b/uspto/data/train/domain_01_48.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..61dadb45e6e6be1fbac7c5b812eaf8569fe977e0 --- /dev/null +++ b/uspto/data/train/domain_01_48.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:952039db7edd670571d534f58966790631fe90c60a46898a9cd7e49e3cdfbd10 +size 6484838 diff --git a/uspto/data/train/domain_01_49.jsonl.zst b/uspto/data/train/domain_01_49.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..1c5c2c1430c3bcfb45228a531e223d4b5f76de0c --- /dev/null +++ b/uspto/data/train/domain_01_49.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb10033fc607d7da67cbf2afa21761e61e403d730d4af6ab0068e455be261bde +size 6503819 diff --git a/uspto/data/train/domain_01_5.jsonl.zst b/uspto/data/train/domain_01_5.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f40fa36c95c29d044b05d391b367256221f70b8a --- /dev/null +++ b/uspto/data/train/domain_01_5.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f49e4c013b7ee2a7ff0eccd7dd2a469573a67f2502a1d47f18fac14348de914d +size 6676280 diff --git a/uspto/data/train/domain_01_50.jsonl.zst b/uspto/data/train/domain_01_50.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..77669490cdbb3948ade695a277161152ddd760a0 --- /dev/null +++ b/uspto/data/train/domain_01_50.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04a271dd973d773f23bda0f84ec4efb24d7dc4dbe9ddfc12897b61ace6fdb516 +size 6557576 diff --git a/uspto/data/train/domain_01_51.jsonl.zst b/uspto/data/train/domain_01_51.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..25d459dd461f942e69b6e1b9ce7fad93e8891a2d --- /dev/null +++ b/uspto/data/train/domain_01_51.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e500fb2b7df5f2e7dac009c7cf0716bb2c0b40bec79be13656a19ac7fc792d8 +size 6633722 diff --git a/uspto/data/train/domain_01_52.jsonl.zst b/uspto/data/train/domain_01_52.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..7752cceb8f89a88f8d44660a1adaf901e579cc3d --- /dev/null +++ b/uspto/data/train/domain_01_52.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b93a887180719c4512324cbfedb09c3f924ec5ee6ea6e2aab7510bd30e1a4f70 +size 6307211 diff --git a/uspto/data/train/domain_01_53.jsonl.zst b/uspto/data/train/domain_01_53.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..24b606da851968b169eaa36a75851e084a83f097 --- /dev/null +++ b/uspto/data/train/domain_01_53.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b58f78fcd053afc79611e285b79e1aa33122ab3aa4e91e2fc75dc34cb00b2795 +size 6677383 diff --git a/uspto/data/train/domain_01_54.jsonl.zst b/uspto/data/train/domain_01_54.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..acc77d3e07252b4489759e3c9735e41934cb32c0 --- /dev/null +++ b/uspto/data/train/domain_01_54.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff412ecfd0c0bfe65267624115e19f53cd74a47ba482a5903056bd9508b81e95 +size 6468118 diff --git a/uspto/data/train/domain_01_55.jsonl.zst b/uspto/data/train/domain_01_55.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a8eedf1bddcdbffa9da29161d7bef2d463616a0a --- /dev/null +++ b/uspto/data/train/domain_01_55.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2691145ddc04e880501889f9f244bf4692bb6d05dcc2dfa8ad0b82bb3ad005 +size 6615621 diff --git a/uspto/data/train/domain_01_56.jsonl.zst b/uspto/data/train/domain_01_56.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a51ad621f157825eabc9f3a94d4c6d82f16e938b --- /dev/null +++ b/uspto/data/train/domain_01_56.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e52560dc42f0e5d61d7dfc639f14e4a4b47e6df06eb000dc7ad676c6e5a0542 +size 6533862 diff --git a/uspto/data/train/domain_01_57.jsonl.zst b/uspto/data/train/domain_01_57.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..88eb89d198b41bfde8598743bc1df26e1c212ca6 --- /dev/null +++ b/uspto/data/train/domain_01_57.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a99f543365b59d88e4c41eedf50688666403df760e729b11bc476bc24ff2842 +size 6646719 diff --git a/uspto/data/train/domain_01_58.jsonl.zst b/uspto/data/train/domain_01_58.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a3c8b17a3a0065ba027caa7b8247cd01cb128230 --- /dev/null +++ b/uspto/data/train/domain_01_58.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3785f2d71a13c3736a2fb7c04302ef275d0e4bde88cd9b3c8db69ee293d27e2 +size 6775846 diff --git a/uspto/data/train/domain_01_59.jsonl.zst b/uspto/data/train/domain_01_59.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a71e596758dc94ac3e42e09ca4586105c3543727 --- /dev/null +++ b/uspto/data/train/domain_01_59.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e5fd6e2441d4263e54ecfc22376b6be880e1a95c06e0a26df5b37e53156181a +size 6763011 diff --git a/uspto/data/train/domain_01_6.jsonl.zst b/uspto/data/train/domain_01_6.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ca784f9cac0a96c4b4034a7e57885db3f91dc149 --- /dev/null +++ b/uspto/data/train/domain_01_6.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33cf66463b2d799331fea4b44effa183b04831f81f03f5f31742edb67187619a +size 6723011 diff --git a/uspto/data/train/domain_01_60.jsonl.zst b/uspto/data/train/domain_01_60.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..0efb9064abc29ac9f850b7a2867a96229dafb922 --- /dev/null +++ b/uspto/data/train/domain_01_60.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10793826773f8baab84ffcdf8d7b44f0bb223b0739b839e0945d157ea9cbe755 +size 6584767 diff --git a/uspto/data/train/domain_01_61.jsonl.zst b/uspto/data/train/domain_01_61.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ebd3f6ec36819f6b6c0b0251659d3e0af87f5cdb --- /dev/null +++ b/uspto/data/train/domain_01_61.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3ba6bc1a15ecc70a30c90170355c2bb53328b2ffb5b4fd8bb53725d338de34e +size 6569560 diff --git a/uspto/data/train/domain_01_62.jsonl.zst b/uspto/data/train/domain_01_62.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..1ec3d26fee311fa370cc76c18ee63870925c2614 --- /dev/null +++ b/uspto/data/train/domain_01_62.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11317ca400cabc29e849ccd7ae20973a844e7e487a171f4f296b68601e8ba1a5 +size 6548412 diff --git a/uspto/data/train/domain_01_63.jsonl.zst b/uspto/data/train/domain_01_63.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..291a060e259fa941530cf5728c4344a480adc929 --- /dev/null +++ b/uspto/data/train/domain_01_63.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8af0e0ceeed39752cb39c4f7bfbb7449a464ab76f16cfd612124c1630677433 +size 6721988 diff --git a/uspto/data/train/domain_01_64.jsonl.zst b/uspto/data/train/domain_01_64.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..3a73c8b4b4acdf260be5ed652e1c1e8d18e4ba83 --- /dev/null +++ b/uspto/data/train/domain_01_64.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0be9407208128a4a99f18b90c476fcafc40400ea9f51494840365cd77b0e8f +size 6492909 diff --git a/uspto/data/train/domain_01_65.jsonl.zst b/uspto/data/train/domain_01_65.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..5bc7bccc6b79d4ff37cf9c9bef64942cf6abd379 --- /dev/null +++ b/uspto/data/train/domain_01_65.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c451c12dabe1a47389783e841a5a43d5c4e246f686b7f88303dc5d6bb129a61 +size 6624750 diff --git a/uspto/data/train/domain_01_66.jsonl.zst b/uspto/data/train/domain_01_66.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d8d4a2f5c0ad03bb843e8ae70d2c61d6fe2d2534 --- /dev/null +++ b/uspto/data/train/domain_01_66.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee87795a67d80e09671b922e000f2c1f50c329dc15166bdbf7c8abd63f228e3c +size 6278176 diff --git a/uspto/data/train/domain_01_67.jsonl.zst b/uspto/data/train/domain_01_67.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..ae5d5324b4044e1814acd35f6dcff24cee0da57c --- /dev/null +++ b/uspto/data/train/domain_01_67.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf6f484acd82489368428c569b5734cc95e18c9240fc95c488f111cecdaebc9d +size 6663933 diff --git a/uspto/data/train/domain_01_68.jsonl.zst b/uspto/data/train/domain_01_68.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..34798cb3fc4a210241f8090cef974ef795de5ba7 --- /dev/null +++ b/uspto/data/train/domain_01_68.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3727749577529e139398811df1cfb8c2feefb53ad5fc8fa9bfed27bdfba8a70e +size 6590031 diff --git a/uspto/data/train/domain_01_69.jsonl.zst b/uspto/data/train/domain_01_69.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..3a64be6890ecfec57b344237790e09a92903bc9b --- /dev/null +++ b/uspto/data/train/domain_01_69.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a810e42e4829a1f32a454368e91297430fdccac1539bc91c9b33becd2fffaa80 +size 6398434 diff --git a/uspto/data/train/domain_01_7.jsonl.zst b/uspto/data/train/domain_01_7.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..91cef55fb2453fc40416611af0ec2d1e5b1fd8f8 --- /dev/null +++ b/uspto/data/train/domain_01_7.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f8e6c8e7760e502d1143788a97a7c0cfab353995cddc14645e967de62a55a1 +size 6832906 diff --git a/uspto/data/train/domain_01_70.jsonl.zst b/uspto/data/train/domain_01_70.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..20e17cbe2a60d68db6616605a7c7deb39cf2c747 --- /dev/null +++ b/uspto/data/train/domain_01_70.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1f0a0c4f939e3acf0fd7d7adde87a74005d25a3b57a732638a1a6ba8cece36a +size 1306520 diff --git a/uspto/data/train/domain_01_8.jsonl.zst b/uspto/data/train/domain_01_8.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..d703424a4c802d2eed02a79dc90c097b6994c184 --- /dev/null +++ b/uspto/data/train/domain_01_8.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78e0c251ecfdd620506193f21dc32f737c635fc034f5224080fd94c2827fa751 +size 6540522 diff --git a/uspto/data/train/domain_01_9.jsonl.zst b/uspto/data/train/domain_01_9.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..9a1b5bcd4842692f2ee797d3164861d72f02792f --- /dev/null +++ b/uspto/data/train/domain_01_9.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2ed1bb89fd776487fd171f29b8fa91802522ac428c4c63962311b90e607ab91 +size 6623820 diff --git a/uspto/data/train/pile_01_37.jsonl.zst b/uspto/data/train/pile_01_37.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..f3d983d1f132ad412d96438273dc96c6d2ed0f2e --- /dev/null +++ b/uspto/data/train/pile_01_37.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbf4d3fe8c1768b1d70323af586737f3d6adb998b82973f5aaa14a85a0ed9d44 +size 227357304 diff --git a/uspto/data/train/pile_01_40.jsonl.zst b/uspto/data/train/pile_01_40.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..1a805dc775c67c21263fbef5e311fc99b272f783 --- /dev/null +++ b/uspto/data/train/pile_01_40.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2767a1eea6879c4ed9a626de39039017e682ba30f65444e680dd328224fc8ca5 +size 212730917 diff --git a/uspto/data/train/pile_01_54.jsonl.zst b/uspto/data/train/pile_01_54.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..fd6dceb1532c56557f55274c69e94c29a7762806 --- /dev/null +++ b/uspto/data/train/pile_01_54.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dda222798e44ee2a2fcbe56761f5c578b0dac0ebf4880b8c81fc21d8bf7ddd82 +size 215886741 diff --git a/uspto/data/train/pile_01_61.jsonl.zst b/uspto/data/train/pile_01_61.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..e8aa5e13b22b028147c9fe99c57cef4ccb1d6a48 --- /dev/null +++ b/uspto/data/train/pile_01_61.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c1497324d6f99464b389fad349b79aa330b993a2ef791090c7601a96411441 +size 208976819 diff --git a/uspto/data/val/domain_val_0.jsonl.zst b/uspto/data/val/domain_val_0.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..0bf753fb9f9d7332566f36341e27a2ff17232c29 --- /dev/null +++ b/uspto/data/val/domain_val_0.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6dd78b137a4a4824b84f435d88e1e78974113705807917f51127fce7dc3c81a +size 6688544 diff --git a/uspto/data/val/domain_val_1.jsonl.zst b/uspto/data/val/domain_val_1.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..271a475eb52d9ef1d682f482812cca30f06d3ee6 --- /dev/null +++ b/uspto/data/val/domain_val_1.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50709f870f1e4235df502381f20bd37cf48f354c838e1e2056cf40c5fc7d0ac +size 6534715 diff --git a/uspto/data/val/domain_val_2.jsonl.zst b/uspto/data/val/domain_val_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..6703f71069e23e665761bdf2aa24d0e585f65cb7 --- /dev/null +++ b/uspto/data/val/domain_val_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3977a2ef2d2eafc37022f0f25a059a86879910fb0356d94a5aaf7adb396f7f43 +size 982958 diff --git a/uspto/data/val/pile_val_2.jsonl.zst b/uspto/data/val/pile_val_2.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..a870a4edce72ce706faeadbd07b4f764c0b6bebc --- /dev/null +++ b/uspto/data/val/pile_val_2.jsonl.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0516e81718eecff588899c9f2b174ffaed18ecb10b194648cf72d99ce4b39d88 +size 23346058 diff --git a/uspto/uspto.py b/uspto/uspto.py new file mode 100644 index 0000000000000000000000000000000000000000..33f23a71ba331fee2750b889a5bcf6074c5f2e32 --- /dev/null +++ b/uspto/uspto.py @@ -0,0 +1,86 @@ +import io +import json +import os +from glob import glob + +import datasets +import zstandard as zstd +from datasets import GeneratorBasedBuilder +from datasets.utils import Version +from huggingface_hub import snapshot_download + +# Requires REPO_NAME and file name to be same e.g. uspto.py +REPO_NAME = "Multi-Domain-Expert-Layers/uspto" + +class PileDomainDataset(GeneratorBasedBuilder): + VERSION = Version("1.0.0") + + def _info(self): + return datasets.DatasetInfo( + description="Pile Domain Dataset", + features=datasets.Features( + { + "text": datasets.Value("string"), + } + ), + supervised_keys=None, + ) + + def _split_generators(self, dl_manager): + + dl_path = snapshot_download(repo_id=REPO_NAME, repo_type="dataset") + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/train"), + "split": None, + }, + ), + datasets.SplitGenerator( + name="validation", + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/val"), + "split": None, + }, + ), + datasets.SplitGenerator( + name="validation_pile", + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/val"), + "split": "pile", + }, + ), + datasets.SplitGenerator( + name="validation_domain", + gen_kwargs={ + "data_dir": os.path.join(dl_path, "data/val"), + "split": "domain", + }, + ), + datasets.SplitGenerator( + name="test_pile", + gen_kwargs={"data_dir": os.path.join(dl_path, "data/test"), "split": "pile"}, + ), + datasets.SplitGenerator( + name="test_domain", + gen_kwargs={"data_dir": os.path.join(dl_path, "data/test"), "split": "domain"}, + ), + ] + + def _generate_examples(self, data_dir, split): + dctx = zstd.ZstdDecompressor() + idx = -1 + file_paths = glob(os.path.join(data_dir, f"*.jsonl.zst")) + if split is not None: + file_paths = [f for f in file_paths if split in f] + for file in file_paths: + with open(file, "rb") as f: + reader = dctx.stream_reader(f) + buffer = io.BufferedReader(reader) + for _, line in enumerate(buffer.readlines()): + data = json.loads(line) + idx += 1 + yield idx, data +