{ "cells": [ { "cell_type": "markdown", "id": "3f9ce240-fd1a-4550-83c7-8cf9658b1d3a", "metadata": {}, "source": [ "# Dataloading (1B+ Training Pairs)\n", "\n", "## This notebook collects and uploads all 50 relevant sentence embedding datasets to S3 as .json.gz files where each line contains one training record" ] }, { "cell_type": "code", "execution_count": 62, "id": "d7af8d0e-b3ed-4007-abdd-5952d775e119", "metadata": { "tags": [] }, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "\n", "os.chdir('/home/ec2-user/SageMaker')" ] }, { "cell_type": "code", "execution_count": 63, "id": "686699fb-d2c9-4653-afca-580aef343451", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded plugins: dkms-build-requires, extras_suggestions, langpacks, priorities,\n", " : update-motd, versionlock\n", "Cleaning repos: amzn2-core amzn2extra-docker amzn2extra-epel\n", " : amzn2extra-kernel-5.10 amzn2extra-python3.8 centos-extras\n", " : copr:copr.fedorainfracloud.org:vbatts:shadow-utils-newxidmap\n", " : docker-ce-stable libnvidia-container neuron\n", "21 metadata files removed\n", "15 sqlite files removed\n", "0 metadata files removed\n", "Loaded plugins: dkms-build-requires, extras_suggestions, langpacks, priorities,\n", " : update-motd, versionlock\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "https://download.docker.com/linux/centos/2/x86_64/stable/repodata/repomd.xml: [Errno 14] HTTPS Error 404 - Not Found\n", "Trying other mirror.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "62 packages excluded due to repository priority protections\n", "Resolving Dependencies\n", "--> Running transaction check\n", "---> Package epel-release.noarch 0:7-11 will be installed\n", "--> Finished Dependency Resolution\n", "\n", "Dependencies Resolved\n", "\n", "================================================================================\n", " Package Arch Version Repository Size\n", "================================================================================\n", "Installing:\n", " epel-release noarch 7-11 amzn2extra-epel 15 k\n", "\n", "Transaction Summary\n", "================================================================================\n", "Install 1 Package\n", "\n", "Total download size: 15 k\n", "Installed size: 24 k\n", "Downloading packages:\n", "Running transaction check\n", "Running transaction test\n", "Transaction test succeeded\n", "Running transaction\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Warning: RPMDB altered outside of yum.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Installing : epel-release-7-11.noarch 1/1 \n", " Verifying : epel-release-7-11.noarch 1/1 \n", "\n", "Installed:\n", " epel-release.noarch 0:7-11 \n", "\n", "Complete!\n", "Installing epel-release\n", " 0 ansible2 available \\\n", " [ =2.4.2 =2.4.6 =2.8 =stable ]\n", " 2 httpd_modules available [ =1.0 =stable ]\n", " 3 memcached1.5 available \\\n", " [ =1.5.1 =1.5.16 =1.5.17 ]\n", " 6 postgresql10 available [ =10 =stable ]\n", " 9 R3.4 available [ =3.4.3 =stable ]\n", " 10 rust1 available \\\n", " [ =1.22.1 =1.26.0 =1.26.1 =1.27.2 =1.31.0 =1.38.0\n", " =stable ]\n", " 18 libreoffice available \\\n", " [ =5.0.6.2_15 =5.3.6.1 =stable ]\n", " 19 gimp available [ =2.8.22 ]\n", " 20 docker=latest enabled \\\n", " [ =17.12.1 =18.03.1 =18.06.1 =18.09.9 =stable ]\n", " 21 mate-desktop1.x available \\\n", " [ =1.19.0 =1.20.0 =stable ]\n", " 22 GraphicsMagick1.3 available \\\n", " [ =1.3.29 =1.3.32 =1.3.34 =stable ]\n", " 23 tomcat8.5 available \\\n", " [ =8.5.31 =8.5.32 =8.5.38 =8.5.40 =8.5.42 =8.5.50\n", " =stable ]\n", " 24 epel=latest enabled [ =7.11 =stable ]\n", " 25 testing available [ =1.0 =stable ]\n", " 26 ecs available [ =stable ]\n", " 27 corretto8 available \\\n", " [ =1.8.0_192 =1.8.0_202 =1.8.0_212 =1.8.0_222 =1.8.0_232\n", " =1.8.0_242 =stable ]\n", " 29 golang1.11 available \\\n", " [ =1.11.3 =1.11.11 =1.11.13 =stable ]\n", " 30 squid4 available [ =4 =stable ]\n", " 32 lustre2.10 available \\\n", " [ =2.10.5 =2.10.8 =stable ]\n", " 33 java-openjdk11 available [ =11 =stable ]\n", " 34 lynis available [ =stable ]\n", " 36 BCC available [ =0.x =stable ]\n", " 37 mono available [ =5.x =stable ]\n", " 38 nginx1 available [ =stable ]\n", " 40 mock available [ =stable ]\n", " 41 postgresql11 available [ =11 =stable ]\n", " 43 livepatch available [ =stable ]\n", " 44 python3.8=latest enabled [ =stable ]\n", " 45 haproxy2 available [ =stable ]\n", " 46 collectd available [ =stable ]\n", " 47 aws-nitro-enclaves-cli available [ =stable ]\n", " 48 R4 available [ =stable ]\n", " _ kernel-5.4 available [ =stable ]\n", " 50 selinux-ng available [ =stable ]\n", " 51 php8.0 available [ =stable ]\n", " 52 tomcat9 available [ =stable ]\n", " 53 unbound1.13 available [ =stable ]\n", " 54 mariadb10.5 available [ =stable ]\n", " 55 kernel-5.10=latest enabled [ =stable ]\n", " 56 redis6 available [ =stable ]\n", " 57 ruby3.0 available [ =stable ]\n", " 58 postgresql12 available [ =stable ]\n", " 59 postgresql13 available [ =stable ]\n", " 60 mock2 available [ =stable ]\n", " 61 dnsmasq2.85 available [ =stable ]\n", " 62 kernel-5.15 available [ =stable ]\n", " 63 postgresql14 available [ =stable ]\n", " 64 firefox available [ =stable ]\n", " 65 lustre available [ =stable ]\n", " 66 php8.1 available [ =stable ]\n", " 67 awscli1 available [ =stable ]\n", " 68 php8.2 available [ =stable ]\n", " 69 dnsmasq available [ =stable ]\n", " 70 unbound1.17 available [ =stable ]\n", " 71 golang1.19 available [ =stable ]\n", " 72 collectd-python3 available [ =stable ]\n", "Loaded plugins: dkms-build-requires, extras_suggestions, langpacks, priorities,\n", " : update-motd, versionlock\n", "================================== repo: epel ==================================\n", "[epel]\n", "async = True\n", "bandwidth = 0\n", "base_persistdir = /var/lib/yum/repos/x86_64/2\n", "baseurl = \n", "cache = 0\n", "cachedir = /var/cache/yum/x86_64/2/epel\n", "check_config_file_age = True\n", "compare_providers_priority = 80\n", "cost = 1000\n", "deltarpm_metadata_percentage = 100\n", "deltarpm_percentage = \n", "enabled = True\n", "enablegroups = True\n", "exclude = \n", "failovermethod = priority\n", "ftp_disable_epsv = False\n", "gpgcadir = /var/lib/yum/repos/x86_64/2/epel/gpgcadir\n", "gpgcakey = \n", "gpgcheck = True\n", "gpgdir = /var/lib/yum/repos/x86_64/2/epel/gpgdir\n", "gpgkey = file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7\n", "hdrdir = /var/cache/yum/x86_64/2/epel/headers\n", "http_caching = all\n", "includepkgs = \n", "ip_resolve = \n", "keepalive = True\n", "keepcache = False\n", "mddownloadpolicy = sqlite\n", "mdpolicy = group:small\n", "mediaid = \n", "metadata_expire = 21600\n", "metadata_expire_filter = read-only:present\n", "metalink = https://mirrors.fedoraproject.org/metalink?repo=epel-7&arch=x86_64\n", "minrate = 0\n", "mirrorlist = \n", "mirrorlist_expire = 86400\n", "name = Extra Packages for Enterprise Linux 7 - x86_64\n", "old_base_cache_dir = \n", "password = \n", "persistdir = /var/lib/yum/repos/x86_64/2/epel\n", "pkgdir = /var/cache/yum/x86_64/2/epel/packages\n", "priority = 99\n", "proxy = False\n", "proxy_dict = \n", "proxy_password = \n", "proxy_username = \n", "repo_gpgcheck = False\n", "report_instanceid = False\n", "retries = 7\n", "skip_if_unavailable = False\n", "ssl_check_cert_permissions = True\n", "sslcacert = \n", "sslclientcert = \n", "sslclientkey = \n", "sslverify = True\n", "throttle = 0\n", "timeout = 5.0\n", "ui_id = epel/x86_64\n", "ui_repoid_vars = releasever,\n", " basearch\n", "username = \n", "\n", "Loaded plugins: dkms-build-requires, extras_suggestions, langpacks, priorities,\n", " : update-motd, versionlock\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "https://download.docker.com/linux/centos/2/x86_64/stable/repodata/repomd.xml: [Errno 14] HTTPS Error 404 - Not Found\n", "Trying other mirror.\n", "http://mirror.es.its.nyu.edu/epel/7/x86_64/repodata/repomd.xml: [Errno 12] Timeout on http://mirror.es.its.nyu.edu/epel/7/x86_64/repodata/repomd.xml: (28, 'Failed to connect to mirror.es.its.nyu.edu port 80 after 5001 ms: Timeout was reached')\n", "Trying other mirror.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "286 packages excluded due to repository priority protections\n", "Resolving Dependencies\n", "--> Running transaction check\n", "---> Package git-lfs.x86_64 0:2.10.0-2.el7 will be installed\n", "--> Finished Dependency Resolution\n", "\n", "Dependencies Resolved\n", "\n", "================================================================================\n", " Package Arch Version Repository Size\n", "================================================================================\n", "Installing:\n", " git-lfs x86_64 2.10.0-2.el7 epel 3.7 M\n", "\n", "Transaction Summary\n", "================================================================================\n", "Install 1 Package\n", "\n", "Total download size: 3.7 M\n", "Installed size: 13 M\n", "Downloading packages:\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "warning: /var/cache/yum/x86_64/2/epel/packages/git-lfs-2.10.0-2.el7.x86_64.rpm: Header V4 RSA/SHA256 Signature, key ID 352c64e5: NOKEY\n", "Importing GPG key 0x352C64E5:\n", " Userid : \"Fedora EPEL (7) \"\n", " Fingerprint: 91e9 7d7c 4a5e 96f1 7f3e 888f 6a2f aea2 352c 64e5\n", " Package : epel-release-7-11.noarch (@amzn2extra-epel)\n", " From : /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Public key for git-lfs-2.10.0-2.el7.x86_64.rpm is not installed\n", "Retrieving key from file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7\n", "Running transaction check\n", "Running transaction test\n", "Transaction test succeeded\n", "Running transaction\n", " Installing : git-lfs-2.10.0-2.el7.x86_64 1/1 \n", " Verifying : git-lfs-2.10.0-2.el7.x86_64 1/1 \n", "\n", "Installed:\n", " git-lfs.x86_64 0:2.10.0-2.el7 \n", "\n", "Complete!\n", "Git LFS initialized.\n" ] }, { "data": { "text/plain": [ "0" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# install git-lfs\n", "os.system('sudo amazon-linux-extras install epel -y')\n", "os.system('sudo yum-config-manager --enable epel')\n", "os.system('sudo yum install git-lfs -y')\n", "os.system('git lfs install')" ] }, { "cell_type": "code", "execution_count": 3, "id": "6595a6c5-9fae-4bd4-b096-53475ec98294", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Cloning into 'stackexchange_title_body_jsonl'...\n", "Cloning into 'stackexchange_titlebody_best_voted_answer_jsonl'...\n", "Cloning into 'stackexchange_title_best_voted_answer_jsonl'...\n", "Cloning into 'stackexchange_titlebody_best_and_down_voted_answer_jsonl'...\n", "Cloning into 'reddit-title-body'...\n", "Cloning into '1B_sentence_embeddings'...\n" ] }, { "data": { "text/plain": [ "0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# clone relevant datasets' github repositories\n", "stacks = ['stackexchange_title_body_jsonl', #25.3M \n", " 'stackexchange_titlebody_best_voted_answer_jsonl', #4.75M\n", " 'stackexchange_title_best_voted_answer_jsonl', #4.75M\n", " 'stackexchange_titlebody_best_and_down_voted_answer_jsonl'] #210K\n", "\n", "os.environ['GIT_LFS_SKIP_SMUDGE'] = \"1\"\n", "\n", "# clone stackexchange repos\n", "for stack in stacks:\n", " os.system(f'git clone https://huggingface.co/datasets/flax-sentence-embeddings/{stack}')\n", "# clone reddit repo\n", "os.system('git clone https://huggingface.co/datasets/sentence-transformers/reddit-title-body')\n", "# clone 1B+ sentence embeddings repo (this one is just for reference)\n", "os.system('git clone https://github.com/AntoineSimoulin/1B_sentence_embeddings')" ] }, { "cell_type": "code", "execution_count": null, "id": "56438029-0f26-491c-b405-f92fb9caeff7", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading 4 StackExchange GitHub datasets into s3://lodestone-rnd/data/\n", "upload: ./networkengineering.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/networkengineering.stackexchange.com.json.gz\n", "upload: ./emacs.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/emacs.stackexchange.com.json.gz\n", "upload: ./christianity.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/christianity.stackexchange.com.json.gz\n", "upload: ./bitcoin.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/bitcoin.stackexchange.com.json.gz\n", "upload: ./academia.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/academia.stackexchange.com.json.gz\n", "upload: ./music.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/music.stackexchange.com.json.gz\n", "upload: ./biology.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/biology.stackexchange.com.json.gz\n", "upload: ./history.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/history.stackexchange.com.json.gz\n", "upload: ./skeptics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/skeptics.stackexchange.com.json.gz\n", "upload: ./anime.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/anime.stackexchange.com.json.gz\n", "upload: ./quant.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/quant.stackexchange.com.json.gz\n", "upload: ./boardgames.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/boardgames.stackexchange.com.json.gz\n", "upload: ./judaism.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/judaism.stackexchange.com.json.gz\n", "upload: ./travel.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/travel.stackexchange.com.json.gz\n", "upload: ./gaming.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/gaming.stackexchange.com.json.gz\n", "upload: ./webapps.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/webapps.stackexchange.com.json.gz\n", "upload: ./stats.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/stats.stackexchange.com.json.gz\n", "upload: ./law.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/law.stackexchange.com.json.gz\n", "upload: ./scifi.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/scifi.stackexchange.com.json.gz\n", "upload: ./bicycles.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/bicycles.stackexchange.com.json.gz\n", "upload: ./datascience.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/datascience.stackexchange.com.json.gz\n", "upload: ./softwareengineering.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/softwareengineering.stackexchange.com.json.gz\n", "upload: ./islam.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/islam.stackexchange.com.json.gz\n", "upload: ./craftcms.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/craftcms.stackexchange.com.json.gz\n", "upload: ./diy.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/diy.stackexchange.com.json.gz\n", "upload: ./arduino.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/arduino.stackexchange.com.json.gz\n", "upload: ./raspberrypi.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/raspberrypi.stackexchange.com.json.gz\n", "upload: ./wordpress.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/wordpress.stackexchange.com.json.gz\n", "upload: ./dba.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/dba.stackexchange.com.json.gz\n", "upload: ./apple.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/apple.stackexchange.com.json.gz\n", "upload: ./hinduism.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/hinduism.stackexchange.com.json.gz\n", "upload: ./mechanics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/mechanics.stackexchange.com.json.gz\n", "upload: ./gamedev.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/gamedev.stackexchange.com.json.gz\n", "upload: ./writers.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/writers.stackexchange.com.json.gz\n", "upload: ./mathematica.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/mathematica.stackexchange.com.json.gz\n", "upload: ./unix.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/unix.stackexchange.com.json.gz\n", "upload: ./magento.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/magento.stackexchange.com.json.gz\n", "upload: ./ethereum.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/ethereum.stackexchange.com.json.gz\n", "upload: ./electronics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/electronics.stackexchange.com.json.gz\n", "upload: ./cs.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/cs.stackexchange.com.json.gz\n", "upload: ./blender.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/blender.stackexchange.com.json.gz\n", "upload: ./drupal.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/drupal.stackexchange.com.json.gz\n", "upload: ./small_stackexchanges.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/small_stackexchanges.json.gz\n", "upload: ./photo.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/photo.stackexchange.com.json.gz\n", "upload: ./engineering.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/engineering.stackexchange.com.json.gz\n", "upload: ./ux.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/ux.stackexchange.com.json.gz\n", "upload: ./german.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/german.stackexchange.com.json.gz\n", "upload: ./japanese.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/japanese.stackexchange.com.json.gz\n", "upload: ./civicrm.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/civicrm.stackexchange.com.json.gz\n", "upload: ./sharepoint.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/sharepoint.stackexchange.com.json.gz\n", "upload: ./mathoverflow.net.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/mathoverflow.net.json.gz\n", "upload: ./meta.stackoverflow.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/meta.stackoverflow.com.json.gz\n", "upload: ./rpg.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/rpg.stackexchange.com.json.gz\n", "upload: ./crypto.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/crypto.stackexchange.com.json.gz\n", "upload: ./vi.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/vi.stackexchange.com.json.gz\n", "upload: ./graphicdesign.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/graphicdesign.stackexchange.com.json.gz\n", "upload: ./cooking.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/cooking.stackexchange.com.json.gz\n", "upload: ./math.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/math.stackexchange.com.json.gz\n", "upload: ./expressionengine.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/expressionengine.stackexchange.com.json.gz\n", "upload: ./movies.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/movies.stackexchange.com.json.gz\n", "upload: ./salesforce.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/salesforce.stackexchange.com.json.gz\n", "upload: ./physics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/physics.stackexchange.com.json.gz\n", "upload: ./aviation.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/aviation.stackexchange.com.json.gz\n", "upload: ./gardening.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/gardening.stackexchange.com.json.gz\n", "upload: ./english.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/english.stackexchange.com.json.gz\n", "upload: ./askubuntu.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/askubuntu.com.json.gz\n", "upload: ./french.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/french.stackexchange.com.json.gz\n", "upload: ./codereview.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/codereview.stackexchange.com.json.gz\n", "upload: ./softwarerecs.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/softwarerecs.stackexchange.com.json.gz\n", "upload: ./rus.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/rus.stackexchange.com.json.gz\n", "upload: ./money.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/money.stackexchange.com.json.gz\n", "upload: ./philosophy.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/philosophy.stackexchange.com.json.gz\n", "upload: ./chemistry.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/chemistry.stackexchange.com.json.gz\n", "upload: ./meta.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/meta.stackexchange.com.json.gz\n", "upload: ./cstheory.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/cstheory.stackexchange.com.json.gz\n", "upload: ./space.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/space.stackexchange.com.json.gz\n", "upload: ./politics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/politics.stackexchange.com.json.gz\n", "upload: ./ell.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/ell.stackexchange.com.json.gz\n", "upload: ./puzzling.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/puzzling.stackexchange.com.json.gz\n", "upload: ./astronomy.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/astronomy.stackexchange.com.json.gz\n", "upload: ./worldbuilding.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/worldbuilding.stackexchange.com.json.gz\n", "upload: ./economics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/economics.stackexchange.com.json.gz\n", "upload: ./workplace.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/workplace.stackexchange.com.json.gz\n", "upload: ./tex.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/tex.stackexchange.com.json.gz\n", "upload: ./android.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/android.stackexchange.com.json.gz\n", "upload: ./gis.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/gis.stackexchange.com.json.gz\n", "upload: ./dsp.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/dsp.stackexchange.com.json.gz\n", "upload: ./superuser.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/superuser.com.json.gz\n", "upload: ./english.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/english.stackexchange.com.json.gz\n", "upload: ./meta.serverfault.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/meta.serverfault.com.json.gz\n", "upload: ./scicomp.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/scicomp.stackexchange.com.json.gz\n", "upload: ./askubuntu.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/askubuntu.com.json.gz\n", "upload: ./french.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/french.stackexchange.com.json.gz\n", "upload: ./coffee.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/coffee.stackexchange.com.json.gz\n", "upload: ./codereview.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/codereview.stackexchange.com.json.gz\n", "upload: ./sound.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/sound.stackexchange.com.json.gz\n", "upload: ./opensource.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/opensource.stackexchange.com.json.gz\n", "upload: ./woodworking.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/woodworking.stackexchange.com.json.gz\n", "upload: ./outdoors.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/outdoors.stackexchange.com.json.gz\n", "upload: ./reddit_title_text_2018.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2018.json.gz\n", "upload: ./reddit_title_text_2011.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2011.json.gz\n", "upload: ./reddit_title_text_2020.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2020.json.gz\n", "upload: ./reddit_title_text_2012.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2012.json.gz\n", "upload: ./reddit_title_text_2021.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2021.json.gz\n", "upload: ./reddit_title_text_2019.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2019.json.gz\n", "upload: ./reddit_title_text_2010.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2010.json.gz\n", "upload: ./reddit_title_text_2014.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2014.json.gz\n", "\u001b[32mDone\u001b[0m\n", "Total files uploaded: 12\n", "\n", "\n", "Downloading 9 HuggingFace datasets into s3://lodestone-rnd/data/\n", "Downloading dataset S2ORC_citations_abstracts (39,567,485 pairs) ... " ] } ], "source": [ "# DOWNLOAD GITHUB DATASETS (STACKEXCHANGE (https://huggingface.co/flax-sentence-embeddings) & REDDIT (https://huggingface.co/datasets/sentence-transformers/reddit-title-body))\n", "\n", "# these are the files marked as unsafe by HuggingFace when viewing the each of the datasets' pages\n", "unsafe = [[\"serverfault.com.jsonl.gz\", \"security.stackexchange.com.jsonl.gz\"],\n", " [\"monero.stackexchange.com.jsonl.gz\", \"serverfault.com.jsonl.gz\", \"security.stackexchange.com.jsonl.gz\"],\n", " [\"elementaryos.stackexchange.com.jsonl.gz\", \"monero.stackexchange.com.jsonl.gz\", \"security.stackexchange.com.jsonl.gz\"],\n", " [\"\"]]\n", "\n", "file_counts = []\n", "print('Downloading {:,} StackExchange GitHub datasets into s3://lodestone-rnd/data/'.format(len(stacks)))\n", "for i, stack in enumerate(stacks):\n", " # get the names of all the files in the repository that are not unsafe\n", " files = [file for file in os.listdir(f'/home/ec2-user/SageMaker/{stack}') if file.endswith(\".jsonl.gz\")==True if file not in unsafe[i]]\n", " file_counts.append(len(files))\n", " os.chdir(f'/home/ec2-user/SageMaker/{stack}')\n", " print('Downloading dataset {} ({} files) ... '.format(stack, len(files)), end='', flush=True)\n", " # sequentially pull each dataset from git lfs, stream it to S3, and then delete the local copy to free up disk memory\n", " for file_name in files:\n", " os.system(f'git lfs pull --include={file_name}')\n", " os.system(f'aws s3 cp {file_name} s3://lodestone-rnd/data/{stack}/{file_name[:-9] + \".json.gz\"}')\n", " os.remove(file_name)\n", " os.system('rm -r .git/lfs/objects/*')\n", " if len(os.listdir('.git/objects/pack')) == 4:\n", " os.system('ls -t .git/objects/pack/* | head -2 | xargs rm --')\n", " print('\\033[32m' + 'Done' + '\\033[0m')\n", "print(f'Total files uploaded: {sum(file_counts)}')\n", "\n", "print(\"\\n\")\n", "\n", "print('Downloading {:,} Reddit GitHub dataset into s3://lodestone-rnd/data/'.format(1))\n", "# get the names of all the files in the repository\n", "files = [file for file in os.listdir(f'/home/ec2-user/SageMaker/reddit-title-body') if file.endswith(\".jsonl.gz\")==True]\n", "os.chdir(f'/home/ec2-user/SageMaker/reddit-title-body')\n", "print('Downloading dataset {} ({} files) ... '.format(\"reddit-title-body\", len(files)), end='', flush=True)\n", "# sequentially pull each dataset from git lfs, stream it to S3, and then delete the local copy to free up disk memory\n", "for file_name in files:\n", " os.system(f'git lfs pull --include={file_name}')\n", " os.system(f'aws s3 cp {file_name} s3://lodestone-rnd/data/reddit-title-body/{file_name[:-9] + \".json.gz\"}')\n", " os.remove(file_name)\n", " os.system('rm -r .git/lfs/objects/*')\n", " if len(os.listdir('.git/objects/pack')) == 4:\n", " os.system('ls -t .git/objects/pack/* | head -2 | xargs rm --')\n", "print('\\033[32m' + 'Done' + '\\033[0m')\n", "print(f'Total files uploaded: {len(files)}')\n", "\n", "os.chdir('/home/ec2-user/SageMaker')\n", "\n", "print(\"\\n\")\n", "\n", "# DOWNLOAD HUGGINGFACE DATASETS (https://huggingface.co/datasets/sentence-transformers/embedding-training-data)\n", "\n", "# read dataset information from HuggingFace_datasets.tsv\n", "datasets = pd.read_csv(\n", " 'HuggingFace_datasets.tsv',\n", " index_col=0,\n", " sep='\\t',\n", " dtype={\n", " 'Description': str,\n", " 'Size (#Pairs)': str,\n", " 'Performance': float,\n", " 'Download link': str,\n", " 'Source': str})\n", "datasets['Size (#Pairs)'] = datasets['Size (#Pairs)'].str.replace(',', '').astype(int)\n", "datasets = datasets.to_dict(orient='index')\n", "\n", "print('Downloading {:,} HuggingFace datasets into s3://lodestone-rnd/data/'.format(len(datasets)))\n", "\n", "# stream each of the datasets from the URL provided into S3\n", "# (note that S2ORC_citations_abstracts is larger than 50GB and therefore requires the expected size to be passed into the command line as well)\n", "for d in datasets.keys():\n", " print('Downloading dataset {} ({:,} pairs) ... '.format(d, datasets[d]['Size (#Pairs)']), end='', flush=True)\n", " if d == \"S2ORC_citations_abstracts\":\n", " os.system(f'wget -qO- {datasets[d][\"Download link\"]} | aws s3 cp - s3://lodestone-rnd/data/{d + \".json.gz\"} --expected-size 120259084288')\n", " else:\n", " os.system(f'wget -qO- {datasets[d][\"Download link\"]} | aws s3 cp - s3://lodestone-rnd/data/{d + \".json.gz\"}')\n", " print('\\033[32m' + 'Done' + '\\033[0m')\n", "\n", "print(\"\\n\")\n", "\n", "# DOWNLOAD GOOGLE SHEETS DATASETS (https://docs.google.com/spreadsheets/d/1vXJrIg38cEaKjOG5y4I4PQwAQFUmCkohbViJ9zj_Emg/edit#gid=0)\n", "\n", "# read dataset information from GoogleSheets_datasets.tsv\n", "datasets = pd.read_csv(\n", " 'GoogleSheets_datasets.tsv',\n", " index_col=0,\n", " sep='\\t',\n", " dtype={\n", " 'Description': str,\n", " 'Size (#Pairs)': str,\n", " 'Performance': float,\n", " 'Download link': str,\n", " 'Source': str})\n", "datasets['Size (#Pairs)'] = datasets['Size (#Pairs)'].str.replace(',', '').astype(int)\n", "datasets = datasets.to_dict(orient='index')\n", "\n", "print('Downloading {:,} 1B+ Google Sheets datasets into s3://lodestone-rnd/data/'.format(len(datasets)))\n", "\n", "# stream each of the datasets from the URL provided into S3\n", "for d in datasets.keys():\n", " print('Downloading dataset {} ({:,} pairs) ... '.format(d, datasets[d]['Size (#Pairs)']), end='', flush=True)\n", " os.system(f'wget -qO- {datasets[d][\"Download link\"]} | aws s3 cp - s3://lodestone-rnd/data/{d + \".json.gz\"}')\n", " print('\\033[32m' + 'Done' + '\\033[0m')\n", "\n", "print(\"\\n\")\n", " \n", "print(f'Successfully downloaded 50 datasets and {621+12+9+36} files into s3://lodestone-rnd/data/')" ] }, { "cell_type": "code", "execution_count": 1, "id": "216d4564-6b69-4688-94ec-a67287134a2d", "metadata": { "tags": [] }, "outputs": [], "source": [ "# clean up (remove the cloned repositories)\n", "import shutil\n", "shutil.rmtree(\"/home/ec2-user/SageMaker/stackexchange_title_body_jsonl\")\n", "shutil.rmtree(\"/home/ec2-user/SageMaker/stackexchange_titlebody_best_voted_answer_jsonl\")\n", "shutil.rmtree(\"/home/ec2-user/SageMaker/stackexchange_title_best_voted_answer_jsonl\")\n", "shutil.rmtree(\"/home/ec2-user/SageMaker/stackexchange_titlebody_best_and_down_voted_answer_jsonl\")\n", "shutil.rmtree(\"/home/ec2-user/SageMaker/reddit-title-body\")\n", "shutil.rmtree(\"/home/ec2-user/SageMaker/1B_sentence_embeddings\")" ] } ], "metadata": { "kernelspec": { "display_name": "conda_pytorch_p310", "language": "python", "name": "conda_pytorch_p310" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" } }, "nbformat": 4, "nbformat_minor": 5 }