Sentence Similarity
sentence-transformers
PyTorch
English
bert
feature-extraction
mteb
custom_code
Eval Results
text-embeddings-inference
6 papers
dylanAtHum committed on
Commit
64ae4c7
1 Parent(s): 55fdd33

Initial Commit

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. 1_Pooling/config.json +4 -0
  2. Data_Records.ipynb +92 -0
  3. Dataloading.ipynb +675 -0
  4. README.md +1937 -0
  5. Replication.txt +40 -0
  6. Training.py +465 -0
  7. bert_layers.py +1072 -0
  8. bert_padding.py +159 -0
  9. config.json +33 -0
  10. config_sentence_transformers.json +7 -0
  11. configuration_bert.py +25 -0
  12. data_records.json +1 -0
  13. flash_attn_triton.py +1112 -0
  14. modules.json +20 -0
  15. mteb_results/AmazonCounterfactualClassification.json +29 -0
  16. mteb_results/AmazonPolarityClassification.json +15 -0
  17. mteb_results/AmazonReviewsClassification.json +25 -0
  18. mteb_results/ArguAna.json +38 -0
  19. mteb_results/ArxivClusteringP2P.json +10 -0
  20. mteb_results/ArxivClusteringS2S.json +10 -0
  21. mteb_results/AskUbuntuDupQuestions.json +10 -0
  22. mteb_results/BIOSSES.json +20 -0
  23. mteb_results/Banking77Classification.json +13 -0
  24. mteb_results/BiorxivClusteringP2P.json +10 -0
  25. mteb_results/BiorxivClusteringS2S.json +10 -0
  26. mteb_results/CQADupstackEnglishRetrieval.json +38 -0
  27. mteb_results/ClimateFEVER.json +38 -0
  28. mteb_results/DBPedia.json +38 -0
  29. mteb_results/EmotionClassification.json +21 -0
  30. mteb_results/FEVER.json +38 -0
  31. mteb_results/FiQA2018.json +38 -0
  32. mteb_results/HotpotQA.json +38 -0
  33. mteb_results/ImdbClassification.json +15 -0
  34. mteb_results/MSMARCO.json +38 -0
  35. mteb_results/MTOPDomainClassification.json +25 -0
  36. mteb_results/MTOPIntentClassification.json +25 -0
  37. mteb_results/MassiveIntentClassification.json +25 -0
  38. mteb_results/MassiveScenarioClassification.json +25 -0
  39. mteb_results/MedrxivClusteringP2P.json +10 -0
  40. mteb_results/MedrxivClusteringS2S.json +10 -0
  41. mteb_results/MindSmallReranking.json +10 -0
  42. mteb_results/NFCorpus.json +38 -0
  43. mteb_results/NQ.json +38 -0
  44. mteb_results/QuoraRetrieval.json +38 -0
  45. mteb_results/RedditClustering.json +10 -0
  46. mteb_results/RedditClusteringP2P.json +10 -0
  47. mteb_results/SCIDOCS.json +38 -0
  48. mteb_results/SICK-R.json +20 -0
  49. mteb_results/STS12.json +20 -0
  50. mteb_results/STS13.json +20 -0
1_Pooling/config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_mean_tokens": true
4
+ }
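
Note: the pooling config above tells sentence-transformers to mean-pool the 768-dimensional token embeddings into a single sentence vector. Below is a minimal sketch of what mean-token pooling does, assuming a PyTorch tensor of token embeddings and an attention mask; the function and tensor names are illustrative and not part of the commit.

```python
# Hypothetical sketch (not part of the commit): mean-tokens pooling as described
# by 1_Pooling/config.json, applied on top of per-token transformer outputs.
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average the 768-dim token embeddings, ignoring padding positions."""
    mask = attention_mask.unsqueeze(-1).type_as(token_embeddings)  # (batch, seq, 1)
    summed = (token_embeddings * mask).sum(dim=1)                  # (batch, 768)
    counts = mask.sum(dim=1).clamp(min=1e-9)                       # (batch, 1)
    return summed / counts                                         # sentence embeddings

# Example: a batch of 2 sequences of length 5 with 768-dim embeddings.
emb = torch.randn(2, 5, 768)
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
sentence_vecs = mean_pool(emb, mask)   # shape (2, 768)
```

Masking before the sum keeps padded positions out of the average, so padding does not dilute the sentence embedding.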
Data_Records.ipynb ADDED
@@ -0,0 +1,92 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "e66bbb77-71f5-4d80-b766-f67144ea7a93",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Data Records\n",
9
+ "\n",
10
+ "## This notebook generates the data_records.json file where each entry in the resulting dictionary follows the form {filename: num_records} for every dataset we will use during training"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 39,
16
+ "id": "74ad6613-44ff-435e-8550-df993e915677",
17
+ "metadata": {
18
+ "tags": []
19
+ },
20
+ "outputs": [],
21
+ "source": [
22
+ "# import relevant libraries\n",
23
+ "import os\n",
24
+ "import boto3\n",
25
+ "import json\n",
26
+ "from smart_open import open"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "e2d53761-da0e-44f4-8a3e-1285bf810b03",
33
+ "metadata": {
34
+ "tags": []
35
+ },
36
+ "outputs": [],
37
+ "source": [
38
+ "s3 = boto3.resource('s3')\n",
39
+ "my_bucket = s3.Bucket('lodestone-rnd')\n",
40
+ "\n",
41
+ "# collect all filenames from the data/ directory of the lodestone-rnd S3 bucket\n",
42
+ "files = [\"\"]*((621+12+9+36)+1)\n",
43
+ "for i, object_summary in enumerate(my_bucket.objects.filter(Prefix=\"data/\")):\n",
44
+ " files[i] = object_summary.key[5:]\n",
45
+ "files = files[1:]\n",
46
+ "files = [file for file in files if file != 'cnn_dailymail_splitted.json.gz']\n",
47
+ "\n",
48
+ "s3_client = boto3.client(\"s3\")\n",
49
+ "\n",
50
+ "# for each training dataset, store the number of records in a dictionary with the following form {filename: num_records}\n",
51
+ "data_lengths = {}\n",
52
+ "for file in files:\n",
53
+ " source_uri = f's3://lodestone-rnd/data/{file}'\n",
54
+ " # S2ORC_citations_abstracts.json.gz and amazon-qa.json.gz must be handled differently since each line in their training\n",
55
+ " # data is split into multiple records due to the fact that each query has multiple positive pair responses\n",
56
+ " if file in ['S2ORC_citations_abstracts.json.gz','amazon-qa.json.gz']:\n",
57
+ " length = 0\n",
58
+ " for json_line in open(source_uri, transport_params={\"client\": s3_client}):\n",
59
+ " data = json.loads(json_line.strip())\n",
60
+ " length += len(data['pos'])\n",
61
+ " else:\n",
62
+ " length = int(os.popen(f'aws s3 cp {source_uri} - | zcat | wc -l').read().rstrip())\n",
63
+ " data_lengths[f'{file}'] = length\n",
64
+ " \n",
65
+ "# write the resulting dictionary to a .json file for future use during training\n",
66
+ "with open('data_records.json', 'w') as fileout:\n",
67
+ " json.dump(data_lengths, fileout)"
68
+ ]
69
+ }
70
+ ],
71
+ "metadata": {
72
+ "kernelspec": {
73
+ "display_name": "conda_pytorch_p310",
74
+ "language": "python",
75
+ "name": "conda_pytorch_p310"
76
+ },
77
+ "language_info": {
78
+ "codemirror_mode": {
79
+ "name": "ipython",
80
+ "version": 3
81
+ },
82
+ "file_extension": ".py",
83
+ "mimetype": "text/x-python",
84
+ "name": "python",
85
+ "nbconvert_exporter": "python",
86
+ "pygments_lexer": "ipython3",
87
+ "version": "3.10.10"
88
+ }
89
+ },
90
+ "nbformat": 4,
91
+ "nbformat_minor": 5
92
+ }
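
Note: Data_Records.ipynb writes data_records.json, a dictionary mapping each training file to its record count. A minimal sketch, assuming that file exists locally, of how those counts could be read back and turned into per-dataset sampling proportions; this helper is hypothetical and not part of the commit.

```python
# Hypothetical sketch (not part of the commit): reading data_records.json back
# and deriving sampling proportions proportional to each dataset's record count.
import json

with open("data_records.json") as f:
    data_lengths = json.load(f)          # {filename: num_records}

total = sum(data_lengths.values())
sampling_probs = {name: count / total for name, count in data_lengths.items()}

# Show the five largest datasets and their share of all training pairs.
for name, prob in sorted(sampling_probs.items(), key=lambda kv: -kv[1])[:5]:
    print(f"{name}: {data_lengths[name]:,} records ({prob:.2%} of pairs)")
```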
Dataloading.ipynb ADDED
@@ -0,0 +1,675 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "3f9ce240-fd1a-4550-83c7-8cf9658b1d3a",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Dataloading (1B+ Training Pairs)\n",
9
+ "\n",
10
+ "## This notebook collects and uploads all 50 relevant sentence embedding datasets to S3 as .json.gz files where each line contains one training record"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 62,
16
+ "id": "d7af8d0e-b3ed-4007-abdd-5952d775e119",
17
+ "metadata": {
18
+ "tags": []
19
+ },
20
+ "outputs": [],
21
+ "source": [
22
+ "import os\n",
23
+ "import pandas as pd\n",
24
+ "\n",
25
+ "os.chdir('/home/ec2-user/SageMaker')"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 63,
31
+ "id": "686699fb-d2c9-4653-afca-580aef343451",
32
+ "metadata": {
33
+ "collapsed": true,
34
+ "jupyter": {
35
+ "outputs_hidden": true
36
+ },
37
+ "tags": []
38
+ },
39
+ "outputs": [
40
+ {
41
+ "name": "stdout",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "Loaded plugins: dkms-build-requires, extras_suggestions, langpacks, priorities,\n",
45
+ " : update-motd, versionlock\n",
46
+ "Cleaning repos: amzn2-core amzn2extra-docker amzn2extra-epel\n",
47
+ " : amzn2extra-kernel-5.10 amzn2extra-python3.8 centos-extras\n",
48
+ " : copr:copr.fedorainfracloud.org:vbatts:shadow-utils-newxidmap\n",
49
+ " : docker-ce-stable libnvidia-container neuron\n",
50
+ "21 metadata files removed\n",
51
+ "15 sqlite files removed\n",
52
+ "0 metadata files removed\n",
53
+ "Loaded plugins: dkms-build-requires, extras_suggestions, langpacks, priorities,\n",
54
+ " : update-motd, versionlock\n"
55
+ ]
56
+ },
57
+ {
58
+ "name": "stderr",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "https://download.docker.com/linux/centos/2/x86_64/stable/repodata/repomd.xml: [Errno 14] HTTPS Error 404 - Not Found\n",
62
+ "Trying other mirror.\n"
63
+ ]
64
+ },
65
+ {
66
+ "name": "stdout",
67
+ "output_type": "stream",
68
+ "text": [
69
+ "62 packages excluded due to repository priority protections\n",
70
+ "Resolving Dependencies\n",
71
+ "--> Running transaction check\n",
72
+ "---> Package epel-release.noarch 0:7-11 will be installed\n",
73
+ "--> Finished Dependency Resolution\n",
74
+ "\n",
75
+ "Dependencies Resolved\n",
76
+ "\n",
77
+ "================================================================================\n",
78
+ " Package Arch Version Repository Size\n",
79
+ "================================================================================\n",
80
+ "Installing:\n",
81
+ " epel-release noarch 7-11 amzn2extra-epel 15 k\n",
82
+ "\n",
83
+ "Transaction Summary\n",
84
+ "================================================================================\n",
85
+ "Install 1 Package\n",
86
+ "\n",
87
+ "Total download size: 15 k\n",
88
+ "Installed size: 24 k\n",
89
+ "Downloading packages:\n",
90
+ "Running transaction check\n",
91
+ "Running transaction test\n",
92
+ "Transaction test succeeded\n",
93
+ "Running transaction\n"
94
+ ]
95
+ },
96
+ {
97
+ "name": "stderr",
98
+ "output_type": "stream",
99
+ "text": [
100
+ "Warning: RPMDB altered outside of yum.\n"
101
+ ]
102
+ },
103
+ {
104
+ "name": "stdout",
105
+ "output_type": "stream",
106
+ "text": [
107
+ " Installing : epel-release-7-11.noarch 1/1 \n",
108
+ " Verifying : epel-release-7-11.noarch 1/1 \n",
109
+ "\n",
110
+ "Installed:\n",
111
+ " epel-release.noarch 0:7-11 \n",
112
+ "\n",
113
+ "Complete!\n",
114
+ "Installing epel-release\n",
115
+ " 0 ansible2 available \\\n",
116
+ " [ =2.4.2 =2.4.6 =2.8 =stable ]\n",
117
+ " 2 httpd_modules available [ =1.0 =stable ]\n",
118
+ " 3 memcached1.5 available \\\n",
119
+ " [ =1.5.1 =1.5.16 =1.5.17 ]\n",
120
+ " 6 postgresql10 available [ =10 =stable ]\n",
121
+ " 9 R3.4 available [ =3.4.3 =stable ]\n",
122
+ " 10 rust1 available \\\n",
123
+ " [ =1.22.1 =1.26.0 =1.26.1 =1.27.2 =1.31.0 =1.38.0\n",
124
+ " =stable ]\n",
125
+ " 18 libreoffice available \\\n",
126
+ " [ =5.0.6.2_15 =5.3.6.1 =stable ]\n",
127
+ " 19 gimp available [ =2.8.22 ]\n",
128
+ " 20 docker=latest enabled \\\n",
129
+ " [ =17.12.1 =18.03.1 =18.06.1 =18.09.9 =stable ]\n",
130
+ " 21 mate-desktop1.x available \\\n",
131
+ " [ =1.19.0 =1.20.0 =stable ]\n",
132
+ " 22 GraphicsMagick1.3 available \\\n",
133
+ " [ =1.3.29 =1.3.32 =1.3.34 =stable ]\n",
134
+ " 23 tomcat8.5 available \\\n",
135
+ " [ =8.5.31 =8.5.32 =8.5.38 =8.5.40 =8.5.42 =8.5.50\n",
136
+ " =stable ]\n",
137
+ " 24 epel=latest enabled [ =7.11 =stable ]\n",
138
+ " 25 testing available [ =1.0 =stable ]\n",
139
+ " 26 ecs available [ =stable ]\n",
140
+ " 27 corretto8 available \\\n",
141
+ " [ =1.8.0_192 =1.8.0_202 =1.8.0_212 =1.8.0_222 =1.8.0_232\n",
142
+ " =1.8.0_242 =stable ]\n",
143
+ " 29 golang1.11 available \\\n",
144
+ " [ =1.11.3 =1.11.11 =1.11.13 =stable ]\n",
145
+ " 30 squid4 available [ =4 =stable ]\n",
146
+ " 32 lustre2.10 available \\\n",
147
+ " [ =2.10.5 =2.10.8 =stable ]\n",
148
+ " 33 java-openjdk11 available [ =11 =stable ]\n",
149
+ " 34 lynis available [ =stable ]\n",
150
+ " 36 BCC available [ =0.x =stable ]\n",
151
+ " 37 mono available [ =5.x =stable ]\n",
152
+ " 38 nginx1 available [ =stable ]\n",
153
+ " 40 mock available [ =stable ]\n",
154
+ " 41 postgresql11 available [ =11 =stable ]\n",
155
+ " 43 livepatch available [ =stable ]\n",
156
+ " 44 python3.8=latest enabled [ =stable ]\n",
157
+ " 45 haproxy2 available [ =stable ]\n",
158
+ " 46 collectd available [ =stable ]\n",
159
+ " 47 aws-nitro-enclaves-cli available [ =stable ]\n",
160
+ " 48 R4 available [ =stable ]\n",
161
+ " _ kernel-5.4 available [ =stable ]\n",
162
+ " 50 selinux-ng available [ =stable ]\n",
163
+ " 51 php8.0 available [ =stable ]\n",
164
+ " 52 tomcat9 available [ =stable ]\n",
165
+ " 53 unbound1.13 available [ =stable ]\n",
166
+ " 54 mariadb10.5 available [ =stable ]\n",
167
+ " 55 kernel-5.10=latest enabled [ =stable ]\n",
168
+ " 56 redis6 available [ =stable ]\n",
169
+ " 57 ruby3.0 available [ =stable ]\n",
170
+ " 58 postgresql12 available [ =stable ]\n",
171
+ " 59 postgresql13 available [ =stable ]\n",
172
+ " 60 mock2 available [ =stable ]\n",
173
+ " 61 dnsmasq2.85 available [ =stable ]\n",
174
+ " 62 kernel-5.15 available [ =stable ]\n",
175
+ " 63 postgresql14 available [ =stable ]\n",
176
+ " 64 firefox available [ =stable ]\n",
177
+ " 65 lustre available [ =stable ]\n",
178
+ " 66 php8.1 available [ =stable ]\n",
179
+ " 67 awscli1 available [ =stable ]\n",
180
+ " 68 php8.2 available [ =stable ]\n",
181
+ " 69 dnsmasq available [ =stable ]\n",
182
+ " 70 unbound1.17 available [ =stable ]\n",
183
+ " 71 golang1.19 available [ =stable ]\n",
184
+ " 72 collectd-python3 available [ =stable ]\n",
185
+ "Loaded plugins: dkms-build-requires, extras_suggestions, langpacks, priorities,\n",
186
+ " : update-motd, versionlock\n",
187
+ "================================== repo: epel ==================================\n",
188
+ "[epel]\n",
189
+ "async = True\n",
190
+ "bandwidth = 0\n",
191
+ "base_persistdir = /var/lib/yum/repos/x86_64/2\n",
192
+ "baseurl = \n",
193
+ "cache = 0\n",
194
+ "cachedir = /var/cache/yum/x86_64/2/epel\n",
195
+ "check_config_file_age = True\n",
196
+ "compare_providers_priority = 80\n",
197
+ "cost = 1000\n",
198
+ "deltarpm_metadata_percentage = 100\n",
199
+ "deltarpm_percentage = \n",
200
+ "enabled = True\n",
201
+ "enablegroups = True\n",
202
+ "exclude = \n",
203
+ "failovermethod = priority\n",
204
+ "ftp_disable_epsv = False\n",
205
+ "gpgcadir = /var/lib/yum/repos/x86_64/2/epel/gpgcadir\n",
206
+ "gpgcakey = \n",
207
+ "gpgcheck = True\n",
208
+ "gpgdir = /var/lib/yum/repos/x86_64/2/epel/gpgdir\n",
209
+ "gpgkey = file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7\n",
210
+ "hdrdir = /var/cache/yum/x86_64/2/epel/headers\n",
211
+ "http_caching = all\n",
212
+ "includepkgs = \n",
213
+ "ip_resolve = \n",
214
+ "keepalive = True\n",
215
+ "keepcache = False\n",
216
+ "mddownloadpolicy = sqlite\n",
217
+ "mdpolicy = group:small\n",
218
+ "mediaid = \n",
219
+ "metadata_expire = 21600\n",
220
+ "metadata_expire_filter = read-only:present\n",
221
+ "metalink = https://mirrors.fedoraproject.org/metalink?repo=epel-7&arch=x86_64\n",
222
+ "minrate = 0\n",
223
+ "mirrorlist = \n",
224
+ "mirrorlist_expire = 86400\n",
225
+ "name = Extra Packages for Enterprise Linux 7 - x86_64\n",
226
+ "old_base_cache_dir = \n",
227
+ "password = \n",
228
+ "persistdir = /var/lib/yum/repos/x86_64/2/epel\n",
229
+ "pkgdir = /var/cache/yum/x86_64/2/epel/packages\n",
230
+ "priority = 99\n",
231
+ "proxy = False\n",
232
+ "proxy_dict = \n",
233
+ "proxy_password = \n",
234
+ "proxy_username = \n",
235
+ "repo_gpgcheck = False\n",
236
+ "report_instanceid = False\n",
237
+ "retries = 7\n",
238
+ "skip_if_unavailable = False\n",
239
+ "ssl_check_cert_permissions = True\n",
240
+ "sslcacert = \n",
241
+ "sslclientcert = \n",
242
+ "sslclientkey = \n",
243
+ "sslverify = True\n",
244
+ "throttle = 0\n",
245
+ "timeout = 5.0\n",
246
+ "ui_id = epel/x86_64\n",
247
+ "ui_repoid_vars = releasever,\n",
248
+ " basearch\n",
249
+ "username = \n",
250
+ "\n",
251
+ "Loaded plugins: dkms-build-requires, extras_suggestions, langpacks, priorities,\n",
252
+ " : update-motd, versionlock\n"
253
+ ]
254
+ },
255
+ {
256
+ "name": "stderr",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "https://download.docker.com/linux/centos/2/x86_64/stable/repodata/repomd.xml: [Errno 14] HTTPS Error 404 - Not Found\n",
260
+ "Trying other mirror.\n",
261
+ "http://mirror.es.its.nyu.edu/epel/7/x86_64/repodata/repomd.xml: [Errno 12] Timeout on http://mirror.es.its.nyu.edu/epel/7/x86_64/repodata/repomd.xml: (28, 'Failed to connect to mirror.es.its.nyu.edu port 80 after 5001 ms: Timeout was reached')\n",
262
+ "Trying other mirror.\n"
263
+ ]
264
+ },
265
+ {
266
+ "name": "stdout",
267
+ "output_type": "stream",
268
+ "text": [
269
+ "286 packages excluded due to repository priority protections\n",
270
+ "Resolving Dependencies\n",
271
+ "--> Running transaction check\n",
272
+ "---> Package git-lfs.x86_64 0:2.10.0-2.el7 will be installed\n",
273
+ "--> Finished Dependency Resolution\n",
274
+ "\n",
275
+ "Dependencies Resolved\n",
276
+ "\n",
277
+ "================================================================================\n",
278
+ " Package Arch Version Repository Size\n",
279
+ "================================================================================\n",
280
+ "Installing:\n",
281
+ " git-lfs x86_64 2.10.0-2.el7 epel 3.7 M\n",
282
+ "\n",
283
+ "Transaction Summary\n",
284
+ "================================================================================\n",
285
+ "Install 1 Package\n",
286
+ "\n",
287
+ "Total download size: 3.7 M\n",
288
+ "Installed size: 13 M\n",
289
+ "Downloading packages:\n"
290
+ ]
291
+ },
292
+ {
293
+ "name": "stderr",
294
+ "output_type": "stream",
295
+ "text": [
296
+ "warning: /var/cache/yum/x86_64/2/epel/packages/git-lfs-2.10.0-2.el7.x86_64.rpm: Header V4 RSA/SHA256 Signature, key ID 352c64e5: NOKEY\n",
297
+ "Importing GPG key 0x352C64E5:\n",
298
+ " Userid : \"Fedora EPEL (7) <epel@fedoraproject.org>\"\n",
299
+ " Fingerprint: 91e9 7d7c 4a5e 96f1 7f3e 888f 6a2f aea2 352c 64e5\n",
300
+ " Package : epel-release-7-11.noarch (@amzn2extra-epel)\n",
301
+ " From : /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7\n"
302
+ ]
303
+ },
304
+ {
305
+ "name": "stdout",
306
+ "output_type": "stream",
307
+ "text": [
308
+ "Public key for git-lfs-2.10.0-2.el7.x86_64.rpm is not installed\n",
309
+ "Retrieving key from file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7\n",
310
+ "Running transaction check\n",
311
+ "Running transaction test\n",
312
+ "Transaction test succeeded\n",
313
+ "Running transaction\n",
314
+ " Installing : git-lfs-2.10.0-2.el7.x86_64 1/1 \n",
315
+ " Verifying : git-lfs-2.10.0-2.el7.x86_64 1/1 \n",
316
+ "\n",
317
+ "Installed:\n",
318
+ " git-lfs.x86_64 0:2.10.0-2.el7 \n",
319
+ "\n",
320
+ "Complete!\n",
321
+ "Git LFS initialized.\n"
322
+ ]
323
+ },
324
+ {
325
+ "data": {
326
+ "text/plain": [
327
+ "0"
328
+ ]
329
+ },
330
+ "execution_count": 63,
331
+ "metadata": {},
332
+ "output_type": "execute_result"
333
+ }
334
+ ],
335
+ "source": [
336
+ "# install git-lfs\n",
337
+ "os.system('sudo amazon-linux-extras install epel -y')\n",
338
+ "os.system('sudo yum-config-manager --enable epel')\n",
339
+ "os.system('sudo yum install git-lfs -y')\n",
340
+ "os.system('git lfs install')"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": 3,
346
+ "id": "6595a6c5-9fae-4bd4-b096-53475ec98294",
347
+ "metadata": {
348
+ "tags": []
349
+ },
350
+ "outputs": [
351
+ {
352
+ "name": "stderr",
353
+ "output_type": "stream",
354
+ "text": [
355
+ "Cloning into 'stackexchange_title_body_jsonl'...\n",
356
+ "Cloning into 'stackexchange_titlebody_best_voted_answer_jsonl'...\n",
357
+ "Cloning into 'stackexchange_title_best_voted_answer_jsonl'...\n",
358
+ "Cloning into 'stackexchange_titlebody_best_and_down_voted_answer_jsonl'...\n",
359
+ "Cloning into 'reddit-title-body'...\n",
360
+ "Cloning into '1B_sentence_embeddings'...\n"
361
+ ]
362
+ },
363
+ {
364
+ "data": {
365
+ "text/plain": [
366
+ "0"
367
+ ]
368
+ },
369
+ "execution_count": 3,
370
+ "metadata": {},
371
+ "output_type": "execute_result"
372
+ }
373
+ ],
374
+ "source": [
375
+ "# clone relevant datasets' github repositories\n",
376
+ "stacks = ['stackexchange_title_body_jsonl', #25.3M \n",
377
+ " 'stackexchange_titlebody_best_voted_answer_jsonl', #4.75M\n",
378
+ " 'stackexchange_title_best_voted_answer_jsonl', #4.75M\n",
379
+ " 'stackexchange_titlebody_best_and_down_voted_answer_jsonl'] #210K\n",
380
+ "\n",
381
+ "os.environ['GIT_LFS_SKIP_SMUDGE'] = \"1\"\n",
382
+ "\n",
383
+ "# clone stackexchange repos\n",
384
+ "for stack in stacks:\n",
385
+ " os.system(f'git clone https://huggingface.co/datasets/flax-sentence-embeddings/{stack}')\n",
386
+ "# clone reddit repo\n",
387
+ "os.system('git clone https://huggingface.co/datasets/sentence-transformers/reddit-title-body')\n",
388
+ "# clone 1B+ sentence embeddings repo (this one is just for reference)\n",
389
+ "os.system('git clone https://github.com/AntoineSimoulin/1B_sentence_embeddings')"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "execution_count": null,
395
+ "id": "56438029-0f26-491c-b405-f92fb9caeff7",
396
+ "metadata": {
397
+ "collapsed": true,
398
+ "jupyter": {
399
+ "outputs_hidden": true
400
+ },
401
+ "tags": []
402
+ },
403
+ "outputs": [
404
+ {
405
+ "name": "stdout",
406
+ "output_type": "stream",
407
+ "text": [
408
+ "Downloading 4 StackExchange GitHub datasets into s3://lodestone-rnd/data/\n",
409
+ "upload: ./networkengineering.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/networkengineering.stackexchange.com.json.gz\n",
410
+ "upload: ./emacs.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/emacs.stackexchange.com.json.gz\n",
411
+ "upload: ./christianity.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/christianity.stackexchange.com.json.gz\n",
412
+ "upload: ./bitcoin.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/bitcoin.stackexchange.com.json.gz\n",
413
+ "upload: ./academia.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/academia.stackexchange.com.json.gz\n",
414
+ "upload: ./music.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/music.stackexchange.com.json.gz\n",
415
+ "upload: ./biology.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/biology.stackexchange.com.json.gz\n",
416
+ "upload: ./history.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/history.stackexchange.com.json.gz\n",
417
+ "upload: ./skeptics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/skeptics.stackexchange.com.json.gz\n",
418
+ "upload: ./anime.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/anime.stackexchange.com.json.gz\n",
419
+ "upload: ./quant.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/quant.stackexchange.com.json.gz\n",
420
+ "upload: ./boardgames.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/boardgames.stackexchange.com.json.gz\n",
421
+ "upload: ./judaism.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/judaism.stackexchange.com.json.gz\n",
422
+ "upload: ./travel.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/travel.stackexchange.com.json.gz\n",
423
+ "upload: ./gaming.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/gaming.stackexchange.com.json.gz\n",
424
+ "upload: ./webapps.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/webapps.stackexchange.com.json.gz\n",
425
+ "upload: ./stats.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/stats.stackexchange.com.json.gz\n",
426
+ "upload: ./law.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/law.stackexchange.com.json.gz\n",
427
+ "upload: ./scifi.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/scifi.stackexchange.com.json.gz\n",
428
+ "upload: ./bicycles.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/bicycles.stackexchange.com.json.gz\n",
429
+ "upload: ./datascience.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/datascience.stackexchange.com.json.gz\n",
430
+ "upload: ./softwareengineering.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/softwareengineering.stackexchange.com.json.gz\n",
431
+ "upload: ./islam.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/islam.stackexchange.com.json.gz\n",
432
+ "upload: ./craftcms.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/craftcms.stackexchange.com.json.gz\n",
433
+ "upload: ./diy.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/diy.stackexchange.com.json.gz\n",
434
+ "upload: ./arduino.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/arduino.stackexchange.com.json.gz\n",
435
+ "upload: ./raspberrypi.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/raspberrypi.stackexchange.com.json.gz\n",
436
+ "upload: ./wordpress.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/wordpress.stackexchange.com.json.gz\n",
437
+ "upload: ./dba.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/dba.stackexchange.com.json.gz\n",
438
+ "upload: ./apple.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/apple.stackexchange.com.json.gz\n",
439
+ "upload: ./hinduism.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/hinduism.stackexchange.com.json.gz\n",
440
+ "upload: ./mechanics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/mechanics.stackexchange.com.json.gz\n",
441
+ "upload: ./gamedev.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/gamedev.stackexchange.com.json.gz\n",
442
+ "upload: ./writers.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/writers.stackexchange.com.json.gz\n",
443
+ "upload: ./mathematica.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/mathematica.stackexchange.com.json.gz\n",
444
+ "upload: ./unix.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/unix.stackexchange.com.json.gz\n",
445
+ "upload: ./magento.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/magento.stackexchange.com.json.gz\n",
446
+ "upload: ./ethereum.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/ethereum.stackexchange.com.json.gz\n",
447
+ "upload: ./electronics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/electronics.stackexchange.com.json.gz\n",
448
+ "upload: ./cs.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/cs.stackexchange.com.json.gz\n",
449
+ "upload: ./blender.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/blender.stackexchange.com.json.gz\n",
450
+ "upload: ./drupal.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/drupal.stackexchange.com.json.gz\n",
451
+ "upload: ./small_stackexchanges.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/small_stackexchanges.json.gz\n",
452
+ "upload: ./photo.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/photo.stackexchange.com.json.gz\n",
453
+ "upload: ./engineering.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/engineering.stackexchange.com.json.gz\n",
454
+ "upload: ./ux.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/ux.stackexchange.com.json.gz\n",
455
+ "upload: ./german.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/german.stackexchange.com.json.gz\n",
456
+ "upload: ./japanese.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/japanese.stackexchange.com.json.gz\n",
457
+ "upload: ./civicrm.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/civicrm.stackexchange.com.json.gz\n",
458
+ "upload: ./sharepoint.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/sharepoint.stackexchange.com.json.gz\n",
459
+ "upload: ./mathoverflow.net.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/mathoverflow.net.json.gz\n",
460
+ "upload: ./meta.stackoverflow.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/meta.stackoverflow.com.json.gz\n",
461
+ "upload: ./rpg.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/rpg.stackexchange.com.json.gz\n",
462
+ "upload: ./crypto.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/crypto.stackexchange.com.json.gz\n",
463
+ "upload: ./vi.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/vi.stackexchange.com.json.gz\n",
464
+ "upload: ./graphicdesign.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/graphicdesign.stackexchange.com.json.gz\n",
465
+ "upload: ./cooking.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/cooking.stackexchange.com.json.gz\n",
466
+ "upload: ./math.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/math.stackexchange.com.json.gz\n",
467
+ "upload: ./expressionengine.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/expressionengine.stackexchange.com.json.gz\n",
468
+ "upload: ./movies.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/movies.stackexchange.com.json.gz\n",
469
+ "upload: ./salesforce.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/salesforce.stackexchange.com.json.gz\n",
470
+ "upload: ./physics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/physics.stackexchange.com.json.gz\n",
471
+ "upload: ./aviation.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/aviation.stackexchange.com.json.gz\n",
472
+ "upload: ./gardening.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/gardening.stackexchange.com.json.gz\n",
473
+ "upload: ./english.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/english.stackexchange.com.json.gz\n",
474
+ "upload: ./askubuntu.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/askubuntu.com.json.gz\n",
475
+ "upload: ./french.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/french.stackexchange.com.json.gz\n",
476
+ "upload: ./codereview.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/codereview.stackexchange.com.json.gz\n",
477
+ "upload: ./softwarerecs.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/softwarerecs.stackexchange.com.json.gz\n",
478
+ "upload: ./rus.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/rus.stackexchange.com.json.gz\n",
479
+ "upload: ./money.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/money.stackexchange.com.json.gz\n",
480
+ "upload: ./philosophy.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/philosophy.stackexchange.com.json.gz\n",
481
+ "upload: ./chemistry.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/chemistry.stackexchange.com.json.gz\n",
482
+ "upload: ./meta.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/meta.stackexchange.com.json.gz\n",
483
+ "upload: ./cstheory.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/cstheory.stackexchange.com.json.gz\n",
484
+ "upload: ./space.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/space.stackexchange.com.json.gz\n",
485
+ "upload: ./politics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/politics.stackexchange.com.json.gz\n",
486
+ "upload: ./ell.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/ell.stackexchange.com.json.gz\n",
487
+ "upload: ./puzzling.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/puzzling.stackexchange.com.json.gz\n",
488
+ "upload: ./astronomy.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/astronomy.stackexchange.com.json.gz\n",
489
+ "upload: ./worldbuilding.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/worldbuilding.stackexchange.com.json.gz\n",
490
+ "upload: ./economics.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/economics.stackexchange.com.json.gz\n",
491
+ "upload: ./workplace.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/workplace.stackexchange.com.json.gz\n",
492
+ "upload: ./tex.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/tex.stackexchange.com.json.gz\n",
493
+ "upload: ./android.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/android.stackexchange.com.json.gz\n",
494
+ "upload: ./gis.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/gis.stackexchange.com.json.gz\n",
495
+ "upload: ./dsp.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/dsp.stackexchange.com.json.gz\n",
496
+ "upload: ./superuser.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_body_jsonl/superuser.com.json.gz\n",
497
+ "upload: ./english.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/english.stackexchange.com.json.gz\n",
498
+ "upload: ./meta.serverfault.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/meta.serverfault.com.json.gz\n",
499
+ "upload: ./scicomp.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/scicomp.stackexchange.com.json.gz\n",
500
+ "upload: ./askubuntu.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/askubuntu.com.json.gz\n",
501
+ "upload: ./french.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/french.stackexchange.com.json.gz\n",
502
+ "upload: ./coffee.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/coffee.stackexchange.com.json.gz\n",
503
+ "upload: ./codereview.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/codereview.stackexchange.com.json.gz\n",
504
+ "upload: ./sound.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/sound.stackexchange.com.json.gz\n",
505
+ "upload: ./opensource.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/opensource.stackexchange.com.json.gz\n",
506
+ "upload: ./woodworking.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/woodworking.stackexchange.com.json.gz\n",
507
+ "upload: ./outdoors.stackexchange.com.jsonl.gz to s3://lodestone-rnd/data/stackexchange_title_best_voted_answer_jsonl/outdoors.stackexchange.com.json.gz\n",
508
+ "upload: ./reddit_title_text_2018.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2018.json.gz\n",
509
+ "upload: ./reddit_title_text_2011.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2011.json.gz\n",
510
+ "upload: ./reddit_title_text_2020.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2020.json.gz\n",
511
+ "upload: ./reddit_title_text_2012.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2012.json.gz\n",
512
+ "upload: ./reddit_title_text_2021.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2021.json.gz\n",
513
+ "upload: ./reddit_title_text_2019.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2019.json.gz\n",
514
+ "upload: ./reddit_title_text_2010.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2010.json.gz\n",
515
+ "upload: ./reddit_title_text_2014.jsonl.gz to s3://lodestone-rnd/data/reddit-title-body/reddit_title_text_2014.json.gz\n",
516
+ "\u001b[32mDone\u001b[0m\n",
517
+ "Total files uploaded: 12\n",
518
+ "\n",
519
+ "\n",
520
+ "Downloading 9 HuggingFace datasets into s3://lodestone-rnd/data/\n",
521
+ "Downloading dataset S2ORC_citations_abstracts (39,567,485 pairs) ... "
522
+ ]
523
+ }
524
+ ],
525
+ "source": [
526
+ "# DOWNLOAD GITHUB DATASETS (STACKEXCHANGE (https://huggingface.co/flax-sentence-embeddings) & REDDIT (https://huggingface.co/datasets/sentence-transformers/reddit-title-body))\n",
527
+ "\n",
528
+ "# these are the files marked as unsafe by HuggingFace when viewing the each of the datasets' pages\n",
529
+ "unsafe = [[\"serverfault.com.jsonl.gz\", \"security.stackexchange.com.jsonl.gz\"],\n",
530
+ " [\"monero.stackexchange.com.jsonl.gz\", \"serverfault.com.jsonl.gz\", \"security.stackexchange.com.jsonl.gz\"],\n",
531
+ " [\"elementaryos.stackexchange.com.jsonl.gz\", \"monero.stackexchange.com.jsonl.gz\", \"security.stackexchange.com.jsonl.gz\"],\n",
532
+ " [\"\"]]\n",
533
+ "\n",
534
+ "file_counts = []\n",
535
+ "print('Downloading {:,} StackExchange GitHub datasets into s3://lodestone-rnd/data/'.format(len(stacks)))\n",
536
+ "for i, stack in enumerate(stacks):\n",
537
+ " # get the names of all the files in the repository that are not unsafe\n",
538
+ " files = [file for file in os.listdir(f'/home/ec2-user/SageMaker/{stack}') if file.endswith(\".jsonl.gz\")==True if file not in unsafe[i]]\n",
539
+ " file_counts.append(len(files))\n",
540
+ " os.chdir(f'/home/ec2-user/SageMaker/{stack}')\n",
541
+ " print('Downloading dataset {} ({} files) ... '.format(stack, len(files)), end='', flush=True)\n",
542
+ " # sequentially pull each dataset from git lfs, stream it to S3, and then delete the local copy to free up disk memory\n",
543
+ " for file_name in files:\n",
544
+ " os.system(f'git lfs pull --include={file_name}')\n",
545
+ " os.system(f'aws s3 cp {file_name} s3://lodestone-rnd/data/{stack}/{file_name[:-9] + \".json.gz\"}')\n",
546
+ " os.remove(file_name)\n",
547
+ " os.system('rm -r .git/lfs/objects/*')\n",
548
+ " if len(os.listdir('.git/objects/pack')) == 4:\n",
549
+ " os.system('ls -t .git/objects/pack/* | head -2 | xargs rm --')\n",
550
+ " print('\\033[32m' + 'Done' + '\\033[0m')\n",
551
+ "print(f'Total files uploaded: {sum(file_counts)}')\n",
552
+ "\n",
553
+ "print(\"\\n\")\n",
554
+ "\n",
555
+ "print('Downloading {:,} Reddit GitHub dataset into s3://lodestone-rnd/data/'.format(1))\n",
556
+ "# get the names of all the files in the repository\n",
557
+ "files = [file for file in os.listdir(f'/home/ec2-user/SageMaker/reddit-title-body') if file.endswith(\".jsonl.gz\")==True]\n",
558
+ "os.chdir(f'/home/ec2-user/SageMaker/reddit-title-body')\n",
559
+ "print('Downloading dataset {} ({} files) ... '.format(\"reddit-title-body\", len(files)), end='', flush=True)\n",
560
+ "# sequentially pull each dataset from git lfs, stream it to S3, and then delete the local copy to free up disk memory\n",
561
+ "for file_name in files:\n",
562
+ " os.system(f'git lfs pull --include={file_name}')\n",
563
+ " os.system(f'aws s3 cp {file_name} s3://lodestone-rnd/data/reddit-title-body/{file_name[:-9] + \".json.gz\"}')\n",
564
+ " os.remove(file_name)\n",
565
+ " os.system('rm -r .git/lfs/objects/*')\n",
566
+ " if len(os.listdir('.git/objects/pack')) == 4:\n",
567
+ " os.system('ls -t .git/objects/pack/* | head -2 | xargs rm --')\n",
568
+ "print('\\033[32m' + 'Done' + '\\033[0m')\n",
569
+ "print(f'Total files uploaded: {len(files)}')\n",
570
+ "\n",
571
+ "os.chdir('/home/ec2-user/SageMaker')\n",
572
+ "\n",
573
+ "print(\"\\n\")\n",
574
+ "\n",
575
+ "# DOWNLOAD HUGGINGFACE DATASETS (https://huggingface.co/datasets/sentence-transformers/embedding-training-data)\n",
576
+ "\n",
577
+ "# read dataset information from HuggingFace_datasets.tsv\n",
578
+ "datasets = pd.read_csv(\n",
579
+ " 'HuggingFace_datasets.tsv',\n",
580
+ " index_col=0,\n",
581
+ " sep='\\t',\n",
582
+ " dtype={\n",
583
+ " 'Description': str,\n",
584
+ " 'Size (#Pairs)': str,\n",
585
+ " 'Performance': float,\n",
586
+ " 'Download link': str,\n",
587
+ " 'Source': str})\n",
588
+ "datasets['Size (#Pairs)'] = datasets['Size (#Pairs)'].str.replace(',', '').astype(int)\n",
589
+ "datasets = datasets.to_dict(orient='index')\n",
590
+ "\n",
591
+ "print('Downloading {:,} HuggingFace datasets into s3://lodestone-rnd/data/'.format(len(datasets)))\n",
592
+ "\n",
593
+ "# stream each of the datasets from the URL provided into S3\n",
594
+ "# (note that S2ORC_citations_abstracts is larger than 50GB and therefore requires the expected size to be passed into the command line as well)\n",
595
+ "for d in datasets.keys():\n",
596
+ " print('Downloading dataset {} ({:,} pairs) ... '.format(d, datasets[d]['Size (#Pairs)']), end='', flush=True)\n",
597
+ " if d == \"S2ORC_citations_abstracts\":\n",
598
+ " os.system(f'wget -qO- {datasets[d][\"Download link\"]} | aws s3 cp - s3://lodestone-rnd/data/{d + \".json.gz\"} --expected-size 120259084288')\n",
599
+ " else:\n",
600
+ " os.system(f'wget -qO- {datasets[d][\"Download link\"]} | aws s3 cp - s3://lodestone-rnd/data/{d + \".json.gz\"}')\n",
601
+ " print('\\033[32m' + 'Done' + '\\033[0m')\n",
602
+ "\n",
603
+ "print(\"\\n\")\n",
604
+ "\n",
605
+ "# DOWNLOAD GOOGLE SHEETS DATASETS (https://docs.google.com/spreadsheets/d/1vXJrIg38cEaKjOG5y4I4PQwAQFUmCkohbViJ9zj_Emg/edit#gid=0)\n",
606
+ "\n",
607
+ "# read dataset information from GoogleSheets_datasets.tsv\n",
608
+ "datasets = pd.read_csv(\n",
609
+ " 'GoogleSheets_datasets.tsv',\n",
610
+ " index_col=0,\n",
611
+ " sep='\\t',\n",
612
+ " dtype={\n",
613
+ " 'Description': str,\n",
614
+ " 'Size (#Pairs)': str,\n",
615
+ " 'Performance': float,\n",
616
+ " 'Download link': str,\n",
617
+ " 'Source': str})\n",
618
+ "datasets['Size (#Pairs)'] = datasets['Size (#Pairs)'].str.replace(',', '').astype(int)\n",
619
+ "datasets = datasets.to_dict(orient='index')\n",
620
+ "\n",
621
+ "print('Downloading {:,} 1B+ Google Sheets datasets into s3://lodestone-rnd/data/'.format(len(datasets)))\n",
622
+ "\n",
623
+ "# stream each of the datasets from the URL provided into S3\n",
624
+ "for d in datasets.keys():\n",
625
+ " print('Downloading dataset {} ({:,} pairs) ... '.format(d, datasets[d]['Size (#Pairs)']), end='', flush=True)\n",
626
+ " os.system(f'wget -qO- {datasets[d][\"Download link\"]} | aws s3 cp - s3://lodestone-rnd/data/{d + \".json.gz\"}')\n",
627
+ " print('\\033[32m' + 'Done' + '\\033[0m')\n",
628
+ "\n",
629
+ "print(\"\\n\")\n",
630
+ " \n",
631
+ "print(f'Successfully downloaded 50 datasets and {621+12+9+36} files into s3://lodestone-rnd/data/')"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "code",
636
+ "execution_count": 1,
637
+ "id": "216d4564-6b69-4688-94ec-a67287134a2d",
638
+ "metadata": {
639
+ "tags": []
640
+ },
641
+ "outputs": [],
642
+ "source": [
643
+ "# clean up (remove the cloned repositories)\n",
644
+ "import shutil\n",
645
+ "shutil.rmtree(\"/home/ec2-user/SageMaker/stackexchange_title_body_jsonl\")\n",
646
+ "shutil.rmtree(\"/home/ec2-user/SageMaker/stackexchange_titlebody_best_voted_answer_jsonl\")\n",
647
+ "shutil.rmtree(\"/home/ec2-user/SageMaker/stackexchange_title_best_voted_answer_jsonl\")\n",
648
+ "shutil.rmtree(\"/home/ec2-user/SageMaker/stackexchange_titlebody_best_and_down_voted_answer_jsonl\")\n",
649
+ "shutil.rmtree(\"/home/ec2-user/SageMaker/reddit-title-body\")\n",
650
+ "shutil.rmtree(\"/home/ec2-user/SageMaker/1B_sentence_embeddings\")"
651
+ ]
652
+ }
653
+ ],
654
+ "metadata": {
655
+ "kernelspec": {
656
+ "display_name": "conda_pytorch_p310",
657
+ "language": "python",
658
+ "name": "conda_pytorch_p310"
659
+ },
660
+ "language_info": {
661
+ "codemirror_mode": {
662
+ "name": "ipython",
663
+ "version": 3
664
+ },
665
+ "file_extension": ".py",
666
+ "mimetype": "text/x-python",
667
+ "name": "python",
668
+ "nbconvert_exporter": "python",
669
+ "pygments_lexer": "ipython3",
670
+ "version": "3.10.10"
671
+ }
672
+ },
673
+ "nbformat": 4,
674
+ "nbformat_minor": 5
675
+ }
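
Note: Dataloading.ipynb leaves each dataset in s3://lodestone-rnd/data/ as a gzipped JSON-lines file with one training record per line. A minimal sketch of streaming such a file back with smart_open, the same library the notebooks use for S3 reads; the example key name is illustrative and not part of the commit.

```python
# Hypothetical sketch (not part of the commit): stream one of the uploaded
# .json.gz files back from S3 line by line, mirroring how the notebooks write
# them (one JSON training record per line).
import json
import boto3
from smart_open import open as s3_open

def iter_records(key: str, bucket: str = "lodestone-rnd"):
    """Yield parsed JSON records from a gzipped JSON-lines object on S3."""
    client = boto3.client("s3")
    uri = f"s3://{bucket}/data/{key}"
    # smart_open decompresses .gz transparently based on the file extension.
    with s3_open(uri, transport_params={"client": client}) as fin:
        for line in fin:
            yield json.loads(line)

# Example usage (key name is illustrative):
# first_record = next(iter_records("gooaq_pairs.json.gz"))
```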
README.md CHANGED
@@ -1,3 +1,1940 @@
1
  ---
2
  license: apache-2.0
3
  ---
1
  ---
2
  license: apache-2.0
3
+ pipeline_tag: sentence-similarity
4
+ tags:
5
+ - sentence-transformers
6
+ - feature-extraction
7
+ - sentence-similarity
8
+ - mteb
9
+ language: en
10
+ datasets:
11
+ - s2orc
12
+ - flax-sentence-embeddings/stackexchange_title_body_jsonl
13
+ - flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl
14
+ - flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl
15
+ - flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl
16
+ - sentence-transformers/reddit-title-body
17
+ - msmarco
18
+ - gooaq
19
+ - yahoo_answers_topics
20
+ - code_search_net
21
+ - search_qa
22
+ - eli5
23
+ - snli
24
+ - multi_nli
25
+ - wikihow
26
+ - natural_questions
27
+ - trivia_qa
28
+ - embedding-data/sentence-compression
29
+ - embedding-data/flickr30k-captions
30
+ - embedding-data/altlex
31
+ - embedding-data/simple-wiki
32
+ - embedding-data/QQP
33
+ - embedding-data/SPECTER
34
+ - embedding-data/PAQ_pairs
35
+ - embedding-data/WikiAnswers
36
+ - sentence-transformers/embedding-training-data
37
+ model-index:
38
+ - name: lodestone-base-4096-v1
39
+ results:
40
+ - task:
41
+ type: Classification
42
+ dataset:
43
+ type: mteb/amazon_counterfactual
44
+ name: MTEB AmazonCounterfactualClassification (en)
45
+ config: en
46
+ split: test
47
+ revision: e8379541af4e31359cca9fbcf4b00f2671dba205
48
+ metrics:
49
+ - type: accuracy
50
+ value: 69.7313432835821
51
+ - type: ap
52
+ value: 31.618259511417733
53
+ - type: f1
54
+ value: 63.30313825394228
55
+ - task:
56
+ type: Classification
57
+ dataset:
58
+ type: mteb/amazon_polarity
59
+ name: MTEB AmazonPolarityClassification
60
+ config: default
61
+ split: test
62
+ revision: e2d317d38cd51312af73b3d32a06d1a08b442046
63
+ metrics:
64
+ - type: accuracy
65
+ value: 86.89837499999999
66
+ - type: ap
67
+ value: 82.39500885672128
68
+ - type: f1
69
+ value: 86.87317947399657
70
+ - task:
71
+ type: Classification
72
+ dataset:
73
+ type: mteb/amazon_reviews_multi
74
+ name: MTEB AmazonReviewsClassification (en)
75
+ config: en
76
+ split: test
77
+ revision: 1399c76144fd37290681b995c656ef9b2e06e26d
78
+ metrics:
79
+ - type: accuracy
80
+ value: 44.05
81
+ - type: f1
82
+ value: 42.67624383248947
83
+ - task:
84
+ type: Retrieval
85
+ dataset:
86
+ type: arguana
87
+ name: MTEB ArguAna
88
+ config: default
89
+ split: test
90
+ revision: None
91
+ metrics:
92
+ - type: map_at_1
93
+ value: 26.173999999999996
94
+ - type: map_at_10
95
+ value: 40.976
96
+ - type: map_at_100
97
+ value: 42.067
98
+ - type: map_at_1000
99
+ value: 42.075
100
+ - type: map_at_3
101
+ value: 35.917
102
+ - type: map_at_5
103
+ value: 38.656
104
+ - type: mrr_at_1
105
+ value: 26.814
106
+ - type: mrr_at_10
107
+ value: 41.252
108
+ - type: mrr_at_100
109
+ value: 42.337
110
+ - type: mrr_at_1000
111
+ value: 42.345
112
+ - type: mrr_at_3
113
+ value: 36.226
114
+ - type: mrr_at_5
115
+ value: 38.914
116
+ - type: ndcg_at_1
117
+ value: 26.173999999999996
118
+ - type: ndcg_at_10
119
+ value: 49.819
120
+ - type: ndcg_at_100
121
+ value: 54.403999999999996
122
+ - type: ndcg_at_1000
123
+ value: 54.59
124
+ - type: ndcg_at_3
125
+ value: 39.231
126
+ - type: ndcg_at_5
127
+ value: 44.189
128
+ - type: precision_at_1
129
+ value: 26.173999999999996
130
+ - type: precision_at_10
131
+ value: 7.838000000000001
132
+ - type: precision_at_100
133
+ value: 0.9820000000000001
134
+ - type: precision_at_1000
135
+ value: 0.1
136
+ - type: precision_at_3
137
+ value: 16.287
138
+ - type: precision_at_5
139
+ value: 12.191
140
+ - type: recall_at_1
141
+ value: 26.173999999999996
142
+ - type: recall_at_10
143
+ value: 78.378
144
+ - type: recall_at_100
145
+ value: 98.222
146
+ - type: recall_at_1000
147
+ value: 99.644
148
+ - type: recall_at_3
149
+ value: 48.862
150
+ - type: recall_at_5
151
+ value: 60.953
152
+ - task:
153
+ type: Clustering
154
+ dataset:
155
+ type: mteb/arxiv-clustering-p2p
156
+ name: MTEB ArxivClusteringP2P
157
+ config: default
158
+ split: test
159
+ revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d
160
+ metrics:
161
+ - type: v_measure
162
+ value: 42.31689035788179
163
+ - task:
164
+ type: Clustering
165
+ dataset:
166
+ type: mteb/arxiv-clustering-s2s
167
+ name: MTEB ArxivClusteringS2S
168
+ config: default
169
+ split: test
170
+ revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53
171
+ metrics:
172
+ - type: v_measure
173
+ value: 31.280245136660984
174
+ - task:
175
+ type: Reranking
176
+ dataset:
177
+ type: mteb/askubuntudupquestions-reranking
178
+ name: MTEB AskUbuntuDupQuestions
179
+ config: default
180
+ split: test
181
+ revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
182
+ metrics:
183
+ - type: map
184
+ value: 58.79109720839415
185
+ - type: mrr
186
+ value: 71.79615705931495
187
+ - task:
188
+ type: STS
189
+ dataset:
190
+ type: mteb/biosses-sts
191
+ name: MTEB BIOSSES
192
+ config: default
193
+ split: test
194
+ revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
195
+ metrics:
196
+ - type: cos_sim_pearson
197
+ value: 76.44918756608115
198
+ - type: cos_sim_spearman
199
+ value: 70.86607256286257
200
+ - type: euclidean_pearson
201
+ value: 74.12154678100815
202
+ - type: euclidean_spearman
203
+ value: 70.86607256286257
204
+ - type: manhattan_pearson
205
+ value: 74.0078626964417
206
+ - type: manhattan_spearman
207
+ value: 70.68353828321327
208
+ - task:
209
+ type: Classification
210
+ dataset:
211
+ type: mteb/banking77
212
+ name: MTEB Banking77Classification
213
+ config: default
214
+ split: test
215
+ revision: 0fd18e25b25c072e09e0d92ab615fda904d66300
216
+ metrics:
217
+ - type: accuracy
218
+ value: 75.40584415584415
219
+ - type: f1
220
+ value: 74.29514617572676
221
+ - task:
222
+ type: Clustering
223
+ dataset:
224
+ type: mteb/biorxiv-clustering-p2p
225
+ name: MTEB BiorxivClusteringP2P
226
+ config: default
227
+ split: test
228
+ revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40
229
+ metrics:
230
+ - type: v_measure
231
+ value: 37.41860080664014
232
+ - task:
233
+ type: Clustering
234
+ dataset:
235
+ type: mteb/biorxiv-clustering-s2s
236
+ name: MTEB BiorxivClusteringS2S
237
+ config: default
238
+ split: test
239
+ revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908
240
+ metrics:
241
+ - type: v_measure
242
+ value: 29.319217023090705
243
+ - task:
244
+ type: Retrieval
245
+ dataset:
246
+ type: BeIR/cqadupstack
247
+ name: MTEB CQADupstackEnglishRetrieval
248
+ config: default
249
+ split: test
250
+ revision: None
251
+ metrics:
252
+ - type: map_at_1
253
+ value: 22.528000000000002
254
+ - type: map_at_10
255
+ value: 30.751
256
+ - type: map_at_100
257
+ value: 31.855
258
+ - type: map_at_1000
259
+ value: 31.972
260
+ - type: map_at_3
261
+ value: 28.465
262
+ - type: map_at_5
263
+ value: 29.738
264
+ - type: mrr_at_1
265
+ value: 28.662
266
+ - type: mrr_at_10
267
+ value: 35.912
268
+ - type: mrr_at_100
269
+ value: 36.726
270
+ - type: mrr_at_1000
271
+ value: 36.777
272
+ - type: mrr_at_3
273
+ value: 34.013
274
+ - type: mrr_at_5
275
+ value: 35.156
276
+ - type: ndcg_at_1
277
+ value: 28.662
278
+ - type: ndcg_at_10
279
+ value: 35.452
280
+ - type: ndcg_at_100
281
+ value: 40.1
282
+ - type: ndcg_at_1000
283
+ value: 42.323
284
+ - type: ndcg_at_3
285
+ value: 32.112
286
+ - type: ndcg_at_5
287
+ value: 33.638
288
+ - type: precision_at_1
289
+ value: 28.662
290
+ - type: precision_at_10
291
+ value: 6.688
292
+ - type: precision_at_100
293
+ value: 1.13
294
+ - type: precision_at_1000
295
+ value: 0.16
296
+ - type: precision_at_3
297
+ value: 15.562999999999999
298
+ - type: precision_at_5
299
+ value: 11.019
300
+ - type: recall_at_1
301
+ value: 22.528000000000002
302
+ - type: recall_at_10
303
+ value: 43.748
304
+ - type: recall_at_100
305
+ value: 64.235
306
+ - type: recall_at_1000
307
+ value: 78.609
308
+ - type: recall_at_3
309
+ value: 33.937
310
+ - type: recall_at_5
311
+ value: 38.234
312
+ - task:
313
+ type: Retrieval
314
+ dataset:
315
+ type: climate-fever
316
+ name: MTEB ClimateFEVER
317
+ config: default
318
+ split: test
319
+ revision: None
320
+ metrics:
321
+ - type: map_at_1
322
+ value: 9.468
323
+ - type: map_at_10
324
+ value: 16.029
325
+ - type: map_at_100
326
+ value: 17.693
327
+ - type: map_at_1000
328
+ value: 17.886
329
+ - type: map_at_3
330
+ value: 13.15
331
+ - type: map_at_5
332
+ value: 14.568
333
+ - type: mrr_at_1
334
+ value: 21.173000000000002
335
+ - type: mrr_at_10
336
+ value: 31.028
337
+ - type: mrr_at_100
338
+ value: 32.061
339
+ - type: mrr_at_1000
340
+ value: 32.119
341
+ - type: mrr_at_3
342
+ value: 27.534999999999997
343
+ - type: mrr_at_5
344
+ value: 29.431
345
+ - type: ndcg_at_1
346
+ value: 21.173000000000002
347
+ - type: ndcg_at_10
348
+ value: 23.224
349
+ - type: ndcg_at_100
350
+ value: 30.225
351
+ - type: ndcg_at_1000
352
+ value: 33.961000000000006
353
+ - type: ndcg_at_3
354
+ value: 18.174
355
+ - type: ndcg_at_5
356
+ value: 19.897000000000002
357
+ - type: precision_at_1
358
+ value: 21.173000000000002
359
+ - type: precision_at_10
360
+ value: 7.4719999999999995
361
+ - type: precision_at_100
362
+ value: 1.5010000000000001
363
+ - type: precision_at_1000
364
+ value: 0.219
365
+ - type: precision_at_3
366
+ value: 13.312
367
+ - type: precision_at_5
368
+ value: 10.619
369
+ - type: recall_at_1
370
+ value: 9.468
371
+ - type: recall_at_10
372
+ value: 28.823
373
+ - type: recall_at_100
374
+ value: 53.26499999999999
375
+ - type: recall_at_1000
376
+ value: 74.536
377
+ - type: recall_at_3
378
+ value: 16.672
379
+ - type: recall_at_5
380
+ value: 21.302
381
+ - task:
382
+ type: Retrieval
383
+ dataset:
384
+ type: dbpedia-entity
385
+ name: MTEB DBPedia
386
+ config: default
387
+ split: test
388
+ revision: None
389
+ metrics:
390
+ - type: map_at_1
391
+ value: 6.343
392
+ - type: map_at_10
393
+ value: 12.717
394
+ - type: map_at_100
395
+ value: 16.48
396
+ - type: map_at_1000
397
+ value: 17.381
398
+ - type: map_at_3
399
+ value: 9.568999999999999
400
+ - type: map_at_5
401
+ value: 11.125
402
+ - type: mrr_at_1
403
+ value: 48.75
404
+ - type: mrr_at_10
405
+ value: 58.425000000000004
406
+ - type: mrr_at_100
407
+ value: 59.075
408
+ - type: mrr_at_1000
409
+ value: 59.095
410
+ - type: mrr_at_3
411
+ value: 56.291999999999994
412
+ - type: mrr_at_5
413
+ value: 57.679
414
+ - type: ndcg_at_1
415
+ value: 37.875
416
+ - type: ndcg_at_10
417
+ value: 27.77
418
+ - type: ndcg_at_100
419
+ value: 30.288999999999998
420
+ - type: ndcg_at_1000
421
+ value: 36.187999999999995
422
+ - type: ndcg_at_3
423
+ value: 31.385999999999996
424
+ - type: ndcg_at_5
425
+ value: 29.923
426
+ - type: precision_at_1
427
+ value: 48.75
428
+ - type: precision_at_10
429
+ value: 22.375
430
+ - type: precision_at_100
431
+ value: 6.3420000000000005
432
+ - type: precision_at_1000
433
+ value: 1.4489999999999998
434
+ - type: precision_at_3
435
+ value: 35.5
436
+ - type: precision_at_5
437
+ value: 30.55
438
+ - type: recall_at_1
439
+ value: 6.343
440
+ - type: recall_at_10
441
+ value: 16.936
442
+ - type: recall_at_100
443
+ value: 35.955999999999996
444
+ - type: recall_at_1000
445
+ value: 55.787
446
+ - type: recall_at_3
447
+ value: 10.771
448
+ - type: recall_at_5
449
+ value: 13.669999999999998
450
+ - task:
451
+ type: Classification
452
+ dataset:
453
+ type: mteb/emotion
454
+ name: MTEB EmotionClassification
455
+ config: default
456
+ split: test
457
+ revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37
458
+ metrics:
459
+ - type: accuracy
460
+ value: 41.99
461
+ - type: f1
462
+ value: 36.823402174564954
463
+ - task:
464
+ type: Retrieval
465
+ dataset:
466
+ type: fever
467
+ name: MTEB FEVER
468
+ config: default
469
+ split: test
470
+ revision: None
471
+ metrics:
472
+ - type: map_at_1
473
+ value: 40.088
474
+ - type: map_at_10
475
+ value: 52.69200000000001
476
+ - type: map_at_100
477
+ value: 53.296
478
+ - type: map_at_1000
479
+ value: 53.325
480
+ - type: map_at_3
481
+ value: 49.905
482
+ - type: map_at_5
483
+ value: 51.617000000000004
484
+ - type: mrr_at_1
485
+ value: 43.009
486
+ - type: mrr_at_10
487
+ value: 56.203
488
+ - type: mrr_at_100
489
+ value: 56.75
490
+ - type: mrr_at_1000
491
+ value: 56.769000000000005
492
+ - type: mrr_at_3
493
+ value: 53.400000000000006
494
+ - type: mrr_at_5
495
+ value: 55.163
496
+ - type: ndcg_at_1
497
+ value: 43.009
498
+ - type: ndcg_at_10
499
+ value: 59.39
500
+ - type: ndcg_at_100
501
+ value: 62.129999999999995
502
+ - type: ndcg_at_1000
503
+ value: 62.793
504
+ - type: ndcg_at_3
505
+ value: 53.878
506
+ - type: ndcg_at_5
507
+ value: 56.887
508
+ - type: precision_at_1
509
+ value: 43.009
510
+ - type: precision_at_10
511
+ value: 8.366
512
+ - type: precision_at_100
513
+ value: 0.983
514
+ - type: precision_at_1000
515
+ value: 0.105
516
+ - type: precision_at_3
517
+ value: 22.377
518
+ - type: precision_at_5
519
+ value: 15.035000000000002
520
+ - type: recall_at_1
521
+ value: 40.088
522
+ - type: recall_at_10
523
+ value: 76.68700000000001
524
+ - type: recall_at_100
525
+ value: 88.91
526
+ - type: recall_at_1000
527
+ value: 93.782
528
+ - type: recall_at_3
529
+ value: 61.809999999999995
530
+ - type: recall_at_5
531
+ value: 69.131
532
+ - task:
533
+ type: Retrieval
534
+ dataset:
535
+ type: fiqa
536
+ name: MTEB FiQA2018
537
+ config: default
538
+ split: test
539
+ revision: None
540
+ metrics:
541
+ - type: map_at_1
542
+ value: 10.817
543
+ - type: map_at_10
544
+ value: 18.9
545
+ - type: map_at_100
546
+ value: 20.448
547
+ - type: map_at_1000
548
+ value: 20.660999999999998
549
+ - type: map_at_3
550
+ value: 15.979
551
+ - type: map_at_5
552
+ value: 17.415
553
+ - type: mrr_at_1
554
+ value: 23.148
555
+ - type: mrr_at_10
556
+ value: 31.208000000000002
557
+ - type: mrr_at_100
558
+ value: 32.167
559
+ - type: mrr_at_1000
560
+ value: 32.242
561
+ - type: mrr_at_3
562
+ value: 28.498
563
+ - type: mrr_at_5
564
+ value: 29.964000000000002
565
+ - type: ndcg_at_1
566
+ value: 23.148
567
+ - type: ndcg_at_10
568
+ value: 25.325999999999997
569
+ - type: ndcg_at_100
570
+ value: 31.927
571
+ - type: ndcg_at_1000
572
+ value: 36.081
573
+ - type: ndcg_at_3
574
+ value: 21.647
575
+ - type: ndcg_at_5
576
+ value: 22.762999999999998
577
+ - type: precision_at_1
578
+ value: 23.148
579
+ - type: precision_at_10
580
+ value: 7.546
581
+ - type: precision_at_100
582
+ value: 1.415
583
+ - type: precision_at_1000
584
+ value: 0.216
585
+ - type: precision_at_3
586
+ value: 14.969
587
+ - type: precision_at_5
588
+ value: 11.327
589
+ - type: recall_at_1
590
+ value: 10.817
591
+ - type: recall_at_10
592
+ value: 32.164
593
+ - type: recall_at_100
594
+ value: 57.655
595
+ - type: recall_at_1000
596
+ value: 82.797
597
+ - type: recall_at_3
598
+ value: 19.709
599
+ - type: recall_at_5
600
+ value: 24.333
601
+ - task:
602
+ type: Retrieval
603
+ dataset:
604
+ type: hotpotqa
605
+ name: MTEB HotpotQA
606
+ config: default
607
+ split: test
608
+ revision: None
609
+ metrics:
610
+ - type: map_at_1
611
+ value: 25.380999999999997
612
+ - type: map_at_10
613
+ value: 33.14
614
+ - type: map_at_100
615
+ value: 33.948
616
+ - type: map_at_1000
617
+ value: 34.028000000000006
618
+ - type: map_at_3
619
+ value: 31.019999999999996
620
+ - type: map_at_5
621
+ value: 32.23
622
+ - type: mrr_at_1
623
+ value: 50.763000000000005
624
+ - type: mrr_at_10
625
+ value: 57.899
626
+ - type: mrr_at_100
627
+ value: 58.426
628
+ - type: mrr_at_1000
629
+ value: 58.457
630
+ - type: mrr_at_3
631
+ value: 56.093
632
+ - type: mrr_at_5
633
+ value: 57.116
634
+ - type: ndcg_at_1
635
+ value: 50.763000000000005
636
+ - type: ndcg_at_10
637
+ value: 41.656
638
+ - type: ndcg_at_100
639
+ value: 45.079
640
+ - type: ndcg_at_1000
641
+ value: 46.916999999999994
642
+ - type: ndcg_at_3
643
+ value: 37.834
644
+ - type: ndcg_at_5
645
+ value: 39.732
646
+ - type: precision_at_1
647
+ value: 50.763000000000005
648
+ - type: precision_at_10
649
+ value: 8.648
650
+ - type: precision_at_100
651
+ value: 1.135
652
+ - type: precision_at_1000
653
+ value: 0.13799999999999998
654
+ - type: precision_at_3
655
+ value: 23.105999999999998
656
+ - type: precision_at_5
657
+ value: 15.363
658
+ - type: recall_at_1
659
+ value: 25.380999999999997
660
+ - type: recall_at_10
661
+ value: 43.241
662
+ - type: recall_at_100
663
+ value: 56.745000000000005
664
+ - type: recall_at_1000
665
+ value: 69.048
666
+ - type: recall_at_3
667
+ value: 34.659
668
+ - type: recall_at_5
669
+ value: 38.406
670
+ - task:
671
+ type: Classification
672
+ dataset:
673
+ type: mteb/imdb
674
+ name: MTEB ImdbClassification
675
+ config: default
676
+ split: test
677
+ revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7
678
+ metrics:
679
+ - type: accuracy
680
+ value: 79.544
681
+ - type: ap
682
+ value: 73.82920133396664
683
+ - type: f1
684
+ value: 79.51048124883265
685
+ - task:
686
+ type: Retrieval
687
+ dataset:
688
+ type: msmarco
689
+ name: MTEB MSMARCO
690
+ config: default
691
+ split: dev
692
+ revision: None
693
+ metrics:
694
+ - type: map_at_1
695
+ value: 11.174000000000001
696
+ - type: map_at_10
697
+ value: 19.451999999999998
698
+ - type: map_at_100
699
+ value: 20.612
700
+ - type: map_at_1000
701
+ value: 20.703
702
+ - type: map_at_3
703
+ value: 16.444
704
+ - type: map_at_5
705
+ value: 18.083
706
+ - type: mrr_at_1
707
+ value: 11.447000000000001
708
+ - type: mrr_at_10
709
+ value: 19.808
710
+ - type: mrr_at_100
711
+ value: 20.958
712
+ - type: mrr_at_1000
713
+ value: 21.041999999999998
714
+ - type: mrr_at_3
715
+ value: 16.791
716
+ - type: mrr_at_5
717
+ value: 18.459
718
+ - type: ndcg_at_1
719
+ value: 11.447000000000001
720
+ - type: ndcg_at_10
721
+ value: 24.556
722
+ - type: ndcg_at_100
723
+ value: 30.637999999999998
724
+ - type: ndcg_at_1000
725
+ value: 33.14
726
+ - type: ndcg_at_3
727
+ value: 18.325
728
+ - type: ndcg_at_5
729
+ value: 21.278
730
+ - type: precision_at_1
731
+ value: 11.447000000000001
732
+ - type: precision_at_10
733
+ value: 4.215
734
+ - type: precision_at_100
735
+ value: 0.732
736
+ - type: precision_at_1000
737
+ value: 0.095
738
+ - type: precision_at_3
739
+ value: 8.052
740
+ - type: precision_at_5
741
+ value: 6.318
742
+ - type: recall_at_1
743
+ value: 11.174000000000001
744
+ - type: recall_at_10
745
+ value: 40.543
746
+ - type: recall_at_100
747
+ value: 69.699
748
+ - type: recall_at_1000
749
+ value: 89.403
750
+ - type: recall_at_3
751
+ value: 23.442
752
+ - type: recall_at_5
753
+ value: 30.536
754
+ - task:
755
+ type: Classification
756
+ dataset:
757
+ type: mteb/mtop_domain
758
+ name: MTEB MTOPDomainClassification (en)
759
+ config: en
760
+ split: test
761
+ revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
762
+ metrics:
763
+ - type: accuracy
764
+ value: 89.6671226630187
765
+ - type: f1
766
+ value: 89.57660424361246
767
+ - task:
768
+ type: Classification
769
+ dataset:
770
+ type: mteb/mtop_intent
771
+ name: MTEB MTOPIntentClassification (en)
772
+ config: en
773
+ split: test
774
+ revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
775
+ metrics:
776
+ - type: accuracy
777
+ value: 60.284997720018254
778
+ - type: f1
779
+ value: 40.30637400152823
780
+ - task:
781
+ type: Classification
782
+ dataset:
783
+ type: mteb/amazon_massive_intent
784
+ name: MTEB MassiveIntentClassification (en)
785
+ config: en
786
+ split: test
787
+ revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
788
+ metrics:
789
+ - type: accuracy
790
+ value: 63.33557498318763
791
+ - type: f1
792
+ value: 60.24039910680179
793
+ - task:
794
+ type: Classification
795
+ dataset:
796
+ type: mteb/amazon_massive_scenario
797
+ name: MTEB MassiveScenarioClassification (en)
798
+ config: en
799
+ split: test
800
+ revision: 7d571f92784cd94a019292a1f45445077d0ef634
801
+ metrics:
802
+ - type: accuracy
803
+ value: 72.37390719569603
804
+ - type: f1
805
+ value: 72.33097333477316
806
+ - task:
807
+ type: Clustering
808
+ dataset:
809
+ type: mteb/medrxiv-clustering-p2p
810
+ name: MTEB MedrxivClusteringP2P
811
+ config: default
812
+ split: test
813
+ revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73
814
+ metrics:
815
+ - type: v_measure
816
+ value: 34.68158939060552
817
+ - task:
818
+ type: Clustering
819
+ dataset:
820
+ type: mteb/medrxiv-clustering-s2s
821
+ name: MTEB MedrxivClusteringS2S
822
+ config: default
823
+ split: test
824
+ revision: 35191c8c0dca72d8ff3efcd72aa802307d469663
825
+ metrics:
826
+ - type: v_measure
827
+ value: 30.340061711905236
828
+ - task:
829
+ type: Reranking
830
+ dataset:
831
+ type: mteb/mind_small
832
+ name: MTEB MindSmallReranking
833
+ config: default
834
+ split: test
835
+ revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69
836
+ metrics:
837
+ - type: map
838
+ value: 32.01814326295803
839
+ - type: mrr
840
+ value: 33.20555240055367
841
+ - task:
842
+ type: Retrieval
843
+ dataset:
844
+ type: nfcorpus
845
+ name: MTEB NFCorpus
846
+ config: default
847
+ split: test
848
+ revision: None
849
+ metrics:
850
+ - type: map_at_1
851
+ value: 3.3910000000000005
852
+ - type: map_at_10
853
+ value: 7.7219999999999995
854
+ - type: map_at_100
855
+ value: 10.286
856
+ - type: map_at_1000
857
+ value: 11.668000000000001
858
+ - type: map_at_3
859
+ value: 5.552
860
+ - type: map_at_5
861
+ value: 6.468
862
+ - type: mrr_at_1
863
+ value: 34.365
864
+ - type: mrr_at_10
865
+ value: 42.555
866
+ - type: mrr_at_100
867
+ value: 43.295
868
+ - type: mrr_at_1000
869
+ value: 43.357
870
+ - type: mrr_at_3
871
+ value: 40.299
872
+ - type: mrr_at_5
873
+ value: 41.182
874
+ - type: ndcg_at_1
875
+ value: 31.424000000000003
876
+ - type: ndcg_at_10
877
+ value: 24.758
878
+ - type: ndcg_at_100
879
+ value: 23.677999999999997
880
+ - type: ndcg_at_1000
881
+ value: 33.377
882
+ - type: ndcg_at_3
883
+ value: 28.302
884
+ - type: ndcg_at_5
885
+ value: 26.342
886
+ - type: precision_at_1
887
+ value: 33.437
888
+ - type: precision_at_10
889
+ value: 19.256999999999998
890
+ - type: precision_at_100
891
+ value: 6.662999999999999
892
+ - type: precision_at_1000
893
+ value: 1.9900000000000002
894
+ - type: precision_at_3
895
+ value: 27.761000000000003
896
+ - type: precision_at_5
897
+ value: 23.715
898
+ - type: recall_at_1
899
+ value: 3.3910000000000005
900
+ - type: recall_at_10
901
+ value: 11.068
902
+ - type: recall_at_100
903
+ value: 25.878
904
+ - type: recall_at_1000
905
+ value: 60.19
906
+ - type: recall_at_3
907
+ value: 6.1690000000000005
908
+ - type: recall_at_5
909
+ value: 7.767
910
+ - task:
911
+ type: Retrieval
912
+ dataset:
913
+ type: nq
914
+ name: MTEB NQ
915
+ config: default
916
+ split: test
917
+ revision: None
918
+ metrics:
919
+ - type: map_at_1
920
+ value: 15.168000000000001
921
+ - type: map_at_10
922
+ value: 26.177
923
+ - type: map_at_100
924
+ value: 27.564
925
+ - type: map_at_1000
926
+ value: 27.628999999999998
927
+ - type: map_at_3
928
+ value: 22.03
929
+ - type: map_at_5
930
+ value: 24.276
931
+ - type: mrr_at_1
932
+ value: 17.439
933
+ - type: mrr_at_10
934
+ value: 28.205000000000002
935
+ - type: mrr_at_100
936
+ value: 29.357
937
+ - type: mrr_at_1000
938
+ value: 29.408
939
+ - type: mrr_at_3
940
+ value: 24.377
941
+ - type: mrr_at_5
942
+ value: 26.540000000000003
943
+ - type: ndcg_at_1
944
+ value: 17.41
945
+ - type: ndcg_at_10
946
+ value: 32.936
947
+ - type: ndcg_at_100
948
+ value: 39.196999999999996
949
+ - type: ndcg_at_1000
950
+ value: 40.892
951
+ - type: ndcg_at_3
952
+ value: 24.721
953
+ - type: ndcg_at_5
954
+ value: 28.615000000000002
955
+ - type: precision_at_1
956
+ value: 17.41
957
+ - type: precision_at_10
958
+ value: 6.199000000000001
959
+ - type: precision_at_100
960
+ value: 0.9690000000000001
961
+ - type: precision_at_1000
962
+ value: 0.11299999999999999
963
+ - type: precision_at_3
964
+ value: 11.790000000000001
965
+ - type: precision_at_5
966
+ value: 9.264
967
+ - type: recall_at_1
968
+ value: 15.168000000000001
969
+ - type: recall_at_10
970
+ value: 51.914
971
+ - type: recall_at_100
972
+ value: 79.804
973
+ - type: recall_at_1000
974
+ value: 92.75999999999999
975
+ - type: recall_at_3
976
+ value: 30.212
977
+ - type: recall_at_5
978
+ value: 39.204
979
+ - task:
980
+ type: Retrieval
981
+ dataset:
982
+ type: quora
983
+ name: MTEB QuoraRetrieval
984
+ config: default
985
+ split: test
986
+ revision: None
987
+ metrics:
988
+ - type: map_at_1
989
+ value: 67.306
990
+ - type: map_at_10
991
+ value: 80.634
992
+ - type: map_at_100
993
+ value: 81.349
994
+ - type: map_at_1000
995
+ value: 81.37299999999999
996
+ - type: map_at_3
997
+ value: 77.691
998
+ - type: map_at_5
999
+ value: 79.512
1000
+ - type: mrr_at_1
1001
+ value: 77.56
1002
+ - type: mrr_at_10
1003
+ value: 84.177
1004
+ - type: mrr_at_100
1005
+ value: 84.35000000000001
1006
+ - type: mrr_at_1000
1007
+ value: 84.353
1008
+ - type: mrr_at_3
1009
+ value: 83.003
1010
+ - type: mrr_at_5
1011
+ value: 83.799
1012
+ - type: ndcg_at_1
1013
+ value: 77.58
1014
+ - type: ndcg_at_10
1015
+ value: 84.782
1016
+ - type: ndcg_at_100
1017
+ value: 86.443
1018
+ - type: ndcg_at_1000
1019
+ value: 86.654
1020
+ - type: ndcg_at_3
1021
+ value: 81.67
1022
+ - type: ndcg_at_5
1023
+ value: 83.356
1024
+ - type: precision_at_1
1025
+ value: 77.58
1026
+ - type: precision_at_10
1027
+ value: 12.875
1028
+ - type: precision_at_100
1029
+ value: 1.503
1030
+ - type: precision_at_1000
1031
+ value: 0.156
1032
+ - type: precision_at_3
1033
+ value: 35.63
1034
+ - type: precision_at_5
1035
+ value: 23.483999999999998
1036
+ - type: recall_at_1
1037
+ value: 67.306
1038
+ - type: recall_at_10
1039
+ value: 92.64
1040
+ - type: recall_at_100
1041
+ value: 98.681
1042
+ - type: recall_at_1000
1043
+ value: 99.79
1044
+ - type: recall_at_3
1045
+ value: 83.682
1046
+ - type: recall_at_5
1047
+ value: 88.424
1048
+ - task:
1049
+ type: Clustering
1050
+ dataset:
1051
+ type: mteb/reddit-clustering
1052
+ name: MTEB RedditClustering
1053
+ config: default
1054
+ split: test
1055
+ revision: 24640382cdbf8abc73003fb0fa6d111a705499eb
1056
+ metrics:
1057
+ - type: v_measure
1058
+ value: 50.76319866126382
1059
+ - task:
1060
+ type: Clustering
1061
+ dataset:
1062
+ type: mteb/reddit-clustering-p2p
1063
+ name: MTEB RedditClusteringP2P
1064
+ config: default
1065
+ split: test
1066
+ revision: 282350215ef01743dc01b456c7f5241fa8937f16
1067
+ metrics:
1068
+ - type: v_measure
1069
+ value: 55.024711941648995
1070
+ - task:
1071
+ type: Retrieval
1072
+ dataset:
1073
+ type: scidocs
1074
+ name: MTEB SCIDOCS
1075
+ config: default
1076
+ split: test
1077
+ revision: None
1078
+ metrics:
1079
+ - type: map_at_1
1080
+ value: 3.9379999999999997
1081
+ - type: map_at_10
1082
+ value: 8.817
1083
+ - type: map_at_100
1084
+ value: 10.546999999999999
1085
+ - type: map_at_1000
1086
+ value: 10.852
1087
+ - type: map_at_3
1088
+ value: 6.351999999999999
1089
+ - type: map_at_5
1090
+ value: 7.453
1091
+ - type: mrr_at_1
1092
+ value: 19.400000000000002
1093
+ - type: mrr_at_10
1094
+ value: 27.371000000000002
1095
+ - type: mrr_at_100
1096
+ value: 28.671999999999997
1097
+ - type: mrr_at_1000
1098
+ value: 28.747
1099
+ - type: mrr_at_3
1100
+ value: 24.583
1101
+ - type: mrr_at_5
1102
+ value: 26.143
1103
+ - type: ndcg_at_1
1104
+ value: 19.400000000000002
1105
+ - type: ndcg_at_10
1106
+ value: 15.264
1107
+ - type: ndcg_at_100
1108
+ value: 22.63
1109
+ - type: ndcg_at_1000
1110
+ value: 28.559
1111
+ - type: ndcg_at_3
1112
+ value: 14.424999999999999
1113
+ - type: ndcg_at_5
1114
+ value: 12.520000000000001
1115
+ - type: precision_at_1
1116
+ value: 19.400000000000002
1117
+ - type: precision_at_10
1118
+ value: 7.8100000000000005
1119
+ - type: precision_at_100
1120
+ value: 1.854
1121
+ - type: precision_at_1000
1122
+ value: 0.329
1123
+ - type: precision_at_3
1124
+ value: 13.100000000000001
1125
+ - type: precision_at_5
1126
+ value: 10.68
1127
+ - type: recall_at_1
1128
+ value: 3.9379999999999997
1129
+ - type: recall_at_10
1130
+ value: 15.903
1131
+ - type: recall_at_100
1132
+ value: 37.645
1133
+ - type: recall_at_1000
1134
+ value: 66.86
1135
+ - type: recall_at_3
1136
+ value: 7.993
1137
+ - type: recall_at_5
1138
+ value: 10.885
1139
+ - task:
1140
+ type: STS
1141
+ dataset:
1142
+ type: mteb/sickr-sts
1143
+ name: MTEB SICK-R
1144
+ config: default
1145
+ split: test
1146
+ revision: a6ea5a8cab320b040a23452cc28066d9beae2cee
1147
+ metrics:
1148
+ - type: cos_sim_pearson
1149
+ value: 80.12689060151425
1150
+ - type: cos_sim_spearman
1151
+ value: 70.46515535094771
1152
+ - type: euclidean_pearson
1153
+ value: 77.17160003557223
1154
+ - type: euclidean_spearman
1155
+ value: 70.4651757047438
1156
+ - type: manhattan_pearson
1157
+ value: 77.18129609281937
1158
+ - type: manhattan_spearman
1159
+ value: 70.46610403752913
1160
+ - task:
1161
+ type: STS
1162
+ dataset:
1163
+ type: mteb/sts12-sts
1164
+ name: MTEB STS12
1165
+ config: default
1166
+ split: test
1167
+ revision: a0d554a64d88156834ff5ae9920b964011b16384
1168
+ metrics:
1169
+ - type: cos_sim_pearson
1170
+ value: 70.451157033355
1171
+ - type: cos_sim_spearman
1172
+ value: 63.99899601697852
1173
+ - type: euclidean_pearson
1174
+ value: 67.46985359967678
1175
+ - type: euclidean_spearman
1176
+ value: 64.00001637764805
1177
+ - type: manhattan_pearson
1178
+ value: 67.56534741780037
1179
+ - type: manhattan_spearman
1180
+ value: 64.06533893575366
1181
+ - task:
1182
+ type: STS
1183
+ dataset:
1184
+ type: mteb/sts13-sts
1185
+ name: MTEB STS13
1186
+ config: default
1187
+ split: test
1188
+ revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
1189
+ metrics:
1190
+ - type: cos_sim_pearson
1191
+ value: 77.65086614464292
1192
+ - type: cos_sim_spearman
1193
+ value: 78.20169706921848
1194
+ - type: euclidean_pearson
1195
+ value: 77.77758172155283
1196
+ - type: euclidean_spearman
1197
+ value: 78.20169706921848
1198
+ - type: manhattan_pearson
1199
+ value: 77.75077884860052
1200
+ - type: manhattan_spearman
1201
+ value: 78.16875216484164
1202
+ - task:
1203
+ type: STS
1204
+ dataset:
1205
+ type: mteb/sts14-sts
1206
+ name: MTEB STS14
1207
+ config: default
1208
+ split: test
1209
+ revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
1210
+ metrics:
1211
+ - type: cos_sim_pearson
1212
+ value: 76.26381598259717
1213
+ - type: cos_sim_spearman
1214
+ value: 70.78377709313477
1215
+ - type: euclidean_pearson
1216
+ value: 74.82646556532096
1217
+ - type: euclidean_spearman
1218
+ value: 70.78377658155212
1219
+ - type: manhattan_pearson
1220
+ value: 74.81784766108225
1221
+ - type: manhattan_spearman
1222
+ value: 70.79351454692176
1223
+ - task:
1224
+ type: STS
1225
+ dataset:
1226
+ type: mteb/sts15-sts
1227
+ name: MTEB STS15
1228
+ config: default
1229
+ split: test
1230
+ revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
1231
+ metrics:
1232
+ - type: cos_sim_pearson
1233
+ value: 79.00532026789739
1234
+ - type: cos_sim_spearman
1235
+ value: 80.02708383244838
1236
+ - type: euclidean_pearson
1237
+ value: 79.48345422610525
1238
+ - type: euclidean_spearman
1239
+ value: 80.02708383244838
1240
+ - type: manhattan_pearson
1241
+ value: 79.44519739854803
1242
+ - type: manhattan_spearman
1243
+ value: 79.98344094559687
1244
+ - task:
1245
+ type: STS
1246
+ dataset:
1247
+ type: mteb/sts16-sts
1248
+ name: MTEB STS16
1249
+ config: default
1250
+ split: test
1251
+ revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
1252
+ metrics:
1253
+ - type: cos_sim_pearson
1254
+ value: 77.32783048164805
1255
+ - type: cos_sim_spearman
1256
+ value: 78.79729961288045
1257
+ - type: euclidean_pearson
1258
+ value: 78.72111945793154
1259
+ - type: euclidean_spearman
1260
+ value: 78.79729904606872
1261
+ - type: manhattan_pearson
1262
+ value: 78.72464311117116
1263
+ - type: manhattan_spearman
1264
+ value: 78.822591248334
1265
+ - task:
1266
+ type: STS
1267
+ dataset:
1268
+ type: mteb/sts17-crosslingual-sts
1269
+ name: MTEB STS17 (en-en)
1270
+ config: en-en
1271
+ split: test
1272
+ revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
1273
+ metrics:
1274
+ - type: cos_sim_pearson
1275
+ value: 82.04318630630854
1276
+ - type: cos_sim_spearman
1277
+ value: 83.87886389259836
1278
+ - type: euclidean_pearson
1279
+ value: 83.40385877895086
1280
+ - type: euclidean_spearman
1281
+ value: 83.87886389259836
1282
+ - type: manhattan_pearson
1283
+ value: 83.46337128901547
1284
+ - type: manhattan_spearman
1285
+ value: 83.9723106941644
1286
+ - task:
1287
+ type: STS
1288
+ dataset:
1289
+ type: mteb/sts22-crosslingual-sts
1290
+ name: MTEB STS22 (en)
1291
+ config: en
1292
+ split: test
1293
+ revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
1294
+ metrics:
1295
+ - type: cos_sim_pearson
1296
+ value: 63.003511169944595
1297
+ - type: cos_sim_spearman
1298
+ value: 64.39318805580227
1299
+ - type: euclidean_pearson
1300
+ value: 65.4797990735967
1301
+ - type: euclidean_spearman
1302
+ value: 64.39318805580227
1303
+ - type: manhattan_pearson
1304
+ value: 65.44604544280844
1305
+ - type: manhattan_spearman
1306
+ value: 64.38742899984233
1307
+ - task:
1308
+ type: STS
1309
+ dataset:
1310
+ type: mteb/stsbenchmark-sts
1311
+ name: MTEB STSBenchmark
1312
+ config: default
1313
+ split: test
1314
+ revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
1315
+ metrics:
1316
+ - type: cos_sim_pearson
1317
+ value: 76.63101237585029
1318
+ - type: cos_sim_spearman
1319
+ value: 75.57446967644269
1320
+ - type: euclidean_pearson
1321
+ value: 76.93491768734478
1322
+ - type: euclidean_spearman
1323
+ value: 75.57446967644269
1324
+ - type: manhattan_pearson
1325
+ value: 76.92187567800636
1326
+ - type: manhattan_spearman
1327
+ value: 75.57239337194585
1328
+ - task:
1329
+ type: Reranking
1330
+ dataset:
1331
+ type: mteb/scidocs-reranking
1332
+ name: MTEB SciDocsRR
1333
+ config: default
1334
+ split: test
1335
+ revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
1336
+ metrics:
1337
+ - type: map
1338
+ value: 78.5376604868993
1339
+ - type: mrr
1340
+ value: 92.94422897364073
1341
+ - task:
1342
+ type: Retrieval
1343
+ dataset:
1344
+ type: scifact
1345
+ name: MTEB SciFact
1346
+ config: default
1347
+ split: test
1348
+ revision: None
1349
+ metrics:
1350
+ - type: map_at_1
1351
+ value: 38.872
1352
+ - type: map_at_10
1353
+ value: 50.417
1354
+ - type: map_at_100
1355
+ value: 51.202000000000005
1356
+ - type: map_at_1000
1357
+ value: 51.25999999999999
1358
+ - type: map_at_3
1359
+ value: 47.02
1360
+ - type: map_at_5
1361
+ value: 49.326
1362
+ - type: mrr_at_1
1363
+ value: 41.0
1364
+ - type: mrr_at_10
1365
+ value: 51.674
1366
+ - type: mrr_at_100
1367
+ value: 52.32599999999999
1368
+ - type: mrr_at_1000
1369
+ value: 52.376999999999995
1370
+ - type: mrr_at_3
1371
+ value: 48.778
1372
+ - type: mrr_at_5
1373
+ value: 50.744
1374
+ - type: ndcg_at_1
1375
+ value: 41.0
1376
+ - type: ndcg_at_10
1377
+ value: 56.027
1378
+ - type: ndcg_at_100
1379
+ value: 59.362
1380
+ - type: ndcg_at_1000
1381
+ value: 60.839
1382
+ - type: ndcg_at_3
1383
+ value: 50.019999999999996
1384
+ - type: ndcg_at_5
1385
+ value: 53.644999999999996
1386
+ - type: precision_at_1
1387
+ value: 41.0
1388
+ - type: precision_at_10
1389
+ value: 8.1
1390
+ - type: precision_at_100
1391
+ value: 0.987
1392
+ - type: precision_at_1000
1393
+ value: 0.11100000000000002
1394
+ - type: precision_at_3
1395
+ value: 20.444000000000003
1396
+ - type: precision_at_5
1397
+ value: 14.466999999999999
1398
+ - type: recall_at_1
1399
+ value: 38.872
1400
+ - type: recall_at_10
1401
+ value: 71.906
1402
+ - type: recall_at_100
1403
+ value: 86.367
1404
+ - type: recall_at_1000
1405
+ value: 98.0
1406
+ - type: recall_at_3
1407
+ value: 56.206
1408
+ - type: recall_at_5
1409
+ value: 65.05
1410
+ - task:
1411
+ type: PairClassification
1412
+ dataset:
1413
+ type: mteb/sprintduplicatequestions-pairclassification
1414
+ name: MTEB SprintDuplicateQuestions
1415
+ config: default
1416
+ split: test
1417
+ revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
1418
+ metrics:
1419
+ - type: cos_sim_accuracy
1420
+ value: 99.7039603960396
1421
+ - type: cos_sim_ap
1422
+ value: 90.40809844250262
1423
+ - type: cos_sim_f1
1424
+ value: 84.53181583031557
1425
+ - type: cos_sim_precision
1426
+ value: 87.56698821007502
1427
+ - type: cos_sim_recall
1428
+ value: 81.69999999999999
1429
+ - type: dot_accuracy
1430
+ value: 99.7039603960396
1431
+ - type: dot_ap
1432
+ value: 90.40809844250262
1433
+ - type: dot_f1
1434
+ value: 84.53181583031557
1435
+ - type: dot_precision
1436
+ value: 87.56698821007502
1437
+ - type: dot_recall
1438
+ value: 81.69999999999999
1439
+ - type: euclidean_accuracy
1440
+ value: 99.7039603960396
1441
+ - type: euclidean_ap
1442
+ value: 90.4080982863383
1443
+ - type: euclidean_f1
1444
+ value: 84.53181583031557
1445
+ - type: euclidean_precision
1446
+ value: 87.56698821007502
1447
+ - type: euclidean_recall
1448
+ value: 81.69999999999999
1449
+ - type: manhattan_accuracy
1450
+ value: 99.7
1451
+ - type: manhattan_ap
1452
+ value: 90.39771161966652
1453
+ - type: manhattan_f1
1454
+ value: 84.32989690721648
1455
+ - type: manhattan_precision
1456
+ value: 87.02127659574468
1457
+ - type: manhattan_recall
1458
+ value: 81.8
1459
+ - type: max_accuracy
1460
+ value: 99.7039603960396
1461
+ - type: max_ap
1462
+ value: 90.40809844250262
1463
+ - type: max_f1
1464
+ value: 84.53181583031557
1465
+ - task:
1466
+ type: Clustering
1467
+ dataset:
1468
+ type: mteb/stackexchange-clustering
1469
+ name: MTEB StackExchangeClustering
1470
+ config: default
1471
+ split: test
1472
+ revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259
1473
+ metrics:
1474
+ - type: v_measure
1475
+ value: 59.663210666678715
1476
+ - task:
1477
+ type: Clustering
1478
+ dataset:
1479
+ type: mteb/stackexchange-clustering-p2p
1480
+ name: MTEB StackExchangeClusteringP2P
1481
+ config: default
1482
+ split: test
1483
+ revision: 815ca46b2622cec33ccafc3735d572c266efdb44
1484
+ metrics:
1485
+ - type: v_measure
1486
+ value: 32.107791216468776
1487
+ - task:
1488
+ type: Reranking
1489
+ dataset:
1490
+ type: mteb/stackoverflowdupquestions-reranking
1491
+ name: MTEB StackOverflowDupQuestions
1492
+ config: default
1493
+ split: test
1494
+ revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
1495
+ metrics:
1496
+ - type: map
1497
+ value: 46.440691925067604
1498
+ - type: mrr
1499
+ value: 47.03390257618199
1500
+ - task:
1501
+ type: Summarization
1502
+ dataset:
1503
+ type: mteb/summeval
1504
+ name: MTEB SummEval
1505
+ config: default
1506
+ split: test
1507
+ revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c
1508
+ metrics:
1509
+ - type: cos_sim_pearson
1510
+ value: 31.067177519784074
1511
+ - type: cos_sim_spearman
1512
+ value: 31.234728424648967
1513
+ - type: dot_pearson
1514
+ value: 31.06717083018107
1515
+ - type: dot_spearman
1516
+ value: 31.234728424648967
1517
+ - task:
1518
+ type: Retrieval
1519
+ dataset:
1520
+ type: trec-covid
1521
+ name: MTEB TRECCOVID
1522
+ config: default
1523
+ split: test
1524
+ revision: None
1525
+ metrics:
1526
+ - type: map_at_1
1527
+ value: 0.136
1528
+ - type: map_at_10
1529
+ value: 0.767
1530
+ - type: map_at_100
1531
+ value: 3.3689999999999998
1532
+ - type: map_at_1000
1533
+ value: 8.613999999999999
1534
+ - type: map_at_3
1535
+ value: 0.369
1536
+ - type: map_at_5
1537
+ value: 0.514
1538
+ - type: mrr_at_1
1539
+ value: 48.0
1540
+ - type: mrr_at_10
1541
+ value: 63.908
1542
+ - type: mrr_at_100
1543
+ value: 64.615
1544
+ - type: mrr_at_1000
1545
+ value: 64.615
1546
+ - type: mrr_at_3
1547
+ value: 62.0
1548
+ - type: mrr_at_5
1549
+ value: 63.4
1550
+ - type: ndcg_at_1
1551
+ value: 44.0
1552
+ - type: ndcg_at_10
1553
+ value: 38.579
1554
+ - type: ndcg_at_100
1555
+ value: 26.409
1556
+ - type: ndcg_at_1000
1557
+ value: 26.858999999999998
1558
+ - type: ndcg_at_3
1559
+ value: 47.134
1560
+ - type: ndcg_at_5
1561
+ value: 43.287
1562
+ - type: precision_at_1
1563
+ value: 48.0
1564
+ - type: precision_at_10
1565
+ value: 40.400000000000006
1566
+ - type: precision_at_100
1567
+ value: 26.640000000000004
1568
+ - type: precision_at_1000
1569
+ value: 12.04
1570
+ - type: precision_at_3
1571
+ value: 52.666999999999994
1572
+ - type: precision_at_5
1573
+ value: 46.800000000000004
1574
+ - type: recall_at_1
1575
+ value: 0.136
1576
+ - type: recall_at_10
1577
+ value: 1.0070000000000001
1578
+ - type: recall_at_100
1579
+ value: 6.318
1580
+ - type: recall_at_1000
1581
+ value: 26.522000000000002
1582
+ - type: recall_at_3
1583
+ value: 0.41700000000000004
1584
+ - type: recall_at_5
1585
+ value: 0.606
1586
+ - task:
1587
+ type: Retrieval
1588
+ dataset:
1589
+ type: webis-touche2020
1590
+ name: MTEB Touche2020
1591
+ config: default
1592
+ split: test
1593
+ revision: None
1594
+ metrics:
1595
+ - type: map_at_1
1596
+ value: 1.9949999999999999
1597
+ - type: map_at_10
1598
+ value: 8.304
1599
+ - type: map_at_100
1600
+ value: 13.644
1601
+ - type: map_at_1000
1602
+ value: 15.43
1603
+ - type: map_at_3
1604
+ value: 4.788
1605
+ - type: map_at_5
1606
+ value: 6.22
1607
+ - type: mrr_at_1
1608
+ value: 22.448999999999998
1609
+ - type: mrr_at_10
1610
+ value: 37.658
1611
+ - type: mrr_at_100
1612
+ value: 38.491
1613
+ - type: mrr_at_1000
1614
+ value: 38.503
1615
+ - type: mrr_at_3
1616
+ value: 32.312999999999995
1617
+ - type: mrr_at_5
1618
+ value: 35.68
1619
+ - type: ndcg_at_1
1620
+ value: 21.429000000000002
1621
+ - type: ndcg_at_10
1622
+ value: 18.995
1623
+ - type: ndcg_at_100
1624
+ value: 32.029999999999994
1625
+ - type: ndcg_at_1000
1626
+ value: 44.852
1627
+ - type: ndcg_at_3
1628
+ value: 19.464000000000002
1629
+ - type: ndcg_at_5
1630
+ value: 19.172
1631
+ - type: precision_at_1
1632
+ value: 22.448999999999998
1633
+ - type: precision_at_10
1634
+ value: 17.143
1635
+ - type: precision_at_100
1636
+ value: 6.877999999999999
1637
+ - type: precision_at_1000
1638
+ value: 1.524
1639
+ - type: precision_at_3
1640
+ value: 21.769
1641
+ - type: precision_at_5
1642
+ value: 20.0
1643
+ - type: recall_at_1
1644
+ value: 1.9949999999999999
1645
+ - type: recall_at_10
1646
+ value: 13.395999999999999
1647
+ - type: recall_at_100
1648
+ value: 44.348
1649
+ - type: recall_at_1000
1650
+ value: 82.622
1651
+ - type: recall_at_3
1652
+ value: 5.896
1653
+ - type: recall_at_5
1654
+ value: 8.554
1655
+ - task:
1656
+ type: Classification
1657
+ dataset:
1658
+ type: mteb/toxic_conversations_50k
1659
+ name: MTEB ToxicConversationsClassification
1660
+ config: default
1661
+ split: test
1662
+ revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c
1663
+ metrics:
1664
+ - type: accuracy
1665
+ value: 67.9394
1666
+ - type: ap
1667
+ value: 12.943337263423334
1668
+ - type: f1
1669
+ value: 52.28243093094156
1670
+ - task:
1671
+ type: Classification
1672
+ dataset:
1673
+ type: mteb/tweet_sentiment_extraction
1674
+ name: MTEB TweetSentimentExtractionClassification
1675
+ config: default
1676
+ split: test
1677
+ revision: d604517c81ca91fe16a244d1248fc021f9ecee7a
1678
+ metrics:
1679
+ - type: accuracy
1680
+ value: 56.414827391058296
1681
+ - type: f1
1682
+ value: 56.666412409573105
1683
+ - task:
1684
+ type: Clustering
1685
+ dataset:
1686
+ type: mteb/twentynewsgroups-clustering
1687
+ name: MTEB TwentyNewsgroupsClustering
1688
+ config: default
1689
+ split: test
1690
+ revision: 6125ec4e24fa026cec8a478383ee943acfbd5449
1691
+ metrics:
1692
+ - type: v_measure
1693
+ value: 47.009746255495465
1694
+ - task:
1695
+ type: PairClassification
1696
+ dataset:
1697
+ type: mteb/twittersemeval2015-pairclassification
1698
+ name: MTEB TwitterSemEval2015
1699
+ config: default
1700
+ split: test
1701
+ revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
1702
+ metrics:
1703
+ - type: cos_sim_accuracy
1704
+ value: 84.02574953805807
1705
+ - type: cos_sim_ap
1706
+ value: 67.66599910763128
1707
+ - type: cos_sim_f1
1708
+ value: 63.491277990844985
1709
+ - type: cos_sim_precision
1710
+ value: 59.77172140694154
1711
+ - type: cos_sim_recall
1712
+ value: 67.70448548812665
1713
+ - type: dot_accuracy
1714
+ value: 84.02574953805807
1715
+ - type: dot_ap
1716
+ value: 67.66600090945406
1717
+ - type: dot_f1
1718
+ value: 63.491277990844985
1719
+ - type: dot_precision
1720
+ value: 59.77172140694154
1721
+ - type: dot_recall
1722
+ value: 67.70448548812665
1723
+ - type: euclidean_accuracy
1724
+ value: 84.02574953805807
1725
+ - type: euclidean_ap
1726
+ value: 67.6659842364448
1727
+ - type: euclidean_f1
1728
+ value: 63.491277990844985
1729
+ - type: euclidean_precision
1730
+ value: 59.77172140694154
1731
+ - type: euclidean_recall
1732
+ value: 67.70448548812665
1733
+ - type: manhattan_accuracy
1734
+ value: 84.0317100792752
1735
+ - type: manhattan_ap
1736
+ value: 67.66351692448987
1737
+ - type: manhattan_f1
1738
+ value: 63.48610948306178
1739
+ - type: manhattan_precision
1740
+ value: 57.11875131828729
1741
+ - type: manhattan_recall
1742
+ value: 71.45118733509234
1743
+ - type: max_accuracy
1744
+ value: 84.0317100792752
1745
+ - type: max_ap
1746
+ value: 67.66600090945406
1747
+ - type: max_f1
1748
+ value: 63.491277990844985
1749
+ - task:
1750
+ type: PairClassification
1751
+ dataset:
1752
+ type: mteb/twitterurlcorpus-pairclassification
1753
+ name: MTEB TwitterURLCorpus
1754
+ config: default
1755
+ split: test
1756
+ revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
1757
+ metrics:
1758
+ - type: cos_sim_accuracy
1759
+ value: 87.53832421314084
1760
+ - type: cos_sim_ap
1761
+ value: 83.11416594316626
1762
+ - type: cos_sim_f1
1763
+ value: 75.41118114347518
1764
+ - type: cos_sim_precision
1765
+ value: 73.12839059674504
1766
+ - type: cos_sim_recall
1767
+ value: 77.8410840776101
1768
+ - type: dot_accuracy
1769
+ value: 87.53832421314084
1770
+ - type: dot_ap
1771
+ value: 83.11416226342155
1772
+ - type: dot_f1
1773
+ value: 75.41118114347518
1774
+ - type: dot_precision
1775
+ value: 73.12839059674504
1776
+ - type: dot_recall
1777
+ value: 77.8410840776101
1778
+ - type: euclidean_accuracy
1779
+ value: 87.53832421314084
1780
+ - type: euclidean_ap
1781
+ value: 83.11416284455395
1782
+ - type: euclidean_f1
1783
+ value: 75.41118114347518
1784
+ - type: euclidean_precision
1785
+ value: 73.12839059674504
1786
+ - type: euclidean_recall
1787
+ value: 77.8410840776101
1788
+ - type: manhattan_accuracy
1789
+ value: 87.49369348391353
1790
+ - type: manhattan_ap
1791
+ value: 83.08066812574694
1792
+ - type: manhattan_f1
1793
+ value: 75.36561228603892
1794
+ - type: manhattan_precision
1795
+ value: 71.9202518363064
1796
+ - type: manhattan_recall
1797
+ value: 79.15768401601478
1798
+ - type: max_accuracy
1799
+ value: 87.53832421314084
1800
+ - type: max_ap
1801
+ value: 83.11416594316626
1802
+ - type: max_f1
1803
+ value: 75.41118114347518
1804
  ---
1805
+
1806
+ # lodestone-base-4096-v1
1807
+
1808
+ This new [sentence-transformers](https://www.SBERT.net) model from [Hum](https://www.hum.works/) maps long sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
1809
+
1810
+ ## Abstract
1811
+
1812
+ In the hopes of furthering Hum's overarching mission of increasing the accessibility and interconnectivity of human knowledge, this model was developed as part of a project intending to boost the maximum input sequence length of sentence embedding models by leveraging recent architectural advances in the design of transformer models such as the incorporation of FlashAttention, Attention with Linear Biases (ALiBi), and Gated Linear Units (GLU). These modifications and enhancements were implemented by the team at MosaicML who designed and constructed the pre-trained [`mosaic-bert-base-seqlen-2048`](https://huggingface.co/mosaicml/mosaic-bert-base-seqlen-2048) model, and more information regarding the details of their development and testing specifications can be found on the model card.
1813
+
1814
+ While the fine-tuning procedure followed during the course of this project loosely mirrors that of the original [Flax-sentence-embeddings](https://huggingface.co/flax-sentence-embeddings) team responsible for the creation of many other popular sentence-transformers models (e.g. [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2), [all-distilroberta-v1](https://huggingface.co/sentence-transformers/all-distilroberta-v1), and [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)), our methodology includes novel techniques for data loading, batch sampling, and model checkpointing intended to improve training efficiency with regard to memory allocation and data storage.
1815
+
1816
+ By combining these well-established and proven fine-tuning practices with novel advances in transformer architectural elements, our `lodestone-base-4096-v1` model achieves comparable performance metrics on standard text embedding evaluation benchmarks while supporting a longer, more robust input sequence length of 4096 and retaining a smaller, more manageable size that can be run on either a GPU or CPU.
1817
+
1818
+ ## Usage
1819
+
1820
+ Using this model is straightforward once you have [sentence-transformers](https://www.SBERT.net) installed.
1821
+ *At the time of publishing, sentence-transformers does not support remote code, which is required for the FlashAttention implementation used by the model. A fork of the sentence-transformers repository that allows remote code execution is provided for convenience. It can be installed using the following command:*
1822
+ ```
1823
+ pip install git+https://github.com/Hum-Works/sentence-transformers.git
1824
+ ```
1825
+
1826
+ Then you can use the model like this:
1827
+ ```python
1828
+ from sentence_transformers import SentenceTransformer
1829
+
1830
+ model = SentenceTransformer('lodestone-base-4096-v1', trust_remote_code=True, revision='v1.0.0')
1831
+ sentences = ["This is an example sentence", "Each sentence is converted"]
1832
+ embeddings = model.encode(sentences)
1833
+ print(embeddings)
1834
+ ```
1835
+ *Note: The model will use the OpenAI Triton implementation of FlashAttention if installed. This is more performant than the fallback torch implementation. Some platforms and GPUs may not be supported by Triton - up-to-date compatibility information can be found on [Triton’s github page](https://github.com/openai/triton#compatibility).*
1836
+
1837
+ ------
1838
+
1839
+ ## Background
1840
+
1841
+ The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised contrastive learning objective. We used the pretrained [`mosaic-bert-base-seqlen-2048`](https://huggingface.co/mosaicml/mosaic-bert-base-seqlen-2048) model and fine-tuned it on a dataset of nearly 1.5B sentence pairs. We use a contrastive learning objective: given a sentence from a pair, the model should predict which sentence, out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
1842
+
1843
+ ## Intended uses
1844
+
1845
+ Our model is intended to be used as a long sentence and paragraph encoder. Given an input text, it outputs a vector that captures its semantic information. The sentence vector may be used for information retrieval, clustering, or sentence similarity tasks, as in the sketch below.
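+
+ The minimal sketch below encodes a query and a small corpus using the same loading call shown in the Usage section and scores them with cosine similarity (since the model ends in a `Normalize()` module, cosine similarity and dot product coincide). The example texts are arbitrary placeholders.
+ ```python
+ from sentence_transformers import SentenceTransformer, util
+
+ model = SentenceTransformer('lodestone-base-4096-v1', trust_remote_code=True, revision='v1.0.0')
+
+ query = "How do transformer models handle long documents?"
+ corpus = [
+     "ALiBi lets attention extrapolate to sequence lengths longer than those seen during training.",
+     "The quick brown fox jumps over the lazy dog.",
+ ]
+
+ # Encode the query and candidate passages into 768-dimensional vectors.
+ query_embedding = model.encode(query, convert_to_tensor=True)
+ corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
+
+ # Cosine similarity between the query and each passage (higher = more similar).
+ scores = util.cos_sim(query_embedding, corpus_embeddings)
+ print(scores)
+ ```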
1846
+
1847
+ ## Training procedure
1848
+
1849
+ ### Pre-training
1850
+
1851
+ We use the pretrained [`mosaic-bert-base-seqlen-2048`](https://huggingface.co/mosaicml/mosaic-bert-base-seqlen-2048). Please refer to the model card for more detailed information about the pre-training procedure.
1852
+
1853
+ ### Fine-tuning
1854
+
1855
+ We fine-tune the model using a contrastive objective. Formally, we compute the dot product between every possible sentence pairing in the batch and then apply the cross entropy loss, treating each sentence's true pair as the correct class.
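+
+ The sketch below illustrates this in-batch objective (the same idea implemented by sentence-transformers' `MultipleNegativesRankingLoss`); the function and tensor names are illustrative, and this is not the actual training code.
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def in_batch_contrastive_loss(anchor_emb, positive_emb, scale=20.0):
+     """anchor_emb, positive_emb: (batch_size, dim) embeddings of paired sentences."""
+     # Score every anchor against every positive in the batch via dot product.
+     scores = anchor_emb @ positive_emb.T * scale   # (batch_size, batch_size)
+     # The true pair for row i sits in column i; all other columns act as in-batch negatives.
+     labels = torch.arange(scores.size(0), device=scores.device)
+     return F.cross_entropy(scores, labels)
+ ```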
1856
+
1857
+ #### Hyperparameters
1858
+
1859
+ We trained our model on an ml.g5.4xlarge EC2 instance with 1 NVIDIA A10G Tensor Core GPU. We trained the model for 1.4 million steps using a batch size of 16 and a learning rate warm-up over the first 500 steps. The sequence length during training was limited to 2048 tokens. We used the AdamW optimizer with a learning rate of 2e-5 and weight decay of 0.01 (i.e. the default parameter values for SentenceTransformer.fit()). The full training script is accessible in this repository: `Training.py`.
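+
+ For reference, a hypothetical `SentenceTransformer.fit()` call reflecting the hyperparameters above might look like the sketch below; the actual data streaming, batch sampling, and checkpointing logic lives in `Training.py`, and the placeholder training example is purely illustrative.
+ ```python
+ from sentence_transformers import SentenceTransformer, models, losses, InputExample
+ from torch.utils.data import DataLoader
+
+ # Assemble the model from the (locally configured) mosaic-bert-base-seqlen-2048 checkpoint.
+ # The modified sentence-transformers fork described above is assumed to handle its remote code.
+ word_embedding_model = models.Transformer('mosaic-bert-base-seqlen-2048', max_seq_length=2048)
+ pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='mean')
+ model = SentenceTransformer(modules=[word_embedding_model, pooling_model, models.Normalize()])
+
+ # Placeholder data; the real run streams nearly 1.5B sentence pairs from S3.
+ train_examples = [InputExample(texts=["example query", "example passage"])]
+ train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
+ train_loss = losses.MultipleNegativesRankingLoss(model)
+
+ model.fit(
+     train_objectives=[(train_dataloader, train_loss)],
+     warmup_steps=500,               # learning-rate warm-up
+     optimizer_params={'lr': 2e-5},  # AdamW is the default optimizer
+     weight_decay=0.01,
+ )
+ ```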
1860
+
1861
+ ## Model Architecture
1862
+ By incorporating FlashAttention, [Attention with Linear Biases (ALiBi)](https://arxiv.org/abs/2108.12409), and Gated Linear Units (GLU), this model is able to handle input sequences of up to 4096 tokens, 8x longer than those supported by most comparable sentence embedding models.
1863
+ The model was trained with a maximum sequence length of 2048, but the final model supports a maximum sequence length of 4096. This is accomplished by taking advantage of ALiBi’s positional attention extrapolation, which has been shown to allow sequence lengths of up to 2x the initially trained length.
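+
+ To make the extrapolation mechanism concrete, the sketch below shows how ALiBi constructs its linear attention biases: each head adds a distance-proportional penalty to its attention scores, and because the penalty is defined for arbitrary distances it extends naturally to positions beyond the trained length. This is an illustrative re-implementation, not the code used inside the model.
+ ```python
+ import torch
+
+ def alibi_bias(num_heads: int, seq_len: int) -> torch.Tensor:
+     """Return a (num_heads, seq_len, seq_len) tensor of linear position biases."""
+     # Head-specific slopes form a geometric sequence, as in the ALiBi paper
+     # (the paper uses a slightly adjusted scheme when num_heads is not a power of two).
+     slopes = torch.tensor([2.0 ** (-8.0 * (h + 1) / num_heads) for h in range(num_heads)])
+     # Symmetric relative distance between positions, as used in bidirectional encoders.
+     positions = torch.arange(seq_len)
+     distance = (positions[None, :] - positions[:, None]).abs()   # (seq_len, seq_len)
+     # The bias grows linearly with distance and is added to the attention scores
+     # (it is negative, so distant tokens are penalized).
+     return -slopes[:, None, None] * distance[None, :, :]
+
+ # The same function works unchanged past the trained length, e.g. alibi_bias(12, 4096)
+ # for a model trained at 2048.
+ ```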
1864
+
1865
+ ## Full Model Architecture
1866
+ ```
1867
+ SentenceTransformer(
1868
+ (0): Transformer({'max_seq_length': 4096, 'do_lower_case': False}) with Transformer model: BertModel
1869
+ (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
1870
+ (2): Normalize()
1871
+ )
1872
+ ```
1873
+
1874
+ #### Training data
1875
+
1876
+ We fine-tune our model on the concatenation of multiple datasets, totaling nearly 1.5 billion sentence pairs. Each dataset was sampled with a probability proportional to its relative contribution to the entire collection.
1877
+ The breakdown of the dataset can be seen below, and the entire dataset can be publicly accessed and loaded via the `Dataloading.ipynb` notebook located within this repository. A sketch of the proportional sampling scheme follows the table.
1878
+
1879
+ | Dataset | Paper | Number of training tuples |
1880
+ |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
1881
+ | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
1882
+ | **[S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts)** | [paper](https://aclanthology.org/2020.acl-main.447/) | 252,102,397 |
1883
+ | **[Reddit posts](https://huggingface.co/datasets/sentence-transformers/reddit-title-body) (Title, Body) pairs** | - | 127,445,911 |
1884
+ | **[Amazon reviews (2018)](https://huggingface.co/datasets/sentence-transformers/embedding-training-data) (Title, Review) pairs** | - | 87,877,725 |
1885
+ | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
1886
+ | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
1887
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
1888
+ | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
1889
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_title_body_jsonl) (Title, Body) pairs | - | 25,368,423 |
1890
+ | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
1891
+ | **[Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl) (Title, Most Upvoted Answer) pairs** | - | 4,784,250 |
1892
+ | **[Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl) (Title+Body, Most Upvoted Answer) pairs** | - | 4,551,660 |
1893
+ | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
1894
+ | **[Amazon QA](https://huggingface.co/datasets/sentence-transformers/embedding-training-data)** | - | 2,507,114 |
1895
+ | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,375,067 |
1896
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
1897
+ | **AG News (Title, Description) pairs of news articles from the AG News dataset** | - | 1,157,745 |
1898
+ | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
1899
+ | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
1900
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
1901
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
1902
+ | **[CC News](https://huggingface.co/datasets/sentence-transformers/embedding-training-data) (Title, article) pairs** | - | 614,664 |
1903
+ | **[NPR](https://huggingface.co/datasets/sentence-transformers/embedding-training-data) (Title, Body) pairs** | - | 594,384 |
1904
+ | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
1905
+ | **[MS Marco](https://microsoft.github.io/msmarco/) (Query, Answer Passage) pairs** | [paper](https://doi.org/10.1145/3404835.3462804) | 532,751 |
1906
+ | [Stack Exchange](https://docs.google.com/spreadsheets/d/1vXJrIg38cEaKjOG5y4I4PQwAQFUmCkohbViJ9zj_Emg/edit#gid=0) (Title, Body) pairs | - | 364,000 |
1907
+ | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
1908
+ | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
1909
+ | **[CNN & DailyMail](https://huggingface.co/datasets/sentence-transformers/embedding-training-data) (highlight sentences, article) pairs** | - | 311,971 |
1910
+ | [Stack Exchange](https://docs.google.com/spreadsheets/d/1vXJrIg38cEaKjOG5y4I4PQwAQFUmCkohbViJ9zj_Emg/edit#gid=0) Duplicate questions (titles) | - | 304,524 |
1911
+ | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
1912
+ | [Stack Exchange](https://docs.google.com/spreadsheets/d/1vXJrIg38cEaKjOG5y4I4PQwAQFUmCkohbViJ9zj_Emg/edit#gid=0) Duplicate questions (bodies) | - | 250,518 |
1913
+ | [Stack Exchange](https://docs.google.com/spreadsheets/d/1vXJrIg38cEaKjOG5y4I4PQwAQFUmCkohbViJ9zj_Emg/edit#gid=0) Duplicate questions (titles+bodies) | - | 250,459 |
1914
+ | **[XSUM](https://huggingface.co/datasets/sentence-transformers/embedding-training-data) (Summary, News Article) pairs** | - | 226,711 |
1915
+ | **[Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl) (Title+Body, Most Upvoted Answer, Most Downvoted Answer) triplets** | - | 216,454 |
1916
+ | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
1917
+ | **[FEVER](https://docs.google.com/spreadsheets/d/1vXJrIg38cEaKjOG5y4I4PQwAQFUmCkohbViJ9zj_Emg/edit#gid=0) training data** | - | 139,051 |
1918
+ | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
1919
+ | **[SearchQA](https://huggingface.co/datasets/search_qa) (Question, Top-Snippet)** | [paper](https://arxiv.org/abs/1704.05179) | 117,384 |
1920
+ | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
1921
+ | **[Quora Question Duplicates](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs)** | - | 103,663 |
1922
+ | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
1923
+ | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
1924
+ | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
1925
+ | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
1926
+ | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
1927
+ | **Total** | | **1,492,453,113** |
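+
+ As referenced above, the sketch below illustrates the proportional sampling scheme: each training batch draws its source dataset with probability proportional to that dataset's share of the total pair count, using the per-file record counts stored in `data_records.json`. The function name is illustrative, and this is not the actual `Training.py` logic.
+ ```python
+ import json
+ import random
+
+ with open('data_records.json') as f:
+     data_records = json.load(f)   # {filename: number of records}
+
+ files = list(data_records.keys())
+ total = sum(data_records.values())
+ weights = [data_records[name] / total for name in files]
+
+ def sample_source_file() -> str:
+     """Pick the dataset file that the next batch of pairs should be drawn from."""
+     return random.choices(files, weights=weights, k=1)[0]
+ ```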
1928
+
1929
+ #### Replication
1930
+
1931
+ The entire fine-tuning process for this model can be replicated by following the steps outlined in the `Replication.txt` file within this repository. This document explains how to modify the [sentence-transformers](https://www.SBERT.net) library, configure the pre-trained [`mosaic-bert-base-seqlen-2048`](https://huggingface.co/mosaicml/mosaic-bert-base-seqlen-2048) model, load all of the training data, and execute the training script.
1932
+
1933
+ #### Limitations
1934
+
1935
+ Due to technical constraints (e.g. limited GPU memory capacity), this model was trained with a smaller batch size of 16, so each training step was less well-informed than it would have been on a higher-performance system. This smaller-than-ideal batch size generally makes the model more likely to get stuck in a local minimum and slows convergence to the optimum. To counteract this risk, we trained the model for a larger number of steps than many of its contemporaries to improve the chance of achieving strong performance, but this remains an area that could be improved through further fine-tuning.
1936
+
1937
+ It is also worth noting that, while this model is able to handle longer input sequences of up to 4096 word pieces, the training dataset used consists of sentence and paragraph pairs and triplets which do not necessarily reach that maximum sequence length. Since the data was not tailored specifically for this larger input size, further fine-tuning may be required to ensure highly accurate embeddings for longer texts of that magnitude.
1938
+
1939
+ Finally, as stated on https://huggingface.co/datasets/sentence-transformers/reddit-title-body, an additional reminder and warning regarding the Reddit posts data is that one should "Be aware that this dataset is not filtered for biases, hate-speech, spam, racial slurs etc. It depicts the content as it is posted on Reddit." While we believe this has not induced any pathological behaviors in the model, given the relatively low prevalence of these records within the full dataset of nearly 1.5B sentence pairs and the fact that the model was trained to produce semantic embeddings rather than generative text outputs, it is always important to be aware of potential vulnerabilities to bias.
1940
+
Replication.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Lodestone Replication
2
+
3
+ The dataloading, library modification, model preparation, and training process can be replicated in a straightforward manner by running a few Jupyter notebooks and Python files.
4
+
5
+ Data Wrangling and Loading
6
+
7
+ Dataloading.ipynb uses the contents of GoogleSheets_datasets.tsv and HuggingFace_datasets.tsv to fetch data from the URLs at which the original distilroberta team host their curated datasets in cloud storage. The data is then streamed directly into the data folder of the lodestone-rnd S3 bucket in us-east-1. In addition to the data used by the distilroberta team and listed at https://docs.google.com/spreadsheets/d/1vXJrIg38cEaKjOG5y4I4PQwAQFUmCkohbViJ9zj_Emg/edit#gid=0, data was also collected from https://huggingface.co/datasets/sentence-transformers/embedding-training-data and the following HuggingFace dataset repositories:
8
+ Stack Exchange
9
+ https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_title_body_jsonl
10
+ https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl
11
+ https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl
12
+ https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl
13
+ Reddit
14
+ https://huggingface.co/datasets/sentence-transformers/reddit-title-body
15
+ All of the HuggingFace data is handled remotely or pulled via the script in Dataloading.ipynb, so the only files required for this entire process are Dataloading.ipynb, GoogleSheets_datasets.tsv, and HuggingFace_datasets.tsv. Running this notebook results in 679 objects and 310.5GB of data being loaded into S3.
16
+
17
+ Once the data is in S3, run Data_Records.ipynb to generate the data_records.json file which contains a dictionary of {filename: record count} pairs and is used throughout the Training.py script.
18
+
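+ For orientation, the rough Python sketch below shows one way the {filename: record count} dictionary could be computed over the gzipped JSON-lines files under s3://lodestone-rnd/data/. It is only an illustration; the exact logic in Data_Records.ipynb may differ, and the paginator-based listing and line counting are assumptions.
+
+ import json
+ import boto3
+ from smart_open import open as s3_open
+
+ s3 = boto3.client("s3")
+ records = {}
+ paginator = s3.get_paginator("list_objects_v2")
+ for page in paginator.paginate(Bucket="lodestone-rnd", Prefix="data/"):
+     for obj in page.get("Contents", []):
+         if obj["Key"].endswith("/"):
+             continue  # skip the folder marker itself
+         filename = obj["Key"].split("/")[-1]
+         uri = "s3://lodestone-rnd/" + obj["Key"]
+         # Count the JSON lines in each (optionally gzipped) file.
+         with s3_open(uri, transport_params={"client": s3}) as fIn:
+             records[filename] = sum(1 for _ in fIn)
+
+ with open("data_records.json", "w") as fOut:
+     json.dump(records, fOut)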
19
+ Library and Model Preparation
20
+
21
+ In order to run the training process with our specific model, we need to make a few custom modifications to the sentence-transformers library and to the config.json file of the mosaic-bert-base-seqlen-2048 base model.
22
+
23
+ To alter the sentence-transformers library, clone the repository from https://github.com/UKPLab/sentence-transformers locally, then replace the SentenceTransformer.py and Transformer.py files located within the sentence-transformers/sentence_transformers/ and sentence-transformers/sentence_transformers/models/ directories of the cloned repository, respectively, with those located inside the dev/ folder. (This has already been done in this notebook instance, but it will need to be repeated when training on another system.)
24
+
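+ As an illustration (assuming the repository has already been cloned into ./sentence-transformers alongside the dev/ folder), the file swap and editable install could be scripted roughly as follows; the manual copy described above works just as well.
+
+ import shutil
+ import subprocess
+
+ # Overwrite the stock modules with the modified versions from dev/.
+ shutil.copy("dev/SentenceTransformer.py",
+             "sentence-transformers/sentence_transformers/SentenceTransformer.py")
+ shutil.copy("dev/Transformer.py",
+             "sentence-transformers/sentence_transformers/models/Transformer.py")
+
+ # Install the modified library in editable mode, as Training.py does.
+ subprocess.run(["pip", "install", "-e", "./sentence-transformers"], check=True)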
25
+ Before conducting the actual training, we also need to clone the mosaic-bert-base-seqlen-2048 model locally and make a few small changes to its config.json file. Running Mosaic_Model.ipynb will execute this process and get the model ready for training. (Again, this has already been done in this notebook instance, but it will need to be repeated when training on another system.)
26
+
27
+ Training
28
+
29
+ To perform the final training run, open a SageMaker Terminal window and execute the following:
30
+ cd SageMaker
31
+ screen -S training
32
+ python Training.py
33
+ ^a d (that is, Ctrl + a, then d)
34
+
35
+ To reattach to the screen and observe how training is progressing, run `screen -r training` in the Terminal. Occasionally epochs may stall and require manual intervention to kickstart the process again. Pressing ^c (that is, Ctrl+c) inside the screen should suffice to get things going again, but this action will cause the currently stalled epoch to fail and training to proceed to the next epoch or data chunk without updating the existing model parameterization. Epoch successes and failures, along with the cumulative number of successfully completed steps, can be monitored via the train_logs.txt file, which is updated automatically throughout training.
36
+
37
+ The Training.py file could be reconfigured so that training hyperparameters are passed in through the command line, but, at present, hyperparameters should be set within the file before running it.
38
+
39
+ This concludes the steps required for replication of the Lodestone training process.
40
+
Training.py ADDED
@@ -0,0 +1,465 @@
1
+ # This training script is a duplicate of the Training.ipynb notebook but can be invoked from the terminal
2
+
3
+ import os
4
+ print(os.getcwd())
5
+ os.environ["PATH"]="/usr/local/cuda-11.7/bin:"+os.getenv("PATH")
6
+
7
+ os.system('pip uninstall -y torch')
8
+ os.system('pip uninstall -y einops')
9
+ os.system('pip uninstall -y transformers')
10
+ os.system('pip uninstall -y sentence_transformers')
11
+ os.system('pip uninstall -y datasets')
12
+ os.system('pip uninstall -y sagemaker')
13
+ os.system('pip uninstall -y smart_open')
14
+ os.system('pip uninstall -y pynvml')
15
+
16
+ os.system('pip install -r lodestone-reqs.txt')
17
+
18
+ os.system('pip install -e ./sentence-transformers')
19
+
20
+ os.system('pip uninstall -y triton')
21
+ os.system('pip install --no-deps triton==2.0.0.dev20221202')
22
+
23
+ #####
24
+
25
+ from pynvml import *
26
+ import math
27
+ from sentence_transformers import models, losses
28
+ from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
29
+ import logging
30
+ import os
31
+ import json
32
+ import torch
33
+ import boto3
34
+ from smart_open import open
35
+ import random
36
+ import time
37
+ import gc
38
+
39
+ os.environ["PATH"]="/usr/local/cuda-11.7/bin:"+os.getenv("PATH")
40
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
41
+
42
+ #####
43
+
44
+
45
+ def print_gpu_utilization():
46
+ "This helper function outputs the current GPU memory usage."
47
+ nvmlInit()
48
+ handle = nvmlDeviceGetHandleByIndex(0)
49
+ info = nvmlDeviceGetMemoryInfo(handle)
50
+ return f"GPU memory occupied: {info.used/1024**3} GB."
51
+
52
+ #####
53
+
54
+
55
+ class MultiDatasetDataLoader:
56
+ """
57
+ This custom dataloader class consumes a list of datasets and a batch size and produces batches randomly sampled
58
+ from the datasets provided where each batch consists of records from a single dataset and datasets are chosen
59
+ for batches in proportion to their total number of records.
60
+ """
61
+ def __init__(self, datasets, batch_size_pairs, batch_size_triplets=None, dataset_size_temp=-1, allow_swap=True):
62
+ self.allow_swap = allow_swap
63
+ self.batch_size_pairs = batch_size_pairs
64
+ self.batch_size_triplets = batch_size_pairs if batch_size_triplets is None else batch_size_triplets
+ # collate_fn is assigned later by SentenceTransformer.fit (smart_batching_collate); default to None so __iter__ also works standalone
+ self.collate_fn = None
65
+
66
+ # Compute dataset weights
67
+ self.dataset_lengths = list(map(len, datasets))
68
+ self.dataset_lengths_sum = sum(self.dataset_lengths)
69
+
70
+ weights = []
71
+ # if dataset_size_temp > 0: # Scale probability with dataset size
72
+ # for dataset in datasets:
73
+ # prob = len(dataset) / self.dataset_lengths_sum
74
+ # weights.append(max(1, int(math.pow(prob, 1 / dataset_size_temp) * 1000)))
75
+ # else: # Equal weighting of all datasets
76
+ # weights = [100] * len(datasets)
77
+ for dataset in datasets:
78
+ weights.append(len(dataset))
79
+
80
+ # logging.info("Dataset lengths and weights: {}".format(list(zip(self.dataset_lengths, weights))))
81
+
82
+ self.dataset_idx = []
83
+ self.dataset_idx_pointer = 0
84
+
85
+ for idx, weight in enumerate(weights):
86
+ self.dataset_idx.extend([idx] * weight)
87
+ random.shuffle(self.dataset_idx)
88
+
89
+ self.datasets = []
90
+ for dataset in datasets:
91
+ random.shuffle(dataset)
92
+ self.datasets.append({
93
+ 'elements': dataset,
94
+ 'pointer': 0,
95
+ })
96
+
97
+ def __iter__(self):
98
+ for _ in range(int(self.__len__())):
99
+ # Select dataset
100
+ if self.dataset_idx_pointer >= len(self.dataset_idx):
101
+ self.dataset_idx_pointer = 0
102
+ random.shuffle(self.dataset_idx)
103
+
104
+ dataset_idx = self.dataset_idx[self.dataset_idx_pointer]
105
+ self.dataset_idx_pointer += 1
106
+
107
+ # Select batch from this dataset
108
+ dataset = self.datasets[dataset_idx]
109
+ batch_size = self.batch_size_pairs if len(dataset['elements'][0].texts) == 2 else self.batch_size_triplets
110
+
111
+ batch = []
112
+ texts_in_batch = set()
113
+ guid_in_batch = set()
114
+ while len(batch) < batch_size:
115
+ example = dataset['elements'][dataset['pointer']]
116
+
117
+ valid_example = True
118
+ # First check if one of the texts is already in the batch
119
+ for text in example.texts:
120
+ text_norm = text.strip().lower()
121
+ if text_norm in texts_in_batch:
122
+ valid_example = False
123
+
124
+ texts_in_batch.add(text_norm)
125
+
126
+ # If the example has a label, check if label is in batch
127
+ if example.guid is not None:
128
+ valid_example = valid_example and example.guid not in guid_in_batch
129
+ guid_in_batch.add(example.guid)
130
+
131
+ if valid_example:
132
+ if self.allow_swap and random.random() > 0.5:
133
+ example.texts[0], example.texts[1] = example.texts[1], example.texts[0]
134
+
135
+ batch.append(example)
136
+
137
+ dataset['pointer'] += 1
138
+ if dataset['pointer'] >= len(dataset['elements']):
139
+ dataset['pointer'] = 0
140
+ random.shuffle(dataset['elements'])
141
+
142
+ yield self.collate_fn(batch) if self.collate_fn is not None else batch
143
+
144
+ def __len__(self):
145
+ return int(self.dataset_lengths_sum / self.batch_size_pairs)
146
+
147
+ #####
148
+
149
+
150
+ # These four classes of custom generators parse the raw data from the files in S3 and format it into InputExamples which can be properly interpreted by a SentenceTransformer model.
151
+
152
+ class RedditTitleBodyDataset:
153
+ def __init__(self, source_uri, max_seq_length):
154
+ self.source_uri = source_uri
155
+ self.s3_client = boto3.client("s3")
156
+ self.max_seq_length = max_seq_length
157
+
158
+ def __iter__(self):
159
+ while True:
160
+ for json_line in open(self.source_uri, transport_params={"client": self.s3_client}):
161
+ data_line = json.loads(json_line.strip())
162
+
163
+ if "title" in data_line and "body" in data_line:
164
+ data = {'guid': None, 'texts': [" ".join(data_line['title'].split(" ")[:self.max_seq_length]), " ".join(data_line['body'].split(" ")[:self.max_seq_length])]}
165
+ record = InputExample(guid=data.get('guid', None), texts=data['texts'])
166
+
167
+ yield record
168
+
169
+
170
+ class RedditYearDataset:
171
+ def __init__(self, source_uri, max_seq_length):
172
+ self.source_uri = source_uri
173
+ self.s3_client = boto3.client("s3")
174
+ self.max_seq_length = max_seq_length
175
+
176
+ def __iter__(self):
177
+ while True:
178
+ for json_line in open(self.source_uri, transport_params={"client": self.s3_client}):
179
+ data_line = json.loads(json_line.strip())
180
+
181
+ if "response" in data_line and "context" in data_line:
182
+ data = {'guid': None, 'texts': [" ".join(data_line['response'].split(" ")[:self.max_seq_length]), " ".join(data_line['context'].split(" ")[:self.max_seq_length])]}
183
+ record = InputExample(guid=data.get('guid', None), texts=data['texts'])
184
+
185
+ yield record
186
+
187
+
188
+ class HuggingFaceQueryPosDataset:
189
+ def __init__(self, source_uri, max_seq_length):
190
+ self.source_uri = source_uri
191
+ self.s3_client = boto3.client("s3")
192
+ self.max_seq_length = max_seq_length
193
+
194
+ def __iter__(self):
195
+ while True:
196
+ for json_line in open(self.source_uri, transport_params={"client": self.s3_client}):
197
+ data_line = json.loads(json_line.strip())
198
+
199
+ if "query" in data_line and "pos" in data_line:
200
+ for i in range(len(data_line['pos'])):
201
+ data = {'guid': None, 'texts': [" ".join(data_line['query'].split(" ")[:self.max_seq_length]), " ".join(data_line['pos'][i].split(" ")[:self.max_seq_length])]}
202
+ record = InputExample(guid=data.get('guid', None), texts=data['texts'])
203
+
204
+ yield record
205
+
206
+
207
+ class Dataset:
208
+ def __init__(self, source_uri, max_seq_length):
209
+ self.source_uri = source_uri
210
+ self.s3_client = boto3.client("s3")
211
+ self.max_seq_length = max_seq_length
212
+
213
+ def __iter__(self):
214
+ while True:
215
+ for json_line in open(self.source_uri, transport_params={"client": self.s3_client}):
216
+ data_line = json.loads(json_line.strip())
217
+
218
+ if not isinstance(data_line, dict):
219
+ data = {'guid': None, 'texts': data_line}
220
+ for text_idx in range(len(data['texts'])):
221
+ data['texts'][text_idx] = " ".join(data['texts'][text_idx].split(" ")[:self.max_seq_length])
222
+ record = InputExample(guid=data.get('guid', None), texts=data['texts'])
223
+ else:
224
+ for text_idx in range(len(data_line['texts'])):
225
+ data_line['texts'][text_idx] = " ".join(data_line['texts'][text_idx].split(" ")[:self.max_seq_length])
226
+ record = InputExample(guid=data_line.get('guid', None), texts=data_line['texts'])
227
+
228
+ yield record
229
+
230
+ #####
231
+
232
+
233
+ def build_generators(data_records, max_seq_length=512, testing=False):
234
+ """
235
+ This function consumes the data_records dictionary and creates a new dictionary of data generators where each entry is
236
+ of the form {filename: data generator object}.
237
+ """
238
+ if testing:
239
+ # filepaths = [file for file in list(data_records.keys()) if file.startswith('S2ORC') or file.startswith('reddit_')]
240
+ filepaths = [file for file in list(data_records.keys())][:3]
241
+ else:
242
+ filepaths = list(data_records.keys())
243
+ generators = {}
244
+ for filepath in filepaths:
245
+ filepath = filepath.strip()
246
+ source_uri = 's3://lodestone-rnd/data/'+filepath
247
+ if filepath in ['S2ORC_citations_abstracts.json.gz', 'amazon-qa.json.gz'] or 'reddit' in filepath:
248
+ if "title" in filepath:
249
+ generators[f'{filepath.split(".")[0]}'] = iter(RedditTitleBodyDataset(source_uri, max_seq_length))
250
+ elif "reddit" in filepath:
251
+ generators[f'{filepath.split(".")[0]}'] = iter(RedditYearDataset(source_uri, max_seq_length))
252
+ else:
253
+ generators[f'{filepath.split(".")[0]}'] = iter(HuggingFaceQueryPosDataset(source_uri, max_seq_length))
254
+ else:
255
+ generators[f'{filepath.split(".")[0]}'] = iter(Dataset(source_uri, max_seq_length))
256
+
257
+ return generators
258
+
259
+ #####
260
+
261
+
262
+ def produce_data(data_records, num_chunks, generators, batch_size, failed_on=None, first_iter=False, testing=False, temp=-1):
263
+ """
264
+ This function consumes the data_records dictionary, the number of chunks to break the datasets into, the dictionary of
265
+ data generators, and a batch size, and returns a MultiDatasetDataLoader (along with the total record count) which can
266
+ be fed into the .fit method of a SentenceTransformer model.
267
+ """
268
+ if testing:
269
+ # filepaths = [file for file in list(data_records.keys()) if file.startswith('S2ORC') or file.startswith('reddit_')]
270
+ filepaths = [file for file in list(data_records.keys())][:3]
271
+ else:
272
+ filepaths = list(data_records.keys())
273
+ datasets = []
274
+ for file_idx, filepath in enumerate(filepaths):
275
+ filepath = filepath.strip()
276
+ dataset = []
277
+
278
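+ # When resuming after a failure (failed_on is set), fast-forward this generator past the records consumed
+ # by the chunks that completed before the failed chunk, then materialize the current chunk.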
+ if failed_on is not None and failed_on != 1 and first_iter:
279
+ for k in range((failed_on-1)*max(1, data_records[filepath]//num_chunks)):
280
+ next(generators[f'{filepath.split(".")[0]}'])
281
+ for m in range(max(1, data_records[filepath]//num_chunks)):
282
+ dataset.append(next(generators[f'{filepath.split(".")[0]}']))
283
+ else:
284
+ for n in range(max(1, data_records[filepath]//num_chunks)):
285
+ dataset.append(next(generators[f'{filepath.split(".")[0]}']))
286
+
287
+ datasets.append(dataset)
288
+ logging.info("{}. {}: {}".format(file_idx+1, filepath, len(dataset)))
289
+
290
+ dataset_lengths_sum = sum(list(map(len, datasets)))
291
+
292
+ batch_size_pairs = batch_size_triplets = batch_size
293
+ # Special data loader to load from multiple datasets
294
+ train_dataloader = MultiDatasetDataLoader(datasets=datasets,
295
+ batch_size_pairs=batch_size_pairs,
296
+ batch_size_triplets=batch_size_triplets,
297
+ dataset_size_temp=temp)
298
+
299
+ return train_dataloader, dataset_lengths_sum
300
+
301
+ #####
302
+
303
+
304
+ def construct_model(model_name, max_seq_length=512):
305
+ """
306
+ This function constructs a SentenceTransformer model from a HuggingFace transformer model name
307
+ or from a local path to a transformer model repository.
308
+ """
309
+ word_embedding_model = models.Transformer(model_name_or_path=model_name,
310
+ max_seq_length=max_seq_length,
311
+ tokenizer_name_or_path='bert-base-uncased',
312
+ trust_remote_code=True,
313
+ model_args={'torch_dtype': torch.bfloat16})
314
+ pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
315
+ norm = models.Normalize()
316
+ model = SentenceTransformer(modules=[word_embedding_model, pooling_model, norm], device='cuda')
317
+ model[0].tokenizer.model_max_length = max_seq_length
318
+
319
+ return model
320
+
321
+ #####
322
+
323
+
324
+ # Just some code to print debug information to stdout
325
+ logging.basicConfig(format='%(asctime)s - %(message)s',
326
+ datefmt='%Y-%m-%d %H:%M:%S',
327
+ level=logging.INFO,
328
+ handlers=[LoggingHandler()])
329
+ # /print debug information to stdout
330
+
331
+ #####
332
+
333
+
334
+ # Set Hyperparameters
335
+ model_name = 'mosaic-bert-base-seqlen-2048'
336
+ # model_name = 'hum-lodestone-v1'
337
+ batch_size = 16
338
+ batch_size_pairs = batch_size_triplets = batch_size
339
+ max_seq_length = 2048
340
+ use_amp = False
341
+
342
+ num_cycles = 2
343
+ num_chunks = 50
344
+ num_epochs = 2
345
+ steps_per_epoch = 10000
346
+ # Total training steps = num_cycles * num_chunks * num_epochs * steps_per_epoch = 2 * 50 * 2 * 10,000 = 2,000,000 steps
347
+ warmup_steps = 500
348
+
349
+ testing = False
350
+ temp = -1
351
+
352
+ #####
353
+
354
+
355
+ output_path = 'hum-lodestone-v1'
356
+ logging.info("Output: "+output_path)
357
+
358
+ # Instantiate SentenceTransformer Model
359
+ model = construct_model(model_name=model_name, max_seq_length=max_seq_length)
360
+
361
+ # Load File Names and Record Volumes
362
+ with open('data_records.json') as fIn:
363
+ data_records = json.load(fIn)
364
+
365
+ total_pairs = sum(data_records.values())
366
+
367
+ logging.info("Total Training Pairs: {}".format(total_pairs))
368
+
369
+ # Initialize Data Generators
370
+ generators = build_generators(data_records=data_records,
371
+ max_seq_length=max_seq_length,
372
+ testing=testing)
373
+
374
+ logging.info("Data Generators Initialized")
375
+
376
+ # Define Training Loss Function
377
+ train_loss = losses.MultipleNegativesRankingLoss(model,
378
+ scale=20,
379
+ similarity_fct=util.dot_score)
380
+
381
+ logging.info(print_gpu_utilization())
382
+
383
+ #####
384
+
385
+
386
+ # Configure Training Cycles
387
+ failed_on = None # chunk that the process failed on
388
+ random.seed(42)
389
+ steps = 0
390
+ first_iter = True
391
+ for cycle_num in range(num_cycles):
392
+ logging.info("Starting Cycle {}".format(cycle_num+1))
393
+ for chunk_num in range(num_chunks):
394
+ if failed_on is not None and (chunk_num+1) < failed_on and (cycle_num+1) == 1:
395
+ pass
396
+ else:
397
+ logging.info("Chunk {}/{}".format(chunk_num+1, num_chunks))
398
+ logging.info("Loading {} Datasets".format(len([file for file in list(data_records.keys()) if file.startswith('S2ORC') or file.startswith('reddit_')]) if testing else len(data_records)))
399
+ # t_dataload0 = time.time()
400
+ # Create the training dataloader for the given chunk of data
401
+ train_dataloader, dataset_lengths_sum = produce_data(data_records,
402
+ num_chunks,
403
+ generators,
404
+ batch_size,
405
+ failed_on=failed_on,
406
+ first_iter=first_iter,
407
+ testing=testing,
408
+ temp=temp)
409
+ first_iter = False
410
+ # t_dataload1 = time.time()
411
+ # print(t_dataload1-t_dataload0)
412
+
413
+ logging.info(print_gpu_utilization())
414
+
415
+ # steps_per_epoch = dataset_lengths_sum // batch_size_pairs
416
+
417
+ for epoch_num in range(num_epochs):
418
+ logging.info("Performing Cycle {}, Chunk {}, Epoch {}".format(cycle_num+1, chunk_num+1, epoch_num+1))
419
+ try:
420
+ # t_fit0 = time.time()
421
+ # Train the model
422
+ model.fit(train_objectives=[(train_dataloader, train_loss)],
423
+ evaluator=None,
424
+ epochs=1,
425
+ warmup_steps=warmup_steps,
426
+ steps_per_epoch=steps_per_epoch,
427
+ use_amp=use_amp,
428
+ output_path=output_path)
429
+ # t_fit1 = time.time()
430
+ # print(t_fit1-t_fit0)
431
+
432
+ steps += steps_per_epoch
433
+
434
+ logging.info(print_gpu_utilization())
435
+ logging.info("Succeeded on Cycle {}, Chunk {}, Epoch {}".format(cycle_num+1, chunk_num+1, epoch_num+1))
436
+ logging.info("{} Steps Completed in Total".format(steps))
437
+
438
+ with open('train_logs.txt', 'a') as log:
439
+ log.write("Succeeded on Cycle {}, Chunk {}, Epoch {}: {} Steps Completed in Total\n".format(cycle_num+1, chunk_num+1, epoch_num+1, steps))
440
+
441
+ except:
442
+ logging.info("Failed on Cycle {}, Chunk {}, Epoch {}".format(cycle_num+1, chunk_num+1, epoch_num+1))
443
+
444
+ with open('train_logs.txt', 'a') as log:
445
+ log.write("Failed on Cycle {}, Chunk {}, Epoch {}: {} Steps Completed in Total\n".format(cycle_num+1, chunk_num+1, epoch_num+1, steps))
446
+
447
+ finally:
448
+ warmup_steps = 0
449
+
450
+ # Clear GPU/CUDA memory cache between data chunks
451
+ train_dataloader = None
452
+ model = None
453
+ train_loss = None
454
+
455
+ gc.collect()
456
+ torch.cuda.empty_cache()
457
+
458
+ # Reload the model and reinitialize the loss function
459
+ model = construct_model(model_name='hum-lodestone-v1', max_seq_length=max_seq_length)
460
+
461
+ train_loss = losses.MultipleNegativesRankingLoss(model,
462
+ scale=20,
463
+ similarity_fct=util.dot_score)
464
+
465
+ logging.info(print_gpu_utilization())
bert_layers.py ADDED
@@ -0,0 +1,1072 @@
1
+ # Copyright 2022 MosaicML Examples authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
5
+ # Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
6
+ # Copyright (c) 2022, Tri Dao.
7
+
8
+ """Implements Mosaic BERT, with an eye towards the Hugging Face API.
9
+
10
+ Mosaic BERT improves performance over Hugging Face BERT through the following:
11
+
12
+ 1. ALiBi. This architectural change removes positional embeddings and instead encodes positional
13
+ information through attention biases based on query-key position distance. It improves the effectiveness
14
+ of training with shorter sequence lengths by enabling extrapolation to longer sequences.
15
+
16
+ 2. Gated Linear Units (GLU). This architectural change replaces the FFN component of the BERT layer
17
+ to improve overall expressiveness, providing better convergence properties.
18
+
19
+ 3. Flash Attention. The Mosaic BERT's self-attention layer makes use of Flash Attention, which dramatically
20
+ improves the speed of self-attention. Our implementation utilizes a bleeding edge implementation that
21
+ supports attention biases, which allows us to use Flash Attention with ALiBi.
22
+
23
+ 4. Unpadding. Padding is often used to simplify batching across sequences of different lengths. Standard BERT
24
+ implementations waste computation on padded tokens. Mosaic BERT internally unpads to reduce unnecessary computation
25
+ and improve speed. It does this without changing how the user interfaces with the model, thereby
26
+ preserving the simple API of standard implementations.
27
+
28
+
29
+ Currently, Mosaic BERT is available for masked language modeling :class:`BertForMaskedLM` and sequence
30
+ classification :class:`BertForSequenceClassification`. We aim to expand this catalogue in future releases.
31
+
32
+ See :file:`./mosaic_bert.py` for utilities to simplify working with Mosaic BERT in Composer, and for example usage
33
+ of the core Mosaic BERT classes.
34
+ """
35
+
36
+ import copy
37
+ import logging
38
+ import math
39
+ import warnings
40
+ from typing import List, Optional, Tuple, Union
41
+
42
+ import torch
43
+ import torch.nn as nn
44
+ from einops import rearrange
45
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
46
+ from transformers.activations import ACT2FN
47
+ from transformers.modeling_outputs import (MaskedLMOutput,
48
+ SequenceClassifierOutput)
49
+ from transformers.models.bert.modeling_bert import BertPreTrainedModel
50
+
51
+ from .bert_padding import (index_first_axis,
52
+ index_put_first_axis, pad_input,
53
+ unpad_input, unpad_input_only)
54
+
55
+ try:
56
+ from .flash_attn_triton import flash_attn_qkvpacked_func
57
+ except ImportError as e:
58
+ flash_attn_qkvpacked_func = None
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+
63
+ class BertEmbeddings(nn.Module):
64
+ """Construct the embeddings for words, ignoring position.
65
+
66
+ There are no positional embeddings since we use ALiBi and token_type
67
+ embeddings.
68
+
69
+ This module is modeled after the Hugging Face BERT's
70
+ :class:`~transformers.model.bert.modeling_bert.BertEmbeddings`, but is
71
+ modified as part of Mosaic BERT's ALiBi implementation. The key change is
72
+ that position embeddings are removed. Position information instead comes
73
+ from attention biases that scale linearly with the position distance
74
+ between query and key tokens.
75
+
76
+ This module ignores the `position_ids` input to the `forward` method.
77
+ """
78
+
79
+ def __init__(self, config):
80
+ super().__init__()
81
+ self.word_embeddings = nn.Embedding(config.vocab_size,
82
+ config.hidden_size,
83
+ padding_idx=config.pad_token_id)
84
+ # ALiBi doesn't use position embeddings
85
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
86
+ config.hidden_size)
87
+
88
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model
89
+ # variable name and be able to load any TensorFlow checkpoint file
90
+ self.LayerNorm = nn.LayerNorm(config.hidden_size,
91
+ eps=config.layer_norm_eps)
92
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
93
+ self.register_buffer('token_type_ids',
94
+ torch.zeros(config.max_position_embeddings,
95
+ dtype=torch.long),
96
+ persistent=False)
97
+
98
+ def forward(
99
+ self,
100
+ input_ids: Optional[torch.LongTensor] = None,
101
+ token_type_ids: Optional[torch.LongTensor] = None,
102
+ position_ids: Optional[torch.LongTensor] = None,
103
+ inputs_embeds: Optional[torch.FloatTensor] = None,
104
+ past_key_values_length: int = 0,
105
+ ) -> torch.Tensor:
106
+ if (input_ids is not None) == (inputs_embeds is not None):
107
+ raise ValueError('Must specify either input_ids or input_embeds!')
108
+ if input_ids is not None:
109
+ input_shape = input_ids.size()
110
+ else:
111
+ assert inputs_embeds is not None # just for type checking
112
+ input_shape = inputs_embeds.size()[:-1]
113
+
114
+ seq_length = input_shape[1]
115
+
116
+ if position_ids is None:
117
+ # great! ALiBi
118
+ pass
119
+
120
+ # Setting the token_type_ids to the registered buffer in constructor
121
+ # where it is all zeros, which usually occurs when it's auto-generated;
122
+ # registered buffer helps users when tracing the model without passing
123
+ # token_type_ids, solves issue #5664
124
+ if token_type_ids is None:
125
+ if hasattr(self, 'token_type_ids'):
126
+ assert isinstance(self.token_type_ids, torch.LongTensor)
127
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
128
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(
129
+ input_shape[0], seq_length)
130
+ token_type_ids = buffered_token_type_ids_expanded # type: ignore
131
+ else:
132
+ token_type_ids = torch.zeros(input_shape, # type: ignore
133
+ dtype=torch.long,
134
+ device=self.word_embeddings.device) # type: ignore # yapf: disable
135
+
136
+ if inputs_embeds is None:
137
+ inputs_embeds = self.word_embeddings(input_ids)
138
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
139
+
140
+ embeddings = inputs_embeds + token_type_embeddings
141
+ # no position embeddings! ALiBi
142
+ embeddings = self.LayerNorm(embeddings)
143
+ embeddings = self.dropout(embeddings)
144
+ return embeddings
145
+
146
+
147
+ class BertUnpadSelfAttention(nn.Module):
148
+ """Performs multi-headed self attention on a batch of unpadded sequences.
149
+
150
+ If Triton is installed, this module uses Flash Attention to greatly improve throughput.
151
+ The Flash Attention implementation used in Mosaic BERT supports arbitrary attention biases (which
152
+ we use to implement ALiBi), but does not support attention dropout. If either Triton is not installed
153
+ or `config.attention_probs_dropout_prob > 0`, the implementation will default to a
154
+ math-equivalent pytorch version, which is much slower.
155
+
156
+ See `forward` method for additional detail.
157
+ """
158
+
159
+ def __init__(self, config):
160
+ super().__init__()
161
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
162
+ config, 'embedding_size'):
163
+ raise ValueError(
164
+ f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention '
165
+ f'heads ({config.num_attention_heads})')
166
+
167
+ self.num_attention_heads = config.num_attention_heads
168
+ self.attention_head_size = int(config.hidden_size /
169
+ config.num_attention_heads)
170
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
171
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
172
+ self.p_dropout = config.attention_probs_dropout_prob
173
+ self.Wqkv = nn.Linear(self.all_head_size, 3 * config.hidden_size)
174
+
175
+ # Warn if defaulting to pytorch because of import issues
176
+ if flash_attn_qkvpacked_func is None:
177
+ warnings.warn(
178
+ 'Unable to import Triton; defaulting MosaicBERT attention implementation to pytorch (this will reduce throughput when using this model).'
179
+ )
180
+
181
+ def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor,
182
+ max_seqlen_in_batch: int, indices: torch.Tensor,
183
+ attn_mask: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
184
+ """Perform self-attention.
185
+
186
+ If dropout is zero, then we can use the Triton kernel, so we do that. However, if not, we send through a standard PyTorch
187
+ implementation of self-attention.
188
+
189
+ The arguments are unpadded, and our implementations of attention require padded arguments,
190
+ so we first call `pad_input`. Once we compute attention, we re-unpad our outputs for the other layers.
191
+ The pad/unpad operations add overhead, but not sending pad tokens through ffs saves compute.
192
+ It is possible to write an unpadded implementation of attention (in Triton and PyTorch), which we will eventually do.
193
+
194
+ Args:
195
+ hidden_states: (total_nnz, dim)
196
+ cu_seqlens: (batch + 1,)
197
+ max_seqlen_in_batch: int
198
+ indices: (total_nnz,)
199
+ attn_mask: (batch, max_seqlen_in_batch)
200
+ bias: (batch, heads, max_seqlen_in_batch, max_seqlen_in_batch)
201
+
202
+ Returns:
203
+ attention: (total_nnz, dim)
204
+ """
205
+ qkv = self.Wqkv(hidden_states)
206
+ qkv = pad_input(qkv, indices, cu_seqlens.shape[0] - 1,
207
+ max_seqlen_in_batch) # batch, max_seqlen_in_batch, thd
208
+ qkv = rearrange(qkv,
209
+ 'b s (t h d) -> b s t h d',
210
+ t=3,
211
+ h=self.num_attention_heads)
212
+ if self.p_dropout or flash_attn_qkvpacked_func is None:
213
+ # if we have nonzero attention dropout (e.g. during fine-tuning) or no Triton, compute attention in PyTorch
214
+ q = qkv[:, :, 0, :, :].permute(0, 2, 1, 3) # b h s d
215
+ k = qkv[:, :, 1, :, :].permute(0, 2, 3, 1) # b h d s
216
+ v = qkv[:, :, 2, :, :].permute(0, 2, 1, 3) # b h s d
217
+ attention_scores = torch.matmul(q, k) / math.sqrt(
218
+ self.attention_head_size)
219
+ attention_scores = attention_scores + bias
220
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
221
+ attention_probs = self.dropout(attention_probs)
222
+ attention = torch.matmul(attention_probs, v).permute(0, 2, 1,
223
+ 3) # b s h d
224
+ else:
225
+ # Triton implementation only supports 0 attention dropout
226
+ convert_dtype = qkv.dtype not in [torch.float16, torch.bfloat16]
227
+ if convert_dtype:
228
+ # Triton implementation only supports fp16 and bf16
229
+ orig_dtype = qkv.dtype
230
+ qkv = qkv.to(torch.float16)
231
+ bias_dtype = bias.dtype
232
+ bias = bias.to(torch.float16)
233
+ attention = flash_attn_qkvpacked_func(qkv, bias)
234
+ attention = attention.to(orig_dtype)
235
+ bias = bias.to(bias_dtype)
236
+ else:
237
+ attention = flash_attn_qkvpacked_func(qkv, bias)
238
+
239
+ # attn_mask is 1 for attend and 0 for don't
240
+ attention = unpad_input_only(attention, torch.squeeze(attn_mask) == 1)
241
+ return rearrange(attention, 'nnz h d -> nnz (h d)')
242
+
243
+
244
+ # Copy of transformer's library BertSelfOutput that will not be caught by surgery methods looking for HF BERT modules.
245
+ class BertSelfOutput(nn.Module):
246
+ """Computes the output of the attention layer.
247
+
248
+ This module is modeled after the Hugging Face BERT's
249
+ :class:`~transformers.model.bert.modeling_bert.BertSelfOutput`.
250
+ The implementation is identical. Rather than use the original module
251
+ directly, we re-implement it here so that Mosaic BERT's modules will not
252
+ be affected by any Composer surgery algorithm that modifies Hugging Face
253
+ BERT modules.
254
+ """
255
+
256
+ def __init__(self, config):
257
+ super().__init__()
258
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
259
+ self.LayerNorm = nn.LayerNorm(config.hidden_size,
260
+ eps=config.layer_norm_eps)
261
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
262
+
263
+ def forward(self, hidden_states: torch.Tensor,
264
+ input_tensor: torch.Tensor) -> torch.Tensor:
265
+ hidden_states = self.dense(hidden_states)
266
+ hidden_states = self.dropout(hidden_states)
267
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
268
+ return hidden_states
269
+
270
+
271
+ class BertUnpadAttention(nn.Module):
272
+ """Chains attention, Dropout, and LayerNorm for Mosaic BERT."""
273
+
274
+ def __init__(self, config):
275
+ super().__init__()
276
+ self.self = BertUnpadSelfAttention(config)
277
+ self.output = BertSelfOutput(config)
278
+
279
+ def forward(
280
+ self,
281
+ input_tensor: torch.Tensor,
282
+ cu_seqlens: torch.Tensor,
283
+ max_s: int,
284
+ subset_idx: Optional[torch.Tensor] = None,
285
+ indices: Optional[torch.Tensor] = None,
286
+ attn_mask: Optional[torch.Tensor] = None,
287
+ bias: Optional[torch.Tensor] = None,
288
+ ) -> torch.Tensor:
289
+ """Forward pass for scaled self-attention without padding.
290
+
291
+ Arguments:
292
+ input_tensor: (total_nnz, dim)
293
+ cu_seqlens: (batch + 1,)
294
+ max_s: int
295
+ subset_idx: () set of indices whose values we care about at the end of the layer
296
+ (e.g., the masked tokens, if this is the final layer).
297
+ indices: None or (total_nnz,)
298
+ attn_mask: None or (batch, max_seqlen_in_batch)
299
+ bias: None or (batch, heads, max_seqlen_in_batch, max_seqlen_in_batch)
300
+ """
301
+ self_output = self.self(input_tensor, cu_seqlens, max_s, indices,
302
+ attn_mask, bias)
303
+ if subset_idx is not None:
304
+ return self.output(index_first_axis(self_output, subset_idx),
305
+ index_first_axis(input_tensor, subset_idx))
306
+ else:
307
+ return self.output(self_output, input_tensor)
308
+
309
+
310
+ class BertGatedLinearUnitMLP(nn.Module):
311
+ """Applies the FFN at the end of each Mosaic BERT layer.
312
+
313
+ Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
314
+ and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality, but
315
+ introduces Gated Linear Units.
316
+
317
+ Note: Mosaic BERT adds parameters in order to implement Gated Linear Units. To keep parameter count consistent with that of a
318
+ standard Hugging Face BERT, scale down `config.intermediate_size` by 2/3. For example, a Mosaic BERT constructed with
319
+ `config.intermediate_size=2048` will have the same parameter footprint as its Hugging Face BERT counterpart constructed
320
+ with the `config.intermediate_size=3072`.
321
+ However, in most cases it will not be necessary to adjust `config.intermediate_size` since, despite the increased
322
+ parameter size, Mosaic BERT typically offers a net higher throughput than a Hugging Face BERT built from the same `config`.
323
+ """
324
+
325
+ def __init__(self, config):
326
+ super().__init__()
327
+ self.config = config
328
+ self.gated_layers = nn.Linear(config.hidden_size,
329
+ config.intermediate_size * 2,
330
+ bias=False)
331
+ self.act = nn.GELU(approximate='none')
332
+ self.wo = nn.Linear(config.intermediate_size, config.hidden_size)
333
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
334
+ self.layernorm = nn.LayerNorm(config.hidden_size,
335
+ eps=config.layer_norm_eps)
336
+
337
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
338
+ """Compute new hidden states from current hidden states.
339
+
340
+ Args:
341
+ hidden_states (torch.Tensor): The (unpadded) hidden states from
342
+ the attention layer [nnz, dim].
343
+ """
344
+ residual_connection = hidden_states
345
+ # compute the activation
346
+ hidden_states = self.gated_layers(hidden_states)
347
+ gated = hidden_states[:, :self.config.intermediate_size]
348
+ non_gated = hidden_states[:, self.config.intermediate_size:]
349
+ hidden_states = self.act(gated) * non_gated
350
+ hidden_states = self.dropout(hidden_states)
351
+ # multiply by the second matrix
352
+ hidden_states = self.wo(hidden_states)
353
+ # add the residual connection and post-LN
354
+ hidden_states = self.layernorm(hidden_states + residual_connection)
355
+ return hidden_states
356
+
357
+
358
+ class BertLayer(nn.Module):
359
+ """Composes the Mosaic BERT attention and FFN blocks into a single layer."""
360
+
361
+ def __init__(self, config):
362
+ super(BertLayer, self).__init__()
363
+ self.attention = BertUnpadAttention(config)
364
+ self.mlp = BertGatedLinearUnitMLP(config)
365
+
366
+ def forward(
367
+ self,
368
+ hidden_states: torch.Tensor,
369
+ cu_seqlens: torch.Tensor,
370
+ seqlen: int,
371
+ subset_idx: Optional[torch.Tensor] = None,
372
+ indices: Optional[torch.Tensor] = None,
373
+ attn_mask: Optional[torch.Tensor] = None,
374
+ bias: Optional[torch.Tensor] = None,
375
+ ) -> torch.Tensor:
376
+ """Forward pass for a BERT layer, including both attention and MLP.
377
+
378
+ Args:
379
+ hidden_states: (total_nnz, dim)
380
+ cu_seqlens: (batch + 1,)
381
+ seqlen: int
382
+ subset_idx: () set of indices whose values we care about at the end of the layer
383
+ (e.g., the masked tokens, if this is the final layer).
384
+ indices: None or (total_nnz,)
385
+ attn_mask: None or (batch, max_seqlen_in_batch)
386
+ bias: None or (batch, heads, max_seqlen_in_batch, max_seqlen_in_batch)
387
+ """
388
+ attention_output = self.attention(hidden_states, cu_seqlens, seqlen,
389
+ subset_idx, indices, attn_mask, bias)
390
+ layer_output = self.mlp(attention_output)
391
+ return layer_output
392
+
393
+
394
+ class BertEncoder(nn.Module):
395
+ """A stack of BERT layers providing the backbone of Mosaic BERT.
396
+
397
+ This module is modeled after the Hugging Face BERT's :class:`~transformers.model.bert.modeling_bert.BertEncoder`,
398
+ but with substantial modifications to implement unpadding and ALiBi.
399
+
400
+ Compared to the analogous Hugging Face BERT module, this module handles unpadding to reduce unnecessary computation
401
+ at padded tokens, and pre-computes attention biases to implement ALiBi.
402
+ """
403
+
404
+ def __init__(self, config):
405
+ super().__init__()
406
+ layer = BertLayer(config)
407
+ self.layer = nn.ModuleList(
408
+ [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
409
+
410
+ self.num_attention_heads = config.num_attention_heads
411
+
412
+ # The alibi mask will be dynamically expanded if it is too small for
413
+ # the input the model receives. But it generally helps to initialize it
414
+ # to a reasonably large size to help pre-allocate CUDA memory.
415
+ # The default `alibi_starting_size` is 512.
416
+ self._current_alibi_size = int(config.alibi_starting_size)
417
+ self.alibi = torch.zeros(
418
+ (1, self.num_attention_heads, self._current_alibi_size,
419
+ self._current_alibi_size))
420
+ self.rebuild_alibi_tensor(size=config.alibi_starting_size)
421
+
422
+ def rebuild_alibi_tensor(self,
423
+ size: int,
424
+ device: Optional[Union[torch.device, str]] = None):
425
+ # Alibi
426
+ # Following https://github.com/ofirpress/attention_with_linear_biases/issues/5 (Implementation 1)
427
+ # In the causal case, you can exploit the fact that softmax is invariant to a uniform translation
428
+ # of the logits, which makes the math work out *after* applying causal masking. If no causal masking
429
+ # will be applied, it is necessary to construct the diagonal mask.
430
+ n_heads = self.num_attention_heads
431
+
432
+ def _get_alibi_head_slopes(n_heads: int) -> List[float]:
433
+
434
+ def get_slopes_power_of_2(n_heads: int) -> List[float]:
435
+ start = (2**(-2**-(math.log2(n_heads) - 3)))
436
+ ratio = start
437
+ return [start * ratio**i for i in range(n_heads)]
438
+
439
+ # In the paper, they only train models that have 2^a heads for some a. This function
440
+ # has some good properties that only occur when the input is a power of 2. To
441
+ # maintain that even when the number of heads is not a power of 2, we use a
442
+ # workaround.
443
+ if math.log2(n_heads).is_integer():
444
+ return get_slopes_power_of_2(n_heads)
445
+
446
+ closest_power_of_2 = 2**math.floor(math.log2(n_heads))
447
+ slopes_a = get_slopes_power_of_2(closest_power_of_2)
448
+ slopes_b = _get_alibi_head_slopes(2 * closest_power_of_2)
449
+ slopes_b = slopes_b[0::2][:n_heads - closest_power_of_2]
450
+ return slopes_a + slopes_b
451
+
452
+ context_position = torch.arange(size, device=device)[:, None]
453
+ memory_position = torch.arange(size, device=device)[None, :]
454
+ relative_position = torch.abs(memory_position - context_position)
455
+ # [n_heads, max_token_length, max_token_length]
456
+ relative_position = relative_position.unsqueeze(0).expand(
457
+ n_heads, -1, -1)
458
+ slopes = torch.Tensor(_get_alibi_head_slopes(n_heads)).to(device)
459
+ alibi = slopes.unsqueeze(1).unsqueeze(1) * -relative_position
460
+ # [1, n_heads, max_token_length, max_token_length]
461
+ alibi = alibi.unsqueeze(0)
462
+ assert alibi.shape == torch.Size([1, n_heads, size, size])
463
+
464
+ self._current_alibi_size = size
465
+ self.alibi = alibi
466
+
467
+ def forward(
468
+ self,
469
+ hidden_states: torch.Tensor,
470
+ attention_mask: torch.Tensor,
471
+ output_all_encoded_layers: Optional[bool] = True,
472
+ subset_mask: Optional[torch.Tensor] = None,
473
+ ) -> List[torch.Tensor]:
474
+
475
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
476
+ extended_attention_mask = extended_attention_mask.to(
477
+ dtype=next(self.parameters()).dtype) # fp16 compatibility
478
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
479
+
480
+ attention_mask_bool = attention_mask.bool()
481
+ batch, seqlen = hidden_states.shape[:2]
482
+ # Unpad inputs and mask. It will remove tokens that are padded.
483
+ # Assume ntokens is total number of tokens (padded and non-padded)
484
+ # and ntokens_unpad is total number of non-padded tokens.
485
+ # Then unpadding performs the following compression of the inputs:
486
+ # hidden_states[ntokens,hidden] -> hidden_states[ntokens_unpad,hidden]
487
+ hidden_states, indices, cu_seqlens, _ = unpad_input(
488
+ hidden_states, attention_mask_bool)
489
+
490
+ # Add alibi matrix to extended_attention_mask
491
+ if self._current_alibi_size < seqlen:
492
+ # Rebuild the alibi tensor when needed
493
+ warnings.warn(
494
+ f'Increasing alibi size from {self._current_alibi_size} to {seqlen}'
495
+ )
496
+ self.rebuild_alibi_tensor(size=seqlen, device=hidden_states.device)
497
+ elif self.alibi.device != hidden_states.device:
498
+ # Device catch-up
499
+ self.alibi = self.alibi.to(hidden_states.device)
500
+ alibi_bias = self.alibi[:, :, :seqlen, :seqlen]
501
+ attn_bias = extended_attention_mask[:, :, :seqlen, :seqlen]
502
+ alibi_attn_mask = attn_bias + alibi_bias
503
+
504
+ all_encoder_layers = []
505
+ if subset_mask is None:
506
+ for layer_module in self.layer:
507
+ hidden_states = layer_module(hidden_states,
508
+ cu_seqlens,
509
+ seqlen,
510
+ None,
511
+ indices,
512
+ attn_mask=attention_mask,
513
+ bias=alibi_attn_mask)
514
+ if output_all_encoded_layers:
515
+ all_encoder_layers.append(hidden_states)
516
+ # Pad inputs and mask. It will insert back zero-padded tokens.
517
+ # Assume ntokens is total number of tokens (padded and non-padded)
518
+ # and ntokens_unpad is total number of non-padded tokens.
519
+ # Then padding performs the following de-compression:
520
+ # hidden_states[ntokens_unpad,hidden] -> hidden_states[ntokens,hidden]
521
+ hidden_states = pad_input(hidden_states, indices, batch, seqlen)
522
+ else:
523
+ for i in range(len(self.layer) - 1):
524
+ layer_module = self.layer[i]
525
+ hidden_states = layer_module(hidden_states,
526
+ cu_seqlens,
527
+ seqlen,
528
+ None,
529
+ indices,
530
+ attn_mask=attention_mask,
531
+ bias=alibi_attn_mask)
532
+ if output_all_encoded_layers:
533
+ all_encoder_layers.append(hidden_states)
534
+ subset_idx = torch.nonzero(subset_mask[attention_mask_bool],
535
+ as_tuple=False).flatten()
536
+ hidden_states = self.layer[-1](hidden_states,
537
+ cu_seqlens,
538
+ seqlen,
539
+ subset_idx=subset_idx,
540
+ indices=indices,
541
+ attn_mask=attention_mask,
542
+ bias=alibi_attn_mask)
543
+
544
+ if not output_all_encoded_layers:
545
+ all_encoder_layers.append(hidden_states)
546
+ return all_encoder_layers
547
+
548
+
549
+ class BertPooler(nn.Module):
550
+
551
+ def __init__(self, config):
552
+ super(BertPooler, self).__init__()
553
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
554
+ self.activation = nn.Tanh()
555
+
556
+ def forward(self,
557
+ hidden_states: torch.Tensor,
558
+ pool: Optional[bool] = True) -> torch.Tensor:
559
+ # We "pool" the model by simply taking the hidden state corresponding
560
+ # to the first token.
561
+ first_token_tensor = hidden_states[:, 0] if pool else hidden_states
562
+ pooled_output = self.dense(first_token_tensor)
563
+ pooled_output = self.activation(pooled_output)
564
+ return pooled_output
565
+
566
+
567
+ class BertPredictionHeadTransform(nn.Module):
568
+
569
+ def __init__(self, config):
570
+ super().__init__()
571
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
572
+ if isinstance(config.hidden_act, str):
573
+ self.transform_act_fn = ACT2FN[config.hidden_act]
574
+ else:
575
+ self.transform_act_fn = config.hidden_act
576
+ self.LayerNorm = torch.nn.LayerNorm(config.hidden_size, eps=1e-12)
577
+
578
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
579
+ hidden_states = self.dense(hidden_states)
580
+ hidden_states = self.transform_act_fn(hidden_states)
581
+ hidden_states = self.LayerNorm(hidden_states)
582
+ return hidden_states
583
+
584
+
585
+ class BertModel(BertPreTrainedModel):
586
+ """Overall BERT model.
587
+
588
+ Args:
589
+ config: a BertConfig class instance with the configuration to build a new model
590
+
591
+ Inputs:
592
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
593
+ with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
594
+ `extract_features.py`, `run_classifier.py` and `run_squad.py`)
595
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
596
+ types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
597
+ a `sentence B` token (see BERT paper for more details).
598
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
599
+ selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
600
+ input sequence length in the current batch. It's the mask that we typically use for attention when
601
+ a batch has varying length sentences.
602
+ `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
603
+
604
+ Outputs: Tuple of (encoded_layers, pooled_output)
605
+ `encoded_layers`: controlled by `output_all_encoded_layers` argument:
606
+ - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
607
+ of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
608
+ encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
609
+ - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
610
+ to the last attention block of shape [batch_size, sequence_length, hidden_size],
611
+ `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
612
+ classifier pretrained on top of the hidden state associated to the first character of the
613
+ input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
614
+
615
+ Example usage:
616
+ ```python
617
+ # Already been converted into WordPiece token ids
618
+ input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
619
+ input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
620
+ token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
621
+ config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
622
+ num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
623
+ model = BertModel(config=config)
624
+ all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
625
+ ```
626
+ """
627
+
628
+ def __init__(self, config, add_pooling_layer=True):
629
+ super(BertModel, self).__init__(config)
630
+ self.embeddings = BertEmbeddings(config)
631
+ self.encoder = BertEncoder(config)
632
+ self.pooler = BertPooler(config) if add_pooling_layer else None
633
+ self.post_init()
634
+
635
+ def get_input_embeddings(self):
636
+ return self.embeddings.word_embeddings
637
+
638
+ def set_input_embeddings(self, value):
639
+ self.embeddings.word_embeddings = value
640
+
641
+ def forward(
642
+ self,
643
+ input_ids: torch.Tensor,
644
+ token_type_ids: Optional[torch.Tensor] = None,
645
+ attention_mask: Optional[torch.Tensor] = None,
646
+ position_ids: Optional[torch.Tensor] = None,
647
+ output_all_encoded_layers: Optional[bool] = False,
648
+ masked_tokens_mask: Optional[torch.Tensor] = None,
649
+ **kwargs
650
+ ) -> Tuple[Union[List[torch.Tensor], torch.Tensor], Optional[torch.Tensor]]:
651
+ if attention_mask is None:
652
+ attention_mask = torch.ones_like(input_ids)
653
+ if token_type_ids is None:
654
+ token_type_ids = torch.zeros_like(input_ids)
655
+
656
+ embedding_output = self.embeddings(input_ids, token_type_ids,
657
+ position_ids)
658
+
659
+ subset_mask = []
660
+ first_col_mask = []
661
+
662
+ if masked_tokens_mask is None:
663
+ subset_mask = None
664
+ else:
665
+ first_col_mask = torch.zeros_like(masked_tokens_mask)
666
+ first_col_mask[:, 0] = True
667
+ subset_mask = masked_tokens_mask | first_col_mask
668
+
669
+ encoder_outputs = self.encoder(
670
+ embedding_output,
671
+ attention_mask,
672
+ output_all_encoded_layers=output_all_encoded_layers,
673
+ subset_mask=subset_mask)
674
+
675
+ if masked_tokens_mask is None:
676
+ sequence_output = encoder_outputs[-1]
677
+ pooled_output = self.pooler(
678
+ sequence_output) if self.pooler is not None else None
679
+ else:
680
+ # TD [2022-03-01]: the indexing here is very tricky.
681
+ attention_mask_bool = attention_mask.bool()
682
+ subset_idx = subset_mask[attention_mask_bool] # type: ignore
683
+ sequence_output = encoder_outputs[-1][
684
+ masked_tokens_mask[attention_mask_bool][subset_idx]]
685
+ if self.pooler is not None:
686
+ pool_input = encoder_outputs[-1][
687
+ first_col_mask[attention_mask_bool][subset_idx]]
688
+ pooled_output = self.pooler(pool_input, pool=False)
689
+ else:
690
+ pooled_output = None
691
+
692
+ if not output_all_encoded_layers:
693
+ encoder_outputs = sequence_output
694
+
695
+ if self.pooler is not None:
696
+ return encoder_outputs, pooled_output
697
+
698
+ return encoder_outputs, None
699
+
700
+
701
+ ###################
702
+ # Bert Heads
703
+ ###################
704
+ class BertLMPredictionHead(nn.Module):
705
+
706
+ def __init__(self, config, bert_model_embedding_weights):
707
+ super().__init__()
708
+ self.transform = BertPredictionHeadTransform(config)
709
+ # The output weights are the same as the input embeddings, but there is
710
+ # an output-only bias for each token.
711
+ self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
712
+ bert_model_embedding_weights.size(0))
713
+ self.decoder.weight = bert_model_embedding_weights
714
+
715
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
716
+ hidden_states = self.transform(hidden_states)
717
+ hidden_states = self.decoder(hidden_states)
718
+ return hidden_states
719
+
720
+
721
+ class BertOnlyMLMHead(nn.Module):
722
+
723
+ def __init__(self, config, bert_model_embedding_weights):
724
+ super().__init__()
725
+ self.predictions = BertLMPredictionHead(config,
726
+ bert_model_embedding_weights)
727
+
728
+ def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
729
+ prediction_scores = self.predictions(sequence_output)
730
+ return prediction_scores
731
+
732
+
733
+ class BertOnlyNSPHead(nn.Module):
734
+
735
+ def __init__(self, config):
736
+ super().__init__()
737
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
738
+
739
+ def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
740
+ seq_relationship_score = self.seq_relationship(pooled_output)
741
+ return seq_relationship_score
742
+
743
+
744
+ #####################
745
+ # Various Bert models
746
+ #####################
747
+
748
+
749
+ class BertForPreTraining(BertPreTrainedModel):
750
+ #TBD: Coming in Future Commit
751
+ pass
752
+
753
+
754
+ class BertLMHeadModel(BertPreTrainedModel):
755
+ #TBD: Coming in Future Commit
756
+ pass
757
+
758
+
759
+ class BertForMaskedLM(BertPreTrainedModel):
760
+
761
+ def __init__(self, config):
762
+ super().__init__(config)
763
+
764
+ if config.is_decoder:
765
+ warnings.warn(
766
+ 'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for '
767
+ 'bi-directional self-attention.')
768
+
769
+ self.bert = BertModel(config, add_pooling_layer=False)
770
+ self.cls = BertOnlyMLMHead(config,
771
+ self.bert.embeddings.word_embeddings.weight)
772
+
773
+ # Initialize weights and apply final processing
774
+ self.post_init()
775
+
776
+ @classmethod
777
+ def from_composer(cls,
778
+ pretrained_checkpoint,
779
+ state_dict=None,
780
+ cache_dir=None,
781
+ from_tf=False,
782
+ config=None,
783
+ *inputs,
784
+ **kwargs):
785
+ """Load from pre-trained."""
786
+ model = cls(config, *inputs, **kwargs)
787
+ if from_tf:
788
+ raise ValueError(
789
+ 'Mosaic BERT does not support loading TensorFlow weights.')
790
+
791
+ state_dict = torch.load(pretrained_checkpoint)
792
+ # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
793
+ consume_prefix_in_state_dict_if_present(state_dict, prefix='model.')
794
+ missing_keys, unexpected_keys = model.load_state_dict(state_dict,
795
+ strict=False)
796
+
797
+ if len(missing_keys) > 0:
798
+ logger.warning(
799
+ f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}"
800
+ )
801
+ if len(unexpected_keys) > 0:
802
+ logger.warning(
803
+ f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}"
804
+ )
805
+
806
+ return model
807
+
808
+ def get_output_embeddings(self):
809
+ return self.cls.predictions.decoder
810
+
811
+ def set_output_embeddings(self, new_embeddings):
812
+ self.cls.predictions.decoder = new_embeddings
813
+
814
+ def forward(
815
+ self,
816
+ input_ids: Optional[torch.Tensor] = None,
817
+ attention_mask: Optional[torch.Tensor] = None,
818
+ token_type_ids: Optional[torch.Tensor] = None,
819
+ position_ids: Optional[torch.Tensor] = None,
820
+ head_mask: Optional[torch.Tensor] = None,
821
+ inputs_embeds: Optional[torch.Tensor] = None,
822
+ encoder_hidden_states: Optional[torch.Tensor] = None,
823
+ encoder_attention_mask: Optional[torch.Tensor] = None,
824
+ labels: Optional[torch.Tensor] = None,
825
+ output_attentions: Optional[bool] = None,
826
+ output_hidden_states: Optional[bool] = None,
827
+ return_dict: Optional[bool] = None,
828
+ ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
829
+ # labels should be a `torch.LongTensor` of shape
830
+ # `(batch_size, sequence_length)`. These are used for computing the
831
+ # masked language modeling loss.
832
+ #
833
+ # Indices should be in `[-100, 0, ..., config.vocab_size]` (see
834
+ # `input_ids` docstring) Tokens with indices set to `-100` are ignored
835
+ # (masked), the loss is only computed for the tokens with labels in `[0,
836
+ # ..., config.vocab_size]`
837
+ #
838
+ # Prediction scores are only computed for masked tokens and the (bs,
839
+ # seqlen) dimensions are flattened
840
+ if (input_ids is not None) == (inputs_embeds is not None):
841
+ raise ValueError('Must specify either input_ids or inputs_embeds!')
842
+
843
+ if labels is None:
844
+ masked_tokens_mask = None
845
+ else:
846
+ masked_tokens_mask = labels > 0
847
+
848
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
849
+
850
+ outputs = self.bert(
851
+ input_ids,
852
+ attention_mask=attention_mask,
853
+ token_type_ids=token_type_ids,
854
+ position_ids=position_ids,
855
+ head_mask=head_mask,
856
+ inputs_embeds=inputs_embeds,
857
+ encoder_hidden_states=encoder_hidden_states,
858
+ encoder_attention_mask=encoder_attention_mask,
859
+ output_attentions=output_attentions,
860
+ output_hidden_states=output_hidden_states,
861
+ return_dict=return_dict,
862
+ masked_tokens_mask=masked_tokens_mask,
863
+ )
864
+
865
+ sequence_output = outputs[0]
866
+ prediction_scores = self.cls(sequence_output)
867
+
868
+ loss = None
869
+ if labels is not None:
870
+ # Compute loss
871
+ loss_fct = nn.CrossEntropyLoss()
872
+ masked_token_idx = torch.nonzero(labels.flatten() > 0,
873
+ as_tuple=False).flatten()
874
+ loss = loss_fct(prediction_scores,
875
+ labels.flatten()[masked_token_idx])
876
+
877
+ assert input_ids is not None, 'Coding error; please open an issue'
878
+ batch, seqlen = input_ids.shape[:2]
879
+ prediction_scores = rearrange(index_put_first_axis(
880
+ prediction_scores, masked_token_idx, batch * seqlen),
881
+ '(b s) d -> b s d',
882
+ b=batch)
883
+
884
+ if not return_dict:
885
+ output = (prediction_scores,) + outputs[2:]
886
+ return ((loss,) + output) if loss is not None else output
887
+
888
+ return MaskedLMOutput(
889
+ loss=loss,
890
+ logits=prediction_scores,
891
+ hidden_states=None,
892
+ attentions=None,
893
+ )
894
+
895
+ def prepare_inputs_for_generation(self, input_ids: torch.Tensor,
896
+ attention_mask: torch.Tensor,
897
+ **model_kwargs):
898
+ input_shape = input_ids.shape
899
+ effective_batch_size = input_shape[0]
900
+
901
+ # add a dummy token
902
+ if self.config.pad_token_id is None:
903
+ raise ValueError('The PAD token should be defined for generation')
904
+
905
+ attention_mask = torch.cat([
906
+ attention_mask,
907
+ attention_mask.new_zeros((attention_mask.shape[0], 1))
908
+ ],
909
+ dim=-1)
910
+ dummy_token = torch.full((effective_batch_size, 1),
911
+ self.config.pad_token_id,
912
+ dtype=torch.long,
913
+ device=input_ids.device)
914
+ input_ids = torch.cat([input_ids, dummy_token], dim=1)
915
+
916
+ return {'input_ids': input_ids, 'attention_mask': attention_mask}
917
+
918
+
919
+ class BertForNextSentencePrediction(BertPreTrainedModel):
920
+ #TBD: Push in future commit
921
+ pass
922
+
923
+
924
+ class BertForSequenceClassification(BertPreTrainedModel):
925
+ """Bert Model transformer with a sequence classification/regression head.
926
+
927
+ This head is just a linear layer on top of the pooled output. Used for,
928
+ e.g., GLUE tasks.
929
+ """
930
+
931
+ def __init__(self, config):
932
+ super().__init__(config)
933
+ self.num_labels = config.num_labels
934
+ self.config = config
935
+
936
+ self.bert = BertModel(config)
937
+ classifier_dropout = (config.classifier_dropout
938
+ if config.classifier_dropout is not None else
939
+ config.hidden_dropout_prob)
940
+ self.dropout = nn.Dropout(classifier_dropout)
941
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
942
+
943
+ # Initialize weights and apply final processing
944
+ self.post_init()
945
+
946
+ @classmethod
947
+ def from_composer(cls,
948
+ pretrained_checkpoint,
949
+ state_dict=None,
950
+ cache_dir=None,
951
+ from_tf=False,
952
+ config=None,
953
+ *inputs,
954
+ **kwargs):
955
+ """Load from pre-trained."""
956
+ model = cls(config, *inputs, **kwargs)
957
+ if from_tf:
958
+ raise ValueError(
959
+ 'Mosaic BERT does not support loading TensorFlow weights.')
960
+
961
+ state_dict = torch.load(pretrained_checkpoint)
962
+ # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
963
+ consume_prefix_in_state_dict_if_present(state_dict, prefix='model.')
964
+ missing_keys, unexpected_keys = model.load_state_dict(state_dict,
965
+ strict=False)
966
+
967
+ if len(missing_keys) > 0:
968
+ logger.warning(
969
+ f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}"
970
+ )
971
+ if len(unexpected_keys) > 0:
972
+ logger.warning(
973
+ f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}"
974
+ )
975
+
976
+ return model
977
+
978
+ def forward(
979
+ self,
980
+ input_ids: Optional[torch.Tensor] = None,
981
+ attention_mask: Optional[torch.Tensor] = None,
982
+ token_type_ids: Optional[torch.Tensor] = None,
983
+ position_ids: Optional[torch.Tensor] = None,
984
+ head_mask: Optional[torch.Tensor] = None,
985
+ inputs_embeds: Optional[torch.Tensor] = None,
986
+ labels: Optional[torch.Tensor] = None,
987
+ output_attentions: Optional[bool] = None,
988
+ output_hidden_states: Optional[bool] = None,
989
+ return_dict: Optional[bool] = None,
990
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
991
+ # labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
992
+ # Labels for computing the sequence classification/regression loss.
993
+ # Indices should be in `[0, ..., config.num_labels - 1]`.
994
+ # If `config.num_labels == 1` a regression loss is computed
995
+ # (mean-square loss). If `config.num_labels > 1` a classification loss
996
+ # is computed (cross-entropy).
997
+
998
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
999
+
1000
+ outputs = self.bert(
1001
+ input_ids,
1002
+ attention_mask=attention_mask,
1003
+ token_type_ids=token_type_ids,
1004
+ position_ids=position_ids,
1005
+ head_mask=head_mask,
1006
+ inputs_embeds=inputs_embeds,
1007
+ output_attentions=output_attentions,
1008
+ output_hidden_states=output_hidden_states,
1009
+ return_dict=return_dict,
1010
+ )
1011
+
1012
+ pooled_output = outputs[1]
1013
+
1014
+ pooled_output = self.dropout(pooled_output)
1015
+ logits = self.classifier(pooled_output)
1016
+
1017
+ loss = None
1018
+ if labels is not None:
1019
+ # Compute loss
1020
+ if self.config.problem_type is None:
1021
+ if self.num_labels == 1:
1022
+ self.config.problem_type = 'regression'
1023
+ elif self.num_labels > 1 and (labels.dtype == torch.long or
1024
+ labels.dtype == torch.int):
1025
+ self.config.problem_type = 'single_label_classification'
1026
+ else:
1027
+ self.config.problem_type = 'multi_label_classification'
1028
+
1029
+ if self.config.problem_type == 'regression':
1030
+ loss_fct = nn.MSELoss()
1031
+ if self.num_labels == 1:
1032
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1033
+ else:
1034
+ loss = loss_fct(logits, labels)
1035
+ elif self.config.problem_type == 'single_label_classification':
1036
+ loss_fct = nn.CrossEntropyLoss()
1037
+ loss = loss_fct(logits.view(-1, self.num_labels),
1038
+ labels.view(-1))
1039
+ elif self.config.problem_type == 'multi_label_classification':
1040
+ loss_fct = nn.BCEWithLogitsLoss()
1041
+ loss = loss_fct(logits, labels)
1042
+
1043
+ if not return_dict:
1044
+ output = (logits,) + outputs[2:]
1045
+ return ((loss,) + output) if loss is not None else output
1046
+
1047
+ return SequenceClassifierOutput(
1048
+ loss=loss,
1049
+ logits=logits,
1050
+ hidden_states=None,
1051
+ attentions=None,
1052
+ )
1053
+
1054
+
1055
+ class BertForMultipleChoice(BertPreTrainedModel):
1056
+ #TBD: Push in future commit
1057
+ pass
1058
+
1059
+
1060
+ class BertForTokenClassification(BertPreTrainedModel):
1061
+ #TBD: Push in future commit
1062
+ pass
1063
+
1064
+
1065
+ class BertForQuestionAnswering(BertPreTrainedModel):
1066
+ """Bert Model with a span classification head.
1067
+
1068
+ This is used for extractive question-answering tasks like SQuAD (linear
1069
+ layers on top of the hidden states' output to compute `span start logits`
1070
+ and `span end logits`).
1071
+ """
1072
+ #TBD: Push in future commit
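
The masked-LM head above scores only the masked positions: `labels > 0` builds `masked_tokens_mask`, the encoder returns hidden states for just that subset, and the loss indexes the flattened labels with `masked_token_idx` before `index_put_first_axis` re-expands the logits to (batch, seqlen, vocab). A self-contained sketch of that indexing, using plain PyTorch and hypothetical shapes (it does not import the classes in this file):

import torch
import torch.nn as nn

batch, seqlen, vocab_size = 2, 4, 11
labels = torch.full((batch, seqlen), -100)   # -100 (or any non-positive value) marks an unmasked position
labels[0, 1] = 7                             # two masked positions in total
labels[1, 3] = 2

masked_tokens_mask = labels > 0              # same rule as BertForMaskedLM.forward
masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()

# The encoder only returns hidden states (and therefore logits) for the masked subset.
prediction_scores = torch.randn(masked_token_idx.numel(), vocab_size)

loss = nn.CrossEntropyLoss()(prediction_scores, labels.flatten()[masked_token_idx])
print(masked_token_idx)   # tensor([1, 7])
print(loss.item())
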
bert_padding.py ADDED
@@ -0,0 +1,159 @@
1
+ # Copyright 2022 MosaicML Examples authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Adapted from https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/bert_padding.py
5
+ # Which was adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py
6
+
7
+ """Helper functions for padding and unpadding batches.
8
+
9
+ These functions are used extensively throughout the Mosaic BERT implementation
10
+ in `bert_layers.py`.
11
+ """
12
+
13
+ from typing import Tuple, cast
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from einops import rearrange, repeat
18
+
19
+
20
+ class IndexFirstAxis(torch.autograd.Function):
21
+
22
+ @staticmethod
23
+ def forward(ctx, input: torch.Tensor,
24
+ indices: torch.Tensor) -> torch.Tensor:
25
+ """Get just the values of `input` which are at `indices`.
26
+
27
+ Arguments:
28
+ ctx: the autograd context object
29
+ input: (b, ...) 2+ dimensional tensor
30
+ indices: (num_idx) 1D tensor
31
+ """
32
+ ctx.save_for_backward(indices)
33
+ assert input.ndim >= 2
34
+ ctx.first_axis_dim, other_shape = input.shape[0], input.shape[
35
+ 1:] # type: ignore
36
+ second_dim = other_shape.numel(
37
+ ) # product of sizes of all but first dimension
38
+ # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
39
+ return torch.gather(
40
+ rearrange(input, 'b ... -> b (...)'), # (b, ...) -> (b, second_dim)
41
+ 0,
42
+ repeat(indices, 'z -> z d',
43
+ d=second_dim) # (indices,) -> (indices, second_dim)
44
+ ).reshape(-1, *other_shape) # (num_idx, ...)
45
+
46
+ @staticmethod
47
+ def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, None]:
48
+ indices, = ctx.saved_tensors
49
+ assert grad_output.ndim >= 2
50
+ other_shape = grad_output.shape[1:]
51
+ grad_output = rearrange(grad_output, 'b ... -> b (...)')
52
+ grad_input = torch.zeros([ctx.first_axis_dim, grad_output.shape[1]],
53
+ device=grad_output.device,
54
+ dtype=grad_output.dtype)
55
+ # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
56
+ # grad_input[indices] = grad_output
57
+ grad_input.scatter_(0,
58
+ repeat(indices, 'z -> z d', d=grad_output.shape[1]),
59
+ grad_output)
60
+ return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
61
+
62
+
63
+ index_first_axis = IndexFirstAxis.apply
64
+
65
+
66
+ class IndexPutFirstAxis(torch.autograd.Function):
67
+
68
+ @staticmethod
69
+ def forward(ctx, values: torch.Tensor, indices: torch.Tensor,
70
+ first_axis_dim) -> torch.Tensor:
71
+ ctx.save_for_backward(indices)
72
+ assert indices.ndim == 1
73
+ assert values.ndim >= 2
74
+ output = torch.zeros(first_axis_dim,
75
+ *values.shape[1:],
76
+ device=values.device,
77
+ dtype=values.dtype)
78
+ output[indices] = values
79
+ return output
80
+
81
+ @staticmethod
82
+ def backward(ctx,
83
+ grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
84
+ indices, = ctx.saved_tensors
85
+ grad_values = grad_output[indices]
86
+ return grad_values, None, None
87
+
88
+
89
+ index_put_first_axis = IndexPutFirstAxis.apply
90
+
91
+
92
+ def unpad_input(
93
+ hidden_states: torch.Tensor,
94
+ attention_mask: torch.Tensor,
95
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
96
+ """Remove padding from input sequences.
97
+
98
+ Arguments:
99
+ hidden_states: (batch, seqlen, ...)
100
+ attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
101
+
102
+ Returns:
103
+ hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
104
+ indices: (total_nnz)
105
+ cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
106
+ max_seqlen_in_batch: int, the length of the longest sequence in the batch.
107
+ """
108
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
109
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
110
+ max_seqlen_in_batch = int(seqlens_in_batch.max().item())
111
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32),
112
+ (1, 0))
113
+ # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
114
+ # bool mask, then call nonzero to get the indices, then index with those. The indices are dim
115
+ # times larger than they need to be, wasting memory. It's faster and more memory-efficient to
116
+ # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
117
+ # so we write custom forward and backward to make it a bit faster.
118
+ hidden_states = cast(
119
+ torch.Tensor,
120
+ index_first_axis(rearrange(hidden_states, 'b s ... -> (b s) ...'),
121
+ indices))
122
+ return hidden_states, indices, cu_seqlens, max_seqlen_in_batch
123
+
124
+
125
+ def unpad_input_only(
126
+ hidden_states: torch.Tensor,
127
+ attention_mask: torch.Tensor,
128
+ ) -> torch.Tensor:
129
+ """Like unpad_input, but only return the unpadded first tensor.
130
+
131
+ Save a small amount of overhead.
132
+
133
+ Arguments:
134
+ hidden_states: (batch, seqlen, ...)
135
+ attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
136
+
137
+ Returns:
138
+ hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
139
+ """
140
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
141
+ return index_first_axis(rearrange(hidden_states, 'b s ... -> (b s) ...'),
142
+ indices)
143
+
144
+
145
+ def pad_input(hidden_states: torch.Tensor, indices: torch.Tensor, batch: int,
146
+ seqlen: int) -> torch.Tensor:
147
+ """Add padding to sequences.
148
+
149
+ Arguments:
150
+ hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
151
+ indices: (total_nnz)
152
+ batch: int batch_size
153
+ seqlen: int max sequence length
154
+
155
+ Returns:
156
+ hidden_states: (batch, seqlen, ...)
157
+ """
158
+ output = index_put_first_axis(hidden_states, indices, batch * seqlen)
159
+ return rearrange(output, '(b s) ... -> b s ...', b=batch)
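
A round-trip sketch of the helpers above, assuming bert_padding.py is importable from the working directory and einops is installed: unpad_input drops the padded positions and pad_input restores the (batch, seqlen, ...) layout with zeros wherever the mask was 0.

import torch
from bert_padding import unpad_input, pad_input

batch, seqlen, hidden = 2, 4, 8
hidden_states = torch.randn(batch, seqlen, hidden)
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 0, 0]])   # second sequence has 2 valid tokens

unpadded, indices, cu_seqlens, max_seqlen = unpad_input(hidden_states, attention_mask)
print(unpadded.shape)   # torch.Size([6, 8]) -> total_nnz = 4 + 2
print(cu_seqlens)       # tensor([0, 4, 6], dtype=torch.int32)
print(max_seqlen)       # 4

repadded = pad_input(unpadded, indices, batch, seqlen)
assert torch.equal(repadded * attention_mask.unsqueeze(-1), repadded)
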
config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "_name_or_path": "hum-lodestone-v1",
3
+ "alibi_starting_size": 4096,
4
+ "architectures": [
5
+ "BertModel"
6
+ ],
7
+ "attention_probs_dropout_prob": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_bert.BertConfig",
10
+ "AutoModel": "bert_layers.BertModel",
11
+ "AutoModelForMaskedLM": "bert_layers.BertForMaskedLM"
12
+ },
13
+ "classifier_dropout": null,
14
+ "gradient_checkpointing": false,
15
+ "hidden_act": "gelu",
16
+ "hidden_dropout_prob": 0.1,
17
+ "hidden_size": 768,
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "layer_norm_eps": 1e-12,
21
+ "max_position_embeddings": 512,
22
+ "model_type": "bert",
23
+ "num_attention_heads": 12,
24
+ "num_hidden_layers": 12,
25
+ "pad_token_id": 0,
26
+ "position_embedding_type": "absolute",
27
+ "tokenizer_class": "BertTokenizerFast",
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.28.1",
30
+ "type_vocab_size": 2,
31
+ "use_cache": true,
32
+ "vocab_size": 30528
33
+ }
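
A loading sketch (not part of the commit), assuming the repository is checked out locally alongside its tokenizer files and that the extra dependencies (einops, plus triton for the flash-attention path) are installed; trust_remote_code=True lets the auto_map entries above resolve to configuration_bert.BertConfig and bert_layers.BertModel:

import torch
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained(".", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(".")
model.eval()

inputs = tokenizer("ALiBi lets this encoder attend beyond 512 tokens.", return_tensors="pt")
with torch.no_grad():
    sequence_output, pooled_output = model(**inputs)
print(sequence_output.shape)   # (1, sequence_length, 768)
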
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "2.2.2",
4
+ "transformers": "4.28.1",
5
+ "pytorch": "2.0.1+cu117"
6
+ }
7
+ }
configuration_bert.py ADDED
@@ -0,0 +1,25 @@
1
+ # Copyright 2022 MosaicML Examples authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from transformers import BertConfig as TransformersBertConfig
5
+
6
+
7
+ class BertConfig(TransformersBertConfig):
8
+
9
+ def __init__(
10
+ self,
11
+ alibi_starting_size: int = 512,
12
+ attention_probs_dropout_prob: float = 0.0,
13
+ **kwargs,
14
+ ):
15
+ """Configuration class for MosaicBert.
16
+
17
+ Args:
18
+ alibi_starting_size (int): Use `alibi_starting_size` to determine how large of an alibi tensor to
19
+ create when initializing the model. You should be able to ignore this parameter in most cases.
20
+ Defaults to 512.
21
+ attention_probs_dropout_prob (float): By default, turn off attention dropout in Mosaic BERT
22
+ (otherwise, Flash Attention will be off by default). Defaults to 0.0.
23
+ """
24
+ super().__init__(attention_probs_dropout_prob=attention_probs_dropout_prob, **kwargs)
25
+ self.alibi_starting_size = alibi_starting_size
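
A minimal sketch of the configuration class above, assuming configuration_bert.py is importable from the working directory; the values mirror those in config.json:

from configuration_bert import BertConfig

config = BertConfig(alibi_starting_size=4096, vocab_size=30528)
print(config.alibi_starting_size)            # 4096, size of the pre-built ALiBi tensor
print(config.attention_probs_dropout_prob)   # 0.0, keeps the flash-attention path enabled
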
data_records.json ADDED
@@ -0,0 +1 @@
1
+ {"AllNLI.json.gz": 277230, "CodeSearchNet.json.gz": 1375067, "NQ-train_pairs.json.gz": 100231, "PAQ_pairs.json.gz": 64371441, "S2ORC_citation_pairs.json.gz": 52603982, "S2ORC_citations_abstracts.json.gz": 252102397, "S2ORC_title_abstract.json.gz": 41769185, "SimpleWiki.json.gz": 102225, "TriviaQA_pairs.json.gz": 73346, "WikiAnswers_pairs.json.gz": 77427422, "agnews.json.gz": 1157745, "altlex.json.gz": 112696, "amazon-qa.json.gz": 2507114, "amazon_review_2018.json.gz": 87877725, "ccnews_title_text.json.gz": 614664, "cnn_dailymail.json.gz": 311971, "coco_captions.json.gz": 828395, "eli5_question_answer.json.gz": 325475, "fever_train.json.gz": 139051, "flickr30k_captions.json.gz": 317695, "gooaq_pairs.json.gz": 3012496, "msmarco-query_passage.json.gz": 532751, "msmarco-query_passage_negative.json.gz": 9144553, "npr.json.gz": 594384, "quora_duplicates.json.gz": 103663, "quora_duplicates_triplets.json.gz": 103663, "reddit-title-body/reddit_title_text_2010.json.gz": 431782, "reddit-title-body/reddit_title_text_2011.json.gz": 1673264, "reddit-title-body/reddit_title_text_2012.json.gz": 3727526, "reddit-title-body/reddit_title_text_2013.json.gz": 5713956, "reddit-title-body/reddit_title_text_2014.json.gz": 8538976, "reddit-title-body/reddit_title_text_2015.json.gz": 11064453, "reddit-title-body/reddit_title_text_2016.json.gz": 12224789, "reddit-title-body/reddit_title_text_2017.json.gz": 13558139, "reddit-title-body/reddit_title_text_2018.json.gz": 15552110, "reddit-title-body/reddit_title_text_2019.json.gz": 19224970, "reddit-title-body/reddit_title_text_2020.json.gz": 23030988, "reddit-title-body/reddit_title_text_2021.json.gz": 12704958, "reddit_2015.json.gz": 135108166, "reddit_2016.json.gz": 159164386, "reddit_2017.json.gz": 191485219, "reddit_2018.json.gz": 240726659, "searchQA_question_top5_snippets_merged.json.gz": 582261, "searchQA_question_topSnippet.json.gz": 117384, "sentence-compression.json.gz": 180000, "specter_train_triples.json.gz": 684100, "squad_pairs.json.gz": 87599, "stackexchange_duplicate_questions_body_body.json.gz": 250459, "stackexchange_duplicate_questions_title-body_title-body.json.gz": 250518, "stackexchange_duplicate_questions_title_title.json.gz": 304524, "stackexchange_title_best_voted_answer_jsonl/3dprinting.stackexchange.com.json.gz": 3488, "stackexchange_title_best_voted_answer_jsonl/academia.stackexchange.com.json.gz": 32137, "stackexchange_title_best_voted_answer_jsonl/ai.stackexchange.com.json.gz": 5763, "stackexchange_title_best_voted_answer_jsonl/android.stackexchange.com.json.gz": 38077, "stackexchange_title_best_voted_answer_jsonl/anime.stackexchange.com.json.gz": 10131, "stackexchange_title_best_voted_answer_jsonl/apple.stackexchange.com.json.gz": 92487, "stackexchange_title_best_voted_answer_jsonl/arduino.stackexchange.com.json.gz": 16281, "stackexchange_title_best_voted_answer_jsonl/askubuntu.com.json.gz": 267135, "stackexchange_title_best_voted_answer_jsonl/astronomy.stackexchange.com.json.gz": 9086, "stackexchange_title_best_voted_answer_jsonl/aviation.stackexchange.com.json.gz": 18755, "stackexchange_title_best_voted_answer_jsonl/avp.stackexchange.com.json.gz": 6450, "stackexchange_title_best_voted_answer_jsonl/beer.stackexchange.com.json.gz": 1012, "stackexchange_title_best_voted_answer_jsonl/bicycles.stackexchange.com.json.gz": 15708, "stackexchange_title_best_voted_answer_jsonl/bioinformatics.stackexchange.com.json.gz": 3135, "stackexchange_title_best_voted_answer_jsonl/biology.stackexchange.com.json.gz": 19277, 
"stackexchange_title_best_voted_answer_jsonl/bitcoin.stackexchange.com.json.gz": 22474, "stackexchange_title_best_voted_answer_jsonl/blender.stackexchange.com.json.gz": 54153, "stackexchange_title_best_voted_answer_jsonl/boardgames.stackexchange.com.json.gz": 11805, "stackexchange_title_best_voted_answer_jsonl/bricks.stackexchange.com.json.gz": 3530, "stackexchange_title_best_voted_answer_jsonl/buddhism.stackexchange.com.json.gz": 6787, "stackexchange_title_best_voted_answer_jsonl/cardano.stackexchange.com.json.gz": 248, "stackexchange_title_best_voted_answer_jsonl/chemistry.stackexchange.com.json.gz": 27061, "stackexchange_title_best_voted_answer_jsonl/chess.stackexchange.com.json.gz": 6392, "stackexchange_title_best_voted_answer_jsonl/chinese.stackexchange.com.json.gz": 8646, "stackexchange_title_best_voted_answer_jsonl/christianity.stackexchange.com.json.gz": 11498, "stackexchange_title_best_voted_answer_jsonl/civicrm.stackexchange.com.json.gz": 10648, "stackexchange_title_best_voted_answer_jsonl/codegolf.stackexchange.com.json.gz": 8211, "stackexchange_title_best_voted_answer_jsonl/codereview.stackexchange.com.json.gz": 41748, "stackexchange_title_best_voted_answer_jsonl/coffee.stackexchange.com.json.gz": 1188, "stackexchange_title_best_voted_answer_jsonl/cogsci.stackexchange.com.json.gz": 5101, "stackexchange_title_best_voted_answer_jsonl/computergraphics.stackexchange.com.json.gz": 2306, "stackexchange_title_best_voted_answer_jsonl/conlang.stackexchange.com.json.gz": 334, "stackexchange_title_best_voted_answer_jsonl/cooking.stackexchange.com.json.gz": 22641, "stackexchange_title_best_voted_answer_jsonl/craftcms.stackexchange.com.json.gz": 11236, "stackexchange_title_best_voted_answer_jsonl/crafts.stackexchange.com.json.gz": 1659, "stackexchange_title_best_voted_answer_jsonl/crypto.stackexchange.com.json.gz": 19404, "stackexchange_title_best_voted_answer_jsonl/cs.stackexchange.com.json.gz": 30010, "stackexchange_title_best_voted_answer_jsonl/cseducators.stackexchange.com.json.gz": 902, "stackexchange_title_best_voted_answer_jsonl/cstheory.stackexchange.com.json.gz": 7742, "stackexchange_title_best_voted_answer_jsonl/datascience.stackexchange.com.json.gz": 20503, "stackexchange_title_best_voted_answer_jsonl/dba.stackexchange.com.json.gz": 71449, "stackexchange_title_best_voted_answer_jsonl/devops.stackexchange.com.json.gz": 3462, "stackexchange_title_best_voted_answer_jsonl/diy.stackexchange.com.json.gz": 52896, "stackexchange_title_best_voted_answer_jsonl/drones.stackexchange.com.json.gz": 496, "stackexchange_title_best_voted_answer_jsonl/drupal.stackexchange.com.json.gz": 67817, "stackexchange_title_best_voted_answer_jsonl/dsp.stackexchange.com.json.gz": 17430, "stackexchange_title_best_voted_answer_jsonl/earthscience.stackexchange.com.json.gz": 4396, "stackexchange_title_best_voted_answer_jsonl/ebooks.stackexchange.com.json.gz": 1107, "stackexchange_title_best_voted_answer_jsonl/economics.stackexchange.com.json.gz": 8844, "stackexchange_title_best_voted_answer_jsonl/electronics.stackexchange.com.json.gz": 129494, "stackexchange_title_best_voted_answer_jsonl/ell.stackexchange.com.json.gz": 77892, "stackexchange_title_best_voted_answer_jsonl/emacs.stackexchange.com.json.gz": 16830, "stackexchange_title_best_voted_answer_jsonl/engineering.stackexchange.com.json.gz": 8649, "stackexchange_title_best_voted_answer_jsonl/english.stackexchange.com.json.gz": 100640, "stackexchange_title_best_voted_answer_jsonl/eosio.stackexchange.com.json.gz": 1940, 
"stackexchange_title_best_voted_answer_jsonl/esperanto.stackexchange.com.json.gz": 1466, "stackexchange_title_best_voted_answer_jsonl/ethereum.stackexchange.com.json.gz": 26124, "stackexchange_title_best_voted_answer_jsonl/expatriates.stackexchange.com.json.gz": 4913, "stackexchange_title_best_voted_answer_jsonl/expressionengine.stackexchange.com.json.gz": 10742, "stackexchange_title_best_voted_answer_jsonl/fitness.stackexchange.com.json.gz": 8297, "stackexchange_title_best_voted_answer_jsonl/freelancing.stackexchange.com.json.gz": 1663, "stackexchange_title_best_voted_answer_jsonl/french.stackexchange.com.json.gz": 10578, "stackexchange_title_best_voted_answer_jsonl/gamedev.stackexchange.com.json.gz": 40154, "stackexchange_title_best_voted_answer_jsonl/gaming.stackexchange.com.json.gz": 82887, "stackexchange_title_best_voted_answer_jsonl/gardening.stackexchange.com.json.gz": 13246, "stackexchange_title_best_voted_answer_jsonl/genealogy.stackexchange.com.json.gz": 2895, "stackexchange_title_best_voted_answer_jsonl/german.stackexchange.com.json.gz": 13733, "stackexchange_title_best_voted_answer_jsonl/gis.stackexchange.com.json.gz": 100254, "stackexchange_title_best_voted_answer_jsonl/graphicdesign.stackexchange.com.json.gz": 28083, "stackexchange_title_best_voted_answer_jsonl/ham.stackexchange.com.json.gz": 3501, "stackexchange_title_best_voted_answer_jsonl/hardwarerecs.stackexchange.com.json.gz": 2050, "stackexchange_title_best_voted_answer_jsonl/health.stackexchange.com.json.gz": 4494, "stackexchange_title_best_voted_answer_jsonl/hermeneutics.stackexchange.com.json.gz": 9516, "stackexchange_title_best_voted_answer_jsonl/hinduism.stackexchange.com.json.gz": 8999, "stackexchange_title_best_voted_answer_jsonl/history.stackexchange.com.json.gz": 10766, "stackexchange_title_best_voted_answer_jsonl/homebrew.stackexchange.com.json.gz": 5608, "stackexchange_title_best_voted_answer_jsonl/hsm.stackexchange.com.json.gz": 2517, "stackexchange_title_best_voted_answer_jsonl/interpersonal.stackexchange.com.json.gz": 3398, "stackexchange_title_best_voted_answer_jsonl/iot.stackexchange.com.json.gz": 1359, "stackexchange_title_best_voted_answer_jsonl/iota.stackexchange.com.json.gz": 775, "stackexchange_title_best_voted_answer_jsonl/islam.stackexchange.com.json.gz": 10052, "stackexchange_title_best_voted_answer_jsonl/italian.stackexchange.com.json.gz": 3101, "stackexchange_title_best_voted_answer_jsonl/ja.stackoverflow.com.json.gz": 17376, "stackexchange_title_best_voted_answer_jsonl/japanese.stackexchange.com.json.gz": 20948, "stackexchange_title_best_voted_answer_jsonl/joomla.stackexchange.com.json.gz": 5887, "stackexchange_title_best_voted_answer_jsonl/judaism.stackexchange.com.json.gz": 26085, "stackexchange_title_best_voted_answer_jsonl/korean.stackexchange.com.json.gz": 1406, "stackexchange_title_best_voted_answer_jsonl/languagelearning.stackexchange.com.json.gz": 948, "stackexchange_title_best_voted_answer_jsonl/latin.stackexchange.com.json.gz": 3969, "stackexchange_title_best_voted_answer_jsonl/law.stackexchange.com.json.gz": 16133, "stackexchange_title_best_voted_answer_jsonl/lifehacks.stackexchange.com.json.gz": 2576, "stackexchange_title_best_voted_answer_jsonl/linguistics.stackexchange.com.json.gz": 6843, "stackexchange_title_best_voted_answer_jsonl/literature.stackexchange.com.json.gz": 3539, "stackexchange_title_best_voted_answer_jsonl/magento.stackexchange.com.json.gz": 79241, "stackexchange_title_best_voted_answer_jsonl/martialarts.stackexchange.com.json.gz": 1737, 
"stackexchange_title_best_voted_answer_jsonl/materials.stackexchange.com.json.gz": 1101, "stackexchange_title_best_voted_answer_jsonl/matheducators.stackexchange.com.json.gz": 2706, "stackexchange_title_best_voted_answer_jsonl/mathematica.stackexchange.com.json.gz": 59895, "stackexchange_title_best_voted_answer_jsonl/mathoverflow.net.json.gz": 85289, "stackexchange_title_best_voted_answer_jsonl/mechanics.stackexchange.com.json.gz": 18613, "stackexchange_title_best_voted_answer_jsonl/meta.askubuntu.com.json.gz": 4268, "stackexchange_title_best_voted_answer_jsonl/meta.mathoverflow.net.json.gz": 1000, "stackexchange_title_best_voted_answer_jsonl/meta.serverfault.com.json.gz": 1726, "stackexchange_title_best_voted_answer_jsonl/meta.stackexchange.com.json.gz": 60744, "stackexchange_title_best_voted_answer_jsonl/meta.stackoverflow.com.json.gz": 24044, "stackexchange_title_best_voted_answer_jsonl/meta.superuser.com.json.gz": 3629, "stackexchange_title_best_voted_answer_jsonl/moderators.stackexchange.com.json.gz": 504, "stackexchange_title_best_voted_answer_jsonl/money.stackexchange.com.json.gz": 29404, "stackexchange_title_best_voted_answer_jsonl/movies.stackexchange.com.json.gz": 18243, "stackexchange_title_best_voted_answer_jsonl/music.stackexchange.com.json.gz": 19936, "stackexchange_title_best_voted_answer_jsonl/musicfans.stackexchange.com.json.gz": 2431, "stackexchange_title_best_voted_answer_jsonl/mythology.stackexchange.com.json.gz": 1595, "stackexchange_title_best_voted_answer_jsonl/networkengineering.stackexchange.com.json.gz": 12590, "stackexchange_title_best_voted_answer_jsonl/opendata.stackexchange.com.json.gz": 3842, "stackexchange_title_best_voted_answer_jsonl/opensource.stackexchange.com.json.gz": 3221, "stackexchange_title_best_voted_answer_jsonl/or.stackexchange.com.json.gz": 1490, "stackexchange_title_best_voted_answer_jsonl/outdoors.stackexchange.com.json.gz": 5278, "stackexchange_title_best_voted_answer_jsonl/parenting.stackexchange.com.json.gz": 5998, "stackexchange_title_best_voted_answer_jsonl/patents.stackexchange.com.json.gz": 3573, "stackexchange_title_best_voted_answer_jsonl/pets.stackexchange.com.json.gz": 6156, "stackexchange_title_best_voted_answer_jsonl/philosophy.stackexchange.com.json.gz": 13114, "stackexchange_title_best_voted_answer_jsonl/photo.stackexchange.com.json.gz": 23204, "stackexchange_title_best_voted_answer_jsonl/physics.stackexchange.com.json.gz": 141230, "stackexchange_title_best_voted_answer_jsonl/pm.stackexchange.com.json.gz": 5435, "stackexchange_title_best_voted_answer_jsonl/poker.stackexchange.com.json.gz": 1665, "stackexchange_title_best_voted_answer_jsonl/politics.stackexchange.com.json.gz": 11047, "stackexchange_title_best_voted_answer_jsonl/portuguese.stackexchange.com.json.gz": 1964, "stackexchange_title_best_voted_answer_jsonl/pt.stackoverflow.com.json.gz": 103277, "stackexchange_title_best_voted_answer_jsonl/puzzling.stackexchange.com.json.gz": 17448, "stackexchange_title_best_voted_answer_jsonl/quant.stackexchange.com.json.gz": 12933, "stackexchange_title_best_voted_answer_jsonl/quantumcomputing.stackexchange.com.json.gz": 4320, "stackexchange_title_best_voted_answer_jsonl/raspberrypi.stackexchange.com.json.gz": 24143, "stackexchange_title_best_voted_answer_jsonl/retrocomputing.stackexchange.com.json.gz": 3907, "stackexchange_title_best_voted_answer_jsonl/reverseengineering.stackexchange.com.json.gz": 5817, "stackexchange_title_best_voted_answer_jsonl/robotics.stackexchange.com.json.gz": 4648, 
"stackexchange_title_best_voted_answer_jsonl/rpg.stackexchange.com.json.gz": 40435, "stackexchange_title_best_voted_answer_jsonl/ru.stackoverflow.com.json.gz": 253289, "stackexchange_title_best_voted_answer_jsonl/rus.stackexchange.com.json.gz": 16528, "stackexchange_title_best_voted_answer_jsonl/russian.stackexchange.com.json.gz": 3937, "stackexchange_title_best_voted_answer_jsonl/salesforce.stackexchange.com.json.gz": 87272, "stackexchange_title_best_voted_answer_jsonl/scicomp.stackexchange.com.json.gz": 7036, "stackexchange_title_best_voted_answer_jsonl/scifi.stackexchange.com.json.gz": 54805, "stackexchange_title_best_voted_answer_jsonl/serverfault.com.json.gz": 238507, "stackexchange_title_best_voted_answer_jsonl/sharepoint.stackexchange.com.json.gz": 80420, "stackexchange_title_best_voted_answer_jsonl/sitecore.stackexchange.com.json.gz": 7838, "stackexchange_title_best_voted_answer_jsonl/skeptics.stackexchange.com.json.gz": 8145, "stackexchange_title_best_voted_answer_jsonl/softwareengineering.stackexchange.com.json.gz": 51326, "stackexchange_title_best_voted_answer_jsonl/softwarerecs.stackexchange.com.json.gz": 11761, "stackexchange_title_best_voted_answer_jsonl/sound.stackexchange.com.json.gz": 8303, "stackexchange_title_best_voted_answer_jsonl/space.stackexchange.com.json.gz": 12893, "stackexchange_title_best_voted_answer_jsonl/spanish.stackexchange.com.json.gz": 7675, "stackexchange_title_best_voted_answer_jsonl/sports.stackexchange.com.json.gz": 4707, "stackexchange_title_best_voted_answer_jsonl/sqa.stackexchange.com.json.gz": 9256, "stackexchange_title_best_voted_answer_jsonl/stackapps.com.json.gz": 1518, "stackexchange_title_best_voted_answer_jsonl/stats.stackexchange.com.json.gz": 115679, "stackexchange_title_best_voted_answer_jsonl/stellar.stackexchange.com.json.gz": 1078, "stackexchange_title_best_voted_answer_jsonl/superuser.com.json.gz": 352610, "stackexchange_title_best_voted_answer_jsonl/sustainability.stackexchange.com.json.gz": 1674, "stackexchange_title_best_voted_answer_jsonl/tex.stackexchange.com.json.gz": 171628, "stackexchange_title_best_voted_answer_jsonl/tezos.stackexchange.com.json.gz": 1169, "stackexchange_title_best_voted_answer_jsonl/tor.stackexchange.com.json.gz": 4167, "stackexchange_title_best_voted_answer_jsonl/travel.stackexchange.com.json.gz": 36533, "stackexchange_title_best_voted_answer_jsonl/tridion.stackexchange.com.json.gz": 5907, "stackexchange_title_best_voted_answer_jsonl/ukrainian.stackexchange.com.json.gz": 1767, "stackexchange_title_best_voted_answer_jsonl/unix.stackexchange.com.json.gz": 155414, "stackexchange_title_best_voted_answer_jsonl/ux.stackexchange.com.json.gz": 28901, "stackexchange_title_best_voted_answer_jsonl/vegetarianism.stackexchange.com.json.gz": 585, "stackexchange_title_best_voted_answer_jsonl/vi.stackexchange.com.json.gz": 9000, "stackexchange_title_best_voted_answer_jsonl/webapps.stackexchange.com.json.gz": 24867, "stackexchange_title_best_voted_answer_jsonl/webmasters.stackexchange.com.json.gz": 30370, "stackexchange_title_best_voted_answer_jsonl/windowsphone.stackexchange.com.json.gz": 2807, "stackexchange_title_best_voted_answer_jsonl/woodworking.stackexchange.com.json.gz": 2955, "stackexchange_title_best_voted_answer_jsonl/wordpress.stackexchange.com.json.gz": 83621, "stackexchange_title_best_voted_answer_jsonl/workplace.stackexchange.com.json.gz": 24012, "stackexchange_title_best_voted_answer_jsonl/worldbuilding.stackexchange.com.json.gz": 26210, 
"stackexchange_title_best_voted_answer_jsonl/writers.stackexchange.com.json.gz": 9867, "stackexchange_title_body_jsonl/academia.stackexchange.com.json.gz": 34331, "stackexchange_title_body_jsonl/android.stackexchange.com.json.gz": 51608, "stackexchange_title_body_jsonl/anime.stackexchange.com.json.gz": 11444, "stackexchange_title_body_jsonl/apple.stackexchange.com.json.gz": 110622, "stackexchange_title_body_jsonl/arduino.stackexchange.com.json.gz": 19553, "stackexchange_title_body_jsonl/askubuntu.com.json.gz": 347925, "stackexchange_title_body_jsonl/astronomy.stackexchange.com.json.gz": 10462, "stackexchange_title_body_jsonl/aviation.stackexchange.com.json.gz": 20139, "stackexchange_title_body_jsonl/bicycles.stackexchange.com.json.gz": 16353, "stackexchange_title_body_jsonl/biology.stackexchange.com.json.gz": 24447, "stackexchange_title_body_jsonl/bitcoin.stackexchange.com.json.gz": 25374, "stackexchange_title_body_jsonl/blender.stackexchange.com.json.gz": 80766, "stackexchange_title_body_jsonl/boardgames.stackexchange.com.json.gz": 12149, "stackexchange_title_body_jsonl/chemistry.stackexchange.com.json.gz": 34506, "stackexchange_title_body_jsonl/christianity.stackexchange.com.json.gz": 12108, "stackexchange_title_body_jsonl/civicrm.stackexchange.com.json.gz": 12543, "stackexchange_title_body_jsonl/codereview.stackexchange.com.json.gz": 45765, "stackexchange_title_body_jsonl/cooking.stackexchange.com.json.gz": 23705, "stackexchange_title_body_jsonl/craftcms.stackexchange.com.json.gz": 12574, "stackexchange_title_body_jsonl/crypto.stackexchange.com.json.gz": 23231, "stackexchange_title_body_jsonl/cs.stackexchange.com.json.gz": 38314, "stackexchange_title_body_jsonl/cstheory.stackexchange.com.json.gz": 10642, "stackexchange_title_body_jsonl/datascience.stackexchange.com.json.gz": 27397, "stackexchange_title_body_jsonl/dba.stackexchange.com.json.gz": 81871, "stackexchange_title_body_jsonl/diy.stackexchange.com.json.gz": 60083, "stackexchange_title_body_jsonl/drupal.stackexchange.com.json.gz": 79717, "stackexchange_title_body_jsonl/dsp.stackexchange.com.json.gz": 21252, "stackexchange_title_body_jsonl/economics.stackexchange.com.json.gz": 11115, "stackexchange_title_body_jsonl/electronics.stackexchange.com.json.gz": 143582, "stackexchange_title_body_jsonl/ell.stackexchange.com.json.gz": 83271, "stackexchange_title_body_jsonl/emacs.stackexchange.com.json.gz": 21055, "stackexchange_title_body_jsonl/engineering.stackexchange.com.json.gz": 10753, "stackexchange_title_body_jsonl/english.stackexchange.com.json.gz": 109522, "stackexchange_title_body_jsonl/ethereum.stackexchange.com.json.gz": 32760, "stackexchange_title_body_jsonl/expressionengine.stackexchange.com.json.gz": 11866, "stackexchange_title_body_jsonl/french.stackexchange.com.json.gz": 10794, "stackexchange_title_body_jsonl/gamedev.stackexchange.com.json.gz": 46485, "stackexchange_title_body_jsonl/gaming.stackexchange.com.json.gz": 88912, "stackexchange_title_body_jsonl/gardening.stackexchange.com.json.gz": 15136, "stackexchange_title_body_jsonl/german.stackexchange.com.json.gz": 13950, "stackexchange_title_body_jsonl/gis.stackexchange.com.json.gz": 131000, "stackexchange_title_body_jsonl/graphicdesign.stackexchange.com.json.gz": 30233, "stackexchange_title_body_jsonl/hinduism.stackexchange.com.json.gz": 13450, "stackexchange_title_body_jsonl/history.stackexchange.com.json.gz": 12021, "stackexchange_title_body_jsonl/islam.stackexchange.com.json.gz": 11853, "stackexchange_title_body_jsonl/japanese.stackexchange.com.json.gz": 22056, 
"stackexchange_title_body_jsonl/judaism.stackexchange.com.json.gz": 32028, "stackexchange_title_body_jsonl/law.stackexchange.com.json.gz": 17941, "stackexchange_title_body_jsonl/magento.stackexchange.com.json.gz": 99991, "stackexchange_title_body_jsonl/math.stackexchange.com.json.gz": 1338443, "stackexchange_title_body_jsonl/mathematica.stackexchange.com.json.gz": 73131, "stackexchange_title_body_jsonl/mathoverflow.net.json.gz": 120851, "stackexchange_title_body_jsonl/mechanics.stackexchange.com.json.gz": 22868, "stackexchange_title_body_jsonl/meta.stackexchange.com.json.gz": 83510, "stackexchange_title_body_jsonl/meta.stackoverflow.com.json.gz": 36456, "stackexchange_title_body_jsonl/money.stackexchange.com.json.gz": 32021, "stackexchange_title_body_jsonl/movies.stackexchange.com.json.gz": 20181, "stackexchange_title_body_jsonl/music.stackexchange.com.json.gz": 20636, "stackexchange_title_body_jsonl/networkengineering.stackexchange.com.json.gz": 13454, "stackexchange_title_body_jsonl/philosophy.stackexchange.com.json.gz": 14829, "stackexchange_title_body_jsonl/photo.stackexchange.com.json.gz": 23753, "stackexchange_title_body_jsonl/physics.stackexchange.com.json.gz": 173307, "stackexchange_title_body_jsonl/politics.stackexchange.com.json.gz": 11894, "stackexchange_title_body_jsonl/puzzling.stackexchange.com.json.gz": 17851, "stackexchange_title_body_jsonl/quant.stackexchange.com.json.gz": 17261, "stackexchange_title_body_jsonl/raspberrypi.stackexchange.com.json.gz": 30625, "stackexchange_title_body_jsonl/rpg.stackexchange.com.json.gz": 42303, "stackexchange_title_body_jsonl/rus.stackexchange.com.json.gz": 16871, "stackexchange_title_body_jsonl/salesforce.stackexchange.com.json.gz": 105260, "stackexchange_title_body_jsonl/scifi.stackexchange.com.json.gz": 61528, "stackexchange_title_body_jsonl/sharepoint.stackexchange.com.json.gz": 94011, "stackexchange_title_body_jsonl/skeptics.stackexchange.com.json.gz": 10009, "stackexchange_title_body_jsonl/small_stackexchanges.json.gz": 448146, "stackexchange_title_body_jsonl/softwareengineering.stackexchange.com.json.gz": 53942, "stackexchange_title_body_jsonl/softwarerecs.stackexchange.com.json.gz": 20142, "stackexchange_title_body_jsonl/space.stackexchange.com.json.gz": 15142, "stackexchange_title_body_jsonl/stackoverflow.com-Posts.json.gz": 18562443, "stackexchange_title_body_jsonl/stats.stackexchange.com.json.gz": 173466, "stackexchange_title_body_jsonl/superuser.com.json.gz": 435463, "stackexchange_title_body_jsonl/tex.stackexchange.com.json.gz": 202954, "stackexchange_title_body_jsonl/travel.stackexchange.com.json.gz": 41227, "stackexchange_title_body_jsonl/unix.stackexchange.com.json.gz": 185997, "stackexchange_title_body_jsonl/ux.stackexchange.com.json.gz": 29403, "stackexchange_title_body_jsonl/vi.stackexchange.com.json.gz": 10551, "stackexchange_title_body_jsonl/webapps.stackexchange.com.json.gz": 29697, "stackexchange_title_body_jsonl/webmasters.stackexchange.com.json.gz": 34559, "stackexchange_title_body_jsonl/wordpress.stackexchange.com.json.gz": 100474, "stackexchange_title_body_jsonl/workplace.stackexchange.com.json.gz": 24189, "stackexchange_title_body_jsonl/worldbuilding.stackexchange.com.json.gz": 26763, "stackexchange_title_body_jsonl/writers.stackexchange.com.json.gz": 10157, "stackexchange_title_body_small.json.gz": 364000, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/3dprinting.stackexchange.com.json.gz": 109, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/academia.stackexchange.com.json.gz": 2465, 
"stackexchange_titlebody_best_and_down_voted_answer_jsonl/ai.stackexchange.com.json.gz": 130, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/android.stackexchange.com.json.gz": 2830, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/anime.stackexchange.com.json.gz": 802, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/apple.stackexchange.com.json.gz": 6696, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/arduino.stackexchange.com.json.gz": 595, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/askubuntu.com.json.gz": 9975, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/astronomy.stackexchange.com.json.gz": 371, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/aviation.stackexchange.com.json.gz": 903, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/avp.stackexchange.com.json.gz": 152, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/beer.stackexchange.com.json.gz": 57, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/bicycles.stackexchange.com.json.gz": 984, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/bioinformatics.stackexchange.com.json.gz": 39, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/biology.stackexchange.com.json.gz": 832, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/bitcoin.stackexchange.com.json.gz": 1068, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/blender.stackexchange.com.json.gz": 1312, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/boardgames.stackexchange.com.json.gz": 691, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/bricks.stackexchange.com.json.gz": 79, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/buddhism.stackexchange.com.json.gz": 770, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/cardano.stackexchange.com.json.gz": 7, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/chemistry.stackexchange.com.json.gz": 1523, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/chess.stackexchange.com.json.gz": 402, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/chinese.stackexchange.com.json.gz": 611, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/christianity.stackexchange.com.json.gz": 1502, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/civicrm.stackexchange.com.json.gz": 85, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/codegolf.stackexchange.com.json.gz": 333, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/codereview.stackexchange.com.json.gz": 666, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/coffee.stackexchange.com.json.gz": 47, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/cogsci.stackexchange.com.json.gz": 221, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/computergraphics.stackexchange.com.json.gz": 30, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/conlang.stackexchange.com.json.gz": 8, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/cooking.stackexchange.com.json.gz": 2064, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/craftcms.stackexchange.com.json.gz": 26, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/crafts.stackexchange.com.json.gz": 72, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/crypto.stackexchange.com.json.gz": 595, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/cs.stackexchange.com.json.gz": 936, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/cseducators.stackexchange.com.json.gz": 
67, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/cstheory.stackexchange.com.json.gz": 314, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/datascience.stackexchange.com.json.gz": 325, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/dba.stackexchange.com.json.gz": 2502, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/devops.stackexchange.com.json.gz": 53, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/diy.stackexchange.com.json.gz": 2037, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/drones.stackexchange.com.json.gz": 6, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/drupal.stackexchange.com.json.gz": 1714, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/dsp.stackexchange.com.json.gz": 387, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/earthscience.stackexchange.com.json.gz": 229, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/ebooks.stackexchange.com.json.gz": 54, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/economics.stackexchange.com.json.gz": 441, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/electronics.stackexchange.com.json.gz": 4014, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/elementaryos.stackexchange.com.json.gz": 224, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/ell.stackexchange.com.json.gz": 4438, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/emacs.stackexchange.com.json.gz": 188, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/engineering.stackexchange.com.json.gz": 227, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/english.stackexchange.com.json.gz": 13003, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/eosio.stackexchange.com.json.gz": 44, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/esperanto.stackexchange.com.json.gz": 56, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/ethereum.stackexchange.com.json.gz": 479, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/expatriates.stackexchange.com.json.gz": 132, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/expressionengine.stackexchange.com.json.gz": 91, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/fitness.stackexchange.com.json.gz": 567, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/freelancing.stackexchange.com.json.gz": 70, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/french.stackexchange.com.json.gz": 632, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/gamedev.stackexchange.com.json.gz": 1598, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/gaming.stackexchange.com.json.gz": 7321, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/gardening.stackexchange.com.json.gz": 210, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/genealogy.stackexchange.com.json.gz": 86, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/german.stackexchange.com.json.gz": 1047, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/gis.stackexchange.com.json.gz": 1843, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/graphicdesign.stackexchange.com.json.gz": 1565, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/ham.stackexchange.com.json.gz": 158, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/hardwarerecs.stackexchange.com.json.gz": 58, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/health.stackexchange.com.json.gz": 299, 
"stackexchange_titlebody_best_and_down_voted_answer_jsonl/hermeneutics.stackexchange.com.json.gz": 1719, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/hinduism.stackexchange.com.json.gz": 343, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/history.stackexchange.com.json.gz": 1099, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/homebrew.stackexchange.com.json.gz": 176, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/hsm.stackexchange.com.json.gz": 70, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/interpersonal.stackexchange.com.json.gz": 469, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/iot.stackexchange.com.json.gz": 10, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/iota.stackexchange.com.json.gz": 31, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/islam.stackexchange.com.json.gz": 2037, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/italian.stackexchange.com.json.gz": 181, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/ja.stackoverflow.com.json.gz": 328, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/japanese.stackexchange.com.json.gz": 1124, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/joomla.stackexchange.com.json.gz": 124, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/judaism.stackexchange.com.json.gz": 2216, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/korean.stackexchange.com.json.gz": 28, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/languagelearning.stackexchange.com.json.gz": 42, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/latin.stackexchange.com.json.gz": 55, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/law.stackexchange.com.json.gz": 1297, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/lifehacks.stackexchange.com.json.gz": 316, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/linguistics.stackexchange.com.json.gz": 442, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/literature.stackexchange.com.json.gz": 191, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/magento.stackexchange.com.json.gz": 1849, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/martialarts.stackexchange.com.json.gz": 254, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/materials.stackexchange.com.json.gz": 1, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/matheducators.stackexchange.com.json.gz": 177, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/mathematica.stackexchange.com.json.gz": 262, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/mathoverflow.net.json.gz": 1109, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/mechanics.stackexchange.com.json.gz": 842, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/meta.askubuntu.com.json.gz": 252, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/meta.mathoverflow.net.json.gz": 61, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/meta.serverfault.com.json.gz": 114, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/meta.stackexchange.com.json.gz": 2517, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/meta.stackoverflow.com.json.gz": 2678, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/meta.superuser.com.json.gz": 145, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/moderators.stackexchange.com.json.gz": 23, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/monero.stackexchange.com.json.gz": 26, 
"stackexchange_titlebody_best_and_down_voted_answer_jsonl/money.stackexchange.com.json.gz": 1905, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/movies.stackexchange.com.json.gz": 1577, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/music.stackexchange.com.json.gz": 1228, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/musicfans.stackexchange.com.json.gz": 78, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/mythology.stackexchange.com.json.gz": 103, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/networkengineering.stackexchange.com.json.gz": 476, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/opendata.stackexchange.com.json.gz": 45, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/opensource.stackexchange.com.json.gz": 123, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/or.stackexchange.com.json.gz": 13, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/outdoors.stackexchange.com.json.gz": 221, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/parenting.stackexchange.com.json.gz": 624, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/patents.stackexchange.com.json.gz": 137, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/pets.stackexchange.com.json.gz": 322, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/philosophy.stackexchange.com.json.gz": 1184, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/photo.stackexchange.com.json.gz": 1432, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/physics.stackexchange.com.json.gz": 8362, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/pm.stackexchange.com.json.gz": 241, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/poker.stackexchange.com.json.gz": 115, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/politics.stackexchange.com.json.gz": 1468, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/portuguese.stackexchange.com.json.gz": 144, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/pt.stackoverflow.com.json.gz": 3718, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/puzzling.stackexchange.com.json.gz": 784, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/quant.stackexchange.com.json.gz": 340, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/quantumcomputing.stackexchange.com.json.gz": 46, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/raspberrypi.stackexchange.com.json.gz": 1011, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/retrocomputing.stackexchange.com.json.gz": 135, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/reverseengineering.stackexchange.com.json.gz": 97, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/robotics.stackexchange.com.json.gz": 110, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/rpg.stackexchange.com.json.gz": 4212, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/ru.stackoverflow.com.json.gz": 6305, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/rus.stackexchange.com.json.gz": 514, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/russian.stackexchange.com.json.gz": 353, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/salesforce.stackexchange.com.json.gz": 1781, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/scicomp.stackexchange.com.json.gz": 127, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/scifi.stackexchange.com.json.gz": 5176, 
"stackexchange_titlebody_best_and_down_voted_answer_jsonl/security.stackexchange.com.json.gz": 3069, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/serverfault.com.json.gz": 7969, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/sharepoint.stackexchange.com.json.gz": 1691, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/sitecore.stackexchange.com.json.gz": 122, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/skeptics.stackexchange.com.json.gz": 670, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/softwareengineering.stackexchange.com.json.gz": 4238, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/softwarerecs.stackexchange.com.json.gz": 348, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/sound.stackexchange.com.json.gz": 365, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/space.stackexchange.com.json.gz": 405, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/spanish.stackexchange.com.json.gz": 366, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/sports.stackexchange.com.json.gz": 455, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/sqa.stackexchange.com.json.gz": 353, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/stackapps.com.json.gz": 15, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/stats.stackexchange.com.json.gz": 2238, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/stellar.stackexchange.com.json.gz": 3, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/superuser.com.json.gz": 17425, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/sustainability.stackexchange.com.json.gz": 152, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/tex.stackexchange.com.json.gz": 1095, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/tezos.stackexchange.com.json.gz": 11, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/tor.stackexchange.com.json.gz": 137, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/travel.stackexchange.com.json.gz": 1317, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/tridion.stackexchange.com.json.gz": 68, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/ukrainian.stackexchange.com.json.gz": 87, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/unix.stackexchange.com.json.gz": 6173, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/ux.stackexchange.com.json.gz": 1107, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/vegetarianism.stackexchange.com.json.gz": 35, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/vi.stackexchange.com.json.gz": 95, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/webapps.stackexchange.com.json.gz": 1906, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/webmasters.stackexchange.com.json.gz": 854, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/windowsphone.stackexchange.com.json.gz": 153, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/woodworking.stackexchange.com.json.gz": 93, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/wordpress.stackexchange.com.json.gz": 3046, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/workplace.stackexchange.com.json.gz": 4317, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/worldbuilding.stackexchange.com.json.gz": 2087, "stackexchange_titlebody_best_and_down_voted_answer_jsonl/writers.stackexchange.com.json.gz": 407, "stackexchange_titlebody_best_voted_answer_jsonl/3dprinting.stackexchange.com.json.gz": 
3488, "stackexchange_titlebody_best_voted_answer_jsonl/academia.stackexchange.com.json.gz": 32137, "stackexchange_titlebody_best_voted_answer_jsonl/ai.stackexchange.com.json.gz": 5763, "stackexchange_titlebody_best_voted_answer_jsonl/android.stackexchange.com.json.gz": 38077, "stackexchange_titlebody_best_voted_answer_jsonl/anime.stackexchange.com.json.gz": 10131, "stackexchange_titlebody_best_voted_answer_jsonl/apple.stackexchange.com.json.gz": 92487, "stackexchange_titlebody_best_voted_answer_jsonl/arduino.stackexchange.com.json.gz": 16281, "stackexchange_titlebody_best_voted_answer_jsonl/askubuntu.com.json.gz": 267135, "stackexchange_titlebody_best_voted_answer_jsonl/astronomy.stackexchange.com.json.gz": 9086, "stackexchange_titlebody_best_voted_answer_jsonl/aviation.stackexchange.com.json.gz": 18755, "stackexchange_titlebody_best_voted_answer_jsonl/avp.stackexchange.com.json.gz": 6450, "stackexchange_titlebody_best_voted_answer_jsonl/beer.stackexchange.com.json.gz": 1012, "stackexchange_titlebody_best_voted_answer_jsonl/bicycles.stackexchange.com.json.gz": 15708, "stackexchange_titlebody_best_voted_answer_jsonl/bioinformatics.stackexchange.com.json.gz": 3135, "stackexchange_titlebody_best_voted_answer_jsonl/biology.stackexchange.com.json.gz": 19277, "stackexchange_titlebody_best_voted_answer_jsonl/bitcoin.stackexchange.com.json.gz": 22474, "stackexchange_titlebody_best_voted_answer_jsonl/blender.stackexchange.com.json.gz": 54153, "stackexchange_titlebody_best_voted_answer_jsonl/boardgames.stackexchange.com.json.gz": 11805, "stackexchange_titlebody_best_voted_answer_jsonl/bricks.stackexchange.com.json.gz": 3530, "stackexchange_titlebody_best_voted_answer_jsonl/buddhism.stackexchange.com.json.gz": 6787, "stackexchange_titlebody_best_voted_answer_jsonl/cardano.stackexchange.com.json.gz": 248, "stackexchange_titlebody_best_voted_answer_jsonl/chemistry.stackexchange.com.json.gz": 27061, "stackexchange_titlebody_best_voted_answer_jsonl/chess.stackexchange.com.json.gz": 6392, "stackexchange_titlebody_best_voted_answer_jsonl/chinese.stackexchange.com.json.gz": 8646, "stackexchange_titlebody_best_voted_answer_jsonl/christianity.stackexchange.com.json.gz": 11498, "stackexchange_titlebody_best_voted_answer_jsonl/civicrm.stackexchange.com.json.gz": 10648, "stackexchange_titlebody_best_voted_answer_jsonl/codegolf.stackexchange.com.json.gz": 8211, "stackexchange_titlebody_best_voted_answer_jsonl/codereview.stackexchange.com.json.gz": 41748, "stackexchange_titlebody_best_voted_answer_jsonl/coffee.stackexchange.com.json.gz": 1188, "stackexchange_titlebody_best_voted_answer_jsonl/cogsci.stackexchange.com.json.gz": 5101, "stackexchange_titlebody_best_voted_answer_jsonl/computergraphics.stackexchange.com.json.gz": 2306, "stackexchange_titlebody_best_voted_answer_jsonl/conlang.stackexchange.com.json.gz": 334, "stackexchange_titlebody_best_voted_answer_jsonl/cooking.stackexchange.com.json.gz": 22641, "stackexchange_titlebody_best_voted_answer_jsonl/craftcms.stackexchange.com.json.gz": 11236, "stackexchange_titlebody_best_voted_answer_jsonl/crafts.stackexchange.com.json.gz": 1659, "stackexchange_titlebody_best_voted_answer_jsonl/crypto.stackexchange.com.json.gz": 19404, "stackexchange_titlebody_best_voted_answer_jsonl/cs.stackexchange.com.json.gz": 30010, "stackexchange_titlebody_best_voted_answer_jsonl/cseducators.stackexchange.com.json.gz": 902, "stackexchange_titlebody_best_voted_answer_jsonl/cstheory.stackexchange.com.json.gz": 7742, 
"stackexchange_titlebody_best_voted_answer_jsonl/datascience.stackexchange.com.json.gz": 20503, "stackexchange_titlebody_best_voted_answer_jsonl/dba.stackexchange.com.json.gz": 71449, "stackexchange_titlebody_best_voted_answer_jsonl/devops.stackexchange.com.json.gz": 3462, "stackexchange_titlebody_best_voted_answer_jsonl/diy.stackexchange.com.json.gz": 52896, "stackexchange_titlebody_best_voted_answer_jsonl/drones.stackexchange.com.json.gz": 496, "stackexchange_titlebody_best_voted_answer_jsonl/drupal.stackexchange.com.json.gz": 67817, "stackexchange_titlebody_best_voted_answer_jsonl/dsp.stackexchange.com.json.gz": 17430, "stackexchange_titlebody_best_voted_answer_jsonl/earthscience.stackexchange.com.json.gz": 4396, "stackexchange_titlebody_best_voted_answer_jsonl/ebooks.stackexchange.com.json.gz": 1107, "stackexchange_titlebody_best_voted_answer_jsonl/economics.stackexchange.com.json.gz": 8844, "stackexchange_titlebody_best_voted_answer_jsonl/electronics.stackexchange.com.json.gz": 129494, "stackexchange_titlebody_best_voted_answer_jsonl/elementaryos.stackexchange.com.json.gz": 5917, "stackexchange_titlebody_best_voted_answer_jsonl/ell.stackexchange.com.json.gz": 77892, "stackexchange_titlebody_best_voted_answer_jsonl/emacs.stackexchange.com.json.gz": 16830, "stackexchange_titlebody_best_voted_answer_jsonl/engineering.stackexchange.com.json.gz": 8649, "stackexchange_titlebody_best_voted_answer_jsonl/english.stackexchange.com.json.gz": 100640, "stackexchange_titlebody_best_voted_answer_jsonl/eosio.stackexchange.com.json.gz": 1940, "stackexchange_titlebody_best_voted_answer_jsonl/esperanto.stackexchange.com.json.gz": 1466, "stackexchange_titlebody_best_voted_answer_jsonl/ethereum.stackexchange.com.json.gz": 26124, "stackexchange_titlebody_best_voted_answer_jsonl/expatriates.stackexchange.com.json.gz": 4913, "stackexchange_titlebody_best_voted_answer_jsonl/expressionengine.stackexchange.com.json.gz": 10742, "stackexchange_titlebody_best_voted_answer_jsonl/fitness.stackexchange.com.json.gz": 8297, "stackexchange_titlebody_best_voted_answer_jsonl/freelancing.stackexchange.com.json.gz": 1663, "stackexchange_titlebody_best_voted_answer_jsonl/french.stackexchange.com.json.gz": 10578, "stackexchange_titlebody_best_voted_answer_jsonl/gamedev.stackexchange.com.json.gz": 40154, "stackexchange_titlebody_best_voted_answer_jsonl/gaming.stackexchange.com.json.gz": 82887, "stackexchange_titlebody_best_voted_answer_jsonl/gardening.stackexchange.com.json.gz": 13246, "stackexchange_titlebody_best_voted_answer_jsonl/genealogy.stackexchange.com.json.gz": 2895, "stackexchange_titlebody_best_voted_answer_jsonl/german.stackexchange.com.json.gz": 13733, "stackexchange_titlebody_best_voted_answer_jsonl/gis.stackexchange.com.json.gz": 100254, "stackexchange_titlebody_best_voted_answer_jsonl/graphicdesign.stackexchange.com.json.gz": 28083, "stackexchange_titlebody_best_voted_answer_jsonl/ham.stackexchange.com.json.gz": 3501, "stackexchange_titlebody_best_voted_answer_jsonl/hardwarerecs.stackexchange.com.json.gz": 2050, "stackexchange_titlebody_best_voted_answer_jsonl/health.stackexchange.com.json.gz": 4494, "stackexchange_titlebody_best_voted_answer_jsonl/hermeneutics.stackexchange.com.json.gz": 9516, "stackexchange_titlebody_best_voted_answer_jsonl/hinduism.stackexchange.com.json.gz": 8999, "stackexchange_titlebody_best_voted_answer_jsonl/history.stackexchange.com.json.gz": 10766, "stackexchange_titlebody_best_voted_answer_jsonl/homebrew.stackexchange.com.json.gz": 5608, 
"stackexchange_titlebody_best_voted_answer_jsonl/hsm.stackexchange.com.json.gz": 2517, "stackexchange_titlebody_best_voted_answer_jsonl/interpersonal.stackexchange.com.json.gz": 3398, "stackexchange_titlebody_best_voted_answer_jsonl/iot.stackexchange.com.json.gz": 1359, "stackexchange_titlebody_best_voted_answer_jsonl/iota.stackexchange.com.json.gz": 775, "stackexchange_titlebody_best_voted_answer_jsonl/islam.stackexchange.com.json.gz": 10052, "stackexchange_titlebody_best_voted_answer_jsonl/italian.stackexchange.com.json.gz": 3101, "stackexchange_titlebody_best_voted_answer_jsonl/ja.stackoverflow.com.json.gz": 17376, "stackexchange_titlebody_best_voted_answer_jsonl/japanese.stackexchange.com.json.gz": 20948, "stackexchange_titlebody_best_voted_answer_jsonl/joomla.stackexchange.com.json.gz": 5887, "stackexchange_titlebody_best_voted_answer_jsonl/judaism.stackexchange.com.json.gz": 26085, "stackexchange_titlebody_best_voted_answer_jsonl/korean.stackexchange.com.json.gz": 1406, "stackexchange_titlebody_best_voted_answer_jsonl/languagelearning.stackexchange.com.json.gz": 948, "stackexchange_titlebody_best_voted_answer_jsonl/latin.stackexchange.com.json.gz": 3969, "stackexchange_titlebody_best_voted_answer_jsonl/law.stackexchange.com.json.gz": 16133, "stackexchange_titlebody_best_voted_answer_jsonl/lifehacks.stackexchange.com.json.gz": 2576, "stackexchange_titlebody_best_voted_answer_jsonl/linguistics.stackexchange.com.json.gz": 6843, "stackexchange_titlebody_best_voted_answer_jsonl/literature.stackexchange.com.json.gz": 3539, "stackexchange_titlebody_best_voted_answer_jsonl/magento.stackexchange.com.json.gz": 79241, "stackexchange_titlebody_best_voted_answer_jsonl/martialarts.stackexchange.com.json.gz": 1737, "stackexchange_titlebody_best_voted_answer_jsonl/materials.stackexchange.com.json.gz": 1101, "stackexchange_titlebody_best_voted_answer_jsonl/matheducators.stackexchange.com.json.gz": 2706, "stackexchange_titlebody_best_voted_answer_jsonl/mathematica.stackexchange.com.json.gz": 59895, "stackexchange_titlebody_best_voted_answer_jsonl/mathoverflow.net.json.gz": 85289, "stackexchange_titlebody_best_voted_answer_jsonl/mechanics.stackexchange.com.json.gz": 18613, "stackexchange_titlebody_best_voted_answer_jsonl/meta.askubuntu.com.json.gz": 4268, "stackexchange_titlebody_best_voted_answer_jsonl/meta.mathoverflow.net.json.gz": 1000, "stackexchange_titlebody_best_voted_answer_jsonl/meta.serverfault.com.json.gz": 1726, "stackexchange_titlebody_best_voted_answer_jsonl/meta.stackexchange.com.json.gz": 60744, "stackexchange_titlebody_best_voted_answer_jsonl/meta.stackoverflow.com.json.gz": 24044, "stackexchange_titlebody_best_voted_answer_jsonl/meta.superuser.com.json.gz": 3629, "stackexchange_titlebody_best_voted_answer_jsonl/moderators.stackexchange.com.json.gz": 504, "stackexchange_titlebody_best_voted_answer_jsonl/money.stackexchange.com.json.gz": 29404, "stackexchange_titlebody_best_voted_answer_jsonl/movies.stackexchange.com.json.gz": 18243, "stackexchange_titlebody_best_voted_answer_jsonl/music.stackexchange.com.json.gz": 19936, "stackexchange_titlebody_best_voted_answer_jsonl/musicfans.stackexchange.com.json.gz": 2431, "stackexchange_titlebody_best_voted_answer_jsonl/mythology.stackexchange.com.json.gz": 1595, "stackexchange_titlebody_best_voted_answer_jsonl/networkengineering.stackexchange.com.json.gz": 12590, "stackexchange_titlebody_best_voted_answer_jsonl/opendata.stackexchange.com.json.gz": 3842, "stackexchange_titlebody_best_voted_answer_jsonl/opensource.stackexchange.com.json.gz": 
3221, "stackexchange_titlebody_best_voted_answer_jsonl/or.stackexchange.com.json.gz": 1490, "stackexchange_titlebody_best_voted_answer_jsonl/outdoors.stackexchange.com.json.gz": 5278, "stackexchange_titlebody_best_voted_answer_jsonl/parenting.stackexchange.com.json.gz": 5998, "stackexchange_titlebody_best_voted_answer_jsonl/patents.stackexchange.com.json.gz": 3573, "stackexchange_titlebody_best_voted_answer_jsonl/pets.stackexchange.com.json.gz": 6156, "stackexchange_titlebody_best_voted_answer_jsonl/philosophy.stackexchange.com.json.gz": 13114, "stackexchange_titlebody_best_voted_answer_jsonl/photo.stackexchange.com.json.gz": 23204, "stackexchange_titlebody_best_voted_answer_jsonl/physics.stackexchange.com.json.gz": 141230, "stackexchange_titlebody_best_voted_answer_jsonl/pm.stackexchange.com.json.gz": 5435, "stackexchange_titlebody_best_voted_answer_jsonl/poker.stackexchange.com.json.gz": 1665, "stackexchange_titlebody_best_voted_answer_jsonl/politics.stackexchange.com.json.gz": 11047, "stackexchange_titlebody_best_voted_answer_jsonl/portuguese.stackexchange.com.json.gz": 1964, "stackexchange_titlebody_best_voted_answer_jsonl/pt.stackoverflow.com.json.gz": 103277, "stackexchange_titlebody_best_voted_answer_jsonl/puzzling.stackexchange.com.json.gz": 17448, "stackexchange_titlebody_best_voted_answer_jsonl/quant.stackexchange.com.json.gz": 12933, "stackexchange_titlebody_best_voted_answer_jsonl/quantumcomputing.stackexchange.com.json.gz": 4320, "stackexchange_titlebody_best_voted_answer_jsonl/raspberrypi.stackexchange.com.json.gz": 24143, "stackexchange_titlebody_best_voted_answer_jsonl/retrocomputing.stackexchange.com.json.gz": 3907, "stackexchange_titlebody_best_voted_answer_jsonl/reverseengineering.stackexchange.com.json.gz": 5817, "stackexchange_titlebody_best_voted_answer_jsonl/robotics.stackexchange.com.json.gz": 4648, "stackexchange_titlebody_best_voted_answer_jsonl/rpg.stackexchange.com.json.gz": 40435, "stackexchange_titlebody_best_voted_answer_jsonl/ru.stackoverflow.com.json.gz": 253289, "stackexchange_titlebody_best_voted_answer_jsonl/rus.stackexchange.com.json.gz": 16528, "stackexchange_titlebody_best_voted_answer_jsonl/russian.stackexchange.com.json.gz": 3937, "stackexchange_titlebody_best_voted_answer_jsonl/salesforce.stackexchange.com.json.gz": 87272, "stackexchange_titlebody_best_voted_answer_jsonl/scicomp.stackexchange.com.json.gz": 7036, "stackexchange_titlebody_best_voted_answer_jsonl/scifi.stackexchange.com.json.gz": 54805, "stackexchange_titlebody_best_voted_answer_jsonl/sharepoint.stackexchange.com.json.gz": 80420, "stackexchange_titlebody_best_voted_answer_jsonl/sitecore.stackexchange.com.json.gz": 7838, "stackexchange_titlebody_best_voted_answer_jsonl/skeptics.stackexchange.com.json.gz": 8145, "stackexchange_titlebody_best_voted_answer_jsonl/softwareengineering.stackexchange.com.json.gz": 51326, "stackexchange_titlebody_best_voted_answer_jsonl/softwarerecs.stackexchange.com.json.gz": 11761, "stackexchange_titlebody_best_voted_answer_jsonl/sound.stackexchange.com.json.gz": 8303, "stackexchange_titlebody_best_voted_answer_jsonl/space.stackexchange.com.json.gz": 12893, "stackexchange_titlebody_best_voted_answer_jsonl/spanish.stackexchange.com.json.gz": 7675, "stackexchange_titlebody_best_voted_answer_jsonl/sports.stackexchange.com.json.gz": 4707, "stackexchange_titlebody_best_voted_answer_jsonl/sqa.stackexchange.com.json.gz": 9256, "stackexchange_titlebody_best_voted_answer_jsonl/stackapps.com.json.gz": 1518, 
"stackexchange_titlebody_best_voted_answer_jsonl/stats.stackexchange.com.json.gz": 115679, "stackexchange_titlebody_best_voted_answer_jsonl/stellar.stackexchange.com.json.gz": 1078, "stackexchange_titlebody_best_voted_answer_jsonl/superuser.com.json.gz": 352610, "stackexchange_titlebody_best_voted_answer_jsonl/sustainability.stackexchange.com.json.gz": 1674, "stackexchange_titlebody_best_voted_answer_jsonl/tex.stackexchange.com.json.gz": 171628, "stackexchange_titlebody_best_voted_answer_jsonl/tezos.stackexchange.com.json.gz": 1169, "stackexchange_titlebody_best_voted_answer_jsonl/tor.stackexchange.com.json.gz": 4167, "stackexchange_titlebody_best_voted_answer_jsonl/travel.stackexchange.com.json.gz": 36533, "stackexchange_titlebody_best_voted_answer_jsonl/tridion.stackexchange.com.json.gz": 5907, "stackexchange_titlebody_best_voted_answer_jsonl/ukrainian.stackexchange.com.json.gz": 1767, "stackexchange_titlebody_best_voted_answer_jsonl/unix.stackexchange.com.json.gz": 155414, "stackexchange_titlebody_best_voted_answer_jsonl/ux.stackexchange.com.json.gz": 28901, "stackexchange_titlebody_best_voted_answer_jsonl/vegetarianism.stackexchange.com.json.gz": 585, "stackexchange_titlebody_best_voted_answer_jsonl/vi.stackexchange.com.json.gz": 9000, "stackexchange_titlebody_best_voted_answer_jsonl/webapps.stackexchange.com.json.gz": 24867, "stackexchange_titlebody_best_voted_answer_jsonl/webmasters.stackexchange.com.json.gz": 30370, "stackexchange_titlebody_best_voted_answer_jsonl/windowsphone.stackexchange.com.json.gz": 2807, "stackexchange_titlebody_best_voted_answer_jsonl/woodworking.stackexchange.com.json.gz": 2955, "stackexchange_titlebody_best_voted_answer_jsonl/wordpress.stackexchange.com.json.gz": 83621, "stackexchange_titlebody_best_voted_answer_jsonl/workplace.stackexchange.com.json.gz": 24012, "stackexchange_titlebody_best_voted_answer_jsonl/worldbuilding.stackexchange.com.json.gz": 26210, "stackexchange_titlebody_best_voted_answer_jsonl/writers.stackexchange.com.json.gz": 9867, "wikihow.json.gz": 128542, "xsum.json.gz": 226711, "yahoo_answers_question_answer.json.gz": 681164, "yahoo_answers_title_answer.json.gz": 1198260, "yahoo_answers_title_question.json.gz": 659896}
flash_attn_triton.py ADDED
@@ -0,0 +1,1112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 MosaicML Examples authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Triton implementation of Flash Attention.
5
+
6
+ # Copyright (c) 2022, Tri Dao.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ *Experimental* implementation of FlashAttention in Triton.
21
+ We use the FlashAttention implementation from Phil Tillet a starting point.
22
+ https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
23
+
24
+ Changes:
25
+ - Implement both causal and non-causal attention.
26
+ - Implement both self-attention and cross-attention.
27
+ - Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
28
+ - Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
29
+ - Support attention bias.
30
+ - Speed up the forward pass a bit, and only store the LSE instead of m and l.
31
+ - Make the backward for d=128 much faster by reducing register spilling.
32
+ - Optionally parallelize the backward pass across seqlen_k, to deal with the case of
33
+ small batch size * nheads.
34
+
35
+ Caution:
36
+ - If you plan to use headdim other than 64 and 128, you should test for race conditions
37
+ (due to the Triton compiler), as done in tests/test_flash_attn.py
38
+ "test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
39
+ for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
40
+ that there are none left for other head dimensions.
41
+ Differences between this Triton version and the CUDA version:
42
+ - Triton version doesn't support dropout.
43
+ - Triton forward is generally faster than CUDA forward.
44
+ - Triton backward is faster than CUDA backward when batch * nheads is small, and when headdim=64.
45
+ It is slightly slower when headdim=128 and batch * nheads is large.
46
+ - Triton version doesn't yet support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
47
+ """
48
+
49
+ import math
50
+
51
+ import torch
52
+ import triton # type: ignore (reportMissingImports)
53
+ import triton.language as tl # type: ignore (reportMissingImports)
54
+ from einops import repeat
55
+
56
+
57
+ @triton.autotune(
58
+ configs=[
59
+ triton.Config({
60
+ 'BLOCK_M': 128,
61
+ 'BLOCK_N': 128
62
+ },
63
+ num_warps=8,
64
+ num_stages=1),
65
+ # This config has a race condition when EVEN_M == False, disabling it for now.
66
+ # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=1),
67
+ ],
68
+ key=[
69
+ 'CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL',
70
+ 'BLOCK_HEADDIM'
71
+ ])
72
+ @triton.heuristics({
73
+ 'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0,
74
+ 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0,
75
+ 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM'],
76
+ })
77
+ @triton.jit
78
+ def _fwd_kernel(
79
+ Q,
80
+ K,
81
+ V,
82
+ Bias,
83
+ Out,
84
+ Lse,
85
+ TMP, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug
86
+ softmax_scale,
87
+ stride_qb,
88
+ stride_qh,
89
+ stride_qm,
90
+ stride_kb,
91
+ stride_kh,
92
+ stride_kn,
93
+ stride_vb,
94
+ stride_vh,
95
+ stride_vn,
96
+ stride_bb,
97
+ stride_bh,
98
+ stride_bm,
99
+ stride_ob,
100
+ stride_oh,
101
+ stride_om,
102
+ nheads,
103
+ seqlen_q,
104
+ seqlen_k,
105
+ seqlen_q_rounded,
106
+ headdim,
107
+ CACHE_KEY_SEQLEN_Q,
108
+ CACHE_KEY_SEQLEN_K,
109
+ BIAS_TYPE: tl.constexpr,
110
+ IS_CAUSAL: tl.constexpr,
111
+ BLOCK_HEADDIM: tl.constexpr,
112
+ EVEN_M: tl.constexpr,
113
+ EVEN_N: tl.constexpr,
114
+ EVEN_HEADDIM: tl.constexpr,
115
+ BLOCK_M: tl.constexpr,
116
+ BLOCK_N: tl.constexpr,
117
+ ):
118
+ start_m = tl.program_id(0)
119
+ off_hb = tl.program_id(1)
120
+ off_b = off_hb // nheads
121
+ off_h = off_hb % nheads
122
+ # off_b = tl.program_id(1)
123
+ # off_h = tl.program_id(2)
124
+ # off_hb = off_b * nheads + off_h
125
+ # initialize offsets
126
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
127
+ offs_n = tl.arange(0, BLOCK_N)
128
+ offs_d = tl.arange(0, BLOCK_HEADDIM)
129
+ # Initialize pointers to Q, K, V
130
+ # Adding parenthesis around indexing might use int32 math instead of int64 math?
131
+ # https://github.com/openai/triton/issues/741
132
+ # I'm seeing a tiny bit of difference (5-7us)
133
+ q_ptrs = Q + off_b * stride_qb + off_h * stride_qh + (
134
+ offs_m[:, None] * stride_qm + offs_d[None, :])
135
+ k_ptrs = K + off_b * stride_kb + off_h * stride_kh + (
136
+ offs_n[:, None] * stride_kn + offs_d[None, :])
137
+ v_ptrs = V + off_b * stride_vb + off_h * stride_vh + (
138
+ offs_n[:, None] * stride_vn + offs_d[None, :])
139
+ if BIAS_TYPE == 'vector':
140
+ b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
141
+ elif BIAS_TYPE == 'matrix':
142
+ b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + (
143
+ offs_m[:, None] * stride_bm + offs_n[None, :])
144
+ else:
145
+ raise ValueError("BIAS_TYPE must be one of {'vector', 'matrix'}")
146
+ # initialize pointer to m and l
147
+ t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
148
+ lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
149
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
150
+ acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
151
+ # load q: it will stay in SRAM throughout
152
+ # [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call
153
+ # tl.load(q_ptrs), we get the wrong output!
154
+ if EVEN_M & EVEN_N:
155
+ if EVEN_HEADDIM:
156
+ q = tl.load(q_ptrs)
157
+ else:
158
+ q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
159
+ else:
160
+ if EVEN_HEADDIM:
161
+ q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
162
+ else:
163
+ q = tl.load(q_ptrs,
164
+ mask=(offs_m[:, None] < seqlen_q) &
165
+ (offs_d[None, :] < headdim),
166
+ other=0.0)
167
+ # loop over k, v and update accumulator
168
+ end_n = seqlen_k if not IS_CAUSAL else tl.minimum(
169
+ (start_m + 1) * BLOCK_M, seqlen_k)
170
+ for start_n in range(0, end_n, BLOCK_N):
171
+ start_n = tl.multiple_of(start_n, BLOCK_N)
172
+ # -- compute qk ----
173
+ if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
174
+ if EVEN_HEADDIM:
175
+ k = tl.load(k_ptrs + start_n * stride_kn)
176
+ else:
177
+ k = tl.load(k_ptrs + start_n * stride_kn,
178
+ mask=offs_d[None, :] < headdim,
179
+ other=0.0)
180
+ else:
181
+ if EVEN_HEADDIM:
182
+ k = tl.load(k_ptrs + start_n * stride_kn,
183
+ mask=(start_n + offs_n)[:, None] < seqlen_k,
184
+ other=0.0)
185
+ else:
186
+ k = tl.load(k_ptrs + start_n * stride_kn,
187
+ mask=((start_n + offs_n)[:, None] < seqlen_k) &
188
+ (offs_d[None, :] < headdim),
189
+ other=0.0)
190
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
191
+ qk += tl.dot(q, k, trans_b=True)
192
+ # Trying to combine the two masks seems to make the result wrong
193
+ if not EVEN_N: # Need to mask out otherwise the softmax is wrong
194
+ qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0,
195
+ float('-inf'))
196
+ if IS_CAUSAL:
197
+ qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0,
198
+ float('-inf'))
199
+ if BIAS_TYPE != 'none':
200
+ if BIAS_TYPE == 'vector':
201
+ if EVEN_N:
202
+ bias = tl.load(b_ptrs + start_n).to(tl.float32)
203
+ else:
204
+ bias = tl.load(b_ptrs + start_n,
205
+ mask=(start_n + offs_n) < seqlen_k,
206
+ other=0.0).to(tl.float32)
207
+ bias = bias[None, :]
208
+ elif BIAS_TYPE == 'matrix':
209
+ if EVEN_M & EVEN_N:
210
+ bias = tl.load(b_ptrs + start_n).to(tl.float32)
211
+ else:
212
+ bias = tl.load(b_ptrs + start_n,
213
+ mask=(offs_m[:, None] < seqlen_q) &
214
+ ((start_n + offs_n)[None, :] < seqlen_k),
215
+ other=0.0).to(tl.float32)
216
+ else:
217
+ raise ValueError(
218
+ "BIAS_TYPE must be one of {'vector', 'matrix'}")
219
+ # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
220
+ # can then fuse the mult and add into an fma instruction. But if we have bias we need to
221
+ # multiply with softmax_scale here.
222
+ qk = qk * softmax_scale + bias
223
+ m_ij = tl.maximum(tl.max(qk, 1), lse_i)
224
+ p = tl.exp(qk - m_ij[:, None])
225
+ else:
226
+ m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
227
+ p = tl.exp(qk * softmax_scale - m_ij[:, None])
228
+ l_ij = tl.sum(p, 1)
229
+
230
+ # scale acc_o
231
+ acc_o_scale = tl.exp(m_i - m_ij)
232
+
233
+ # # -- update output accumulator --
234
+ # BUG: have to store and immediately load
235
+ tl.store(t_ptrs, acc_o_scale)
236
+ acc_o_scale = tl.load(t_ptrs)
237
+ acc_o = acc_o * acc_o_scale[:, None]
238
+ # update acc_o
239
+ if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
240
+ if EVEN_HEADDIM:
241
+ v = tl.load(v_ptrs + start_n * stride_vn)
242
+ else:
243
+ v = tl.load(v_ptrs + start_n * stride_vn,
244
+ mask=offs_d[None, :] < headdim,
245
+ other=0.0)
246
+ else:
247
+ if EVEN_HEADDIM:
248
+ v = tl.load(v_ptrs + start_n * stride_vn,
249
+ mask=(start_n + offs_n)[:, None] < seqlen_k,
250
+ other=0.0)
251
+ else:
252
+ v = tl.load(v_ptrs + start_n * stride_vn,
253
+ mask=((start_n + offs_n)[:, None] < seqlen_k) &
254
+ (offs_d[None, :] < headdim),
255
+ other=0.0)
256
+ p = p.to(v.dtype)
257
+ acc_o += tl.dot(p, v)
258
+
259
+ # -- update statistics
260
+ m_i = m_ij
261
+ l_i_new = tl.exp(lse_i - m_ij) + l_ij
262
+ lse_i = m_ij + tl.log(l_i_new)
263
+
264
+ o_scale = tl.exp(m_i - lse_i)
265
+ # BUG: have to store and immediately load
266
+ tl.store(t_ptrs, o_scale)
267
+ o_scale = tl.load(t_ptrs)
268
+ acc_o = acc_o * o_scale[:, None]
269
+ # rematerialize offsets to save registers
270
+ start_m = tl.program_id(0)
271
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
272
+ # write back l and m
273
+ lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
274
+ tl.store(lse_ptrs, lse_i)
275
+ # initialize pointers to output
276
+ offs_n = tl.arange(0, BLOCK_HEADDIM)
277
+ out_ptrs = Out + off_b * stride_ob + off_h * stride_oh + (
278
+ offs_m[:, None] * stride_om + offs_n[None, :])
279
+ if EVEN_M:
280
+ if EVEN_HEADDIM:
281
+ tl.store(out_ptrs, acc_o)
282
+ else:
283
+ tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
284
+ else:
285
+ if EVEN_HEADDIM:
286
+ tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
287
+ else:
288
+ tl.store(out_ptrs,
289
+ acc_o,
290
+ mask=(offs_m[:, None] < seqlen_q) &
291
+ (offs_d[None, :] < headdim))
292
+
293
+
294
+ @triton.jit
295
+ def _bwd_preprocess_do_o_dot(
296
+ Out,
297
+ DO,
298
+ Delta,
299
+ stride_ob,
300
+ stride_oh,
301
+ stride_om,
302
+ stride_dob,
303
+ stride_doh,
304
+ stride_dom,
305
+ nheads,
306
+ seqlen_q,
307
+ seqlen_q_rounded,
308
+ headdim,
309
+ BLOCK_M: tl.constexpr,
310
+ BLOCK_HEADDIM: tl.constexpr,
311
+ ):
312
+ start_m = tl.program_id(0)
313
+ off_hb = tl.program_id(1)
314
+ off_b = off_hb // nheads
315
+ off_h = off_hb % nheads
316
+ # initialize offsets
317
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
318
+ offs_d = tl.arange(0, BLOCK_HEADDIM)
319
+ # load
320
+ o = tl.load(Out + off_b * stride_ob + off_h * stride_oh +
321
+ offs_m[:, None] * stride_om + offs_d[None, :],
322
+ mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
323
+ other=0.0).to(tl.float32)
324
+ do = tl.load(DO + off_b * stride_dob + off_h * stride_doh +
325
+ offs_m[:, None] * stride_dom + offs_d[None, :],
326
+ mask=(offs_m[:, None] < seqlen_q) &
327
+ (offs_d[None, :] < headdim),
328
+ other=0.0).to(tl.float32)
329
+ delta = tl.sum(o * do, axis=1)
330
+ # write-back
331
+ tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
332
+
333
+
334
+ @triton.jit
335
+ def _bwd_kernel_one_col_block(
336
+ start_n,
337
+ Q,
338
+ K,
339
+ V,
340
+ Bias,
341
+ DO,
342
+ DQ,
343
+ DK,
344
+ DV,
345
+ LSE,
346
+ D,
347
+ softmax_scale,
348
+ stride_qm,
349
+ stride_kn,
350
+ stride_vn,
351
+ stride_bm,
352
+ stride_dom,
353
+ stride_dqm,
354
+ stride_dkn,
355
+ stride_dvn,
356
+ seqlen_q,
357
+ seqlen_k,
358
+ headdim,
359
+ ATOMIC_ADD: tl.constexpr,
360
+ BIAS_TYPE: tl.constexpr,
361
+ IS_CAUSAL: tl.constexpr,
362
+ BLOCK_HEADDIM: tl.constexpr,
363
+ EVEN_M: tl.constexpr,
364
+ EVEN_N: tl.constexpr,
365
+ EVEN_HEADDIM: tl.constexpr,
366
+ BLOCK_M: tl.constexpr,
367
+ BLOCK_N: tl.constexpr,
368
+ ):
369
+ # We need to make sure begin_m is a multiple of BLOCK_M (not BLOCK_N)
370
+ begin_m = 0 if not IS_CAUSAL else ((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M
371
+ # initialize row/col offsets
372
+ offs_qm = begin_m + tl.arange(0, BLOCK_M)
373
+ offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
374
+ offs_m = tl.arange(0, BLOCK_M)
375
+ offs_d = tl.arange(0, BLOCK_HEADDIM)
376
+ # initialize pointers to value-like data
377
+ q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
378
+ k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
379
+ v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
380
+ do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
381
+ dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
382
+ if BIAS_TYPE == 'vector':
383
+ b_ptrs = Bias + offs_n
384
+ elif BIAS_TYPE == 'matrix':
385
+ b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
386
+ else:
387
+ raise ValueError("BIAS_TYPE must be one of {'vector', 'matrix'}")
388
+ # initialize dv and dk
389
+ dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
390
+ dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
391
+ # k and v stay in SRAM throughout
392
+ # [2022-10-30] TD: Same bug as the fwd. In the case of EVEN_N=True and EVEN_M=False,
393
+ # if we just call tl.load(k_ptrs), we get the wrong output!
394
+ if EVEN_N & EVEN_M:
395
+ if EVEN_HEADDIM:
396
+ k = tl.load(k_ptrs)
397
+ v = tl.load(v_ptrs)
398
+ else:
399
+ k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
400
+ v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
401
+ else:
402
+ if EVEN_HEADDIM:
403
+ k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
404
+ v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
405
+ else:
406
+ k = tl.load(k_ptrs,
407
+ mask=(offs_n[:, None] < seqlen_k) &
408
+ (offs_d[None, :] < headdim),
409
+ other=0.0)
410
+ v = tl.load(v_ptrs,
411
+ mask=(offs_n[:, None] < seqlen_k) &
412
+ (offs_d[None, :] < headdim),
413
+ other=0.0)
414
+ # loop over rows
415
+ num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
416
+ for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
417
+ start_m = tl.multiple_of(start_m, BLOCK_M)
418
+ offs_m_curr = start_m + offs_m
419
+ # load q, k, v, do on-chip
420
+ # Same bug as below. Otherwise gives wrong result for headdim=40, seqlen=(128, 117)
421
+ if EVEN_M & EVEN_HEADDIM:
422
+ q = tl.load(q_ptrs)
423
+ else:
424
+ if EVEN_HEADDIM:
425
+ q = tl.load(q_ptrs,
426
+ mask=offs_m_curr[:, None] < seqlen_q,
427
+ other=0.0)
428
+ else:
429
+ q = tl.load(q_ptrs,
430
+ mask=(offs_m_curr[:, None] < seqlen_q) &
431
+ (offs_d[None, :] < headdim),
432
+ other=0.0)
433
+ # recompute p = softmax(qk, dim=-1).T
434
+ qk = tl.dot(q, k, trans_b=True)
435
+ # Trying to combine the two masks seems to make the result wrong
436
+ if not EVEN_N: # Need to mask out otherwise the softmax is wrong
437
+ qk = tl.where(offs_n[None, :] < seqlen_k, qk, float('-inf'))
438
+ if IS_CAUSAL:
439
+ qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk,
440
+ float('-inf'))
441
+ if BIAS_TYPE != 'none':
442
+ if BIAS_TYPE == 'vector':
443
+ if EVEN_N:
444
+ bias = tl.load(b_ptrs).to(tl.float32)
445
+ else:
446
+ bias = tl.load(b_ptrs, mask=offs_n < seqlen_k,
447
+ other=0.0).to(tl.float32)
448
+ bias = bias[None, :]
449
+ elif BIAS_TYPE == 'matrix':
450
+ if EVEN_M & EVEN_N:
451
+ bias = tl.load(b_ptrs).to(tl.float32)
452
+ else:
453
+ bias = tl.load(b_ptrs,
454
+ mask=(offs_m_curr[:, None] < seqlen_q) &
455
+ (offs_n[None, :] < seqlen_k),
456
+ other=0.0).to(tl.float32)
457
+ else:
458
+ raise ValueError(
459
+ "BIAS_TYPE must be one of {'vector', 'matrix'}")
460
+ qk = qk * softmax_scale + bias
461
+ # There seems to be a race condition when headdim=48/96, and dq, dk, dv are wrong.
462
+ # Also wrong for headdim=64.
463
+ if not (EVEN_M & EVEN_HEADDIM):
464
+ tl.debug_barrier()
465
+ lse_i = tl.load(LSE + offs_m_curr)
466
+ if BIAS_TYPE == 'none':
467
+ p = tl.exp(qk * softmax_scale - lse_i[:, None])
468
+ else:
469
+ p = tl.exp(qk - lse_i[:, None])
470
+ # compute dv
471
+ # [2022-10-30] TD: A Triton bug: if EVEN_M=True and EVEN_HEADDIM=False, if we call
472
+ # do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0), we get wrong outputs
473
+ # in the case of headdim=48/96, seqlen_q & seqlen_k >= 512. If headdim=40 or seqlen < 512,
474
+ # the output is correct.
475
+ if EVEN_M & EVEN_HEADDIM:
476
+ do = tl.load(do_ptrs)
477
+ else:
478
+ # [2022-11-01] TD: Triton bug, there's a race condition if we just use m_mask and not d_mask.
479
+ do = tl.load(do_ptrs,
480
+ mask=(offs_m_curr[:, None] < seqlen_q) &
481
+ (offs_d[None, :] < headdim),
482
+ other=0.0)
483
+ # if EVEN_M:
484
+ # if EVEN_HEADDIM:
485
+ # do = tl.load(do_ptrs)
486
+ # else:
487
+ # do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
488
+ # else:
489
+ # if EVEN_HEADDIM:
490
+ # do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
491
+ # else:
492
+ # do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q)
493
+ # & (offs_d[None, :] < headdim), other=0.0)
494
+ dv += tl.dot(p.to(do.dtype), do, trans_a=True)
495
+ # compute dp = dot(v, do)
496
+ # There seems to be a race condition when headdim=48/96, and dq, dk are wrong.
497
+ # Also wrong for headdim=128, seqlen=(108, 256), and ATOMIC_ADD=True
498
+ # Also wrong for headdim=64, seqlen=(1023, 1024), and ATOMIC_ADD=False
499
+ if not (EVEN_M & EVEN_HEADDIM):
500
+ tl.debug_barrier()
501
+ dp = tl.dot(do, v, trans_b=True)
502
+ # There's a race condition for headdim=48
503
+ if not EVEN_HEADDIM:
504
+ tl.debug_barrier()
505
+ # compute ds = p * (dp - delta[:, None])
506
+ # Putting the subtraction after the dp matmul (instead of before) is slightly faster
507
+ Di = tl.load(D + offs_m_curr)
508
+ # Converting ds to q.dtype here reduces register pressure and makes it much faster
509
+ # for BLOCK_HEADDIM=128
510
+ ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
511
+ # compute dk = dot(ds.T, q)
512
+ dk += tl.dot(ds, q, trans_a=True)
513
+ # compute dq
514
+ if not ATOMIC_ADD:
515
+ if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M
516
+ dq = tl.load(dq_ptrs, eviction_policy='evict_last')
517
+ dq += tl.dot(ds, k)
518
+ tl.store(dq_ptrs, dq, eviction_policy='evict_last')
519
+ else:
520
+ if EVEN_HEADDIM:
521
+ dq = tl.load(dq_ptrs,
522
+ mask=offs_m_curr[:, None] < seqlen_q,
523
+ other=0.0,
524
+ eviction_policy='evict_last')
525
+ dq += tl.dot(ds, k)
526
+ tl.store(dq_ptrs,
527
+ dq,
528
+ mask=offs_m_curr[:, None] < seqlen_q,
529
+ eviction_policy='evict_last')
530
+ else:
531
+ dq = tl.load(dq_ptrs,
532
+ mask=(offs_m_curr[:, None] < seqlen_q) &
533
+ (offs_d[None, :] < headdim),
534
+ other=0.0,
535
+ eviction_policy='evict_last')
536
+ dq += tl.dot(ds, k)
537
+ tl.store(dq_ptrs,
538
+ dq,
539
+ mask=(offs_m_curr[:, None] < seqlen_q) &
540
+ (offs_d[None, :] < headdim),
541
+ eviction_policy='evict_last')
542
+ else: # If we're parallelizing across the seqlen_k dimension
543
+ dq = tl.dot(ds, k)
544
+ if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M
545
+ tl.atomic_add(dq_ptrs, dq)
546
+ else:
547
+ if EVEN_HEADDIM:
548
+ tl.atomic_add(dq_ptrs,
549
+ dq,
550
+ mask=offs_m_curr[:, None] < seqlen_q)
551
+ else:
552
+ tl.atomic_add(dq_ptrs,
553
+ dq,
554
+ mask=(offs_m_curr[:, None] < seqlen_q) &
555
+ (offs_d[None, :] < headdim))
556
+ # increment pointers
557
+ dq_ptrs += BLOCK_M * stride_dqm
558
+ q_ptrs += BLOCK_M * stride_qm
559
+ do_ptrs += BLOCK_M * stride_dom
560
+ if BIAS_TYPE == 'matrix':
561
+ b_ptrs += BLOCK_M * stride_bm
562
+ # write-back
563
+ dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
564
+ dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
565
+ # [2022-11-01] TD: Same bug. In the case of EVEN_N=True and EVEN_M=False,
566
+ # if we just call tl.store(dv_ptrs), there's a race condition
567
+ if EVEN_N & EVEN_M:
568
+ if EVEN_HEADDIM:
569
+ tl.store(dv_ptrs, dv)
570
+ tl.store(dk_ptrs, dk)
571
+ else:
572
+ tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
573
+ tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
574
+ else:
575
+ if EVEN_HEADDIM:
576
+ tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
577
+ tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
578
+ else:
579
+ tl.store(dv_ptrs,
580
+ dv,
581
+ mask=(offs_n[:, None] < seqlen_k) &
582
+ (offs_d[None, :] < headdim))
583
+ tl.store(dk_ptrs,
584
+ dk,
585
+ mask=(offs_n[:, None] < seqlen_k) &
586
+ (offs_d[None, :] < headdim))
587
+
588
+
589
+ def init_to_zero(name):
590
+ return lambda nargs: nargs[name].zero_()
591
+
592
+
593
+ @triton.autotune(
594
+ configs=[
595
+ triton.Config(
596
+ {
597
+ 'BLOCK_M': 128,
598
+ 'BLOCK_N': 128,
599
+ 'SEQUENCE_PARALLEL': False
600
+ },
601
+ num_warps=8,
602
+ num_stages=1,
603
+ pre_hook=init_to_zero('DQ')),
604
+ triton.Config(
605
+ {
606
+ 'BLOCK_M': 128,
607
+ 'BLOCK_N': 128,
608
+ 'SEQUENCE_PARALLEL': True
609
+ },
610
+ num_warps=8,
611
+ num_stages=1,
612
+ pre_hook=init_to_zero('DQ')),
613
+ # Other configs seem to give wrong results when seqlen_q % 128 != 0, disabling them for now
614
+ # # Kernel is buggy (give wrong result) if we set BLOCK_m=128, BLOCK_n=64, num_warps=*4*
615
+ # triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')),
616
+ # triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')),
617
+ # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')),
618
+ # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')),
619
+ ],
620
+ key=[
621
+ 'CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL',
622
+ 'BLOCK_HEADDIM'
623
+ ],
624
+ )
625
+ @triton.heuristics({
626
+ 'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0,
627
+ 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0,
628
+ 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM'],
629
+ })
630
+ @triton.jit
631
+ def _bwd_kernel(
632
+ Q,
633
+ K,
634
+ V,
635
+ Bias,
636
+ DO,
637
+ DQ,
638
+ DK,
639
+ DV,
640
+ LSE,
641
+ D,
642
+ softmax_scale,
643
+ stride_qb,
644
+ stride_qh,
645
+ stride_qm,
646
+ stride_kb,
647
+ stride_kh,
648
+ stride_kn,
649
+ stride_vb,
650
+ stride_vh,
651
+ stride_vn,
652
+ stride_bb,
653
+ stride_bh,
654
+ stride_bm,
655
+ stride_dob,
656
+ stride_doh,
657
+ stride_dom,
658
+ stride_dqb,
659
+ stride_dqh,
660
+ stride_dqm,
661
+ stride_dkb,
662
+ stride_dkh,
663
+ stride_dkn,
664
+ stride_dvb,
665
+ stride_dvh,
666
+ stride_dvn,
667
+ nheads,
668
+ seqlen_q,
669
+ seqlen_k,
670
+ seqlen_q_rounded,
671
+ headdim,
672
+ CACHE_KEY_SEQLEN_Q,
673
+ CACHE_KEY_SEQLEN_K,
674
+ BIAS_TYPE: tl.constexpr,
675
+ IS_CAUSAL: tl.constexpr,
676
+ BLOCK_HEADDIM: tl.constexpr,
677
+ SEQUENCE_PARALLEL: tl.constexpr,
678
+ EVEN_M: tl.constexpr,
679
+ EVEN_N: tl.constexpr,
680
+ EVEN_HEADDIM: tl.constexpr,
681
+ BLOCK_M: tl.constexpr,
682
+ BLOCK_N: tl.constexpr,
683
+ ):
684
+ off_hb = tl.program_id(1)
685
+ off_b = off_hb // nheads
686
+ off_h = off_hb % nheads
687
+ # offset pointers for batch/head
688
+ Q += off_b * stride_qb + off_h * stride_qh
689
+ K += off_b * stride_kb + off_h * stride_kh
690
+ V += off_b * stride_vb + off_h * stride_vh
691
+ DO += off_b * stride_dob + off_h * stride_doh
692
+ DQ += off_b * stride_dqb + off_h * stride_dqh
693
+ DK += off_b * stride_dkb + off_h * stride_dkh
694
+ DV += off_b * stride_dvb + off_h * stride_dvh
695
+ if BIAS_TYPE != 'none':
696
+ Bias += off_b * stride_bb + off_h * stride_bh
697
+ # pointer to row-wise quantities in value-like data
698
+ D += off_hb * seqlen_q_rounded
699
+ LSE += off_hb * seqlen_q_rounded
700
+ if not SEQUENCE_PARALLEL:
701
+ num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
702
+ for start_n in range(0, num_block_n):
703
+ _bwd_kernel_one_col_block(start_n,
704
+ Q,
705
+ K,
706
+ V,
707
+ Bias,
708
+ DO,
709
+ DQ,
710
+ DK,
711
+ DV,
712
+ LSE,
713
+ D,
714
+ softmax_scale,
715
+ stride_qm,
716
+ stride_kn,
717
+ stride_vn,
718
+ stride_bm,
719
+ stride_dom,
720
+ stride_dqm,
721
+ stride_dkn,
722
+ stride_dvn,
723
+ seqlen_q,
724
+ seqlen_k,
725
+ headdim,
726
+ ATOMIC_ADD=False,
727
+ BIAS_TYPE=BIAS_TYPE,
728
+ IS_CAUSAL=IS_CAUSAL,
729
+ BLOCK_HEADDIM=BLOCK_HEADDIM,
730
+ EVEN_M=EVEN_M,
731
+ EVEN_N=EVEN_N,
732
+ EVEN_HEADDIM=EVEN_HEADDIM,
733
+ BLOCK_M=BLOCK_M,
734
+ BLOCK_N=BLOCK_N)
735
+ else:
736
+ start_n = tl.program_id(0)
737
+ _bwd_kernel_one_col_block(start_n,
738
+ Q,
739
+ K,
740
+ V,
741
+ Bias,
742
+ DO,
743
+ DQ,
744
+ DK,
745
+ DV,
746
+ LSE,
747
+ D,
748
+ softmax_scale,
749
+ stride_qm,
750
+ stride_kn,
751
+ stride_vn,
752
+ stride_bm,
753
+ stride_dom,
754
+ stride_dqm,
755
+ stride_dkn,
756
+ stride_dvn,
757
+ seqlen_q,
758
+ seqlen_k,
759
+ headdim,
760
+ ATOMIC_ADD=True,
761
+ BIAS_TYPE=BIAS_TYPE,
762
+ IS_CAUSAL=IS_CAUSAL,
763
+ BLOCK_HEADDIM=BLOCK_HEADDIM,
764
+ EVEN_M=EVEN_M,
765
+ EVEN_N=EVEN_N,
766
+ EVEN_HEADDIM=EVEN_HEADDIM,
767
+ BLOCK_M=BLOCK_M,
768
+ BLOCK_N=BLOCK_N)
769
+
770
+
771
+ def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
772
+ # shape constraints
773
+ batch, seqlen_q, nheads, d = q.shape
774
+ _, seqlen_k, _, _ = k.shape
775
+ assert k.shape == (batch, seqlen_k, nheads, d)
776
+ assert v.shape == (batch, seqlen_k, nheads, d)
777
+ assert d <= 128, 'FlashAttention only supports head dimensions up to 128'
778
+ assert q.dtype == k.dtype == v.dtype, 'All tensors must have the same type'
779
+ assert q.dtype in [torch.float16,
780
+ torch.bfloat16], 'Only support fp16 and bf16'
781
+ assert q.is_cuda and k.is_cuda and v.is_cuda
782
+ softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
783
+
784
+ has_bias = bias is not None
785
+ bias_type = 'none'
786
+ if has_bias:
787
+ assert bias.dtype in [q.dtype, torch.float]
788
+ assert bias.is_cuda
789
+ assert bias.dim() == 4
790
+ if bias.stride(-1) != 1:
791
+ bias = bias.contiguous()
792
+ if bias.shape[2:] == (1, seqlen_k):
793
+ bias_type = 'vector'
794
+ elif bias.shape[2:] == (seqlen_q, seqlen_k):
795
+ bias_type = 'matrix'
796
+ else:
797
+ raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k)'
798
+ ' or (seqlen_q, seqlen_k)')
799
+ if bias.shape[:2] == (1, nheads):
800
+ bias = repeat(bias, '1 h ... -> b h ...', b=batch)
801
+ elif bias.shape[:2] == (batch, 1):
802
+ bias = repeat(bias, 'b 1 ... -> b h ...', h=nheads)
803
+ elif bias.shape[:2] == (1, 1):
804
+ bias = repeat(bias, '1 h ... -> b h ...', b=batch)
805
+ bias = repeat(bias, 'b 1 ... -> b h ...', h=nheads)
806
+ assert bias.shape[:2] == (
807
+ batch, nheads
808
+ ), f'First 2 dimensions of bias must be broadcastable to (batch, nheads) = ({batch, nheads}). Bias has shape: {bias.shape}'
809
+ assert bias is not None # for type checking
810
+ bias_strides = (bias.stride(0), bias.stride(1),
811
+ bias.stride(2)) if has_bias else (0, 0, 0)
812
+
813
+ seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
814
+ lse = torch.empty((batch, nheads, seqlen_q_rounded),
815
+ device=q.device,
816
+ dtype=torch.float32)
817
+ tmp = torch.empty((batch, nheads, seqlen_q_rounded),
818
+ device=q.device,
819
+ dtype=torch.float32)
820
+ o = torch.empty_like(q)
821
+
822
+ BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
823
+ # BLOCK = 128
824
+ # num_warps = 4 if d <= 64 else 8
825
+ grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
826
+ _fwd_kernel[grid]( # type: ignore
827
+ q,
828
+ k,
829
+ v,
830
+ bias,
831
+ o,
832
+ lse,
833
+ tmp,
834
+ softmax_scale,
835
+ q.stride(0),
836
+ q.stride(2),
837
+ q.stride(1),
838
+ k.stride(0),
839
+ k.stride(2),
840
+ k.stride(1),
841
+ v.stride(0),
842
+ v.stride(2),
843
+ v.stride(1),
844
+ *bias_strides,
845
+ o.stride(0),
846
+ o.stride(2),
847
+ o.stride(1),
848
+ nheads,
849
+ seqlen_q,
850
+ seqlen_k,
851
+ seqlen_q_rounded,
852
+ d,
853
+ seqlen_q // 32,
854
+ seqlen_k // 32, # key for triton cache (limit number of compilations)
855
+ # Can't use kwargs here because triton autotune expects key to be args, not kwargs
856
+ # IS_CAUSAL=causal, BLOCK_HEADDIM=d,
857
+ bias_type,
858
+ causal,
859
+ BLOCK_HEADDIM,
860
+ # BLOCK_M=BLOCK, BLOCK_N=BLOCK,
861
+ # num_warps=num_warps,
862
+ # num_stages=1,
863
+ )
864
+ return o, lse, softmax_scale # softmax_scale could have been updated
865
+
866
+
867
+ def _flash_attn_backward(do,
868
+ q,
869
+ k,
870
+ v,
871
+ o,
872
+ lse,
873
+ dq,
874
+ dk,
875
+ dv,
876
+ bias=None,
877
+ causal=False,
878
+ softmax_scale=None):
879
+ # Make sure that the last dimension is contiguous
880
+ if do.stride(-1) != 1:
881
+ do = do.contiguous()
882
+ batch, seqlen_q, nheads, d = q.shape
883
+ _, seqlen_k, _, _ = k.shape
884
+ # assert d in {16, 32, 64, 128}
885
+ assert d <= 128
886
+ seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
887
+ assert lse.shape == (batch, nheads, seqlen_q_rounded)
888
+ assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
889
+ assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
890
+ softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
891
+ # dq_accum = torch.zeros_like(q, dtype=torch.float32)
892
+ dq_accum = torch.empty_like(q, dtype=torch.float32)
893
+ delta = torch.empty_like(lse)
894
+ # delta = torch.zeros_like(lse)
895
+
896
+ BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
897
+ grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
898
+ _bwd_preprocess_do_o_dot[grid]( # type: ignore
899
+ o,
900
+ do,
901
+ delta,
902
+ o.stride(0),
903
+ o.stride(2),
904
+ o.stride(1),
905
+ do.stride(0),
906
+ do.stride(2),
907
+ do.stride(1),
908
+ nheads,
909
+ seqlen_q,
910
+ seqlen_q_rounded,
911
+ d,
912
+ BLOCK_M=128,
913
+ BLOCK_HEADDIM=BLOCK_HEADDIM,
914
+ )
915
+
916
+ has_bias = bias is not None
917
+ bias_type = 'none'
918
+ if has_bias:
919
+ assert bias.dtype in [q.dtype, torch.float]
920
+ assert bias.is_cuda
921
+ assert bias.dim() == 4
922
+ assert bias.stride(-1) == 1
923
+ if bias.shape[2:] == (1, seqlen_k):
924
+ bias_type = 'vector'
925
+ elif bias.shape[2:] == (seqlen_q, seqlen_k):
926
+ bias_type = 'matrix'
927
+ else:
928
+ raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k)'
929
+ ' or (seqlen_q, seqlen_k)')
930
+ if bias.shape[:2] == (1, nheads):
931
+ bias = repeat(bias, '1 h ... -> b h ...', b=batch)
932
+ elif bias.shape[:2] == (batch, 1):
933
+ bias = repeat(bias, 'b 1 ... -> b h ...', h=nheads)
934
+ elif bias.shape[:2] == (1, 1):
935
+ bias = repeat(bias, '1 h ... -> b h ...', b=batch)
936
+ bias = repeat(bias, 'b 1 ... -> b h ...', h=nheads)
937
+ assert bias.shape[:2] == (
938
+ batch, nheads
939
+ ), f'First 2 dimensions of bias must be broadcastable to (batch, nheads) = ({batch, nheads}). Bias has shape: {bias.shape}'
940
+ assert bias is not None # type checking
941
+ bias_strides = (bias.stride(0), bias.stride(1),
942
+ bias.stride(2)) if has_bias else (0, 0, 0)
943
+
944
+ # BLOCK_M = 128
945
+ # BLOCK_N = 64
946
+ # num_warps = 4
947
+ grid = lambda META: (triton.cdiv(seqlen_k, META['BLOCK_N'])
948
+ if META['SEQUENCE_PARALLEL'] else 1, batch * nheads)
949
+ _bwd_kernel[grid]( # type: ignore
950
+ q,
951
+ k,
952
+ v,
953
+ bias,
954
+ do,
955
+ dq_accum,
956
+ dk,
957
+ dv,
958
+ lse,
959
+ delta,
960
+ softmax_scale,
961
+ q.stride(0),
962
+ q.stride(2),
963
+ q.stride(1),
964
+ k.stride(0),
965
+ k.stride(2),
966
+ k.stride(1),
967
+ v.stride(0),
968
+ v.stride(2),
969
+ v.stride(1),
970
+ *bias_strides,
971
+ do.stride(0),
972
+ do.stride(2),
973
+ do.stride(1),
974
+ dq_accum.stride(0),
975
+ dq_accum.stride(2),
976
+ dq_accum.stride(1),
977
+ dk.stride(0),
978
+ dk.stride(2),
979
+ dk.stride(1),
980
+ dv.stride(0),
981
+ dv.stride(2),
982
+ dv.stride(1),
983
+ nheads,
984
+ seqlen_q,
985
+ seqlen_k,
986
+ seqlen_q_rounded,
987
+ d,
988
+ seqlen_q // 32,
989
+ seqlen_k // 32, # key for triton cache (limit number of compilations)
990
+ # Can't use kwargs here because triton autotune expects key to be args, not kwargs
991
+ # IS_CAUSAL=causal, BLOCK_HEADDIM=d,
992
+ bias_type,
993
+ causal,
994
+ BLOCK_HEADDIM,
995
+ # SEQUENCE_PARALLEL=False,
996
+ # BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
997
+ # num_warps=num_warps,
998
+ # num_stages=1,
999
+ )
1000
+ dq.copy_(dq_accum)
1001
+
1002
+
1003
+ class _FlashAttnQKVPackedFunc(torch.autograd.Function):
1004
+
1005
+ @staticmethod
1006
+ def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
1007
+ """Forward pass for packed FlashAttention.
1008
+
1009
+ Args:
1010
+ ctx: autograd context
1011
+ qkv: (batch, seqlen, 3, nheads, headdim)
1012
+ bias: optional, shape broadcastable to (batch, nheads, seqlen, seqlen).
1013
+ For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
1014
+ ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
1015
+ causal (bool): whether to incorporate causal attention masking
1016
+ softmax_scale (float, optional): scale factor for softmax
1017
+ """
1018
+ # Make sure that the last dimension is contiguous
1019
+ if qkv.stride(-1) != 1:
1020
+ qkv = qkv.contiguous()
1021
+ o, lse, ctx.softmax_scale = _flash_attn_forward(
1022
+ qkv[:, :, 0],
1023
+ qkv[:, :, 1],
1024
+ qkv[:, :, 2],
1025
+ bias=bias,
1026
+ causal=causal,
1027
+ softmax_scale=softmax_scale)
1028
+ ctx.save_for_backward(qkv, o, lse, bias)
1029
+ ctx.causal = causal
1030
+ return o
1031
+
1032
+ @staticmethod
1033
+ def backward(ctx, do):
1034
+ qkv, o, lse, bias = ctx.saved_tensors
1035
+ assert not ctx.needs_input_grad[
1036
+ 1], 'FlashAttention does not support bias gradient yet'
1037
+ # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
1038
+ # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
1039
+ with torch.inference_mode():
1040
+ dqkv = torch.empty_like(qkv)
1041
+ _flash_attn_backward(do,
1042
+ qkv[:, :, 0],
1043
+ qkv[:, :, 1],
1044
+ qkv[:, :, 2],
1045
+ o,
1046
+ lse,
1047
+ dqkv[:, :, 0],
1048
+ dqkv[:, :, 1],
1049
+ dqkv[:, :, 2],
1050
+ bias=bias,
1051
+ causal=ctx.causal,
1052
+ softmax_scale=ctx.softmax_scale)
1053
+ return dqkv, None, None, None
1054
+
1055
+
1056
+ flash_attn_qkvpacked_func = _FlashAttnQKVPackedFunc.apply
1057
+
1058
+
1059
+ class _FlashAttnFunc(torch.autograd.Function):
1060
+
1061
+ @staticmethod
1062
+ def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
1063
+ """Forward pass for FlashAttention.
1064
+
1065
+ Args:
1066
+ ctx: autograd context
1067
+ q: (batch_size, seqlen_q, nheads, headdim)
1068
+ k: (batch_size, seqlen_k, nheads, headdim)
1069
+ v: (batch_size, seqlen_k, nheads, headdim)
1070
+ bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
1071
+ For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
1072
+ ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
1073
+ causal (bool): whether to incorporate causal attention masking
1074
+ softmax_scale (float, optional): scale factor for softmax
1075
+ """
1076
+ # Make sure that the last dimension is contiguous
1077
+ q, k, v = [
1078
+ x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]
1079
+ ]
1080
+ o, lse, ctx.softmax_scale = _flash_attn_forward(
1081
+ q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale)
1082
+ ctx.save_for_backward(q, k, v, o, lse, bias)
1083
+ ctx.causal = causal
1084
+ return o
1085
+
1086
+ @staticmethod
1087
+ def backward(ctx, do):
1088
+ q, k, v, o, lse, bias = ctx.saved_tensors
1089
+ assert not ctx.needs_input_grad[
1090
+ 3], 'FlashAttention does not support bias gradient yet'
1091
+ # Triton's autotune causes the Tensor._version to change, and so PyTorch autograd
1092
+ # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
1093
+ with torch.inference_mode():
1094
+ dq = torch.empty_like(q)
1095
+ dk = torch.empty_like(k)
1096
+ dv = torch.empty_like(v)
1097
+ _flash_attn_backward(do,
1098
+ q,
1099
+ k,
1100
+ v,
1101
+ o,
1102
+ lse,
1103
+ dq,
1104
+ dk,
1105
+ dv,
1106
+ bias=bias,
1107
+ causal=ctx.causal,
1108
+ softmax_scale=ctx.softmax_scale)
1109
+ return dq, dk, dv, None, None, None
1110
+
1111
+
1112
+ flash_attn_func = _FlashAttnFunc.apply
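
The two `apply` aliases above are the public entry points of this file. Below is a minimal usage sketch, assuming a CUDA device, fp16 inputs, and a head dimension the Triton kernels support (64 here); the shapes follow the docstrings, and the `flash_attn_triton` import path is an assumption based on this file's name.

```python
# Usage sketch (not part of the diff): exercises the two autograd wrappers defined above.
import torch

from flash_attn_triton import flash_attn_func, flash_attn_qkvpacked_func  # assumed import path

batch, seqlen, nheads, headdim = 2, 128, 12, 64
dtype, device = torch.float16, 'cuda'

# Packed layout: q, k, v stacked into one (batch, seqlen, 3, nheads, headdim) tensor.
qkv = torch.randn(batch, seqlen, 3, nheads, headdim,
                  device=device, dtype=dtype, requires_grad=True)
# autograd.Function.apply takes positional arguments only: (qkv, bias, causal, softmax_scale)
out_packed = flash_attn_qkvpacked_func(qkv, None, True, None)

# Unpacked layout: separate (batch, seqlen, nheads, headdim) tensors.
q = torch.randn(batch, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)
out = flash_attn_func(q, k, v, None, False, None)  # bias=None, causal=False, default scale

(out_packed.sum() + out.sum()).backward()  # runs the Triton backward path shown above
```
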
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]
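
The three entries above describe the sentence-transformers pipeline for this checkpoint: a Transformer module at the repository root, a Pooling module configured in `1_Pooling`, and a `2_Normalize` step. The sketch below shows roughly how that pipeline is assembled by hand; the local path is a placeholder, and loading the custom BERT code may additionally require trusting remote code.

```python
# Rough equivalent of the module list in modules.json; "path/to/this/repo" is a placeholder.
from sentence_transformers import SentenceTransformer, models

transformer = models.Transformer("path/to/this/repo")   # module 0: path "" (repo root)
pooling = models.Pooling(
    transformer.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,                       # per the 1_Pooling config
)
normalize = models.Normalize()                           # module 2: 2_Normalize

model = SentenceTransformer(modules=[transformer, pooling, normalize])
embeddings = model.encode(["A sentence to embed.", "Another one."])
print(embeddings.shape)  # (2, hidden_size); rows are unit-normalised
```
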
mteb_results/AmazonCounterfactualClassification.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "e8379541af4e31359cca9fbcf4b00f2671dba205",
3
+ "mteb_dataset_name": "AmazonCounterfactualClassification",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "en": {
7
+ "accuracy": 0.697313432835821,
8
+ "accuracy_stderr": 0.04363113167902916,
9
+ "ap": 0.31618259511417734,
10
+ "ap_stderr": 0.0243939127481388,
11
+ "f1": 0.6330313825394228,
12
+ "f1_stderr": 0.03331211721747352,
13
+ "main_score": 0.697313432835821
14
+ },
15
+ "evaluation_time": 3.55
16
+ },
17
+ "validation": {
18
+ "en": {
19
+ "accuracy": 0.7074626865671642,
20
+ "accuracy_stderr": 0.03173177854547658,
21
+ "ap": 0.2916547890175021,
22
+ "ap_stderr": 0.028577509879931906,
23
+ "f1": 0.628207439570022,
24
+ "f1_stderr": 0.02728677964172927,
25
+ "main_score": 0.7074626865671642
26
+ },
27
+ "evaluation_time": 7.2
28
+ }
29
+ }
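
Each file under `mteb_results/` stores the raw per-task output of an MTEB run in this format. As a hedged sketch (the exact invocation used for this repo is not shown here), a result like the one above can be produced and read back with the `mteb` package, which the files report as version 1.1.0:

```python
# Sketch only; the model path is a placeholder and the run settings are assumptions.
import json

from mteb import MTEB
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("path/to/this/repo")
evaluation = MTEB(tasks=["AmazonCounterfactualClassification"])
evaluation.run(model, output_folder="mteb_results")  # writes AmazonCounterfactualClassification.json

# Reading a stored result back, e.g. the headline test score above:
with open("mteb_results/AmazonCounterfactualClassification.json") as f:
    scores = json.load(f)
print(scores["test"]["en"]["main_score"])
```
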
mteb_results/AmazonPolarityClassification.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "e2d317d38cd51312af73b3d32a06d1a08b442046",
3
+ "mteb_dataset_name": "AmazonPolarityClassification",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "accuracy": 0.8689837499999999,
7
+ "accuracy_stderr": 0.010742354621427285,
8
+ "ap": 0.8239500885672127,
9
+ "ap_stderr": 0.013236818266475252,
10
+ "evaluation_time": 1082.95,
11
+ "f1": 0.8687317947399658,
12
+ "f1_stderr": 0.011035411217540664,
13
+ "main_score": 0.8689837499999999
14
+ }
15
+ }
mteb_results/AmazonReviewsClassification.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "1399c76144fd37290681b995c656ef9b2e06e26d",
3
+ "mteb_dataset_name": "AmazonReviewsClassification",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "en": {
7
+ "accuracy": 0.44049999999999995,
8
+ "accuracy_stderr": 0.014423938435808711,
9
+ "f1": 0.4267624383248947,
10
+ "f1_stderr": 0.01351683620968048,
11
+ "main_score": 0.44049999999999995
12
+ },
13
+ "evaluation_time": 11.38
14
+ },
15
+ "validation": {
16
+ "en": {
17
+ "accuracy": 0.43798000000000004,
18
+ "accuracy_stderr": 0.012288352208494032,
19
+ "f1": 0.42483998553432956,
20
+ "f1_stderr": 0.015752944478543963,
21
+ "main_score": 0.43798000000000004
22
+ },
23
+ "evaluation_time": 13.69
24
+ }
25
+ }
mteb_results/ArguAna.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "ArguAna",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 52.53,
7
+ "map_at_1": 0.26174,
8
+ "map_at_10": 0.40976,
9
+ "map_at_100": 0.42067,
10
+ "map_at_1000": 0.42075,
11
+ "map_at_3": 0.35917,
12
+ "map_at_5": 0.38656,
13
+ "mrr_at_1": 0.26814,
14
+ "mrr_at_10": 0.41252,
15
+ "mrr_at_100": 0.42337,
16
+ "mrr_at_1000": 0.42345,
17
+ "mrr_at_3": 0.36226,
18
+ "mrr_at_5": 0.38914,
19
+ "ndcg_at_1": 0.26174,
20
+ "ndcg_at_10": 0.49819,
21
+ "ndcg_at_100": 0.54404,
22
+ "ndcg_at_1000": 0.5459,
23
+ "ndcg_at_3": 0.39231,
24
+ "ndcg_at_5": 0.44189,
25
+ "precision_at_1": 0.26174,
26
+ "precision_at_10": 0.07838,
27
+ "precision_at_100": 0.00982,
28
+ "precision_at_1000": 0.001,
29
+ "precision_at_3": 0.16287,
30
+ "precision_at_5": 0.12191,
31
+ "recall_at_1": 0.26174,
32
+ "recall_at_10": 0.78378,
33
+ "recall_at_100": 0.98222,
34
+ "recall_at_1000": 0.99644,
35
+ "recall_at_3": 0.48862,
36
+ "recall_at_5": 0.60953
37
+ }
38
+ }
mteb_results/ArxivClusteringP2P.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "a122ad7f3f0291bf49cc6f4d32aa80929df69d5d",
3
+ "mteb_dataset_name": "ArxivClusteringP2P",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 4034.06,
7
+ "v_measure": 0.4231689035788179,
8
+ "v_measure_std": 0.1399577095144373
9
+ }
10
+ }
mteb_results/ArxivClusteringS2S.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "f910caf1a6075f7329cdf8c1a6135696f37dbd53",
3
+ "mteb_dataset_name": "ArxivClusteringS2S",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 432.48,
7
+ "v_measure": 0.31280245136660983,
8
+ "v_measure_std": 0.14616358182910433
9
+ }
10
+ }
mteb_results/AskUbuntuDupQuestions.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "2000358ca161889fa9c082cb41daa8dcfb161a54",
3
+ "mteb_dataset_name": "AskUbuntuDupQuestions",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 4.26,
7
+ "map": 0.5879109720839415,
8
+ "mrr": 0.7179615705931495
9
+ }
10
+ }
mteb_results/BIOSSES.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "d3fb88f8f02e40887cd149695127462bbcf29b4a",
3
+ "mteb_dataset_name": "BIOSSES",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "cos_sim": {
7
+ "pearson": 0.7644918756608116,
8
+ "spearman": 0.7086607256286257
9
+ },
10
+ "euclidean": {
11
+ "pearson": 0.7412154678100815,
12
+ "spearman": 0.7086607256286257
13
+ },
14
+ "evaluation_time": 1.08,
15
+ "manhattan": {
16
+ "pearson": 0.7400786269644171,
17
+ "spearman": 0.7068353828321327
18
+ }
19
+ }
20
+ }
mteb_results/Banking77Classification.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "0fd18e25b25c072e09e0d92ab615fda904d66300",
3
+ "mteb_dataset_name": "Banking77Classification",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "accuracy": 0.7540584415584415,
7
+ "accuracy_stderr": 0.007828985179390284,
8
+ "evaluation_time": 20.37,
9
+ "f1": 0.7429514617572676,
10
+ "f1_stderr": 0.00868929710762345,
11
+ "main_score": 0.7540584415584415
12
+ }
13
+ }
mteb_results/BiorxivClusteringP2P.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "65b79d1d13f80053f67aca9498d9402c2d9f1f40",
3
+ "mteb_dataset_name": "BiorxivClusteringP2P",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 547.26,
7
+ "v_measure": 0.3741860080664014,
8
+ "v_measure_std": 0.008407780040443218
9
+ }
10
+ }
mteb_results/BiorxivClusteringS2S.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "258694dd0231531bc1fd9de6ceb52a0853c6d908",
3
+ "mteb_dataset_name": "BiorxivClusteringS2S",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 53.57,
7
+ "v_measure": 0.29319217023090705,
8
+ "v_measure_std": 0.010219281239166302
9
+ }
10
+ }
mteb_results/CQADupstackEnglishRetrieval.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "CQADupstackEnglishRetrieval",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 113.5,
7
+ "map_at_1": 0.22528,
8
+ "map_at_10": 0.30751,
9
+ "map_at_100": 0.31855,
10
+ "map_at_1000": 0.31972,
11
+ "map_at_3": 0.28465,
12
+ "map_at_5": 0.29738,
13
+ "mrr_at_1": 0.28662,
14
+ "mrr_at_10": 0.35912,
15
+ "mrr_at_100": 0.36726,
16
+ "mrr_at_1000": 0.36777,
17
+ "mrr_at_3": 0.34013,
18
+ "mrr_at_5": 0.35156,
19
+ "ndcg_at_1": 0.28662,
20
+ "ndcg_at_10": 0.35452,
21
+ "ndcg_at_100": 0.401,
22
+ "ndcg_at_1000": 0.42323,
23
+ "ndcg_at_3": 0.32112,
24
+ "ndcg_at_5": 0.33638,
25
+ "precision_at_1": 0.28662,
26
+ "precision_at_10": 0.06688,
27
+ "precision_at_100": 0.0113,
28
+ "precision_at_1000": 0.0016,
29
+ "precision_at_3": 0.15563,
30
+ "precision_at_5": 0.11019,
31
+ "recall_at_1": 0.22528,
32
+ "recall_at_10": 0.43748,
33
+ "recall_at_100": 0.64235,
34
+ "recall_at_1000": 0.78609,
35
+ "recall_at_3": 0.33937,
36
+ "recall_at_5": 0.38234
37
+ }
38
+ }
mteb_results/ClimateFEVER.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "ClimateFEVER",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 8671.85,
7
+ "map_at_1": 0.09468,
8
+ "map_at_10": 0.16029,
9
+ "map_at_100": 0.17693,
10
+ "map_at_1000": 0.17886,
11
+ "map_at_3": 0.1315,
12
+ "map_at_5": 0.14568,
13
+ "mrr_at_1": 0.21173,
14
+ "mrr_at_10": 0.31028,
15
+ "mrr_at_100": 0.32061,
16
+ "mrr_at_1000": 0.32119,
17
+ "mrr_at_3": 0.27535,
18
+ "mrr_at_5": 0.29431,
19
+ "ndcg_at_1": 0.21173,
20
+ "ndcg_at_10": 0.23224,
21
+ "ndcg_at_100": 0.30225,
22
+ "ndcg_at_1000": 0.33961,
23
+ "ndcg_at_3": 0.18174,
24
+ "ndcg_at_5": 0.19897,
25
+ "precision_at_1": 0.21173,
26
+ "precision_at_10": 0.07472,
27
+ "precision_at_100": 0.01501,
28
+ "precision_at_1000": 0.00219,
29
+ "precision_at_3": 0.13312,
30
+ "precision_at_5": 0.10619,
31
+ "recall_at_1": 0.09468,
32
+ "recall_at_10": 0.28823,
33
+ "recall_at_100": 0.53265,
34
+ "recall_at_1000": 0.74536,
35
+ "recall_at_3": 0.16672,
36
+ "recall_at_5": 0.21302
37
+ }
38
+ }
mteb_results/DBPedia.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "DBPedia",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 4445.99,
7
+ "map_at_1": 0.06343,
8
+ "map_at_10": 0.12717,
9
+ "map_at_100": 0.1648,
10
+ "map_at_1000": 0.17381,
11
+ "map_at_3": 0.09569,
12
+ "map_at_5": 0.11125,
13
+ "mrr_at_1": 0.4875,
14
+ "mrr_at_10": 0.58425,
15
+ "mrr_at_100": 0.59075,
16
+ "mrr_at_1000": 0.59095,
17
+ "mrr_at_3": 0.56292,
18
+ "mrr_at_5": 0.57679,
19
+ "ndcg_at_1": 0.37875,
20
+ "ndcg_at_10": 0.2777,
21
+ "ndcg_at_100": 0.30289,
22
+ "ndcg_at_1000": 0.36188,
23
+ "ndcg_at_3": 0.31386,
24
+ "ndcg_at_5": 0.29923,
25
+ "precision_at_1": 0.4875,
26
+ "precision_at_10": 0.22375,
27
+ "precision_at_100": 0.06342,
28
+ "precision_at_1000": 0.01449,
29
+ "precision_at_3": 0.355,
30
+ "precision_at_5": 0.3055,
31
+ "recall_at_1": 0.06343,
32
+ "recall_at_10": 0.16936,
33
+ "recall_at_100": 0.35956,
34
+ "recall_at_1000": 0.55787,
35
+ "recall_at_3": 0.10771,
36
+ "recall_at_5": 0.1367
37
+ }
38
+ }
mteb_results/EmotionClassification.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "4f58c6b202a23cf9a4da393831edf4f9183cad37",
3
+ "mteb_dataset_name": "EmotionClassification",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "accuracy": 0.4199,
7
+ "accuracy_stderr": 0.02234367919569201,
8
+ "evaluation_time": 3.37,
9
+ "f1": 0.3682340217456495,
10
+ "f1_stderr": 0.021776128234136445,
11
+ "main_score": 0.4199
12
+ },
13
+ "validation": {
14
+ "accuracy": 0.41864999999999997,
15
+ "accuracy_stderr": 0.022959801828413062,
16
+ "evaluation_time": 3.29,
17
+ "f1": 0.3748604511300154,
18
+ "f1_stderr": 0.02042335727335004,
19
+ "main_score": 0.41864999999999997
20
+ }
21
+ }
mteb_results/FEVER.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "FEVER",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 16698.45,
7
+ "map_at_1": 0.40088,
8
+ "map_at_10": 0.52692,
9
+ "map_at_100": 0.53296,
10
+ "map_at_1000": 0.53325,
11
+ "map_at_3": 0.49905,
12
+ "map_at_5": 0.51617,
13
+ "mrr_at_1": 0.43009,
14
+ "mrr_at_10": 0.56203,
15
+ "mrr_at_100": 0.5675,
16
+ "mrr_at_1000": 0.56769,
17
+ "mrr_at_3": 0.534,
18
+ "mrr_at_5": 0.55163,
19
+ "ndcg_at_1": 0.43009,
20
+ "ndcg_at_10": 0.5939,
21
+ "ndcg_at_100": 0.6213,
22
+ "ndcg_at_1000": 0.62793,
23
+ "ndcg_at_3": 0.53878,
24
+ "ndcg_at_5": 0.56887,
25
+ "precision_at_1": 0.43009,
26
+ "precision_at_10": 0.08366,
27
+ "precision_at_100": 0.00983,
28
+ "precision_at_1000": 0.00105,
29
+ "precision_at_3": 0.22377,
30
+ "precision_at_5": 0.15035,
31
+ "recall_at_1": 0.40088,
32
+ "recall_at_10": 0.76687,
33
+ "recall_at_100": 0.8891,
34
+ "recall_at_1000": 0.93782,
35
+ "recall_at_3": 0.6181,
36
+ "recall_at_5": 0.69131
37
+ }
38
+ }
mteb_results/FiQA2018.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "FiQA2018",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 138.39,
7
+ "map_at_1": 0.10817,
8
+ "map_at_10": 0.189,
9
+ "map_at_100": 0.20448,
10
+ "map_at_1000": 0.20661,
11
+ "map_at_3": 0.15979,
12
+ "map_at_5": 0.17415,
13
+ "mrr_at_1": 0.23148,
14
+ "mrr_at_10": 0.31208,
15
+ "mrr_at_100": 0.32167,
16
+ "mrr_at_1000": 0.32242,
17
+ "mrr_at_3": 0.28498,
18
+ "mrr_at_5": 0.29964,
19
+ "ndcg_at_1": 0.23148,
20
+ "ndcg_at_10": 0.25326,
21
+ "ndcg_at_100": 0.31927,
22
+ "ndcg_at_1000": 0.36081,
23
+ "ndcg_at_3": 0.21647,
24
+ "ndcg_at_5": 0.22763,
25
+ "precision_at_1": 0.23148,
26
+ "precision_at_10": 0.07546,
27
+ "precision_at_100": 0.01415,
28
+ "precision_at_1000": 0.00216,
29
+ "precision_at_3": 0.14969,
30
+ "precision_at_5": 0.11327,
31
+ "recall_at_1": 0.10817,
32
+ "recall_at_10": 0.32164,
33
+ "recall_at_100": 0.57655,
34
+ "recall_at_1000": 0.82797,
35
+ "recall_at_3": 0.19709,
36
+ "recall_at_5": 0.24333
37
+ }
38
+ }
mteb_results/HotpotQA.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "HotpotQA",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 5192.2,
7
+ "map_at_1": 0.25381,
8
+ "map_at_10": 0.3314,
9
+ "map_at_100": 0.33948,
10
+ "map_at_1000": 0.34028,
11
+ "map_at_3": 0.3102,
12
+ "map_at_5": 0.3223,
13
+ "mrr_at_1": 0.50763,
14
+ "mrr_at_10": 0.57899,
15
+ "mrr_at_100": 0.58426,
16
+ "mrr_at_1000": 0.58457,
17
+ "mrr_at_3": 0.56093,
18
+ "mrr_at_5": 0.57116,
19
+ "ndcg_at_1": 0.50763,
20
+ "ndcg_at_10": 0.41656,
21
+ "ndcg_at_100": 0.45079,
22
+ "ndcg_at_1000": 0.46917,
23
+ "ndcg_at_3": 0.37834,
24
+ "ndcg_at_5": 0.39732,
25
+ "precision_at_1": 0.50763,
26
+ "precision_at_10": 0.08648,
27
+ "precision_at_100": 0.01135,
28
+ "precision_at_1000": 0.00138,
29
+ "precision_at_3": 0.23106,
30
+ "precision_at_5": 0.15363,
31
+ "recall_at_1": 0.25381,
32
+ "recall_at_10": 0.43241,
33
+ "recall_at_100": 0.56745,
34
+ "recall_at_1000": 0.69048,
35
+ "recall_at_3": 0.34659,
36
+ "recall_at_5": 0.38406
37
+ }
38
+ }
mteb_results/ImdbClassification.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "3d86128a09e091d6018b6d26cad27f2739fc2db7",
3
+ "mteb_dataset_name": "ImdbClassification",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "accuracy": 0.79544,
7
+ "accuracy_stderr": 0.022193916283522398,
8
+ "ap": 0.7382920133396664,
9
+ "ap_stderr": 0.029776228173533717,
10
+ "evaluation_time": 205.12,
11
+ "f1": 0.7951048124883265,
12
+ "f1_stderr": 0.02219958939576688,
13
+ "main_score": 0.79544
14
+ }
15
+ }
mteb_results/MSMARCO.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "dev": {
4
+ "evaluation_time": 19626.59,
5
+ "map_at_1": 0.11174,
6
+ "map_at_10": 0.19452,
7
+ "map_at_100": 0.20612,
8
+ "map_at_1000": 0.20703,
9
+ "map_at_3": 0.16444,
10
+ "map_at_5": 0.18083,
11
+ "mrr_at_1": 0.11447,
12
+ "mrr_at_10": 0.19808,
13
+ "mrr_at_100": 0.20958,
14
+ "mrr_at_1000": 0.21042,
15
+ "mrr_at_3": 0.16791,
16
+ "mrr_at_5": 0.18459,
17
+ "ndcg_at_1": 0.11447,
18
+ "ndcg_at_10": 0.24556,
19
+ "ndcg_at_100": 0.30638,
20
+ "ndcg_at_1000": 0.3314,
21
+ "ndcg_at_3": 0.18325,
22
+ "ndcg_at_5": 0.21278,
23
+ "precision_at_1": 0.11447,
24
+ "precision_at_10": 0.04215,
25
+ "precision_at_100": 0.00732,
26
+ "precision_at_1000": 0.00095,
27
+ "precision_at_3": 0.08052,
28
+ "precision_at_5": 0.06318,
29
+ "recall_at_1": 0.11174,
30
+ "recall_at_10": 0.40543,
31
+ "recall_at_100": 0.69699,
32
+ "recall_at_1000": 0.89403,
33
+ "recall_at_3": 0.23442,
34
+ "recall_at_5": 0.30536
35
+ },
36
+ "mteb_dataset_name": "MSMARCO",
37
+ "mteb_version": "1.1.0"
38
+ }
mteb_results/MTOPDomainClassification.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "d80d48c1eb48d3562165c59d59d0034df9fff0bf",
3
+ "mteb_dataset_name": "MTOPDomainClassification",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "en": {
7
+ "accuracy": 0.8966712266301871,
8
+ "accuracy_stderr": 0.009523011920085962,
9
+ "f1": 0.8957660424361247,
10
+ "f1_stderr": 0.009247170021662966,
11
+ "main_score": 0.8966712266301871
12
+ },
13
+ "evaluation_time": 7.75
14
+ },
15
+ "validation": {
16
+ "en": {
17
+ "accuracy": 0.9017002237136464,
18
+ "accuracy_stderr": 0.009890167527403295,
19
+ "f1": 0.9039792204701363,
20
+ "f1_stderr": 0.009182351003334687,
21
+ "main_score": 0.9017002237136464
22
+ },
23
+ "evaluation_time": 4.88
24
+ }
25
+ }
mteb_results/MTOPIntentClassification.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba",
3
+ "mteb_dataset_name": "MTOPIntentClassification",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "en": {
7
+ "accuracy": 0.6028499772001825,
8
+ "accuracy_stderr": 0.018495543127366038,
9
+ "f1": 0.40306374001528233,
10
+ "f1_stderr": 0.011859407815520086,
11
+ "main_score": 0.6028499772001825
12
+ },
13
+ "evaluation_time": 30.96
14
+ },
15
+ "validation": {
16
+ "en": {
17
+ "accuracy": 0.6150335570469799,
18
+ "accuracy_stderr": 0.01903139236025276,
19
+ "f1": 0.4147129810603558,
20
+ "f1_stderr": 0.015560901035463594,
21
+ "main_score": 0.6150335570469799
22
+ },
23
+ "evaluation_time": 28.07
24
+ }
25
+ }
mteb_results/MassiveIntentClassification.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "31efe3c427b0bae9c22cbb560b8f15491cc6bed7",
3
+ "mteb_dataset_name": "MassiveIntentClassification",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "en": {
7
+ "accuracy": 0.6333557498318763,
8
+ "accuracy_stderr": 0.014612806300514952,
9
+ "f1": 0.6024039910680179,
10
+ "f1_stderr": 0.012256367770368185,
11
+ "main_score": 0.6333557498318763
12
+ },
13
+ "evaluation_time": 22.52
14
+ },
15
+ "validation": {
16
+ "en": {
17
+ "accuracy": 0.6426955238563699,
18
+ "accuracy_stderr": 0.01633350887848132,
19
+ "f1": 0.5828069832892886,
20
+ "f1_stderr": 0.013604921852646317,
21
+ "main_score": 0.6426955238563699
22
+ },
23
+ "evaluation_time": 17.6
24
+ }
25
+ }
mteb_results/MassiveScenarioClassification.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "7d571f92784cd94a019292a1f45445077d0ef634",
3
+ "mteb_dataset_name": "MassiveScenarioClassification",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "en": {
7
+ "accuracy": 0.7237390719569603,
8
+ "accuracy_stderr": 0.006043481355389665,
9
+ "f1": 0.7233097333477316,
10
+ "f1_stderr": 0.0075559844507943974,
11
+ "main_score": 0.7237390719569603
12
+ },
13
+ "evaluation_time": 6.86
14
+ },
15
+ "validation": {
16
+ "en": {
17
+ "accuracy": 0.7321200196753566,
18
+ "accuracy_stderr": 0.010745609148770754,
19
+ "f1": 0.7288011677053199,
20
+ "f1_stderr": 0.010826173990376636,
21
+ "main_score": 0.7321200196753566
22
+ },
23
+ "evaluation_time": 5.61
24
+ }
25
+ }
mteb_results/MedrxivClusteringP2P.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "e7a26af6f3ae46b30dde8737f02c07b1505bcc73",
3
+ "mteb_dataset_name": "MedrxivClusteringP2P",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 226.37,
7
+ "v_measure": 0.34681589390605516,
8
+ "v_measure_std": 0.01515645822647098
9
+ }
10
+ }
mteb_results/MedrxivClusteringS2S.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "35191c8c0dca72d8ff3efcd72aa802307d469663",
3
+ "mteb_dataset_name": "MedrxivClusteringS2S",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 31.13,
7
+ "v_measure": 0.30340061711905236,
8
+ "v_measure_std": 0.012579424998938571
9
+ }
10
+ }
mteb_results/MindSmallReranking.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "3bdac13927fdc888b903db93b2ffdbd90b295a69",
3
+ "mteb_dataset_name": "MindSmallReranking",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 1849.79,
7
+ "map": 0.32018143262958026,
8
+ "mrr": 0.33205552400553673
9
+ }
10
+ }
mteb_results/NFCorpus.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "NFCorpus",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 17.47,
7
+ "map_at_1": 0.03391,
8
+ "map_at_10": 0.07722,
9
+ "map_at_100": 0.10286,
10
+ "map_at_1000": 0.11668,
11
+ "map_at_3": 0.05552,
12
+ "map_at_5": 0.06468,
13
+ "mrr_at_1": 0.34365,
14
+ "mrr_at_10": 0.42555,
15
+ "mrr_at_100": 0.43295,
16
+ "mrr_at_1000": 0.43357,
17
+ "mrr_at_3": 0.40299,
18
+ "mrr_at_5": 0.41182,
19
+ "ndcg_at_1": 0.31424,
20
+ "ndcg_at_10": 0.24758,
21
+ "ndcg_at_100": 0.23678,
22
+ "ndcg_at_1000": 0.33377,
23
+ "ndcg_at_3": 0.28302,
24
+ "ndcg_at_5": 0.26342,
25
+ "precision_at_1": 0.33437,
26
+ "precision_at_10": 0.19257,
27
+ "precision_at_100": 0.06663,
28
+ "precision_at_1000": 0.0199,
29
+ "precision_at_3": 0.27761,
30
+ "precision_at_5": 0.23715,
31
+ "recall_at_1": 0.03391,
32
+ "recall_at_10": 0.11068,
33
+ "recall_at_100": 0.25878,
34
+ "recall_at_1000": 0.6019,
35
+ "recall_at_3": 0.06169,
36
+ "recall_at_5": 0.07767
37
+ }
38
+ }
mteb_results/NQ.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "NQ",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 7686.43,
7
+ "map_at_1": 0.15168,
8
+ "map_at_10": 0.26177,
9
+ "map_at_100": 0.27564,
10
+ "map_at_1000": 0.27629,
11
+ "map_at_3": 0.2203,
12
+ "map_at_5": 0.24276,
13
+ "mrr_at_1": 0.17439,
14
+ "mrr_at_10": 0.28205,
15
+ "mrr_at_100": 0.29357,
16
+ "mrr_at_1000": 0.29408,
17
+ "mrr_at_3": 0.24377,
18
+ "mrr_at_5": 0.2654,
19
+ "ndcg_at_1": 0.1741,
20
+ "ndcg_at_10": 0.32936,
21
+ "ndcg_at_100": 0.39197,
22
+ "ndcg_at_1000": 0.40892,
23
+ "ndcg_at_3": 0.24721,
24
+ "ndcg_at_5": 0.28615,
25
+ "precision_at_1": 0.1741,
26
+ "precision_at_10": 0.06199,
27
+ "precision_at_100": 0.00969,
28
+ "precision_at_1000": 0.00113,
29
+ "precision_at_3": 0.1179,
30
+ "precision_at_5": 0.09264,
31
+ "recall_at_1": 0.15168,
32
+ "recall_at_10": 0.51914,
33
+ "recall_at_100": 0.79804,
34
+ "recall_at_1000": 0.9276,
35
+ "recall_at_3": 0.30212,
36
+ "recall_at_5": 0.39204
37
+ }
38
+ }
mteb_results/QuoraRetrieval.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "QuoraRetrieval",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 258.94,
7
+ "map_at_1": 0.67306,
8
+ "map_at_10": 0.80634,
9
+ "map_at_100": 0.81349,
10
+ "map_at_1000": 0.81373,
11
+ "map_at_3": 0.77691,
12
+ "map_at_5": 0.79512,
13
+ "mrr_at_1": 0.7756,
14
+ "mrr_at_10": 0.84177,
15
+ "mrr_at_100": 0.8435,
16
+ "mrr_at_1000": 0.84353,
17
+ "mrr_at_3": 0.83003,
18
+ "mrr_at_5": 0.83799,
19
+ "ndcg_at_1": 0.7758,
20
+ "ndcg_at_10": 0.84782,
21
+ "ndcg_at_100": 0.86443,
22
+ "ndcg_at_1000": 0.86654,
23
+ "ndcg_at_3": 0.8167,
24
+ "ndcg_at_5": 0.83356,
25
+ "precision_at_1": 0.7758,
26
+ "precision_at_10": 0.12875,
27
+ "precision_at_100": 0.01503,
28
+ "precision_at_1000": 0.00156,
29
+ "precision_at_3": 0.3563,
30
+ "precision_at_5": 0.23484,
31
+ "recall_at_1": 0.67306,
32
+ "recall_at_10": 0.9264,
33
+ "recall_at_100": 0.98681,
34
+ "recall_at_1000": 0.9979,
35
+ "recall_at_3": 0.83682,
36
+ "recall_at_5": 0.88424
37
+ }
38
+ }
mteb_results/RedditClustering.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "24640382cdbf8abc73003fb0fa6d111a705499eb",
3
+ "mteb_dataset_name": "RedditClustering",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 237.53,
7
+ "v_measure": 0.5076319866126382,
8
+ "v_measure_std": 0.04676162821389071
9
+ }
10
+ }
mteb_results/RedditClusteringP2P.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "282350215ef01743dc01b456c7f5241fa8937f16",
3
+ "mteb_dataset_name": "RedditClusteringP2P",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 1202.29,
7
+ "v_measure": 0.55024711941649,
8
+ "v_measure_std": 0.12775990781233748
9
+ }
10
+ }
mteb_results/SCIDOCS.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": null,
3
+ "mteb_dataset_name": "SCIDOCS",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "evaluation_time": 82.79,
7
+ "map_at_1": 0.03938,
8
+ "map_at_10": 0.08817,
9
+ "map_at_100": 0.10547,
10
+ "map_at_1000": 0.10852,
11
+ "map_at_3": 0.06352,
12
+ "map_at_5": 0.07453,
13
+ "mrr_at_1": 0.194,
14
+ "mrr_at_10": 0.27371,
15
+ "mrr_at_100": 0.28672,
16
+ "mrr_at_1000": 0.28747,
17
+ "mrr_at_3": 0.24583,
18
+ "mrr_at_5": 0.26143,
19
+ "ndcg_at_1": 0.194,
20
+ "ndcg_at_10": 0.15264,
21
+ "ndcg_at_100": 0.2263,
22
+ "ndcg_at_1000": 0.28559,
23
+ "ndcg_at_3": 0.14425,
24
+ "ndcg_at_5": 0.1252,
25
+ "precision_at_1": 0.194,
26
+ "precision_at_10": 0.0781,
27
+ "precision_at_100": 0.01854,
28
+ "precision_at_1000": 0.00329,
29
+ "precision_at_3": 0.131,
30
+ "precision_at_5": 0.1068,
31
+ "recall_at_1": 0.03938,
32
+ "recall_at_10": 0.15903,
33
+ "recall_at_100": 0.37645,
34
+ "recall_at_1000": 0.6686,
35
+ "recall_at_3": 0.07993,
36
+ "recall_at_5": 0.10885
37
+ }
38
+ }
mteb_results/SICK-R.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "a6ea5a8cab320b040a23452cc28066d9beae2cee",
3
+ "mteb_dataset_name": "SICK-R",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "cos_sim": {
7
+ "pearson": 0.8012689060151424,
8
+ "spearman": 0.7046515535094772
9
+ },
10
+ "euclidean": {
11
+ "pearson": 0.7717160003557223,
12
+ "spearman": 0.704651757047438
13
+ },
14
+ "evaluation_time": 7.91,
15
+ "manhattan": {
16
+ "pearson": 0.7718129609281936,
17
+ "spearman": 0.7046610403752913
18
+ }
19
+ }
20
+ }
mteb_results/STS12.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "a0d554a64d88156834ff5ae9920b964011b16384",
3
+ "mteb_dataset_name": "STS12",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "cos_sim": {
7
+ "pearson": 0.70451157033355,
8
+ "spearman": 0.6399899601697853
9
+ },
10
+ "euclidean": {
11
+ "pearson": 0.6746985359967678,
12
+ "spearman": 0.6400001637764805
13
+ },
14
+ "evaluation_time": 2.34,
15
+ "manhattan": {
16
+ "pearson": 0.6756534741780037,
17
+ "spearman": 0.6406533893575366
18
+ }
19
+ }
20
+ }
mteb_results/STS13.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_revision": "7e90230a92c190f1bf69ae9002b8cea547a64cca",
3
+ "mteb_dataset_name": "STS13",
4
+ "mteb_version": "1.1.0",
5
+ "test": {
6
+ "cos_sim": {
7
+ "pearson": 0.7765086614464292,
8
+ "spearman": 0.7820169706921849
9
+ },
10
+ "euclidean": {
11
+ "pearson": 0.7777758172155284,
12
+ "spearman": 0.7820169706921849
13
+ },
14
+ "evaluation_time": 1.03,
15
+ "manhattan": {
16
+ "pearson": 0.7775077884860052,
17
+ "spearman": 0.7816875216484164
18
+ }
19
+ }
20
+ }