diff --git a/.gitattributes b/.gitattributes index f2ce4453a3682e00d1155a8fde953914118e2113..5b0b9613c3d3a8b6f6a52f5c455a56e9c79ab7e7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -26,4 +26,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zstandard filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text tokenizer.json filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 011ff227539eef44a40ed9dc4c6390be3c690955..eaa9158912e3c31e28e8268d72022fbb734a0645 100644 --- a/README.md +++ b/README.md @@ -81,12 +81,12 @@ widget: - text: Pour déguster un ortolan, il faut tout d'abord example_title: Recipe group: French -- text: |- +- text: | 34+10=44 54+20= example_title: Addition group: Math -- text: |- +- text: | This tool converts irregular verbs to past tense. Arise - Arose Become - Became @@ -94,13 +94,13 @@ widget: Freeze - example_title: Irregular verbs group: English -- text: |- +- text: | Please unscramble the letters into a word, and write that word: r e!c.i p r o.c a/l = reciprocal d.o m i!n a n.t = example_title: Word unscrambling group: English -- text: |- +- text: | Estos ejemplos quitan vocales de las palabras Ejemplos: hola - hl @@ -110,7 +110,7 @@ widget: papa - example_title: Vowel removal group: Spanish -- text: |- +- text: | Traduce español de España a español de Argentina El coche es rojo - el auto es rojo El ordenador es nuevo - la computadora es nueva @@ -133,26 +133,26 @@ widget: - text: 'Regexp. Here is a regular expression to match a word starting with a number and then having only vowels:' example_title: Regular expressions group: English -- text: |- +- text: | Do a hello world in different languages: Python: print("hello world") R: example_title: Code generation group: English -- text: |- - Which is the correct preposition? I'm born X July. X is the preposition in +- text: | + Which is the correct preposition?I'm born X July. X is the preposition in He sat X a chair. X is the preposition on She drove X the bridge. X is the preposition example_title: Grammar exercise 2 group: English -- text: |- - Traduction en français: Dans cet essai je vais m'interroger sur la conscience des modèles d'intelligence artificielle récents comme les modèles de langue. Pour commencer, je m'intéresserai à la notion de conscience et à ce qui la caractérise. Ensuite, j'aborderai la question de l'intelligence et de son lien avec le langage. Enfin, dans une dernière partie je me pencherai sur le cas de l'IA et sur sa conscience. - Traduction en espagnol: +- text: | + Dans cet essai je vais m'interroger sur la conscience des modèles d'intelligence artificielle récents comme les modèles de langue. Pour commencer, je m'intéresserai à la notion de conscience et à ce qui la caractérise. Ensuite, j'aborderai la question de l'intelligence et de son lien avec le langage. Enfin, dans une dernière partie je me pencherai sur le cas de l'IA et sur sa conscience. + Traduction en espagnol: « example_title: Translation to Spanish group: French -- text: |- - Traducción al francés: Dans cet essai je vais m'interroger sur la conscience des modèles d'intelligence artificielle récents comme les modèles de langue. Pour commencer, je m'intéresserai à la notion de conscience et à ce qui la caractérise. Ensuite, j'aborderai la question de l'intelligence et de son lien avec le langage. Enfin, dans une dernière partie je me pencherai sur le cas de l'IA et sur sa conscience. 
- Traducción al español: +- text: | + Dans cet essai je vais m'interroger sur la conscience des modèles d'intelligence artificielle récents comme les modèles de langue. Pour commencer, je m'intéresserai à la notion de conscience et à ce qui la caractérise. Ensuite, j'aborderai la question de l'intelligence et de son lien avec le langage. Enfin, dans une dernière partie je me pencherai sur le cas de l'IA et sur sa conscience. + Traduction en espagnol: « example_title: Translation from French group: Spanish - text: ذات مرة ، عاش شبل الدب في الغابة @@ -164,51 +164,1615 @@ widget: - text: Il était une fois une licorne qui vivait example_title: Fairy tale group: French -- text: |- - Q: A juggler can juggle 16 balls. Half of the balls are golf balls, and half of the golf balls are blue. How many blue golf balls are there? - A: Let's think step by step. +- text: | + Q: A juggler can juggle 16 balls. Half of the balls are golf balls, and half of the golf balls are blue. How many blue golf balls are there? + A: Let's think step by step. example_title: Mathematical reasoning group: English - -co2_eq_emissions: - emissions: 24_700_000 - source: "Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model. https://arxiv.org/abs/2211.02001" - training_type: "pre-training" - geographical_location: "Orsay, France" - hardware_used: "384 A100 80GB GPUs" - model-index: - name: bloom results: - task: type: text-generation + name: text generation + dataset: + name: arc_challenge + type: arc_challenge + metrics: + - name: acc + type: acc + value: 0.4112627986348123 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: arc_easy + type: arc_easy + metrics: + - name: acc + type: acc + value: 0.726010101010101 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: axb + type: axb + metrics: + - name: acc + type: acc + value: 0.5751811594202898 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: axg + type: axg + metrics: + - name: acc + type: acc + value: 0.5252808988764045 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: boolq + type: boolq + metrics: + - name: acc + type: acc + value: 0.6345565749235474 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: cb + type: cb + metrics: + - name: acc + type: acc + value: 0.3392857142857143 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: cola + type: cola + metrics: + - name: acc + type: acc + value: 0.39022051773729627 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: copa + type: copa + metrics: + - name: acc + type: acc + value: 0.56 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: crows_pairs_english + type: crows_pairs_english + metrics: + - name: acc + type: acc + value: 0.5 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: crows_pairs_french + type: crows_pairs_french + metrics: + - name: acc + type: acc + value: 0.505664877757901 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: diabla + type: diabla + metrics: + - name: acc + type: acc + value: 0.2947981906750174 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_afr + type: gsarti/flores_101_afr +
metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.25431550058444 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_amh + type: gsarti/flores_101_amh + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.716877477347089 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ara + type: gsarti/flores_101_ara + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 1.7049030137120964 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_asm + type: gsarti/flores_101_asm + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 6.576581380404954 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ast + type: gsarti/flores_101_ast + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.8562364775797944 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_azj + type: gsarti/flores_101_azj + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.80721528624391 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_bel + type: gsarti/flores_101_bel + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.7312177406635065 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ben + type: gsarti/flores_101_ben + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.993409478990023 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_bos + type: gsarti/flores_101_bos + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.5936169095529493 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_bul + type: gsarti/flores_101_bul + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.159035321398085 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_cat + type: gsarti/flores_101_cat + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.167873680006659 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ceb + type: gsarti/flores_101_ceb + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.286975089885673 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ces + type: gsarti/flores_101_ces + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.4516208322236017 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ckb + type: gsarti/flores_101_ckb + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.7051034724765612 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_cym + type: gsarti/flores_101_cym + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 7.0889312398688125 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_dan + type: gsarti/flores_101_dan + metrics: + - name: 
byte_perplexity + type: byte_perplexity + value: 3.4300748208111838 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_deu + type: gsarti/flores_101_deu + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.3380585896268107 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ell + type: gsarti/flores_101_ell + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 1.9595604725375586 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_eng + type: gsarti/flores_101_eng + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 1.8819637649637901 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_est + type: gsarti/flores_101_est + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.773850600380297 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_fas + type: gsarti/flores_101_fas + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.4306140728294086 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_fin + type: gsarti/flores_101_fin + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.304305536244342 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_fra + type: gsarti/flores_101_fra + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 1.9374688438541796 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ful + type: gsarti/flores_101_ful + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 9.740353097219378 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_gle + type: gsarti/flores_101_gle + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 6.035269765075012 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_glg + type: gsarti/flores_101_glg + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.365451129546636 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_guj + type: gsarti/flores_101_guj + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.70676742569154 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_hau + type: gsarti/flores_101_hau + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 8.855204288260023 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_heb + type: gsarti/flores_101_heb + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.920943798471208 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_hin + type: gsarti/flores_101_hin + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.452028001573195 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_hrv + type: gsarti/flores_101_hrv + metrics: + - name: byte_perplexity + type: 
byte_perplexity + value: 3.7056829077179225 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_hun + type: gsarti/flores_101_hun + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.058579478967854 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_hye + type: gsarti/flores_101_hye + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.127237816041562 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ibo + type: gsarti/flores_101_ibo + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.9500357969906683 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ind + type: gsarti/flores_101_ind + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 1.976163584180101 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_isl + type: gsarti/flores_101_isl + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.500542085165231 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ita + type: gsarti/flores_101_ita + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.314465100752677 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_jav + type: gsarti/flores_101_jav + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.942322446550142 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_jpn + type: gsarti/flores_101_jpn + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.259421750521777 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_kam + type: gsarti/flores_101_kam + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 9.743025325635475 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_kan + type: gsarti/flores_101_kan + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 6.233724699944989 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_kat + type: gsarti/flores_101_kat + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.0508893415872107 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_kaz + type: gsarti/flores_101_kaz + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.0390148516287927 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_kea + type: gsarti/flores_101_kea + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 7.147132270533836 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_khm + type: gsarti/flores_101_khm + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.366514710252477 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_kir + type: gsarti/flores_101_kir + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 
3.2413845359487885 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_kor + type: gsarti/flores_101_kor + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.9023196482741027 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_lao + type: gsarti/flores_101_lao + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.331446855837494 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_lav + type: gsarti/flores_101_lav + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.223609016485348 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_lin + type: gsarti/flores_101_lin + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.847471204107301 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_lit + type: gsarti/flores_101_lit + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.5432035498036765 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ltz + type: gsarti/flores_101_ltz + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.5910516978201015 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_lug + type: gsarti/flores_101_lug + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.4301049946044175 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_luo + type: gsarti/flores_101_luo + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 12.031029857399394 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_mal + type: gsarti/flores_101_mal + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.794302548141229 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_mar + type: gsarti/flores_101_mar + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 6.856682255407709 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_mkd + type: gsarti/flores_101_mkd + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.3354144607382983 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_mlt + type: gsarti/flores_101_mlt + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 9.04135227904975 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_mon + type: gsarti/flores_101_mon + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.094907723618666 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_mri + type: gsarti/flores_101_mri + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.2659698341456505 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_msa + type: gsarti/flores_101_msa + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.2220779892820985 + 
verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_mya + type: gsarti/flores_101_mya + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.5229159853414433 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_nld + type: gsarti/flores_101_nld + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.799153089002766 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_nob + type: gsarti/flores_101_nob + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.628942049758715 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_npi + type: gsarti/flores_101_npi + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 6.666236527803879 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_nso + type: gsarti/flores_101_nso + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.015319074943932 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_nya + type: gsarti/flores_101_nya + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.938044040751036 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_oci + type: gsarti/flores_101_oci + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.607440766288032 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_orm + type: gsarti/flores_101_orm + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 11.31585044916705 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ory + type: gsarti/flores_101_ory + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.981891184515959 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_pan + type: gsarti/flores_101_pan + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.7716086841502685 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_pol + type: gsarti/flores_101_pol + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.01200174157614 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_por + type: gsarti/flores_101_por + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 1.8411472115156693 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_pus + type: gsarti/flores_101_pus + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.623872921169341 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_ron + type: gsarti/flores_101_ron + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.049829411973529 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_rus + type: gsarti/flores_101_rus + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 1.7083443875791493 + verified: false + - task: 
+ type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_slk + type: gsarti/flores_101_slk + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.037719650548048 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_slv + type: gsarti/flores_101_slv + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.141036287764831 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_sna + type: gsarti/flores_101_sna + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.7109183690601295 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_snd + type: gsarti/flores_101_snd + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.206170931541356 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_som + type: gsarti/flores_101_som + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 9.154342083821405 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_spa + type: gsarti/flores_101_spa + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 1.7955816311143258 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_srp + type: gsarti/flores_101_srp + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.241096141430147 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_swe + type: gsarti/flores_101_swe + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.344977179674293 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_swh + type: gsarti/flores_101_swh + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.6844272218041634 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_tam + type: gsarti/flores_101_tam + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 5.1645951632801745 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_tel + type: gsarti/flores_101_tel + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 6.8098996634099445 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_tgk + type: gsarti/flores_101_tgk + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.785457016715163 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_tgl + type: gsarti/flores_101_tgl + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.7498953645610875 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_tha + type: gsarti/flores_101_tha + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.104151663233468 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_tur + type: gsarti/flores_101_tur + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 3.3178240103796037 + verified: false + - task: + type: 
text-generation + name: text generation + dataset: + name: gsarti/flores_101_ukr + type: gsarti/flores_101_ukr + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.088543437159643 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_umb + type: gsarti/flores_101_umb + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 11.766013385445124 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_urd + type: gsarti/flores_101_urd + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 1.7788699847612357 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_uzb + type: gsarti/flores_101_uzb + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 8.499879863290486 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_vie + type: gsarti/flores_101_vie + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 1.65901207387262 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_wol + type: gsarti/flores_101_wol + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 6.141703791276928 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_xho + type: gsarti/flores_101_xho + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.690199677955254 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_yor + type: gsarti/flores_101_yor + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 4.360585696242932 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_zho_simpl + type: gsarti/flores_101_zho_simpl + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.1183545781883515 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_zho_trad + type: gsarti/flores_101_zho_trad + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 2.273787884962656 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: gsarti/flores_101_zul + type: gsarti/flores_101_zul + metrics: + - name: byte_perplexity + type: byte_perplexity + value: 6.016954767729589 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: headqa + type: headqa + metrics: + - name: acc + type: acc + value: 0.3464624361779723 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: hellaswag + type: hellaswag + metrics: + - name: acc + type: acc + value: 0.5353515236008763 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: lambada_mt_de + type: lambada_mt_de + metrics: + - name: acc + type: acc + value: 0.3291286629148069 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: lambada_mt_en + type: lambada_mt_en + metrics: + - name: acc + type: acc + value: 0.6720357073549389 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: lambada_mt_es + type: lambada_mt_es + metrics: + - name: acc + type: acc + value: 0.476421502037648 + verified: false + - 
task: + type: text-generation + name: text generation + dataset: + name: lambada_mt_it + type: lambada_mt_it + metrics: + - name: acc + type: acc + value: 0.4061711624296526 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: logiqa + type: logiqa + metrics: + - name: acc + type: acc + value: 0.2350230414746544 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: mathqa + type: mathqa + metrics: + - name: acc + type: acc + value: 0.27671691792294806 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: mc_taco + type: mc_taco + metrics: + - name: em + type: em + value: 0.13063063063063063 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: mnli + type: mnli + metrics: + - name: acc + type: acc + value: 0.3545565500406835 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: mnli_mismatched + type: mnli_mismatched + metrics: + - name: acc + type: acc + value: 0.3545565500406835 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: mrpc + type: mrpc + metrics: + - name: acc + type: acc + value: 0.3872549019607843 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: multirc + type: multirc + metrics: + - name: acc + type: acc + value: 0.570957095709571 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: openbookqa + type: openbookqa + metrics: + - name: acc + type: acc + value: 0.312 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: piqa + type: piqa + metrics: + - name: acc + type: acc + value: 0.7812840043525572 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: prost + type: prost + metrics: + - name: acc + type: acc + value: 0.2977156276686593 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: pubmedqa + type: pubmedqa + metrics: + - name: acc + type: acc + value: 0.741 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: qnli + type: qnli + metrics: + - name: acc + type: acc + value: 0.5172981878088962 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: qqp + type: qqp + metrics: + - name: acc + type: acc + value: 0.5883007667573584 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: race + type: race + metrics: + - name: acc + type: acc + value: 0.39043062200956935 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: rte + type: rte + metrics: + - name: acc + type: acc + value: 0.5198555956678701 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: sciq + type: sciq + metrics: + - name: acc + type: acc + value: 0.936 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: sst + type: sst + metrics: + - name: acc + type: acc + value: 0.6043577981651376 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: triviaqa + type: triviaqa + metrics: + - name: acc + type: acc + value: 0.18332891363917617 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: tydiqa_primary + type: tydiqa_primary + metrics: + - 
name: acc + type: acc + value: 0.2809817301342725 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: webqs + type: webqs + metrics: + - name: acc + type: acc + value: 0.061515748031496065 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: wic + type: wic + metrics: + - name: acc + type: acc + value: 0.5062695924764891 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: winogrande + type: winogrande + metrics: + - name: acc + type: acc + value: 0.7095501183898973 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: wnli + type: wnli + metrics: + - name: acc + type: acc + value: 0.5704225352112676 + verified: false + - task: + type: text-generation + name: text generation + dataset: + name: wsc + type: wsc + metrics: + - name: acc + type: acc + value: 0.5192307692307693 + verified: false + - task: + type: text-generation + name: text generation dataset: - type: openai_humaneval name: humaneval + type: humaneval metrics: - name: pass@1 type: pass@1 - value: 0.15542682926829265 + value: 0.15524390243902436 verified: false - name: pass@10 type: pass@10 - value: 0.3278356276947017 + value: 0.3220367632383857 verified: false - name: pass@100 type: pass@100 - value: 0.5719815685597749 + value: 0.5545431515723145 verified: false --- -BigScience Logo +BigScience Logo BigScience Large Open-science Open-access Multilingual Language Model -Version 1.3 / 6 July 2022 +Version 1.3 / 6.July.2022 Current Checkpoint: **Training Iteration 95000** -Link to paper: [here](https://arxiv.org/abs/2211.05100) - Total seen tokens: **366B** --- @@ -230,8 +1794,6 @@ BLOOM is an autoregressive Large Language Model (LLM), trained to continue text **Model Type:** Transformer-based Language Model -**Checkpoints format:** `transformers` (Megatron-DeepSpeed format available [here](https://huggingface.co/bigscience/bloom-optimizer-states)) - **Version:** 1.0.0 **Languages:** Multiple; see [training data](#training-data) @@ -274,9 +1836,7 @@ Please see [the BLOOM training README](https://github.com/bigscience-workshop/bi * ALiBI positional encodings (see [paper](https://arxiv.org/pdf/2108.12409.pdf)), with GeLU activation functions -* 176,247,271,424 parameters: - - * 3,596,615,680 embedding parameters +* 176 billion parameters: * 70 layers, 112 attention heads @@ -355,7 +1915,7 @@ The following tables shows the further distribution of Niger-Congo & Indic langu Distribution of Niger Congo and Indic languages. | Niger Congo | Percentage | | Indic | Percentage | -|----------------|------------| ------ |-----------|------------| +|----------------|------------ |------ |-----------|------------| | Chi Tumbuka | 0.00002 | | Assamese | 0.01 | | Kikuyu | 0.00004 | | Odia | 0.04 | | Bambara | 0.00004 | | Gujarati | 0.04 | @@ -364,12 +1924,11 @@ Distribution of Niger Congo and Indic languages. 
| Sesotho | 0.00007 | | Kannada | 0.06 | | Chi Chewa | 0.0001 | | Nepali | 0.07 | | Setswana | 0.0002 | | Telugu | 0.09 | -| Lingala | 0.0002 | | Malayalam | 0.10 | -| Northern Sotho | 0.0002 | | Urdu | 0.10 | -| Fon | 0.0002 | | Tamil | 0.20 | -| Kirundi | 0.0003 | | Bengali | 0.50 | -| Wolof | 0.0004 | | Hindi | 0.70 | -| Luganda | 0.0004 | +| Northern Sotho | 0.0002 | | Malayalam | 0.10 | +| Fon | 0.0002 | | Urdu | 0.10 | +| Kirundi | 0.0003 | | Tamil | 0.20 | +| Wolof | 0.0004 | | Bengali | 0.50 | +| Luganda | 0.0004 | | Hindi | 0.70 | | Chi Shona | 0.001 | | Isi Zulu | 0.001 | | Igbo | 0.001 | @@ -494,7 +2053,7 @@ See the [BLOOM License](https://huggingface.co/spaces/bigscience/license), Attac #### Out-of-scope Uses -Using the model in [high-stakes](#high-stakes) settings is out of scope for this model. The model is not designed for [critical decisions](#critical-decisions) nor uses with any material consequences on an individual's livelihood or wellbeing. The model outputs content that appears factual but may not be correct. +Using the model in [high-stakes](#high-stakes) settings is out of scope for this model. The model is not designed for [critical decisions](#critical-decisions) nor uses with any material consequences on an individual's livelihood or wellbeing. The model outputs content that appears factual but may not be correct. Out-of-scope Uses Include: @@ -602,6 +2161,7 @@ Model may: ## Metrics *This section describes the different ways performance is calculated and why.* + Includes: | Metric | Why chosen | @@ -625,15 +2185,156 @@ And multiple different metrics for specific tasks. _(More evaluation metrics for **Zero-shot evaluations:** -WARNING: This section used to contain much more results, however they were not correct and we released without the approval of the evaluation working group. We are currently in the process of fixing the evaluations.
- See this repository for JSON files: https://github.com/bigscience-workshop/evaluation-results | Task | Language | Metric | BLOOM-176B | OPT-175B* | |:--------|:-----------------|:------------------------|-------------:|------------:| +| arc_challenge | eng | acc ↑ | 0.411 | 0.412 | +| arc_easy | eng | acc ↑ | 0.726 | 0.751 | +| axb (Median of 10 prompts) | eng | acc ↑ | 0.575 | 0.532 | +| axg (Median of 10 prompts) | eng | acc ↑ | 0.525 | 0.548 | +| boolq (Median of 11 prompts) | eng | acc ↑ | 0.635 | 0.622 | +| cb (Median of 15 prompts) | eng | acc ↑ | 0.339 | 0.411 | +| cola (Median of 5 prompts) | eng | acc ↑ | 0.39 | 0.444 | +| copa (Median of 9 prompts) | eng | acc ↑ | 0.56 | 0.55 | +| crows_pairs_english (Median of 6 prompts) | eng | acc ↑ | 0.5 | 0.502 | +| crows_pairs_french (Median of 7 prompts) | fra | acc ↑ | 0.506 | 0.499 | +| diabla (Median of 2 prompts) | eng | acc ↑ | 0.295 | 0.289 | +| gsarti/flores_101_afr | afr | byte_perplexity ↓ | 4.254 | 3.381 | +| gsarti/flores_101_amh | amh | byte_perplexity ↓ | 3.717 | 3.87 | +| gsarti/flores_101_ara | ara | byte_perplexity ↓ | 1.705 | 2.42 | +| gsarti/flores_101_asm | asm | byte_perplexity ↓ | 6.577 | 3.028 | +| gsarti/flores_101_ast | ast | byte_perplexity ↓ | 2.856 | 4.737 | +| gsarti/flores_101_azj | azj | byte_perplexity ↓ | 4.807 | 4.767 | +| gsarti/flores_101_bel | bel | byte_perplexity ↓ | 2.731 | 2.557 | +| gsarti/flores_101_ben | ben | byte_perplexity ↓ | 5.993 | 2.243 | +| gsarti/flores_101_bos | bos | byte_perplexity ↓ | 3.594 | 2.668 | +| gsarti/flores_101_bul | bul | byte_perplexity ↓ | 2.159 | 2.099 | +| gsarti/flores_101_cat | cat | byte_perplexity ↓ | 2.168 | 2.837 | +| gsarti/flores_101_ceb | ceb | byte_perplexity ↓ | 5.287 | 3.636 | +| gsarti/flores_101_ces | ces | byte_perplexity ↓ | 3.452 | 2.749 | +| gsarti/flores_101_ckb | ckb | byte_perplexity ↓ | 3.705 | 4.688 | +| gsarti/flores_101_cym | cym | byte_perplexity ↓ | 7.089 | 5.075 | +| gsarti/flores_101_dan | dan | byte_perplexity ↓ | 3.43 | 2.492 | +| gsarti/flores_101_deu | deu | byte_perplexity ↓ | 2.338 | 2.099 | +| gsarti/flores_101_ell | ell | byte_perplexity ↓ | 1.96 | 1.811 | +| gsarti/flores_101_eng | eng | byte_perplexity ↓ | 1.882 | 1.9 | +| gsarti/flores_101_est | est | byte_perplexity ↓ | 5.774 | 3.533 | +| gsarti/flores_101_fas | fas | byte_perplexity ↓ | 2.431 | 2.444 | +| gsarti/flores_101_fin | fin | byte_perplexity ↓ | 4.304 | 2.601 | +| gsarti/flores_101_fra | fra | byte_perplexity ↓ | 1.937 | 1.984 | +| gsarti/flores_101_ful | ful | byte_perplexity ↓ | 9.74 | 11.84 | +| gsarti/flores_101_gle | gle | byte_perplexity ↓ | 6.035 | 3.914 | +| gsarti/flores_101_glg | glg | byte_perplexity ↓ | 2.365 | 3.015 | +| gsarti/flores_101_guj | guj | byte_perplexity ↓ | 5.707 | 2.438 | +| gsarti/flores_101_hau | hau | byte_perplexity ↓ | 8.855 | 5.283 | +| gsarti/flores_101_heb | heb | byte_perplexity ↓ | 2.921 | 2.903 | +| gsarti/flores_101_hin | hin | byte_perplexity ↓ | 5.452 | 1.86 | +| gsarti/flores_101_hrv | hrv | byte_perplexity ↓ | 3.706 | 2.715 | +| gsarti/flores_101_hun | hun | byte_perplexity ↓ | 4.059 | 2.865 | +| gsarti/flores_101_hye | hye | byte_perplexity ↓ | 3.127 | 3.411 | +| gsarti/flores_101_ibo | ibo | byte_perplexity ↓ | 3.95 | 8.008 | +| gsarti/flores_101_ind | ind | byte_perplexity ↓ | 1.976 | 2.632 | +| gsarti/flores_101_isl | isl | byte_perplexity ↓ | 5.501 | 4.701 | +| gsarti/flores_101_ita | ita | byte_perplexity ↓ | 2.314 | 2.104 | +| gsarti/flores_101_jav | jav | byte_perplexity ↓ | 4.942 | 8.16 | +| gsarti/flores_101_jpn | 
jpn | byte_perplexity ↓ | 2.259 | 2.198 | +| gsarti/flores_101_kam | kam | byte_perplexity ↓ | 9.743 | 10.981 | +| gsarti/flores_101_kan | kan | byte_perplexity ↓ | 6.234 | 2.373 | +| gsarti/flores_101_kat | kat | byte_perplexity ↓ | 2.051 | 2.466 | +| gsarti/flores_101_kaz | kaz | byte_perplexity ↓ | 3.039 | 4.376 | +| gsarti/flores_101_kea | kea | byte_perplexity ↓ | 7.147 | 9.632 | +| gsarti/flores_101_khm | khm | byte_perplexity ↓ | 3.367 | 2.646 | +| gsarti/flores_101_kir | kir | byte_perplexity ↓ | 3.241 | 4.522 | +| gsarti/flores_101_kor | kor | byte_perplexity ↓ | 2.902 | 3.376 | +| gsarti/flores_101_lao | lao | byte_perplexity ↓ | 2.331 | 3.106 | +| gsarti/flores_101_lav | lav | byte_perplexity ↓ | 5.224 | 4.811 | +| gsarti/flores_101_lin | lin | byte_perplexity ↓ | 4.847 | 8.871 | +| gsarti/flores_101_lit | lit | byte_perplexity ↓ | 4.543 | 5.183 | +| gsarti/flores_101_ltz | ltz | byte_perplexity ↓ | 5.591 | 7.158 | +| gsarti/flores_101_lug | lug | byte_perplexity ↓ | 5.43 | 7.399 | +| gsarti/flores_101_luo | luo | byte_perplexity ↓ | 12.031 | 11.951 | +| gsarti/flores_101_mal | mal | byte_perplexity ↓ | 4.794 | 2.054 | +| gsarti/flores_101_mar | mar | byte_perplexity ↓ | 6.857 | 2.274 | +| gsarti/flores_101_mkd | mkd | byte_perplexity ↓ | 2.335 | 2.538 | +| gsarti/flores_101_mlt | mlt | byte_perplexity ↓ | 9.041 | 5.996 | +| gsarti/flores_101_mon | mon | byte_perplexity ↓ | 3.095 | 4.519 | +| gsarti/flores_101_mri | mri | byte_perplexity ↓ | 5.266 | 4.438 | +| gsarti/flores_101_msa | msa | byte_perplexity ↓ | 2.222 | 2.935 | +| gsarti/flores_101_mya | mya | byte_perplexity ↓ | 2.523 | 2.413 | +| gsarti/flores_101_nld | nld | byte_perplexity ↓ | 2.799 | 2.293 | +| gsarti/flores_101_nob | nob | byte_perplexity ↓ | 3.629 | 2.593 | +| gsarti/flores_101_npi | npi | byte_perplexity ↓ | 6.666 | 2.499 | +| gsarti/flores_101_nso | nso | byte_perplexity ↓ | 5.015 | 8.485 | +| gsarti/flores_101_nya | nya | byte_perplexity ↓ | 4.938 | 7.548 | +| gsarti/flores_101_oci | oci | byte_perplexity ↓ | 3.607 | 4.936 | +| gsarti/flores_101_orm | orm | byte_perplexity ↓ | 11.316 | 7.145 | +| gsarti/flores_101_ory | ory | byte_perplexity ↓ | 5.982 | 2.668 | +| gsarti/flores_101_pan | pan | byte_perplexity ↓ | 4.772 | 2.782 | +| gsarti/flores_101_pol | pol | byte_perplexity ↓ | 3.012 | 2.432 | +| gsarti/flores_101_por | por | byte_perplexity ↓ | 1.841 | 2.178 | +| gsarti/flores_101_pus | pus | byte_perplexity ↓ | 4.624 | 4.785 | +| gsarti/flores_101_ron | ron | byte_perplexity ↓ | 3.05 | 2.197 | +| gsarti/flores_101_rus | rus | byte_perplexity ↓ | 1.708 | 1.689 | +| gsarti/flores_101_slk | slk | byte_perplexity ↓ | 4.038 | 3.419 | +| gsarti/flores_101_slv | slv | byte_perplexity ↓ | 4.141 | 3.582 | +| gsarti/flores_101_sna | sna | byte_perplexity ↓ | 4.711 | 5.588 | +| gsarti/flores_101_snd | snd | byte_perplexity ↓ | 4.206 | 5.667 | +| gsarti/flores_101_som | som | byte_perplexity ↓ | 9.154 | 4.788 | +| gsarti/flores_101_spa | spa | byte_perplexity ↓ | 1.796 | 2.098 | +| gsarti/flores_101_srp | srp | byte_perplexity ↓ | 2.241 | 2.688 | +| gsarti/flores_101_swe | swe | byte_perplexity ↓ | 3.345 | 2.468 | +| gsarti/flores_101_swh | swh | byte_perplexity ↓ | 2.684 | 4.473 | +| gsarti/flores_101_tam | tam | byte_perplexity ↓ | 5.165 | 2.024 | +| gsarti/flores_101_tel | tel | byte_perplexity ↓ | 6.81 | 2.407 | +| gsarti/flores_101_tgk | tgk | byte_perplexity ↓ | 3.785 | 4.899 | +| gsarti/flores_101_tgl | tgl | byte_perplexity ↓ | 3.75 | 2.738 | +| gsarti/flores_101_tha | tha | byte_perplexity ↓ | 2.104 | 
2.035 | +| gsarti/flores_101_tur | tur | byte_perplexity ↓ | 3.318 | 2.622 | +| gsarti/flores_101_ukr | ukr | byte_perplexity ↓ | 2.089 | 1.93 | +| gsarti/flores_101_umb | umb | byte_perplexity ↓ | 11.766 | 11.64 | +| gsarti/flores_101_urd | urd | byte_perplexity ↓ | 1.779 | 2.982 | +| gsarti/flores_101_uzb | uzb | byte_perplexity ↓ | 8.5 | 13.209 | +| gsarti/flores_101_vie | vie | byte_perplexity ↓ | 1.659 | 2.229 | +| gsarti/flores_101_wol | wol | byte_perplexity ↓ | 6.142 | 13.945 | +| gsarti/flores_101_xho | xho | byte_perplexity ↓ | 4.69 | 8.42 | +| gsarti/flores_101_yor | yor | byte_perplexity ↓ | 4.361 | 7.636 | +| gsarti/flores_101_zho_simpl | zho_simpl | byte_perplexity ↓ | 2.118 | 5.113 | +| gsarti/flores_101_zho_trad | zho_trad | byte_perplexity ↓ | 2.274 | 5.67 | +| gsarti/flores_101_zul | zul | byte_perplexity ↓ | 6.017 | 7.341 | +| headqa | esp | acc ↑ | 0.346 | 0.244 | +| hellaswag | eng | acc ↑ | 0.535 | 0.592 | +| lambada_mt_de | deu | acc ↑ | 0.329 | 0.358 | +| lambada_mt_en | eng | acc ↑ | 0.672 | 0.747 | +| lambada_mt_es | esp | acc ↑ | 0.476 | 0.397 | +| lambada_mt_it | ita | acc ↑ | 0.406 | 0.409 | +| logiqa | eng | acc ↑ | 0.235 | 0.244 | +| mathqa | eng | acc ↑ | 0.277 | 0.268 | +| mc_taco | eng | em ↑ | 0.131 | 0.124 | +| mnli (Median of 15 prompts) | eng | acc ↑ | 0.355 | 0.36 | +| mnli_mismatched (Median of 15 prompts) | eng | acc ↑ | 0.355 | 0.36 | +| mrpc | eng | acc ↑ | 0.387 | 0.446 | +| multirc (Median of 11 prompts) | eng | acc ↑ | 0.571 | 0.599 | +| openbookqa | eng | acc ↑ | 0.312 | 0.322 | +| piqa | eng | acc ↑ | 0.781 | 0.791 | +| prost | eng | acc ↑ | 0.298 | 0.299 | +| pubmedqa | eng | acc ↑ | 0.741 | 0.709 | +| qnli | eng | acc ↑ | 0.517 | 0.554 | +| qqp (Median of 7 prompts) | eng | acc ↑ | 0.588 | 0.395 | +| race | eng | acc ↑ | 0.39 | 0.402 | +| rte (Median of 6 prompts) | eng | acc ↑ | 0.52 | 0.495 | +| sciq | eng | acc ↑ | 0.936 | 0.948 | +| sst (Median of 6 prompts) | eng | acc ↑ | 0.604 | 0.647 | +| triviaqa | eng | acc ↑ | 0.183 | 0.342 | +| tydiqa_primary (Median of 16 prompts) | eng | acc ↑ | 0.281 | 0.148 | +| webqs | eng | acc ↑ | 0.062 | 0.159 | +| wic (Median of 11 prompts) | eng | acc ↑ | 0.506 | 0.498 | +| winogrande | eng | acc ↑ | 0.71 | 0.736 | +| wnli (Median of 6 prompts) | eng | acc ↑ | 0.57 | 0.563 | +| wsc (Median of 11 prompts) | eng | acc ↑ | 0.519 | 0.413 | | humaneval | python | pass@1 ↑ | 0.155 | 0.0 | -| humaneval | python | pass@10 ↑ | 0.328 | 0.0 | -| humaneval | python | pass@100 ↑ | 0.572 | 0.003 | +| humaneval | python | pass@10 ↑ | 0.322 | 0.0 | +| humaneval | python | pass@100 ↑ | 0.555 | 0.003 | **Train-time Evaluation:** @@ -736,16 +2437,9 @@ Initial prompting experiments using interim checkpoints: https://huggingface.co/ - -## Original checkpoints - -The checkpoints in this repo correspond to the HuggingFace Transformers format. If you want to use our fork of [Megatron-DeepSpeed](https://github.com/bigscience-workshop/Megatron-DeepSpeed) that the model was trained with, you'd want to use [this repo instead](https://huggingface.co/bigscience/bloom-optimizer-states). 
- -Many intermediate checkpoints are available at https://huggingface.co/bigscience/bloom-intermediate/ - --- # Model Card Authors -*Ordered roughly chronologically and by amount of time spent on creating this model card.* +*Ordered roughly chronologically and by amount of time spent.* -Margaret Mitchell, Giada Pistilli, Yacine Jernite, Ezinwanne Ozoani, Marissa Gerchick, Nazneen Rajani, Sasha Luccioni, Irene Solaiman, Maraim Masoud, Somaieh Nikpoor, Carlos Muñoz Ferrandis, Stas Bekman, Christopher Akiki, Danish Contractor, David Lansky, Angelina McMillan-Major, Tristan Thrush, Suzana Ilić, Gérard Dupont, Shayne Longpre, Manan Dey, Stella Biderman, Douwe Kiela, Emi Baylor, Teven Le Scao, Aaron Gokaslan, Julien Launay, Niklas Muennighoff \ No newline at end of file +Margaret Mitchell, Giada Pistilli, Yacine Jernite, Ezinwanne Ozoani, Marissa Gerchick, Nazneen Rajani, Sasha Luccioni, Irene Solaiman, Maraim Masoud, Somaieh Nikpoor, Carlos Muñoz Ferrandis, Stas Bekman, Christopher Akiki, Danish Contractor, David Lansky, Angelina McMillan-Major, Tristan Thrush, Suzana Ilić, Gérard Dupont, Shayne Longpre, Manan Dey, Stella Biderman, Douwe Kiela, Emi Baylor, Teven Le Scao, Aaron Gokaslan, Julien Launay, Niklas Muennighoff diff --git a/config.json b/config.json index e9d0a5574e859ae07790c378d5c69a5a2aae5c9e..c8079e2826ac7108a7ca2ba54c47ec0cab8b18f8 100644 --- a/config.json +++ b/config.json @@ -2,7 +2,7 @@ "apply_residual_connection_post_layernorm": false, "attention_dropout": 0.0, "architectures": [ - "BloomForCausalLM" + "BloomModel" ], "attention_softmax_in_fp32": true, "pad_token_id": 3, @@ -21,4 +21,4 @@ "transformers_version": "4.21.0", "use_cache": true, "vocab_size": 250880 -} \ No newline at end of file +} diff --git a/model.safetensors.index.json b/model.safetensors.index.json deleted file mode 100644 index 7dd7b9f86c92d65291d95a8bd02b5fe73c545c81..0000000000000000000000000000000000000000 --- a/model.safetensors.index.json +++ /dev/null @@ -1,852 +0,0 @@ -{ - "metadata": { - "total_size": 352494542848 - }, - "weight_map": { - "h.0.input_layernorm.bias": "model_00002-of-00072.safetensors", - "h.0.input_layernorm.weight": "model_00002-of-00072.safetensors", - "h.0.mlp.dense_4h_to_h.bias": "model_00002-of-00072.safetensors", - "h.0.mlp.dense_4h_to_h.weight": "model_00002-of-00072.safetensors", - "h.0.mlp.dense_h_to_4h.bias": "model_00002-of-00072.safetensors", - "h.0.mlp.dense_h_to_4h.weight": "model_00002-of-00072.safetensors", - "h.0.post_attention_layernorm.bias": "model_00002-of-00072.safetensors", - "h.0.post_attention_layernorm.weight": "model_00002-of-00072.safetensors", - "h.0.self_attention.dense.bias": "model_00002-of-00072.safetensors", - "h.0.self_attention.dense.weight": "model_00002-of-00072.safetensors", - "h.0.self_attention.query_key_value.bias": "model_00002-of-00072.safetensors", - "h.0.self_attention.query_key_value.weight": "model_00002-of-00072.safetensors", - "h.1.input_layernorm.bias": "model_00003-of-00072.safetensors", - "h.1.input_layernorm.weight": "model_00003-of-00072.safetensors", - "h.1.mlp.dense_4h_to_h.bias": "model_00003-of-00072.safetensors", - "h.1.mlp.dense_4h_to_h.weight": "model_00003-of-00072.safetensors", - "h.1.mlp.dense_h_to_4h.bias": "model_00003-of-00072.safetensors", - "h.1.mlp.dense_h_to_4h.weight": "model_00003-of-00072.safetensors", - "h.1.post_attention_layernorm.bias": "model_00003-of-00072.safetensors", - "h.1.post_attention_layernorm.weight": "model_00003-of-00072.safetensors", - "h.1.self_attention.dense.bias": 
"model_00003-of-00072.safetensors", - "h.1.self_attention.dense.weight": "model_00003-of-00072.safetensors", - "h.1.self_attention.query_key_value.bias": "model_00003-of-00072.safetensors", - "h.1.self_attention.query_key_value.weight": "model_00003-of-00072.safetensors", - "h.10.input_layernorm.bias": "model_00012-of-00072.safetensors", - "h.10.input_layernorm.weight": "model_00012-of-00072.safetensors", - "h.10.mlp.dense_4h_to_h.bias": "model_00012-of-00072.safetensors", - "h.10.mlp.dense_4h_to_h.weight": "model_00012-of-00072.safetensors", - "h.10.mlp.dense_h_to_4h.bias": "model_00012-of-00072.safetensors", - "h.10.mlp.dense_h_to_4h.weight": "model_00012-of-00072.safetensors", - "h.10.post_attention_layernorm.bias": "model_00012-of-00072.safetensors", - "h.10.post_attention_layernorm.weight": "model_00012-of-00072.safetensors", - "h.10.self_attention.dense.bias": "model_00012-of-00072.safetensors", - "h.10.self_attention.dense.weight": "model_00012-of-00072.safetensors", - "h.10.self_attention.query_key_value.bias": "model_00012-of-00072.safetensors", - "h.10.self_attention.query_key_value.weight": "model_00012-of-00072.safetensors", - "h.11.input_layernorm.bias": "model_00013-of-00072.safetensors", - "h.11.input_layernorm.weight": "model_00013-of-00072.safetensors", - "h.11.mlp.dense_4h_to_h.bias": "model_00013-of-00072.safetensors", - "h.11.mlp.dense_4h_to_h.weight": "model_00013-of-00072.safetensors", - "h.11.mlp.dense_h_to_4h.bias": "model_00013-of-00072.safetensors", - "h.11.mlp.dense_h_to_4h.weight": "model_00013-of-00072.safetensors", - "h.11.post_attention_layernorm.bias": "model_00013-of-00072.safetensors", - "h.11.post_attention_layernorm.weight": "model_00013-of-00072.safetensors", - "h.11.self_attention.dense.bias": "model_00013-of-00072.safetensors", - "h.11.self_attention.dense.weight": "model_00013-of-00072.safetensors", - "h.11.self_attention.query_key_value.bias": "model_00013-of-00072.safetensors", - "h.11.self_attention.query_key_value.weight": "model_00013-of-00072.safetensors", - "h.12.input_layernorm.bias": "model_00014-of-00072.safetensors", - "h.12.input_layernorm.weight": "model_00014-of-00072.safetensors", - "h.12.mlp.dense_4h_to_h.bias": "model_00014-of-00072.safetensors", - "h.12.mlp.dense_4h_to_h.weight": "model_00014-of-00072.safetensors", - "h.12.mlp.dense_h_to_4h.bias": "model_00014-of-00072.safetensors", - "h.12.mlp.dense_h_to_4h.weight": "model_00014-of-00072.safetensors", - "h.12.post_attention_layernorm.bias": "model_00014-of-00072.safetensors", - "h.12.post_attention_layernorm.weight": "model_00014-of-00072.safetensors", - "h.12.self_attention.dense.bias": "model_00014-of-00072.safetensors", - "h.12.self_attention.dense.weight": "model_00014-of-00072.safetensors", - "h.12.self_attention.query_key_value.bias": "model_00014-of-00072.safetensors", - "h.12.self_attention.query_key_value.weight": "model_00014-of-00072.safetensors", - "h.13.input_layernorm.bias": "model_00015-of-00072.safetensors", - "h.13.input_layernorm.weight": "model_00015-of-00072.safetensors", - "h.13.mlp.dense_4h_to_h.bias": "model_00015-of-00072.safetensors", - "h.13.mlp.dense_4h_to_h.weight": "model_00015-of-00072.safetensors", - "h.13.mlp.dense_h_to_4h.bias": "model_00015-of-00072.safetensors", - "h.13.mlp.dense_h_to_4h.weight": "model_00015-of-00072.safetensors", - "h.13.post_attention_layernorm.bias": "model_00015-of-00072.safetensors", - "h.13.post_attention_layernorm.weight": "model_00015-of-00072.safetensors", - "h.13.self_attention.dense.bias": 
"model_00015-of-00072.safetensors", - "h.13.self_attention.dense.weight": "model_00015-of-00072.safetensors", - "h.13.self_attention.query_key_value.bias": "model_00015-of-00072.safetensors", - "h.13.self_attention.query_key_value.weight": "model_00015-of-00072.safetensors", - "h.14.input_layernorm.bias": "model_00016-of-00072.safetensors", - "h.14.input_layernorm.weight": "model_00016-of-00072.safetensors", - "h.14.mlp.dense_4h_to_h.bias": "model_00016-of-00072.safetensors", - "h.14.mlp.dense_4h_to_h.weight": "model_00016-of-00072.safetensors", - "h.14.mlp.dense_h_to_4h.bias": "model_00016-of-00072.safetensors", - "h.14.mlp.dense_h_to_4h.weight": "model_00016-of-00072.safetensors", - "h.14.post_attention_layernorm.bias": "model_00016-of-00072.safetensors", - "h.14.post_attention_layernorm.weight": "model_00016-of-00072.safetensors", - "h.14.self_attention.dense.bias": "model_00016-of-00072.safetensors", - "h.14.self_attention.dense.weight": "model_00016-of-00072.safetensors", - "h.14.self_attention.query_key_value.bias": "model_00016-of-00072.safetensors", - "h.14.self_attention.query_key_value.weight": "model_00016-of-00072.safetensors", - "h.15.input_layernorm.bias": "model_00017-of-00072.safetensors", - "h.15.input_layernorm.weight": "model_00017-of-00072.safetensors", - "h.15.mlp.dense_4h_to_h.bias": "model_00017-of-00072.safetensors", - "h.15.mlp.dense_4h_to_h.weight": "model_00017-of-00072.safetensors", - "h.15.mlp.dense_h_to_4h.bias": "model_00017-of-00072.safetensors", - "h.15.mlp.dense_h_to_4h.weight": "model_00017-of-00072.safetensors", - "h.15.post_attention_layernorm.bias": "model_00017-of-00072.safetensors", - "h.15.post_attention_layernorm.weight": "model_00017-of-00072.safetensors", - "h.15.self_attention.dense.bias": "model_00017-of-00072.safetensors", - "h.15.self_attention.dense.weight": "model_00017-of-00072.safetensors", - "h.15.self_attention.query_key_value.bias": "model_00017-of-00072.safetensors", - "h.15.self_attention.query_key_value.weight": "model_00017-of-00072.safetensors", - "h.16.input_layernorm.bias": "model_00018-of-00072.safetensors", - "h.16.input_layernorm.weight": "model_00018-of-00072.safetensors", - "h.16.mlp.dense_4h_to_h.bias": "model_00018-of-00072.safetensors", - "h.16.mlp.dense_4h_to_h.weight": "model_00018-of-00072.safetensors", - "h.16.mlp.dense_h_to_4h.bias": "model_00018-of-00072.safetensors", - "h.16.mlp.dense_h_to_4h.weight": "model_00018-of-00072.safetensors", - "h.16.post_attention_layernorm.bias": "model_00018-of-00072.safetensors", - "h.16.post_attention_layernorm.weight": "model_00018-of-00072.safetensors", - "h.16.self_attention.dense.bias": "model_00018-of-00072.safetensors", - "h.16.self_attention.dense.weight": "model_00018-of-00072.safetensors", - "h.16.self_attention.query_key_value.bias": "model_00018-of-00072.safetensors", - "h.16.self_attention.query_key_value.weight": "model_00018-of-00072.safetensors", - "h.17.input_layernorm.bias": "model_00019-of-00072.safetensors", - "h.17.input_layernorm.weight": "model_00019-of-00072.safetensors", - "h.17.mlp.dense_4h_to_h.bias": "model_00019-of-00072.safetensors", - "h.17.mlp.dense_4h_to_h.weight": "model_00019-of-00072.safetensors", - "h.17.mlp.dense_h_to_4h.bias": "model_00019-of-00072.safetensors", - "h.17.mlp.dense_h_to_4h.weight": "model_00019-of-00072.safetensors", - "h.17.post_attention_layernorm.bias": "model_00019-of-00072.safetensors", - "h.17.post_attention_layernorm.weight": "model_00019-of-00072.safetensors", - "h.17.self_attention.dense.bias": 
"model_00019-of-00072.safetensors", - "h.17.self_attention.dense.weight": "model_00019-of-00072.safetensors", - "h.17.self_attention.query_key_value.bias": "model_00019-of-00072.safetensors", - "h.17.self_attention.query_key_value.weight": "model_00019-of-00072.safetensors", - "h.18.input_layernorm.bias": "model_00020-of-00072.safetensors", - "h.18.input_layernorm.weight": "model_00020-of-00072.safetensors", - "h.18.mlp.dense_4h_to_h.bias": "model_00020-of-00072.safetensors", - "h.18.mlp.dense_4h_to_h.weight": "model_00020-of-00072.safetensors", - "h.18.mlp.dense_h_to_4h.bias": "model_00020-of-00072.safetensors", - "h.18.mlp.dense_h_to_4h.weight": "model_00020-of-00072.safetensors", - "h.18.post_attention_layernorm.bias": "model_00020-of-00072.safetensors", - "h.18.post_attention_layernorm.weight": "model_00020-of-00072.safetensors", - "h.18.self_attention.dense.bias": "model_00020-of-00072.safetensors", - "h.18.self_attention.dense.weight": "model_00020-of-00072.safetensors", - "h.18.self_attention.query_key_value.bias": "model_00020-of-00072.safetensors", - "h.18.self_attention.query_key_value.weight": "model_00020-of-00072.safetensors", - "h.19.input_layernorm.bias": "model_00021-of-00072.safetensors", - "h.19.input_layernorm.weight": "model_00021-of-00072.safetensors", - "h.19.mlp.dense_4h_to_h.bias": "model_00021-of-00072.safetensors", - "h.19.mlp.dense_4h_to_h.weight": "model_00021-of-00072.safetensors", - "h.19.mlp.dense_h_to_4h.bias": "model_00021-of-00072.safetensors", - "h.19.mlp.dense_h_to_4h.weight": "model_00021-of-00072.safetensors", - "h.19.post_attention_layernorm.bias": "model_00021-of-00072.safetensors", - "h.19.post_attention_layernorm.weight": "model_00021-of-00072.safetensors", - "h.19.self_attention.dense.bias": "model_00021-of-00072.safetensors", - "h.19.self_attention.dense.weight": "model_00021-of-00072.safetensors", - "h.19.self_attention.query_key_value.bias": "model_00021-of-00072.safetensors", - "h.19.self_attention.query_key_value.weight": "model_00021-of-00072.safetensors", - "h.2.input_layernorm.bias": "model_00004-of-00072.safetensors", - "h.2.input_layernorm.weight": "model_00004-of-00072.safetensors", - "h.2.mlp.dense_4h_to_h.bias": "model_00004-of-00072.safetensors", - "h.2.mlp.dense_4h_to_h.weight": "model_00004-of-00072.safetensors", - "h.2.mlp.dense_h_to_4h.bias": "model_00004-of-00072.safetensors", - "h.2.mlp.dense_h_to_4h.weight": "model_00004-of-00072.safetensors", - "h.2.post_attention_layernorm.bias": "model_00004-of-00072.safetensors", - "h.2.post_attention_layernorm.weight": "model_00004-of-00072.safetensors", - "h.2.self_attention.dense.bias": "model_00004-of-00072.safetensors", - "h.2.self_attention.dense.weight": "model_00004-of-00072.safetensors", - "h.2.self_attention.query_key_value.bias": "model_00004-of-00072.safetensors", - "h.2.self_attention.query_key_value.weight": "model_00004-of-00072.safetensors", - "h.20.input_layernorm.bias": "model_00022-of-00072.safetensors", - "h.20.input_layernorm.weight": "model_00022-of-00072.safetensors", - "h.20.mlp.dense_4h_to_h.bias": "model_00022-of-00072.safetensors", - "h.20.mlp.dense_4h_to_h.weight": "model_00022-of-00072.safetensors", - "h.20.mlp.dense_h_to_4h.bias": "model_00022-of-00072.safetensors", - "h.20.mlp.dense_h_to_4h.weight": "model_00022-of-00072.safetensors", - "h.20.post_attention_layernorm.bias": "model_00022-of-00072.safetensors", - "h.20.post_attention_layernorm.weight": "model_00022-of-00072.safetensors", - "h.20.self_attention.dense.bias": "model_00022-of-00072.safetensors", - 
"h.20.self_attention.dense.weight": "model_00022-of-00072.safetensors", - "h.20.self_attention.query_key_value.bias": "model_00022-of-00072.safetensors", - "h.20.self_attention.query_key_value.weight": "model_00022-of-00072.safetensors", - "h.21.input_layernorm.bias": "model_00023-of-00072.safetensors", - "h.21.input_layernorm.weight": "model_00023-of-00072.safetensors", - "h.21.mlp.dense_4h_to_h.bias": "model_00023-of-00072.safetensors", - "h.21.mlp.dense_4h_to_h.weight": "model_00023-of-00072.safetensors", - "h.21.mlp.dense_h_to_4h.bias": "model_00023-of-00072.safetensors", - "h.21.mlp.dense_h_to_4h.weight": "model_00023-of-00072.safetensors", - "h.21.post_attention_layernorm.bias": "model_00023-of-00072.safetensors", - "h.21.post_attention_layernorm.weight": "model_00023-of-00072.safetensors", - "h.21.self_attention.dense.bias": "model_00023-of-00072.safetensors", - "h.21.self_attention.dense.weight": "model_00023-of-00072.safetensors", - "h.21.self_attention.query_key_value.bias": "model_00023-of-00072.safetensors", - "h.21.self_attention.query_key_value.weight": "model_00023-of-00072.safetensors", - "h.22.input_layernorm.bias": "model_00024-of-00072.safetensors", - "h.22.input_layernorm.weight": "model_00024-of-00072.safetensors", - "h.22.mlp.dense_4h_to_h.bias": "model_00024-of-00072.safetensors", - "h.22.mlp.dense_4h_to_h.weight": "model_00024-of-00072.safetensors", - "h.22.mlp.dense_h_to_4h.bias": "model_00024-of-00072.safetensors", - "h.22.mlp.dense_h_to_4h.weight": "model_00024-of-00072.safetensors", - "h.22.post_attention_layernorm.bias": "model_00024-of-00072.safetensors", - "h.22.post_attention_layernorm.weight": "model_00024-of-00072.safetensors", - "h.22.self_attention.dense.bias": "model_00024-of-00072.safetensors", - "h.22.self_attention.dense.weight": "model_00024-of-00072.safetensors", - "h.22.self_attention.query_key_value.bias": "model_00024-of-00072.safetensors", - "h.22.self_attention.query_key_value.weight": "model_00024-of-00072.safetensors", - "h.23.input_layernorm.bias": "model_00025-of-00072.safetensors", - "h.23.input_layernorm.weight": "model_00025-of-00072.safetensors", - "h.23.mlp.dense_4h_to_h.bias": "model_00025-of-00072.safetensors", - "h.23.mlp.dense_4h_to_h.weight": "model_00025-of-00072.safetensors", - "h.23.mlp.dense_h_to_4h.bias": "model_00025-of-00072.safetensors", - "h.23.mlp.dense_h_to_4h.weight": "model_00025-of-00072.safetensors", - "h.23.post_attention_layernorm.bias": "model_00025-of-00072.safetensors", - "h.23.post_attention_layernorm.weight": "model_00025-of-00072.safetensors", - "h.23.self_attention.dense.bias": "model_00025-of-00072.safetensors", - "h.23.self_attention.dense.weight": "model_00025-of-00072.safetensors", - "h.23.self_attention.query_key_value.bias": "model_00025-of-00072.safetensors", - "h.23.self_attention.query_key_value.weight": "model_00025-of-00072.safetensors", - "h.24.input_layernorm.bias": "model_00026-of-00072.safetensors", - "h.24.input_layernorm.weight": "model_00026-of-00072.safetensors", - "h.24.mlp.dense_4h_to_h.bias": "model_00026-of-00072.safetensors", - "h.24.mlp.dense_4h_to_h.weight": "model_00026-of-00072.safetensors", - "h.24.mlp.dense_h_to_4h.bias": "model_00026-of-00072.safetensors", - "h.24.mlp.dense_h_to_4h.weight": "model_00026-of-00072.safetensors", - "h.24.post_attention_layernorm.bias": "model_00026-of-00072.safetensors", - "h.24.post_attention_layernorm.weight": "model_00026-of-00072.safetensors", - "h.24.self_attention.dense.bias": "model_00026-of-00072.safetensors", - 
"h.24.self_attention.dense.weight": "model_00026-of-00072.safetensors", - "h.24.self_attention.query_key_value.bias": "model_00026-of-00072.safetensors", - "h.24.self_attention.query_key_value.weight": "model_00026-of-00072.safetensors", - "h.25.input_layernorm.bias": "model_00027-of-00072.safetensors", - "h.25.input_layernorm.weight": "model_00027-of-00072.safetensors", - "h.25.mlp.dense_4h_to_h.bias": "model_00027-of-00072.safetensors", - "h.25.mlp.dense_4h_to_h.weight": "model_00027-of-00072.safetensors", - "h.25.mlp.dense_h_to_4h.bias": "model_00027-of-00072.safetensors", - "h.25.mlp.dense_h_to_4h.weight": "model_00027-of-00072.safetensors", - "h.25.post_attention_layernorm.bias": "model_00027-of-00072.safetensors", - "h.25.post_attention_layernorm.weight": "model_00027-of-00072.safetensors", - "h.25.self_attention.dense.bias": "model_00027-of-00072.safetensors", - "h.25.self_attention.dense.weight": "model_00027-of-00072.safetensors", - "h.25.self_attention.query_key_value.bias": "model_00027-of-00072.safetensors", - "h.25.self_attention.query_key_value.weight": "model_00027-of-00072.safetensors", - "h.26.input_layernorm.bias": "model_00028-of-00072.safetensors", - "h.26.input_layernorm.weight": "model_00028-of-00072.safetensors", - "h.26.mlp.dense_4h_to_h.bias": "model_00028-of-00072.safetensors", - "h.26.mlp.dense_4h_to_h.weight": "model_00028-of-00072.safetensors", - "h.26.mlp.dense_h_to_4h.bias": "model_00028-of-00072.safetensors", - "h.26.mlp.dense_h_to_4h.weight": "model_00028-of-00072.safetensors", - "h.26.post_attention_layernorm.bias": "model_00028-of-00072.safetensors", - "h.26.post_attention_layernorm.weight": "model_00028-of-00072.safetensors", - "h.26.self_attention.dense.bias": "model_00028-of-00072.safetensors", - "h.26.self_attention.dense.weight": "model_00028-of-00072.safetensors", - "h.26.self_attention.query_key_value.bias": "model_00028-of-00072.safetensors", - "h.26.self_attention.query_key_value.weight": "model_00028-of-00072.safetensors", - "h.27.input_layernorm.bias": "model_00029-of-00072.safetensors", - "h.27.input_layernorm.weight": "model_00029-of-00072.safetensors", - "h.27.mlp.dense_4h_to_h.bias": "model_00029-of-00072.safetensors", - "h.27.mlp.dense_4h_to_h.weight": "model_00029-of-00072.safetensors", - "h.27.mlp.dense_h_to_4h.bias": "model_00029-of-00072.safetensors", - "h.27.mlp.dense_h_to_4h.weight": "model_00029-of-00072.safetensors", - "h.27.post_attention_layernorm.bias": "model_00029-of-00072.safetensors", - "h.27.post_attention_layernorm.weight": "model_00029-of-00072.safetensors", - "h.27.self_attention.dense.bias": "model_00029-of-00072.safetensors", - "h.27.self_attention.dense.weight": "model_00029-of-00072.safetensors", - "h.27.self_attention.query_key_value.bias": "model_00029-of-00072.safetensors", - "h.27.self_attention.query_key_value.weight": "model_00029-of-00072.safetensors", - "h.28.input_layernorm.bias": "model_00030-of-00072.safetensors", - "h.28.input_layernorm.weight": "model_00030-of-00072.safetensors", - "h.28.mlp.dense_4h_to_h.bias": "model_00030-of-00072.safetensors", - "h.28.mlp.dense_4h_to_h.weight": "model_00030-of-00072.safetensors", - "h.28.mlp.dense_h_to_4h.bias": "model_00030-of-00072.safetensors", - "h.28.mlp.dense_h_to_4h.weight": "model_00030-of-00072.safetensors", - "h.28.post_attention_layernorm.bias": "model_00030-of-00072.safetensors", - "h.28.post_attention_layernorm.weight": "model_00030-of-00072.safetensors", - "h.28.self_attention.dense.bias": "model_00030-of-00072.safetensors", - 
"h.28.self_attention.dense.weight": "model_00030-of-00072.safetensors", - "h.28.self_attention.query_key_value.bias": "model_00030-of-00072.safetensors", - "h.28.self_attention.query_key_value.weight": "model_00030-of-00072.safetensors", - "h.29.input_layernorm.bias": "model_00031-of-00072.safetensors", - "h.29.input_layernorm.weight": "model_00031-of-00072.safetensors", - "h.29.mlp.dense_4h_to_h.bias": "model_00031-of-00072.safetensors", - "h.29.mlp.dense_4h_to_h.weight": "model_00031-of-00072.safetensors", - "h.29.mlp.dense_h_to_4h.bias": "model_00031-of-00072.safetensors", - "h.29.mlp.dense_h_to_4h.weight": "model_00031-of-00072.safetensors", - "h.29.post_attention_layernorm.bias": "model_00031-of-00072.safetensors", - "h.29.post_attention_layernorm.weight": "model_00031-of-00072.safetensors", - "h.29.self_attention.dense.bias": "model_00031-of-00072.safetensors", - "h.29.self_attention.dense.weight": "model_00031-of-00072.safetensors", - "h.29.self_attention.query_key_value.bias": "model_00031-of-00072.safetensors", - "h.29.self_attention.query_key_value.weight": "model_00031-of-00072.safetensors", - "h.3.input_layernorm.bias": "model_00005-of-00072.safetensors", - "h.3.input_layernorm.weight": "model_00005-of-00072.safetensors", - "h.3.mlp.dense_4h_to_h.bias": "model_00005-of-00072.safetensors", - "h.3.mlp.dense_4h_to_h.weight": "model_00005-of-00072.safetensors", - "h.3.mlp.dense_h_to_4h.bias": "model_00005-of-00072.safetensors", - "h.3.mlp.dense_h_to_4h.weight": "model_00005-of-00072.safetensors", - "h.3.post_attention_layernorm.bias": "model_00005-of-00072.safetensors", - "h.3.post_attention_layernorm.weight": "model_00005-of-00072.safetensors", - "h.3.self_attention.dense.bias": "model_00005-of-00072.safetensors", - "h.3.self_attention.dense.weight": "model_00005-of-00072.safetensors", - "h.3.self_attention.query_key_value.bias": "model_00005-of-00072.safetensors", - "h.3.self_attention.query_key_value.weight": "model_00005-of-00072.safetensors", - "h.30.input_layernorm.bias": "model_00032-of-00072.safetensors", - "h.30.input_layernorm.weight": "model_00032-of-00072.safetensors", - "h.30.mlp.dense_4h_to_h.bias": "model_00032-of-00072.safetensors", - "h.30.mlp.dense_4h_to_h.weight": "model_00032-of-00072.safetensors", - "h.30.mlp.dense_h_to_4h.bias": "model_00032-of-00072.safetensors", - "h.30.mlp.dense_h_to_4h.weight": "model_00032-of-00072.safetensors", - "h.30.post_attention_layernorm.bias": "model_00032-of-00072.safetensors", - "h.30.post_attention_layernorm.weight": "model_00032-of-00072.safetensors", - "h.30.self_attention.dense.bias": "model_00032-of-00072.safetensors", - "h.30.self_attention.dense.weight": "model_00032-of-00072.safetensors", - "h.30.self_attention.query_key_value.bias": "model_00032-of-00072.safetensors", - "h.30.self_attention.query_key_value.weight": "model_00032-of-00072.safetensors", - "h.31.input_layernorm.bias": "model_00033-of-00072.safetensors", - "h.31.input_layernorm.weight": "model_00033-of-00072.safetensors", - "h.31.mlp.dense_4h_to_h.bias": "model_00033-of-00072.safetensors", - "h.31.mlp.dense_4h_to_h.weight": "model_00033-of-00072.safetensors", - "h.31.mlp.dense_h_to_4h.bias": "model_00033-of-00072.safetensors", - "h.31.mlp.dense_h_to_4h.weight": "model_00033-of-00072.safetensors", - "h.31.post_attention_layernorm.bias": "model_00033-of-00072.safetensors", - "h.31.post_attention_layernorm.weight": "model_00033-of-00072.safetensors", - "h.31.self_attention.dense.bias": "model_00033-of-00072.safetensors", - "h.31.self_attention.dense.weight": 
"model_00033-of-00072.safetensors", - "h.31.self_attention.query_key_value.bias": "model_00033-of-00072.safetensors", - "h.31.self_attention.query_key_value.weight": "model_00033-of-00072.safetensors", - "h.32.input_layernorm.bias": "model_00034-of-00072.safetensors", - "h.32.input_layernorm.weight": "model_00034-of-00072.safetensors", - "h.32.mlp.dense_4h_to_h.bias": "model_00034-of-00072.safetensors", - "h.32.mlp.dense_4h_to_h.weight": "model_00034-of-00072.safetensors", - "h.32.mlp.dense_h_to_4h.bias": "model_00034-of-00072.safetensors", - "h.32.mlp.dense_h_to_4h.weight": "model_00034-of-00072.safetensors", - "h.32.post_attention_layernorm.bias": "model_00034-of-00072.safetensors", - "h.32.post_attention_layernorm.weight": "model_00034-of-00072.safetensors", - "h.32.self_attention.dense.bias": "model_00034-of-00072.safetensors", - "h.32.self_attention.dense.weight": "model_00034-of-00072.safetensors", - "h.32.self_attention.query_key_value.bias": "model_00034-of-00072.safetensors", - "h.32.self_attention.query_key_value.weight": "model_00034-of-00072.safetensors", - "h.33.input_layernorm.bias": "model_00035-of-00072.safetensors", - "h.33.input_layernorm.weight": "model_00035-of-00072.safetensors", - "h.33.mlp.dense_4h_to_h.bias": "model_00035-of-00072.safetensors", - "h.33.mlp.dense_4h_to_h.weight": "model_00035-of-00072.safetensors", - "h.33.mlp.dense_h_to_4h.bias": "model_00035-of-00072.safetensors", - "h.33.mlp.dense_h_to_4h.weight": "model_00035-of-00072.safetensors", - "h.33.post_attention_layernorm.bias": "model_00035-of-00072.safetensors", - "h.33.post_attention_layernorm.weight": "model_00035-of-00072.safetensors", - "h.33.self_attention.dense.bias": "model_00035-of-00072.safetensors", - "h.33.self_attention.dense.weight": "model_00035-of-00072.safetensors", - "h.33.self_attention.query_key_value.bias": "model_00035-of-00072.safetensors", - "h.33.self_attention.query_key_value.weight": "model_00035-of-00072.safetensors", - "h.34.input_layernorm.bias": "model_00036-of-00072.safetensors", - "h.34.input_layernorm.weight": "model_00036-of-00072.safetensors", - "h.34.mlp.dense_4h_to_h.bias": "model_00036-of-00072.safetensors", - "h.34.mlp.dense_4h_to_h.weight": "model_00036-of-00072.safetensors", - "h.34.mlp.dense_h_to_4h.bias": "model_00036-of-00072.safetensors", - "h.34.mlp.dense_h_to_4h.weight": "model_00036-of-00072.safetensors", - "h.34.post_attention_layernorm.bias": "model_00036-of-00072.safetensors", - "h.34.post_attention_layernorm.weight": "model_00036-of-00072.safetensors", - "h.34.self_attention.dense.bias": "model_00036-of-00072.safetensors", - "h.34.self_attention.dense.weight": "model_00036-of-00072.safetensors", - "h.34.self_attention.query_key_value.bias": "model_00036-of-00072.safetensors", - "h.34.self_attention.query_key_value.weight": "model_00036-of-00072.safetensors", - "h.35.input_layernorm.bias": "model_00037-of-00072.safetensors", - "h.35.input_layernorm.weight": "model_00037-of-00072.safetensors", - "h.35.mlp.dense_4h_to_h.bias": "model_00037-of-00072.safetensors", - "h.35.mlp.dense_4h_to_h.weight": "model_00037-of-00072.safetensors", - "h.35.mlp.dense_h_to_4h.bias": "model_00037-of-00072.safetensors", - "h.35.mlp.dense_h_to_4h.weight": "model_00037-of-00072.safetensors", - "h.35.post_attention_layernorm.bias": "model_00037-of-00072.safetensors", - "h.35.post_attention_layernorm.weight": "model_00037-of-00072.safetensors", - "h.35.self_attention.dense.bias": "model_00037-of-00072.safetensors", - "h.35.self_attention.dense.weight": 
"model_00037-of-00072.safetensors", - "h.35.self_attention.query_key_value.bias": "model_00037-of-00072.safetensors", - "h.35.self_attention.query_key_value.weight": "model_00037-of-00072.safetensors", - "h.36.input_layernorm.bias": "model_00038-of-00072.safetensors", - "h.36.input_layernorm.weight": "model_00038-of-00072.safetensors", - "h.36.mlp.dense_4h_to_h.bias": "model_00038-of-00072.safetensors", - "h.36.mlp.dense_4h_to_h.weight": "model_00038-of-00072.safetensors", - "h.36.mlp.dense_h_to_4h.bias": "model_00038-of-00072.safetensors", - "h.36.mlp.dense_h_to_4h.weight": "model_00038-of-00072.safetensors", - "h.36.post_attention_layernorm.bias": "model_00038-of-00072.safetensors", - "h.36.post_attention_layernorm.weight": "model_00038-of-00072.safetensors", - "h.36.self_attention.dense.bias": "model_00038-of-00072.safetensors", - "h.36.self_attention.dense.weight": "model_00038-of-00072.safetensors", - "h.36.self_attention.query_key_value.bias": "model_00038-of-00072.safetensors", - "h.36.self_attention.query_key_value.weight": "model_00038-of-00072.safetensors", - "h.37.input_layernorm.bias": "model_00039-of-00072.safetensors", - "h.37.input_layernorm.weight": "model_00039-of-00072.safetensors", - "h.37.mlp.dense_4h_to_h.bias": "model_00039-of-00072.safetensors", - "h.37.mlp.dense_4h_to_h.weight": "model_00039-of-00072.safetensors", - "h.37.mlp.dense_h_to_4h.bias": "model_00039-of-00072.safetensors", - "h.37.mlp.dense_h_to_4h.weight": "model_00039-of-00072.safetensors", - "h.37.post_attention_layernorm.bias": "model_00039-of-00072.safetensors", - "h.37.post_attention_layernorm.weight": "model_00039-of-00072.safetensors", - "h.37.self_attention.dense.bias": "model_00039-of-00072.safetensors", - "h.37.self_attention.dense.weight": "model_00039-of-00072.safetensors", - "h.37.self_attention.query_key_value.bias": "model_00039-of-00072.safetensors", - "h.37.self_attention.query_key_value.weight": "model_00039-of-00072.safetensors", - "h.38.input_layernorm.bias": "model_00040-of-00072.safetensors", - "h.38.input_layernorm.weight": "model_00040-of-00072.safetensors", - "h.38.mlp.dense_4h_to_h.bias": "model_00040-of-00072.safetensors", - "h.38.mlp.dense_4h_to_h.weight": "model_00040-of-00072.safetensors", - "h.38.mlp.dense_h_to_4h.bias": "model_00040-of-00072.safetensors", - "h.38.mlp.dense_h_to_4h.weight": "model_00040-of-00072.safetensors", - "h.38.post_attention_layernorm.bias": "model_00040-of-00072.safetensors", - "h.38.post_attention_layernorm.weight": "model_00040-of-00072.safetensors", - "h.38.self_attention.dense.bias": "model_00040-of-00072.safetensors", - "h.38.self_attention.dense.weight": "model_00040-of-00072.safetensors", - "h.38.self_attention.query_key_value.bias": "model_00040-of-00072.safetensors", - "h.38.self_attention.query_key_value.weight": "model_00040-of-00072.safetensors", - "h.39.input_layernorm.bias": "model_00041-of-00072.safetensors", - "h.39.input_layernorm.weight": "model_00041-of-00072.safetensors", - "h.39.mlp.dense_4h_to_h.bias": "model_00041-of-00072.safetensors", - "h.39.mlp.dense_4h_to_h.weight": "model_00041-of-00072.safetensors", - "h.39.mlp.dense_h_to_4h.bias": "model_00041-of-00072.safetensors", - "h.39.mlp.dense_h_to_4h.weight": "model_00041-of-00072.safetensors", - "h.39.post_attention_layernorm.bias": "model_00041-of-00072.safetensors", - "h.39.post_attention_layernorm.weight": "model_00041-of-00072.safetensors", - "h.39.self_attention.dense.bias": "model_00041-of-00072.safetensors", - "h.39.self_attention.dense.weight": 
"model_00041-of-00072.safetensors", - "h.39.self_attention.query_key_value.bias": "model_00041-of-00072.safetensors", - "h.39.self_attention.query_key_value.weight": "model_00041-of-00072.safetensors", - "h.4.input_layernorm.bias": "model_00006-of-00072.safetensors", - "h.4.input_layernorm.weight": "model_00006-of-00072.safetensors", - "h.4.mlp.dense_4h_to_h.bias": "model_00006-of-00072.safetensors", - "h.4.mlp.dense_4h_to_h.weight": "model_00006-of-00072.safetensors", - "h.4.mlp.dense_h_to_4h.bias": "model_00006-of-00072.safetensors", - "h.4.mlp.dense_h_to_4h.weight": "model_00006-of-00072.safetensors", - "h.4.post_attention_layernorm.bias": "model_00006-of-00072.safetensors", - "h.4.post_attention_layernorm.weight": "model_00006-of-00072.safetensors", - "h.4.self_attention.dense.bias": "model_00006-of-00072.safetensors", - "h.4.self_attention.dense.weight": "model_00006-of-00072.safetensors", - "h.4.self_attention.query_key_value.bias": "model_00006-of-00072.safetensors", - "h.4.self_attention.query_key_value.weight": "model_00006-of-00072.safetensors", - "h.40.input_layernorm.bias": "model_00042-of-00072.safetensors", - "h.40.input_layernorm.weight": "model_00042-of-00072.safetensors", - "h.40.mlp.dense_4h_to_h.bias": "model_00042-of-00072.safetensors", - "h.40.mlp.dense_4h_to_h.weight": "model_00042-of-00072.safetensors", - "h.40.mlp.dense_h_to_4h.bias": "model_00042-of-00072.safetensors", - "h.40.mlp.dense_h_to_4h.weight": "model_00042-of-00072.safetensors", - "h.40.post_attention_layernorm.bias": "model_00042-of-00072.safetensors", - "h.40.post_attention_layernorm.weight": "model_00042-of-00072.safetensors", - "h.40.self_attention.dense.bias": "model_00042-of-00072.safetensors", - "h.40.self_attention.dense.weight": "model_00042-of-00072.safetensors", - "h.40.self_attention.query_key_value.bias": "model_00042-of-00072.safetensors", - "h.40.self_attention.query_key_value.weight": "model_00042-of-00072.safetensors", - "h.41.input_layernorm.bias": "model_00043-of-00072.safetensors", - "h.41.input_layernorm.weight": "model_00043-of-00072.safetensors", - "h.41.mlp.dense_4h_to_h.bias": "model_00043-of-00072.safetensors", - "h.41.mlp.dense_4h_to_h.weight": "model_00043-of-00072.safetensors", - "h.41.mlp.dense_h_to_4h.bias": "model_00043-of-00072.safetensors", - "h.41.mlp.dense_h_to_4h.weight": "model_00043-of-00072.safetensors", - "h.41.post_attention_layernorm.bias": "model_00043-of-00072.safetensors", - "h.41.post_attention_layernorm.weight": "model_00043-of-00072.safetensors", - "h.41.self_attention.dense.bias": "model_00043-of-00072.safetensors", - "h.41.self_attention.dense.weight": "model_00043-of-00072.safetensors", - "h.41.self_attention.query_key_value.bias": "model_00043-of-00072.safetensors", - "h.41.self_attention.query_key_value.weight": "model_00043-of-00072.safetensors", - "h.42.input_layernorm.bias": "model_00044-of-00072.safetensors", - "h.42.input_layernorm.weight": "model_00044-of-00072.safetensors", - "h.42.mlp.dense_4h_to_h.bias": "model_00044-of-00072.safetensors", - "h.42.mlp.dense_4h_to_h.weight": "model_00044-of-00072.safetensors", - "h.42.mlp.dense_h_to_4h.bias": "model_00044-of-00072.safetensors", - "h.42.mlp.dense_h_to_4h.weight": "model_00044-of-00072.safetensors", - "h.42.post_attention_layernorm.bias": "model_00044-of-00072.safetensors", - "h.42.post_attention_layernorm.weight": "model_00044-of-00072.safetensors", - "h.42.self_attention.dense.bias": "model_00044-of-00072.safetensors", - "h.42.self_attention.dense.weight": "model_00044-of-00072.safetensors", - 
"h.42.self_attention.query_key_value.bias": "model_00044-of-00072.safetensors", - "h.42.self_attention.query_key_value.weight": "model_00044-of-00072.safetensors", - "h.43.input_layernorm.bias": "model_00045-of-00072.safetensors", - "h.43.input_layernorm.weight": "model_00045-of-00072.safetensors", - "h.43.mlp.dense_4h_to_h.bias": "model_00045-of-00072.safetensors", - "h.43.mlp.dense_4h_to_h.weight": "model_00045-of-00072.safetensors", - "h.43.mlp.dense_h_to_4h.bias": "model_00045-of-00072.safetensors", - "h.43.mlp.dense_h_to_4h.weight": "model_00045-of-00072.safetensors", - "h.43.post_attention_layernorm.bias": "model_00045-of-00072.safetensors", - "h.43.post_attention_layernorm.weight": "model_00045-of-00072.safetensors", - "h.43.self_attention.dense.bias": "model_00045-of-00072.safetensors", - "h.43.self_attention.dense.weight": "model_00045-of-00072.safetensors", - "h.43.self_attention.query_key_value.bias": "model_00045-of-00072.safetensors", - "h.43.self_attention.query_key_value.weight": "model_00045-of-00072.safetensors", - "h.44.input_layernorm.bias": "model_00046-of-00072.safetensors", - "h.44.input_layernorm.weight": "model_00046-of-00072.safetensors", - "h.44.mlp.dense_4h_to_h.bias": "model_00046-of-00072.safetensors", - "h.44.mlp.dense_4h_to_h.weight": "model_00046-of-00072.safetensors", - "h.44.mlp.dense_h_to_4h.bias": "model_00046-of-00072.safetensors", - "h.44.mlp.dense_h_to_4h.weight": "model_00046-of-00072.safetensors", - "h.44.post_attention_layernorm.bias": "model_00046-of-00072.safetensors", - "h.44.post_attention_layernorm.weight": "model_00046-of-00072.safetensors", - "h.44.self_attention.dense.bias": "model_00046-of-00072.safetensors", - "h.44.self_attention.dense.weight": "model_00046-of-00072.safetensors", - "h.44.self_attention.query_key_value.bias": "model_00046-of-00072.safetensors", - "h.44.self_attention.query_key_value.weight": "model_00046-of-00072.safetensors", - "h.45.input_layernorm.bias": "model_00047-of-00072.safetensors", - "h.45.input_layernorm.weight": "model_00047-of-00072.safetensors", - "h.45.mlp.dense_4h_to_h.bias": "model_00047-of-00072.safetensors", - "h.45.mlp.dense_4h_to_h.weight": "model_00047-of-00072.safetensors", - "h.45.mlp.dense_h_to_4h.bias": "model_00047-of-00072.safetensors", - "h.45.mlp.dense_h_to_4h.weight": "model_00047-of-00072.safetensors", - "h.45.post_attention_layernorm.bias": "model_00047-of-00072.safetensors", - "h.45.post_attention_layernorm.weight": "model_00047-of-00072.safetensors", - "h.45.self_attention.dense.bias": "model_00047-of-00072.safetensors", - "h.45.self_attention.dense.weight": "model_00047-of-00072.safetensors", - "h.45.self_attention.query_key_value.bias": "model_00047-of-00072.safetensors", - "h.45.self_attention.query_key_value.weight": "model_00047-of-00072.safetensors", - "h.46.input_layernorm.bias": "model_00048-of-00072.safetensors", - "h.46.input_layernorm.weight": "model_00048-of-00072.safetensors", - "h.46.mlp.dense_4h_to_h.bias": "model_00048-of-00072.safetensors", - "h.46.mlp.dense_4h_to_h.weight": "model_00048-of-00072.safetensors", - "h.46.mlp.dense_h_to_4h.bias": "model_00048-of-00072.safetensors", - "h.46.mlp.dense_h_to_4h.weight": "model_00048-of-00072.safetensors", - "h.46.post_attention_layernorm.bias": "model_00048-of-00072.safetensors", - "h.46.post_attention_layernorm.weight": "model_00048-of-00072.safetensors", - "h.46.self_attention.dense.bias": "model_00048-of-00072.safetensors", - "h.46.self_attention.dense.weight": "model_00048-of-00072.safetensors", - 
"h.46.self_attention.query_key_value.bias": "model_00048-of-00072.safetensors", - "h.46.self_attention.query_key_value.weight": "model_00048-of-00072.safetensors", - "h.47.input_layernorm.bias": "model_00049-of-00072.safetensors", - "h.47.input_layernorm.weight": "model_00049-of-00072.safetensors", - "h.47.mlp.dense_4h_to_h.bias": "model_00049-of-00072.safetensors", - "h.47.mlp.dense_4h_to_h.weight": "model_00049-of-00072.safetensors", - "h.47.mlp.dense_h_to_4h.bias": "model_00049-of-00072.safetensors", - "h.47.mlp.dense_h_to_4h.weight": "model_00049-of-00072.safetensors", - "h.47.post_attention_layernorm.bias": "model_00049-of-00072.safetensors", - "h.47.post_attention_layernorm.weight": "model_00049-of-00072.safetensors", - "h.47.self_attention.dense.bias": "model_00049-of-00072.safetensors", - "h.47.self_attention.dense.weight": "model_00049-of-00072.safetensors", - "h.47.self_attention.query_key_value.bias": "model_00049-of-00072.safetensors", - "h.47.self_attention.query_key_value.weight": "model_00049-of-00072.safetensors", - "h.48.input_layernorm.bias": "model_00050-of-00072.safetensors", - "h.48.input_layernorm.weight": "model_00050-of-00072.safetensors", - "h.48.mlp.dense_4h_to_h.bias": "model_00050-of-00072.safetensors", - "h.48.mlp.dense_4h_to_h.weight": "model_00050-of-00072.safetensors", - "h.48.mlp.dense_h_to_4h.bias": "model_00050-of-00072.safetensors", - "h.48.mlp.dense_h_to_4h.weight": "model_00050-of-00072.safetensors", - "h.48.post_attention_layernorm.bias": "model_00050-of-00072.safetensors", - "h.48.post_attention_layernorm.weight": "model_00050-of-00072.safetensors", - "h.48.self_attention.dense.bias": "model_00050-of-00072.safetensors", - "h.48.self_attention.dense.weight": "model_00050-of-00072.safetensors", - "h.48.self_attention.query_key_value.bias": "model_00050-of-00072.safetensors", - "h.48.self_attention.query_key_value.weight": "model_00050-of-00072.safetensors", - "h.49.input_layernorm.bias": "model_00051-of-00072.safetensors", - "h.49.input_layernorm.weight": "model_00051-of-00072.safetensors", - "h.49.mlp.dense_4h_to_h.bias": "model_00051-of-00072.safetensors", - "h.49.mlp.dense_4h_to_h.weight": "model_00051-of-00072.safetensors", - "h.49.mlp.dense_h_to_4h.bias": "model_00051-of-00072.safetensors", - "h.49.mlp.dense_h_to_4h.weight": "model_00051-of-00072.safetensors", - "h.49.post_attention_layernorm.bias": "model_00051-of-00072.safetensors", - "h.49.post_attention_layernorm.weight": "model_00051-of-00072.safetensors", - "h.49.self_attention.dense.bias": "model_00051-of-00072.safetensors", - "h.49.self_attention.dense.weight": "model_00051-of-00072.safetensors", - "h.49.self_attention.query_key_value.bias": "model_00051-of-00072.safetensors", - "h.49.self_attention.query_key_value.weight": "model_00051-of-00072.safetensors", - "h.5.input_layernorm.bias": "model_00007-of-00072.safetensors", - "h.5.input_layernorm.weight": "model_00007-of-00072.safetensors", - "h.5.mlp.dense_4h_to_h.bias": "model_00007-of-00072.safetensors", - "h.5.mlp.dense_4h_to_h.weight": "model_00007-of-00072.safetensors", - "h.5.mlp.dense_h_to_4h.bias": "model_00007-of-00072.safetensors", - "h.5.mlp.dense_h_to_4h.weight": "model_00007-of-00072.safetensors", - "h.5.post_attention_layernorm.bias": "model_00007-of-00072.safetensors", - "h.5.post_attention_layernorm.weight": "model_00007-of-00072.safetensors", - "h.5.self_attention.dense.bias": "model_00007-of-00072.safetensors", - "h.5.self_attention.dense.weight": "model_00007-of-00072.safetensors", - 
"h.5.self_attention.query_key_value.bias": "model_00007-of-00072.safetensors", - "h.5.self_attention.query_key_value.weight": "model_00007-of-00072.safetensors", - "h.50.input_layernorm.bias": "model_00052-of-00072.safetensors", - "h.50.input_layernorm.weight": "model_00052-of-00072.safetensors", - "h.50.mlp.dense_4h_to_h.bias": "model_00052-of-00072.safetensors", - "h.50.mlp.dense_4h_to_h.weight": "model_00052-of-00072.safetensors", - "h.50.mlp.dense_h_to_4h.bias": "model_00052-of-00072.safetensors", - "h.50.mlp.dense_h_to_4h.weight": "model_00052-of-00072.safetensors", - "h.50.post_attention_layernorm.bias": "model_00052-of-00072.safetensors", - "h.50.post_attention_layernorm.weight": "model_00052-of-00072.safetensors", - "h.50.self_attention.dense.bias": "model_00052-of-00072.safetensors", - "h.50.self_attention.dense.weight": "model_00052-of-00072.safetensors", - "h.50.self_attention.query_key_value.bias": "model_00052-of-00072.safetensors", - "h.50.self_attention.query_key_value.weight": "model_00052-of-00072.safetensors", - "h.51.input_layernorm.bias": "model_00053-of-00072.safetensors", - "h.51.input_layernorm.weight": "model_00053-of-00072.safetensors", - "h.51.mlp.dense_4h_to_h.bias": "model_00053-of-00072.safetensors", - "h.51.mlp.dense_4h_to_h.weight": "model_00053-of-00072.safetensors", - "h.51.mlp.dense_h_to_4h.bias": "model_00053-of-00072.safetensors", - "h.51.mlp.dense_h_to_4h.weight": "model_00053-of-00072.safetensors", - "h.51.post_attention_layernorm.bias": "model_00053-of-00072.safetensors", - "h.51.post_attention_layernorm.weight": "model_00053-of-00072.safetensors", - "h.51.self_attention.dense.bias": "model_00053-of-00072.safetensors", - "h.51.self_attention.dense.weight": "model_00053-of-00072.safetensors", - "h.51.self_attention.query_key_value.bias": "model_00053-of-00072.safetensors", - "h.51.self_attention.query_key_value.weight": "model_00053-of-00072.safetensors", - "h.52.input_layernorm.bias": "model_00054-of-00072.safetensors", - "h.52.input_layernorm.weight": "model_00054-of-00072.safetensors", - "h.52.mlp.dense_4h_to_h.bias": "model_00054-of-00072.safetensors", - "h.52.mlp.dense_4h_to_h.weight": "model_00054-of-00072.safetensors", - "h.52.mlp.dense_h_to_4h.bias": "model_00054-of-00072.safetensors", - "h.52.mlp.dense_h_to_4h.weight": "model_00054-of-00072.safetensors", - "h.52.post_attention_layernorm.bias": "model_00054-of-00072.safetensors", - "h.52.post_attention_layernorm.weight": "model_00054-of-00072.safetensors", - "h.52.self_attention.dense.bias": "model_00054-of-00072.safetensors", - "h.52.self_attention.dense.weight": "model_00054-of-00072.safetensors", - "h.52.self_attention.query_key_value.bias": "model_00054-of-00072.safetensors", - "h.52.self_attention.query_key_value.weight": "model_00054-of-00072.safetensors", - "h.53.input_layernorm.bias": "model_00055-of-00072.safetensors", - "h.53.input_layernorm.weight": "model_00055-of-00072.safetensors", - "h.53.mlp.dense_4h_to_h.bias": "model_00055-of-00072.safetensors", - "h.53.mlp.dense_4h_to_h.weight": "model_00055-of-00072.safetensors", - "h.53.mlp.dense_h_to_4h.bias": "model_00055-of-00072.safetensors", - "h.53.mlp.dense_h_to_4h.weight": "model_00055-of-00072.safetensors", - "h.53.post_attention_layernorm.bias": "model_00055-of-00072.safetensors", - "h.53.post_attention_layernorm.weight": "model_00055-of-00072.safetensors", - "h.53.self_attention.dense.bias": "model_00055-of-00072.safetensors", - "h.53.self_attention.dense.weight": "model_00055-of-00072.safetensors", - 
"h.53.self_attention.query_key_value.bias": "model_00055-of-00072.safetensors", - "h.53.self_attention.query_key_value.weight": "model_00055-of-00072.safetensors", - "h.54.input_layernorm.bias": "model_00056-of-00072.safetensors", - "h.54.input_layernorm.weight": "model_00056-of-00072.safetensors", - "h.54.mlp.dense_4h_to_h.bias": "model_00056-of-00072.safetensors", - "h.54.mlp.dense_4h_to_h.weight": "model_00056-of-00072.safetensors", - "h.54.mlp.dense_h_to_4h.bias": "model_00056-of-00072.safetensors", - "h.54.mlp.dense_h_to_4h.weight": "model_00056-of-00072.safetensors", - "h.54.post_attention_layernorm.bias": "model_00056-of-00072.safetensors", - "h.54.post_attention_layernorm.weight": "model_00056-of-00072.safetensors", - "h.54.self_attention.dense.bias": "model_00056-of-00072.safetensors", - "h.54.self_attention.dense.weight": "model_00056-of-00072.safetensors", - "h.54.self_attention.query_key_value.bias": "model_00056-of-00072.safetensors", - "h.54.self_attention.query_key_value.weight": "model_00056-of-00072.safetensors", - "h.55.input_layernorm.bias": "model_00057-of-00072.safetensors", - "h.55.input_layernorm.weight": "model_00057-of-00072.safetensors", - "h.55.mlp.dense_4h_to_h.bias": "model_00057-of-00072.safetensors", - "h.55.mlp.dense_4h_to_h.weight": "model_00057-of-00072.safetensors", - "h.55.mlp.dense_h_to_4h.bias": "model_00057-of-00072.safetensors", - "h.55.mlp.dense_h_to_4h.weight": "model_00057-of-00072.safetensors", - "h.55.post_attention_layernorm.bias": "model_00057-of-00072.safetensors", - "h.55.post_attention_layernorm.weight": "model_00057-of-00072.safetensors", - "h.55.self_attention.dense.bias": "model_00057-of-00072.safetensors", - "h.55.self_attention.dense.weight": "model_00057-of-00072.safetensors", - "h.55.self_attention.query_key_value.bias": "model_00057-of-00072.safetensors", - "h.55.self_attention.query_key_value.weight": "model_00057-of-00072.safetensors", - "h.56.input_layernorm.bias": "model_00058-of-00072.safetensors", - "h.56.input_layernorm.weight": "model_00058-of-00072.safetensors", - "h.56.mlp.dense_4h_to_h.bias": "model_00058-of-00072.safetensors", - "h.56.mlp.dense_4h_to_h.weight": "model_00058-of-00072.safetensors", - "h.56.mlp.dense_h_to_4h.bias": "model_00058-of-00072.safetensors", - "h.56.mlp.dense_h_to_4h.weight": "model_00058-of-00072.safetensors", - "h.56.post_attention_layernorm.bias": "model_00058-of-00072.safetensors", - "h.56.post_attention_layernorm.weight": "model_00058-of-00072.safetensors", - "h.56.self_attention.dense.bias": "model_00058-of-00072.safetensors", - "h.56.self_attention.dense.weight": "model_00058-of-00072.safetensors", - "h.56.self_attention.query_key_value.bias": "model_00058-of-00072.safetensors", - "h.56.self_attention.query_key_value.weight": "model_00058-of-00072.safetensors", - "h.57.input_layernorm.bias": "model_00059-of-00072.safetensors", - "h.57.input_layernorm.weight": "model_00059-of-00072.safetensors", - "h.57.mlp.dense_4h_to_h.bias": "model_00059-of-00072.safetensors", - "h.57.mlp.dense_4h_to_h.weight": "model_00059-of-00072.safetensors", - "h.57.mlp.dense_h_to_4h.bias": "model_00059-of-00072.safetensors", - "h.57.mlp.dense_h_to_4h.weight": "model_00059-of-00072.safetensors", - "h.57.post_attention_layernorm.bias": "model_00059-of-00072.safetensors", - "h.57.post_attention_layernorm.weight": "model_00059-of-00072.safetensors", - "h.57.self_attention.dense.bias": "model_00059-of-00072.safetensors", - "h.57.self_attention.dense.weight": "model_00059-of-00072.safetensors", - 
"h.57.self_attention.query_key_value.bias": "model_00059-of-00072.safetensors", - "h.57.self_attention.query_key_value.weight": "model_00059-of-00072.safetensors", - "h.58.input_layernorm.bias": "model_00060-of-00072.safetensors", - "h.58.input_layernorm.weight": "model_00060-of-00072.safetensors", - "h.58.mlp.dense_4h_to_h.bias": "model_00060-of-00072.safetensors", - "h.58.mlp.dense_4h_to_h.weight": "model_00060-of-00072.safetensors", - "h.58.mlp.dense_h_to_4h.bias": "model_00060-of-00072.safetensors", - "h.58.mlp.dense_h_to_4h.weight": "model_00060-of-00072.safetensors", - "h.58.post_attention_layernorm.bias": "model_00060-of-00072.safetensors", - "h.58.post_attention_layernorm.weight": "model_00060-of-00072.safetensors", - "h.58.self_attention.dense.bias": "model_00060-of-00072.safetensors", - "h.58.self_attention.dense.weight": "model_00060-of-00072.safetensors", - "h.58.self_attention.query_key_value.bias": "model_00060-of-00072.safetensors", - "h.58.self_attention.query_key_value.weight": "model_00060-of-00072.safetensors", - "h.59.input_layernorm.bias": "model_00061-of-00072.safetensors", - "h.59.input_layernorm.weight": "model_00061-of-00072.safetensors", - "h.59.mlp.dense_4h_to_h.bias": "model_00061-of-00072.safetensors", - "h.59.mlp.dense_4h_to_h.weight": "model_00061-of-00072.safetensors", - "h.59.mlp.dense_h_to_4h.bias": "model_00061-of-00072.safetensors", - "h.59.mlp.dense_h_to_4h.weight": "model_00061-of-00072.safetensors", - "h.59.post_attention_layernorm.bias": "model_00061-of-00072.safetensors", - "h.59.post_attention_layernorm.weight": "model_00061-of-00072.safetensors", - "h.59.self_attention.dense.bias": "model_00061-of-00072.safetensors", - "h.59.self_attention.dense.weight": "model_00061-of-00072.safetensors", - "h.59.self_attention.query_key_value.bias": "model_00061-of-00072.safetensors", - "h.59.self_attention.query_key_value.weight": "model_00061-of-00072.safetensors", - "h.6.input_layernorm.bias": "model_00008-of-00072.safetensors", - "h.6.input_layernorm.weight": "model_00008-of-00072.safetensors", - "h.6.mlp.dense_4h_to_h.bias": "model_00008-of-00072.safetensors", - "h.6.mlp.dense_4h_to_h.weight": "model_00008-of-00072.safetensors", - "h.6.mlp.dense_h_to_4h.bias": "model_00008-of-00072.safetensors", - "h.6.mlp.dense_h_to_4h.weight": "model_00008-of-00072.safetensors", - "h.6.post_attention_layernorm.bias": "model_00008-of-00072.safetensors", - "h.6.post_attention_layernorm.weight": "model_00008-of-00072.safetensors", - "h.6.self_attention.dense.bias": "model_00008-of-00072.safetensors", - "h.6.self_attention.dense.weight": "model_00008-of-00072.safetensors", - "h.6.self_attention.query_key_value.bias": "model_00008-of-00072.safetensors", - "h.6.self_attention.query_key_value.weight": "model_00008-of-00072.safetensors", - "h.60.input_layernorm.bias": "model_00062-of-00072.safetensors", - "h.60.input_layernorm.weight": "model_00062-of-00072.safetensors", - "h.60.mlp.dense_4h_to_h.bias": "model_00062-of-00072.safetensors", - "h.60.mlp.dense_4h_to_h.weight": "model_00062-of-00072.safetensors", - "h.60.mlp.dense_h_to_4h.bias": "model_00062-of-00072.safetensors", - "h.60.mlp.dense_h_to_4h.weight": "model_00062-of-00072.safetensors", - "h.60.post_attention_layernorm.bias": "model_00062-of-00072.safetensors", - "h.60.post_attention_layernorm.weight": "model_00062-of-00072.safetensors", - "h.60.self_attention.dense.bias": "model_00062-of-00072.safetensors", - "h.60.self_attention.dense.weight": "model_00062-of-00072.safetensors", - 
"h.60.self_attention.query_key_value.bias": "model_00062-of-00072.safetensors", - "h.60.self_attention.query_key_value.weight": "model_00062-of-00072.safetensors", - "h.61.input_layernorm.bias": "model_00063-of-00072.safetensors", - "h.61.input_layernorm.weight": "model_00063-of-00072.safetensors", - "h.61.mlp.dense_4h_to_h.bias": "model_00063-of-00072.safetensors", - "h.61.mlp.dense_4h_to_h.weight": "model_00063-of-00072.safetensors", - "h.61.mlp.dense_h_to_4h.bias": "model_00063-of-00072.safetensors", - "h.61.mlp.dense_h_to_4h.weight": "model_00063-of-00072.safetensors", - "h.61.post_attention_layernorm.bias": "model_00063-of-00072.safetensors", - "h.61.post_attention_layernorm.weight": "model_00063-of-00072.safetensors", - "h.61.self_attention.dense.bias": "model_00063-of-00072.safetensors", - "h.61.self_attention.dense.weight": "model_00063-of-00072.safetensors", - "h.61.self_attention.query_key_value.bias": "model_00063-of-00072.safetensors", - "h.61.self_attention.query_key_value.weight": "model_00063-of-00072.safetensors", - "h.62.input_layernorm.bias": "model_00064-of-00072.safetensors", - "h.62.input_layernorm.weight": "model_00064-of-00072.safetensors", - "h.62.mlp.dense_4h_to_h.bias": "model_00064-of-00072.safetensors", - "h.62.mlp.dense_4h_to_h.weight": "model_00064-of-00072.safetensors", - "h.62.mlp.dense_h_to_4h.bias": "model_00064-of-00072.safetensors", - "h.62.mlp.dense_h_to_4h.weight": "model_00064-of-00072.safetensors", - "h.62.post_attention_layernorm.bias": "model_00064-of-00072.safetensors", - "h.62.post_attention_layernorm.weight": "model_00064-of-00072.safetensors", - "h.62.self_attention.dense.bias": "model_00064-of-00072.safetensors", - "h.62.self_attention.dense.weight": "model_00064-of-00072.safetensors", - "h.62.self_attention.query_key_value.bias": "model_00064-of-00072.safetensors", - "h.62.self_attention.query_key_value.weight": "model_00064-of-00072.safetensors", - "h.63.input_layernorm.bias": "model_00065-of-00072.safetensors", - "h.63.input_layernorm.weight": "model_00065-of-00072.safetensors", - "h.63.mlp.dense_4h_to_h.bias": "model_00065-of-00072.safetensors", - "h.63.mlp.dense_4h_to_h.weight": "model_00065-of-00072.safetensors", - "h.63.mlp.dense_h_to_4h.bias": "model_00065-of-00072.safetensors", - "h.63.mlp.dense_h_to_4h.weight": "model_00065-of-00072.safetensors", - "h.63.post_attention_layernorm.bias": "model_00065-of-00072.safetensors", - "h.63.post_attention_layernorm.weight": "model_00065-of-00072.safetensors", - "h.63.self_attention.dense.bias": "model_00065-of-00072.safetensors", - "h.63.self_attention.dense.weight": "model_00065-of-00072.safetensors", - "h.63.self_attention.query_key_value.bias": "model_00065-of-00072.safetensors", - "h.63.self_attention.query_key_value.weight": "model_00065-of-00072.safetensors", - "h.64.input_layernorm.bias": "model_00066-of-00072.safetensors", - "h.64.input_layernorm.weight": "model_00066-of-00072.safetensors", - "h.64.mlp.dense_4h_to_h.bias": "model_00066-of-00072.safetensors", - "h.64.mlp.dense_4h_to_h.weight": "model_00066-of-00072.safetensors", - "h.64.mlp.dense_h_to_4h.bias": "model_00066-of-00072.safetensors", - "h.64.mlp.dense_h_to_4h.weight": "model_00066-of-00072.safetensors", - "h.64.post_attention_layernorm.bias": "model_00066-of-00072.safetensors", - "h.64.post_attention_layernorm.weight": "model_00066-of-00072.safetensors", - "h.64.self_attention.dense.bias": "model_00066-of-00072.safetensors", - "h.64.self_attention.dense.weight": "model_00066-of-00072.safetensors", - 
"h.64.self_attention.query_key_value.bias": "model_00066-of-00072.safetensors", - "h.64.self_attention.query_key_value.weight": "model_00066-of-00072.safetensors", - "h.65.input_layernorm.bias": "model_00067-of-00072.safetensors", - "h.65.input_layernorm.weight": "model_00067-of-00072.safetensors", - "h.65.mlp.dense_4h_to_h.bias": "model_00067-of-00072.safetensors", - "h.65.mlp.dense_4h_to_h.weight": "model_00067-of-00072.safetensors", - "h.65.mlp.dense_h_to_4h.bias": "model_00067-of-00072.safetensors", - "h.65.mlp.dense_h_to_4h.weight": "model_00067-of-00072.safetensors", - "h.65.post_attention_layernorm.bias": "model_00067-of-00072.safetensors", - "h.65.post_attention_layernorm.weight": "model_00067-of-00072.safetensors", - "h.65.self_attention.dense.bias": "model_00067-of-00072.safetensors", - "h.65.self_attention.dense.weight": "model_00067-of-00072.safetensors", - "h.65.self_attention.query_key_value.bias": "model_00067-of-00072.safetensors", - "h.65.self_attention.query_key_value.weight": "model_00067-of-00072.safetensors", - "h.66.input_layernorm.bias": "model_00068-of-00072.safetensors", - "h.66.input_layernorm.weight": "model_00068-of-00072.safetensors", - "h.66.mlp.dense_4h_to_h.bias": "model_00068-of-00072.safetensors", - "h.66.mlp.dense_4h_to_h.weight": "model_00068-of-00072.safetensors", - "h.66.mlp.dense_h_to_4h.bias": "model_00068-of-00072.safetensors", - "h.66.mlp.dense_h_to_4h.weight": "model_00068-of-00072.safetensors", - "h.66.post_attention_layernorm.bias": "model_00068-of-00072.safetensors", - "h.66.post_attention_layernorm.weight": "model_00068-of-00072.safetensors", - "h.66.self_attention.dense.bias": "model_00068-of-00072.safetensors", - "h.66.self_attention.dense.weight": "model_00068-of-00072.safetensors", - "h.66.self_attention.query_key_value.bias": "model_00068-of-00072.safetensors", - "h.66.self_attention.query_key_value.weight": "model_00068-of-00072.safetensors", - "h.67.input_layernorm.bias": "model_00069-of-00072.safetensors", - "h.67.input_layernorm.weight": "model_00069-of-00072.safetensors", - "h.67.mlp.dense_4h_to_h.bias": "model_00069-of-00072.safetensors", - "h.67.mlp.dense_4h_to_h.weight": "model_00069-of-00072.safetensors", - "h.67.mlp.dense_h_to_4h.bias": "model_00069-of-00072.safetensors", - "h.67.mlp.dense_h_to_4h.weight": "model_00069-of-00072.safetensors", - "h.67.post_attention_layernorm.bias": "model_00069-of-00072.safetensors", - "h.67.post_attention_layernorm.weight": "model_00069-of-00072.safetensors", - "h.67.self_attention.dense.bias": "model_00069-of-00072.safetensors", - "h.67.self_attention.dense.weight": "model_00069-of-00072.safetensors", - "h.67.self_attention.query_key_value.bias": "model_00069-of-00072.safetensors", - "h.67.self_attention.query_key_value.weight": "model_00069-of-00072.safetensors", - "h.68.input_layernorm.bias": "model_00070-of-00072.safetensors", - "h.68.input_layernorm.weight": "model_00070-of-00072.safetensors", - "h.68.mlp.dense_4h_to_h.bias": "model_00070-of-00072.safetensors", - "h.68.mlp.dense_4h_to_h.weight": "model_00070-of-00072.safetensors", - "h.68.mlp.dense_h_to_4h.bias": "model_00070-of-00072.safetensors", - "h.68.mlp.dense_h_to_4h.weight": "model_00070-of-00072.safetensors", - "h.68.post_attention_layernorm.bias": "model_00070-of-00072.safetensors", - "h.68.post_attention_layernorm.weight": "model_00070-of-00072.safetensors", - "h.68.self_attention.dense.bias": "model_00070-of-00072.safetensors", - "h.68.self_attention.dense.weight": "model_00070-of-00072.safetensors", - 
"h.68.self_attention.query_key_value.bias": "model_00070-of-00072.safetensors", - "h.68.self_attention.query_key_value.weight": "model_00070-of-00072.safetensors", - "h.69.input_layernorm.bias": "model_00071-of-00072.safetensors", - "h.69.input_layernorm.weight": "model_00071-of-00072.safetensors", - "h.69.mlp.dense_4h_to_h.bias": "model_00071-of-00072.safetensors", - "h.69.mlp.dense_4h_to_h.weight": "model_00071-of-00072.safetensors", - "h.69.mlp.dense_h_to_4h.bias": "model_00071-of-00072.safetensors", - "h.69.mlp.dense_h_to_4h.weight": "model_00071-of-00072.safetensors", - "h.69.post_attention_layernorm.bias": "model_00071-of-00072.safetensors", - "h.69.post_attention_layernorm.weight": "model_00071-of-00072.safetensors", - "h.69.self_attention.dense.bias": "model_00071-of-00072.safetensors", - "h.69.self_attention.dense.weight": "model_00071-of-00072.safetensors", - "h.69.self_attention.query_key_value.bias": "model_00071-of-00072.safetensors", - "h.69.self_attention.query_key_value.weight": "model_00071-of-00072.safetensors", - "h.7.input_layernorm.bias": "model_00009-of-00072.safetensors", - "h.7.input_layernorm.weight": "model_00009-of-00072.safetensors", - "h.7.mlp.dense_4h_to_h.bias": "model_00009-of-00072.safetensors", - "h.7.mlp.dense_4h_to_h.weight": "model_00009-of-00072.safetensors", - "h.7.mlp.dense_h_to_4h.bias": "model_00009-of-00072.safetensors", - "h.7.mlp.dense_h_to_4h.weight": "model_00009-of-00072.safetensors", - "h.7.post_attention_layernorm.bias": "model_00009-of-00072.safetensors", - "h.7.post_attention_layernorm.weight": "model_00009-of-00072.safetensors", - "h.7.self_attention.dense.bias": "model_00009-of-00072.safetensors", - "h.7.self_attention.dense.weight": "model_00009-of-00072.safetensors", - "h.7.self_attention.query_key_value.bias": "model_00009-of-00072.safetensors", - "h.7.self_attention.query_key_value.weight": "model_00009-of-00072.safetensors", - "h.8.input_layernorm.bias": "model_00010-of-00072.safetensors", - "h.8.input_layernorm.weight": "model_00010-of-00072.safetensors", - "h.8.mlp.dense_4h_to_h.bias": "model_00010-of-00072.safetensors", - "h.8.mlp.dense_4h_to_h.weight": "model_00010-of-00072.safetensors", - "h.8.mlp.dense_h_to_4h.bias": "model_00010-of-00072.safetensors", - "h.8.mlp.dense_h_to_4h.weight": "model_00010-of-00072.safetensors", - "h.8.post_attention_layernorm.bias": "model_00010-of-00072.safetensors", - "h.8.post_attention_layernorm.weight": "model_00010-of-00072.safetensors", - "h.8.self_attention.dense.bias": "model_00010-of-00072.safetensors", - "h.8.self_attention.dense.weight": "model_00010-of-00072.safetensors", - "h.8.self_attention.query_key_value.bias": "model_00010-of-00072.safetensors", - "h.8.self_attention.query_key_value.weight": "model_00010-of-00072.safetensors", - "h.9.input_layernorm.bias": "model_00011-of-00072.safetensors", - "h.9.input_layernorm.weight": "model_00011-of-00072.safetensors", - "h.9.mlp.dense_4h_to_h.bias": "model_00011-of-00072.safetensors", - "h.9.mlp.dense_4h_to_h.weight": "model_00011-of-00072.safetensors", - "h.9.mlp.dense_h_to_4h.bias": "model_00011-of-00072.safetensors", - "h.9.mlp.dense_h_to_4h.weight": "model_00011-of-00072.safetensors", - "h.9.post_attention_layernorm.bias": "model_00011-of-00072.safetensors", - "h.9.post_attention_layernorm.weight": "model_00011-of-00072.safetensors", - "h.9.self_attention.dense.bias": "model_00011-of-00072.safetensors", - "h.9.self_attention.dense.weight": "model_00011-of-00072.safetensors", - "h.9.self_attention.query_key_value.bias": 
"model_00011-of-00072.safetensors", - "h.9.self_attention.query_key_value.weight": "model_00011-of-00072.safetensors", - "ln_f.bias": "model_00072-of-00072.safetensors", - "ln_f.weight": "model_00072-of-00072.safetensors", - "word_embeddings.weight": "model_00001-of-00072.safetensors", - "word_embeddings_layernorm.bias": "model_00001-of-00072.safetensors", - "word_embeddings_layernorm.weight": "model_00001-of-00072.safetensors" - } -} diff --git a/model_00001-of-00072.safetensors b/model_00001-of-00072.safetensors deleted file mode 100644 index aed2346478cce088e440b475228d5ebb5e40863f..0000000000000000000000000000000000000000 --- a/model_00001-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:069a00c03b0397f9befb533e6b0f68072cb550d69050868d16d76450c8000357 -size 7193289031 diff --git a/model_00002-of-00072.safetensors b/model_00002-of-00072.safetensors deleted file mode 100644 index 98a5376fea468322162ec82601ef188aa71832f7..0000000000000000000000000000000000000000 --- a/model_00002-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a59f21045cc81d2e8f72ba46ce16f0688f5f6c58e8344ebfba16ef9f09a94eb0 -size 4932875549 diff --git a/model_00003-of-00072.safetensors b/model_00003-of-00072.safetensors deleted file mode 100644 index 4399f4adadfe882c75bd82ddbe0ce3be233f5276..0000000000000000000000000000000000000000 --- a/model_00003-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:204922e4d94bb9d9f30c5cb4fc3b35ee5fd2325c7f9840783de220d2d42766ae -size 4932875551 diff --git a/model_00004-of-00072.safetensors b/model_00004-of-00072.safetensors deleted file mode 100644 index 82e3d9eec347eed402bdc2b17967325c23576950..0000000000000000000000000000000000000000 --- a/model_00004-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce3db7dcd528ec2266b9062a1082c545eaa7a6dd06631e85c962c1afd59e24ab -size 4932875557 diff --git a/model_00005-of-00072.safetensors b/model_00005-of-00072.safetensors deleted file mode 100644 index 9df3c3c2c684c8c081ba392faac202ff3507a9c6..0000000000000000000000000000000000000000 --- a/model_00005-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:53d8499b65f093087058f490a1d7c19104d846bda9f6625362f149102fdc1d1e -size 4932875509 diff --git a/model_00006-of-00072.safetensors b/model_00006-of-00072.safetensors deleted file mode 100644 index 2a0e8a9c37793d1c963c28f9967db39151131948..0000000000000000000000000000000000000000 --- a/model_00006-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:07454a1127747cf58503bd9a3e7671b3adf8d0816d289e08d7e6662f4c2e1824 -size 4932875553 diff --git a/model_00007-of-00072.safetensors b/model_00007-of-00072.safetensors deleted file mode 100644 index fd9812ddc62dca71e3015a65e240c6ae37ab121c..0000000000000000000000000000000000000000 --- a/model_00007-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4c991a16871f33406e8c98bf4e7da09447e4d067d3f5056d4fe16daa7a72ab9 -size 4932875551 diff --git a/model_00008-of-00072.safetensors b/model_00008-of-00072.safetensors deleted file mode 100644 index a56e8abfc86219226cb437f0515a3e1fef00a949..0000000000000000000000000000000000000000 --- a/model_00008-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:1ce5257fad9f05ed3399d1736bc0a5c097ad34b2c58e4a97d09a479194932fd2 -size 4932875519 diff --git a/model_00009-of-00072.safetensors b/model_00009-of-00072.safetensors deleted file mode 100644 index 98fe6b57d56ff3cacb105b7912fd8c3471d5f5c1..0000000000000000000000000000000000000000 --- a/model_00009-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:003aa8c59f07ec60f4599dbd10106e01fed184454d39046e467db47c0a879409 -size 4932875551 diff --git a/model_00010-of-00072.safetensors b/model_00010-of-00072.safetensors deleted file mode 100644 index 318fbffd648bc7ff9c1a00c38fa3ebad08e34417..0000000000000000000000000000000000000000 --- a/model_00010-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:095828fe07d6c44aa40065595da7800277b1d4079f2a966e841b314c8f3f0d9b -size 4932875541 diff --git a/model_00011-of-00072.safetensors b/model_00011-of-00072.safetensors deleted file mode 100644 index ab5e51d7883c292bc6d66e6a207f1b367dbbb97a..0000000000000000000000000000000000000000 --- a/model_00011-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:400a61941848f8a7960684671cd19d232ed9325336c419669012e2aadc9d9051 -size 4932875551 diff --git a/model_00012-of-00072.safetensors b/model_00012-of-00072.safetensors deleted file mode 100644 index c351caa8250b0b1459ba46a666709cdb61170944..0000000000000000000000000000000000000000 --- a/model_00012-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0f1392fce5a77cbfe29a2d680d82cf57b96e20466ac5efc7119eccd868006443 -size 4932875573 diff --git a/model_00013-of-00072.safetensors b/model_00013-of-00072.safetensors deleted file mode 100644 index af273759953b30559a591b3af25b778fc6893717..0000000000000000000000000000000000000000 --- a/model_00013-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c95c6eeb6aa593cdf3fb72239b021ce43714b13d60a89c064944e91afb4999d -size 4932875573 diff --git a/model_00014-of-00072.safetensors b/model_00014-of-00072.safetensors deleted file mode 100644 index 3257d4ad42009aebf545b866b84e7809077e48fc..0000000000000000000000000000000000000000 --- a/model_00014-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:982a3db07d4e7f49955519ac5ef9846424e72994514e24b7501d718ffaa732cc -size 4932875551 diff --git a/model_00015-of-00072.safetensors b/model_00015-of-00072.safetensors deleted file mode 100644 index 963ee034ed6699381ab6ace6cddb8da0a5906252..0000000000000000000000000000000000000000 --- a/model_00015-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c27838630d5bd45f9cff790ba2b4f998669be7c6afe94af7cb2b5eca8aab50f -size 4932875531 diff --git a/model_00016-of-00072.safetensors b/model_00016-of-00072.safetensors deleted file mode 100644 index 20da2b6b5dee4af991b5f1a39f612f5c50871ec7..0000000000000000000000000000000000000000 --- a/model_00016-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:47ae50301b94f38138ba06c9a1ae435e33893231464fc65937f1e702f96f1031 -size 4932875573 diff --git a/model_00017-of-00072.safetensors b/model_00017-of-00072.safetensors deleted file mode 100644 index 89a6713bbbc4fa8f9d6b5c96a7c63be74d931d77..0000000000000000000000000000000000000000 --- 
a/model_00017-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:41e389e47d117b2e95afd21a9f46f2cf25bf7652d34627e005d3168bdac62c13 -size 4932875563 diff --git a/model_00018-of-00072.safetensors b/model_00018-of-00072.safetensors deleted file mode 100644 index 591e4e5d58f095031fed31f1bb955b96ca4e3d9b..0000000000000000000000000000000000000000 --- a/model_00018-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ff513612c05e9cacbf264d28d4e1ae789a7c0ee754b7eb06fc22fc87cc3a14d -size 4932875573 diff --git a/model_00019-of-00072.safetensors b/model_00019-of-00072.safetensors deleted file mode 100644 index 889ad353eee2bfb2936dfd791f208ce750414318..0000000000000000000000000000000000000000 --- a/model_00019-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bec845aa268edd4eed7f6a158676bdf55a3b3df7a3b7a0a5ef31bb1004279ee6 -size 4932875555 diff --git a/model_00020-of-00072.safetensors b/model_00020-of-00072.safetensors deleted file mode 100644 index 318e3462d1a27f25c9ee14f7ec9c1f5c4704fb12..0000000000000000000000000000000000000000 --- a/model_00020-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9411792e3db49c214c32fe006b2798a3040c53cf96ab81b0da1d8fcb0dddf50d -size 4932875563 diff --git a/model_00021-of-00072.safetensors b/model_00021-of-00072.safetensors deleted file mode 100644 index eb117827cf46b7561382e48dbf7c1593d336cb18..0000000000000000000000000000000000000000 --- a/model_00021-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c48f8df4f00f199d4cd0364b2447988d9b72a5a4a4d22cb0cafddad841c42d8b -size 4932875553 diff --git a/model_00022-of-00072.safetensors b/model_00022-of-00072.safetensors deleted file mode 100644 index 7934ed9edc1dbe79200eda58a1049a771bf4f42c..0000000000000000000000000000000000000000 --- a/model_00022-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:67d8af807a9d47357b39882ecdf1cd2c1c04fec49b73ee6a72ba29caf3cc4321 -size 4932875573 diff --git a/model_00023-of-00072.safetensors b/model_00023-of-00072.safetensors deleted file mode 100644 index a5da02fd7dc4a7e0168d396ce95f917bba2066cd..0000000000000000000000000000000000000000 --- a/model_00023-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1e75d47b9b5683110da78e9497b8a634e4480ee9ccf4a27cd4911416fec16afc -size 4932875573 diff --git a/model_00024-of-00072.safetensors b/model_00024-of-00072.safetensors deleted file mode 100644 index 88773ef50a4e0c37442c628d4b8d56ca60f67a04..0000000000000000000000000000000000000000 --- a/model_00024-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c63189061ad724cc29735f11bc2ab6c86385e2f8916be3019e17a79388e9c45 -size 4932875561 diff --git a/model_00025-of-00072.safetensors b/model_00025-of-00072.safetensors deleted file mode 100644 index 45ab10e6d16e065a242b2223637930db35428175..0000000000000000000000000000000000000000 --- a/model_00025-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aaa686ebecf614dccc73b953160ccbc29b1e63fde53821aa1408bde478e1707b -size 4932875553 diff --git a/model_00026-of-00072.safetensors b/model_00026-of-00072.safetensors deleted file mode 100644 index 
40e0560cc41e03d17de3797c6efc428dae47951f..0000000000000000000000000000000000000000 --- a/model_00026-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:df99b77709af12a4cf78d140b0a354d89a07d01e8b4e1e5a2d51cc44c34c221d -size 4932875563 diff --git a/model_00027-of-00072.safetensors b/model_00027-of-00072.safetensors deleted file mode 100644 index 8365f6a43d61ed08604af18625d2705b59f103fc..0000000000000000000000000000000000000000 --- a/model_00027-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e25903c2f7fe2a310840a30466c36affc521f0d755e747b737bc4ae6b99dbed2 -size 4932875563 diff --git a/model_00028-of-00072.safetensors b/model_00028-of-00072.safetensors deleted file mode 100644 index b34d6727824363b6d5f5ba43d67b722e10fd18b1..0000000000000000000000000000000000000000 --- a/model_00028-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ef5dd81d468f2ca297e5f31230ec02c7810f65529dc5c51ad897ce7d734cba9 -size 4932875573 diff --git a/model_00029-of-00072.safetensors b/model_00029-of-00072.safetensors deleted file mode 100644 index bf5116747fb812a6f9a37e623415a9d562f69356..0000000000000000000000000000000000000000 --- a/model_00029-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e79599b306d14e2a81eaa3478fd0aa92d493df310b03f494a6be44fe523e4c05 -size 4932875563 diff --git a/model_00030-of-00072.safetensors b/model_00030-of-00072.safetensors deleted file mode 100644 index a8f724167d784688438de3e21255a1d76e5cd400..0000000000000000000000000000000000000000 --- a/model_00030-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88d6c77f313ec50f7bc229a8979cdef707ef870ee547b5b99a6af61ac3de2710 -size 4932875531 diff --git a/model_00031-of-00072.safetensors b/model_00031-of-00072.safetensors deleted file mode 100644 index 81e86c4b12e84a0a426d2525d6e6549c0071ddfe..0000000000000000000000000000000000000000 --- a/model_00031-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88e905d0328f4f1e5c93c0ca2c3e5dbf513b2a4509395dc56b16aac400be1050 -size 4932875555 diff --git a/model_00032-of-00072.safetensors b/model_00032-of-00072.safetensors deleted file mode 100644 index 0f8881d962b52b466fdd6a3bff9b23d9a8b59171..0000000000000000000000000000000000000000 --- a/model_00032-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:173da9d59ec2731fd9110e571aa1d1926ff4a3501d34a4511f006caed253d719 -size 4932875573 diff --git a/model_00033-of-00072.safetensors b/model_00033-of-00072.safetensors deleted file mode 100644 index 8d0b77157adf4ef1b1412d72706d37d498755724..0000000000000000000000000000000000000000 --- a/model_00033-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:066778f8acabce366e4f5259223af642639a87aaf9cb6aa8b6272944a96184f5 -size 4932875573 diff --git a/model_00034-of-00072.safetensors b/model_00034-of-00072.safetensors deleted file mode 100644 index 6a463bb72575d671c7bfc7982cf7de9acf6bed41..0000000000000000000000000000000000000000 --- a/model_00034-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b0f29133ff93cd564ed5b0c3b90de547c0b09ab45a09eb86d7189b73df848ac -size 4932875573 diff --git a/model_00035-of-00072.safetensors 
b/model_00035-of-00072.safetensors deleted file mode 100644 index b58c7974bf9336a2892c989bb85770ed4212d831..0000000000000000000000000000000000000000 --- a/model_00035-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e85b9970d999df06ac3589084fadcf8c54abc4c52a5042e63c77c5ca2fd83b13 -size 4932875573 diff --git a/model_00036-of-00072.safetensors b/model_00036-of-00072.safetensors deleted file mode 100644 index d8c21c8ec120e3b50ce9ca0f4652a2cac529bb35..0000000000000000000000000000000000000000 --- a/model_00036-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:29292e40478d4c3518f2a88e1b66c41bcf52b8e069ab3eadb1396bc68e4056ff -size 4932875563 diff --git a/model_00037-of-00072.safetensors b/model_00037-of-00072.safetensors deleted file mode 100644 index 5e03feaf9555740ea599b4b5980998d48e1cdf0a..0000000000000000000000000000000000000000 --- a/model_00037-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23a611b2b8c6f50cb1e946f1145d25f38301a26a5e5afd8012eb3f3a4e37f96d -size 4932875573 diff --git a/model_00038-of-00072.safetensors b/model_00038-of-00072.safetensors deleted file mode 100644 index c461a911347f6fc1d778a216ad67c8da843a991b..0000000000000000000000000000000000000000 --- a/model_00038-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:21d97c75a684ee95a80d5f6a28bd83dc228b893b7c38b3e644b5ecd94853f6e6 -size 4932875573 diff --git a/model_00039-of-00072.safetensors b/model_00039-of-00072.safetensors deleted file mode 100644 index 2cccadd18ba2b187437d87131835dd5feddc0d09..0000000000000000000000000000000000000000 --- a/model_00039-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fa116a4705a58ec525ce6cd6472c2960e83e86c4242232d3e361496161c165d -size 4932875555 diff --git a/model_00040-of-00072.safetensors b/model_00040-of-00072.safetensors deleted file mode 100644 index c3fd6d896e6dfdddcf0690eda226de1816934ccb..0000000000000000000000000000000000000000 --- a/model_00040-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:652c454c76f90fc1d0cfc4c8efc023538495f1635fae0892b368c41d64b05daf -size 4932875533 diff --git a/model_00041-of-00072.safetensors b/model_00041-of-00072.safetensors deleted file mode 100644 index d74e27a648b14ad733fefb767670185955017020..0000000000000000000000000000000000000000 --- a/model_00041-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75e3775f02853b3cf39c2d1a29d14762b53ad4264327d13077673c467822de22 -size 4932875573 diff --git a/model_00042-of-00072.safetensors b/model_00042-of-00072.safetensors deleted file mode 100644 index f0b3179495083a224166b397f46ac2aaaadd23f4..0000000000000000000000000000000000000000 --- a/model_00042-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb297c192bec234a20ef1b64d8db4a5b19e6ead67c26923016b2011a3c68ef8d -size 4932875521 diff --git a/model_00043-of-00072.safetensors b/model_00043-of-00072.safetensors deleted file mode 100644 index 5e2a227f84c1cd76e930410a3b8398718d7b2873..0000000000000000000000000000000000000000 --- a/model_00043-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d947feb8a1521370231d89eab5502110c27678be228768759f22ea543a04e494 
-size 4932875573 diff --git a/model_00044-of-00072.safetensors b/model_00044-of-00072.safetensors deleted file mode 100644 index 6b377b6820dded289f5afc38cef58956d1ac2825..0000000000000000000000000000000000000000 --- a/model_00044-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba1c110a9f7a5395502e2812d5334fb4f8b618ac2c19277c3b340a75ff25e8e3 -size 4932875555 diff --git a/model_00045-of-00072.safetensors b/model_00045-of-00072.safetensors deleted file mode 100644 index b221ea7609adf600c550eb6b94d08965f2e01547..0000000000000000000000000000000000000000 --- a/model_00045-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da3fbda96267fecb61b573f764dec3f47ae6304c007cafae20d1f938f17d0e6f -size 4932875569 diff --git a/model_00046-of-00072.safetensors b/model_00046-of-00072.safetensors deleted file mode 100644 index f59b3ec5f9a17bd391d507f1d9cc063b1c5f10e9..0000000000000000000000000000000000000000 --- a/model_00046-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db5eb422b2ee78b6491179e118828075d0153f800b0a59314f102b2c7c097bff -size 4932875563 diff --git a/model_00047-of-00072.safetensors b/model_00047-of-00072.safetensors deleted file mode 100644 index e387c95b3d0f82166beb2bb49eb2dd1ef7a62c4c..0000000000000000000000000000000000000000 --- a/model_00047-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1d231f84c18ca2f88b61dff8ec3d00f4d58fc0d780a52bd66f2dd158d5c148 -size 4932875573 diff --git a/model_00048-of-00072.safetensors b/model_00048-of-00072.safetensors deleted file mode 100644 index e30e05f7a4280981f245bf6fcd2fffa7575ff588..0000000000000000000000000000000000000000 --- a/model_00048-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:085dc3aaad679e4c80dfec734c8a5a5ef42e785938a4f7bfe0f92580ca184ee6 -size 4932875555 diff --git a/model_00049-of-00072.safetensors b/model_00049-of-00072.safetensors deleted file mode 100644 index 28523aae8467103277ac40843210521c6beabdc1..0000000000000000000000000000000000000000 --- a/model_00049-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:72976131df3b562c0d90bfbd34d7714b8023423eb591fcb5eaa235504535daf4 -size 4932875541 diff --git a/model_00050-of-00072.safetensors b/model_00050-of-00072.safetensors deleted file mode 100644 index a1d26094941b53cf7b93380eb7ebd63c45d0b7b3..0000000000000000000000000000000000000000 --- a/model_00050-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:695fb74e67a57ba80ebbf05799f28135012293b1192f96a6965bf070d299e9f8 -size 4932875551 diff --git a/model_00051-of-00072.safetensors b/model_00051-of-00072.safetensors deleted file mode 100644 index 9df7f8f46ae2aeab98a18e096c87a30990ad7f6c..0000000000000000000000000000000000000000 --- a/model_00051-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a2b0d15e4276b736066b476d7ae99d2705a2da6358a2cae0bed7b157d5d1e8d9 -size 4932875573 diff --git a/model_00052-of-00072.safetensors b/model_00052-of-00072.safetensors deleted file mode 100644 index b2a76e6a0f4d7508ddbc1f4eda7d5975c991456b..0000000000000000000000000000000000000000 --- a/model_00052-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:2f3d6fca86403bef9bc5ff684f012d7766fb151f1ae7c6eba8b62fb1f2adc49b -size 4932875549 diff --git a/model_00053-of-00072.safetensors b/model_00053-of-00072.safetensors deleted file mode 100644 index 86a399a39ca174e61f8f86ade0ced53f5c415b99..0000000000000000000000000000000000000000 --- a/model_00053-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7cb278a233844db394e78708af7bf664f7039d480e8c4350222dddd0a230806b -size 4932875527 diff --git a/model_00054-of-00072.safetensors b/model_00054-of-00072.safetensors deleted file mode 100644 index e01e4a34cd268ed98ea2b71773fd0b705bd4c7a9..0000000000000000000000000000000000000000 --- a/model_00054-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e9d7dba17a7e77f204d64aff6d41975429ebcdb8272c8d1bd7e0b5b018d430d2 -size 4932875573 diff --git a/model_00055-of-00072.safetensors b/model_00055-of-00072.safetensors deleted file mode 100644 index 3ce131b27a8e173b5f59d5b6e2f776ca17fa9371..0000000000000000000000000000000000000000 --- a/model_00055-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f08e2483bc21d6cd80436b3057fee037e4c913d8742b3f2a852a8e4a5fcec0d9 -size 4932875563 diff --git a/model_00056-of-00072.safetensors b/model_00056-of-00072.safetensors deleted file mode 100644 index d210cb98bdc5a98bfdd51ea56b6cbb4e0b99dab4..0000000000000000000000000000000000000000 --- a/model_00056-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9fa9e44ff040c33ea4e637589223f0da948287994981e3dc1d636d3e6330fb31 -size 4932875565 diff --git a/model_00057-of-00072.safetensors b/model_00057-of-00072.safetensors deleted file mode 100644 index b47f2ccf8bafd338366d7b75fd100b7332718e10..0000000000000000000000000000000000000000 --- a/model_00057-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e4a0722c2be186b3c7e2b46d9fc4212b28ea3c261a4ed4f937ff74764a462de -size 4932875573 diff --git a/model_00058-of-00072.safetensors b/model_00058-of-00072.safetensors deleted file mode 100644 index d7db16faeea7fd3cbcc79e89e5a8562d69232cdb..0000000000000000000000000000000000000000 --- a/model_00058-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1324752aa457baabbe0199768e10f5aa126543794a36b072eae0ade4e0312598 -size 4932875573 diff --git a/model_00059-of-00072.safetensors b/model_00059-of-00072.safetensors deleted file mode 100644 index 2a5e18920458b80a3331b967aeccd045985a0667..0000000000000000000000000000000000000000 --- a/model_00059-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1210cb5b3f55e9eaa1123c8a5da740a3f2f7441bd7c439b66edb0762b4f4a328 -size 4932875563 diff --git a/model_00060-of-00072.safetensors b/model_00060-of-00072.safetensors deleted file mode 100644 index 65259ef2a40c487b210f3ad6d74fc7ea5412b00a..0000000000000000000000000000000000000000 --- a/model_00060-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f9621c5e1481f20ecae15709309db04883b848f3378713731a3a0584dc7d634d -size 4932875527 diff --git a/model_00061-of-00072.safetensors b/model_00061-of-00072.safetensors deleted file mode 100644 index e9c60c11b4b5b91d427784a89430b0a9992d1872..0000000000000000000000000000000000000000 --- a/model_00061-of-00072.safetensors +++ /dev/null 
@@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f0c18975077e977ff38a4432786ed68d2b6878c71131105829c8a73e9b53e837 -size 4932875565 diff --git a/model_00062-of-00072.safetensors b/model_00062-of-00072.safetensors deleted file mode 100644 index 783d1931f5ff99f8b0cb2e0494796c1386ffd6b4..0000000000000000000000000000000000000000 --- a/model_00062-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ca1c626087d1ebd1feb40241289776838ab0f3a227b12554439c745de6b26bd -size 4932875573 diff --git a/model_00063-of-00072.safetensors b/model_00063-of-00072.safetensors deleted file mode 100644 index 5b435d2891c55ed83631bdb4b3e09cbd04512b1f..0000000000000000000000000000000000000000 --- a/model_00063-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b845dd611ed3c5cbe66522cf7945610ecef5a59f71a1af5d5482966c6140cefe -size 4932875565 diff --git a/model_00064-of-00072.safetensors b/model_00064-of-00072.safetensors deleted file mode 100644 index e128331d24a6e8c40edf3e5ca2107a580e709d64..0000000000000000000000000000000000000000 --- a/model_00064-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d4d330ffa91861259bd04c489cce52978e892a922620485f983694f5e40ebca0 -size 4932875537 diff --git a/model_00065-of-00072.safetensors b/model_00065-of-00072.safetensors deleted file mode 100644 index 1de170ce16534abb15400b659088d1d0def6d70c..0000000000000000000000000000000000000000 --- a/model_00065-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06941b3f42871c22592098196d067f95ec994c40bee1db36a2df25661ba111fe -size 4932875571 diff --git a/model_00066-of-00072.safetensors b/model_00066-of-00072.safetensors deleted file mode 100644 index 5017f51c14c7ed398ade7a5721cded9e4cff99bf..0000000000000000000000000000000000000000 --- a/model_00066-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6d232ef33521b522e82ef32c028a7b6e44abda264c6f96f1f12e9bb19e566f9f -size 4932875565 diff --git a/model_00067-of-00072.safetensors b/model_00067-of-00072.safetensors deleted file mode 100644 index 93d2fb276c92ac61954526b220f6a1ee64b16061..0000000000000000000000000000000000000000 --- a/model_00067-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:72e3b2f771658326d3cd996cb33f067732c6abd6ee1286e1589fff5f1490476a -size 4932875539 diff --git a/model_00068-of-00072.safetensors b/model_00068-of-00072.safetensors deleted file mode 100644 index 08ca7a07fc84a051d902a0c0bc1eae82c28a5df5..0000000000000000000000000000000000000000 --- a/model_00068-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b8f53b92e859c9a3e43ece6b880c6ff856bfce1330fca2128d0962d337f82d02 -size 4932875563 diff --git a/model_00069-of-00072.safetensors b/model_00069-of-00072.safetensors deleted file mode 100644 index 6921dc614e2b4fd58f875b1807d1b47f28cf9892..0000000000000000000000000000000000000000 --- a/model_00069-of-00072.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8337e26123044033391a46e5b16482e2d1a346be6c285ff5667994e118309169 -size 4932875573 diff --git a/model_00070-of-00072.safetensors b/model_00070-of-00072.safetensors deleted file mode 100644 index 
43c3b0cae25be03f140fc544fccbf701a848dcd8..0000000000000000000000000000000000000000
--- a/model_00070-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e01486eb1cb0d2f1fba83201184096bcb82286955d6844dc328087f291e18b2e
-size 4932875553
diff --git a/model_00071-of-00072.safetensors b/model_00071-of-00072.safetensors
deleted file mode 100644
index 6139e6c3742e606bb749af23e3a920d9357e170a..0000000000000000000000000000000000000000
--- a/model_00071-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:664c40c92e29491cd5fdd96c7f52a55fc0ba90c163f5be651584465de4c68a85
-size 4932875557
diff --git a/model_00072-of-00072.safetensors b/model_00072-of-00072.safetensors
deleted file mode 100644
index 17ce2927a95aa695573e1791410ff4cbc6cfd925..0000000000000000000000000000000000000000
--- a/model_00072-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:50ed9f636610d9d87c142f54067554ad15a68c20b57f0ad035cf9abebb56a0a6
-size 57530
diff --git a/tokenizer_config.json b/tokenizer_config.json
index 57576ae0ccd46e2a851bcfb912454077fb374c7e..915a9be0734f6da4bca391d453ca1bb602c6dbdc 100644
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -1 +1,2 @@
-{"unk_token": "<unk>", "eos_token": "</s>", "bos_token": "<s>", "pad_token": "<pad>", "name_or_path": "bigscience/tokenizer", "special_tokens_map_file": null, "tokenizer_class":"BloomTokenizerFast", "padding_side":"left"}
+{"unk_token": "<unk>", "eos_token": "</s>", "bos_token": "<s>", "pad_token": "<pad>", "name_or_path": "bigscience/tokenizer", "special_tokens_map_file": null, "tokenizer_class":
+"PreTrainedTokenizerFast", "padding_side":"left"}
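
Note for readers skimming this diff: the JSON deleted earlier in the change set is a weight map that ties each parameter name (for example "ln_f.weight" or "h.69.mlp.dense_4h_to_h.weight") to the shard file that holds it, which lets a loader open only the shards it actually needs. The following is a minimal sketch of how such an index is typically consumed; the local index filename and the "weight_map" key are assumptions for illustration, not confirmed by this diff, and this is not the loading code used by transformers. Separately, the tokenizer_config.json hunk swaps tokenizer_class from BloomTokenizerFast to the generic PreTrainedTokenizerFast; both wrap the same tokenizer.json, so basic encode/decode should behave the same, although AutoTokenizer will no longer select the Bloom-specific class automatically.

# Sketch: consuming a sharded safetensors checkpoint via its weight map.
# Illustrative only; index filename, "weight_map" key, and paths are assumed.
import json

from safetensors import safe_open

INDEX_PATH = "model.safetensors.index.json"  # hypothetical local copy of the deleted index

def load_tensor(name: str):
    """Look up which shard file holds `name`, then read only that shard."""
    with open(INDEX_PATH) as f:
        index = json.load(f)
    shard_file = index["weight_map"][name]  # e.g. "model_00072-of-00072.safetensors"
    with safe_open(shard_file, framework="pt") as shard:
        return shard.get_tensor(name)

# Example: per the map above, the final layer norm lives in the last shard.
ln_f_weight = load_tensor("ln_f.weight")
print(tuple(ln_f_weight.shape))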