lrl-modelcloud committed on
Commit
d809459
1 Parent(s): 8c89a11

Upload folder using huggingface_hub (#2)


- 105f1178f9d8b66bfdb8d6aeace71d87a44882c93dc1a364951edf3bed5b7fed (4469e6cebb072c49afb19891bb1a39938256c32c)
- 8e0288f8b29a72b2b789f8ee9ef0f1d37afed293ac673c71dd23de9fefb803d6 (6522af42bbc4f9ddec800957f5040b4b37234f8a)
- 7560fe9b264ad4b595838c002a9b7055e3681319cfeb309ac10b11266196080a (6c92710166b0a0c0d4da211136419aaf8b9dd645)
- 3f2c528bc240a676a1082f61de1324301c08251cfbb6acc8eab451682b2182f1 (e959a697ce2dde9767fb6e0c9b5a9184b4813467)
- 113d4616586d81e5b29178fea579ad5a9f9d2a93b4fad2fb6c2bd68a000e16b2 (c7cdb1262588cf8298197381834df203b788bea2)
- 51bbd95fb593c1e8a136ac9d4f19bb4b0717ef9c1405cba883c02c3c836e8a94 (50ab2cb8524ccdd797e658c35307bb72335640cd)
- 8ad6e0f2cf13660450ce44366813e4e9945122f741e988d88b5f63e24c1e780a (7061eae46fbdff8f296a0f8cec7ac1dd6d8f5731)
- 249fcafe317fe688216accde549c582c77256d95a00c8c860e39ee5dcdc1ec3c (fb1517e3001b1393d15c76e2aeee22a3df121784)
- bd798f3f39314cc2ddf4e84ba55da3f5a3c486f3217b6b877356f2209f973899 (4a3e2e6f8e798df20c97083ad2cba535a0d04900)
- 694356148ecfeecf35a16d28f789fe7d35c36f5c292757e27038b35cf3529790 (e419bfb933ceb9c4843b6d6b250f29af140047d0)
- c2fdeaa8fd250a6d83594539757c2c2ef70dcea353326d00effa0bd73d744bac (d3f0bd86195bbc44a0317760036534287864eb53)
- f82470d211e7ccd1cee116bf69ddc54748ec338cc4f642915d994c5861c98eef (704ba3399b346689ee33412c9ef1f664fea895ed)
- 5843f1d77eb70e28c96bd10f0e459b584dca6ecb8b442b8e9a8e592097d54dcb (ee2a1c1088422d6019d5dcd0387e45d477898e58)
- b81e0ecaab94a01095456e12799428eaea7e1fd28cb49c49f920d498b1cfb83f (c26051d373a19b1448a583581681353e45d3f9a1)
- ddcba34b76bc2752e40fa7b6efaf7bf588a401576dfa3aacc7c1c09ebc042aa0 (9bb2a601a0addc0252039b6d01da2098eb669c95)
- 4b7b956dc227b6ec28aa950d8648bef9b7395958aee2541acd4d104352de276e (1484d0d14c5ba4330ca031cd17bc700b9e66c381)
- da6c84c653e90f4366a59652fa2e56d108506e803a235fc5882c9c474298ec63 (7edd69e4f7e5225fcaedbba6933597893642ba5c)
- bd65ef470d916351b344f1e9352a55ec75f41dddcea743ecf2c99e9955a69c99 (e91acc7cbf6d80ea24fa318976628c1828974050)
- 4c70ce4344fabec195450a1705006e2e7bd2bce13e257709040244038c1d819a (cb39d975475b4c126f25c0e4e36cf5e22f6f717d)
- edcbe4b8923b070db9ea4d8c0b9e95aed56d00125c84145b12786806fd2e5cb3 (dcb366a0191bab119f62347bf9cc56dc5460af2e)
- 4960442ece6d6e86446161206bb506f80d00e1cc78cca059ea44e10fba1f1c8b (e807d24ac21156e7358c5ad13bbb3fda7a77f51e)
- c56b2d7ea34183445d32ceb34148ca2f25f579f204a0f5dd1de064edb500e3d7 (9f2990d137daaef900d8e284dde2b75b952184c9)
- a3845ff13533bc9656e226d3da4e0d12ce0951f864a5e42b8af7688d0cad6394 (661176789bd9c32bf4a552c14ce4a7dc34c6c494)
- 219e9050e5931b3f8f618075c87feadb4c00bf6192d06a47b75f6134edda40ff (97fcd3c1834000c7821e6f80104a7d9a2c927784)
- 32823684c8d23182a644e0fabcb4494bc0432a1ce8decc43796a151ce3461da6 (e1a1e6362ffc8b821203ad9711e3f80fc5d22846)
- 357c32f90fe9da08fb4ca1c8d243636766551358918ea669ab5f8f58926b567c (6581ab2859647ded6fa2675b8c1e449e0146a2f8)
- ceb186e57582dd86b2924ebbd130c7a86ec5f602806a871d33bbddef7c62992e (3b9551b7f3376458760001eb04e26a390de496a9)
- de14ddb3914ab39b44cd0fce8ef69912ed6c6e70b0a7de0877fecd12500ed98d (7fb926d9a252f056e7ea59ad3bf6bdf7e1866830)
- 87e869d7f1bc09eb692757974f8e1bbad935c49a6b912b516b7fab6635c2737d (b4d4fc88d2693b0916c74f14724058a3b78e345f)
- 3bf9b1e66836776b69f108b671807028d16566bb6bf11e73cd38cd18615247a5 (521b06c76977ceb60b10a5540fce75781902731f)
- 9cc9da92b7486056bf80bc841ce876c7d769ba68d43e2a25faf973487681c142 (0446b17469d1395100829059897095fd4843cd3c)
- e1fd7e2e15394223d44185a2c131b19acd5f2b25708ad5c4b1ca7f336e15bb09 (f8f75c6ce93d222953187ba2fdc7e4c9e16a70c6)
- 45fec8464d770d8ad8c8b525d692b4e5b8129223742b133cdd0a8dfd14ecccce (5ca196be9ebaf3a7ee62a8ff7d6582886d4626d5)
- d9a94592bd4cd790a51687de8cb052b569edbb5e1727477862234bb7d1a384e6 (88e98173a67e43100eeb91df6fb1f007a0c9321d)
- eaf75a1d942c468d1f429c5b6843107d13f0d27d66a884ae92234beb33e3edd5 (24991f171e6927cc81a37145f43226b17176218e)
- b0a2041f568db86f9cf2f8693db136d60d9e353d348fbc1f8d7e25d9cade12e4 (b5f145f0fcdb0337566557d17ba2a4c2a5f9cbea)
- 9be833f4a5691563507c997a7d7dcb5642bbb1ca5a2c6e215f7373e113e0f5c4 (990675a96d7209eb708747a1dca4583cd9d951e7)
- bee0bd188d8ccde632602cbd4fb165df91bf989ab6ac91756ab5c53c3fb94051 (ee66ccba1ed473a4319a71559ff0d95bcf23c2a2)
- 669f4b11ad6d583d6e639bcc24779e000805bece6bc909682543963a6cfeaf72 (3a329541262ba0ace6b8a4d201fb5efd68081c48)
- 5bb94bb9877c7c0e04b01572b68fd672607a066bd60bf549f5d673b61d24dcf9 (a114c6c7cafc369945040caca12a679aa4156268)
- 17ec2e9f474d62291e04b7177028b293760d2af6ec8f1b1d626cb4da86cd8f03 (36bcb4b1bff73206c13756d5f525c64b289e342b)
- 4881b32cae3096b54069d56857b10d2a615e136be77f8146f2665df2bd68aefe (3b72b6040a59fdc1aba74e4add3e1a29db66b047)
- f82fa822b269247bad7a8881e5e1fd0b073c25b04b5087ae74ae8bc9431db94a (ccee8d8f420ade950a07aea70dbc045c63643f7f)
- 404e08513b373c06ccdaa759b540ddbcd77d998eb9f3431dfdc84b89fb429c4f (c970c653765031965c1772cc2f92090604ebe1d1)
- 1892254fff70c78383b164d6ac9eff9cf432b8685c8f19e68ddbad86a6adbcc3 (24be0f8dd98b61603e1491dba5a590b3423bca66)
- 6d8f664c864cfa0b9f3844ac43d6030ed16bd4f42ab2b42b468ff0cf98246ef0 (270c547307591bff0613e866c4870985c2e3a09e)
- 382478dfd90a1f893c87fcae2dcd308b4459bf3bc51a1093edab543b1b0a4dd0 (bbbcfab9292ea8ca43d1fa769a25e5ed97cfb553)
- 77273d0a1c6d503e54737c79a4b60f39eb4fc28b86b4916da46f086ffee5260a (a61a83183842a52de53d9f4cfb4879b51dff52ab)
- e02cc9d20667e37018e0bf72f77cd696e03ed422e8fc34d5f0d8cea39b756558 (12a1afff85f609cd659507aee452010a86374646)
- 119556f42f9d9b485e426f0156df2b4bfe5a4cdb2d2eb6015f5ff9a4aa59bde5 (d8e8a3b0ab52f038a041752c12704ef697278ee7)
- 6806b3935aacd07f05c5b7cba5967f7545fa6d8d2bee1e4e75f93ff6f51550be (44926ffd2b7050a306725fbe0e11061ad35b9134)
- dd04a1956d17542de4f27dc0a35579a52c48b3e5e794afad60549c39269dd3d3 (4a1b4a2bda1e74e1083718121029a81a3a1ff1e3)
- e64f1ed46e0006b804989983e6ea55e886253266f315ef746613e2a9dd9ece23 (7e3700d11739ee3aa5bc10c6e8696e28b7adae88)
- a412e94a4cd272b33678ade13e4b04a30330159d9f970b59bc685af003e5811a (228911a525eeccfa6bc460a7ca2624b246b8a982)
- d811614d6b7e24f7a12d39c93a6c5237201fcb3151e15a638e283c84b490e37f (21e0452b1d9673ef54626844f22bd7c69b7639f5)
- e76e612bcca95d39ac6d7d8c328bcb08493293a7985deb4f157ce2cac9135f49 (3e66aae99088e0c5a3c150f62607c8054d97f51b)
- a45781161db45b4e965e38302e8bb3f9a67750fbb0e6cfc0f614bf90af52679f (f0a0b82eea0e984c8bb5c5e49ed014026ab7f2aa)
- 5db6743cb290ec2073ac7245bb6f3dfb39e285f835d9edcc43fdc4e63f02358f (05fc3bf1893a16f1b39d0deae1cae049d3f9889f)
- 372664a11a6d5225663ebbd784460491c9b3eceebc736a04668ed87687a0723a (21a0a5e1757e10c3f9e5f2b2ee40a017e63d78ba)
- 6a715d21a640564947ec47d36590def3600e37451ad1c7e17c5794e866b9d25d (b888c36438b9192a414db2cff373a0ba4368d594)
- 2ba83c8624b6f4894287e981390ac00418eae754d8944a6ca40c77f0b1e4ed4c (7cdf1b06af4b4961fb35d600557eab93dc3c0cbb)
- f90ce0b8fdbca174da1b904d6f72ba4c2624cd89bbe3461d64e899f5b7f65f03 (f66f59442fa825a2a993beda0c7793ed0be05729)

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. LICENSE.txt +176 -0
  2. NOTICE.txt +1 -0
  3. README.md +172 -3
  4. config.json +38 -0
  5. configuration_dbrx.py +264 -0
  6. generation_config.json +7 -0
  7. model-00001-of-00061.safetensors +3 -0
  8. model-00002-of-00061.safetensors +3 -0
  9. model-00003-of-00061.safetensors +3 -0
  10. model-00004-of-00061.safetensors +3 -0
  11. model-00005-of-00061.safetensors +3 -0
  12. model-00006-of-00061.safetensors +3 -0
  13. model-00007-of-00061.safetensors +3 -0
  14. model-00008-of-00061.safetensors +3 -0
  15. model-00009-of-00061.safetensors +3 -0
  16. model-00010-of-00061.safetensors +3 -0
  17. model-00011-of-00061.safetensors +3 -0
  18. model-00012-of-00061.safetensors +3 -0
  19. model-00013-of-00061.safetensors +3 -0
  20. model-00014-of-00061.safetensors +3 -0
  21. model-00015-of-00061.safetensors +3 -0
  22. model-00016-of-00061.safetensors +3 -0
  23. model-00017-of-00061.safetensors +3 -0
  24. model-00018-of-00061.safetensors +3 -0
  25. model-00019-of-00061.safetensors +3 -0
  26. model-00020-of-00061.safetensors +3 -0
  27. model-00021-of-00061.safetensors +3 -0
  28. model-00022-of-00061.safetensors +3 -0
  29. model-00023-of-00061.safetensors +3 -0
  30. model-00024-of-00061.safetensors +3 -0
  31. model-00025-of-00061.safetensors +3 -0
  32. model-00026-of-00061.safetensors +3 -0
  33. model-00027-of-00061.safetensors +3 -0
  34. model-00028-of-00061.safetensors +3 -0
  35. model-00029-of-00061.safetensors +3 -0
  36. model-00030-of-00061.safetensors +3 -0
  37. model-00031-of-00061.safetensors +3 -0
  38. model-00032-of-00061.safetensors +3 -0
  39. model-00033-of-00061.safetensors +3 -0
  40. model-00034-of-00061.safetensors +3 -0
  41. model-00035-of-00061.safetensors +3 -0
  42. model-00036-of-00061.safetensors +3 -0
  43. model-00037-of-00061.safetensors +3 -0
  44. model-00038-of-00061.safetensors +3 -0
  45. model-00039-of-00061.safetensors +3 -0
  46. model-00040-of-00061.safetensors +3 -0
  47. model-00041-of-00061.safetensors +3 -0
  48. model-00042-of-00061.safetensors +3 -0
  49. model-00043-of-00061.safetensors +3 -0
  50. model-00044-of-00061.safetensors +3 -0
LICENSE.txt ADDED
@@ -0,0 +1,176 @@
+ Databricks Open Model License
+
+ By using, reproducing, modifying, distributing, performing or displaying
+ any portion or element of DBRX or DBRX Derivatives, or otherwise accepting
+ the terms of this Agreement, you agree to be bound by this Agreement.
+
+ Version Release Date: March 27, 2024
+
+
+ Section 1: Definitions
+
+ “Agreement” means these terms and conditions that govern the use, reproduction,
+ modification, distribution, performance or display of DBRX and/or DBRX
+ Derivatives and any terms and conditions incorporated by reference.
+
+ “Databricks” or “we” means Databricks, Inc.
+
+ “Licensee” or “you” means you, or your employer or any other person or entity
+ (if you are entering into this Agreement on such person or entity’s behalf),
+ of the age required under applicable laws, rules or regulations to provide
+ legal consent and that has legal authority to bind your employer or such other
+ person or entity if you are entering into this Agreement on their behalf.
+
+ “DBRX Derivatives” means all (i) modifications to DBRX, (ii) works based on
+ DBRX and (iii) any other derivative works thereof. Outputs are not deemed DBRX
+ Derivatives.
+
+ “DBRX” means the foundational large language models and software and
+ algorithms, including machine-learning model code, trained model weights,
+ inference-enabling code, training-enabling code, fine-tuning enabling code,
+ documentation and other elements of the foregoing identified by Databricks at
+ https://github.com/databricks/dbrx, regardless of the source that you obtained
+ it from.
+
+ “Output” means the results of operating DBRX or DBRX Derivatives.
+
+ As used in this Agreement, “including” means “including without limitation.”
+
+
+ Section 2: License Rights and Conditions on Use and Distribution
+
+ 2.1 Grant of Rights
+
+ You are granted a non-exclusive, worldwide, non-transferable and royalty-free
+ limited license under Databricks’ intellectual property or other rights owned
+ by Databricks embodied in DBRX to use, reproduce, distribute, copy, modify,
+ and create derivative works of DBRX in accordance with the terms of this
+ Agreement.
+
+ 2.2 Reproduction and Distribution
+
+ 1. All distributions of DBRX or DBRX Derivatives must be accompanied by a
+ "Notice" text file that contains the following notice: "DBRX is provided
+ under and subject to the Databricks Open Model License, Copyright ©
+ Databricks, Inc. All rights reserved."
+
+ 2. If you distribute or make DBRX or DBRX Derivatives available to a third
+ party, you must provide a copy of this Agreement to such third party.
+
+ 3. You must cause any modified files that you distribute to carry prominent
+ notices stating that you modified the files.
+
+ You may add your own intellectual property statement to your modifications of
+ DBRX and, except as set forth in this Section, may provide additional or
+ different terms and conditions for use, reproduction, or distribution of DBRX
+ or DBRX Derivatives as a whole, provided your use, reproduction, modification,
+ distribution, performance, and display of DBRX or DBRX Derivatives otherwise
+ complies with the terms and conditions of this Agreement. Any additional or
+ different terms and conditions you impose must not conflict with the terms of
+ this Agreement and in the event of a conflict, the terms and conditions of this
+ Agreement shall govern over any such additional or different terms and conditions.
+
+ 2.3 Use Restrictions
+
+ You will not use DBRX or DBRX Derivatives or any Output to improve any other
+ large language model (excluding DBRX or DBRX Derivatives).
+
+ You will not use DBRX or DBRX Derivatives:
+
+ 1. for any restricted use set forth in the Databricks Open Model Acceptable
+ Use Policy identified at
+ https://www.databricks.com/legal/acceptable-use-policy-open-model
+ ("Acceptable Use Policy"), which is hereby incorporated by reference into
+ this Agreement; or
+
+ 2. in violation of applicable laws and regulations.
+
+ To the maximum extent permitted by law, Databricks reserves the right to
+ restrict (remotely or otherwise) usage of DBRX or DBRX Derivatives that
+ Databricks reasonably believes are in violation of this Agreement.
+
+
+ Section 3: Additional Commercial Terms
+
+ If, on the DBRX version release date, the monthly active users of the products
+ or services made available by or for Licensee, or Licensee’s affiliates, is
+ greater than 700 million monthly active users in the preceding calendar month,
+ you must request a license from Databricks, which we may grant to you in our
+ sole discretion, and you are not authorized to exercise any of the rights under
+ this Agreement unless or until Databricks otherwise expressly grants you such
+ rights.
+
+ If you receive DBRX or DBRX Derivatives from a direct or indirect licensee as
+ part of an integrated end user product, then this section (Section 3) of the
+ Agreement will not apply to you.
+
+
+ Section 4: Additional Provisions
+
+ 4.1 Updates
+
+ Databricks may update DBRX from time to time, and you must make reasonable
+ efforts to use the latest version of DBRX.
+
+ 4.2 Intellectual Property
+
+ a. No trademark licenses are granted under this Agreement, and in connection
+ with DBRX or DBRX Derivatives, neither Databricks nor Licensee may use any name
+ or mark owned by or associated with the other or any of its affiliates, except
+ as required for reasonable and customary use in describing and redistributing
+ DBRX or DBRX Derivatives.
+
+ b. Subject to Databricks’ ownership of DBRX and DBRX Derivatives made by or for
+ Databricks, with respect to any DBRX Derivatives that are made by you, as
+ between you and Databricks, you are and will be the owner of such DBRX
+ Derivatives.
+
+ c. Databricks claims no ownership rights in Outputs. You are responsible for
+ Outputs and their subsequent uses.
+
+ d. If you institute litigation or other proceedings against Databricks or any
+ entity (including a cross-claim or counterclaim in a lawsuit) alleging that
+ DBRX or Outputs or results therefrom, or any portion of any of the foregoing,
+ constitutes infringement of intellectual property or other rights owned or
+ licensable by you, then any licenses granted to you under this Agreement shall
+ terminate as of the date such litigation or claim is filed or instituted. You
+ will indemnify and hold harmless Databricks from and against any claim by any
+ third party arising out of or related to your use or distribution of DBRX or
+ DBRX Derivatives.
+
+ 4.3 DISCLAIMER OF WARRANTY
+
+ UNLESS REQUIRED BY APPLICABLE LAW, DBRX AND ANY OUTPUT AND RESULTS THEREFROM
+ ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER
+ EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE,
+ NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU
+ ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR
+ REDISTRIBUTING DBRX OR DBRX DERIVATIVES AND ANY OUTPUT AND ASSUME ANY RISKS
+ ASSOCIATED WITH YOUR USE OF DBRX OR DBRX DERIVATIVES AND ANY OUTPUT AND RESULTS.
+
+ 4.4 LIMITATION OF LIABILITY
+
+ IN NO EVENT WILL DATABRICKS OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR
+ OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT,
+ SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF
+ DATABRICKS OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE
+ FOREGOING.
+
+ 4.5 Term and Termination
+
+ The term of this Agreement will commence upon your acceptance of this Agreement
+ or access to DBRX or DBRX Derivatives and will continue in full force and
+ effect until terminated in accordance with the terms and conditions herein.
+ Databricks may terminate this Agreement if you are in breach of any term or
+ condition of this Agreement. Upon termination of this Agreement, you shall
+ delete and cease use of DBRX or any DBRX Derivatives. Sections 1, 4.2(d), 4.3,
+ 4.4, and 4.6 shall survive the termination of this Agreement.
+
+ 4.6 Governing Law and Jurisdiction
+
+ This Agreement will be governed and construed under the laws of the State of
+ California without regard to choice of law principles, and the UN Convention
+ on Contracts for the International Sale of Goods does not apply to this
+ Agreement. The courts of California shall have exclusive jurisdiction of any
+ dispute arising out of this Agreement.
NOTICE.txt ADDED
@@ -0,0 +1 @@
+ DBRX is provided under and subject to the Databricks Open Model License, Copyright © Databricks, Inc. All rights reserved.
README.md CHANGED
@@ -1,3 +1,172 @@
- ---
- license: unknown
- ---
+ ---
+ extra_gated_heading: You need to share contact information with Databricks to access this model
+ extra_gated_prompt: >-
+
+   ### DBRX Terms of Use
+
+   Use of DBRX is governed by the [Databricks Open Model License](https://www.databricks.com/legal/open-model-license) and the [Databricks Open Model Acceptable Use Policy](https://www.databricks.com/legal/acceptable-use-policy-open-model).
+
+ extra_gated_fields:
+   First Name: text
+   Last Name: text
+   Organization: text
+   Purpose for Base Model Access: text
+   By clicking 'Submit' below, I accept the terms of the license and acknowledge that the information I provide will be collected, stored, processed, and shared in accordance with Databricks' Privacy Notice and I understand I can update my preferences at any time: checkbox
+ extra_gated_description: >-
+   The information you provide will be collected, stored, processed, and shared in accordance with Databricks [Privacy Notice](https://www.databricks.com/legal/privacynotice).
+ extra_gated_button_content: Submit
+ inference: false
+ license: other
+ license_name: databricks-open-model-license
+ license_link: https://www.databricks.com/legal/open-model-license
+ ---
+
+ # DBRX Base
+
+ * DBRX Base is a mixture-of-experts (MoE) large language model trained from scratch by Databricks.
+ * We are releasing both DBRX Base, a pretrained base model, and DBRX Instruct, a fine-tuned version for few-turn interactions, under [an open license](https://www.databricks.com/legal/open-model-license).
+ * This is the repository for DBRX Base. DBRX Instruct can be found [here](https://huggingface.co/databricks/dbrx-instruct).
+ * For full details on the DBRX models, please read our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm).
+
+
+ ## Model Overview
+ DBRX is a [transformer-based](https://www.isattentionallyouneed.com/) decoder-only large language model (LLM) that was trained using next-token prediction.
+ It uses a *fine-grained* mixture-of-experts (MoE) architecture with 132B total parameters, of which 36B parameters are active on any input.
+ It was pre-trained on 12T tokens of text and code data.
+ Compared to other open MoE models like Mixtral-8x7B and Grok-1, DBRX is fine-grained, meaning it uses a larger number of smaller experts. DBRX has 16 experts and chooses 4, while Mixtral-8x7B and Grok-1 have 8 experts and choose 2.
+ This provides 65x more possible combinations of experts, and we found that this improves model quality.
+ DBRX uses rotary position encodings (RoPE), gated linear units (GLU), and grouped query attention (GQA).
+ It uses the GPT-4 tokenizer as provided in the [tiktoken](https://github.com/openai/tiktoken) repository.
+ We made these choices based on exhaustive evaluation and scaling experiments.
+
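The 65x figure is just the ratio of routing combinations; a quick sanity check in plain Python (nothing here is DBRX-specific):

```python
import math

# Routable expert subsets per token: "n choose k" for top-k routing.
dbrx = math.comb(16, 4)      # 16 experts, 4 active -> 1820 combinations
mixtral = math.comb(8, 2)    # 8 experts, 2 active  ->   28 combinations
print(dbrx, mixtral, dbrx // mixtral)  # 1820 28 65
```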
+ DBRX was pretrained on 12T tokens of carefully curated data and a maximum context length of 32K tokens.
+ We estimate that this data is at least 2x better token-for-token than the data we used to pretrain the MPT family of models.
+ This new dataset was developed using the full suite of Databricks tools, including Apache Spark™ and Databricks notebooks for data processing, and Unity Catalog for data management and governance.
+ We used curriculum learning for pretraining, changing the data mix during training in ways we found to substantially improve model quality.
+
+ * **Inputs:** DBRX accepts only text-based inputs, with a context length of up to 32768 tokens.
+ * **Outputs:** DBRX produces only text-based outputs.
+ * **Model Architecture:** More detailed information about DBRX Instruct and DBRX Base can be found in our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm).
+ * **License:** [Databricks Open Model License](https://www.databricks.com/legal/open-model-license)
+ * **Acceptable Use Policy:** [Databricks Open Model Acceptable Use Policy](https://www.databricks.com/legal/acceptable-use-policy-open-model)
+ * **Version:** 1.0
+ * **Owner:** Databricks, Inc.
+
+
+ ## Usage
+ There are several general ways to use the DBRX models:
+ * DBRX Base and DBRX Instruct are available for download on Hugging Face (see our Quickstart guide below). This is the HF repository for DBRX Base; DBRX Instruct can be found [here](https://huggingface.co/databricks/dbrx-instruct).
+ * The DBRX model repository can be found on GitHub [here](https://github.com/databricks/dbrx).
+ * DBRX Base and DBRX Instruct are available with [Databricks Foundation Model APIs](https://docs.databricks.com/en/machine-learning/foundation-models/index.html) via both *Pay-per-token* and *Provisioned Throughput* endpoints. These are enterprise-ready deployments.
+ * For more information on how to fine-tune using LLM-Foundry, please take a look at our LLM pretraining and fine-tuning [documentation](https://github.com/mosaicml/llm-foundry/blob/main/scripts/train/README.md).
+
+
+ ## Quickstart Guide
+ **NOTE: This is DBRX Base, which has not been instruction finetuned. It has not been trained for interactive chat and is only a completion model.**
+ If you are looking for the finetuned model, please use [DBRX Instruct](https://huggingface.co/databricks/dbrx-instruct).
+
+ Getting started with DBRX models is easy with the `transformers` library. The model requires ~264GB of RAM and the following packages:
+
+ ```bash
+ pip install "transformers>=4.39.2" "tiktoken>=0.6.0"
+ ```
+
+ If you'd like to speed up download time, you can use the `hf_transfer` package as described by Hugging Face [here](https://huggingface.co/docs/huggingface_hub/en/guides/download#faster-downloads).
+ ```bash
+ pip install hf_transfer
+ export HF_HUB_ENABLE_HF_TRANSFER=1
+ ```
+
+ You will need to request access to this repository to download the model. Once this is granted,
+ [obtain an access token](https://huggingface.co/docs/hub/en/security-tokens) with `read` permission, and supply the token below.
+
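If you prefer not to pass `token="hf_YOUR_TOKEN"` to every call, one alternative is to log in once via `huggingface_hub` (installed as a dependency of `transformers`); subsequent `from_pretrained` calls then pick up the cached token:

```python
from huggingface_hub import login

login()  # prompts for the access token once and caches it locally
```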
+ ### Run the model on a CPU:
+ ```python
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-base", trust_remote_code=True, token="hf_YOUR_TOKEN")
+ model = AutoModelForCausalLM.from_pretrained("databricks/dbrx-base", device_map="cpu", torch_dtype=torch.bfloat16, trust_remote_code=True, token="hf_YOUR_TOKEN")
+
+ input_text = "Databricks was founded in "
+ input_ids = tokenizer(input_text, return_tensors="pt")
+
+ outputs = model.generate(**input_ids, max_new_tokens=100)
+ print(tokenizer.decode(outputs[0]))
+ ```
+
+ ### Run the model on multiple GPUs:
+ ```python
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-base", trust_remote_code=True, token="hf_YOUR_TOKEN")
+ model = AutoModelForCausalLM.from_pretrained("databricks/dbrx-base", device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True, token="hf_YOUR_TOKEN")
+
+ input_text = "Databricks was founded in "
+ input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+ outputs = model.generate(**input_ids, max_new_tokens=100)
+ print(tokenizer.decode(outputs[0]))
+ ```
+ If your GPU system supports [FlashAttention2](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2), you can add `attn_implementation="flash_attention_2"` as a keyword to `AutoModelForCausalLM.from_pretrained()` to achieve faster inference.
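As a minimal sketch, the multi-GPU call above with FlashAttention2 enabled would look like this (assuming the `flash-attn` package is installed and your hardware supports it):

```python
model = AutoModelForCausalLM.from_pretrained(
    "databricks/dbrx-base",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    token="hf_YOUR_TOKEN",
    attn_implementation="flash_attention_2",  # requires the flash-attn package
)
```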
+
+
+ ## Limitations and Ethical Considerations
+ ### Training Dataset Limitations
+ The DBRX models were trained on 12T tokens of text, with a knowledge cutoff date of December 2023.
+
+ The training mix used for DBRX contains both natural-language and code examples. The vast majority of our training data is in the English language. We did not test DBRX for non-English proficiency. Therefore, DBRX should be considered a generalist model for text-based use in the English language.
+
+ DBRX does not have multimodal capabilities.
+
+ ### Associated Risks and Recommendations
+ All foundation models are novel technologies that carry various risks and may output information that is inaccurate, incomplete, biased, or offensive.
+ Users should exercise judgment and evaluate such output for accuracy and appropriateness for their desired use case before using or sharing it.
+ Databricks recommends [using retrieval augmented generation (RAG)](https://www.databricks.com/glossary/retrieval-augmented-generation-rag) in scenarios where accuracy and fidelity are important.
+ We also recommend that anyone using or fine-tuning either DBRX Base or DBRX Instruct perform additional testing around safety in the context of their particular application and domain.
+
+
+ ## Intended Uses
+ ### Intended Use Cases
+ The DBRX models are open, general-purpose LLMs intended and licensed for both commercial and research applications.
+ They can be further fine-tuned for various domain-specific natural language and coding tasks.
+ DBRX Base can be used as an off-the-shelf model for text completion for general English-language and coding tasks.
+
+ Please review the Associated Risks section above, as well as the [Databricks Open Model License](https://www.databricks.com/legal/open-model-license) and [Databricks Open Model Acceptable Use Policy](https://www.databricks.com/legal/acceptable-use-policy-open-model), for further information about permissible uses of DBRX Base and its derivatives.
+
+ ### Out-of-Scope Use Cases
+ DBRX models are not intended to be used out-of-the-box in non-English languages and do not support native code execution or other forms of function-calling.
+ DBRX models should not be used in any manner that violates applicable laws or regulations, or in any other way that is prohibited by the [Databricks Open Model License](https://www.databricks.com/legal/open-model-license) and [Databricks Open Model Acceptable Use Policy](https://www.databricks.com/legal/acceptable-use-policy-open-model).
+
+
+ ## Training Stack
+ MoE models are complicated to train, and the training of DBRX Base and DBRX Instruct was heavily supported by Databricks’ infrastructure for data processing and large-scale LLM training (e.g., [Composer](https://github.com/mosaicml/composer), [Streaming](https://github.com/mosaicml/streaming), [Megablocks](https://github.com/stanford-futuredata/megablocks), and [LLM Foundry](https://github.com/mosaicml/llm-foundry)).
+
+ Composer is our core library for large-scale training.
+ It provides an optimized training loop, easy [checkpointing](https://docs.mosaicml.com/projects/composer/en/latest/trainer/checkpointing.html) and [logging](https://docs.mosaicml.com/projects/composer/en/latest/trainer/logging.html#wood-logging),
+ [FSDP](https://pytorch.org/docs/stable/fsdp.html)-based [model sharding](https://docs.mosaicml.com/projects/composer/en/latest/notes/distributed_training.html#fullyshardeddataparallel-fsdp),
+ convenient [abstractions](https://docs.mosaicml.com/projects/composer/en/latest/trainer/time.html), extreme customizability via [callbacks](https://docs.mosaicml.com/projects/composer/en/latest/trainer/callbacks.html), and more.
+
+ Streaming enables fast, low-cost, and scalable training on large datasets from cloud storage. It handles a variety of challenges around deterministic resumption as node counts change, avoiding redundant downloads across devices, high-quality shuffling at scale, sample-level random access, and speed.
+
+ Megablocks is a lightweight library for MoE training. Crucially, it supports “dropless MoE,” which avoids inefficient padding and is intended to provide deterministic outputs for a given sequence no matter what other sequences are in the batch.
+
+ LLM Foundry ties all of these libraries together to create a simple LLM pretraining, fine-tuning, and inference experience.
+
+ DBRX was trained using proprietary optimized versions of the above open-source libraries, along with our [LLM training platform](https://www.databricks.com/product/machine-learning/mosaic-ai-training).
+
+
+ ## Evaluation
+ We find that DBRX outperforms established open-source and open-weight base models on the [Databricks Model Gauntlet](https://www.databricks.com/blog/llm-evaluation-for-icl), the [Hugging Face Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), and HumanEval.
+ The Databricks Model Gauntlet measures performance on more than 30 tasks across six categories: world knowledge, common-sense reasoning, language understanding, reading comprehension, symbolic problem solving, and programming.
+ The Hugging Face Open LLM Leaderboard measures the average of ARC-Challenge, HellaSwag, MMLU, TruthfulQA, Winogrande, and GSM8k.
+ HumanEval measures coding ability.
+
+ Full evaluation details can be found in our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm).
+
+
+ ## Acknowledgements
+ The DBRX models were made possible thanks in large part to the open-source community, especially:
+ * The [MegaBlocks](https://arxiv.org/abs/2211.15841) library, which established a foundation for our MoE implementation.
+ * [PyTorch FSDP](https://arxiv.org/abs/2304.11277), which we built on for distributed training.
config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "architectures": [
+     "DbrxForCausalLM"
+   ],
+   "attn_config": {
+     "clip_qkv": 8,
+     "kv_n_heads": 8,
+     "model_type": "",
+     "rope_theta": 500000
+   },
+   "auto_map": {
+     "AutoConfig": "configuration_dbrx.DbrxConfig",
+     "AutoModelForCausalLM": "modeling_dbrx.DbrxForCausalLM"
+   },
+   "d_model": 6144,
+   "emb_pdrop": 0.0,
+   "ffn_config": {
+     "ffn_hidden_size": 10752,
+     "model_type": "",
+     "moe_jitter_eps": 0.01,
+     "moe_loss_weight": 0.05,
+     "moe_num_experts": 16,
+     "moe_top_k": 4
+   },
+   "initializer_range": 0.02,
+   "max_seq_len": 32768,
+   "model_type": "dbrx_converted",
+   "n_heads": 48,
+   "n_layers": 40,
+   "output_router_logits": false,
+   "resid_pdrop": 0.0,
+   "router_aux_loss_coef": 0.05,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.38.2",
+   "use_cache": true,
+   "vocab_size": 100352
+ }
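A quick way to inspect these values without pulling the 61 weight shards is to load just the configuration; a hedged sketch, assuming you already have gated access and a `read` token:

```python
from transformers import AutoConfig

# Downloads only config.json + configuration_dbrx.py, not the weights.
cfg = AutoConfig.from_pretrained("databricks/dbrx-base", trust_remote_code=True, token="hf_YOUR_TOKEN")
print(cfg.d_model, cfg.n_heads, cfg.ffn_config.moe_num_experts, cfg.ffn_config.moe_top_k)
# 6144 48 16 4
```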
configuration_dbrx.py ADDED
@@ -0,0 +1,264 @@
+ """Dbrx configuration."""
+ from typing import Any, Optional
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+ class DbrxAttentionConfig(PretrainedConfig):
+     """Configuration class for Dbrx Attention.
+
+     [`DbrxAttention`] class. It is used to instantiate attention layers
+     according to the specified arguments, defining the layers architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         attn_pdrop (`float`, *optional*, defaults to 0.0):
+             The dropout probability for the attention layers.
+         clip_qkv (`float`, *optional*, defaults to None):
+             If not `None`, clip the queries, keys, and values in the attention layer to this value.
+         kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
+         rope_theta (float): The base frequency for rope.
+     """
+
+     def __init__(
+         self,
+         attn_pdrop: float = 0,
+         clip_qkv: Optional[float] = None,
+         kv_n_heads: int = 1,
+         rope_theta: float = 10000.0,
+         **kwargs: Any,
+     ):
+         super().__init__(**kwargs)
+         self.attn_pdrop = attn_pdrop
+         self.clip_qkv = clip_qkv
+         self.kv_n_heads = kv_n_heads
+         self.rope_theta = rope_theta
+
+         for k in ['model_type']:
+             if k in kwargs:
+                 kwargs.pop(k)
+         if len(kwargs) != 0:
+             raise ValueError(f'Found unknown {kwargs=}')
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path: str,
+                         **kwargs: Any) -> 'PretrainedConfig':
+         cls._set_token_in_kwargs(kwargs)
+
+         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path,
+                                                   **kwargs)
+
+         if config_dict.get('model_type') == 'dbrx':
+             config_dict = config_dict['attn_config']
+
+         if 'model_type' in config_dict and hasattr(
+                 cls,
+                 'model_type') and config_dict['model_type'] != cls.model_type:
+             logger.warning(
+                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                 +
+                 f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+             )
+
+         return cls.from_dict(config_dict, **kwargs)
+
+
+ class DbrxFFNConfig(PretrainedConfig):
+     """Configuration class for Dbrx FFN.
+
+     [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
+     the specified arguments, defining the layers architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
+             The dict should have a key 'name' with the value being the name of
+             the activation function along with any additional keyword arguments.
+         ffn_hidden_size (int, optional): The hidden size of the feedforward network.
+         moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
+         moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
+         moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
+         moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
+         moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
+         uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
+             This should only be used for benchmarking purposes.
+     """
+
+     def __init__(
+         self,
+         ffn_act_fn: Optional[dict] = None,
+         ffn_hidden_size: int = 3584,
+         moe_num_experts: int = 4,
+         moe_top_k: int = 1,
+         moe_jitter_eps: Optional[float] = None,
+         moe_loss_weight: float = 0.01,
+         moe_normalize_expert_weights: Optional[float] = 1,
+         uniform_expert_assignment: bool = False,
+         **kwargs: Any,
+     ):
+         super().__init__()
+         if ffn_act_fn is None:
+             ffn_act_fn = {'name': 'silu'}
+         self.ffn_act_fn = ffn_act_fn
+         self.ffn_hidden_size = ffn_hidden_size
+         self.moe_num_experts = moe_num_experts
+         self.moe_top_k = moe_top_k
+         self.moe_jitter_eps = moe_jitter_eps
+         self.moe_loss_weight = moe_loss_weight
+         self.moe_normalize_expert_weights = moe_normalize_expert_weights
+         self.uniform_expert_assignment = uniform_expert_assignment
+
+         for k in ['model_type']:
+             if k in kwargs:
+                 kwargs.pop(k)
+         if len(kwargs) != 0:
+             raise ValueError(f'Found unknown {kwargs=}')
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path: str,
+                         **kwargs: Any) -> 'PretrainedConfig':
+         cls._set_token_in_kwargs(kwargs)
+
+         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path,
+                                                   **kwargs)
+
+         if config_dict.get('model_type') == 'dbrx':
+             config_dict = config_dict['ffn_config']
+
+         if 'model_type' in config_dict and hasattr(
+                 cls,
+                 'model_type') and config_dict['model_type'] != cls.model_type:
+             logger.warning(
+                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                 +
+                 f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+             )
+
+         return cls.from_dict(config_dict, **kwargs)
+
+
+ class DbrxConfig(PretrainedConfig):
+     """Configuration class for Dbrx.
+
+     [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
+     specified arguments, defining the model architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         d_model (`int`, *optional*, defaults to 6144):
+             Dimensionality of the embeddings and hidden states.
+         n_heads (`int`, *optional*, defaults to 48):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         n_layers (`int`, *optional*, defaults to 40):
+             Number of hidden layers in the Transformer encoder.
+         max_seq_len (`int`, *optional*, defaults to 32768):
+             The maximum sequence length of the model.
+         vocab_size (`int`, *optional*, defaults to 100352):
+             Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
+             the `inputs_ids` passed when calling [`DbrxModel`].
+         resid_pdrop (`float`, *optional*, defaults to 0.0):
+             The dropout probability applied to the attention output before combining with residual.
+         emb_pdrop (`float`, *optional*, defaults to 0.0):
+             The dropout probability for the embedding layer.
+         attn_config (`dict`, *optional*):
+             A dictionary used to configure the model's attention module.
+         ffn_config (`dict`, *optional*):
+             A dictionary used to configure the model's FFN module.
+         use_cache (`bool`, *optional*, defaults to `False`):
+             Whether or not the model should return the last key/values attentions (not used by all models).
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         output_router_logits (`bool`, *optional*, defaults to `False`):
+             Whether or not the router logits should be returned by the model. Enabling this will also
+             allow the model to output the auxiliary loss. See [here]() for more details
+         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+             The aux loss factor for the total loss.
+
+
+     Example:
+     ```python
+     >>> from transformers import DbrxConfig, DbrxModel
+
+     >>> # Initializing a Dbrx configuration
+     >>> configuration = DbrxConfig()
+
+     >>> # Initializing a model (with random weights) from the configuration
+     >>> model = DbrxModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```
+     """
+
+     model_type = 'dbrx'
+     attribute_map = {
+         'num_attention_heads': 'n_heads',
+         'hidden_size': 'd_model',
+         'num_hidden_layers': 'n_layers',
+         'max_position_embeddings': 'max_seq_len'
+     }
+
+     def __init__(
+         self,
+         d_model: int = 2048,
+         n_heads: int = 16,
+         n_layers: int = 24,
+         max_seq_len: int = 2048,
+         vocab_size: int = 32000,
+         resid_pdrop: float = 0.0,
+         emb_pdrop: float = 0.0,
+         attn_config: Optional[DbrxAttentionConfig] = None,
+         ffn_config: Optional[DbrxFFNConfig] = None,
+         use_cache: bool = True,
+         initializer_range: float = 0.02,
+         output_router_logits: bool = False,
+         router_aux_loss_coef: float = 0.05,
+         **kwargs: Any,
+     ):
+         if attn_config is None:
+             self.attn_config = DbrxAttentionConfig()
+         elif isinstance(attn_config, dict):
+             self.attn_config = DbrxAttentionConfig(**attn_config)
+         else:
+             self.attn_config = attn_config
+
+         if ffn_config is None:
+             self.ffn_config = DbrxFFNConfig()
+         elif isinstance(ffn_config, dict):
+             self.ffn_config = DbrxFFNConfig(**ffn_config)
+         else:
+             self.ffn_config = ffn_config
+
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.max_seq_len = max_seq_len
+         self.vocab_size = vocab_size
+         self.resid_pdrop = resid_pdrop
+         self.emb_pdrop = emb_pdrop
+         self.use_cache = use_cache
+         self.initializer_range = initializer_range
+         self.output_router_logits = output_router_logits
+         self.router_aux_loss_coef = router_aux_loss_coef
+
+         tie_word_embeddings = kwargs.pop('tie_word_embeddings', False)
+         if tie_word_embeddings:
+             raise ValueError(
+                 'tie_word_embeddings is not supported for Dbrx models.')
+
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
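To illustrate how the nested configs compose, here is a hedged sketch that instantiates `DbrxConfig` with the values from `config.json` above (it assumes `configuration_dbrx.py` is importable from your working directory):

```python
from configuration_dbrx import DbrxConfig

# Nested dicts are promoted to DbrxAttentionConfig / DbrxFFNConfig in __init__.
config = DbrxConfig(
    d_model=6144,
    n_heads=48,
    n_layers=40,
    max_seq_len=32768,
    vocab_size=100352,
    attn_config={"clip_qkv": 8, "kv_n_heads": 8, "rope_theta": 500000},
    ffn_config={"ffn_hidden_size": 10752, "moe_num_experts": 16,
                "moe_top_k": 4, "moe_jitter_eps": 0.01, "moe_loss_weight": 0.05},
)
print(config.attn_config.kv_n_heads, config.ffn_config.moe_top_k)  # 8 4
```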
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "eos_token_id": [
+     100257
+   ],
+   "transformers_version": "4.38.2"
+ }
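The lone `eos_token_id` (100257) matches the end-of-text token of the GPT-4 tokenizer mentioned in the README; a hedged cross-check with `tiktoken`, assuming `cl100k_base` is the relevant encoding:

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # GPT-4 byte-pair encoding
print(enc.eot_token)  # 100257, matching eos_token_id above
```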
model-00001-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc28da4a69f7e5548bc4086fba3cef26c79f653793501cee044f1c23b0d83f2d
+ size 3523439624
model-00002-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:698e440b10f1ffd061386048a38686c95b42877907212ade39ae414a38516914
+ size 4404245416
model-00003-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a53fc538e17f263dbafac832259701d71c039ceb0e0a133411bd868f67c1412
+ size 4227862560
model-00004-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf9092dfa84fa162fcecba742834997c052b38a08024c560b5724a29b7ea181d
+ size 4404245416
model-00005-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:704462d7756dd4eab0d486ef4cbdaf0723201238bd2dccf0a1b210e85f0e81d6
+ size 4404245416
model-00006-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8a9a26d88087d1f17f398bad35b00f8c9df6d45684c2d27ed51c6fd8479da0a
+ size 4227862560
model-00007-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bac5317e3eac638314e103fa0e3a1a2df53b4b9c08b0530fe74632733f305ed9
+ size 4404245416
model-00008-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c88244322308be72113df4e549db78dca660ae3b1c9918a50a51826fe1369605
+ size 4404245416
model-00009-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f06f6d9014cf96eb3650c99db4045d37c8b7e34c5117e9a2738d400de1658860
+ size 4227862560
model-00010-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14b737a3b878d7176ecf747ca84b5477e5e87fdf29f8f9e91ddefec61180fc93
+ size 4404245416
model-00011-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85af95c26290fceacfddedc271dfdeadf75c4537cbf38a2162844e7dd531db74
+ size 4404245416
model-00012-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e56c3d4c9786fcb2eeeddca5f1ffadd4e8a4b64514c807595572ba0fc0f61b76
+ size 4227862560
model-00013-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ffa30aa9cfe2913fb4c4162a3bffa4c705ae03da9ee866982efab504fbb658c8
+ size 4404245416
model-00014-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00d3f5a60c05bfa2b985fdd148ad7ad360d99d2384d0a5a9fb8205a489a0f95e
+ size 4404245416
model-00015-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:55f55d4b560746620978473685d5a6c87a5eb9ac780b97592a77a27e9fd67dce
+ size 4227862560
model-00016-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e21a36bc448a715b3e51c9f131a9ff6e322b499028bfa8af88650d90cf7f30b
+ size 4404245440
model-00017-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36fe189b08d023fae4481ab80a70ffbc1980f71e41174fb54ec0bf5d672b2fb9
+ size 4404245456
model-00018-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1b24cd6fbcb7458e33aa6db4702aad519a32637b15fda82454747f5e9432e13
+ size 4227862592
model-00019-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2bf77fa36fccf9920386ce934ff38779eccac713df06b088adf48b6372504f01
+ size 4404245456
model-00020-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7bac559137d755e4aa144ddbb941dcbf44be3a2a7e40235ef5c4a791b90a7ca
+ size 4404245456
model-00021-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:45bb5601b485a573af20d7b23bad58e0f4146b349644bd8f728b06252102176b
+ size 4227862592
model-00022-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:11aa46e29483a012c65fa908d714832adaac3e4f7fe393327720c6e62cc5b181
+ size 4404245456
model-00023-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41646f1544ea9f252924af294fa09ef491e9c8e5bcde113ee563cec0999bbfe1
+ size 4404245456
model-00024-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2523a4be387557beb04260b0d08cf8a37d0a87006197c9103f365e3cfad06daf
+ size 4227862592
model-00025-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:737ec65ff78b7770d09da5ab4087f26e54a4320fbf83ba7ce1650e11d2ab73c8
+ size 4404245456
model-00026-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3034000a88019b565c051aa18dd73329139e33fae962077a71317967d78d4c94
+ size 4404245456
model-00027-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41879f07d50975e532be376f7efc62bcb5be7719b9303ef8c807706c43c2ad43
+ size 4227862592
model-00028-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0939a016e18c68adc20f712afd28b89c1a6b463a07888854a60ac3eb3f62ab17
+ size 4404245456
model-00029-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc3c688b7f3ba199fb07883411d7b0bcb78f13aa7f227cc9a02d6acc3a3da603
+ size 4404245456
model-00030-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:af610db5f894204a2c2c6891f4644b5ae4f38e429f1b33978d3567981aa61faa
+ size 4227862592
model-00031-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9a0215ee52194d33cecf47765803c2e6f41fbaa928f1cc65f5cc0569b0b96f2
+ size 4404245456
model-00032-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d668fb61c857ba8d6a08fc3ecb34408b9ea3a27c5dd4e5a459f2459235a0d29
+ size 4404245456
model-00033-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa6f00ab09861368f0aaf66ec92bf43b17a70cd8bfb5be6264aa5b21d9a36a25
+ size 4227862592
model-00034-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:baf918d45e5f5f3b684da913b44fb5939e593c5f795bfcb9c11c3f6591c7daf8
+ size 4404245456
model-00035-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fee00379e0a448352ab4d5ec9d359f2c41f924730d3db2bad4cc1b3cacd3641c
+ size 4404245456
model-00036-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:792058de69a54860e03de6dcd68bb88c758e0e701f9a9054ef95e74b66f641c1
+ size 4227862592
model-00037-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a8f73c9148bd468922c2cb6892e63387cb57811be537d1c04ed1849b3624441
+ size 4404245456
model-00038-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0c0e459f70f3ba6e3f2a8e3edc35e41457dac4d7e0d6cf0933353ee5f90c368
+ size 4404245456
model-00039-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b597724c9c431c9b66ef7f1c5c98c572e1eb2cc04eab00b4b5578c804f7dabe2
+ size 4227862592
model-00040-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:13408aa90943912d344f63cacc190a031575564f94e65adf24064d9ae76335a7
+ size 4404245456
model-00041-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:025e3c0e161da006cde492d2288458ecb8db5512961c41d60dfd51ff3cf825b2
+ size 4404245456
model-00042-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93347f13c06c95ed4131c03f1bcbade4aaf2fc9e8e8738483467ed482e1ff556
+ size 4227862592
model-00043-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b789774316dabda443787a23943410bcd6f6fc85a0321a3d14f6488b6bb172cf
+ size 4404245456
model-00044-of-00061.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3efb71226981a713f6551cc0672a9e12e037fcfbda3cc41d11dac7c0fc03624
+ size 4404245456