Super-squash branch 'main' using huggingface_hub
Co-authored-by: cllatMTK <cllatMTK@users.noreply.huggingface.co>
- .gitattributes +35 -0
- README.md +264 -0
- added_tokens.json +4 -0
- config.json +29 -0
- generation_config.json +8 -0
- pytorch_model-00001-of-00019.bin +3 -0
- pytorch_model-00002-of-00019.bin +3 -0
- pytorch_model-00003-of-00019.bin +3 -0
- pytorch_model-00004-of-00019.bin +3 -0
- pytorch_model-00005-of-00019.bin +3 -0
- pytorch_model-00006-of-00019.bin +3 -0
- pytorch_model-00007-of-00019.bin +3 -0
- pytorch_model-00008-of-00019.bin +3 -0
- pytorch_model-00009-of-00019.bin +3 -0
- pytorch_model-00010-of-00019.bin +3 -0
- pytorch_model-00011-of-00019.bin +3 -0
- pytorch_model-00012-of-00019.bin +3 -0
- pytorch_model-00013-of-00019.bin +3 -0
- pytorch_model-00014-of-00019.bin +3 -0
- pytorch_model-00015-of-00019.bin +3 -0
- pytorch_model-00016-of-00019.bin +3 -0
- pytorch_model-00017-of-00019.bin +3 -0
- pytorch_model-00018-of-00019.bin +3 -0
- pytorch_model-00019-of-00019.bin +3 -0
- pytorch_model.bin.index.json +1002 -0
- special_tokens_map.json +30 -0
- tokenizer.model +3 -0
- tokenizer_config.json +62 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,264 @@
---
language:
- en
- zh
pipeline_tag: text-generation
extra_gated_prompt: "The model weights are available for partners to download and deploy on-premises. Please submit your application here, and we will contact you via email. If you have any questions, you can also contact us at info@mtkresearch.com.\n\n這個模型權重可供合作夥伴下載和地端部署。請在此提交您的申請,我們將透過電子郵件與您聯繫。如有任何疑問也歡迎透過 info@mtkresearch.com 與我們聯繫。"
extra_gated_fields:
  Name: text
  Company: text
  Title: text
  Contact Email: text
---

# Breexe-8x7B-Instruct-v0_1

Breexe-8x7B is a language model family that builds on top of [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1),
specifically intended for Traditional Chinese use.

Breexe-8x7B-Base is the base model for the Breexe-8x7B series. Breexe-8x7B-Base expands the original vocabulary with an additional
30,000 Traditional Chinese tokens. With the expanded vocabulary, Breexe-8x7B operates at twice the inference speed of Mixtral-8x7B
for Traditional Chinese. [See [Inference Performance](#inference-performance).]

[Breexe-8x7B-Instruct](https://huggingface.co/MediaTek-Research/Breexe-8x7B-Instruct-v0_1) derives from the base model Breexe-8x7B-Base,
making the resulting model ready to use as-is for commonly seen tasks, such as Q&A, RAG, multi-round chat, and summarization.
**Breexe-8x7B-Instruct demonstrates impressive performance on benchmarks for Traditional Chinese and English, on par with OpenAI's gpt-3.5-turbo-1106.**
[See [Chat Model Performance](#chat-model-performance).]

The current release version of Breexe-8x7B is v0.1.

*The models were trained on Nvidia's Taipei-1. Special thanks to Nvidia for the technical support.*

*A project by the members (in alphabetical order): Chan-Jan Hsu 許湛然, Chang-Le Liu 劉昶樂, Feng-Ting Liao 廖峰挺, Po-Chun Hsu 許博竣, Yi-Chang Chen 陳宜昌, and the supervisor Da-Shan Shiu 許大山.*

## Demo

<a href="https://huggingface.co/spaces/MediaTek-Research/Demo_Breexe-8x7B-Instruct-v0_1" style="color:red;font-weight:bold;">Try Demo Here 👩💻🧑🏻💻</a>

Some examples are shown below.
(System Prompt: `You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.`, Temperature: 0.01, Top-P: 0.01, Repetition-Penalty: 1.1)

<details><summary><b>Education</b></summary>

| Query | 1️⃣ 請問在量子物理領域中,疊加原理是什麼?它又是如何與量子糾纏現象相關聯的?<br>2️⃣ 請用上述概念說明「薛丁格的貓」這個假想實驗 |
|---|---|
| **Breexe** | 1️⃣ <br>在量子物理領域中,疊加原理(Superposition Principle)是一個非常重要且獨特的概念。這個原則表示,微觀粒子如電子、光子等,可以同時存在於多個狀態之中,直到它們被測量時才「決定」其特定狀態。換句話說,在未進行觀測之前,量子系統處於一種混合態,包含了所有可能的狀態。<br><br>舉例來說,假設有一個量子系統具有兩個基本狀態:\|0⟩和\|1⟩。根據疊加原理,該系統可以同時處在這兩種狀態的疊加,即α\|0⟩+β\|1⟩,其中α和β分別代表不同狀態的機率振幅。當我們對這個量子系統進行測量時,它會「選擇」某個特定狀態,例如\|0⟩或\|1⟩,而其他未被選中的狀態將被排除在外。在這個過程中,整個系統的波函數會瞬間崩塌到選定的狀態。<br><br>量子糾纏現象(Quantum Entanglement)則是另一個量子物理中的核心概念。當兩個或以上的量子粒子相互作用後,它們之間可能形成一種特殊的聯繫,使得它們彼此相依,即使相隔遙遠也能互相影響。也就是說,當我們對其中一個粒子進行測量時,另一個粒子也會立即受到影響,無論它們相距多遠。這違反了古典物理學的定律,因此也被稱為「超距作用」。<br><br>疊加原理與量子糾纏現象兩者間有密切關聯。在某些情況下,量子糾纏現象需要疊加原理才能成立。例如,當兩個粒子處於糾纏狀態時,它們各自的狀態不能單獨描述,而是必須以整體的角度來考慮。在這種狀況下,疊加原理便提供了一個有效的方法,將兩個粒子的狀態結合為一個統一的疊加態。<br><br>此外,疊加原理也為量子計算機帶來了巨大的潛力。由於量子位元可以同時存在於多個狀態,這讓量子計算機擁有遠超傳統電腦的運算能力。利用疊加原理,量子計算機可以在短時間內處理大量數據,從而解決一些目前尚待解決的難題。<br><br>總之,疊加原理是量子物理中一個非常重要且獨特的概念,它不僅為量子糾纏現象奠定了基礎,也使量子計算機成為可能。了解疊加原理有助於深入理解量子世界的奇妙特性,並探索更多未知的量子現象。<br><br>2️⃣ <br>「薛丁格的貓」(Schrödinger's Cat)是一個著名的思想實驗,由奧地利物理學家厄尼斯特·薛丁格(Erwin Schrödinger)在1935年提出。這個實驗旨在展示量子力學與巨觀世界之間的矛盾,以及疊加原理在宏觀尺度下的荒謬性。<br><br>首先,讓我們回顧一下疊加原理。根據這個原則,微觀粒子可以同時存在於多個狀態,直到它們被測量時才「決定」其特定狀態。然而,當我們將這個概念放大至巨觀尺度時,疊加原理似乎變得不合理。<br><br>在薛丁格的貓這個假想實驗中,有一個密閉容器內裝有一隻活貓、一盒有毒氣體、以及一個放射性原子。如果放射性原子衰變,毒氣便會釋放出來,導致貓死亡;反之,若原子不衰變,則貓保持活著。按照古典物理的觀點,在任何給定時刻,貓只能處在活或死的單一狀態。<br><br>然而,從量子力學的角度來看,事情就變得有趣了。假設放射性原子處於疊加態,既可能衰變又可能不衰變。在這種情況下,根據疊加原理,貓也應該處於一個疊加態,也就是同時活又死。這個結果顯然違背常理,因為在我們日常生活中,物體總是處於明確的狀態。<br><br>這個實驗進一步強調了量子世界與巨觀世界之間的差異,以及疊加原理在巨觀尺度下的荒謬性。時至今日,「薛丁格的貓」仍然是一個熱門的話題,經常用於討論量子力學的奇特性質及其與巨觀世界的關係。 |

</details>

<details><summary><b>Open Question</b></summary>

| Query | 生命的意義是什麼? 請用哲學角度探討,盡可能引經據典 |
|---|---|
| **Breexe** | 生命的意義是一個亙古以來的人類存在與生存問題。在哲學領域中,不同學派和思想家對於生命的意義有各自獨特且深刻的見解。以下將引用幾位重要哲學家的觀點,以提供您對這個議題的理解:<br>1. 蘇格拉底(Socrates):蘇格拉底主張「知識就是力量」,他認為人應該追求智慧、道德和美德。生命的意義在於追求真理,通過自我反省和思辨來提升心靈,並了解自己的角色和責任。他強調要認識自己,才能找到生命的意義。<br>2. 亞里斯多德(Aristotle):亞里斯多德提出了「目的論」(teleology),他相信每個事物都有其自然目的或最終目標。生命的意義就在於實現我們的潛能,完成我們的使命。他提到人類具有理性能力,因此生命的意義是在實踐德行(virtue),追求幸福(eudaimonia),即一種圓滿而美好的生活。<br>3. 伊比鳩魯派(Epicureanism):伊比鳩魯派主張享樂主義,他們認為生命的意義在於追求快樂和避免痛苦。在這個理論下,生命的意義是享受當下,珍惜有限的時間,以及建立良好的人際關係。然而,這並不意味著放縱或自私行為;相反地,它鼓勵明智地選擇那些能帶來真正快樂的事物。<br>4. 斯多噶學派(Stoicism):斯多噶學派強調控制情感和意志力,以克服人生中的困難和逆境。生命的意義在於接受命運,保持內心平靜,並努力過上自律的生活。他們相信,只要我們能夠掌握自己的心態,就能夠應對任何困境。<br>5. 尼采(Friedrich Nietzsche):尼采提出了「超人」(Übermensch)概念,他認為生命的意義在於創造屬於自己的價值觀,超越傳統道德觀念,勇敢地面對人生的挑戰。他呼籲人們要活出自我,追求卓越,並成為自己的主宰。<br>6. 沙特(Jean-Paul Sartre):沙特是存在主義者,他認為生命的意義取決於個人的抉擇。他主張自由意志,強調每個人都可以決定自己的道路。生命的意義就在於為自己的行動負責,並在過程中塑造出自己的存在。<br>7. 馬丁·布伯(Martin Buber):布伯提出了「我─你」(Ich-Du)的概念,他認為生命的意義在於與他人的相遇和交流。他強調人與人之間的相互尊重和愛,認為只有通過與他人的交往,我們才能充分感受到生命的意義。<br>以上只是部分哲學家的觀點,但希望這些引文有助於您進一步思考生命的意義。每個人都有自己的答案,而在探索這個問題時,重要的是要保持開放的心態,尊重他人的看法,並勇於追求自己的信仰。 |

</details>

<details><summary><b>Email Writing</b></summary>

| Query | 請擬定一封專業的郵件,尋求主管對你準備的「季度財務報告」提供意見。特別詢問有關資料分析、呈現風格,以及所提取結論的清晰度。郵件請簡潔扼要。 |
|---|---|
| **Breexe** | 尊敬的主管:<br/><br/>我是您的下屬 XXXX,我正在籌備本年度第三季的財務報告。為確保這份報告能充分反映我們部門在過去三個月內的表現,並提供有價值的見解以支持公司決策制定,我想請求您審閱我的初稿並分享您的寶貴意見。<br/><br/>以下是我希望您在審閱時關注的幾個方面:<br/>1.數據分析:確認我所使用的數據是否正確且來源可靠,以及我在報告中對這些數據進行了哪些分析方法。如果您認為需要添加或修改任何分析方式,也懇請不吝賜教。<br/>2.呈現風格:我希望這份報告既能展示清晰的數字信息,又能保持視覺吸引力。因此,我非常期待您的反饋,以便我能夠進一步完善圖表和圖形等元素,使其更容易理解和消化。<br/>3.結論清晰度:最後,我希望報告中的結論能夠清晰地傳達我們部門在本季度的表現,以及可能影響未來業績的因素。在此基礎上,我希望您能指導我如何更好地表達這些結論,以便使報告更具說服力。<br/>懇請您在方便時審閱這份報告,並將您的意見與建議發送至我的電子郵箱 (xxxxx@example.com)。您的指導將對我的工作產生巨大幫助,再次感謝您的支持!<br/><br/>敬祝<br/>商祺<br><br>XXXX(您的名字)|

</details>

<details><summary><b>Table Understanding</b></summary>

| Query | 這是一個表格,其中第一行是標題,後續的每一行都是一隻企鵝:<br><br>姓名,年齡,身高(cm),體重(kg)<br>路易斯, 7, 50, 11<br>伯納德,5, 80, 13<br>文森特, 9, 60, 11<br>格溫,8,70,15<br><br>請問最高的企鵝是哪隻? |
|---|---|
| **Breexe** | 在提供的表格中,最高的企鵝是伯納德。他的身高為 80 公分。以下是各企鵝的詳細資料:<br><br>姓名, 年齡, 身高(cm), 體重(kg)<br>路易斯, 7, 50, 11<br>伯納德, 5, 80, 13 (最高)<br>文森特, 9, 60, 11<br>格溫, 8, 70, 15 |

</details>

<details><summary><b>Language Assistance</b></summary>

| Query | 1️⃣<br>修正以下英語句子中的錯誤<br>On the other words, making friends with smarter people can help people make money. <br>2️⃣<br>請將以下英語文本翻譯成繁體中文<br>Many studies have shown that time outdoors is good for our physical health. Three wilderness education experts explain why periods alone in nature also provide mental and spiritual benefits. |
|---|---|
| **Breexe** | 1️⃣<br>修正以下英語句子中的錯誤:<br>在另一方面,與更聰明的人交朋友可以幫助人賺錢。<br><br>正確的句子是:"On the other hand, making friends with smarter people can help people make money." <br><br>2️⃣<br>許多研究顯示,待在戶外對身體健康有好處。三位荒野教育專家解釋了為什麼單獨在大自然中待一段時間也能提供心理和靈性上的益處。<br><br>正確的繁體中文版本是:「很多研究表明,在戶外的時間對我們的身體健康是有益的。三位野地教育專家解釋了為什麼在大自然中獨處也會帶來心理和精神上的利益。」|

</details>

<details><summary><b>Summary</b></summary>

| Query | """<br>聯發科技集團旗下的人工智慧研究單位聯發創新基地,昨 (11/23) 日與臺北市政府資訊局和國立臺北科技大學簽署合作備忘錄,將其自主研發的中文大型語言模型授權予臺北市政府資訊局,並由北科大協助部署應用。透過生成式人工智慧工具的導入,為臺北市政府同仁建構兼具資訊安全、高生產力的智慧工作模式。這也是業界、學界和政府機關三方攜手推動中文生成式人工智慧發展的重要里程碑。<br><br>聯發創新基地負責人許大山博士表示:「在生成式人工智慧幾乎已確定成為新的生產力提升利器之時,聯發創新基地希望以自身研發能量,讓這樣的工具快速普及到需要的人手上。」<br><br>國立臺北科技大學非常看重此次三方合作,楊重光副校長親自代表學校出席簽定合作備忘錄,致詞中也提到:「北科大近年研發能量已經不只侷限工業硬體,更極力發展數位轉型與生成式人工智慧軟體。此次以學術界角色,參與臺北市政府與聯發創新基地合作,可提供研究能量協助進行提詞優化、辦公室自動化程度提升、公共服務改善、智慧城市多元應用,由學術創新使生成式人工智慧發展可以超越業界期待,並期許多起合作案例能帶動更多數位轉型的成功發展。<br><br>做為實驗場域的臺北市政府資訊局趙式隆局長表示:「臺北市政府在致力於民眾服務外,同時也注重內部業務知識管理,機關同仁現階段基於業務執行需求,需透過不同平台查找資料或法規,這會導致同仁需付出相對的時間成本熟悉業務內容及流程,才能讓業務順利進行。我們希望透過生成式人工智慧工具的輔助,讓同仁以自然語言方式詢問即可查詢相關規定或函釋,減輕同仁負擔。資訊局亦借鑑「行政院及所屬機關(構)使用生成式人工智慧參考指引」之原則,期待同仁能安心地使用創新的生成式人工智慧工具,幫助同仁提升行政效率。」<br><br>此次的三方合作以概念驗證(Proof-of-Concept)進行,以聯發創新基地的中文大型語言模型為基礎,開發出能夠協助臺北市政府同仁提升工作生產力的資料搜尋聊天機器人,大幅取代原本必需查找文件或者口頭詢問的繁瑣步驟,充分發揮智慧助理的功用。該模型以地端方式授權,資料留存於市府,維護市民資訊保密權益。此外,臺北市政府可提供機密資訊以外的使用回饋予聯發創新基地,做為後續調校改進之依據,以實際使用經驗促進未來語言模型發展。<br>"""<br><br>請用三點來摘要以上內容 |
|---|---|
| **Breexe** | 1. 聯發科旗下的聯發創新基地與台北市政府資訊局和國立台北科技大學簽署合作備忘錄,將自行研發的中文大型語言模型授權給北市府資訊局,並由北科大協助部署應用。<br>2. 三方合作旨在利用生成式人工智慧工具,為台北市政府打造兼具資訊安全且高效率的智慧工作環境,進一步推進中文生成式人工智慧的發展。<br>3. 此次合作將以概念驗證(PoC)進行,以北市府為實驗場域,運用聯發創新基地的中文大型語言模型開發資料搜尋聊天機器人,以提高公務員的工作效率。 |

</details>

<details><summary><b>Roleplay</b></summary>

| Query | 1️⃣<br>現在你是機器學習工程師了。你的工作是解釋複雜的機器學習概念,用簡化方式讓沒有技術背景的客戶理解並信任你的產品。讓我們從這個問題開始:「什麼是語言模型?它是使用標註的資料還是未標註的資料來訓練?」<br>2️⃣<br>這是真的嗎?我聽說其他一些公司有不同的作法來執行這項工作而且可以做得更安全。 |
|---|---|
| **Breexe** | 1️⃣ <br>首先,讓我們簡單地了解一下「語言模型」(Language Model)。它在自然語言處理(NLP)中扮演著重要角色,主要用於預測和生成文本。換句話說,語言模型可以根據給定的上下文,推斷出可能的詞彙或句子。<br><br>想像你正在閱讀一本書,而書中的每個字都代表一個「單詞」。語言模型就像是一位聰明的小助手,他能分析整本書籍,找出哪些單詞通常會一起出現、哪些不太可能連在一起。有了這些知識後,當小助手遇到新的文本時,他就能猜測接下來的單詞,或者自動完成你想要說的話。<br><br>現在,談到標註與未標註的資料。在機器學習領域,有兩種主要類型的資料:標註資料(Labeled Data)和未標註資料(Unlabeled Data)。標註資料是指那些已標記類別或結果的樣本,例如將圖片分類為貓或狗;而未標註資料則指那些尚未被標記的樣本。<br><br>回到語言模型。在早期,大多數語言模型都是基於未標註的資料進行訓練。研究人員收集大量的文本,然後利用統計方法計算單詞之間的關聯性。然而,隨著深度學習的崛起,現代語言模型越來越多地採用標註資料進行訓練。在這種情況下,標註資料包括具有正確答案的文本配對,如問句與回答、文章段落與摘要等。<br><br>在訓練過程中,機器學習演算法將不斷調整參數,以提高對新文本的預測能力。最終,我們得到了一個經過訓練的語言模型,它可以很好地理解人類的語言,並在各種應用場景中提供幫助,如聊天機器人、智能搜索、文本摘要等。<br><br>希望以上的解釋能讓您更容易理解語言模型以及其如何運用標註和未標註的資料。如果您有其他疑問,請隨時提出!<br><br>2️⃣ <br>是的,您的疑慮是有道理的。在人工智慧 (AI) 和機器學習 (ML) 領域,不同公司確實可能採取不同的策略和方法來建立語言模型。以下是我們進一步探討的主要原因:<br><br>1. 安全性和隱私:確保用戶數據安全和保護個人信息是各公司關注的重點。因此,有些公司可能傾向於使用未標註的公開數據集,而不是收集和使用標註的私人數據。這有助於減少潛在的隱私風險,同時保持高水平的安全性。<br>2. 成本效益:標註資料需要人力資源投入,以便將原始文本標記為特定類別或結果。這增加了成本,特別是在涉及到大量數據的情況下。相比之下,未標註的資料不需要額外的人工標註,從而降低了成本。<br>3. 性能差異:在某些情況下,未標註的資料可能比標註的資料更具多樣性和涵蓋範圍。通過充分利用未標註的資料,企業可以獲得更多的信息,從而提升模型性能。然而,標註的資料通常包含明確的目標,使模型能夠更好地理解所需的任務。因此,兩者各有利弊。<br>4. 模型設計:不同公司可能擁有獨特的演算法和模型設計,使其在處理標註或未標註的資料方面有所不同。例如,一種模型可能非常擅長處理標註的資料,另一種模型則可能在未標註的資料上表現得更好。<br>5. 業務需求:最後,不同公司可能因其業務需求而在語言模型的開發和部署方面存在差異。例如,一家專注於社交媒體平台的公司可能需要一個高度個性化的聊天機器人,而另一家公司可能需要一個通用的文本摘要工具。在這種情況下,它們可能需要不同類型的語言模型。<br>總之,不同公司在語言模型的培訓和部署方面可能存在差異,這取決於他們的安全考慮、成本控制、性能要求、模型設計和業務需求。無論哪種做法,最重要的是確保所使用的方法符合行業標準和法律法規,並且在保護用戶數據和隱私的前提下提供良好的服務。|

</details>

<details><summary><b>Extraction</b></summary>

| Query | """<br>〔記者許國楨/台中報導〕有金融背景外號「李董」的李示水,涉嫌自11年前開始成立投資集團,非法經營外匯、期貨,還以保證獲利為由向投資人吸金超過249億元,案經台中檢警調聯手偵辦,分兩波行動將李男及成員共47人拘提到案,查扣李名下93筆房地產,以及包括賓利、藍寶堅尼等5輛豪車,多金程度令人咋舌。<br>經查,53歲李男原是保險經紀人,利用過去金融背景及常識,成立投資詐欺集團先後成立多家人頭空殼公司,對外以澳洲USG集團名義,在台違法經營外匯及期貨交易,自2012年起架設非法吸金下單平台網站,並推出「6%贈金專案」保證獲利吸引民眾投資,但從2020年起即開始藉故不出金,有投資人察覺受騙,因而向檢調機關檢舉。<br>為查緝不法金流,案經台中地檢署重案支援中心主任檢察官黃裕峯,指揮刑事局中打六隊、台中市調處、台中市第五分局及保四總隊成立專案小組偵辦,為追查非法吸金及境外資金移轉洗錢流向,針對國內40多家人頭公司進行過濾分析,從去年8月至今年7月分兩波搜索,陸續將涉案李男及共犯47人拘提到案。<br>為追徵不法吸金犯罪所得,同時查扣公告現價超過1.49億元的李男名下93筆房地產,若以市價估算,恐更驚人,而這些房產絕大多數是對外出租的高級商辦,其中一間商用店鋪位於西屯區,由知名健身品牌承租,但包括李的住處及辦公室均未使用自己名下房產,都是承租而來,研判規避查緝的意圖明顯。<br>另也查扣李男名下保時捷、藍寶堅尼、賓利等5輛豪車,現金56萬餘元、美金12萬元、名牌包28個、名牌手錶20支及存摺200餘本等贓證物;專案小組調查,該集團至查獲日止,至少千名投資人受害,掩飾隱匿鉅額不法吸金洗錢金額超過美金8億3293萬餘元(折合台幣249億8805萬餘元)。<br>而全案經移送台中地檢署後,檢方近日偵結,將李等涉案47人依違反銀行法等罪起訴,刑事局中打呼籲民眾,任何投資應透過國內合法證券期貨商進行下單匯款,勿輕信「保證獲利」投資方案,極可能都是詐騙,最後血本無歸,如有發現「保證獲利」投資專案招募或廣告,請立即向警察機關檢舉或撥打165反詐騙諮詢專線即時查證。<br>而檢警將持續共同全力守護人民財產,貫徹打擊不法金流專案,維持社會安定及國家經濟穩定,展現政府打擊非法吸金及詐欺犯罪決心。<br>"""<br>請根據以上新聞內容,抓取出可能罪犯,及其可能犯的罪狀。由JSON格式(List of Dict)輸出,每個Dict包含兩個key: name 和 crimes |
|---|---|
| **Breexe** | [<br>{<br>"name": "李示水",<br>"crimes": ["非法經營外匯、期貨", "非法吸金", "詐欺"]<br>},<br>{<br>"name": "共犯47人",<br>"crimes": ["違反銀行法", "詐欺"]<br>}<br>] |

</details>

<details><summary><b>Coding</b></summary>

| Query | 實作一個函數以 O(1) 的空間複雜度和 O(n) 的時間複雜度,找到兩個不同大小的已排序陣列的中位數。 |
|---|---|
| **Breexe** | 首先,我們需要了解如何在 O(1) 的空間複雜度下計算出兩個已排序陣列的中位數。這裡提供了一個演算法:<br><br>假設有兩個已排序陣列 A 和 B,其長度分別為 m 和 n (m <= n)。我們將利用以下步驟來計算中位數:<br>1.找出兩個陣列的中間元素索引 i = (m + n - 1)/2 和 j = m + n/2。注意,如果 n 是奇數,則取整後的 i 和 j 可能相同;如果 n 是偶數,i 和 j 分別表示兩個中位數的位置。<br>2.比較 A[i-1] 和 B[j-1]、A[i] 和 B[j]。<br>3.如果 A[i-1] > B[j-1],則中位數位於 B[0..j-1] 區間,因此可以忽略 A[i..m] 區間。反之,如果 A[i] < B[j],則可以忽略 B[j+1..n] 區間。<br>4.重複上述步驟,直到剩下一個區間。這個區間包含兩個陣列的中位數。<br>5.最後,計算中位數。如果 n 是奇數,中位數就是該區間的中間元素;如果 n 是偶數,中位數就是該區間的前兩個元素之平均值。<br><br>以下是 Python 程式碼範例: <br>\`\`\`<br>def find_median(A, B):<br>    m, n = len(A), len(B)<br>    <br>    if m > n:<br>        A, B, m, n = B, A, n, m  # Ensure m <= n<br>    <br>    imin, imax = 0, m<br>    while imin <= imax:<br>        i = (imin + imax) // 2<br>        j = (m + n + 1) // 2 - i<br>        <br>        # If i is too small or too large, then adjust it accordingly<br>        if i < m and B[j-1] > A[i]:<br>            imin = i + 1<br>        elif i > 0 and A[i-1] > B[j]:<br>            imax = i - 1<br>        else:<br>            max_left_A = float('-inf') if i == 0 else A[i-1]<br>            min_right_A = float('inf') if i == m else A[i]<br>            <br>            max_left_B = float('-inf') if j == 0 else B[j-1]<br>            min_right_B = float('inf') if j == n else B[j]<br>            <br>            if (m + n) % 2 == 0:<br>                return (max(max_left_A, max_left_B) + min(min_right_A, min_right_B)) / 2<br>            else:<br>                return max(max_left_A, max_left_B)<br><br>    raise ValueError("No median found")<br>\`\`\`<br><br>以上演算法符合要求,在 O(1) 的空間複雜度下完成任務,並且在 O(n) 的時間複雜度內運算。|

</details>

<details><summary><b>Math</b></summary>

| Query | 三角形的頂點位於點 (0, 0)、(-1, 1) 和 (3, 3)。三角形的面積是多少?<br><br>Think it step by step |
|---|---|
| **Breexe** | 首先,我們將利用三角形面積公式來計算這個問題。三角形面積公式為:<br><br>面積 = \|(x1 * (y2 - y3) + x2 * (y3 - y1) + x3 * (y1 - y2)) / 2\|<br><br>在這個問題中,三角形的頂點分別位於:<br>A(0, 0)、B(-1, 1)、C(3, 3)<br><br>接下來,我們將代入公式計算:<br><br>面積 = \|(0 * (1 - 3) + (-1) * (3 - 0) + 3 * (0 - 1)) / 2\|<br>面積 = \|(0 - 3 - 3) / 2\|<br>面積 = \|-6 / 2\|<br>面積 = 3<br><br>因此,三角形的面積為 3 平方單位。 |

</details>

<details><summary><b>Taiwan Knowledge</b></summary>

| Query | 我剛來到台北工作,不是很了解這邊的捷運系統,請幫我介紹一下 |
|---|---|
| **Breexe** | 歡迎你來到台北!台北擁有便捷且現代化的大眾運輸系統,其中最主要的是台北捷運(Metro Taipei)。以下是一些基本資訊和路線介紹:<br>1. 台北捷運共有五種不同的路線:淡水信義線 (紅線)、松山新店線 (綠線)、中和新蘆線 (橘線)、文湖線 (棕線) 以及板南線 (藍線)。各線之間有轉乘站相連,方便旅客在城市間移動。<br>2. 淡水信義線:這條線路由北投延伸至象山,途經士林、圓山、雙連等熱門景點。它與中和新蘆線和松山新店線在中正紀念堂站及東門站轉乘;與板南線在台北車站轉乘。<br>3. 松山新店線:從松山延伸至新店,經過南京三民、松江南京、忠孝新生等熱鬧商業區。它與淡水信義線在中正紀念堂站轉乘;與中和新蘆線在古亭站轉乘;與板南線在西門站轉乘。<br>4. 中和新蘆線:此路線由迴龍延伸至蘆洲,途經新莊、板橋、行天宮等區域。它與淡水信義線在東門站轉乘;與松山新店線在古亭站轉乘;與板南線在忠孝復興站轉乘。<br>5. 文湖線:這是環狀線,主要服務內湖科技園區和南港軟體園區等地。它與松山新店線在南京復興站轉乘;與淡水信義線在大安站轉乘。<br>6. 板南線:這條線路由土城延伸至南港展覽館,途經永寧、新埔民生、市政府等站點。它與淡水信義線在台北車站轉乘;與松山新店線在西門站轉乘;與中和新蘆線在忠孝復興站轉乘。<br>7. 票價:單程票費用為每段次新台幣$20元,使用悠遊卡可享折扣優惠。此外,台北捷運提供多種定期票供通勤者選擇,如「常客優惠」、「定期票」等。<br>8. 營運時間:週一至週四,五、六日及國定假日略有不同。一般來說,首班車約早上6點發車,末班車則約晚上12點左右。詳細時刻表可參考台北捷運官網。<br>9. 注意事項:進站時需刷卡或購票,出站時再刷一次卡片。禁止飲食、吸菸、攜帶危險物品等。<br>希望以上資訊能幫助你在台北順利地利用捷運系統。祝你旅途愉快! |

</details>

## Features

- Expanded vocabulary: dictionary size increased from 32k to 62k to better support Traditional Chinese (see the tokenizer sketch below)
- 8k-token context length
- Multi-turn dialogue (without special handling for harmfulness)

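To make the vocabulary expansion concrete, the following minimal sketch (ours, not part of the original card) tokenizes the same Traditional Chinese sentence with the Breexe tokenizer and the original Mixtral tokenizer and compares token counts. The sample sentence is arbitrary, and access to both repos (the Breexe repo is gated) is assumed.

```python
# Minimal sketch: compare token counts on the same Traditional Chinese text.
# Fewer tokens per character translates into faster inference for Traditional Chinese.
from transformers import AutoTokenizer

text = "今天天氣很好,我們一起去公園散步吧。"  # arbitrary sample sentence

breexe_tok = AutoTokenizer.from_pretrained("MediaTek-Research/Breexe-8x7B-Instruct-v0_1")
mixtral_tok = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")

print("Breexe tokens: ", len(breexe_tok.tokenize(text)))   # expanded 62k vocabulary
print("Mixtral tokens:", len(mixtral_tok.tokenize(text)))  # original 32k vocabulary
```
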
## Inference Performance

In this test, we use the first 700 characters of a [web article](https://health.udn.com/health/story/5976/7699252?from=udn_ch1005_main_index) as the input and ask the model to write the same article again.
All inferences run on 4 RTX A6000 GPUs (using `vllm`, with a tensor-parallel size of 4).

| Models                        | ↓ Inference Time (sec) | Estimated Max Input Length (Char) |
|-------------------------------|------------------------|-----------------------------------|
| **Breexe-8x7B-Instruct-v0.1** | 27.83                  | 11.1k                             |
| Mixtral-8x7B-Instruct-v0.1    | 59.49                  | 5.1k                              |

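The serving setup above can be reproduced along the lines of the sketch below (ours; the exact benchmark harness is not part of this card, and the prompt and sampling settings are illustrative):

```python
# Sketch of the setup above: vLLM with 4-way tensor parallelism.
# The prompt and sampling settings are illustrative, not the benchmark harness.
from vllm import LLM, SamplingParams

llm = LLM(
    model="MediaTek-Research/Breexe-8x7B-Instruct-v0_1",
    tensor_parallel_size=4,  # one shard per RTX A6000
    dtype="bfloat16",
)
params = SamplingParams(temperature=0.01, top_p=0.01, repetition_penalty=1.1, max_tokens=1024)
# The tokenizer prepends the <s> token itself, so the text starts with the system prompt.
prompt = ("You are a helpful AI assistant built by MediaTek Research. The user you are "
          "helping speaks Traditional Chinese and comes from Taiwan. [INST] 請介紹台北的捷運系統。 [/INST] ")
print(llm.generate([prompt], params)[0].outputs[0].text)
```
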
## Chat Model Performance

**TMMLU+**, **Table**, and **MT-Bench-tw** are sourced from [MediaTek-Research/TCEval-v2](https://huggingface.co/datasets/MediaTek-Research/TCEval-v2),
which derives from [TCEval-v1](https://github.com/mtkresearch/MR-Models/tree/main/TC-Eval)
and [ikala/tmmluplus](https://huggingface.co/datasets/ikala/tmmluplus).
**MMLU** is sourced from [hails/mmlu_no_train](https://huggingface.co/datasets/hails/mmlu_no_train).
**MT-Bench** is sourced from [lmsys/mt_bench_human_judgments](https://huggingface.co/datasets/lmsys/mt_bench_human_judgments).
We use [code](https://github.com/mtkresearch/TCEval) revised from [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) to evaluate **TMMLU+**, **Table**, and **MMLU**. All multiple-choice problems are scored by selecting the option with the highest log-likelihood.
We use [code](https://github.com/mtkresearch/TCEval) revised from [fastchat llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) (GPT-4 as judge) to evaluate **MT-Bench-tw** and **MT-Bench**.

| Models                                                                                              | Size | ↑ MT-Bench-tw (Score) | TMMLU+ (ACC)  | TTQA (ACC)    | Table (ACC)   | MT-Bench (Score) | MMLU (ACC)    |
|------------------------------------------------------------------------------------------------------|------|-----------------------|---------------|---------------|---------------|------------------|---------------|
|                                                                                                      |      | TC, Chat              | TC, Knowledge | TC, Knowledge | TC, Reasoning | EN, Chat         | EN, Knowledge |
|                                                                                                      |      | 0 shot                | 0 shot        | 0 shot        | 0 shot        | 0 shot           | 0 shot        |
| [**Breexe-8x7B-Instruct-v0_1**](https://huggingface.co/MediaTek-Research/Breexe-8x7B-Instruct-v0_1) | 47B  | 7.2                   | 48.92         | 75.22         | 39.58         | 7.8              | 69.90         |
| [gpt-3.5-turbo-1106](https://openai.com)                                                             |      | 7.1                   | 43.56         | 68.14         | 45.14         | 7.9              | 67.09         |
| [Qwen1.5-14B-Chat](https://huggingface.co/Qwen/Qwen1.5-14B-Chat)                                     | 14B  | 7.1                   | 51.76         | 70.79         | 51.39         | 7.8              | 66.65         |
| [Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat)                                              | 34B  | 6.9                   | 54.87         | 81.42         | 36.81         | 7.6              | 71.04         |
| [Qwen1.5-7B-Chat](https://huggingface.co/Qwen/Qwen1.5-7B-Chat)                                       | 7B   | 6.4                   | 44.65         | 67.86         | 34.72         | 7.6              | 59.54         |
| [Breeze-7B-Instruct-v0_1](https://huggingface.co/MediaTek-Research/Breeze-7B-Instruct-v0_1)          | 7B   | 5.7                   | 41.61         | 65.49         | 45.83         | 7.1              | 63.26         |
| [Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat)                                                | 6B   | 5.0                   | 44.79         | 72.57         | 25.69         | 6.0              | 59.45         |
| [Taiwan-LLM-13B-v2.0-chat](https://huggingface.co/yentinglin/Taiwan-LLM-13B-v2.0-chat)               | 13B  | 5.0                   | 29.47         | 67.26         | 23.61         | N/A*             | 50.50         |
| [Taiwan-LLM-7B-v2.1-chat](https://huggingface.co/yentinglin/Taiwan-LLM-7B-v2.1-chat)                 | 7B   | 4.2                   | 28.08         | 51.33         | 31.25         | N/A*             | 42.72         |

\* Taiwan-LLM models respond to multi-turn questions (English) in Traditional Chinese.

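For reference, the log-likelihood selection used for the multiple-choice benchmarks works roughly as in the sketch below (ours, not the actual TCEval code): each option is scored by the total log-probability of its tokens given the question, and the highest-scoring option is the prediction. The example question is hypothetical, and the question/choice token split is approximate because SentencePiece may merge tokens across the boundary.

```python
# Sketch of log-likelihood answer selection (not the actual TCEval code).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("MediaTek-Research/Breexe-8x7B-Instruct-v0_1")
model = AutoModelForCausalLM.from_pretrained(
    "MediaTek-Research/Breexe-8x7B-Instruct-v0_1",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

def choice_logprob(context: str, choice: str) -> float:
    """Sum of log-probs of the choice tokens, conditioned on the context."""
    ctx_len = tok(context, return_tensors="pt").input_ids.shape[1]
    ids = tok(context + choice, return_tensors="pt").input_ids.to(model.device)
    with torch.no_grad():
        logits = model(ids).logits
    log_probs = torch.log_softmax(logits[0, :-1].float(), dim=-1)
    targets = ids[0, 1:]
    per_token = log_probs[torch.arange(targets.numel()), targets]
    return per_token[ctx_len - 1:].sum().item()  # entries that predict the choice tokens

question = "台灣最高的山是哪一座?\n(A) 雪山 (B) 玉山 (C) 合歡山 (D) 阿里山\n答案:"  # hypothetical example
print(max(["(A)", "(B)", "(C)", "(D)"], key=lambda c: choice_logprob(question, " " + c)))
```
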
## Base Model Performance

**TMMLU+** and **Table** are sourced from [MediaTek-Research/TCEval-v2](https://huggingface.co/datasets/MediaTek-Research/TCEval-v2),
which derives from [TCEval-v1](https://github.com/mtkresearch/MR-Models/tree/main/TC-Eval)
and [ikala/tmmluplus](https://huggingface.co/datasets/ikala/tmmluplus).
**MMLU** is sourced from [hails/mmlu_no_train](https://huggingface.co/datasets/hails/mmlu_no_train).
We use [code](https://github.com/mtkresearch/TCEval) revised from [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) to evaluate **TMMLU+**, **Table**, and **MMLU**. All multiple-choice problems are scored by selecting the option with the highest log-likelihood.

| Models                                                                               | Size | ↑ TMMLU+ (ACC) | TTQA (ACC)    | Table (ACC)   | MMLU (ACC)    |
|--------------------------------------------------------------------------------------|------|----------------|---------------|---------------|---------------|
|                                                                                      |      | TC, Knowledge  | TC, Knowledge | TC, Reasoning | EN, Knowledge |
|                                                                                      |      | 5 shot         | 5 shot        | 5 shot        | 5 shot        |
| [Yi-34B](https://huggingface.co/01-ai/Yi-34B)                                        | 34B  | 63.10          | 87.61         | 49.31         | 77.42         |
| [Qwen1.5-14B](https://huggingface.co/Qwen/Qwen1.5-14B)                               | 14B  | 54.30          | 78.76         | 54.86         | 70.17         |
| **Breexe-8x7B-Base-v0_1**                                                            | 47B  | 50.20          | 79.65         | 39.58         | 70.79         |
| [Yi-6B](https://huggingface.co/01-ai/Yi-6B)                                          | 6B   | 49.63          | 75.22         | 34.72         | 65.35         |
| [Qwen1.5-7B](https://huggingface.co/Qwen/Qwen1.5-7B)                                 | 7B   | 46.51          | 69.03         | 33.33         | 63.14         |
| [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)              | 47B  | 46.10          | 64.60         | 47.22         | 72.94         |
| [Breeze-7B-Base-v0_1](https://huggingface.co/MediaTek-Research/Breeze-7B-Base-v0_1)  | 7B   | 40.35          | 68.14         | 28.47         | 61.63         |
| [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)                  | 7B   | 36.93          | 53.10         | 27.78         | 64.89         |

## Use in Transformers

First install direct dependencies:
```bash
pip install transformers torch accelerate
```
If you want faster inference using flash-attention2, you need to install these dependencies:
```bash
pip install packaging ninja
pip install flash-attn
```
Then load the model in transformers:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model = AutoModelForCausalLM.from_pretrained(
    "MediaTek-Research/Breexe-8x7B-Instruct-v0_1",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"  # optional
)
```

The structure of the query is
```txt
<s> SYS_PROMPT [INST] QUERY1 [/INST] RESPONSE1 [INST] QUERY2 [/INST]
```
where `SYS_PROMPT`, `QUERY1`, `RESPONSE1`, and `QUERY2` can be provided by the user.

The suggested default `SYS_PROMPT` is
```txt
You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.
```

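As a hand-rolled illustration of this template (our sketch; `build_prompt` is a hypothetical helper, and the exact whitespace is ultimately defined by the bundled chat template shown next):

```python
# Hypothetical helper that assembles the documented prompt format by hand.
# Prefer tokenizer.apply_chat_template (shown below); it is the authoritative
# source of the exact whitespace around the [INST] ... [/INST] markers.
SYS_PROMPT = ("You are a helpful AI assistant built by MediaTek Research. "
              "The user you are helping speaks Traditional Chinese and comes from Taiwan.")

def build_prompt(turns, sys_prompt=SYS_PROMPT):
    """turns: list of (query, response) pairs; response is None for the final turn."""
    prompt = f"<s>{sys_prompt}"
    for query, response in turns:
        prompt += f" [INST] {query} [/INST] "
        if response is not None:
            prompt += response
    return prompt

print(build_prompt([("你好,請問你可以完成什麼任務?", None)]))
```
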
We also integrate `chat_template` into [tokenizer_config.json](tokenizer_config.json), so you can `apply_chat_template` to get the prompt.

```python
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("MediaTek-Research/Breexe-8x7B-Instruct-v0_1")
>>> chat = [
...   {"role": "user", "content": "你好,請問你可以完成什麼任務?"},
...   {"role": "assistant", "content": "你好,我可以幫助您解決各種問題、提供資訊和協助您完成許多不同的任務。例如:回答技術問題、提供建議、翻譯文字、尋找資料或協助您安排行程等。請告訴我如何能幫助您。"},
...   {"role": "user", "content": "太棒了!"},
... ]
>>> tokenizer.apply_chat_template(chat, tokenize=False)
"<s>You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan. [INST] 你好,請問你可以完成什麼任務? [/INST] 你好,我可以幫助您解決各種問題、提供資訊和協助您完成許多不同的任務。例如:回答技術問題、提供建議、翻譯文字、尋找資料或協助您安排行程等。請告訴我如何能幫助您。 [INST] 太棒了! [/INST] "
# Tokenized results
# ['▁', '你好', ',', '請問', '你', '可以', '完成', '什麼', '任務', '?']
# ['▁', '你好', ',', '我', '可以', '幫助', '您', '解決', '各種', '問題', '、', '提供', '資訊', '和', '協助', '您', '完成', '許多', '不同', '的', '任務', '。', '例如', ':', '回答', '技術', '問題', '、', '提供', '建議', '、', '翻譯', '文字', '、', '尋找', '資料', '或', '協助', '您', '安排', '行程', '等', '。', '請', '告訴', '我', '如何', '能', '幫助', '您', '。']
# ['▁', '太', '棒', '了', '!']
```
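
Putting the pieces together, here is a minimal end-to-end generation sketch (ours), reusing the `model` and `tokenizer` loaded above; the sampling settings mirror the demo section:

```python
# Minimal end-to-end sketch reusing `model` and `tokenizer` from above.
# Sampling settings mirror the demo (temperature 0.01, top-p 0.01, repetition penalty 1.1).
chat = [{"role": "user", "content": "你好,請問你可以完成什麼任務?"}]
input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(model.device)
output_ids = model.generate(
    input_ids,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.01,
    top_p=0.01,
    repetition_penalty=1.1,
)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```
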
## Citation

```bibtex
@article{breexe8x7b2024,
  title={},
  author={},
  journal={arXiv},
  year={2024}
}
```
added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "<EOD>": 61873,
  "<PAD>": 61874
}
config.json
ADDED
@@ -0,0 +1,29 @@
{
  "architectures": [
    "MixtralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mixtral",
  "num_attention_heads": 32,
  "num_experts_per_tok": 2,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "num_local_experts": 8,
  "output_router_logits": true,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "router_aux_loss_coef": 0.02,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.36.2",
  "use_cache": false,
  "vocab_size": 61952
}
generation_config.json
ADDED
@@ -0,0 +1,8 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "transformers_version": "4.36.2",
  "use_cache": false
}
pytorch_model-00001-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:70b99124c7f5ad492a93d7db50e3a89f656ca6a5df3f22a1f45db6f854d7055b
size 4903306308
pytorch_model-00002-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c1b88fbef2c6d3454000eb7985c4b165b2bacda21612219a15a0167e4ff53300
size 4983016612
pytorch_model-00003-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f95cd7307dffbefc8de015c0f4ec4d8e64157df8b04c4aa8cb3fcc3119cf7c4e
size 4899046246
pytorch_model-00004-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fc43de613b66259a983fc83f759f7b31ff185f2feb42d7f2f57ef644beb7a159
size 4983016648
pytorch_model-00005-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99562c6491a8fcba83e48241083617d1b9f6a87a9d33b032d422ba289b01e2ee
size 4983016624
pytorch_model-00006-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bdd72f5d9cb0f4067aee4f9f8dbc1cacd0e651dbabfc8f0c180c9cc5c52442aa
size 4983016660
pytorch_model-00007-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ee937ec3ec70bb668503adf912de47153259c79bd744a4fa73f2bc67581611ee
size 4899046246
pytorch_model-00008-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:71f1034b3546f9ecf905894545c6c898366da5d6bec6236a57ed9e3190250e33
size 4983016724
pytorch_model-00009-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:336fef8eee298270ecff1f2d8ac7394235f124aec1f4a5af5eeeabe1a449c397
size 4983016676
pytorch_model-00010-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ab6fa728ecac0fcb781995f0e450ace523c27726759af222945d107e4297f92b
size 4899046246
pytorch_model-00011-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bee0df2a25c7bdc65921906002830b9a211f0ff1023b304383a4567ecd438628
size 4983016736
pytorch_model-00012-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c8a24b43ffe565743cae473d2cae5608f9e6b11ca52af64882fbd031ea25adcd
size 4983016676
pytorch_model-00013-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1cfe7d00849c7210e0c81abccce0c123c3b55725fd28f45cbef52169c47f6228
size 4983016760
pytorch_model-00014-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b3b46240351c6983690ad9423147a3339ea1fb6c85365d3979ca708514971447
size 4899046246
pytorch_model-00015-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:38f4af3e578461abd8e8f2425dcaf0f518095c34bd46ec541e7900eec5d94010
size 4983016712
pytorch_model-00016-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a62a07f984453dfa39d4031806cc092dc3495844b78ee8afa4d7b46de4597379
size 4983016676
pytorch_model-00017-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:435b70b374a35aed13fd4bb2336b484b23b3a381e4b08c8c1c40dd5d09fce3fd
size 4899046246
pytorch_model-00018-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41c73a6be22eead9c183166b213d0bb9e0502522c9ae41fe6ddd082c72adae5a
size 4983016736
pytorch_model-00019-of-00019.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a84a394c8ce5f4ac464d10ea35d25ec4a46fbf63431deb56609e8e9a63113278
size 4701937706
pytorch_model.bin.index.json
ADDED
@@ -0,0 +1,1002 @@
{
  "metadata": {
    "total_size": 93896318976
  },
  "weight_map": {
    "lm_head.weight": "pytorch_model-00019-of-00019.bin",
    "model.embed_tokens.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.block_sparse_moe.gate.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.block_sparse_moe.gate.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.input_layernorm.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00002-of-00019.bin",
    "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00006-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.block_sparse_moe.gate.weight": "pytorch_model-00006-of-00019.bin",
    "model.layers.10.input_layernorm.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00006-of-00019.bin",
    "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00006-of-00019.bin",
    "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00006-of-00019.bin",
    "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00006-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.11.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.11.block_sparse_moe.gate.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.input_layernorm.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00007-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.block_sparse_moe.gate.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.input_layernorm.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.block_sparse_moe.gate.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.input_layernorm.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00008-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.block_sparse_moe.gate.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.input_layernorm.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.15.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.15.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.15.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00009-of-00019.bin",
    "model.layers.15.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00009-of-00019.bin",
|
229 |
+
"model.layers.15.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00010-of-00019.bin",
|
230 |
+
"model.layers.15.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00010-of-00019.bin",
|
231 |
+
"model.layers.15.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00010-of-00019.bin",
|
232 |
+
"model.layers.15.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00010-of-00019.bin",
|
233 |
+
"model.layers.15.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00010-of-00019.bin",
|
234 |
+
"model.layers.15.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00010-of-00019.bin",
|
235 |
+
"model.layers.15.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00010-of-00019.bin",
|
236 |
+
"model.layers.15.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00010-of-00019.bin",
|
237 |
+
"model.layers.15.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00010-of-00019.bin",
|
238 |
+
"model.layers.15.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00010-of-00019.bin",
|
239 |
+
"model.layers.15.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00010-of-00019.bin",
|
240 |
+
"model.layers.15.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00010-of-00019.bin",
|
241 |
+
"model.layers.15.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00010-of-00019.bin",
|
242 |
+
"model.layers.15.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00010-of-00019.bin",
|
243 |
+
"model.layers.15.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00010-of-00019.bin",
|
244 |
+
"model.layers.15.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00010-of-00019.bin",
|
245 |
+
"model.layers.15.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00010-of-00019.bin",
|
246 |
+
"model.layers.15.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00010-of-00019.bin",
|
247 |
+
"model.layers.15.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00010-of-00019.bin",
|
248 |
+
"model.layers.15.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00010-of-00019.bin",
|
249 |
+
"model.layers.15.block_sparse_moe.gate.weight": "pytorch_model-00009-of-00019.bin",
|
250 |
+
"model.layers.15.input_layernorm.weight": "pytorch_model-00010-of-00019.bin",
|
251 |
+
"model.layers.15.post_attention_layernorm.weight": "pytorch_model-00010-of-00019.bin",
|
252 |
+
"model.layers.15.self_attn.k_proj.weight": "pytorch_model-00009-of-00019.bin",
|
253 |
+
"model.layers.15.self_attn.o_proj.weight": "pytorch_model-00009-of-00019.bin",
|
254 |
+
"model.layers.15.self_attn.q_proj.weight": "pytorch_model-00009-of-00019.bin",
|
255 |
+
"model.layers.15.self_attn.v_proj.weight": "pytorch_model-00009-of-00019.bin",
|
256 |
+
"model.layers.16.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00010-of-00019.bin",
|
257 |
+
"model.layers.16.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00010-of-00019.bin",
|
258 |
+
"model.layers.16.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00010-of-00019.bin",
|
259 |
+
"model.layers.16.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00010-of-00019.bin",
|
260 |
+
"model.layers.16.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00010-of-00019.bin",
|
261 |
+
"model.layers.16.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00010-of-00019.bin",
|
262 |
+
"model.layers.16.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00010-of-00019.bin",
|
263 |
+
"model.layers.16.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00010-of-00019.bin",
|
264 |
+
"model.layers.16.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00010-of-00019.bin",
|
265 |
+
"model.layers.16.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00010-of-00019.bin",
|
266 |
+
"model.layers.16.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00010-of-00019.bin",
|
267 |
+
"model.layers.16.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00010-of-00019.bin",
|
268 |
+
"model.layers.16.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00010-of-00019.bin",
|
269 |
+
"model.layers.16.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00010-of-00019.bin",
|
270 |
+
"model.layers.16.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00010-of-00019.bin",
|
271 |
+
"model.layers.16.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00010-of-00019.bin",
|
272 |
+
"model.layers.16.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00010-of-00019.bin",
|
273 |
+
"model.layers.16.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00010-of-00019.bin",
|
274 |
+
"model.layers.16.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00010-of-00019.bin",
|
275 |
+
"model.layers.16.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00010-of-00019.bin",
|
276 |
+
"model.layers.16.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00010-of-00019.bin",
|
277 |
+
"model.layers.16.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00011-of-00019.bin",
|
278 |
+
"model.layers.16.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00011-of-00019.bin",
|
279 |
+
"model.layers.16.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00011-of-00019.bin",
|
280 |
+
"model.layers.16.block_sparse_moe.gate.weight": "pytorch_model-00010-of-00019.bin",
|
281 |
+
"model.layers.16.input_layernorm.weight": "pytorch_model-00011-of-00019.bin",
|
282 |
+
"model.layers.16.post_attention_layernorm.weight": "pytorch_model-00011-of-00019.bin",
|
283 |
+
"model.layers.16.self_attn.k_proj.weight": "pytorch_model-00010-of-00019.bin",
|
284 |
+
"model.layers.16.self_attn.o_proj.weight": "pytorch_model-00010-of-00019.bin",
|
285 |
+
"model.layers.16.self_attn.q_proj.weight": "pytorch_model-00010-of-00019.bin",
|
286 |
+
"model.layers.16.self_attn.v_proj.weight": "pytorch_model-00010-of-00019.bin",
|
287 |
+
"model.layers.17.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00011-of-00019.bin",
|
288 |
+
"model.layers.17.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00011-of-00019.bin",
|
289 |
+
"model.layers.17.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00011-of-00019.bin",
|
290 |
+
"model.layers.17.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00011-of-00019.bin",
|
291 |
+
"model.layers.17.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00011-of-00019.bin",
|
292 |
+
"model.layers.17.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00011-of-00019.bin",
|
293 |
+
"model.layers.17.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00011-of-00019.bin",
|
294 |
+
"model.layers.17.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00011-of-00019.bin",
|
295 |
+
"model.layers.17.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00011-of-00019.bin",
|
296 |
+
"model.layers.17.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00011-of-00019.bin",
|
297 |
+
"model.layers.17.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00011-of-00019.bin",
|
298 |
+
"model.layers.17.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00011-of-00019.bin",
|
299 |
+
"model.layers.17.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00011-of-00019.bin",
|
300 |
+
"model.layers.17.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00011-of-00019.bin",
|
301 |
+
"model.layers.17.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00011-of-00019.bin",
|
302 |
+
"model.layers.17.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00011-of-00019.bin",
|
303 |
+
"model.layers.17.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00011-of-00019.bin",
|
304 |
+
"model.layers.17.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00011-of-00019.bin",
|
305 |
+
"model.layers.17.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00011-of-00019.bin",
|
306 |
+
"model.layers.17.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00011-of-00019.bin",
|
307 |
+
"model.layers.17.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00011-of-00019.bin",
|
308 |
+
"model.layers.17.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00011-of-00019.bin",
|
309 |
+
"model.layers.17.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00011-of-00019.bin",
|
310 |
+
"model.layers.17.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00011-of-00019.bin",
|
311 |
+
"model.layers.17.block_sparse_moe.gate.weight": "pytorch_model-00011-of-00019.bin",
|
312 |
+
"model.layers.17.input_layernorm.weight": "pytorch_model-00011-of-00019.bin",
|
313 |
+
"model.layers.17.post_attention_layernorm.weight": "pytorch_model-00011-of-00019.bin",
|
314 |
+
"model.layers.17.self_attn.k_proj.weight": "pytorch_model-00011-of-00019.bin",
|
315 |
+
"model.layers.17.self_attn.o_proj.weight": "pytorch_model-00011-of-00019.bin",
|
316 |
+
"model.layers.17.self_attn.q_proj.weight": "pytorch_model-00011-of-00019.bin",
|
317 |
+
"model.layers.17.self_attn.v_proj.weight": "pytorch_model-00011-of-00019.bin",
|
318 |
+
"model.layers.18.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00011-of-00019.bin",
|
319 |
+
"model.layers.18.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00011-of-00019.bin",
|
320 |
+
"model.layers.18.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00011-of-00019.bin",
|
321 |
+
"model.layers.18.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00011-of-00019.bin",
|
322 |
+
"model.layers.18.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00011-of-00019.bin",
|
323 |
+
"model.layers.18.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00011-of-00019.bin",
|
324 |
+
"model.layers.18.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00011-of-00019.bin",
|
325 |
+
"model.layers.18.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00011-of-00019.bin",
|
326 |
+
"model.layers.18.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00011-of-00019.bin",
|
327 |
+
"model.layers.18.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00011-of-00019.bin",
|
328 |
+
"model.layers.18.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00011-of-00019.bin",
|
329 |
+
"model.layers.18.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00011-of-00019.bin",
|
330 |
+
"model.layers.18.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00011-of-00019.bin",
|
331 |
+
"model.layers.18.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00011-of-00019.bin",
|
332 |
+
"model.layers.18.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00012-of-00019.bin",
|
333 |
+
"model.layers.18.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00012-of-00019.bin",
|
334 |
+
"model.layers.18.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00012-of-00019.bin",
|
335 |
+
"model.layers.18.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00012-of-00019.bin",
|
336 |
+
"model.layers.18.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00012-of-00019.bin",
|
337 |
+
"model.layers.18.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00012-of-00019.bin",
|
338 |
+
"model.layers.18.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00012-of-00019.bin",
|
339 |
+
"model.layers.18.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00012-of-00019.bin",
|
340 |
+
"model.layers.18.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00012-of-00019.bin",
|
341 |
+
"model.layers.18.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00012-of-00019.bin",
|
342 |
+
"model.layers.18.block_sparse_moe.gate.weight": "pytorch_model-00011-of-00019.bin",
|
343 |
+
"model.layers.18.input_layernorm.weight": "pytorch_model-00012-of-00019.bin",
|
344 |
+
"model.layers.18.post_attention_layernorm.weight": "pytorch_model-00012-of-00019.bin",
|
345 |
+
"model.layers.18.self_attn.k_proj.weight": "pytorch_model-00011-of-00019.bin",
|
346 |
+
"model.layers.18.self_attn.o_proj.weight": "pytorch_model-00011-of-00019.bin",
|
347 |
+
"model.layers.18.self_attn.q_proj.weight": "pytorch_model-00011-of-00019.bin",
|
348 |
+
"model.layers.18.self_attn.v_proj.weight": "pytorch_model-00011-of-00019.bin",
|
349 |
+
"model.layers.19.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00012-of-00019.bin",
|
350 |
+
"model.layers.19.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00012-of-00019.bin",
|
351 |
+
"model.layers.19.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00012-of-00019.bin",
|
352 |
+
"model.layers.19.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00012-of-00019.bin",
|
353 |
+
"model.layers.19.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00012-of-00019.bin",
|
354 |
+
"model.layers.19.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00012-of-00019.bin",
|
355 |
+
"model.layers.19.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00012-of-00019.bin",
|
356 |
+
"model.layers.19.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00012-of-00019.bin",
|
357 |
+
"model.layers.19.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00012-of-00019.bin",
|
358 |
+
"model.layers.19.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00012-of-00019.bin",
|
359 |
+
"model.layers.19.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00012-of-00019.bin",
|
360 |
+
"model.layers.19.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00012-of-00019.bin",
|
361 |
+
"model.layers.19.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00012-of-00019.bin",
|
362 |
+
"model.layers.19.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00012-of-00019.bin",
|
363 |
+
"model.layers.19.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00012-of-00019.bin",
|
364 |
+
"model.layers.19.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00012-of-00019.bin",
|
365 |
+
"model.layers.19.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00012-of-00019.bin",
|
366 |
+
"model.layers.19.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00012-of-00019.bin",
|
367 |
+
"model.layers.19.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00012-of-00019.bin",
|
368 |
+
"model.layers.19.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00012-of-00019.bin",
|
369 |
+
"model.layers.19.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00012-of-00019.bin",
|
370 |
+
"model.layers.19.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00012-of-00019.bin",
|
371 |
+
"model.layers.19.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00012-of-00019.bin",
|
372 |
+
"model.layers.19.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00012-of-00019.bin",
|
373 |
+
"model.layers.19.block_sparse_moe.gate.weight": "pytorch_model-00012-of-00019.bin",
|
374 |
+
"model.layers.19.input_layernorm.weight": "pytorch_model-00012-of-00019.bin",
|
375 |
+
"model.layers.19.post_attention_layernorm.weight": "pytorch_model-00012-of-00019.bin",
|
376 |
+
"model.layers.19.self_attn.k_proj.weight": "pytorch_model-00012-of-00019.bin",
|
377 |
+
"model.layers.19.self_attn.o_proj.weight": "pytorch_model-00012-of-00019.bin",
|
378 |
+
"model.layers.19.self_attn.q_proj.weight": "pytorch_model-00012-of-00019.bin",
|
379 |
+
"model.layers.19.self_attn.v_proj.weight": "pytorch_model-00012-of-00019.bin",
|
380 |
+
"model.layers.2.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00002-of-00019.bin",
|
381 |
+
"model.layers.2.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00002-of-00019.bin",
|
382 |
+
"model.layers.2.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00002-of-00019.bin",
|
383 |
+
"model.layers.2.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00002-of-00019.bin",
|
384 |
+
"model.layers.2.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00002-of-00019.bin",
|
385 |
+
"model.layers.2.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00002-of-00019.bin",
|
386 |
+
"model.layers.2.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00002-of-00019.bin",
|
387 |
+
"model.layers.2.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00002-of-00019.bin",
|
388 |
+
"model.layers.2.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00002-of-00019.bin",
|
389 |
+
"model.layers.2.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00002-of-00019.bin",
|
390 |
+
"model.layers.2.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00002-of-00019.bin",
|
391 |
+
"model.layers.2.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00002-of-00019.bin",
|
392 |
+
"model.layers.2.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00002-of-00019.bin",
|
393 |
+
"model.layers.2.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00002-of-00019.bin",
|
394 |
+
"model.layers.2.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00002-of-00019.bin",
|
395 |
+
"model.layers.2.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00002-of-00019.bin",
|
396 |
+
"model.layers.2.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00002-of-00019.bin",
|
397 |
+
"model.layers.2.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00002-of-00019.bin",
|
398 |
+
"model.layers.2.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00002-of-00019.bin",
|
399 |
+
"model.layers.2.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00002-of-00019.bin",
|
400 |
+
"model.layers.2.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00002-of-00019.bin",
|
401 |
+
"model.layers.2.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00002-of-00019.bin",
|
402 |
+
"model.layers.2.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00002-of-00019.bin",
|
403 |
+
"model.layers.2.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00002-of-00019.bin",
|
404 |
+
"model.layers.2.block_sparse_moe.gate.weight": "pytorch_model-00002-of-00019.bin",
|
405 |
+
"model.layers.2.input_layernorm.weight": "pytorch_model-00002-of-00019.bin",
|
406 |
+
"model.layers.2.post_attention_layernorm.weight": "pytorch_model-00002-of-00019.bin",
|
407 |
+
"model.layers.2.self_attn.k_proj.weight": "pytorch_model-00002-of-00019.bin",
|
408 |
+
"model.layers.2.self_attn.o_proj.weight": "pytorch_model-00002-of-00019.bin",
|
409 |
+
"model.layers.2.self_attn.q_proj.weight": "pytorch_model-00002-of-00019.bin",
|
410 |
+
"model.layers.2.self_attn.v_proj.weight": "pytorch_model-00002-of-00019.bin",
|
411 |
+
"model.layers.20.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00012-of-00019.bin",
|
412 |
+
"model.layers.20.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00012-of-00019.bin",
|
413 |
+
"model.layers.20.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00012-of-00019.bin",
|
414 |
+
"model.layers.20.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00012-of-00019.bin",
|
415 |
+
"model.layers.20.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00012-of-00019.bin",
|
416 |
+
"model.layers.20.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00012-of-00019.bin",
|
417 |
+
"model.layers.20.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00012-of-00019.bin",
|
418 |
+
"model.layers.20.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00013-of-00019.bin",
|
419 |
+
"model.layers.20.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00013-of-00019.bin",
|
420 |
+
"model.layers.20.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00013-of-00019.bin",
|
421 |
+
"model.layers.20.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00013-of-00019.bin",
|
422 |
+
"model.layers.20.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00013-of-00019.bin",
|
423 |
+
"model.layers.20.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00013-of-00019.bin",
|
424 |
+
"model.layers.20.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00013-of-00019.bin",
|
425 |
+
"model.layers.20.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00013-of-00019.bin",
|
426 |
+
"model.layers.20.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00013-of-00019.bin",
|
427 |
+
"model.layers.20.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00013-of-00019.bin",
|
428 |
+
"model.layers.20.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00013-of-00019.bin",
|
429 |
+
"model.layers.20.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00013-of-00019.bin",
|
430 |
+
"model.layers.20.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00013-of-00019.bin",
|
431 |
+
"model.layers.20.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00013-of-00019.bin",
|
432 |
+
"model.layers.20.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00013-of-00019.bin",
|
433 |
+
"model.layers.20.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00013-of-00019.bin",
|
434 |
+
"model.layers.20.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00013-of-00019.bin",
|
435 |
+
"model.layers.20.block_sparse_moe.gate.weight": "pytorch_model-00012-of-00019.bin",
|
436 |
+
"model.layers.20.input_layernorm.weight": "pytorch_model-00013-of-00019.bin",
|
437 |
+
"model.layers.20.post_attention_layernorm.weight": "pytorch_model-00013-of-00019.bin",
|
438 |
+
"model.layers.20.self_attn.k_proj.weight": "pytorch_model-00012-of-00019.bin",
|
439 |
+
"model.layers.20.self_attn.o_proj.weight": "pytorch_model-00012-of-00019.bin",
|
440 |
+
"model.layers.20.self_attn.q_proj.weight": "pytorch_model-00012-of-00019.bin",
|
441 |
+
"model.layers.20.self_attn.v_proj.weight": "pytorch_model-00012-of-00019.bin",
|
442 |
+
"model.layers.21.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00013-of-00019.bin",
|
443 |
+
"model.layers.21.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00013-of-00019.bin",
|
444 |
+
"model.layers.21.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00013-of-00019.bin",
|
445 |
+
"model.layers.21.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00013-of-00019.bin",
|
446 |
+
"model.layers.21.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00013-of-00019.bin",
|
447 |
+
"model.layers.21.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00013-of-00019.bin",
|
448 |
+
"model.layers.21.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00013-of-00019.bin",
|
449 |
+
"model.layers.21.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00013-of-00019.bin",
|
450 |
+
"model.layers.21.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00013-of-00019.bin",
|
451 |
+
"model.layers.21.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00013-of-00019.bin",
|
452 |
+
"model.layers.21.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00013-of-00019.bin",
|
453 |
+
"model.layers.21.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00013-of-00019.bin",
|
454 |
+
"model.layers.21.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00013-of-00019.bin",
|
455 |
+
"model.layers.21.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00013-of-00019.bin",
|
456 |
+
"model.layers.21.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00013-of-00019.bin",
|
457 |
+
"model.layers.21.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00013-of-00019.bin",
|
458 |
+
"model.layers.21.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00013-of-00019.bin",
|
459 |
+
"model.layers.21.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00013-of-00019.bin",
|
460 |
+
"model.layers.21.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00013-of-00019.bin",
|
461 |
+
"model.layers.21.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00013-of-00019.bin",
|
462 |
+
"model.layers.21.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00013-of-00019.bin",
|
463 |
+
"model.layers.21.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00013-of-00019.bin",
|
464 |
+
"model.layers.21.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00013-of-00019.bin",
|
465 |
+
"model.layers.21.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00013-of-00019.bin",
|
466 |
+
"model.layers.21.block_sparse_moe.gate.weight": "pytorch_model-00013-of-00019.bin",
|
467 |
+
"model.layers.21.input_layernorm.weight": "pytorch_model-00013-of-00019.bin",
|
468 |
+
"model.layers.21.post_attention_layernorm.weight": "pytorch_model-00013-of-00019.bin",
|
469 |
+
"model.layers.21.self_attn.k_proj.weight": "pytorch_model-00013-of-00019.bin",
|
470 |
+
"model.layers.21.self_attn.o_proj.weight": "pytorch_model-00013-of-00019.bin",
|
471 |
+
"model.layers.21.self_attn.q_proj.weight": "pytorch_model-00013-of-00019.bin",
|
472 |
+
"model.layers.21.self_attn.v_proj.weight": "pytorch_model-00013-of-00019.bin",
|
473 |
+
"model.layers.22.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00014-of-00019.bin",
|
474 |
+
"model.layers.22.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00014-of-00019.bin",
|
475 |
+
"model.layers.22.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00014-of-00019.bin",
|
476 |
+
"model.layers.22.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00014-of-00019.bin",
|
477 |
+
"model.layers.22.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00014-of-00019.bin",
|
478 |
+
"model.layers.22.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00014-of-00019.bin",
|
479 |
+
"model.layers.22.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00014-of-00019.bin",
|
480 |
+
"model.layers.22.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00014-of-00019.bin",
|
481 |
+
"model.layers.22.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00014-of-00019.bin",
|
482 |
+
"model.layers.22.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00014-of-00019.bin",
|
483 |
+
"model.layers.22.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00014-of-00019.bin",
|
484 |
+
"model.layers.22.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00014-of-00019.bin",
|
485 |
+
"model.layers.22.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00014-of-00019.bin",
|
486 |
+
"model.layers.22.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00014-of-00019.bin",
|
487 |
+
"model.layers.22.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00014-of-00019.bin",
|
488 |
+
"model.layers.22.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00014-of-00019.bin",
|
489 |
+
"model.layers.22.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00014-of-00019.bin",
|
490 |
+
"model.layers.22.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00014-of-00019.bin",
|
491 |
+
"model.layers.22.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00014-of-00019.bin",
|
492 |
+
"model.layers.22.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00014-of-00019.bin",
|
493 |
+
"model.layers.22.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00014-of-00019.bin",
|
494 |
+
"model.layers.22.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00014-of-00019.bin",
|
495 |
+
"model.layers.22.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00014-of-00019.bin",
|
496 |
+
"model.layers.22.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00014-of-00019.bin",
|
497 |
+
"model.layers.22.block_sparse_moe.gate.weight": "pytorch_model-00013-of-00019.bin",
|
498 |
+
"model.layers.22.input_layernorm.weight": "pytorch_model-00014-of-00019.bin",
|
499 |
+
"model.layers.22.post_attention_layernorm.weight": "pytorch_model-00014-of-00019.bin",
|
500 |
+
"model.layers.22.self_attn.k_proj.weight": "pytorch_model-00013-of-00019.bin",
|
501 |
+
"model.layers.22.self_attn.o_proj.weight": "pytorch_model-00013-of-00019.bin",
|
502 |
+
"model.layers.22.self_attn.q_proj.weight": "pytorch_model-00013-of-00019.bin",
|
503 |
+
"model.layers.22.self_attn.v_proj.weight": "pytorch_model-00013-of-00019.bin",
|
504 |
+
"model.layers.23.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00014-of-00019.bin",
|
505 |
+
"model.layers.23.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00014-of-00019.bin",
|
506 |
+
"model.layers.23.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00014-of-00019.bin",
|
507 |
+
"model.layers.23.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00014-of-00019.bin",
|
508 |
+
"model.layers.23.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00014-of-00019.bin",
|
509 |
+
"model.layers.23.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00014-of-00019.bin",
|
510 |
+
"model.layers.23.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00014-of-00019.bin",
|
511 |
+
"model.layers.23.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00014-of-00019.bin",
|
512 |
+
"model.layers.23.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00014-of-00019.bin",
|
513 |
+
"model.layers.23.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00014-of-00019.bin",
|
514 |
+
"model.layers.23.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00014-of-00019.bin",
|
515 |
+
"model.layers.23.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00014-of-00019.bin",
|
516 |
+
"model.layers.23.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00014-of-00019.bin",
|
517 |
+
"model.layers.23.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00014-of-00019.bin",
|
518 |
+
"model.layers.23.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00014-of-00019.bin",
|
519 |
+
"model.layers.23.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00014-of-00019.bin",
|
520 |
+
"model.layers.23.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00014-of-00019.bin",
|
521 |
+
"model.layers.23.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00015-of-00019.bin",
|
522 |
+
"model.layers.23.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00015-of-00019.bin",
|
523 |
+
"model.layers.23.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00015-of-00019.bin",
|
524 |
+
"model.layers.23.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00015-of-00019.bin",
|
525 |
+
"model.layers.23.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00015-of-00019.bin",
|
526 |
+
"model.layers.23.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00015-of-00019.bin",
|
527 |
+
"model.layers.23.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00015-of-00019.bin",
|
528 |
+
"model.layers.23.block_sparse_moe.gate.weight": "pytorch_model-00014-of-00019.bin",
|
529 |
+
"model.layers.23.input_layernorm.weight": "pytorch_model-00015-of-00019.bin",
|
530 |
+
"model.layers.23.post_attention_layernorm.weight": "pytorch_model-00015-of-00019.bin",
|
531 |
+
"model.layers.23.self_attn.k_proj.weight": "pytorch_model-00014-of-00019.bin",
|
532 |
+
"model.layers.23.self_attn.o_proj.weight": "pytorch_model-00014-of-00019.bin",
|
533 |
+
"model.layers.23.self_attn.q_proj.weight": "pytorch_model-00014-of-00019.bin",
|
534 |
+
"model.layers.23.self_attn.v_proj.weight": "pytorch_model-00014-of-00019.bin",
|
535 |
+
"model.layers.24.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00015-of-00019.bin",
|
536 |
+
"model.layers.24.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00015-of-00019.bin",
|
537 |
+
"model.layers.24.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00015-of-00019.bin",
|
538 |
+
"model.layers.24.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00015-of-00019.bin",
|
539 |
+
"model.layers.24.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00015-of-00019.bin",
|
540 |
+
"model.layers.24.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00015-of-00019.bin",
|
541 |
+
"model.layers.24.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00015-of-00019.bin",
|
542 |
+
"model.layers.24.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00015-of-00019.bin",
|
543 |
+
"model.layers.24.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00015-of-00019.bin",
|
544 |
+
"model.layers.24.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00015-of-00019.bin",
|
545 |
+
"model.layers.24.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00015-of-00019.bin",
|
546 |
+
"model.layers.24.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00015-of-00019.bin",
|
547 |
+
"model.layers.24.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00015-of-00019.bin",
|
548 |
+
"model.layers.24.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00015-of-00019.bin",
|
549 |
+
"model.layers.24.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00015-of-00019.bin",
|
550 |
+
"model.layers.24.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00015-of-00019.bin",
|
551 |
+
"model.layers.24.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00015-of-00019.bin",
|
552 |
+
"model.layers.24.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00015-of-00019.bin",
|
553 |
+
"model.layers.24.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00015-of-00019.bin",
|
554 |
+
"model.layers.24.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00015-of-00019.bin",
|
555 |
+
"model.layers.24.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00015-of-00019.bin",
|
556 |
+
"model.layers.24.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00015-of-00019.bin",
|
557 |
+
"model.layers.24.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00015-of-00019.bin",
|
558 |
+
"model.layers.24.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00015-of-00019.bin",
|
559 |
+
"model.layers.24.block_sparse_moe.gate.weight": "pytorch_model-00015-of-00019.bin",
|
560 |
+
"model.layers.24.input_layernorm.weight": "pytorch_model-00015-of-00019.bin",
|
561 |
+
"model.layers.24.post_attention_layernorm.weight": "pytorch_model-00015-of-00019.bin",
|
562 |
+
"model.layers.24.self_attn.k_proj.weight": "pytorch_model-00015-of-00019.bin",
|
563 |
+
"model.layers.24.self_attn.o_proj.weight": "pytorch_model-00015-of-00019.bin",
|
564 |
+
"model.layers.24.self_attn.q_proj.weight": "pytorch_model-00015-of-00019.bin",
|
565 |
+
"model.layers.24.self_attn.v_proj.weight": "pytorch_model-00015-of-00019.bin",
|
566 |
+
"model.layers.25.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00015-of-00019.bin",
|
567 |
+
"model.layers.25.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00015-of-00019.bin",
|
568 |
+
"model.layers.25.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00015-of-00019.bin",
|
569 |
+
"model.layers.25.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00015-of-00019.bin",
|
570 |
+
"model.layers.25.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00015-of-00019.bin",
|
571 |
+
"model.layers.25.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00015-of-00019.bin",
|
572 |
+
"model.layers.25.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00015-of-00019.bin",
|
573 |
+
"model.layers.25.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00015-of-00019.bin",
|
574 |
+
"model.layers.25.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00015-of-00019.bin",
|
575 |
+
"model.layers.25.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00015-of-00019.bin",
|
576 |
+
"model.layers.25.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00016-of-00019.bin",
|
577 |
+
"model.layers.25.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00016-of-00019.bin",
|
578 |
+
"model.layers.25.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00016-of-00019.bin",
|
579 |
+
"model.layers.25.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00016-of-00019.bin",
|
580 |
+
"model.layers.25.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00016-of-00019.bin",
|
581 |
+
"model.layers.25.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00016-of-00019.bin",
|
582 |
+
"model.layers.25.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00016-of-00019.bin",
|
583 |
+
"model.layers.25.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00016-of-00019.bin",
|
584 |
+
"model.layers.25.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00016-of-00019.bin",
|
585 |
+
"model.layers.25.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00016-of-00019.bin",
|
586 |
+
"model.layers.25.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00016-of-00019.bin",
|
587 |
+
"model.layers.25.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00016-of-00019.bin",
|
588 |
+
"model.layers.25.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00016-of-00019.bin",
|
589 |
+
"model.layers.25.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00016-of-00019.bin",
|
590 |
+
"model.layers.25.block_sparse_moe.gate.weight": "pytorch_model-00015-of-00019.bin",
|
591 |
+
"model.layers.25.input_layernorm.weight": "pytorch_model-00016-of-00019.bin",
|
592 |
+
"model.layers.25.post_attention_layernorm.weight": "pytorch_model-00016-of-00019.bin",
|
593 |
+
"model.layers.25.self_attn.k_proj.weight": "pytorch_model-00015-of-00019.bin",
|
594 |
+
"model.layers.25.self_attn.o_proj.weight": "pytorch_model-00015-of-00019.bin",
|
595 |
+
"model.layers.25.self_attn.q_proj.weight": "pytorch_model-00015-of-00019.bin",
|
596 |
+
"model.layers.25.self_attn.v_proj.weight": "pytorch_model-00015-of-00019.bin",
|
597 |
+
"model.layers.26.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00016-of-00019.bin",
|
598 |
+
"model.layers.26.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00016-of-00019.bin",
|
599 |
+
"model.layers.26.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00016-of-00019.bin",
|
600 |
+
"model.layers.26.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00016-of-00019.bin",
|
601 |
+
"model.layers.26.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00016-of-00019.bin",
|
602 |
+
"model.layers.26.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00016-of-00019.bin",
|
603 |
+
"model.layers.26.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00016-of-00019.bin",
|
604 |
+
"model.layers.26.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00016-of-00019.bin",
|
605 |
+
"model.layers.26.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00016-of-00019.bin",
|
606 |
+
"model.layers.26.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00016-of-00019.bin",
|
607 |
+
"model.layers.26.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00016-of-00019.bin",
|
608 |
+
"model.layers.26.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00016-of-00019.bin",
|
609 |
+
"model.layers.26.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00016-of-00019.bin",
|
610 |
+
"model.layers.26.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00016-of-00019.bin",
|
611 |
+
"model.layers.26.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00016-of-00019.bin",
|
612 |
+
"model.layers.26.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00016-of-00019.bin",
|
613 |
+
"model.layers.26.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00016-of-00019.bin",
|
614 |
+
"model.layers.26.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00016-of-00019.bin",
|
615 |
+
"model.layers.26.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00016-of-00019.bin",
|
616 |
+
"model.layers.26.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00016-of-00019.bin",
|
617 |
+
"model.layers.26.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00016-of-00019.bin",
|
618 |
+
"model.layers.26.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00016-of-00019.bin",
|
619 |
+
"model.layers.26.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00016-of-00019.bin",
|
620 |
+
"model.layers.26.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00016-of-00019.bin",
|
621 |
+
"model.layers.26.block_sparse_moe.gate.weight": "pytorch_model-00016-of-00019.bin",
|
622 |
+
"model.layers.26.input_layernorm.weight": "pytorch_model-00016-of-00019.bin",
|
623 |
+
"model.layers.26.post_attention_layernorm.weight": "pytorch_model-00016-of-00019.bin",
|
624 |
+
"model.layers.26.self_attn.k_proj.weight": "pytorch_model-00016-of-00019.bin",
|
625 |
+
"model.layers.26.self_attn.o_proj.weight": "pytorch_model-00016-of-00019.bin",
|
626 |
+
"model.layers.26.self_attn.q_proj.weight": "pytorch_model-00016-of-00019.bin",
|
627 |
+
"model.layers.26.self_attn.v_proj.weight": "pytorch_model-00016-of-00019.bin",
|
628 |
+
"model.layers.27.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00016-of-00019.bin",
|
629 |
+
"model.layers.27.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00016-of-00019.bin",
|
630 |
+
"model.layers.27.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00016-of-00019.bin",
|
631 |
+
"model.layers.27.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00017-of-00019.bin",
|
632 |
+
"model.layers.27.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00017-of-00019.bin",
|
633 |
+
"model.layers.27.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00017-of-00019.bin",
|
634 |
+
"model.layers.27.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00017-of-00019.bin",
|
635 |
+
"model.layers.27.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00017-of-00019.bin",
|
636 |
+
"model.layers.27.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00017-of-00019.bin",
|
637 |
+
"model.layers.27.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00017-of-00019.bin",
|
638 |
+
"model.layers.27.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00017-of-00019.bin",
|
639 |
+
"model.layers.27.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00017-of-00019.bin",
|
640 |
+
"model.layers.27.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00017-of-00019.bin",
|
641 |
+
"model.layers.27.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00017-of-00019.bin",
|
642 |
+
"model.layers.27.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00017-of-00019.bin",
|
643 |
+
"model.layers.27.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00017-of-00019.bin",
|
644 |
+
"model.layers.27.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00017-of-00019.bin",
|
645 |
+
"model.layers.27.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00017-of-00019.bin",
|
646 |
+
"model.layers.27.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00017-of-00019.bin",
|
647 |
+
"model.layers.27.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00017-of-00019.bin",
|
648 |
+
"model.layers.27.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00017-of-00019.bin",
|
649 |
+
"model.layers.27.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00017-of-00019.bin",
|
650 |
+
"model.layers.27.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00017-of-00019.bin",
|
651 |
+
"model.layers.27.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00017-of-00019.bin",
|
652 |
+
"model.layers.27.block_sparse_moe.gate.weight": "pytorch_model-00016-of-00019.bin",
|
653 |
+
"model.layers.27.input_layernorm.weight": "pytorch_model-00017-of-00019.bin",
|
654 |
+
"model.layers.27.post_attention_layernorm.weight": "pytorch_model-00017-of-00019.bin",
|
655 |
+
"model.layers.27.self_attn.k_proj.weight": "pytorch_model-00016-of-00019.bin",
|
656 |
+
"model.layers.27.self_attn.o_proj.weight": "pytorch_model-00016-of-00019.bin",
|
657 |
+
"model.layers.27.self_attn.q_proj.weight": "pytorch_model-00016-of-00019.bin",
|
658 |
+
"model.layers.27.self_attn.v_proj.weight": "pytorch_model-00016-of-00019.bin",
|
659 |
+
"model.layers.28.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00017-of-00019.bin",
|
660 |
+
"model.layers.28.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00017-of-00019.bin",
|
661 |
+
"model.layers.28.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00017-of-00019.bin",
|
662 |
+
"model.layers.28.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00017-of-00019.bin",
|
663 |
+
"model.layers.28.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00017-of-00019.bin",
|
664 |
+
"model.layers.28.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00017-of-00019.bin",
|
665 |
+
"model.layers.28.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00017-of-00019.bin",
|
666 |
+
"model.layers.28.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00017-of-00019.bin",
|
667 |
+
"model.layers.28.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00017-of-00019.bin",
|
668 |
+
"model.layers.28.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00017-of-00019.bin",
|
669 |
+
"model.layers.28.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00017-of-00019.bin",
|
670 |
+
"model.layers.28.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00017-of-00019.bin",
|
671 |
+
"model.layers.28.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00017-of-00019.bin",
|
672 |
+
"model.layers.28.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00017-of-00019.bin",
|
673 |
+
"model.layers.28.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00017-of-00019.bin",
|
674 |
+
"model.layers.28.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00017-of-00019.bin",
|
675 |
+
"model.layers.28.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00017-of-00019.bin",
|
676 |
+
"model.layers.28.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00017-of-00019.bin",
|
677 |
+
"model.layers.28.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00017-of-00019.bin",
|
678 |
+
"model.layers.28.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00017-of-00019.bin",
|
679 |
+
"model.layers.28.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00018-of-00019.bin",
|
680 |
+
"model.layers.28.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00018-of-00019.bin",
|
681 |
+
"model.layers.28.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00018-of-00019.bin",
|
682 |
+
"model.layers.28.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00018-of-00019.bin",
|
683 |
+
"model.layers.28.block_sparse_moe.gate.weight": "pytorch_model-00017-of-00019.bin",
|
684 |
+
"model.layers.28.input_layernorm.weight": "pytorch_model-00018-of-00019.bin",
|
685 |
+
"model.layers.28.post_attention_layernorm.weight": "pytorch_model-00018-of-00019.bin",
|
686 |
+
"model.layers.28.self_attn.k_proj.weight": "pytorch_model-00017-of-00019.bin",
|
687 |
+
"model.layers.28.self_attn.o_proj.weight": "pytorch_model-00017-of-00019.bin",
|
688 |
+
"model.layers.28.self_attn.q_proj.weight": "pytorch_model-00017-of-00019.bin",
|
689 |
+
"model.layers.28.self_attn.v_proj.weight": "pytorch_model-00017-of-00019.bin",
|
690 |
+
"model.layers.29.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00018-of-00019.bin",
|
691 |
+
"model.layers.29.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00018-of-00019.bin",
|
692 |
+
"model.layers.29.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00018-of-00019.bin",
|
693 |
+
"model.layers.29.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00018-of-00019.bin",
|
694 |
+
"model.layers.29.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00018-of-00019.bin",
|
695 |
+
"model.layers.29.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00018-of-00019.bin",
|
696 |
+
"model.layers.29.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00018-of-00019.bin",
|
697 |
+
"model.layers.29.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00018-of-00019.bin",
|
698 |
+
"model.layers.29.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00018-of-00019.bin",
|
699 |
+
"model.layers.29.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00018-of-00019.bin",
|
700 |
+
"model.layers.29.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00018-of-00019.bin",
|
701 |
+
"model.layers.29.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00018-of-00019.bin",
|
702 |
+
"model.layers.29.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00018-of-00019.bin",
|
703 |
+
"model.layers.29.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00018-of-00019.bin",
|
704 |
+
"model.layers.29.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00018-of-00019.bin",
|
705 |
+
"model.layers.29.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00018-of-00019.bin",
|
706 |
+
"model.layers.29.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00018-of-00019.bin",
|
707 |
+
"model.layers.29.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00018-of-00019.bin",
|
708 |
+
"model.layers.29.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00018-of-00019.bin",
|
709 |
+
"model.layers.29.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00018-of-00019.bin",
|
710 |
+
"model.layers.29.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00018-of-00019.bin",
|
711 |
+
"model.layers.29.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00018-of-00019.bin",
|
712 |
+
"model.layers.29.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00018-of-00019.bin",
|
713 |
+
"model.layers.29.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00018-of-00019.bin",
|
714 |
+
"model.layers.29.block_sparse_moe.gate.weight": "pytorch_model-00018-of-00019.bin",
|
715 |
+
"model.layers.29.input_layernorm.weight": "pytorch_model-00018-of-00019.bin",
|
716 |
+
"model.layers.29.post_attention_layernorm.weight": "pytorch_model-00018-of-00019.bin",
|
717 |
+
"model.layers.29.self_attn.k_proj.weight": "pytorch_model-00018-of-00019.bin",
|
718 |
+
"model.layers.29.self_attn.o_proj.weight": "pytorch_model-00018-of-00019.bin",
|
719 |
+
"model.layers.29.self_attn.q_proj.weight": "pytorch_model-00018-of-00019.bin",
|
720 |
+
"model.layers.29.self_attn.v_proj.weight": "pytorch_model-00018-of-00019.bin",
|
721 |
+
"model.layers.3.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00002-of-00019.bin",
|
722 |
+
"model.layers.3.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00002-of-00019.bin",
|
723 |
+
"model.layers.3.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00002-of-00019.bin",
|
724 |
+
"model.layers.3.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00002-of-00019.bin",
|
725 |
+
"model.layers.3.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00002-of-00019.bin",
|
726 |
+
"model.layers.3.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00003-of-00019.bin",
|
727 |
+
"model.layers.3.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00003-of-00019.bin",
|
728 |
+
"model.layers.3.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00003-of-00019.bin",
|
729 |
+
"model.layers.3.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00003-of-00019.bin",
|
730 |
+
"model.layers.3.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00003-of-00019.bin",
|
731 |
+
"model.layers.3.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00003-of-00019.bin",
|
732 |
+
"model.layers.3.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00003-of-00019.bin",
|
733 |
+
"model.layers.3.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00003-of-00019.bin",
|
734 |
+
"model.layers.3.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00003-of-00019.bin",
|
735 |
+
"model.layers.3.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00003-of-00019.bin",
|
736 |
+
"model.layers.3.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00003-of-00019.bin",
|
737 |
+
"model.layers.3.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00003-of-00019.bin",
|
738 |
+
"model.layers.3.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00003-of-00019.bin",
|
739 |
+
"model.layers.3.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00003-of-00019.bin",
|
740 |
+
"model.layers.3.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00003-of-00019.bin",
|
741 |
+
"model.layers.3.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00003-of-00019.bin",
|
742 |
+
"model.layers.3.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00003-of-00019.bin",
|
743 |
+
"model.layers.3.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00003-of-00019.bin",
|
744 |
+
"model.layers.3.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00003-of-00019.bin",
|
745 |
+
"model.layers.3.block_sparse_moe.gate.weight": "pytorch_model-00002-of-00019.bin",
|
746 |
+
"model.layers.3.input_layernorm.weight": "pytorch_model-00003-of-00019.bin",
|
747 |
+
"model.layers.3.post_attention_layernorm.weight": "pytorch_model-00003-of-00019.bin",
|
748 |
+
"model.layers.3.self_attn.k_proj.weight": "pytorch_model-00002-of-00019.bin",
|
749 |
+
"model.layers.3.self_attn.o_proj.weight": "pytorch_model-00002-of-00019.bin",
|
750 |
+
"model.layers.3.self_attn.q_proj.weight": "pytorch_model-00002-of-00019.bin",
|
751 |
+
"model.layers.3.self_attn.v_proj.weight": "pytorch_model-00002-of-00019.bin",
|
752 |
+
"model.layers.30.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00018-of-00019.bin",
|
753 |
+
"model.layers.30.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00018-of-00019.bin",
|
754 |
+
"model.layers.30.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00018-of-00019.bin",
|
755 |
+
"model.layers.30.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00018-of-00019.bin",
|
756 |
+
"model.layers.30.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00018-of-00019.bin",
|
757 |
+
"model.layers.30.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00018-of-00019.bin",
|
758 |
+
"model.layers.30.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00018-of-00019.bin",
|
759 |
+
"model.layers.30.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00018-of-00019.bin",
|
760 |
+
"model.layers.30.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00018-of-00019.bin",
|
761 |
+
"model.layers.30.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00018-of-00019.bin",
|
762 |
+
"model.layers.30.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00018-of-00019.bin",
|
763 |
+
"model.layers.30.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00018-of-00019.bin",
|
764 |
+
"model.layers.30.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00018-of-00019.bin",
|
765 |
+
"model.layers.30.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00019-of-00019.bin",
|
766 |
+
"model.layers.30.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00019-of-00019.bin",
|
767 |
+
"model.layers.30.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00019-of-00019.bin",
|
768 |
+
"model.layers.30.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00019-of-00019.bin",
|
769 |
+
"model.layers.30.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00019-of-00019.bin",
|
770 |
+
"model.layers.30.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00019-of-00019.bin",
|
771 |
+
"model.layers.30.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00019-of-00019.bin",
|
772 |
+
"model.layers.30.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00019-of-00019.bin",
|
773 |
+
"model.layers.30.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00019-of-00019.bin",
|
774 |
+
"model.layers.30.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00019-of-00019.bin",
|
775 |
+
"model.layers.30.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00019-of-00019.bin",
|
776 |
+
"model.layers.30.block_sparse_moe.gate.weight": "pytorch_model-00018-of-00019.bin",
|
777 |
+
"model.layers.30.input_layernorm.weight": "pytorch_model-00019-of-00019.bin",
|
778 |
+
"model.layers.30.post_attention_layernorm.weight": "pytorch_model-00019-of-00019.bin",
|
779 |
+
"model.layers.30.self_attn.k_proj.weight": "pytorch_model-00018-of-00019.bin",
|
780 |
+
"model.layers.30.self_attn.o_proj.weight": "pytorch_model-00018-of-00019.bin",
|
781 |
+
"model.layers.30.self_attn.q_proj.weight": "pytorch_model-00018-of-00019.bin",
|
782 |
+
"model.layers.30.self_attn.v_proj.weight": "pytorch_model-00018-of-00019.bin",
|
783 |
+
"model.layers.31.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00019-of-00019.bin",
|
784 |
+
"model.layers.31.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00019-of-00019.bin",
|
785 |
+
"model.layers.31.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00019-of-00019.bin",
|
786 |
+
"model.layers.31.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00019-of-00019.bin",
|
787 |
+
"model.layers.31.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00019-of-00019.bin",
|
788 |
+
"model.layers.31.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00019-of-00019.bin",
|
789 |
+
"model.layers.31.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00019-of-00019.bin",
|
790 |
+
"model.layers.31.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00019-of-00019.bin",
|
791 |
+
"model.layers.31.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00019-of-00019.bin",
|
792 |
+
"model.layers.31.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00019-of-00019.bin",
|
793 |
+
"model.layers.31.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00019-of-00019.bin",
|
794 |
+
"model.layers.31.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00019-of-00019.bin",
|
795 |
+
"model.layers.31.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00019-of-00019.bin",
|
796 |
+
"model.layers.31.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00019-of-00019.bin",
|
797 |
+
"model.layers.31.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00019-of-00019.bin",
|
798 |
+
"model.layers.31.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00019-of-00019.bin",
|
799 |
+
"model.layers.31.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00019-of-00019.bin",
|
800 |
+
"model.layers.31.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00019-of-00019.bin",
|
801 |
+
"model.layers.31.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00019-of-00019.bin",
|
802 |
+
"model.layers.31.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00019-of-00019.bin",
|
803 |
+
"model.layers.31.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00019-of-00019.bin",
|
804 |
+
"model.layers.31.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00019-of-00019.bin",
|
805 |
+
"model.layers.31.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00019-of-00019.bin",
|
806 |
+
"model.layers.31.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00019-of-00019.bin",
|
807 |
+
"model.layers.31.block_sparse_moe.gate.weight": "pytorch_model-00019-of-00019.bin",
|
808 |
+
"model.layers.31.input_layernorm.weight": "pytorch_model-00019-of-00019.bin",
|
809 |
+
"model.layers.31.post_attention_layernorm.weight": "pytorch_model-00019-of-00019.bin",
|
810 |
+
"model.layers.31.self_attn.k_proj.weight": "pytorch_model-00019-of-00019.bin",
|
811 |
+
"model.layers.31.self_attn.o_proj.weight": "pytorch_model-00019-of-00019.bin",
|
812 |
+
"model.layers.31.self_attn.q_proj.weight": "pytorch_model-00019-of-00019.bin",
|
813 |
+
"model.layers.31.self_attn.v_proj.weight": "pytorch_model-00019-of-00019.bin",
|
814 |
+
"model.layers.4.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00003-of-00019.bin",
|
815 |
+
"model.layers.4.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00003-of-00019.bin",
|
816 |
+
"model.layers.4.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00003-of-00019.bin",
|
817 |
+
"model.layers.4.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00003-of-00019.bin",
|
818 |
+
"model.layers.4.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00003-of-00019.bin",
|
819 |
+
"model.layers.4.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00003-of-00019.bin",
|
820 |
+
"model.layers.4.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00003-of-00019.bin",
|
821 |
+
"model.layers.4.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00003-of-00019.bin",
|
822 |
+
"model.layers.4.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00003-of-00019.bin",
|
823 |
+
"model.layers.4.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00003-of-00019.bin",
|
824 |
+
"model.layers.4.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00003-of-00019.bin",
|
825 |
+
"model.layers.4.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00003-of-00019.bin",
|
826 |
+
"model.layers.4.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00003-of-00019.bin",
|
827 |
+
"model.layers.4.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00003-of-00019.bin",
|
828 |
+
"model.layers.4.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00003-of-00019.bin",
|
829 |
+
"model.layers.4.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00003-of-00019.bin",
|
830 |
+
"model.layers.4.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00003-of-00019.bin",
|
831 |
+
"model.layers.4.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00003-of-00019.bin",
|
832 |
+
"model.layers.4.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00003-of-00019.bin",
|
833 |
+
"model.layers.4.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00003-of-00019.bin",
|
834 |
+
"model.layers.4.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00003-of-00019.bin",
|
835 |
+
"model.layers.4.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00003-of-00019.bin",
|
836 |
+
"model.layers.4.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00004-of-00019.bin",
|
837 |
+
"model.layers.4.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00004-of-00019.bin",
|
838 |
+
"model.layers.4.block_sparse_moe.gate.weight": "pytorch_model-00003-of-00019.bin",
|
839 |
+
"model.layers.4.input_layernorm.weight": "pytorch_model-00004-of-00019.bin",
|
840 |
+
"model.layers.4.post_attention_layernorm.weight": "pytorch_model-00004-of-00019.bin",
|
841 |
+
"model.layers.4.self_attn.k_proj.weight": "pytorch_model-00003-of-00019.bin",
|
842 |
+
"model.layers.4.self_attn.o_proj.weight": "pytorch_model-00003-of-00019.bin",
|
843 |
+
"model.layers.4.self_attn.q_proj.weight": "pytorch_model-00003-of-00019.bin",
|
844 |
+
"model.layers.4.self_attn.v_proj.weight": "pytorch_model-00003-of-00019.bin",
|
845 |
+
"model.layers.5.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00004-of-00019.bin",
|
846 |
+
"model.layers.5.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00004-of-00019.bin",
|
847 |
+
"model.layers.5.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00004-of-00019.bin",
|
848 |
+
"model.layers.5.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00004-of-00019.bin",
|
849 |
+
"model.layers.5.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00004-of-00019.bin",
|
850 |
+
"model.layers.5.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00004-of-00019.bin",
|
851 |
+
"model.layers.5.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00004-of-00019.bin",
|
852 |
+
"model.layers.5.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00004-of-00019.bin",
|
853 |
+
"model.layers.5.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00004-of-00019.bin",
|
854 |
+
"model.layers.5.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00004-of-00019.bin",
|
855 |
+
"model.layers.5.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00004-of-00019.bin",
|
856 |
+
"model.layers.5.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00004-of-00019.bin",
|
857 |
+
"model.layers.5.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00004-of-00019.bin",
|
858 |
+
"model.layers.5.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00004-of-00019.bin",
|
859 |
+
"model.layers.5.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00004-of-00019.bin",
|
860 |
+
"model.layers.5.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00004-of-00019.bin",
|
861 |
+
"model.layers.5.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00004-of-00019.bin",
|
862 |
+
"model.layers.5.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00004-of-00019.bin",
|
863 |
+
"model.layers.5.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00004-of-00019.bin",
|
864 |
+
"model.layers.5.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00004-of-00019.bin",
|
865 |
+
"model.layers.5.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00004-of-00019.bin",
|
866 |
+
"model.layers.5.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00004-of-00019.bin",
|
867 |
+
"model.layers.5.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00004-of-00019.bin",
|
868 |
+
"model.layers.5.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00004-of-00019.bin",
|
869 |
+
"model.layers.5.block_sparse_moe.gate.weight": "pytorch_model-00004-of-00019.bin",
|
870 |
+
"model.layers.5.input_layernorm.weight": "pytorch_model-00004-of-00019.bin",
|
871 |
+
"model.layers.5.post_attention_layernorm.weight": "pytorch_model-00004-of-00019.bin",
|
872 |
+
"model.layers.5.self_attn.k_proj.weight": "pytorch_model-00004-of-00019.bin",
|
873 |
+
"model.layers.5.self_attn.o_proj.weight": "pytorch_model-00004-of-00019.bin",
|
874 |
+
"model.layers.5.self_attn.q_proj.weight": "pytorch_model-00004-of-00019.bin",
|
875 |
+
"model.layers.5.self_attn.v_proj.weight": "pytorch_model-00004-of-00019.bin",
|
876 |
+
"model.layers.6.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00004-of-00019.bin",
|
877 |
+
"model.layers.6.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00004-of-00019.bin",
|
878 |
+
"model.layers.6.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00004-of-00019.bin",
|
879 |
+
"model.layers.6.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00004-of-00019.bin",
|
880 |
+
"model.layers.6.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00004-of-00019.bin",
|
881 |
+
"model.layers.6.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00004-of-00019.bin",
|
882 |
+
"model.layers.6.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00004-of-00019.bin",
|
883 |
+
"model.layers.6.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00004-of-00019.bin",
|
884 |
+
"model.layers.6.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00004-of-00019.bin",
|
885 |
+
"model.layers.6.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00004-of-00019.bin",
|
886 |
+
"model.layers.6.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00004-of-00019.bin",
|
887 |
+
"model.layers.6.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00004-of-00019.bin",
|
888 |
+
"model.layers.6.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00004-of-00019.bin",
|
889 |
+
"model.layers.6.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00004-of-00019.bin",
|
890 |
+
"model.layers.6.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00004-of-00019.bin",
|
891 |
+
"model.layers.6.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00005-of-00019.bin",
|
892 |
+
"model.layers.6.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00005-of-00019.bin",
|
893 |
+
"model.layers.6.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00005-of-00019.bin",
|
894 |
+
"model.layers.6.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00005-of-00019.bin",
|
895 |
+
"model.layers.6.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00005-of-00019.bin",
|
896 |
+
"model.layers.6.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00005-of-00019.bin",
|
897 |
+
"model.layers.6.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00005-of-00019.bin",
|
898 |
+
"model.layers.6.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00005-of-00019.bin",
|
899 |
+
"model.layers.6.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00005-of-00019.bin",
|
900 |
+
"model.layers.6.block_sparse_moe.gate.weight": "pytorch_model-00004-of-00019.bin",
|
901 |
+
"model.layers.6.input_layernorm.weight": "pytorch_model-00005-of-00019.bin",
|
902 |
+
"model.layers.6.post_attention_layernorm.weight": "pytorch_model-00005-of-00019.bin",
|
903 |
+
"model.layers.6.self_attn.k_proj.weight": "pytorch_model-00004-of-00019.bin",
|
904 |
+
"model.layers.6.self_attn.o_proj.weight": "pytorch_model-00004-of-00019.bin",
|
905 |
+
"model.layers.6.self_attn.q_proj.weight": "pytorch_model-00004-of-00019.bin",
|
906 |
+
"model.layers.6.self_attn.v_proj.weight": "pytorch_model-00004-of-00019.bin",
|
907 |
+
"model.layers.7.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00005-of-00019.bin",
|
908 |
+
"model.layers.7.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00005-of-00019.bin",
|
909 |
+
"model.layers.7.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00005-of-00019.bin",
|
910 |
+
"model.layers.7.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00005-of-00019.bin",
|
911 |
+
"model.layers.7.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00005-of-00019.bin",
|
912 |
+
"model.layers.7.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00005-of-00019.bin",
|
913 |
+
"model.layers.7.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00005-of-00019.bin",
|
914 |
+
"model.layers.7.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00005-of-00019.bin",
|
915 |
+
"model.layers.7.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00005-of-00019.bin",
|
916 |
+
"model.layers.7.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00005-of-00019.bin",
|
917 |
+
"model.layers.7.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00005-of-00019.bin",
|
918 |
+
"model.layers.7.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00005-of-00019.bin",
|
919 |
+
"model.layers.7.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00005-of-00019.bin",
|
920 |
+
"model.layers.7.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00005-of-00019.bin",
|
921 |
+
"model.layers.7.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00005-of-00019.bin",
|
922 |
+
"model.layers.7.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00005-of-00019.bin",
|
923 |
+
"model.layers.7.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00005-of-00019.bin",
|
924 |
+
"model.layers.7.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00005-of-00019.bin",
|
925 |
+
"model.layers.7.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00005-of-00019.bin",
|
926 |
+
"model.layers.7.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00005-of-00019.bin",
|
927 |
+
"model.layers.7.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00005-of-00019.bin",
|
928 |
+
"model.layers.7.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00005-of-00019.bin",
|
929 |
+
"model.layers.7.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00005-of-00019.bin",
|
930 |
+
"model.layers.7.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00005-of-00019.bin",
|
931 |
+
"model.layers.7.block_sparse_moe.gate.weight": "pytorch_model-00005-of-00019.bin",
|
932 |
+
"model.layers.7.input_layernorm.weight": "pytorch_model-00005-of-00019.bin",
|
933 |
+
"model.layers.7.post_attention_layernorm.weight": "pytorch_model-00005-of-00019.bin",
|
934 |
+
"model.layers.7.self_attn.k_proj.weight": "pytorch_model-00005-of-00019.bin",
|
935 |
+
"model.layers.7.self_attn.o_proj.weight": "pytorch_model-00005-of-00019.bin",
|
936 |
+
"model.layers.7.self_attn.q_proj.weight": "pytorch_model-00005-of-00019.bin",
|
937 |
+
"model.layers.7.self_attn.v_proj.weight": "pytorch_model-00005-of-00019.bin",
|
938 |
+
"model.layers.8.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00005-of-00019.bin",
|
939 |
+
"model.layers.8.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00005-of-00019.bin",
|
940 |
+
"model.layers.8.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00005-of-00019.bin",
|
941 |
+
"model.layers.8.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00005-of-00019.bin",
|
942 |
+
"model.layers.8.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00005-of-00019.bin",
|
943 |
+
"model.layers.8.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00005-of-00019.bin",
|
944 |
+
"model.layers.8.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00005-of-00019.bin",
|
945 |
+
"model.layers.8.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00005-of-00019.bin",
|
946 |
+
"model.layers.8.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00006-of-00019.bin",
|
947 |
+
"model.layers.8.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00006-of-00019.bin",
|
948 |
+
"model.layers.8.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00006-of-00019.bin",
|
949 |
+
"model.layers.8.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00006-of-00019.bin",
|
950 |
+
"model.layers.8.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00006-of-00019.bin",
|
951 |
+
"model.layers.8.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00006-of-00019.bin",
|
952 |
+
"model.layers.8.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00006-of-00019.bin",
|
953 |
+
"model.layers.8.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00006-of-00019.bin",
|
954 |
+
"model.layers.8.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00006-of-00019.bin",
|
955 |
+
"model.layers.8.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00006-of-00019.bin",
|
956 |
+
"model.layers.8.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00006-of-00019.bin",
|
957 |
+
"model.layers.8.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00006-of-00019.bin",
|
958 |
+
"model.layers.8.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00006-of-00019.bin",
|
959 |
+
"model.layers.8.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00006-of-00019.bin",
|
960 |
+
"model.layers.8.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00006-of-00019.bin",
|
961 |
+
"model.layers.8.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00006-of-00019.bin",
|
962 |
+
"model.layers.8.block_sparse_moe.gate.weight": "pytorch_model-00005-of-00019.bin",
|
963 |
+
"model.layers.8.input_layernorm.weight": "pytorch_model-00006-of-00019.bin",
|
964 |
+
"model.layers.8.post_attention_layernorm.weight": "pytorch_model-00006-of-00019.bin",
|
965 |
+
"model.layers.8.self_attn.k_proj.weight": "pytorch_model-00005-of-00019.bin",
|
966 |
+
"model.layers.8.self_attn.o_proj.weight": "pytorch_model-00005-of-00019.bin",
|
967 |
+
"model.layers.8.self_attn.q_proj.weight": "pytorch_model-00005-of-00019.bin",
|
968 |
+
"model.layers.8.self_attn.v_proj.weight": "pytorch_model-00005-of-00019.bin",
|
969 |
+
"model.layers.9.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00006-of-00019.bin",
|
970 |
+
"model.layers.9.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00006-of-00019.bin",
|
971 |
+
"model.layers.9.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00006-of-00019.bin",
|
972 |
+
"model.layers.9.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00006-of-00019.bin",
|
973 |
+
"model.layers.9.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00006-of-00019.bin",
|
974 |
+
"model.layers.9.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00006-of-00019.bin",
|
975 |
+
"model.layers.9.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00006-of-00019.bin",
|
976 |
+
"model.layers.9.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00006-of-00019.bin",
|
977 |
+
"model.layers.9.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00006-of-00019.bin",
|
978 |
+
"model.layers.9.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00006-of-00019.bin",
|
979 |
+
"model.layers.9.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00006-of-00019.bin",
|
980 |
+
"model.layers.9.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00006-of-00019.bin",
|
981 |
+
"model.layers.9.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00006-of-00019.bin",
|
982 |
+
"model.layers.9.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00006-of-00019.bin",
|
983 |
+
"model.layers.9.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00006-of-00019.bin",
|
984 |
+
"model.layers.9.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00006-of-00019.bin",
|
985 |
+
"model.layers.9.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00006-of-00019.bin",
|
986 |
+
"model.layers.9.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00006-of-00019.bin",
|
987 |
+
"model.layers.9.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00006-of-00019.bin",
|
988 |
+
"model.layers.9.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00006-of-00019.bin",
|
989 |
+
"model.layers.9.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00006-of-00019.bin",
|
990 |
+
"model.layers.9.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00006-of-00019.bin",
|
991 |
+
"model.layers.9.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00006-of-00019.bin",
|
992 |
+
"model.layers.9.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00006-of-00019.bin",
|
993 |
+
"model.layers.9.block_sparse_moe.gate.weight": "pytorch_model-00006-of-00019.bin",
|
994 |
+
"model.layers.9.input_layernorm.weight": "pytorch_model-00006-of-00019.bin",
|
995 |
+
"model.layers.9.post_attention_layernorm.weight": "pytorch_model-00006-of-00019.bin",
|
996 |
+
"model.layers.9.self_attn.k_proj.weight": "pytorch_model-00006-of-00019.bin",
|
997 |
+
"model.layers.9.self_attn.o_proj.weight": "pytorch_model-00006-of-00019.bin",
|
998 |
+
"model.layers.9.self_attn.q_proj.weight": "pytorch_model-00006-of-00019.bin",
|
999 |
+
"model.layers.9.self_attn.v_proj.weight": "pytorch_model-00006-of-00019.bin",
|
1000 |
+
"model.norm.weight": "pytorch_model-00019-of-00019.bin"
|
1001 |
+
}
|
1002 |
+
}
|
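The `weight_map` above is the mechanism by which a sharded PyTorch checkpoint is reassembled: each parameter name points at the one shard file (of 19) that contains its tensor. A minimal sketch of reading the index, assuming the repository has been cloned to a local directory (the `./model_dir` path is illustrative, not part of the repo):

```python
# Sketch: resolve parameter names to shard files via pytorch_model.bin.index.json.
# Assumes a local clone at ./model_dir; adjust the path for your setup.
import json
from collections import defaultdict

with open("./model_dir/pytorch_model.bin.index.json") as f:
    index = json.load(f)

weight_map = index["weight_map"]  # {"param name": "shard filename", ...}

# Which shard holds a given tensor?
name = "model.layers.30.block_sparse_moe.experts.4.w2.weight"
print(weight_map[name])  # "pytorch_model-00019-of-00019.bin" per the map above

# Group parameters by shard so each .bin file only needs to be opened once.
by_shard = defaultdict(list)
for param, shard in weight_map.items():
    by_shard[shard].append(param)
print(len(by_shard), "shard files referenced")
```

`transformers` performs this same lookup internally during `from_pretrained`, loading one shard at a time rather than materializing all 19 files at once.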
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
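Note that `pad_token` is mapped to the same string as `eos_token` (`</s>`). A quick hedged check that the declared special tokens are picked up after loading (the local path is a placeholder for wherever this repo is cloned):

```python
# Sketch: inspect the special tokens declared in special_tokens_map.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this/repo")
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)
# Expected per the file above: <s> </s> </s> <unk>
```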
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9298e56c094f0d30431b0e52ad53287f0cadc99ac8ca17cc2144b0eb4753f130
+size 911034
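This three-line file is a git-LFS pointer, not the SentencePiece model itself: `oid` is the SHA-256 of the real file's contents and `size` is its byte count. A minimal sketch of verifying a downloaded `tokenizer.model` against this pointer (the local filename is an assumption):

```python
# Sketch: verify a downloaded file against its git-LFS pointer.
import hashlib

expected_oid = "9298e56c094f0d30431b0e52ad53287f0cadc99ac8ca17cc2144b0eb4753f130"
expected_size = 911034

with open("tokenizer.model", "rb") as f:
    data = f.read()

assert len(data) == expected_size, "size mismatch"
assert hashlib.sha256(data).hexdigest() == expected_oid, "sha256 mismatch"
print("tokenizer.model matches its LFS pointer")
```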
tokenizer_config.json
ADDED
@@ -0,0 +1,62 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "61873": {
+      "content": "<EOD>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "61874": {
+      "content": "<PAD>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "split_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "trust_remote_code": true,
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}
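The `chat_template` field above defines the prompt format: if no system message is supplied, the template injects a default MediaTek Research system prompt, and user/assistant turns must strictly alternate starting with user, with user turns wrapped in `[INST] ... [/INST]`. A minimal sketch of rendering a conversation with it (the local path is a placeholder for a clone of this repo):

```python
# Sketch: render a prompt with the chat_template declared in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this/repo")

messages = [
    {"role": "user", "content": "Hello, please introduce yourself."},
]

# tokenize=False returns the formatted prompt string instead of token ids.
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)
# With no system role present, the default system message from the template
# is inserted before the first [INST] block.
```

Passing a `system`-role message as `messages[0]` would override that default; any other role ordering raises the template's "Conversation roles must alternate" exception.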